Is L1:A the machine translation of L2:B? Is L2:B the machine translation of L1:A?

### Load section information from the source language

In [85]:
import pandas as pd
import networkx as nx
import json
from collections import Counter
import gzip,json
import re


### Select the sections name to be translate

* define targetLang
* use 'frequency' to filter
* use percent to filter by %

In [143]:
def getSectionList(sourceLang,frequency=1,percent=False):
    """
    Use frequency to filter for the number of documents where the section name appears
    use percent to the top % sections
    """
    sectionsTargetAll = []
    sectionsTarget = {}
    c=0
    with gzip.open('../gap/multiLanguageFromDumps/sections-articles_%s.json.gz' % sourceLang) as f: #this for the freq hq articles
        for l in f:
            c+=1
            if c%1000000 == 0: print(c)
            tmp = json.loads(l.decode())
            sectionsTarget[tmp[1]] = tmp[2]
            for secName in tmp[2]:
                    sectionsTargetAll.append(secName.strip())
    sectionsTargetFreq = Counter(sectionsTargetAll)
    #sectionsEnFreq.most_common(50)
    total = sum(sectionsTargetFreq.values())
    acc =0
    secsToEval = []
    for n,(sec,freq) in enumerate(sectionsTargetFreq.most_common()):
        acc+= freq
        secsToEval.append((sec,freq))
        if percent:
            if acc/total > percent: #using sections that cover 80% of total
                print(n)
                print(freq)
                break
        else:
            if freq < frequency:
                print(n)
                print(acc/total)
                break
    return secsToEval


Translator

In [218]:
from googletrans import Translator
from random import random
from time import sleep

def getSecsPair(seclist=secsToEval,translateTo='es'):
    """ 
    Source language is detected automatically
    Return a section list in the target language
    """
    print('Wait, translating')
    translator = Translator()
    words = ''
    results = ''
    for n,(section,rank) in enumerate(seclist):
        if len(words) + len(section) < 4900: #max allowed by google
            words += '%d) %s \n' % (n , section)
        else:
            results+= translator.translate(words,dest=translateTo).text +'\n'
            sleep(random() * 5)
            #print(results)
            words = '%d) %s \n' % (n , section)

    results+= translator.translate(words,dest=translateTo).text
    d  = dict(re.findall("(\d+)\) (.+)\n",results))
    translated ={}
    for n,(word,freq)  in enumerate(seclist):
        try:
            translated[word] = d[str(n)]
        except:
            pass
    df = pd.DataFrame.from_dict(translated,orient='index')
    rank = dict([(x,n) for n,(x,y) in enumerate((sorted(seclist,key=lambda s: s[1], reverse=True)))])
    df  = df.join(pd.DataFrame.from_dict(rank,orient='index'),rsuffix='rank')
    df['source'] = df.index
    df = df.sort_values(["0rank"])
    df = df.rename(columns={'0':'result',"0rank":'rank'})
    df = df.set_index("rank")
    print('Finish')
    return df

In [263]:
def checkBack(langFrom,translateTo,frequency =1):
    """ 
    langFrom: original language
    translateTo: translateTo
    frequency: just consider sections appearing 'frequency' times in the original language
    """
    secsToEval = getSectionList(sourceLang = langFrom,frequency = frequency )
    pairsFirst = getSecsPair(secsToEval,translateTo)
    secsToCheckBack = list(zip(pairsFirst['result'], pairsFirst.index))
    pairsBack = getSecsPair(secsToCheckBack,translateTo=langFrom)
    output = pairsFirst.set_index('result').join(pairsBack.set_index('source'))
    output['compared'] = output.apply(lambda x :True if x['result'] == x['source']  else False, axis=1)
    output['translate'] = output.index
    output.index = output['source'] 
    #sorting back by rank
    pairsFirst['rank'] = pairsFirst.index
    outputSorted =   output.join(pairsFirst[['source','rank']].set_index('source')).set_index('rank').sort_index()
    return outputSorted
    

###  test

In [275]:
#Catalan to English, min freq=50
catToEn = checkBack('ca','en',100)

515
0.7810210701625164
Wait, translating
Finish
Wait, translating
Finish


In [276]:
catToEn

Unnamed: 0_level_0,source,result,compared,translate
rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,Referències,Referències,True,References
1,Demografia,Demografia,True,Demography
2,Bibliografia,Bibliografia,True,Bibliography
3,Història,Història,True,History
4,Poblacions més properes,Pobles més propers,False,Nearest towns
5,Vegeu també,Vegeu també,True,See also
6,Biografia,Biografia,True,Biography
7,Economia,Economia,True,Economy
8,Distribució geogràfica,Distribució geogràfica,True,Geographic distribution
9,Descripció,Descripció,True,Description


In [277]:
catToEn.compared.value_counts()

True     414
False    101
Name: compared, dtype: int64

In [278]:
#German to English, min freq=100
deToEn = checkBack('de','en',100)

1000000
2000000
3000000
4000000
5000000
6000000
7000000
3938
0.84050065556761
Wait, translating
Finish
Wait, translating
Finish


In [279]:
deToEn

Unnamed: 0_level_0,source,result,compared,translate
rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,Weblinks,Weblinks,True,Web links
1,Literatur,Literatur,True,literature
2,Geschichte,Geschichte,True,story
3,Leben,Leben,True,life
4,Einzelnachweise,individuelle Beweise,False,individual proofs
5,Siehe auch,Siehe auch,True,See also
6,Geographie,Geographie,True,Geography
7,Karriere,Karriere,True,career
8,Auszeichnungen,Auszeichnungen,True,Awards
9,Werke,funktioniert,False,works


In [282]:
deToEn.compared.value_counts()

False    2063
True     1870
Name: compared, dtype: int64

In [None]:
#Farsi to English

In [283]:
faToEn = checkBack('fa','en',100)

1000000
2000000
251
0.7644290801033017
Wait, translating
Finish
Wait, translating
Finish


In [285]:
faToEn

Unnamed: 0_level_0,source,result,compared,translate
rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,جستارهای وابسته,سوالات مرتبط,False,Related Questions
1,منابع,منابع,True,Resources
2,جمعیت,جمعیت,True,population
3,خصوصیات,مشخصات,False,Specifications
4,پانویس,پاورقی,False,Footnote
5,نگارخانه,گالری,False,Gallery
6,بازیگران,بازیگران,True,Cast
7,تاریخچه,تاریخچه,True,History
8,پیوند به بیرون,لینک به خارج,False,Link to the outside
9,مشخصات قنات,مشاهده آبشار,False,View of the aqueduct


In [284]:
faToEn.compared.value_counts()

False    134
True     116
Name: compared, dtype: int64