In [20]:
import json
import requests
import pandas as pd

In [21]:
def get_bing_available_languages(timeout=10):
    """Fetch the translation languages listed by the Microsoft Translator API.

    Calls the ``/languages`` endpoint with ``api-version=3.0`` and
    ``scope=translation`` and flattens the ``translation`` section of the
    JSON reply.

    Args:
        timeout: Seconds to wait for the server before aborting the request.
            Without a timeout a stalled connection would hang the notebook.

    Returns:
        A list of ``(code, name)`` tuples, one per entry of the
        ``translation`` mapping, in the order the reply provides them.

    Raises:
        requests.HTTPError: If the service answers with a 4xx/5xx status.
        requests.RequestException: On connection problems or timeout.
        KeyError: If the reply lacks the ``translation`` section or an entry
            has no ``name``.
    """
    params = {
        'api-version': '3.0',
        'scope': 'translation',
    }
    response = requests.get(
        'https://api.cognitive.microsofttranslator.com/languages',
        params=params,
        timeout=timeout,
    )
    # Fail loudly on an error status instead of trying to parse an error body.
    response.raise_for_status()
    translation = response.json()['translation']
    return [(lang, extra_info['name'])
            for lang, extra_info in translation.items()]

# Executed when the cell runs: performs a live HTTP request and keeps the
# resulting list of (code, name) pairs for the next cell.
bing_languages = get_bing_available_languages()

In [22]:
# Wrap the (code, name) pairs in a two-column table. The bare expression on
# the last line makes the notebook render it.
bing_languages = pd.DataFrame(
    data=bing_languages,
    columns=["bing_BCP_47", "name"],
)
bing_languages

Unnamed: 0,bing_BCP_47,name
0,af,Afrikaans
1,am,Amharic
2,ar,Arabic
3,as,Assamese
4,az,Azerbaijani
...,...,...
86,vi,Vietnamese
87,yua,Yucatec Maya
88,yue,Cantonese (Traditional)
89,zh-Hans,Chinese Simplified


In [23]:
# Load the tab-separated reference table; the rendered output below shows the
# columns wiki, name, iso and googletranslate.
language_mapping = pd.read_csv(
    '../dataset/mbertlangs.txt',
    sep='\t',
)
language_mapping

Unnamed: 0,wiki,name,iso,googletranslate
0,af,Afrikaans,afr,af
1,sq,Albanian,sqi,sq
2,ar,Arabic,"ara,arb",ar
3,an,Aragonese,arg,
4,hy,Armenian,hye,hy
...,...,...,...,...
99,fy,West Frisian,fry,fy
100,pnb,Western Punjabi,"pnb,pan",
101,yo,Yoruba,yor,yo
102,th,Thai,tha,th


In [32]:
# Left join: every row of the reference table is kept, and `bing_BCP_47` is
# filled only where the `wiki` code equals a Bing code exactly.
df_merge = language_mapping.merge(
    bing_languages['bing_BCP_47'],
    how='left',
    left_on='wiki',
    right_on='bing_BCP_47',
)
df_merge

Unnamed: 0,wiki,name,iso,googletranslate,bing_BCP_47
0,af,Afrikaans,afr,af,af
1,sq,Albanian,sqi,sq,sq
2,ar,Arabic,"ara,arb",ar,ar
3,an,Aragonese,arg,,
4,hy,Armenian,hye,hy,hy
...,...,...,...,...,...
99,fy,West Frisian,fry,fy,
100,pnb,Western Punjabi,"pnb,pan",,
101,yo,Yoruba,yor,yo,
102,th,Thai,tha,th,th


In [33]:
# Bing codes that have no identical code in the `wiki` column.
not_in_wiki = (
    set(bing_languages['bing_BCP_47'].values)
    - set(df_merge['wiki'].values)
)
len(not_in_wiki), not_in_wiki

(32,
 {'am',
  'as',
  'fil',
  'fj',
  'fr-CA',
  'iu',
  'km',
  'kmr',
  'ku',
  'lo',
  'lzh',
  'mi',
  'mt',
  'mww',
  'nb',
  'or',
  'otq',
  'prs',
  'ps',
  'pt-PT',
  'sm',
  'sr-Cyrl',
  'sr-Latn',
  'ti',
  'tlh-Latn',
  'tlh-Piqd',
  'to',
  'ty',
  'yua',
  'yue',
  'zh-Hans',
  'zh-Hant'})

In [34]:
# Hard-coded, comma-separated language codes (presumably those covered by the
# M2M-100 model -- confirm against its documentation), split into a one-column
# table named `m2m_100`.
m2m_100_langs = pd.DataFrame({
    "m2m_100": "af, am, ar, ast, az, ba, be, bg, bn, br, bs, ca, ceb, cs, cy, da, de, el, en, es, et, fa, ff, fi, fr, fy, ga, gd, gl, gu, ha, he, hi, hr, ht, hu, hy, id, ig, ilo, is, it, ja, jv, ka, kk, km, kn, ko, lb, lg, ln, lo, lt, lv, mg, mk, ml, mn, mr, ms, my, ne, nl, no, ns, oc, or, pa, pl, ps, pt, ro, ru, sd, si, sk, sl, so, sq, sr, ss, su, sv, sw, ta, th, tl, tn, tr, uk, ur, uz, vi, wo, xh, yi, yo, zh, zu".split(', ')
})

In [35]:
# Add an `m2m_100` column: filled where the `wiki` code is also an M2M-100
# code, missing otherwise (left join keeps every existing row).
#
# The cell rebinds `df_merge` from itself, so a second execution would find
# `m2m_100` on both sides and produce `m2m_100_x` / `m2m_100_y`. Dropping a
# stale column first makes the cell safe to re-run; on a first run the drop is
# a no-op.
df_merge = pd.merge(
    df_merge.drop(columns=['m2m_100'], errors='ignore'),
    m2m_100_langs, left_on='wiki', right_on='m2m_100',
    how='left')
df_merge

Unnamed: 0,wiki,name,iso,googletranslate,bing_BCP_47,m2m_100
0,af,Afrikaans,afr,af,af,af
1,sq,Albanian,sqi,sq,sq,sq
2,ar,Arabic,"ara,arb",ar,ar,ar
3,an,Aragonese,arg,,,
4,hy,Armenian,hye,hy,hy,hy
...,...,...,...,...,...,...
99,fy,West Frisian,fry,fy,,fy
100,pnb,Western Punjabi,"pnb,pan",,,
101,yo,Yoruba,yor,yo,,yo
102,th,Thai,tha,th,th,th


In [37]:
# M2M-100 codes that have no identical code in the `wiki` column.
not_in_wiki = (
    set(m2m_100_langs['m2m_100'].values)
    - set(df_merge['wiki'].values)
)
len(not_in_wiki), not_in_wiki

(22,
 {'am',
  'ff',
  'gd',
  'ha',
  'ig',
  'ilo',
  'km',
  'lg',
  'ln',
  'lo',
  'ns',
  'or',
  'ps',
  'sd',
  'si',
  'so',
  'ss',
  'tn',
  'wo',
  'xh',
  'yi',
  'zu'})

In [38]:
# Hard-coded, comma-separated language codes (presumably those covered by the
# mBART-50 model -- confirm against its documentation), split into a one-column
# table named `mbart_50`.
mbart_50 = pd.DataFrame({
    "mbart_50": "af, ar, az, bn, cs, de, en, es, et, fa, fi, fr, gl, gu, he, hi, hr, id, it, ja, ka, kk, km, ko, lt, lv, mk, ml, mn, mr, my, ne, nl, pl, ps, pt, ro, ru, si, sl, sv, sw, ta, te, th, tl, tr, uk, ur, vi, xh, zh".split(', ')
})

In [39]:
# Add an `mbart_50` column: filled where the `wiki` code is also an mBART-50
# code, missing otherwise (left join keeps every existing row).
#
# `df_merge` is rebuilt from itself, so re-running the cell would otherwise
# yield `mbart_50_x` / `mbart_50_y`. Dropping a stale column first keeps the
# cell idempotent; on a first run the drop is a no-op.
df_merge = pd.merge(
    df_merge.drop(columns=['mbart_50'], errors='ignore'),
    mbart_50, left_on='wiki', right_on='mbart_50', how='left')
df_merge

Unnamed: 0,wiki,name,iso,googletranslate,bing_BCP_47,m2m_100,mbart_50
0,af,Afrikaans,afr,af,af,af,af
1,sq,Albanian,sqi,sq,sq,sq,
2,ar,Arabic,"ara,arb",ar,ar,ar,ar
3,an,Aragonese,arg,,,,
4,hy,Armenian,hye,hy,hy,hy,
...,...,...,...,...,...,...,...
99,fy,West Frisian,fry,fy,,fy,
100,pnb,Western Punjabi,"pnb,pan",,,,
101,yo,Yoruba,yor,yo,,yo,
102,th,Thai,tha,th,th,th,th


In [41]:
# mBART-50 codes that have no identical code in the `wiki` column.
not_in_wiki = (
    set(mbart_50['mbart_50'].values)
    - set(df_merge['wiki'].values)
)
len(not_in_wiki), not_in_wiki

(4, {'km', 'ps', 'si', 'xh'})

In [42]:
# Hard-coded, comma-separated codes (presumably the language / language-group
# identifiers used by the OPUS-MT models -- confirm against their
# documentation), split into a one-column table named `opus_mt`.
opus_mt = pd.DataFrame({
    "opus_mt": "aav, aed, af, alv, am, ar, art, ase, az, bat, bcl, be, bem, ber, bg, bi, bn, bnt, bzs, ca, cau, ccs, ceb, cel, chk, cpf, crs, cs, csg, csn, cus, cy, da, de, dra, ee, efi, el, en, eo, es, et, eu, euq, fi, fj, fr, fse, ga, gaa, gil, gl, grk, guw, gv, ha, he, hi, hil, ho, hr, ht, hu, hy, id, ig, ilo, is, iso, it, ja, jap, ka, kab, kg, kj, kl, ko, kqn, kwn, kwy, lg, ln, loz, lt, lu, lua, lue, lun, luo, lus, lv, map, mfe, mfs, mg, mh, mk, mkh, ml, mos, mr, ms, mt, mul, ng, nic, niu, nl, no, nso, ny, nyk, om, pa, pag, pap, phi, pis, pl, pon, poz, pqe, pqw, prl, pt, rn, rnd, ro, roa, ru, run, rw, sal, sg, sh, sit, sk, sl, sm, sn, sq, srn, ss, ssp, st, sv, sw, swc, taw, tdt, th, ti, tiv, tl, tll, tn, to, toi, tpi, tr, trk, ts, tum, tut, tvl, tw, ty, tzo, uk, umb, ur, ve, vi, vsl, wa, wal, war, wls, xh, yap, yo, yua, zai, zh, zne".split(', ')
})
# Add an `opus_mt` column: filled where the `wiki` code is also an OPUS-MT
# code, missing otherwise (left join keeps every existing row).
#
# `df_merge` is rebuilt from itself, so re-running the cell would otherwise
# yield `opus_mt_x` / `opus_mt_y`. Dropping a stale column first keeps the
# cell idempotent; on a first run the drop is a no-op.
df_merge = pd.merge(
    df_merge.drop(columns=['opus_mt'], errors='ignore'),
    opus_mt, left_on='wiki', right_on='opus_mt', how='left')
df_merge

Unnamed: 0,wiki,name,iso,googletranslate,bing_BCP_47,m2m_100,mbart_50,opus_mt
0,af,Afrikaans,afr,af,af,af,af,af
1,sq,Albanian,sqi,sq,sq,sq,,sq
2,ar,Arabic,"ara,arb",ar,ar,ar,ar,ar
3,an,Aragonese,arg,,,,,
4,hy,Armenian,hye,hy,hy,hy,,hy
...,...,...,...,...,...,...,...,...
99,fy,West Frisian,fry,fy,,fy,,
100,pnb,Western Punjabi,"pnb,pan",,,,,
101,yo,Yoruba,yor,yo,,yo,,yo
102,th,Thai,tha,th,th,th,th,th


In [44]:
# OPUS-MT codes that have no identical code in the `wiki` column.
not_in_wiki = (
    set(opus_mt['opus_mt'].values)
    - set(df_merge['wiki'].values)
)
len(not_in_wiki), not_in_wiki

(124,
 {'aav',
  'aed',
  'alv',
  'am',
  'art',
  'ase',
  'bat',
  'bcl',
  'bem',
  'ber',
  'bi',
  'bnt',
  'bzs',
  'cau',
  'ccs',
  'cel',
  'chk',
  'cpf',
  'crs',
  'csg',
  'csn',
  'cus',
  'dra',
  'ee',
  'efi',
  'eo',
  'euq',
  'fj',
  'fse',
  'gaa',
  'gil',
  'grk',
  'guw',
  'gv',
  'ha',
  'hil',
  'ho',
  'ig',
  'ilo',
  'iso',
  'jap',
  'kab',
  'kg',
  'kj',
  'kl',
  'kqn',
  'kwn',
  'kwy',
  'lg',
  'ln',
  'loz',
  'lu',
  'lua',
  'lue',
  'lun',
  'luo',
  'lus',
  'map',
  'mfe',
  'mfs',
  'mh',
  'mkh',
  'mos',
  'mt',
  'mul',
  'ng',
  'nic',
  'niu',
  'nso',
  'ny',
  'nyk',
  'om',
  'pag',
  'pap',
  'phi',
  'pis',
  'pon',
  'poz',
  'pqe',
  'pqw',
  'prl',
  'rn',
  'rnd',
  'roa',
  'run',
  'rw',
  'sal',
  'sg',
  'sit',
  'sm',
  'sn',
  'srn',
  'ss',
  'ssp',
  'st',
  'swc',
  'taw',
  'tdt',
  'ti',
  'tiv',
  'tll',
  'tn',
  'to',
  'toi',
  'tpi',
  'trk',
  'ts',
  'tum',
  'tut',
  'tvl',
  'tw',
  'ty',
  'tzo',
  'umb',
 

In [45]:
# Persist the combined table as tab-separated text, without the row index.
df_merge.to_csv(
    '../dataset/languages_mapping.txt',
    sep='\t',
    index=False,
)