#### ***multilingual reverse dictionary dataset generation***

In [None]:
import os
import copy
import json
import random
import threading
from googletrans import Translator

In [None]:
# Directory that holds the English source dataset.
data_dir = 'data/'
# English source file, read from `data_dir`.
data_src_file = 'data_train.json'
# Base name of the generated files; per-language outputs prefix it with the language code.
data_target_file = 'train.json'

In [None]:
# Number of definitions sent to the translator per request in `collect`.
bulk_size = 5
# Number of English samples drawn (and then translated) for each target language.
lang_size = 20000
# Destination language codes handed to the translator as `dest`.
langs = ('pt', 'it', 'zh-cn')

In [None]:
# Single googletrans client; `collect` reads it as a global, so every worker thread uses this instance.
translator = Translator()

In [None]:
# Load the English source dataset; the context manager closes the file
# handle as soon as parsing is done.
with open(os.path.join(data_dir, data_src_file), 'r', encoding='utf8') as src_file:
    data_en = json.load(src_file)
# Definition text of every English entry, in dataset order.
defs_en = [entry['definitions'] for entry in data_en]
data_en_size = len(data_en)
assert data_en_size == len(defs_en)

In [None]:
# For each target language, draw `lang_size` English entries.
# `random.choices` samples with replacement, so an entry may appear more than once.
lang_samples = {
    target: random.choices(data_en, k=lang_size) for target in langs
}
assert all(len(drawn) == lang_size for drawn in lang_samples.values())

In [None]:
def collect(name, lang):
    """Translate the sampled English definitions into ``lang`` and save them.

    Walks ``lang_samples[lang]`` in slices of ``bulk_size``, sends the
    definitions of each slice to ``translator`` (English -> ``lang``) and
    keeps deep copies of the samples whose ``'definitions'`` field is
    replaced by the translated text. After every successful slice the
    accumulated result is rewritten to ``<lang>-<data_target_file>``, so an
    interrupted run still leaves the translations gathered so far on disk.

    A slice whose translation request fails, or that comes back with a
    different number of results than were sent, is reported and skipped.

    Args:
        name: Label used only in the progress messages (the caller passes
            the thread index).
        lang: Destination language code; also the key into ``lang_samples``
            and the prefix of the output file name.

    Returns:
        None. The result is only written to disk.
    """
    out_path = lang + '-' + data_target_file
    data_lang = []
    print(f'\tstarting: {name} {lang}')
    for i in range(0, lang_size, bulk_size):
        lang_samples_bulk = lang_samples[lang][i:i+bulk_size]
        lang_defs_bulk = [sample['definitions'] for sample in lang_samples_bulk]
        try:
            translations = translator.translate(lang_defs_bulk, src='en', dest=lang)
            lang_defs_bulk_trans = [item.text for item in translations]
        except Exception as err:
            # Best effort: a failed request drops this slice only, but the
            # failure is reported instead of being silently swallowed.
            print(f'\tskipping {lang} bulk at offset {i}: {err!r}')
            continue
        if len(lang_defs_bulk_trans) != len(lang_samples_bulk):
            # Pairing by position would be wrong (or raise) on a count mismatch.
            print(f'\tskipping {lang} bulk at offset {i}: '
                  f'expected {len(lang_samples_bulk)} translations, '
                  f'got {len(lang_defs_bulk_trans)}')
            continue
        # Deep copy so the shared English entries are never modified.
        lang_samples_bulk_trans = copy.deepcopy(lang_samples_bulk)
        # zip pairs by actual slice length, so a final slice shorter than
        # bulk_size no longer raises IndexError.
        for sample, translated in zip(lang_samples_bulk_trans, lang_defs_bulk_trans):
            sample['definitions'] = translated
        data_lang.extend(lang_samples_bulk_trans)
        # Checkpoint everything collected so far; `with` closes the handle.
        with open(out_path, 'w', encoding='utf8') as out_file:
            json.dump(data_lang, out_file, ensure_ascii=False, indent=4)
    print(f'\tfinishing: {name} {lang}')

In [None]:
# Start one `collect` worker per target language.
threads = []
for idx, lang in enumerate(langs):
    print(f'Main: create and start thread {idx} {lang}')
    worker = threading.Thread(target=collect, args=(idx, lang))
    worker.start()
    threads.append(worker)

# Wait for the workers in the order they were started.
for idx, (worker, lang_code) in enumerate(zip(threads, langs)):
    print(f'Main: before joining thread {idx} {lang_code}')
    worker.join()
    print(f'Main: thread {idx} {lang_code} done')

#### ***merging data from different languages***

In [None]:
# Same language codes as in the generation section; presumably repeated so this
# section can be run without re-running the cells above (confirm before removing).
langs = ('pt', 'it', 'zh-cn')

Merge instances from different source files for the same language, dropping duplicate (word, definition) pairs.

In [None]:
multilang_dict = {}
# For every language, read the numbered files `<lang>0-<target>`,
# `<lang>1-<target>`, ... until one is missing, tag each sample with its
# language and keep only the first occurrence of each (word, definitions) pair.
# NOTE(review): `collect` writes `<lang>-<target>` (no index), so its output
# presumably has to be renamed before this cell picks it up -- confirm.
for lang in langs:
    print(f'processing {lang}...')
    lang_processed_data, samples_seen, i = [], set(), 0
    file_name = f'{lang}{i}-{data_target_file}'
    while os.path.exists(file_name):
        print('\t' + file_name)
        # Context manager closes the handle once the file is parsed.
        with open(file_name, 'r', encoding='utf8') as src_file:
            data = json.load(src_file)
        for sample in data:
            sample['lang'] = lang
            key = (sample['word'], sample['definitions'])
            if key in samples_seen:
                continue
            samples_seen.add(key)
            lang_processed_data.append(sample)
        i += 1
        file_name = f'{lang}{i}-{data_target_file}'
    print(f'processed {len(lang_processed_data)} {lang} samples')
    with open(lang + '-merged-' + data_target_file, 'w', encoding='utf8') as out_file:
        json.dump(lang_processed_data, out_file, ensure_ascii=False, indent=4)

Merge the different languages and subsample the English data.

In [None]:
# Number of English samples kept in the final dataset.
en_size = 80000
# Sampling is with replacement, so the same entry can be drawn several times.
data_en_subsample = random.choices(data_en, k=en_size)
# Tag each drawn entry as English. The entries are the same dict objects that
# live in `data_en`, so those receive the tag as well.
for entry in data_en_subsample:
    entry.update(lang='en')

In [None]:
# Start from a copy of the English subsample, then append each language's
# merged file in the order given by `langs`.
data = list(data_en_subsample)
for lang in langs:
    print(f'processing {lang}...')
    file_name = f'{lang}-merged-{data_target_file}'
    # Context manager closes each merged file once it is parsed.
    with open(file_name, 'r', encoding='utf8') as merged_file:
        data += json.load(merged_file)
print(f'processed {len(data)} samples')
# Write the combined multilingual dataset; the handle is closed on exit.
with open(data_target_file, 'w', encoding='utf8') as out_file:
    json.dump(data, out_file, ensure_ascii=False, indent=4)