# Train human neural networks to translate

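For any data analysis on translation data from German to English and vice versa,
see https://nlp.stanford.edu/projects/nmt/. All the data from that source should
be downloaded into `large_files/translation_model`, a directory located next to
the current working directory; this is where the code below reads it from.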

In [None]:
import os
from tqdm.auto import tqdm
import pandas as pd

In [None]:
# The data directory is a sibling of the current working directory:
# <parent of cwd>/large_files/translation_model. os.path is used instead of
# slicing the path on '/' so the lookup does not fail when the working
# directory contains no forward slash (e.g. on Windows).
large_files_path = os.path.join(
    os.path.dirname(os.getcwd()), 'large_files', 'translation_model'
)

# CSV file with the German/English sentence pairs.
translation_csv = os.path.join(large_files_path, 'translation.csv')

# Recipe for (re)generating translation.csv from the raw train.en / train.de
# files. It sits inside a string literal, so it is not executed.
"""
en = open(f'{large_files_path}/train.en', 'rb').read()
de = open(f'{large_files_path}/train.de', 'rb').read()

en = en.decode('utf-8').split('\n')
de = de.decode('utf-8').split('\n')

data = pd.DataFrame({
    'german': de,
    'english': en
})

data.to_csv(translation_csv)
"""

# Load the sentence pairs, remove the 'Unnamed: 0' column (the index that the
# recipe's to_csv call writes out) and discard rows with missing values.
data = pd.read_csv(translation_csv)
data.drop(columns=['Unnamed: 0'], inplace=True)
data.dropna(inplace=True)

In [None]:
# calculate word frequency
from collections import Counter

# NOTE(review): all_words is not referenced by any cell visible here.
all_words = []

# Count how often each lower-cased, space-separated token occurs in the
# German sentences. The last two rows are left out of the count.
# NOTE(review): the reason for skipping them is not evident from this cell.
wordcount = Counter(
    token
    for text in tqdm(data['german'].iloc[:-2])
    for token in map(str.lower, text.split(' '))
)

In [None]:
# sort the sentences by frequency


def frequency(sentence, *, min_length=30, counts=None):
    """Score a sentence by the count of its rarest word.

    The sentence is split on single spaces, each word is lower-cased and
    looked up in a word-count table; the smallest count found is the score.
    A high score therefore means every word of the sentence is common.

    Args:
        sentence: Sentence to score. Anything that is not a ``str`` (for
            example a NaN cell) scores 0.
        min_length: Sentences with fewer characters than this score 0.
        counts: Mapping from lower-cased word to its number of occurrences.
            Defaults to the module-level ``wordcount``.

    Returns:
        The minimum count over the words of ``sentence``; words missing
        from the table count as 0. Returns 0 for non-string or too-short
        input.
    """
    if not isinstance(sentence, str) or len(sentence) < min_length:
        return 0
    if counts is None:
        # Looked up at call time so a rebuilt ``wordcount`` is picked up.
        counts = wordcount
    return min(counts.get(word.lower(), 0) for word in sentence.split(' '))

# Spot-check the scorer on the German sentence at row position 100000.
sentence = data.iloc[100000]['german']
print(frequency(sentence), sentence, sep=' ')

In [None]:
# Score every German sentence with frequency(); F is a Series of scores.
F = data.loc[:, 'german'].apply(frequency)

In [None]:
# Display the column labels of the frame.
data.columns

In [None]:
# Attach the per-sentence scores held in F as a 'frequency' column.
data['frequency'] = F

In [None]:
# Reorder the frame in place so the highest 'frequency' scores come first.
data.sort_values(by='frequency', ascending=False, inplace=True)

In [None]:
# Inspect the sentence pair and its score at row position 4000004.
row = data.iloc[4000004]
print(row['german'], row['english'], row['frequency'], sep='\n')

In [None]:
# Keep only the rows whose 'frequency' score is above 1000.
data_valid = data.loc[data['frequency'] > 1000]

In [None]:
# Display (rows, columns) of the filtered frame.
data_valid.shape

In [None]:
# Display (rows, columns) of the full frame.
data.shape

In [None]:
import json
# Export a random sample of up to 40000 sentence pairs from data_valid.
# The sample size is capped at len(data_valid) because sample() raises a
# ValueError when asked for more rows than exist (without replacement).
# to_dict('records') yields one {'english': ..., 'german': ...} dict per row,
# the same structure a row-by-row iterrows() loop would build, without
# creating a Series object for every row.
result = (
    data_valid[['english', 'german']]
    .sample(min(40000, len(data_valid)))
    .to_dict('records')
)

with open('sentences_en_de.json', 'w') as f:
    json.dump(result, f)

## Testing translation capabilities through the notebook

Simply sample some sentences below, and try to translate them.

In [None]:
# Draw one random sentence pair and show its English side.
row = data_valid.sample(n=1)
row['english'].iloc[0]

In [None]:
# Show the German side of the sampled pair.
row['german'].iloc[0]