## Global modules import

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import json
import numpy as np
import random as rnd
import sys
import torch

from sklearn.model_selection import train_test_split
from operator import itemgetter

## Local modules import

In [3]:
# Put the parent directory on the import path so that the project-local
# modules imported further down (e.g. `data_loading`) can be resolved.
# NOTE(review): presumably those modules live one level above this notebook — confirm.
sys.path.append('..')

## Loading data

In [4]:
from data_loading import create_word_lists, tidy_sentence_length

In [5]:
# Parse the corpus file and keep only what is stored under its 'records' key.
with open('../data/corpus_data.json') as corpus_file:
    data = json.load(corpus_file)['records']

In [6]:
# Pull both transcript variants out of every record, preserving record order.
human_transcripts = list(map(itemgetter('human_transcript'), data))
stt_transcripts = list(map(itemgetter('stt_transcript'), data))

In [7]:
# Unpack the five values returned by the project helper `create_word_lists`.
(human_words,
 stt_words,
 word_labels,
 word_grams,
 word_sems) = create_word_lists(data)

Some of the sentences are too long, so we need to shorten them. Each sentence is simply its individual words joined by spaces, without any punctuation, so sentences are reconstructed from the word lists when necessary.

In [8]:
# Re-bind the transcripts and the word-level lists to the values returned by
# `tidy_sentence_length`; the previous values of these names are overwritten.
(stt_transcripts,
 stt_words,
 word_labels,
 word_grams,
 word_sems) = tidy_sentence_length(
    stt_transcripts, stt_words, word_labels, word_grams, word_sems
)

In [9]:
# Length of the longest label row.
max_length = max(len(row) for row in word_labels)

# Right-pad every row with False so the rows form a rectangular array.
padded_labels = np.array(
    [row + [False] * (max_length - len(row)) for row in word_labels]
)

# One flag per sentence: True when at least one of its word labels is truthy.
stat_labels = padded_labels.any(axis=1)

Here, we split only indices and not data itself, because the data contains arrays of variable length, which does not work with `train_test_split`:

In [10]:
# Positions of all sentences; only these positions are split, not the data itself.
indices = [*range(len(stt_transcripts))]

# Fixed-seed, shuffled 80/20 split, stratified on the per-sentence flag.
tr_indices, te_indices = train_test_split(
    indices,
    test_size=0.2,
    random_state=0,
    shuffle=True,
    stratify=stat_labels,
)

These are helper functions that extract the data selected by the indices:

In [11]:
# One `itemgetter` per partition; applied to a sequence, each returns the items
# at that partition's indices.
extract_train, extract_test = (
    itemgetter(*selected) for selected in (tr_indices, te_indices)
)

Finally, do data splitting:

In [12]:
# Training partition: apply the training extractor to each of the five sequences.
(tr_stt_transcripts,
 tr_stt_words,
 tr_word_labels,
 tr_word_grams,
 tr_word_sems) = map(
    extract_train,
    (stt_transcripts, stt_words, word_labels, word_grams, word_sems),
)

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

# Test partition: same sequences, test extractor.
(te_stt_transcripts,
 te_stt_words,
 te_word_labels,
 te_word_grams,
 te_word_sems) = map(
    extract_test,
    (stt_transcripts, stt_words, word_labels, word_grams, word_sems),
)

In [13]:
from googletrans import Translator

In [15]:
# Training sentences that already carry at least one truthy word label are kept
# as they are; all remaining sentences are queued for translation.
new_tr_words = []
new_tr_word_labels = []
to_translate_words = []

for sentence, labels in zip(tr_stt_words, tr_word_labels):
    if not any(labels):
        to_translate_words.append(sentence)
        continue
    new_tr_words.append(sentence)
    new_tr_word_labels.append(labels)
    

In [16]:
import random as rnd
from tqdm import tqdm

In [17]:
def translate_sentences(sentences, n_untranslated=0, *, translator=None,
                        keep_prob=0.1, translate_prob=0.2):
    """Randomly replace words of the given sentences with their translations.

    Each sentence is a list of words. With probability ``keep_prob`` a sentence
    is passed through unchanged and all of its words are labelled 0. Otherwise
    every word is, independently and with probability ``translate_prob``, sent
    to ``translator.translate(word, src='en', dest='de')``; on success the
    returned ``.text`` replaces the word and is labelled 1. Words that are not
    selected, or whose translation raises, are kept and labelled 0.

    Randomness comes from the module-level ``rnd`` generator, so calling
    ``rnd.seed(...)`` beforehand makes the selection reproducible.

    Parameters
    ----------
    sentences : iterable of list of str
        Sentences to process, each given as a list of words.
    n_untranslated : int, optional
        Starting value of the failure counter (default 0).
    translator : object, optional
        Object exposing ``translate(word, src=..., dest=...)`` whose result has
        a ``text`` attribute. A new ``Translator()`` is created when omitted.
    keep_prob : float, optional
        Probability of leaving a whole sentence untouched (default 0.1).
    translate_prob : float, optional
        Per-word probability of attempting a translation (default 0.2).

    Returns
    -------
    translations : list of list of str
        The processed sentences, in input order.
    tr_labels : list of list of int
        Word-level labels aligned with ``translations`` (1 = translated).
    n_untranslated : int
        ``n_untranslated`` plus the number of words whose translation raised.
    """
    if translator is None:
        translator = Translator()
    translations, tr_labels = [], []

    for sentence in tqdm(sentences):

        # Leave a fraction of the sentences completely untouched.
        if rnd.random() < keep_prob:
            translations.append(sentence)
            tr_labels.append([0] * len(sentence))
            continue

        new_sentence = []
        new_labels = []

        for word in sentence:
            if rnd.random() < translate_prob:
                try:
                    translated = translator.translate(word, src='en', dest='de').text
                except Exception:
                    # Best effort: keep the original word and count the failure.
                    # `Exception` (not a bare `except`) lets KeyboardInterrupt and
                    # SystemExit propagate, so a long run can still be stopped.
                    n_untranslated += 1
                    new_sentence.append(word)
                    new_labels.append(0)
                else:
                    new_sentence.append(translated)
                    new_labels.append(1)
            else:
                new_sentence.append(word)
                new_labels.append(0)

        translations.append(new_sentence)
        tr_labels.append(new_labels)

    return translations, tr_labels, n_untranslated


In [18]:
# Long-running cell: every selected word goes through `Translator.translate`.
# The recorded run below took about 30 minutes for 4676 sentences.
# `n` is the number of words whose translation raised an exception.
translations, tr_labels, n = translate_sentences(to_translate_words)

  0%|          | 0/4676 [00:00<?, ?it/s]

100%|██████████| 4676/4676 [29:44<00:00,  2.62it/s]  


In [19]:
# NOTE(review): `n` counts individual words whose translation failed, while the
# denominator is the number of sentences, so this prints failed words per
# sentence rather than a failure rate — confirm this is the intended metric.
print(n/len(to_translate_words))

0.5241659538066724


In [20]:
# Build new lists with the translated sentences (and their labels) appended
# after the sentences that were kept as they are.
# NOTE: the names are re-bound to the longer lists, so running this cell a
# second time appends the translations again.
new_tr_words = [*new_tr_words, *translations]
new_tr_word_labels = [*new_tr_word_labels, *tr_labels]

In [24]:
# Sanity checks on the merged training set:
#   1. there are as many label rows as sentences;
#   2. every sentence has exactly one label per word (offending sentences are printed).
# The loop iterates over `new_tr_word_labels`; the previously used name
# `new_word_labels` is not defined anywhere in this notebook and raises a
# NameError on a fresh kernel.
print(len(new_tr_words) == len(new_tr_word_labels))
for w, l in zip(new_tr_words, new_tr_word_labels):
    if len(w) != len(l):
        print(w)

True


In [25]:
import os
import pickle

In [27]:
# Directory (relative to this notebook) that receives the pickled word and label lists.
out_path = '../intermediate_data/translator_basic'

In [28]:
# Create the output directory on demand so the dumps below do not fail with
# FileNotFoundError when the directory does not exist yet.
os.makedirs(out_path, exist_ok=True)

# (file name, object) pairs: the augmented training set first, then the
# untouched test partition.
for pkl_name, payload in (
    ('words_higher_perc.pkl', new_tr_words),
    ('labels_higher_perc.pkl', new_tr_word_labels),
    ('test_words_higher_perc.pkl', te_stt_words),
    ('test_labels_higher_perc.pkl', te_word_labels),
):
    with open(os.path.join(out_path, pkl_name), 'wb') as file:
        pickle.dump(payload, file)

In [30]:
# Fraction of positively labelled words in the augmented training set:
# total of all label values divided by the total number of labels.
num_words = sum(len(row) for row in new_tr_word_labels)
num_germans = sum(sum(row) for row in new_tr_word_labels)
print(num_germans/num_words)

0.11717936393495153


In [34]:
# Same fraction as above, computed on the labels of the full data set
# (`word_labels`) for comparison.
num_words = sum(len(row) for row in word_labels)
num_germans = sum(sum(row) for row in word_labels)
print(num_germans/num_words)

0.029981542412326458


In [37]:
# Eyeball the last 50 training sentences; the tail of the list holds the
# sentences returned by `translate_sentences`, which were appended last.
print(new_tr_words[-50:])

[['0'], ['when', 'i', 'hang', 'on', 'my', 'friends', 'ich', 'go', 'to', 'drink', 'a', 'beer', 'Und', 'then', 'we', 'go', 'to', 'the', 'beach', 'we', 'nehmen', 'a', 'swim', 'we', 'serf', 'and', 'Wir', 'gehen', 'eat', 'let', 'the', 'day', 'and', 'what', 'do', 'you', 'actually', 'do', 'in', 'you', 'of', 'what', 'do', 'you', 'do', 'actually', 'in', 'school', 'in', 'your', 'school'], ['i', 'can', 'see', '7', 'balls', 'of', 'the', 'picture'], ['and', 'what', 'is', 'your', 'favorite', 'food'], ['or', 'rink'], ['it', 'is', 'something', 'where', 'you', 'can', 'drive', 'or', 'you', 'can', 'ich'], ['my', 'name', 'is', 'Pluto', 'and', 'what', 'is', 'your', 'name'], ['the', 'people', 'has', 'blue', 'jeans', 'and', 'a', 'purple', 't', 'shirt'], ['is', 'it', 'a', 'reporter'], ['i', 'like', 'both', 'because', 'of', 'In', 'on', 'the', 'beach', 'you', 'can', 'you', 'can', 'swim', 'and', 'you', 'can', 'you', 'can', 'lie', 'in', 'the', 'sun', 'and', 'it', 'is', 'hot', 'there', 'but', 'ich', 'also', 'like'