In [87]:
import re
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import unidecode
from keras.callbacks import EarlyStopping,ModelCheckpoint
from keras.layers import Input,Embedding,Dense,LSTM,TimeDistributed,Concatenate
from keras.models import Model,load_model
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from sklearn.model_selection import train_test_split
%matplotlib inline

warnings.filterwarnings('ignore')

In [88]:
def read_text(filename):
    """Return the entire contents of a UTF-8 text file as one string.

    Parameters
    ----------
    filename : str or path-like
        Path of the file to read.

    Returns
    -------
    str
        The decoded contents of the file.

    Raises
    ------
    OSError
        If the file cannot be opened.
    UnicodeDecodeError
        If the contents are not valid UTF-8.
    """
    # The context manager closes the handle even if read() raises.
    with open(filename,mode='rt',encoding='utf-8') as file:
        return file.read()

In [89]:
def to_lines(text):
    """Split raw text into rows on newlines and each row into tab-separated fields.

    A trailing newline in ``text`` produces a final ``['']`` row.
    """
    return [row.split('\t') for row in text.split('\n')]

In [90]:
# Load the raw tab-separated corpus and split it into rows of fields.
text=read_text('spa-eng/spa.txt')
spa_eng=to_lines(text)

In [91]:
# Preview only the first rows; echoing the whole list would print every parsed line.
spa_eng[:10]

[['Go.',
  'Ve.',
  'CC-BY 2.0 (France) Attribution: tatoeba.org #2877272 (CM) & #4986655 (cueyayotl)'],
 ['Go.',
  'Vete.',
  'CC-BY 2.0 (France) Attribution: tatoeba.org #2877272 (CM) & #4986656 (cueyayotl)'],
 ['Go.',
  'Vaya.',
  'CC-BY 2.0 (France) Attribution: tatoeba.org #2877272 (CM) & #4986657 (cueyayotl)'],
 ['Go.',
  'Váyase.',
  'CC-BY 2.0 (France) Attribution: tatoeba.org #2877272 (CM) & #6586271 (arh)'],
 ['Hi.',
  'Hola.',
  'CC-BY 2.0 (France) Attribution: tatoeba.org #538123 (CM) & #431975 (Leono)'],
 ['Run!',
  '¡Corre!',
  'CC-BY 2.0 (France) Attribution: tatoeba.org #906328 (papabear) & #1685404 (Elenitigormiti)'],
 ['Run!',
  '¡Corran!',
  'CC-BY 2.0 (France) Attribution: tatoeba.org #906328 (papabear) & #5213896 (cueyayotl)'],
 ['Run!',
  '¡Corra!',
  'CC-BY 2.0 (France) Attribution: tatoeba.org #906328 (papabear) & #8005613 (Seael)'],
 ['Run!',
  '¡Corred!',
  'CC-BY 2.0 (France) Attribution: tatoeba.org #906328 (papabear) & #8005615 (Seael)'],
 ['Run.',
  'Corre

In [92]:
# Keep just the first two fields of every row, dropping any trailing
# fields such as the attribution text. Slice assignment updates the
# existing list in place.
spa_eng[:]=[fields[:2] for fields in spa_eng]

In [93]:
# Preview only the first rows of the trimmed pairs instead of echoing the whole list.
spa_eng[:10]

[['Go.', 'Ve.'],
 ['Go.', 'Vete.'],
 ['Go.', 'Vaya.'],
 ['Go.', 'Váyase.'],
 ['Hi.', 'Hola.'],
 ['Run!', '¡Corre!'],
 ['Run!', '¡Corran!'],
 ['Run!', '¡Corra!'],
 ['Run!', '¡Corred!'],
 ['Run.', 'Corred.'],
 ['Who?', '¿Quién?'],
 ['Wow!', '¡Órale!'],
 ['Fire!', '¡Fuego!'],
 ['Fire!', '¡Incendio!'],
 ['Fire!', '¡Disparad!'],
 ['Help!', '¡Ayuda!'],
 ['Help!', '¡Socorro! ¡Auxilio!'],
 ['Help!', '¡Auxilio!'],
 ['Jump!', '¡Salta!'],
 ['Jump.', 'Salte.'],
 ['Stop!', '¡Parad!'],
 ['Stop!', '¡Para!'],
 ['Stop!', '¡Pare!'],
 ['Wait!', '¡Espera!'],
 ['Wait.', 'Esperen.'],
 ['Go on.', 'Continúa.'],
 ['Go on.', 'Continúe.'],
 ['Hello!', 'Hola.'],
 ['Hurry!', '¡Date prisa!'],
 ['Hurry!', '¡Daos prisa!'],
 ['Hurry!', 'Dese prisa.'],
 ['I hid.', 'Me oculté.'],
 ['I hid.', 'Me escondí.'],
 ['I hid.', 'Me ocultaba.'],
 ['I hid.', 'Me escondía.'],
 ['I ran.', 'Corrí.'],
 ['I ran.', 'Corría.'],
 ['I try.', 'Lo intento.'],
 ['I won!', '¡He ganado!'],
 ['Oh no!', '¡Oh, no!'],
 ['Relax.', 'Tomátelo con soda

In [94]:
# Number of rows produced by splitting the file on '\n'.
print(len(spa_eng))

125447


In [95]:
# Keep only rows that actually hold a sentence pair. Splitting a file that
# ends with a newline yields a final [''] row; filtering on length removes it
# (and any other malformed row) without blindly discarding the last entry.
sentence_pairs=[list(row[:2]) for row in spa_eng if len(row)>=2]

# First field is the English sentence, second field its Spanish counterpart.
eng_l=[pair[0] for pair in sentence_pairs]
spa_l=[pair[1] for pair in sentence_pairs]

# Every retained row has exactly two fields, so the array is rectangular;
# dtype=object stores the Python strings as-is rather than a fixed-width
# unicode block sized to the longest sentence.
spa_eng=np.asarray(sentence_pairs,dtype=object)

In [96]:
# Two-column frame built from the parallel lists: 'eng' and 'spa'.
data=pd.DataFrame({'eng':eng_l,'spa':spa_l})

In [97]:
# Inspect the last rows of the sentence-pair frame.
data.tail()

Unnamed: 0,eng,spa
125441,There are four main causes of alcohol-related ...,Hay cuatro causas principales de muertes relac...
125442,There are mothers and fathers who will lie awa...,Hay madres y padres que se quedan despiertos d...
125443,A carbon footprint is the amount of carbon dio...,Una huella de carbono es la cantidad de contam...
125444,Since there are usually multiple websites on a...,Como suele haber varias páginas web sobre cual...
125445,"If you want to sound like a native speaker, yo...","Si quieres sonar como un hablante nativo, debe..."


In [98]:
# Drop rows whose Spanish sentence has already appeared; only the 'spa'
# column is compared, so the first English pairing of each one is retained.
data=data.drop_duplicates(subset=['spa'])

In [99]:
# Row count after removing duplicate Spanish sentences.
len(data)

119035

In [100]:
# function to preprocess the text
def cleaner(text):
    """Normalize a sentence to lowercase ASCII letters separated by single spaces.

    Steps, in order: lowercase the text, transliterate it with ``unidecode``,
    delete apostrophes (so contractions stay one word), turn every remaining
    non-letter character into a space, and collapse runs of whitespace.
    """
    ascii_text = unidecode.unidecode(text.lower())
    without_apostrophes = ascii_text.replace("'", "")
    letters_only = re.sub("[^a-zA-Z]", " ", without_apostrophes)
    # split() with no argument discards leading/trailing/repeated whitespace,
    # so the joined result needs no further trimming.
    return " ".join(letters_only.split())

In [101]:
# Add normalized versions of both columns by applying cleaner() to every sentence.
data['cleaned_eng']=data['eng'].apply(cleaner)
data['cleaned_spa']=data['spa'].apply(cleaner)

In [102]:
# Inspect the last rows again, now including the cleaned columns.
data.tail()

Unnamed: 0,eng,spa,cleaned_eng,cleaned_spa
125441,There are four main causes of alcohol-related ...,Hay cuatro causas principales de muertes relac...,there are four main causes of alcohol related ...,hay cuatro causas principales de muertes relac...
125442,There are mothers and fathers who will lie awa...,Hay madres y padres que se quedan despiertos d...,there are mothers and fathers who will lie awa...,hay madres y padres que se quedan despiertos d...
125443,A carbon footprint is the amount of carbon dio...,Una huella de carbono es la cantidad de contam...,a carbon footprint is the amount of carbon dio...,una huella de carbono es la cantidad de contam...
125444,Since there are usually multiple websites on a...,Como suele haber varias páginas web sobre cual...,since there are usually multiple websites on a...,como suele haber varias paginas web sobre cual...
125445,"If you want to sound like a native speaker, yo...","Si quieres sonar como un hablante nativo, debe...",if you want to sound like a native speaker you...,si quieres sonar como un hablante nativo debes...


In [103]:
# train-test split
# Hold out 20% of the pairs for validation. The cleaned Spanish text is x and
# the cleaned English text is y; random_state fixes the shuffle so the split
# is reproducible.
x_tr, x_val, y_tr, y_val = train_test_split(data['cleaned_spa'],data['cleaned_eng'], test_size = 0.2, random_state=0,shuffle=True)

In [104]:
# storing validation data into new variables for later use
# Re-index the validation split from 0 so positional and label lookups agree.
x_val = x_val.reset_index(drop=True)
y_val = y_val.reset_index(drop=True)

# Take real copies: a plain assignment would only create a second name for
# the same Series, so any later change to x_val / y_val would also alter
# the "original" text.
x_val_original = x_val.copy()
y_val_original = y_val.copy()

In [105]:
def build_vocab(texts):
    """Count how often each whitespace-separated word occurs in a Series of strings.

    Returns a plain dict mapping word -> count, with keys in order of first
    appearance.
    """
    counts = {}
    tokenized = texts.apply(lambda sentence: sentence.split())
    for tokens in tokenized.values:
        for token in tokens:
            counts[token] = counts.get(token, 0) + 1
    return counts

# Word -> frequency dictionary for the source text, counted on the
# training split only (x_tr).
source_word_freq = build_vocab(x_tr)

In [106]:
# set threshold value for rare words
thresh=2

# Frequencies of the source words that occur fewer than `thresh` times.
source_rare_freqs=[count for count in source_word_freq.values() if count<thresh]

cnt=len(source_rare_freqs)               # number of distinct rare words
tot_cnt=len(source_word_freq)            # number of distinct words overall
freq=sum(source_rare_freqs)              # occurrences contributed by rare words
tot_freq=sum(source_word_freq.values())  # occurrences of all words

print("% of rare words in vocabulary:",(cnt/tot_cnt)*100)
print("Total Coverage of rare words:",(freq/tot_freq)*100)

% of rare words in vocabulary: 42.380621031573455
Total Coverage of rare words: 1.6719424213569412


In [107]:
# Words seen at least `thresh` times get consecutive ids starting at 2, in
# the order they appear in source_word_freq; ids 0 and 1 are reserved for
# the padding and unknown-word markers.
source_frequent_words=[word for word,count in source_word_freq.items() if count>=thresh]
source_word_index={word:idx for idx,word in enumerate(source_frequent_words,start=2)}
cnt=len(source_frequent_words)+2  # next unused id
source_word_index['<pad>']=0
source_word_index['<unk>']=1

In [108]:
# Vocabulary as a list, kept because a later cell reads it. Membership tests
# below go through the dict instead: that is a constant-time lookup, whereas
# `in` on this list would scan the whole vocabulary for every token.
source_vocab=list(source_word_freq)

source_unk_id=source_word_index['<unk>']

# Encode every training sentence as a list of word ids. A word that is absent
# from the training vocabulary, or occurs fewer than `thresh` times, becomes <unk>.
source_seq_tr=[
    [
        source_word_index[word]
        if word in source_word_freq and source_word_freq[word]>=thresh
        else source_unk_id
        for word in sentence.split()
    ]
    for sentence in x_tr
]

In [109]:
# Preview the first 10 encoded training sentences (id 1 is <unk>).
source_seq_tr[:10]

[[2, 3, 4, 5, 6],
 [7, 8, 9],
 [10, 11, 12, 3, 13, 14, 12, 15],
 [16, 17, 18, 19, 20, 21],
 [22, 23, 24],
 [3, 25, 26, 27, 28, 29, 30, 31, 32],
 [33, 34, 35, 36],
 [37, 38, 39, 40, 1],
 [41, 1],
 [33, 42, 3, 43, 44, 45, 46, 47]]

In [110]:
source_unk_id=source_word_index['<unk>']

# Encode every validation sentence with the vocabulary built from the
# training split. A word that was never seen in training, or was seen fewer
# than `thresh` times, becomes <unk>. Membership is tested on the frequency
# dict (constant-time) rather than by scanning a list of all words.
source_seq_val=[
    [
        source_word_index[word]
        if word in source_word_freq and source_word_freq[word]>=thresh
        else source_unk_id
        for word in sentence.split()
    ]
    for sentence in x_val
]

In [111]:
# Preview the first 10 encoded validation sentences (id 1 is <unk>).
source_seq_val[:10]

[[64, 5, 2668, 3, 4865, 3381, 109, 354],
 [64, 399, 2579],
 [170, 171, 1576, 2496, 109, 765],
 [204, 27, 97, 267],
 [944, 1098, 24, 27, 2487],
 [932, 8, 2749],
 [191, 27, 231, 2464, 135, 13, 135, 97, 1204, 1, 30, 51, 2485],
 [33, 1516, 109, 765, 1248, 136, 234],
 [109, 1453, 12, 97, 8024, 48, 49, 11426, 30, 149, 395],
 [64, 823, 2221, 120]]

In [112]:
# Word -> frequency dictionary for the target text, counted on the
# training split only (y_tr).
target_word_freq = build_vocab(y_tr)

In [113]:
# set threshold value for rare words
thresh=2

# Frequencies of the target words that occur fewer than `thresh` times.
target_rare_freqs=[count for count in target_word_freq.values() if count<thresh]

cnt=len(target_rare_freqs)               # number of distinct rare words
tot_cnt=len(target_word_freq)            # number of distinct words overall
freq=sum(target_rare_freqs)              # occurrences contributed by rare words
tot_freq=sum(target_word_freq.values())  # occurrences of all words

print("% of rare words in vocabulary:",(cnt/tot_cnt)*100)
print("Total Coverage of rare words:",(freq/tot_freq)*100)

% of rare words in vocabulary: 33.75910914599198
Total Coverage of rare words: 0.6841132412332954


In [114]:
# Words seen at least `thresh` times get consecutive ids starting at 2, in
# the order they appear in target_word_freq; ids 0 and 1 are reserved for
# the padding and unknown-word markers.
target_frequent_words=[word for word,count in target_word_freq.items() if count>=thresh]
target_word_index={word:idx for idx,word in enumerate(target_frequent_words,start=2)}
cnt=len(target_frequent_words)+2  # next unused id
target_word_index['<pad>']=0
target_word_index['<unk>']=1

In [116]:
# Vocabulary as a list, kept because a later cell reads it. Membership tests
# below go through the dict instead: that is a constant-time lookup, whereas
# `in` on this list would scan the whole vocabulary for every token.
target_vocab=list(target_word_freq)

target_unk_id=target_word_index['<unk>']

# Encode every training sentence as a list of word ids. A word that is absent
# from the training vocabulary, or occurs fewer than `thresh` times, becomes <unk>.
target_seq_tr=[
    [
        target_word_index[word]
        if word in target_word_freq and target_word_freq[word]>=thresh
        else target_unk_id
        for word in sentence.split()
    ]
    for sentence in y_tr
]

In [117]:
# Preview the first 10 encoded training targets (id 1 is <unk>).
target_seq_tr[:10]

[[2, 3, 4, 5, 6],
 [7, 8, 9, 10, 11],
 [12, 13, 6, 14, 15, 16, 17, 18],
 [19, 20, 21, 22, 23, 24, 25, 26, 27, 24, 28],
 [8, 29, 30, 31, 8],
 [32, 33, 34, 19, 24, 35, 36, 37, 38, 39],
 [40, 41, 42, 43],
 [44, 45, 46, 47, 48],
 [29, 49],
 [40, 50, 51, 52, 53, 54, 55, 56]]

In [118]:
target_unk_id=target_word_index['<unk>']

# Encode every validation sentence with the vocabulary built from the
# training split. A word that was never seen in training, or was seen fewer
# than `thresh` times, becomes <unk>. Membership is tested on the frequency
# dict (constant-time) rather than by scanning a list of all words.
target_seq_val=[
    [
        target_word_index[word]
        if word in target_word_freq and target_word_freq[word]>=thresh
        else target_unk_id
        for word in sentence.split()
    ]
    for sentence in y_val
]

In [119]:
# Preview the first 10 encoded validation targets (id 1 is <unk>).
target_seq_val[:10]

[[186, 2949, 51, 2, 239, 1043, 55, 367, 280, 1621],
 [186, 1478, 3172],
 [190, 191, 122, 700, 556, 1967],
 [212, 36, 286],
 [51, 958, 160, 1123],
 [146, 341, 10, 6688],
 [157, 34, 65, 1944, 561, 131, 27, 2363, 1990, 853, 10, 511],
 [40, 526, 122, 700, 148, 529, 1083],
 [122, 1224, 70, 242, 3079, 168, 64, 280, 157, 405],
 [186, 755, 924, 513]]