In [89]:
import numpy as np 
import pandas as pd

In [90]:
df = pd.read_csv("./archive/fra.txt" , sep='\n', header=None)
df.head()

Unnamed: 0,0
0,Go.\tVa !\tCC-BY 2.0 (France) Attribution: tat...
1,Hi.\tSalut !\tCC-BY 2.0 (France) Attribution: ...
2,Hi.\tSalut.\tCC-BY 2.0 (France) Attribution: t...
3,Run!\tCours !\tCC-BY 2.0 (France) Attribution:...
4,Run!\tCourez !\tCC-BY 2.0 (France) Attribution...


In [91]:
df.shape

(177210, 1)

In [92]:
df["english"] = df[0].apply(lambda x: x.split('\t')[0])
df["french"] = df[0].apply(lambda x: x.split('\t')[1])  
df.drop(0,inplace=True,axis=1)

In [93]:
df.head()

Unnamed: 0,english,french
0,Go.,Va !
1,Hi.,Salut !
2,Hi.,Salut.
3,Run!,Cours !
4,Run!,Courez !


In [94]:
import string
# here we remouve ponctuation and lower the text
translator = str.maketrans('', '', string.punctuation)
df["english"] = df["english"].apply(lambda x: x.lower().translate(translator))
df["french"] = df["french"].apply(lambda x: x.lower().translate(translator))  
df.head()

Unnamed: 0,english,french
0,go,va
1,hi,salut
2,hi,salut
3,run,cours
4,run,courez


In [95]:
from keras.preprocessing.text import Tokenizer
french_tokenizer = Tokenizer()

In [96]:
french_tokenizer.fit_on_texts(df["french"])

In [97]:
frensh_vocab_size = len(french_tokenizer.word_index) + 1
print(f"the french vocab size is {frensh_vocab_size}")

the french vocab size is 33484


In [98]:
english_tokenizer = Tokenizer()
english_tokenizer.fit_on_texts(df["english"])

In [99]:
english_vocab_size = len(english_tokenizer.word_index) + 1
print(f"the english vocab size is {english_vocab_size}")

the english vocab size is 14783


In [100]:
from keras.preprocessing.sequence import pad_sequences

In [101]:
x = english_tokenizer.texts_to_sequences(df["english"])
y = french_tokenizer.texts_to_sequences(df["french"])
x = pad_sequences(x, padding='post')
y = pad_sequences(y, padding='post')

In [102]:
x = np.array(x)
y = np.array(y)
x.shape,y.shape

((177210, 44), (177210, 55))

In [103]:
# now lets create our decoder encoder architecture
from keras.models import Sequential
from keras.layers import Embedding, Bidirectional, LSTM, RepeatVector, Dense, TimeDistributed

In [16]:
model = Sequential()
# the encoder:
model.add(Embedding(english_vocab_size, 128, input_length=x.shape[1]))
model.add(Bidirectional(LSTM(256)))
# model.add(Dense(256, activation='relu'))
model.add(RepeatVector(y.shape[1]))
# the decoder:
model.add(Bidirectional(LSTM(256, return_sequences=True)))
model.add(Dense(frensh_vocab_size, activation='softmax'))

In [17]:
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [18]:
model.fit(x,y, epochs=30, batch_size=256)

In [19]:
model.save("model.h5")
model.save_weights("model.weights.h5")

In [104]:
from tensorflow.keras.models import load_model
model = load_model("./model.h5")
model.load_weights('./model.weights.h5')

In [105]:
test = ["go","i am hungry",'get on your horse','i made fun of him']
test = english_tokenizer.texts_to_sequences(test)
test = pad_sequences(test,maxlen=44, padding='post')
test

array([[ 42,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0],
       [  1, 114, 540,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0],
       [ 55,  32,  23, 874,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0],
       [  1, 130, 339,  11,  45,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,

In [106]:
y_pred = model.predict(test)
y_pred.shape

(4, 55, 33484)

In [107]:
# take the chosen word for every sentence
y_pred = [[np.argmax(w) for w in sentence] for sentence in y_pred]

In [108]:
y_pred = np.array(y_pred)
y_pred.shape

(4, 55)

In [109]:
# turn it into text
y_pred = french_tokenizer.sequences_to_texts(y_pred)
y_pred

['allez', 'jai faim', 'sors à cheval\xa0', 'je me suis un']

In [110]:
def translate(model, eng_tokenizer,fra_tokenizer, sentence):
    text = [sentence]
    text = eng_tokenizer.texts_to_sequences(text)
    text = pad_sequences(text,maxlen=44, padding='post')
    pred = model.predict(text)
    pred = [[np.argmax(w) for w in sentence] for sentence in pred]
    pred = fra_tokenizer.sequences_to_texts(pred)
    return pred[0]

In [111]:
test_sentences = ["i love you","the work is hard","i need money", "call me later",'the french president is going to visit japan next month','thats the worst thing that could possibly happen']
for i in range(len(test_sentences)):
    print(f"{test_sentences[i]} -----> {translate(model,english_tokenizer,french_tokenizer, test_sentences[i])}")

i love you -----> je taime
the work is hard -----> le travail est difficile
i need money -----> jai besoin de largent
call me later -----> appellemoi pour tard
the french president is going to visit japan next month -----> le président de est en à en mois mois
thats the worst thing that could possibly happen -----> cest la chose qui qui qui puisse


### with more training and a bigger dataset we can achieve a much better accuracy since with only 20 epochs it is working pretty well with small sentences 