<a href="https://colab.research.google.com/github/efo-anopa/nlp/blob/main/en_to_fr_lstm.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import tensorflow as tf
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
df = pd.read_csv('eng_-french.csv')
df

Unnamed: 0,English words/sentences,French words/sentences
0,Hi.,Salut!
1,Run!,Cours !
2,Run!,Courez !
3,Who?,Qui ?
4,Wow!,Ça alors !
...,...,...
175616,"Top-down economics never works, said Obama. ""T...","« L'économie en partant du haut vers le bas, ç..."
175617,A carbon footprint is the amount of carbon dio...,Une empreinte carbone est la somme de pollutio...
175618,Death is something that we're often discourage...,La mort est une chose qu'on nous décourage sou...
175619,Since there are usually multiple websites on a...,Puisqu'il y a de multiples sites web sur chaqu...


In [3]:
df.columns = ['en', 'fr']
df

Unnamed: 0,en,fr
0,Hi.,Salut!
1,Run!,Cours !
2,Run!,Courez !
3,Who?,Qui ?
4,Wow!,Ça alors !
...,...,...
175616,"Top-down economics never works, said Obama. ""T...","« L'économie en partant du haut vers le bas, ç..."
175617,A carbon footprint is the amount of carbon dio...,Une empreinte carbone est la somme de pollutio...
175618,Death is something that we're often discourage...,La mort est une chose qu'on nous décourage sou...
175619,Since there are usually multiple websites on a...,Puisqu'il y a de multiples sites web sur chaqu...


In [4]:
df['en'] = df['en'].str.lower()
df['fr'] = df['fr'].str.lower()

In [5]:
df

Unnamed: 0,en,fr
0,hi.,salut!
1,run!,cours !
2,run!,courez !
3,who?,qui ?
4,wow!,ça alors !
...,...,...
175616,"top-down economics never works, said obama. ""t...","« l'économie en partant du haut vers le bas, ç..."
175617,a carbon footprint is the amount of carbon dio...,une empreinte carbone est la somme de pollutio...
175618,death is something that we're often discourage...,la mort est une chose qu'on nous décourage sou...
175619,since there are usually multiple websites on a...,puisqu'il y a de multiples sites web sur chaqu...


In [6]:
en_data = df['en'].values
fr_data = df['fr'].values

In [7]:
en_data[0]

'hi.'

In [8]:
len(en_data[-1])

262

In [9]:
len(fr_data[-1])

325

In [10]:
len(en_data)

175621

In [11]:
df_samp = df.sample(5000, random_state=4574)
df_samp

Unnamed: 0,en,fr
12701,"tom, be careful!","tom, fais attention!"
36652,look at that picture.,regarde cette image.
83732,i was glad to hear the news.,j'étais contente d'entendre la nouvelle.
155763,i can't keep up with you if you walk so fast.,je ne peux pas vous suivre si vous marchez si ...
171462,your computer will restart several times durin...,votre ordinateur va redémarrer plusieurs fois ...
...,...,...
107782,don't worry. my lips are sealed.,"ne t'en fais pas, je suis muet comme une tombe."
165712,the speaker should stand where everyone can se...,l'orateur devrait se tenir là où tout le monde...
8537,is this enough?,est-ce assez ?
43911,the world is changing.,le monde est en train de changer.


In [12]:
en_samp = df_samp['en'].values
fr_samp = df_samp['fr'].values

In [13]:
en_vec = tf.keras.layers.TextVectorization(max_tokens = 10000, output_sequence_length=100)
en_vec.adapt(en_samp)
en_vocab = en_vec.get_vocabulary()

In [14]:
print(len(en_vocab))

3283


In [15]:
print(en_vocab)

['', '[UNK]', 'i', 'you', 'to', 'the', 'a', 'is', 'that', 'tom', 'of', 'do', 'in', 'have', 'me', 'this', 'he', 'it', 'dont', 'for', 'was', 'are', 'what', 'we', 'im', 'be', 'my', 'your', 'like', 'she', 'not', 'want', 'with', 'how', 'know', 'all', 'did', 'were', 'on', 'can', 'youre', 'at', 'his', 'time', 'think', 'its', 'go', 'and', 'they', 'him', 'very', 'get', 'about', 'no', 'cant', 'there', 'will', 'had', 'here', 'out', 'one', 'as', 'her', 'if', 'has', 'would', 'why', 'so', 'going', 'ill', 'need', 'just', 'didnt', 'up', 'tell', 'an', 'from', 'come', 'really', 'when', 'mary', 'see', 'help', 'us', 'should', 'could', 'where', 'thats', 'good', 'take', 'some', 'who', 'never', 'ive', 'please', 'by', 'too', 'much', 'got', 'been', 'now', 'thought', 'money', 'told', 'something', 'made', 'say', 'right', 'day', 'am', 'id', 'give', 'than', 'home', 'more', 'well', 'today', 'make', 'any', 'work', 'still', 'lot', 'look', 'leave', 'feel', 'tomorrow', 'only', 'must', 'many', 'believe', 'anything', 'pe

In [16]:
fr_vec = tf.keras.layers.TextVectorization(max_tokens = 10000, output_sequence_length=100)
fr_vec.adapt(fr_samp)
fr_vocab = fr_vec.get_vocabulary()

In [17]:
print(len(fr_vocab))

5249


In [18]:
print(fr_vocab)

['', '[UNK]', 'je', 'de', 'pas', 'que', 'à', 'ne', 'le', 'vous', 'la', 'tom', 'il', 'un', 'est', 'a', 'nous', 'tu', 'ce', 'une', 'les', 'en', 'pour', 'jai', 'me', 'suis', 'cest', 'elle', 'faire', 'ça', 'tout', 'plus', 'des', 'dans', 'te', 'si', 'ma', 'au', 'qui', 'du', 'y', 'veux', 'mon', 'avec', 'fait', 'se', 'et', 'sont', 'son', 'très', 'temps', 'quil', 'être', 'lui', 'cela', 'moi', 'votre', 'dit', 'comment', 'était', 'pourquoi', 'été', 'cette', 'pense', 'dire', 'où', 'nest', 'vraiment', 'sur', 'jamais', 'sais', 'ils', 'quelle', 'peux', 'nai', 'ici', 'chose', 'sommes', 'besoin', 'personne', 'tous', 'quelque', 'beaucoup', 'êtes', 'ton', 'estce', 'avons', 'aller', 'par', 'toi', 'monde', 'bien', 'mary', 'astu', 'quand', 'encore', 'trop', 'avoir', 'maison', 'comme', 'voir', 'toujours', 'va', 'sa', 'rien', 'parler', 'êtesvous', 'sest', 'avait', 'on', 'mes', 'avezvous', 'as', 'train', 'sil', 'peu', 'pensais', 'es', 'demain', 'jaimerais', 'fois', 'deux', 'ta', 'soit', 'quoi', 'là', 'jétais'

In [19]:
en_train_data = en_data[:int(0.8*(len(en_data)))]
en_test_data = en_data[int(0.8*(len(en_data))): ]
fr_train_data = fr_data[:int(0.8*(len(en_data)))]
fr_test_data = fr_data[int(0.8*(len(en_data))): ]

In [20]:
en_train_tk, en_test_tk = en_vec(en_train_data), en_vec(en_test_data)
fr_train_tk, fr_test_tk = fr_vec(fr_train_data), fr_vec(fr_test_data)

In [21]:
en_train_tk[-1]

<tf.Tensor: shape=(100,), dtype=int64, numpy=
array([  2, 109,  67, 283,   4,  13, 702,   3, 394,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0])>

In [22]:
from tensorflow.keras.layers import Embedding, LSTM, Dense, Input

In [23]:
num_enc_tokens = len(en_vocab)
num_dec_tokens = len(fr_vocab)
latent_dim = 256

In [24]:
#let's get to model making
encoder_inputs = Input(shape = (None, ))
x = Embedding(num_enc_tokens, latent_dim)(encoder_inputs)
x, state_h, state_c = LSTM(latent_dim, return_state=True)(x)
encoder_states = [state_h, state_c]

In [25]:
decoder_inputs = Input(shape = (None, ))
x = Embedding(num_dec_tokens, latent_dim)(decoder_inputs)
x = LSTM(latent_dim, return_sequences = True)(x, initial_state = encoder_states)
decoder_outputs = Dense(num_dec_tokens, activation = 'softmax')(x)

In [26]:
from tensorflow.keras.models import Model

In [27]:
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

In [28]:
model.compile(optimizer='adam', loss = 'sparse_categorical_crossentropy')

In [None]:
model.fit([en_train_tk, fr_train_tk], fr_train_tk, batch_size = 128, epochs = 1)

Epoch 1/5