In [2]:
# Shell escape: installs the simpletransformers package, which provides the
# RepresentationModel imported in the next cell.
# NOTE(review): no version is pinned, so a re-run may pull a different release.
!pip install simpletransformers

In [3]:
import torch
from simpletransformers.language_representation import RepresentationModel

In [4]:
# Token-level embeddings: with combine_strategy=None the encoder keeps one
# vector per token instead of pooling them into one vector per sentence.
sentences = ["Data science is really cool i love it", "Football is the best sport ever"]

model = RepresentationModel(
    model_type="bert",
    model_name="bert-base-uncased",
    # Use the GPU only when one is present so the notebook also runs on
    # CPU-only machines instead of failing while the model is being built.
    use_cuda=torch.cuda.is_available(),
)

word_vectors = model.encode_sentences(sentences, combine_strategy=None)

In [5]:
# Dimensions of the array returned by encode_sentences with combine_strategy=None.
word_vectors.shape

In [6]:
# Sentence-level embeddings: combine_strategy="mean" averages the token vectors
# so that each sentence is represented by a single vector.
sentences = ["Data science is really cool i love it", "Football is the best sport ever"]

# Reuse the BERT encoder already held in `model` from the token-level cell:
# building a second, identical RepresentationModel only repeats the slow
# weight load without changing the result.
word_vectors = model.encode_sentences(sentences, combine_strategy="mean")

In [7]:
# Dimensions of the array returned by encode_sentences with combine_strategy="mean".
word_vectors.shape

In [8]:
import numpy as np
import pandas as pd

In [9]:
# Keep every raw line of fra.txt as one string in column 0; the tab-separated
# fields are split out in a later cell.
# The file is read directly instead of through pd.read_csv(sep='\n') because
# pandas refuses the line terminator as a field separator, and because the CSV
# parser would treat any double quote inside a sentence as a quoting character.
# "utf-8-sig" decodes plain UTF-8 and also strips a leading byte-order mark.
with open("../input/frenchenglish/fra.txt", encoding="utf-8-sig") as fra_file:
    # Blank lines are skipped, matching read_csv's default behaviour.
    raw_lines = [line.rstrip("\r\n") for line in fra_file if line.strip()]

df = pd.DataFrame(raw_lines)
df.head()

In [10]:
# (rows, columns) of the freshly loaded frame.
df.shape

In [11]:
# Split each raw line on tabs once; field 0 is the English sentence and
# field 1 the French one (any further fields are ignored).
fields = df[0].str.split("\t")

# Build a fresh two-column frame instead of dropping column 0 in place.
# `.str[1]` yields NaN for a line that has no tab, and dropna() then discards
# such malformed rows rather than aborting the whole cell with an IndexError.
df = pd.DataFrame({"english": fields.str[0], "french": fields.str[1]}).dropna()
df.head()

In [12]:
import string
# Normalise both language columns: lower-case the text and delete every
# character listed in string.punctuation.
translator = str.maketrans('', '', string.punctuation)


def _lower_and_strip(text):
    """Return `text` lower-cased with the characters in string.punctuation removed."""
    return text.lower().translate(translator)


for column_name in ("english", "french"):
    df[column_name] = df[column_name].map(_lower_and_strip)
df.head()

In [13]:
from tensorflow.keras.preprocessing.text import Tokenizer
# Tokenizer with default settings; it is fitted on the French column in the next cell.
french_tokenizer = Tokenizer()

In [14]:
# Build the tokenizer's word index from every sentence in the "french" column.
french_tokenizer.fit_on_texts(df["french"])

In [15]:
# Number of entries in the fitted tokenizer's word index.
french_word_count = len(french_tokenizer.word_index)
print(f"the frensh vocab size is {french_word_count}")

In [16]:
# Convert every French sentence into a sequence of integer word indices;
# these sequences become the training targets.
y = french_tokenizer.texts_to_sequences(df["french"])

In [17]:
# Peek at the first 20 encoded French sentences.
y[:20]

In [18]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
# Pad at the end ('post') so that all target sequences share one length;
# y.shape[1] is later used as the number of decoder steps.
y = pad_sequences(y,padding='post')

In [19]:
# First five padded target sequences.
y[:5]

In [20]:
# BERT encoder used to embed every English sentence of the dataset; the
# translate() helper also relies on it at inference time.
bert = RepresentationModel(
    model_type="bert",
    model_name="bert-base-uncased",
    # Use the GPU only when one is present so the notebook also runs on
    # CPU-only machines instead of failing while the model is being built.
    use_cuda=torch.cuda.is_available(),
)

In [21]:
# One mean-pooled BERT vector per English sentence (the model inputs).
# The column is handed over as a plain list of strings: the encoder is
# documented as taking a list of texts, and a pandas Series is not guaranteed
# to be accepted by the underlying tokenizer.
x = bert.encode_sentences(df["english"].tolist(), combine_strategy="mean")

In [22]:
# Dimensions of the sentence-embedding matrix; the model below expects 768 features per row.
x.shape

In [23]:
# Size of the decoder's output layer: one slot per word known to the tokenizer
# plus one extra — presumably for index 0, which the padded targets use as
# filler (TODO confirm against the Keras Tokenizer documentation).
known_french_words = french_tokenizer.word_index
frensh_vocab_size = 1 + len(known_french_words)
print(f"the french vocab size is {frensh_vocab_size}")

In [24]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, RepeatVector, Dense, TimeDistributed
import tensorflow as tf

In [25]:
# Encoder-decoder layout in which the encoder is replaced by the pre-computed
# BERT sentence embedding, so only the decoder layers below are trained.
target_length = y.shape[1]

model = Sequential([
    # "Encoder": the 768-dimensional sentence vector, repeated once per output step.
    tf.keras.Input(shape=(768,)),
    RepeatVector(target_length),
    # Decoder: bidirectional LSTM followed by a softmax over the French vocabulary
    # at every step.
    Bidirectional(LSTM(256, return_sequences=True)),
    Dense(frensh_vocab_size, activation='softmax'),
])

In [26]:
# The targets are integer word indices, hence the sparse variant of
# categorical cross-entropy.
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
)

In [27]:
# Train on (sentence embedding, padded French sequence) pairs.
model.fit(x=x, y=y, batch_size=256, epochs=20)

In [28]:
# Persist the trained model to model.h5 and, separately, its weights to
# model.weights.h5; both files are read back by a later cell.
model.save("model.h5")
model.save_weights("model.weights.h5")

In [29]:
def translate(model, fra_tokenizer, sentence):
    """Translate one English sentence into French.

    Args:
        model: Trained decoder exposing ``predict``; it receives the sentence
            embedding and returns, for every output position, scores over the
            French vocabulary.
        fra_tokenizer: Fitted tokenizer exposing ``sequences_to_texts``, used
            to turn the predicted word indices back into text.
        sentence: English input text.

    Returns:
        The decoded French sentence as a string.

    Note:
        Relies on the notebook-level ``bert`` encoder to embed the sentence.
    """
    import string  # local import keeps the helper independent of cell order

    # Apply the same normalisation the training sentences received (lower-case,
    # punctuation removed) so the embedding fed to the decoder is computed on
    # text that looks like the text it was trained on.
    cleaned = sentence.lower().translate(str.maketrans('', '', string.punctuation))
    features = bert.encode_sentences([cleaned], combine_strategy="mean")
    scores = model.predict(features)
    # Most likely vocabulary index at every output position, for the whole
    # batch at once; converted to plain Python ints for the tokenizer lookup.
    token_ids = np.argmax(scores, axis=-1).tolist()
    return fra_tokenizer.sequences_to_texts(token_ids)[0]

In [33]:
# Spot-check the trained model on a handful of English inputs.
test_sentences = [
    "love",
    "the work is hard",
    "i need money",
    "call me later",
    'the french president is going to visit japan next month',
    'thats the worst thing that could possibly happen',
]
for english_text in test_sentences:
    french_text = translate(model, french_tokenizer, english_text)
    print(f"{english_text} -----> {french_text}")

In [32]:
# Rebuild the model from the file written by model.save(), then load the
# separately saved weights into it.
# NOTE(review): model.h5 presumably already contains the weights, which would
# make the second call redundant — confirm before removing it.
model = tf.keras.models.load_model("./model.h5")
model.load_weights("./model.weights.h5")

In [34]:
# Continue training the reloaded model for two more epochs on the same data.
model.fit(x=x, y=y, batch_size=256, epochs=2)

In [35]:
# Spot-check again after the additional training epochs.
test_sentences = [
    "i love you",
    "the work is hard",
    "i need money",
    "call me later",
    'the french president is going to visit japan next month',
    'thats the worst thing that could possibly happen',
]
for english_text in test_sentences:
    french_text = translate(model, french_tokenizer, english_text)
    print(f"{english_text} -----> {french_text}")