In [1]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Dense, Embedding
import numpy as np
from nltk.translate.bleu_score import sentence_bleu

In [2]:
lines = open('fra.txt', 'rb').read().decode(encoding='utf8').split('\n')

In [3]:
eng_sent = []
fra_sent = []
eng_chars = set()
fra_chars = set()
nb_samples = 10000

# Process english and french sentences
for i in range(nb_samples):
    line = lines[i].replace('\u202f', ' ')

    eng_line = str(line).split('\t')[0]
    
    # Append '\t' for start of the sentence and '\n' to signify end of the sentence
    fra_line = '\t' + str(line).split('\t')[1] + '\n'
    eng_sent.append(eng_line)
    fra_sent.append(fra_line)
    
    for ch in eng_line:
        if (ch not in eng_chars):
            eng_chars.add(ch)
            
    for ch in fra_line:
        if (ch not in fra_chars):
            fra_chars.add(ch)

In [4]:
fra_chars = sorted(list(fra_chars))
eng_chars = sorted(list(eng_chars))

In [5]:
# dictionary to index each english character - key is index and value is english character
eng_index_to_char_dict = {}

# dictionary to get english character given its index - key is english character and value is index
eng_char_to_index_dict = {}

for k, v in enumerate(eng_chars):
    eng_index_to_char_dict[k] = v
    eng_char_to_index_dict[v] = k

In [6]:
# dictionary to index each french character - key is index and value is french character
fra_index_to_char_dict = {}

# dictionary to get french character given its index - key is french character and value is index
fra_char_to_index_dict = {}
for k, v in enumerate(fra_chars):
    fra_index_to_char_dict[k] = v
    fra_char_to_index_dict[v] = k

In [7]:
max_len_eng_sent = max([len(line) for line in eng_sent])
max_len_fra_sent = max([len(line) for line in fra_sent])

In [8]:
max_len_eng_sent
max_len_fra_sent

60

In [9]:
tokenized_eng_sentences = np.zeros(shape = (nb_samples,max_len_eng_sent), dtype='float32')
tokenized_fra_sentences = np.zeros(shape = (nb_samples,max_len_fra_sent), dtype='float32')
target_data = np.zeros((nb_samples, max_len_fra_sent),dtype='float32')

In [10]:
# Vectorize the english and french sentences

for i in range(nb_samples):
    for k,ch in enumerate(eng_sent[i]):
        tokenized_eng_sentences[i,k] = eng_char_to_index_dict[ch]
        
    for k,ch in enumerate(fra_sent[i]):
        tokenized_fra_sentences[i,k] = fra_char_to_index_dict[ch]

        # decoder_target_data will be ahead by one timestep and will not include the start character.
        if k > 0:
            target_data[i,k-1] = fra_char_to_index_dict[ch]

In [11]:
BATCH_SIZE = 128
EMBEDDING_DIM = 256

# Encoder model

# encoder_input = Input(shape=(None,len(eng_chars)))
encoder_input = Input(shape=(None,))

encoder_embedding_layer = Embedding(len(eng_chars), EMBEDDING_DIM)
encoder_embedding_output = encoder_embedding_layer(encoder_input)

encoder_LSTM = LSTM(256,return_state = True)
encoder_outputs, encoder_h, encoder_c = encoder_LSTM (encoder_embedding_output)
encoder_states = [encoder_h, encoder_c]


In [12]:
# Decoder model

# decoder_input = Input(shape=(None,len(fra_chars)))
decoder_input = Input(shape=(None,))

decoder_embedding_layer = Embedding(len(fra_chars), EMBEDDING_DIM)
decoder_embedding_output = decoder_embedding_layer(decoder_input)

decoder_LSTM = LSTM(256,return_sequences=True, return_state = True)
decoder_out, _ , _ = decoder_LSTM(decoder_embedding_output, initial_state=encoder_states)
decoder_dense = Dense(len(fra_chars), activation='softmax')
decoder_out = decoder_dense (decoder_out)


In [13]:
model = Model(inputs=[encoder_input, decoder_input],outputs=[decoder_out])
model.summary()
# Run training
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy')
model.fit(x=[tokenized_eng_sentences,tokenized_fra_sentences], 
          y=target_data,
          batch_size=BATCH_SIZE,
          epochs=50,
          validation_split=0.2,)

Model: "functional_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, None)]       0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            [(None, None)]       0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, None, 256)    18432       input_1[0][0]                    
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, None, 256)    23808       input_2[0][0]                    
_______________________________________________________________________________________

<tensorflow.python.keras.callbacks.History at 0x153bec9c0c8>

In [14]:
# Inference models for testing

# Encoder inference model
encoder_model_inf = Model(encoder_input, encoder_states)
encoder_model_inf.summary()

# Decoder inference model
decoder_state_input_h = Input(shape=(256,))
decoder_state_input_c = Input(shape=(256,))

decoder_input_states = [decoder_state_input_h, decoder_state_input_c]

decoder_out, decoder_h, decoder_c = decoder_LSTM(decoder_embedding_output,
                                                 initial_state=decoder_input_states)

decoder_states = [decoder_h , decoder_c]

decoder_out = decoder_dense(decoder_out)

decoder_model_inf = Model(inputs=[decoder_input] + decoder_input_states,
                          outputs=[decoder_out] + decoder_states)
decoder_model_inf.summary()

Model: "functional_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, None)]            0         
_________________________________________________________________
embedding (Embedding)        (None, None, 256)         18432     
_________________________________________________________________
lstm (LSTM)                  [(None, 256), (None, 256) 525312    
Total params: 543,744
Trainable params: 543,744
Non-trainable params: 0
_________________________________________________________________
Model: "functional_5"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_2 (InputLayer)            [(None, None)]       0                                            
___________________________________________________________

In [15]:
def decode_seq(inp_seq):
    
    # Initial states value is coming from the encoder 
    states_val = encoder_model_inf.predict(inp_seq)
    
    target_seq = np.zeros((1, 1))
    target_seq[0, 0] = fra_char_to_index_dict['\t']
    
    translated_sent = ''
    stop_condition = False
    
    while not stop_condition:
        decoder_out, decoder_h, decoder_c = decoder_model_inf.predict(x=[target_seq] + states_val)
        
        max_val_index = np.argmax(decoder_out[0,-1,:])
        sampled_fra_char = fra_index_to_char_dict[max_val_index]
        translated_sent += sampled_fra_char
        
        if  sampled_fra_char == '\n' or len(translated_sent) > max_len_fra_sent :
            stop_condition = True
        
        target_seq = np.zeros((1, 1))
        target_seq[0, 0] = max_val_index
        
        states_val = [decoder_h, decoder_c]
        
    return translated_sent



In [16]:
def get_reference(index):
    ref = set()
    for i, s in enumerate(eng_sent):
        if s == eng_sent[index]:
            ref.add(fra_sent[i].strip())
    return list(ref)

for seq_index in range(10):
    inp_seq = tokenized_eng_sentences[seq_index:seq_index+1]

    translated_sent = decode_seq(inp_seq).strip()
    reference = get_reference(seq_index)
    print('-')
    print('Input sentence:', eng_sent[seq_index])
    print('Output sentence:', translated_sent)
    print('reference:', reference)

    bleu = sentence_bleu(reference, translated_sent)
    print('bleu', bleu)

-
Input sentence: ﻿Go.
Output sentence: Va !
reference: ['Va !']
bleu 1.0
-
Input sentence: Run!
Output sentence: Cours !
reference: ['Cours !', 'Courez !']
bleu 1.0
-
Input sentence: Run!
Output sentence: Cours !
reference: ['Cours !', 'Courez !']
bleu 1.0
-
Input sentence: Wow!
Output sentence: Laquelle tempos !
reference: ['Ça alors !']
bleu 2.9559907859786378e-78
-
Input sentence: Fire!
Output sentence: T'ai !
reference: ['Au feu !']
bleu 5.431059661402121e-155
-
Input sentence: Help!
Output sentence: À l'aide !
reference: ["À l'aide !"]
bleu 1.0
-
Input sentence: Jump.
Output sentence: Allez-vous !
reference: ['Saute.']
bleu 1.1640469867513693e-231
-
Input sentence: Stop!
Output sentence: Arrête-toi !
reference: ['Stop !', 'Ça suffit !', 'Arrête-toi !']
bleu 1.0
-
Input sentence: Stop!
Output sentence: Arrête-toi !
reference: ['Stop !', 'Ça suffit !', 'Arrête-toi !']
bleu 1.0
-
Input sentence: Stop!
Output sentence: Arrête-toi !
reference: ['Stop !', 'Ça suffit !', 'Arrête-toi !']

The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
