In [None]:
# Standard library
import collections
import pickle
import re
import string
import warnings
from unicodedata import normalize

# Third-party
import keras
import nltk
import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow_addons as tfa
from bs4 import BeautifulSoup
from gensim.models.keyedvectors import KeyedVectors
from gensim.scripts.glove2word2vec import glove2word2vec
from gensim.test.utils import get_tmpfile
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
from tensorflow.keras.callbacks import Callback
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.initializers import Constant
from tensorflow.keras.layers import Input, LSTM, Embedding, Dense, Concatenate, TimeDistributed, Bidirectional
from tensorflow.keras.layers import Embedding, Input, Bidirectional, LSTM, Dense, Concatenate
from tensorflow.keras.models import Model

# Local
import Attention as AttentionLayer

# Tokenizer data used by nltk.tokenize.word_tokenize
nltk.download('punkt')

In [None]:
# Base folder that every data location in this notebook is built from.
default_path = "drive/My Drive/"
# Corpus directory. The value has no trailing separator.
path = "{}{}".format(default_path, 'cnn/fr-en')

## Preparing Data

In [2]:
def clean_data(text):
    """Normalise raw corpus text into a list of cleaned sentences.

    Every line is transliterated to ASCII, lower-cased, stripped of
    punctuation and non-printable characters, and reduced to its purely
    alphabetic tokens.

    Args:
        text: Either the whole corpus as a single string with one sentence
            per line, or an iterable of already-split lines.

    Returns:
        A list with one cleaned, space-joined string per input line. A line
        without any alphabetic token becomes an empty string instead of being
        dropped, so output line ``i`` still corresponds to input line ``i``.
        An empty string yields an empty list.
    """
    if isinstance(text, str):
        # Split on '\n' only: str.splitlines() also breaks on characters such
        # as U+2028, which would change the line count of a parallel corpus.
        lines = text.split('\n')
        # A file ending in a newline produces one spurious trailing entry.
        if lines[-1] == '':
            lines.pop()
    else:
        lines = list(text)

    # regex for removing non-printable chars
    re_print = re.compile('[^%s]' % re.escape(string.printable))
    regex_punct = re.compile('[%s]' % re.escape(string.punctuation))

    result = []
    for line in lines:
        # decompose accented chars and drop everything that is not ASCII
        line = normalize('NFD', line).encode('ascii', 'ignore')
        line = line.decode('UTF-8')
        # split on whitespace so each word can be cleaned on its own
        words = [word.lower() for word in line.split()]
        # remove punctuation
        words = [regex_punct.sub('', word) for word in words]
        # remove non-printable chars
        words = [re_print.sub('', word) for word in words]
        # keep purely alphabetic tokens (drops numbers and emptied words)
        words = [word for word in words if word.isalpha()]
        result.append(' '.join(words))
    return result

In [None]:
# Load the raw parallel corpus: the '.en' file is the English side and the
# '.fr' file is the French side.
with open(path + '/' + 'europarl-v7.fr-en.en', encoding='utf-8') as f:
    english_text = f.read()

with open(path + '/' + 'europarl-v7.fr-en.fr', encoding='utf-8') as f:
    french_text = f.read()

clean_eng = clean_data(english_text)
clean_french = clean_data(french_text)

# Save the cleaned sentences so the cleaning does not have to be repeated.
# They are stored in the corpus directory, alongside the raw files.
with open(path + '/' + 'clean_english.pkl', 'wb') as f:
    pickle.dump(clean_eng, f)

with open(path + '/' + 'clean_french.pkl', 'wb') as f:
    pickle.dump(clean_french, f)

In [None]:
# Locations of the cached, cleaned sentences. `path` has no trailing
# separator, so one is inserted explicitly (same convention as the cell that
# opens the raw corpus files).
french_path = path + '/' + 'clean_french.pkl'
english_path = path + '/' + 'clean_english.pkl'

# Load the cleaned sentences used by the following cells.
# NOTE: pickle.load can execute arbitrary code contained in the file; only
# load caches that were produced by this notebook.
with open(english_path, 'rb') as f:
    english_data = pickle.load(f)

with open(french_path, 'rb') as f:
    french_data = pickle.load(f)

In [None]:
max_length = 40

# Sentences are cut down to max_length tokens to speed up training.
# English side: truncate, then right-pad with "<padding>" so that every
# sentence has exactly max_length tokens.
shortened_english = []
for sentence in english_data:
    tokens = word_tokenize(sentence)[:max_length]
    tokens.extend(["<padding>"] * (max_length - len(tokens)))
    shortened_english.append(tokens)

# French side: keep at most max_length - 2 tokens and wrap them in the
# start/end markers. These sentences are not padded here.
shortened_french = []
for sentence in french_data:
    tokens = word_tokenize(sentence)[:max_length - 2]
    shortened_french.append(['<s>', *tokens, '</s>'])

In [None]:
# Every English token except the padding marker.
english_words = [
    token
    for tokens in shortened_english
    for token in tokens
    if token != "<padding>"
]

# The 10000 most frequent words, as (word, count) pairs.
english_word_counter = collections.Counter(english_words).most_common(10000)

# Ids 0-3 are reserved for the special tokens; each word then takes the next
# free id in order of decreasing frequency.
english_word_dict = {"<padding>": 0, "<unk>": 1, "<s>": 2, "</s>": 3}
for token, _count in english_word_counter:
    english_word_dict[token] = len(english_word_dict)

In [None]:
# Every French token except the padding and sentence-boundary markers.
french_words = [
    token
    for tokens in shortened_french
    for token in tokens
    if token not in ("<padding>", "<s>", "</s>")
]

# The 10000 most frequent words, as (word, count) pairs.
french_word_counter = collections.Counter(french_words).most_common(10000)

# Ids 0-3 are reserved for the special tokens; each word then takes the next
# free id in order of decreasing frequency.
french_word_dict = {"<padding>": 0, "<unk>": 1, "<s>": 2, "</s>": 3}
for token, _count in french_word_counter:
    french_word_dict[token] = len(french_word_dict)

In [None]:
# id -> word lookups, used to turn predicted ids back into text at inference
english_reversed_dict = {index: word for word, index in english_word_dict.items()}
french_reversed_dict = {index: word for word, index in french_word_dict.items()}

In [None]:
# Allocate the training arrays: one row per sentence, max_length columns,
# filled with zeros (0 is the id of "<padding>" in both word dictionaries).
encoder_input_data = np.zeros(
    shape=(len(shortened_english), max_length), dtype=np.float32)
decoder_input_data = np.zeros(
    shape=(len(shortened_french), max_length), dtype=np.float32)
# Same shape and dtype as the decoder input.
decoder_target_data = np.zeros_like(decoder_input_data)

In [None]:
# Ids used for words that are missing from the dictionaries; looked up once
# instead of on every token.
english_unk = english_word_dict["<unk>"]
french_unk = french_word_dict["<unk>"]

for i, (input_text, target_text) in enumerate(zip(shortened_english, shortened_french)):
    # `.get` with a default cannot raise KeyError, so no try/except is needed.
    # The former bare `except:` would also have swallowed a kernel interrupt
    # during this long loop.
    for t, word in enumerate(input_text):
        encoder_input_data[i, t] = english_word_dict.get(word, english_unk)

    for t, word in enumerate(target_text):
        word_id = french_word_dict.get(word, french_unk)
        decoder_input_data[i, t] = word_id
        if t > 0:
            # decoder_target_data is ahead of decoder_input_data by one
            # timestep and does not include the start token.
            decoder_target_data[i, t - 1] = word_id

In [None]:
# Hold out 20% of the sentence pairs as a validation set. The three arrays
# are split together so that rows stay aligned. A fixed random_state makes
# the split identical on every run.
(encoder_input_data_train, encoder_input_data_test,
 decoder_input_data_train, decoder_input_data_test,
 decoder_target_data_train, decoder_target_data_test) = train_test_split(
    encoder_input_data, decoder_input_data, decoder_target_data,
    test_size=0.2, random_state=42)

## GloVe Embedding

In [None]:
# Load the pickled word vectors. `path` has no trailing separator, so one is
# inserted explicitly.
# NOTE: pickle.load can execute arbitrary code contained in the file; only
# load a file from a trusted source.
with open(path + '/' + "glove/model_glove_300.pkl", 'rb') as handle:
    word_vectors = pickle.load(handle)

In [None]:
EMBEDDING_DIM = 300
# One row per entry of english_word_dict plus one extra row; every row starts
# out as zeros.
embedding_matrix = np.zeros(shape=(1 + len(english_word_dict), EMBEDDING_DIM))

In [None]:
# Copy the pre-trained vector of every dictionary word into its row of the
# embedding matrix. Words without a pre-trained vector keep their zero row.
for word, i in english_word_dict.items():
    try:
        embedding_vector = word_vectors.word_vec(word)
    except KeyError:
        # Only a missing word is expected here. Any other error (for example
        # an incompatible `word_vectors` object) is deliberately not caught,
        # because hiding it would silently leave the whole matrix at zero.
        continue
    embedding_matrix[i] = embedding_vector

## Modelling Stage

In [None]:
# Save a checkpoint named after the epoch number and validation loss.
# `path` has no trailing separator, so one is inserted explicitly.
checkpoint = tf.keras.callbacks.ModelCheckpoint(
    path + '/' + 'models/{epoch:02d}-{val_loss:.2f}.h5',
    verbose=1,
    save_best_only=True,
)

In [None]:
EMBEDDING_DIM = 300
latent_dim = 300
# 4 special tokens plus the 10000 most frequent words kept when the word
# dictionaries were built.
num_words = 10004
units = 128  # NOTE(review): not referenced anywhere in this cell.

# Frozen encoder embedding initialised from the pre-trained GloVe matrix.
# The layer size is read from the matrix itself, so the weights always fit,
# even if the vocabulary holds fewer words than `num_words` assumes.
embedding_layer = Embedding(embedding_matrix.shape[0],
                            embedding_matrix.shape[1],
                            weights=[embedding_matrix],
                            trainable=False)

# Encoder: embedding followed by two stacked LSTMs. Both return the full
# output sequence as well as their final hidden and cell states.
encoder_inputs = Input(shape=(max_length, ), name="encoder_input")
encoder_emb = embedding_layer(encoder_inputs)
encoder_lstm_1 = LSTM(latent_dim, return_state=True, return_sequences=True)
encoder_output1, state_h1, state_c1 = encoder_lstm_1(encoder_emb)
encoder_lstm_2 = LSTM(latent_dim, return_state=True, return_sequences=True)

encoder_output2, state_h2, state_c2 = encoder_lstm_2(encoder_output1)

# Decoder: trainable embedding followed by one LSTM that starts from the
# final states of the second encoder LSTM.
decoder_inputs = Input(shape=(None, ))
decoder_emb_layer = Embedding(num_words, latent_dim, trainable=True)
decoder_emb = decoder_emb_layer(decoder_inputs)
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)

# The decoder's own final states are not needed during training.
decoder_outputs, _, _ = decoder_lstm(decoder_emb,
                                     initial_state=[state_h2, state_c2])

# Attention over the encoder output sequence.
# NOTE(review): the import cell binds `AttentionLayer` with
# `import Attention as AttentionLayer`, i.e. to the module object, and calling
# a module raises TypeError. Confirm whether
# `from Attention import AttentionLayer` was intended.
attention = AttentionLayer()
attn_out, attn_states = attention([encoder_output2, decoder_outputs])
# combine attention with decoder outputs along the feature axis
decoder_outputs = Concatenate(axis=-1)([decoder_outputs, attn_out])

# Softmax over the decoder vocabulary, applied at every timestep.
decoder_dense = TimeDistributed(
    Dense(num_words, activation='softmax', name="Dense_layer"))

decoder_final_outputs = decoder_dense(decoder_outputs)
seq2seq_Model = Model([encoder_inputs, decoder_inputs], decoder_final_outputs)
# use sparse categorical cross-entropy as the targets are integer ids, not
# one-hot vectors
seq2seq_Model.compile(optimizer='rmsprop',
                      loss='sparse_categorical_crossentropy',
                      metrics=["accuracy"])

In [None]:
# Train for 10 epochs, validating on the held-out split after each epoch.
# The checkpoint callback is invoked during training.
history = seq2seq_Model.fit(
    x=[encoder_input_data_train, decoder_input_data_train],
    y=decoder_target_data_train,
    validation_data=(
        [encoder_input_data_test, decoder_input_data_test],
        decoder_target_data_test,
    ),
    epochs=10,
    callbacks=[checkpoint],
)

In [None]:
# Encoder inference model: maps a source sequence to the encoder output
# sequence and the final hidden/cell states of the second encoder LSTM.
encoder_model = Model(inputs=encoder_inputs,
                      outputs=[encoder_output2, state_h2, state_c2])

# Decoder inference model. It reuses the trained decoder layers, but takes
# the previous states and the encoder output sequence as explicit inputs.
decoder_state_input_h = Input(shape=(latent_dim,))
decoder_state_input_c = Input(shape=(latent_dim,))
# Same number of timesteps as the encoder input (max_length).
decoder_hidden_state_input = Input(shape=(max_length, latent_dim))

dec_emb2 = decoder_emb_layer(decoder_inputs)
# The new states get their own names. Overwriting `state_h2`/`state_c2` here
# would make `encoder_model` above pick up decoder tensors if this cell is
# run a second time.
decoder_outputs_inf, decoder_state_h_inf, decoder_state_c_inf = decoder_lstm(
    dec_emb2, initial_state=[decoder_state_input_h, decoder_state_input_c])
attn_out_inf, attn_out_states = attention(
    [decoder_hidden_state_input, decoder_outputs_inf])

decoder_inf_concat = tf.keras.layers.Concatenate()(
    [decoder_outputs_inf, attn_out_inf])
decoder_outputs2 = decoder_dense(decoder_inf_concat)
decoder_model = Model(
    [decoder_inputs, decoder_hidden_state_input,
     decoder_state_input_h, decoder_state_input_c],
    [decoder_outputs2, decoder_state_h_inf, decoder_state_c_inf])

In [None]:
# Sentence to translate: one encoded source sequence.
# NOTE(review): `test_data` was not defined anywhere in the notebook; the
# first held-out sentence is used here. Replace it with the intended input.
test_data = encoder_input_data_test[0]

# Encode the source sentence once.
e_out, e_h, e_c = encoder_model.predict(test_data.reshape(1, -1))

# The decoder is fed one token at a time, starting with the start token. The
# decoder works on the French vocabulary, so the id comes from
# french_word_dict.
target_seq = np.zeros((1, 1))
target_seq[0, 0] = french_word_dict['<s>']

# Upper bound on the number of generated tokens, in case "</s>" is never
# predicted. Generated tokens are counted directly: re-tokenizing the decoded
# string would split markers such as "</s>" into several tokens.
max_decoded_tokens = 52
decoded_tokens = 0
decoded_sentence = ''
stop_condition = False
# Loop until the stop token is predicted or the token limit is exceeded.
while not stop_condition:
    output_tokens, h, c = decoder_model.predict([target_seq] +
                                                [e_out, e_h, e_c])
    # greedy choice: the id with the highest probability at the last timestep
    sampled_token_index = int(np.argmax(output_tokens[0, -1, :]))
    # find the word for that id
    sampled_char = french_reversed_dict[sampled_token_index]
    # add it to the sentence
    decoded_sentence += ' ' + sampled_char
    decoded_tokens += 1
    if sampled_char == "</s>" or decoded_tokens > max_decoded_tokens:
        stop_condition = True
    # the predicted token becomes the next decoder input
    target_seq = np.zeros((1, 1))
    target_seq[0, 0] = sampled_token_index
    # carry the decoder's hidden and cell states into the next step
    e_h, e_c = h, c