# NLP Final Project
### Team 3: Daniel van Heuven van Staereling, Teo Stereciu, Csanad Vegh, Atakan Tekparmak

For the final project we tackled a machine translation task from English to French.

In [96]:
# Imports
from keras.preprocessing.text import Tokenizer;
from keras.utils import pad_sequences;

from sklearn.model_selection import train_test_split

import pandas as pd
import numpy as np

import contractions

import re
from typing import Tuple

import matplotlib.pyplot as plt

In [97]:
# Global variables, constants
# Set seed for reproducibility
RANDOM_SEED = 42
DATASET_PATH = "../en-fr.csv"
DATASET_LENGTH = 50_000
ENCODING_LENGTH = 35

# Global Flags
DEBUG = True
SAVE_DATA = False # Save clean data to disk
SHOW_SENTENCE_LENGHTS = True # Show the sentence lengths of the dataset, for choosing the ENCODING_LENGTH

# PRE-PROCESSING

Here we preprocess the data following the steps described below:

- Cleaning the data
    - Expand English contractions
    - Remove every char that is not alphanumeric or end of sentence punctuation, keep spaces and French accents
    - Lowercase
    - Add bos and eos tags
- Tokenization
- Encoding into sequences of integers, where each integer is the index of a word from the vocabulary
- Padding of the zequences with zeros

## Cleaning

In [98]:
def clean_data(doc: str, expand: bool, lemma: bool):
    '''
    Cleans the data by expanding contractions (if specified in args),
    removing non alphanumeric characters (except end of sentence punctuation), 
    lowercasing,
    and adding bos and eos tags.

    Args:
        doc (str): The document to clean
        expand (bool): Whether to expand contractions or not
    '''
    if expand:
        doc = contractions.fix(doc)

    doc = doc.lower()
    doc = re.sub(r'[^ùûüÿàâæçéèêëïîôœÙÛÜŸÀÂÆÇÉÈÊËÏÎÔŒa-z0-9.!?]+', ' ', doc)
    doc = 'bos ' + doc + ' eos'
    
    return doc

def clean(df: pd.DataFrame) -> pd.DataFrame:
    '''
    Cleans the dataset by applying the clean_data function to each row.

    Args:
        df (pd.DataFrame): The dataset to clean
    '''
    df['en'] = df['en'].apply(lambda x: clean_data(x, expand=True, lemma=True))
    df['fr'] = df['fr'].apply(lambda x: clean_data(x, expand=False, lemma=False))
    
    return df

In [99]:
def split(df: pd.DataFrame) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    '''
    Splits the dataset into train, validation and test sets,
    using the split ratio 80-10-10.
        
    Args:
        df (pd.DataFrame): The dataset to split
    '''
    X_train, X_val_test, y_train, y_val_test = train_test_split(
        df["en"], df["fr"], test_size=0.2, random_state=RANDOM_SEED)
    X_test, X_val, y_test, y_val = train_test_split(
        X_val_test, y_val_test, test_size=0.5, random_state=RANDOM_SEED)
        
    return X_train, y_train, X_val, y_val, X_test, y_test

In [100]:
def preprocess(df: pd.DataFrame, is_clean: bool = False):
    '''
    Preprocesses the dataset by cleaning it and splitting it into train, validation and test sets.

    Args:
        df (pd.DataFrame): The dataset to preprocess
        is_clean (bool): Whether to load a clean version of the dataset or apply cleaning
    '''
    if not is_clean:
        df = df.dropna()
        df_clean = clean(df)
        df_clean = df_clean.dropna()
        df_clean.to_pickle('../fr-clean-data' + str(int(DATASET_LENGTH/1_000)) + '.pkl')
    else:
        df_clean = pd.read_pickle('../fr-clean-data' + str(int(DATASET_LENGTH/1_000)) + '.pkl')
        
    X_train, y_train, X_val, y_val, X_test, y_test = split(df_clean)
    return X_train, y_train, X_val, y_val, X_test, y_test

In [112]:
df = pd.read_csv(DATASET_PATH, nrows=DATASET_LENGTH)

In [113]:
X_train, y_train, X_val, y_val, X_test, y_test = preprocess(df, is_clean=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['en'] = df['en'].apply(lambda x: clean_data(x, expand=True, lemma=True))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['fr'] = df['fr'].apply(lambda x: clean_data(x, expand=False, lemma=False))


## Encoding

Now that the data is clean and split, let us look at what the sentence length tends to be for English and French. We see that the length 35 would cover most of the corpus in both cases.

In [102]:
def plot_sent_len(lenghts, lang):
    plt.hist(lenghts, bins=200)
    plt.xlabel('Sentence Length')
    plt.ylabel('Frequency')
    plt.title(lang + ' Sentence Length Distribution')
    plt.xlim(0, 100)
    plt.xticks(np.arange(0, 100, 5))
    plt.show()

In [None]:
if SHOW_SENTENCE_LENGHTS:
    en_sen_lens = [len(sentence.split()) for sentence in X_train]
    fr_sen_lens = [len(sentence.split()) for sentence in y_train]
    plot_sent_len(en_sen_lens, "English")
    plot_sent_len(fr_sen_lens, "French")

In [104]:
def create_tokenizer(text: pd.Series , max_words: int = 0):
    '''
    Creates a tokenizer and fits it on the specified text.

    Args:
        text (pd.Series): The text to fit the tokenizer on
        max_words (int): The maximum number of words to keep (0 means no limit)
    '''
    if max_words == 0:
        tokenizer = Tokenizer()
    else:
        tokenizer = Tokenizer(num_words=max_words)
    tokenizer.fit_on_texts(text)
    return tokenizer


def encode_sequences(tokenizer: Tokenizer, text: pd.Series, pad_len: int = ENCODING_LENGTH):
    '''
    Encodes the sequences using the specified tokenizer.

    Args:
        tokenizer (Tokenizer): The tokenizer to use 
        text (pd.Series): The text to encode
        pad_len (int): The maximum length of the sequences
    '''
    seq = tokenizer.texts_to_sequences(text)
    seq = pad_sequences(seq, maxlen=pad_len, padding='post')
    return seq


def get_encodings(
        X : pd.Series,
        y: pd.Series, 
        is_train: bool = False, 
        maxlen: int = ENCODING_LENGTH,
        sc_tokenizer: Tokenizer = None, 
        tg_tokenizer: Tokenizer = None, 
    ) -> Tuple[np.ndarray, np.ndarray, Tokenizer, Tokenizer]:
    '''
    Encodes the sequences using the specified tokenizers.

    Args:
        X (pd.Series): The input sequences
        y (pd.Series): The target sequences
        is_train (bool): Whether to create new tokenizers or not
        maxlen (int): The maximum length of the sequences
        sc_tokenizer (Tokenizer): The source language tokenizer
        tg_tokenizer (Tokenizer): The target language tokenizer
    '''
    # Only create and fit a new tokenizer on the training set
    if is_train:
        sc_tokenizer = create_tokenizer(X)
        tg_tokenizer = create_tokenizer(y)

    X_encoded = encode_sequences(sc_tokenizer, X, maxlen)
    y_encoded = encode_sequences(tg_tokenizer, y, maxlen)

    return X_encoded, y_encoded, sc_tokenizer, tg_tokenizer

In [None]:
 # Turn sentences into tokenized and padded sequences
X_train_encoded, y_train_encoded, en_tokenizer, fr_tokenizer = get_encodings(X_train, y_train, is_train=True)

if DEBUG:
    print(f"English vocabulary size: {len(en_tokenizer.word_index) + 1}")
    print(f"French vocabulary size: {len(fr_tokenizer.word_index) + 1}")
    print(f"An encoded row of the English training set: {X_train_encoded[860]}")
    print(f"The corrresponding sentence: {X_train.iloc[860]}")
    print(f"And its French translation: {y_train.iloc[860]}")

X_val_encoded, y_val_encoded, _, _ = get_encodings(X_val, y_val, sc_tokenizer=en_tokenizer, tg_tokenizer=fr_tokenizer)

X_test_encoded, y_test_encoded, _, _ = get_encodings(X_test, y_test, sc_tokenizer=en_tokenizer, tg_tokenizer=fr_tokenizer)

# MODEL DEVELOPMENT

Here we define our model, the training and prediction functions. Then, we train/load the model and evaluate it on the test set.

## Model

In [106]:
# Declare model hyperparameters
BATCH_SIZE = 128
EPOCHS = 30
PATIENCE = 5
UNITS = 256

SAVE_MODEL = True
LOAD_MODEL = False

In [107]:
from keras.layers import LSTM, Embedding, Dense, RepeatVector, TimeDistributed
from keras.models import Sequential
from keras.callbacks import EarlyStopping

def define_model(
        in_vocab_size: int, 
        out_vocab_size: int, 
        in_seq_length: int = ENCODING_LENGTH, 
        out_seq_length: int = ENCODING_LENGTH, 
        units: int = UNITS
    ) -> Sequential:
    '''
    Defines the model architecture.

    Args:
        in_vocab_size (int): The size of the source language vocabulary
        out_vocab_size (int): The size of the target language vocabulary
        in_seq_length (int): The maximum length of the source language sequences
        out_seq_length (int): The maximum length of the target language sequences
        units (int): Number of neurons in each layer
    '''
    model = Sequential()

    # Embedding
    model.add(Embedding(input_dim=in_vocab_size,
              output_dim=units, input_length=in_seq_length, mask_zero=True))
    # Encoder
    model.add(LSTM(units))
    # Decoder
    model.add(RepeatVector(out_seq_length))
    # GRU
    model.add(LSTM(units, return_sequences=True))
    # Prediction
    model.add(TimeDistributed(Dense(out_vocab_size, activation='softmax')))

    model.compile(loss='sparse_categorical_crossentropy',
                  optimizer="RMSprop",
                  metrics="accuracy")
    
    return model

## Training

In [108]:
def train(
        model: Sequential, 
        X_train: np.ndarray, 
        y_train: np.ndarray, 
        X_val: np.ndarray, 
        y_val: np.ndarray
    ):
    """
    Trains the model.

    Args:
        model (Sequential): The model to train
        X_train (np.ndarray): The source language sequences from the training set
        y_train (np.ndarray): The target language sequences from the training set
        X_val (np.ndarray): The source language sequences from the validation set
        y_val (np.ndarray): The target language sequences from the validation set
    """
    history = model.fit(
        X_train, y_train, batch_size=BATCH_SIZE, epochs=EPOCHS, 
        validation_data=(X_val, y_val), verbose=1,
        callbacks=[
            EarlyStopping(
                monitor='val_loss',
                patience=PATIENCE,
                restore_best_weights=True
            )
        ]
    )
    if SAVE_MODEL:
        model.save('../fr-model.h5')
    return history

In [None]:
# Define the model
if LOAD_MODEL:
    from keras.models import load_model
    from pickle import load
    fr_tokenizer = load(open("../fr_tokenizer.pkl", "rb"))
    model = load_model("../fr-model.h5")
    
else:
    en_vocab_size = len(en_tokenizer.word_index) + 1
    fr_vocab_size = len(fr_tokenizer.word_index) + 1
    model = define_model(en_vocab_size, fr_vocab_size)
    if SAVE_MODEL:
        from pickle import dump
        dump(fr_tokenizer, open("../fr_tokenizer.pkl", "wb"))

if DEBUG:
    model.summary()

In [111]:
from keras.utils import plot_model
plot_model(model, show_shapes=True, show_dtype=True, show_layer_names=True)

In [None]:
# Train the model
history = train(model, X_train_encoded, y_train_encoded, X_val_encoded, y_val_encoded)
# summarize history for loss
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.title('Model loss')
plt.ylabel('loss')
plt.xlabel('epoch')
plt.legend(['train', 'validation'], loc='upper right')
plt.show()

## Evaluation

In [None]:
def vector_to_word(prediction: np.ndarray, tokenizer: Tokenizer):
    '''
    Converts the most likely word index in the prediction back to text.

    Args:
        prediction (np.ndarray): A probability distribution over the vocabulary
        tokenizer (Tokenizer): The tokenizer to use
    '''
    idx = np.argmax(prediction)

    for word, transform in tokenizer.word_index.items():
        if transform == idx:
            return word
    return None


def get_sentences(sequences, tokenizer: Tokenizer):
    '''
    Converts a list of sequences to a list of sentences.

    Args:
        sequences (list): The prediction of the model
        tokenizer (Tokenizer): The tokenizer to use
    '''
    predictions = []
    for sentence in sequences:
        predict = ''
        for word_pred in sentence:
            word = vector_to_word(word_pred, tokenizer)
            if word is not None:
                predict += word + ' '
        predictions.append(predict)
    return predictions

In [64]:
prediction = model.predict(X_test_encoded)
prediction_sentences = get_sentences(prediction, fr_tokenizer)
candidate_translations = [[sentence] for sentence in prediction_sentences]
print(candidate_translations)
# todo make this nicer and add bleu score