In [41]:
# Imports
from nltk.corpus import wordnet
from nltk.tokenize import wordpunct_tokenize
from nltk import pos_tag
from nltk.stem import WordNetLemmatizer
from nltk import download as nltk_download

from keras.preprocessing.text import Tokenizer;
from keras.utils import pad_sequences;
from keras.utils import to_categorical;

from sklearn.model_selection import train_test_split

import pandas as pd
import numpy as np

import contractions

import re
from pickle import dump
from typing import Tuple

In [42]:
# Global variables, constants and some other pre-requisites
# Lemmatizer as singleton, set seed for reproducibility
lemmatizer = WordNetLemmatizer()
RANDOM_SEED = 42
DATASET_PATH = "../en-fr.csv"
DATASET_LENGTH = 30_000
ENCODING_LENGTH = 30

# Downloading necessary nltk data
nltk_download('averaged_perceptron_tagger')
nltk_download('wordnet')
nltk_download('punkt')

# Global Flags
DEBUG = True
SAVE_DATA = False # Save processed data to disk
SHOW_SENTENCE_LENGTS = False # Show the sentence lengths of the dataset, for choosing the ENCODING_LENGTH

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/teodorastereciu/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/teodorastereciu/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/teodorastereciu/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [43]:
if SHOW_SENTENCE_LENGTS:
    import matplotlib.pyplot as plt


# Exploratory data analysis to find the ENCODING_LENGTH
def add_sentence_length_to_df(df: pd.DataFrame) -> pd.DataFrame:
    """
    Adds a column to the dataframe with the length of the sentence in words
    """

    def count_words(sentence: str) -> int:
        ''' Counts the number of words in a sentence, using vanilla python '''
        if isinstance(sentence, float):
            return 0
        return len(sentence.split())
    
    df.dropna(inplace=True)
    df['length'] = df['en'].apply(lambda x: count_words(x))
    return df


def plot_sentence_length(df: pd.DataFrame):
    """
    Plots the sentence length distribution
    """
    if not SHOW_SENTENCE_LENGTS:
        return
    
    plt.hist(df['length'], bins=200)
    plt.xlabel('Sentence Length')
    plt.ylabel('Frequency')
    plt.title('Sentence Length Distribution')
    plt.xlim(0, 100)
    plt.xticks(np.arange(0, 100, 5))
    plt.show()

In [44]:
# Load the dataset and print some info if in debug mode
df = pd.read_csv(DATASET_PATH, nrows=DATASET_LENGTH)

if DEBUG:
    if SAVE_DATA:
        df.to_csv(f"en-fr-{DATASET_LENGTH}.csv", index=False)
    if SHOW_SENTENCE_LENGTS:
        plot_sentence_length(add_sentence_length_to_df(df))
        
    print(df.head(10))

                                                  en  \
0  Changing Lives | Changing Society | How It Wor...   
1                                           Site map   
2                                           Feedback   
3                                            Credits   
4                                           Français   
5                                    What is light ?   
6  The white light spectrum Codes in the light Th...   
7  The sky of the first inhabitants A contemporar...   
8                                            Cartoon   
9                                              Links   

                                                  fr  
0  Il a transformé notre vie | Il a transformé la...  
1                                       Plan du site  
2                                        Rétroaction  
3                                            Crédits  
4                                            English  
5                          Qu’est-ce que la lumière? 

In [45]:
# Data cleaning and lemmaization
def get_wordnet_pos(treebank_tag):
    '''
    Helper function to convert the pos tag format 
    into something compatible with the lemmatizer.
    '''
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('N'):
        return wordnet.NOUN
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN

def clean_data(doc: str, expand: bool, lemma: bool):
    '''
    Cleans the data by removing non alphanumeric characters (except punctuation), 
    tokenizing and lemmatizing (if specified in the args).

    Args:
        doc (str): The document to clean
        expand (bool): Whether to expand contractions or not
        lemma (bool): Whether to lemmatize or not
    '''
    if expand:
        doc = contractions.fix(doc)

    # Remove every char that is not alphanumeric or end of sentence punctuation, keep spaces
    doc = doc.lower()
    doc = re.sub(r'[^ùûüÿàâæçéèêëïîôœÙÛÜŸÀÂÆÇÉÈÊËÏÎÔŒa-z0-9.!?]+', ' ', doc)
    doc = '<bos> ' + doc + ' <eos>'
    #tokens = wordpunct_tokenize(doc)

    # Lemmatize the tokens
    #if lemma:
    #    pos = pos_tag(tokens)
    #    clean_tokens = [lemmatizer.lemmatize(
    #        word, get_wordnet_pos(pos_tag)) for (word, pos_tag) in pos]
    #else:
    #    clean_tokens = tokens.insert(0,'<bos>')
    #    clean_tokens = clean_tokens.append('<eos>')
        
    return doc

def clean(df: pd.DataFrame) -> pd.DataFrame:
    '''
    Cleans the dataset by applying the clean_data function to each row.

    Args:
        df (pd.DataFrame): The dataset to clean
    '''
    df['en'] = df['en'].apply(lambda x: clean_data(x, expand=True, lemma=False))
    df['fr'] = df['fr'].apply(lambda x: clean_data(x, expand=False, lemma=False))
    
    return df

In [46]:
def split(df: pd.DataFrame) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    '''
    Splits the dataset into train, validation and test sets,
    using the split ratio 80-10-10.
        
    Args:
        df (pd.DataFrame): The dataset to split
    '''
    X_train, X_val_test, y_train, y_val_test = train_test_split(
        df["en"], df["fr"], test_size=0.2, random_state=RANDOM_SEED)
    X_test, X_val, y_test, y_val = train_test_split(
        X_val_test, y_val_test, test_size=0.5, random_state=RANDOM_SEED)
        
    return X_train, y_train, X_val, y_val, X_test, y_test

In [47]:
def create_tokenizer(text: pd.Series , max_words: int = 0):
    '''
    Creates a tokenizer and fits it on the specified text.

    Args:
        text (pd.Series): The text to fit the tokenizer on
        max_words (int): The maximum number of words to keep (0 means no limit)
    '''
    if max_words == 0:
        tokenizer = Tokenizer()
    else:
        tokenizer = Tokenizer(num_words=max_words)
    tokenizer.fit_on_texts(text)
    return tokenizer


def encode_sequences(tokenizer: Tokenizer, text: pd.Series, pad_type: str, pad_len: int = ENCODING_LENGTH):
    '''
    Encodes the sequences using the specified tokenizer.

    Args:
        tokenizer (Tokenizer): The tokenizer to use 
        text (pd.Series): The text to encode
        pad_type (str): The type of padding for the sequence, "pre" or "post"
        pad_len (int): The maximum length of the sequences
    '''
    seq = tokenizer.texts_to_sequences(text)
    seq = pad_sequences(seq, maxlen=pad_len, padding=pad_type)
    return seq


def get_encodings(
        X : pd.Series,
        y: pd.Series, 
        is_train: bool = False, 
        maxlen: int = ENCODING_LENGTH,
        sc_tokenizer: Tokenizer = None, 
        tg_tokenizer: Tokenizer = None, 
    ) -> Tuple[np.ndarray, np.ndarray, Tokenizer, Tokenizer]:
    '''
    Encodes the sequences using the specified tokenizers.

    Args:
        X (pd.Series): The input sequences
        y (pd.Series): The target sequences
        is_train (bool): Whether to create new tokenizers or not
        maxlen (int): The maximum length of the sequences
        sc_tokenizer (Tokenizer): The source language tokenizer
        tg_tokenizer (Tokenizer): The target language tokenizer
    '''
    # Only create and fit a new tokenizer on the training set
    if is_train:
        sc_tokenizer = create_tokenizer(X)
        tg_tokenizer = create_tokenizer(y)

    X_encoded = encode_sequences(sc_tokenizer, X, "pre", maxlen)
    y_encoded = encode_sequences(tg_tokenizer, y, "post", maxlen)

    return X_encoded, y_encoded, sc_tokenizer, tg_tokenizer

In [48]:
def preprocess(df: pd.DataFrame, is_clean: bool = False):
    '''
    Preprocesses the dataset by cleaning it and splitting it into train, validation and test sets.

    Args:
        df (pd.DataFrame): The dataset to preprocess
        is_clean (bool): Whether to load a clean version of the dataset or not
    '''
    if not is_clean:
        df = df.dropna()
        df_clean = clean(df)
        df_clean = df_clean.dropna()
        df_clean.to_pickle('../fr-clean-data' + str(int(DATASET_LENGTH/1_000)) + '.pkl')
    else:
        df_clean = pd.read_pickle('../fr-clean-data' + str(int(DATASET_LENGTH/1_000)) + '.pkl')
        
    X_train, y_train, X_val, y_val, X_test, y_test = split(df_clean)
    return X_train, y_train, X_val, y_val, X_test, y_test

# PRE-PROCESSING

Here we apply the functions we have defined above to pre-process the data. The pre-processing steps are:

- Cleaning the data
    - Remove every char that is not alphanumeric or end of sentence punctuation, keep spaces
    - Lemmatization
- Tokenization
- Encoding

In [49]:
# Apply pre-processing to the dataset
# In a separate cell to avoid re-running it every time (takes a while)
X_train, y_train, X_val, y_val, X_test, y_test = preprocess(df, is_clean=False);

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['en'] = df['en'].apply(lambda x: clean_data(x, expand=True, lemma=False))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['fr'] = df['fr'].apply(lambda x: clean_data(x, expand=False, lemma=False))


In [53]:
 # Turn sentences into tokenized and padded sequences
X_train_encoded, y_train_encoded, en_tokenizer, fr_tokenizer = get_encodings(X_train, y_train, is_train=True)

if DEBUG:
    print(len(en_tokenizer.word_index) + 1)
    print(len(fr_tokenizer.word_index) + 1)

X_val_encoded, y_val_encoded, _, _ = get_encodings(X=X_val, is_train=False, y=y_val, sc_tokenizer=en_tokenizer, tg_tokenizer=fr_tokenizer)

X_test_encoded, y_test_encoded, _, _ = get_encodings(X_test, y_test, is_train=False, sc_tokenizer=en_tokenizer, tg_tokenizer=fr_tokenizer)

with open('../fr_train_data' + str(int(DATASET_LENGTH/1_000)) + '.npy', 'wb') as f: 
            np.save(f, X_train_encoded)
            np.save(f, y_train_encoded)

with open('../fr_test_data' + str(int(DATASET_LENGTH/1_000)) + '.npy', 'wb') as f:
            np.save(f, X_test_encoded)
            np.save(f, y_test_encoded)

with open('../fr_valid_data' + str(int(DATASET_LENGTH/1_000)) + '.npy', 'wb') as f:
            np.save(f, X_val_encoded)
            np.save(f, y_val_encoded)

24428
29395


# MODEL, TRAINING AND PREDICTION

Here we define our model, the training and prediction functions. Then, we train/load the model and predict the selected labels.

In [54]:
from keras.layers import GRU, Embedding, Bidirectional, Dense, RepeatVector, TimeDistributed
from keras.models import Sequential
from keras.callbacks import EarlyStopping
from keras.optimizers import Adam

def define_model(
        in_vocab_size: int, 
        out_vocab_size: int, 
        in_seq_length: int = ENCODING_LENGTH, 
        out_seq_length: int = ENCODING_LENGTH, 
        embedding_size: int = 50
    ) -> Sequential:
    '''
    Defines the model architecture.

    Args:
        in_vocab_size (int): The size of the source language vocabulary
        out_vocab_size (int): The size of the target language vocabulary
        in_seq_length (int): The maximum length of the source language sequences
        out_seq_length (int): The maximum length of the target language sequences
        embedding_size (int): The size of the embedding layer
    '''
    model = Sequential()

    # Embedding
    model.add(Embedding(input_dim=in_vocab_size,
              output_dim=embedding_size, input_length=in_seq_length))
    # Encoder
    model.add(Bidirectional(GRU(256)))
    # Decoder
    model.add(RepeatVector(out_seq_length))
    # GRU
    model.add(GRU(256, return_sequences=True))
    # Prediction
    model.add(TimeDistributed(Dense(out_vocab_size, activation='softmax')))

    model.compile(loss='sparse_categorical_crossentropy',
                  optimizer=Adam(),
                  metrics="accuracy")
    
    if DEBUG:
        model.summary()
    return model

In [60]:
# Declare model hyperparameters
BATCH_SIZE = 128
EPOCHS = 5
PATIENCE = 5
LOAD_MODEL = False

In [61]:
def train(
        model: Sequential, 
        X_train: np.ndarray, 
        y_train: np.ndarray, 
        X_val: np.ndarray, 
        y_val: np.ndarray
    ):
    """
    Trains the model.

    Args:
        model (Sequential): The model to train
        X_train (np.ndarray): The training set
        y_train (np.ndarray): The training labels
        X_val (np.ndarray): The validation set
        y_val (np.ndarray): The validation labels
    """
    history = model.fit(
        X_train, y_train, batch_size=BATCH_SIZE, epochs=EPOCHS, 
        validation_data=(X_val, y_val),verbose=1,
        callbacks=[
            EarlyStopping(
                monitor='val_loss',
                patience=PATIENCE,
                restore_best_weights=True
            )
        ]
    )
    model.save('../fr-model.h5')
    return history


def vector_to_word(embedding: np.ndarray, tokenizer: Tokenizer):
    '''
    Converts a vectorized word back to its original form.

    Args:
        embedding (np.ndarray): The vectorized word
        tokenizer (Tokenizer): The tokenizer to use
    '''
    idx = np.argmax(embedding)

    if DEBUG:
        print(idx)
    for word, transform in tokenizer.word_index.items():
        if transform == idx:
            return word
    return None


def get_sentences(sequences, tokenizer: Tokenizer):
    '''
    Converts a list of sequences to a list of sentences.

    Args:
        sequences (list): The list of sequences
        tokenizer (Tokenizer): The tokenizer to use
    '''
    predictions = []
    for sentence in sequences:
        predict = ''
        for emb in sentence:
            word = vector_to_word(emb, tokenizer)
            if word is not None:
                predict += word + ' '
        predictions.append(predict)
    return predictions

In [62]:
# Define the model
if LOAD_MODEL:
    from keras.models import load_model
    model = load_model("../fr-model-csanad.h5")
else:
    en_vocab_size = len(en_tokenizer.word_index) + 1
    fr_vocab_size = len(fr_tokenizer.word_index) + 1
    model = define_model(en_vocab_size, fr_vocab_size)

2023-06-17 16:08:34.979349: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_2_grad/concat/split_2/split_dim' with dtype int32
	 [[{{node gradients/split_2_grad/concat/split_2/split_dim}}]]
2023-06-17 16:08:34.980315: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_grad/concat/split/split_dim' with dtype int32
	 [[{{node gradients/split_grad/concat/split/split_dim}}]]
2023-06-17 16:08:34.982608: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You mus

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 30, 50)            1221400   
                                                                 
 bidirectional_1 (Bidirectio  (None, 512)              473088    
 nal)                                                            
                                                                 
 repeat_vector_1 (RepeatVect  (None, 30, 512)          0         
 or)                                                             
                                                                 
 gru_3 (GRU)                 (None, 30, 256)           591360    
                                                                 
 time_distributed_1 (TimeDis  (None, 30, 29395)        7554515   
 tributed)                                                       
                                                      

In [63]:
# Train the model
history = train(model, X_train_encoded, y_train_encoded, X_val_encoded, y_val_encoded)

Epoch 1/5


2023-06-17 16:08:50.386700: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_2_grad/concat/split_2/split_dim' with dtype int32
	 [[{{node gradients/split_2_grad/concat/split_2/split_dim}}]]
2023-06-17 16:08:50.387358: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_grad/concat/split/split_dim' with dtype int32
	 [[{{node gradients/split_grad/concat/split/split_dim}}]]
2023-06-17 16:08:50.387983: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You mus



2023-06-17 16:14:29.835121: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_2_grad/concat/split_2/split_dim' with dtype int32
	 [[{{node gradients/split_2_grad/concat/split_2/split_dim}}]]
2023-06-17 16:14:29.836108: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_grad/concat/split/split_dim' with dtype int32
	 [[{{node gradients/split_grad/concat/split/split_dim}}]]
2023-06-17 16:14:29.836772: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You mus

Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [64]:
# Predict on the first 100 training examples
prediction = model.predict(X_train_encoded[0:100])
prediction_sentences = get_sentences(prediction, fr_tokenizer)
candidate_translations = [[sentence] for sentence in prediction_sentences]
print(candidate_translations)

2023-06-17 16:47:18.013967: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_2_grad/concat/split_2/split_dim' with dtype int32
	 [[{{node gradients/split_2_grad/concat/split_2/split_dim}}]]
2023-06-17 16:47:18.014910: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_grad/concat/split/split_dim' with dtype int32
	 [[{{node gradients/split_grad/concat/split/split_dim}}]]
2023-06-17 16:47:18.015541: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You mus

2
2
2
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
0
0
0
0
0
0
0
0
0
2
1
1
1
1
1
1
1
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
2
2
1
1
1
1
1
1
1
1
1
1
1
1
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
2
2
2
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
0
0
0
0
0
0
0
0
0
0
0
2
2
2
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
0
0
0
0
0
0
0
0
0
2
2
2
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
0
0
0
0
0
0
0
0
0
2
2
1
1
1
1
1
1
1
1
1
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
2
2
2
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
0
0
0
0
0
0
0
0
0
0
2
2
2
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
0
0
0
0
0
0
0
0
2
2
2
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
0
0
0
0
0
0
0
0
2
2
2
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
0
0
0
0
0
0
0
2
2
1
1
1
1
1
1
1
1
1
1
1
1
1
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
2
2
1
1
1
1
1
1
1
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
2
2
1
1
1
1
1
1
1
1
1
1
1
1
1
1
0
0
0
0
0
0
0
0
0
0
0
0
0
0
2
2
1
1
1
1
1
1
1
1
1
1
1
1
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
2
2
1
1
1
1
1
1
1
1
1
1
1
1
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
2
2
2
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1


In [None]:
# Save the tokenizer (didn't remove this, might be useful to at least someone)
from pickle import dump

dump(fr_tokenizer, open("../fr_tokenizer.pkl", "wb"))