In [83]:
# Imports
from nltk.corpus import wordnet
from nltk.tokenize import wordpunct_tokenize
from nltk import pos_tag
from nltk.stem import WordNetLemmatizer
from nltk import download as nltk_download

from keras.preprocessing.text import Tokenizer
from keras.utils import pad_sequences
from keras.utils import to_categorical

from sklearn.model_selection import train_test_split

import pandas as pd
import numpy as np

import contractions

import re
from pickle import dump
from typing import Tuple

In [84]:
# Global variables, constants and some other pre-requisites

# Single shared lemmatizer instance, reused by every clean_data call
lemmatizer = WordNetLemmatizer()

# Seed for reproducible train/val/test splits.
# RANDOM_SEED is the correctly spelled name; RADNOM_SEED is kept as an alias
# because other cells in this notebook still reference the original spelling.
RANDOM_SEED = 42
RADNOM_SEED = RANDOM_SEED

DATASET_PATH = "../en-fr.csv"
# Row cap for the dataset (matches the nrows used when the CSV is loaded)
DATASET_LENGTH = 100_000

# Downloading necessary nltk data
nltk_download('averaged_perceptron_tagger')
nltk_download('wordnet')
nltk_download('punkt')

# Debug Mode Flag (for printing stuff)
DEBUG = False

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/atakan/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /home/atakan/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /home/atakan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [85]:
# Load the dataset, reading at most DATASET_LENGTH rows from the CSV
# (the cap lives in the config cell instead of being repeated here as a literal)
df = pd.read_csv(DATASET_PATH, nrows=DATASET_LENGTH)

if DEBUG:
    print(df.head(10))

In [87]:
def get_wordnet_pos(treebank_tag):
    '''
    Translate a treebank-style POS tag into the WordNet POS constant
    expected by the lemmatizer.

    Args:
        treebank_tag (str): POS tag as produced by the tagger (e.g. "VBD")

    Returns:
        wordnet.ADJ, wordnet.VERB, wordnet.NOUN or wordnet.ADV depending on
        the first letter of the tag; wordnet.NOUN for any other tag.
    '''
    prefix_to_pos = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('N', wordnet.NOUN),
        ('R', wordnet.ADV),
    )
    for prefix, wordnet_pos in prefix_to_pos:
        if treebank_tag.startswith(prefix):
            return wordnet_pos
    # Unrecognised tags fall back to noun
    return wordnet.NOUN

def clean_data(doc: str, expand: bool, lemma: bool):
    '''
    Normalise a single document and turn it into a list of tokens.

    The text is optionally contraction-expanded, lowercased, and every run of
    characters that is not a listed letter (a-z plus the accented characters
    in the pattern), a digit, or one of ". ! ?" is collapsed into one space.
    The result is tokenized and, if requested, lemmatized using POS tags.

    Args:
        doc (str): The document to clean
        expand (bool): Whether to expand contractions or not
        lemma (bool): Whether to lemmatize or not

    Returns:
        The list of cleaned tokens.
    '''
    text = contractions.fix(doc) if expand else doc

    # Lowercase first, then blank out every disallowed character run
    text = re.sub(
        r'[^ùûüÿàâæçéèêëïîôœÙÛÜŸÀÂÆÇÉÈÊËÏÎÔŒa-z0-9.!?]+', ' ', text.lower())
    tokens = wordpunct_tokenize(text)

    if not lemma:
        return tokens

    # Lemmatize each token with the WordNet POS derived from its tag
    return [
        lemmatizer.lemmatize(word, get_wordnet_pos(tag))
        for word, tag in pos_tag(tokens)
    ]

In [88]:
def clean(df: pd.DataFrame) -> pd.DataFrame:
    '''
    Cleans the dataset by applying the clean_data function to each row.

    The English column is contraction-expanded and lemmatized; the French
    column is only normalised and tokenized. The input frame is left
    untouched: a new DataFrame with the replaced 'en' and 'fr' columns is
    returned, so re-running a cell on the same input gives the same result.

    Args:
        df (pd.DataFrame): The dataset to clean; must have 'en' and 'fr' columns

    Returns:
        pd.DataFrame: A copy of df whose 'en' and 'fr' cells hold token lists
    '''
    # assign() builds a new frame instead of writing into the caller's object
    return df.assign(
        en=df['en'].apply(lambda x: clean_data(x, expand=True, lemma=True)),
        fr=df['fr'].apply(lambda x: clean_data(x, expand=False, lemma=False)),
    )

In [89]:
def split(df: pd.DataFrame) -> Tuple[pd.Series, pd.Series, pd.Series, pd.Series, pd.Series, pd.Series]:
    '''
    Splits the dataset into train, validation and test sets,
    using the split ratio 80-10-10.

    The 'en' column is the source (X) and the 'fr' column is the target (y).
    Both splits use the same fixed seed so the partition is reproducible.

    Args:
        df (pd.DataFrame): The dataset to split; must have 'en' and 'fr' columns

    Returns:
        Six pd.Series in the order
        (X_train, y_train, X_val, y_val, X_test, y_test).
    '''
    # First cut: 80% train, 20% held out
    X_train, X_val_test, y_train, y_val_test = train_test_split(
        df["en"], df["fr"], test_size=0.2, random_state=RADNOM_SEED)
    # Second cut: halve the held-out part into test and validation
    X_test, X_val, y_test, y_val = train_test_split(
        X_val_test, y_val_test, test_size=0.5, random_state=RADNOM_SEED)

    return X_train, y_train, X_val, y_val, X_test, y_test

In [90]:
def create_tokenizer(text, max_words=0):
    '''
    Build a Tokenizer and fit it on the given texts.

    Args:
        text: The texts to fit the tokenizer on
        max_words (int): Value passed as num_words; 0 means the Tokenizer
            is created with its default settings

    Returns:
        The fitted Tokenizer.
    '''
    # A max_words of 0 is the "no limit requested" sentinel
    tokenizer = Tokenizer() if max_words == 0 else Tokenizer(num_words=max_words)
    tokenizer.fit_on_texts(text)
    return tokenizer


def encode_sequences(tokenizer, text, pad_len):
    '''
    Convert texts to integer sequences and pad them at the end.

    Args:
        tokenizer: Fitted tokenizer providing texts_to_sequences
        text: The texts to encode
        pad_len (int): maxlen handed to pad_sequences

    Returns:
        The padded sequences as returned by pad_sequences.
    '''
    return pad_sequences(
        tokenizer.texts_to_sequences(text), maxlen=pad_len, padding='post')


def get_encodings(X, y=None, is_train=False, sc_tokenizer=None, tg_tokenizer=None, maxlen=40):
    '''
    Encode source (and optionally target) texts as padded integer sequences.

    Tokenizers are only created and fitted when is_train is True; for every
    other split the already-fitted tokenizers must be passed in.

    Args:
        X: Source texts
        y: Target texts, or None to encode the source side only
        is_train (bool): Fit new tokenizers on X (and y, if given)
        sc_tokenizer: Fitted source tokenizer (required unless is_train)
        tg_tokenizer: Fitted target tokenizer (required when y is given,
            unless is_train)
        maxlen (int): Length every sequence is padded to

    Returns:
        (X_encoded, y_encoded, sc_tokenizer, tg_tokenizer); y_encoded is None
        when y is None.

    Raises:
        ValueError: If a tokenizer needed for encoding is missing.
    '''
    # Only create and fit a new tokenizer on the training set
    if is_train:
        sc_tokenizer = create_tokenizer(X)
        if y is not None:
            tg_tokenizer = create_tokenizer(y)

    # Fail with a clear message instead of an AttributeError on None
    if sc_tokenizer is None:
        raise ValueError("sc_tokenizer must be provided when is_train is False")
    if y is not None and tg_tokenizer is None:
        raise ValueError("tg_tokenizer must be provided when is_train is False and y is given")

    X_encoded = encode_sequences(sc_tokenizer, X, maxlen)
    # y is optional (its default is None), e.g. when only sources are encoded
    y_encoded = None if y is None else encode_sequences(tg_tokenizer, y, maxlen)

    return X_encoded, y_encoded, sc_tokenizer, tg_tokenizer

In [91]:
def preprocess(df, is_clean):
    '''
    Produce the train/validation/test splits from raw or cached data.

    Args:
        df (pd.DataFrame): Raw dataset; only used when is_clean is False
        is_clean (bool): If True, load the previously cleaned data from the
            pickle cache; otherwise drop missing rows, clean df and write
            the result to the cache

    Returns:
        (X_train, y_train, X_val, y_val, X_test, y_test) as returned by split.
    '''
    if is_clean:
        # NOTE(review): read_pickle executes arbitrary code from the file;
        # only use it on a cache this notebook wrote itself.
        df_clean = pd.read_pickle('../fr-clean-data20.pkl')
    else:
        # Drop missing rows before and after cleaning, then refresh the cache
        df_clean = clean(df.dropna()).dropna()
        df_clean.to_pickle('../fr-clean-data20.pkl')

    X_train, y_train, X_val, y_val, X_test, y_test = split(df_clean)
    return X_train, y_train, X_val, y_val, X_test, y_test

In [92]:
# Clean and split only the first 20,000 rows of the loaded frame.
# is_clean=False makes preprocess re-clean the data and rewrite its pickle cache.
X_train, y_train, X_val, y_val, X_test, y_test = preprocess(df.head(20_000), is_clean=False)

In [None]:
# Turn sentences into tokenized and padded sequences.
# The tokenizers are fitted on the training split only...
X_train_encoded, y_train_encoded, en_tokenizer, fr_tokenizer = get_encodings(
    X_train, y_train, is_train=True)

if DEBUG:
    print(len(en_tokenizer.word_index) + 1)
    print(len(fr_tokenizer.word_index) + 1)

# ...and reused unchanged for the validation and test splits
X_val_encoded, y_val_encoded, _, _ = get_encodings(
    X=X_val, y=y_val, is_train=False,
    sc_tokenizer=en_tokenizer, tg_tokenizer=fr_tokenizer)

X_test_encoded, y_test_encoded, _, _ = get_encodings(
    X_test, y_test, is_train=False,
    sc_tokenizer=en_tokenizer, tg_tokenizer=fr_tokenizer)

# Each file receives the source array followed by the target array,
# written back to back into the same handle
for _out_path, _arrays in (
    ('../fr_train_data20.npy', (X_train_encoded, y_train_encoded)),
    ('../fr_test_data20.npy', (X_test_encoded, y_test_encoded)),
    ('../fr_valid_data20.npy', (X_val_encoded, y_val_encoded)),
):
    with open(_out_path, 'wb') as f:
        for _array in _arrays:
            np.save(f, _array)

In [None]:
from keras.layers import LSTM, Embedding, Bidirectional, Dense, RepeatVector, TimeDistributed
from keras.models import Sequential
from keras.callbacks import EarlyStopping
from keras.optimizers import Adam

def define_model(in_vocab_size, out_vocab_size, embedding_matrix=None, in_seq_length=40, out_seq_length=40, embedding_size=50):
    '''
    Build and compile the encoder-decoder translation model.

    Architecture: Embedding -> Bidirectional LSTM (encoder) -> RepeatVector
    -> LSTM returning sequences (decoder) -> per-timestep softmax over the
    output vocabulary.

    Args:
        in_vocab_size (int): Size of the source vocabulary (Embedding input_dim)
        out_vocab_size (int): Size of the target vocabulary (softmax width)
        embedding_matrix: Accepted for interface compatibility but currently
            not used; the Embedding layer is trained from scratch
        in_seq_length (int): Length of the padded source sequences
        out_seq_length (int): Number of decoder timesteps
        embedding_size (int): Dimension of the learned embeddings

    Returns:
        The compiled Sequential model (its summary is printed as a side effect).
    '''
    model = Sequential()
    model.add(Embedding(input_dim=in_vocab_size,
              output_dim=embedding_size, input_length=in_seq_length))
    # Encoder
    model.add(Bidirectional(LSTM(128)))
    # Decoder: repeat the encoder state once per output timestep
    model.add(RepeatVector(out_seq_length))
    model.add(LSTM(256, return_sequences=True))
    # Prediction
    model.add(TimeDistributed(Dense(out_vocab_size, activation='softmax')))

    # metrics is passed as a list: that is the documented form, and newer
    # Keras versions reject a bare string here
    model.compile(loss='sparse_categorical_crossentropy',
                  optimizer=Adam(learning_rate=0.003),
                  metrics=["accuracy"])
    model.summary()
    return model

In [None]:
def train(model, X_train, y_train, X_val, y_val,
          epochs=1, batch_size=128, patience=2, model_path='../fr-model.h5'):
    '''
    Fit the model with early stopping on validation loss and save it.

    Args:
        model: Compiled Keras model
        X_train, y_train: Encoded training sources and targets
        X_val, y_val: Encoded validation sources and targets
        epochs (int): Maximum number of epochs to train for
        batch_size (int): Mini-batch size
        patience (int): Epochs without val_loss improvement before stopping
        model_path (str): Where the trained model is saved

    Returns:
        The History object returned by model.fit.
    '''
    # The best weights are restored when early stopping triggers
    early_stopping = EarlyStopping(
        monitor='val_loss',
        patience=patience,
        restore_best_weights=True
    )
    history = model.fit(X_train, y_train, batch_size=batch_size, epochs=epochs,
                        validation_data=(X_val, y_val),
                        verbose=1,
                        callbacks=[early_stopping])
    model.save(model_path)
    return history

In [None]:
# Vocabulary sizes; the +1 leaves room for index 0, which the padded sequences
# use as filler (presumably word_index starts at 1 — Keras Tokenizer convention)
en_vocab_size = len(en_tokenizer.word_index) + 1
fr_vocab_size = len(fr_tokenizer.word_index) + 1
print(en_vocab_size, fr_vocab_size)

# Build the model here so the training cell works on a fresh kernel
# (with this line commented out, `model` was undefined on Restart & Run All)
model = define_model(en_vocab_size, fr_vocab_size)

16195 23101


In [None]:
# Fit on the encoded training split, validating on the encoded validation split.
# Expects `model` to have been built beforehand (e.g. via define_model).
history = train(model, X_train_encoded, y_train_encoded, X_val_encoded, y_val_encoded)

In [None]:
def vector_to_word(embedding, tokenizer):
    '''
    Map one prediction vector to the word with the highest score.

    Args:
        embedding: Score vector over the vocabulary for a single timestep
        tokenizer: Fitted tokenizer exposing word_index (word -> index) and,
            optionally, index_word (index -> word)

    Returns:
        The word whose index is the argmax of the vector, or None when no
        word has that index (e.g. the padding index).
    '''
    idx = int(np.argmax(embedding))

    # Prefer the tokenizer's reverse map for a constant-time lookup
    index_word = getattr(tokenizer, 'index_word', None)
    if index_word:
        return index_word.get(idx)

    # Fallback for tokenizer-like objects that only provide word_index
    for word, transform in tokenizer.word_index.items():
        if transform == idx:
            return word
    return None


def get_sentences(sequences, tokenizer):
    '''
    Decode predicted sequences into sentences.

    Args:
        sequences: Iterable of sentences, each an iterable of per-timestep
            prediction vectors
        tokenizer: Tokenizer passed through to vector_to_word

    Returns:
        One string per input sentence; every decoded word is followed by a
        single space, and timesteps that decode to None are skipped.
    '''
    def decode(sentence):
        words = (vector_to_word(emb, tokenizer) for emb in sentence)
        # Each kept word carries its own trailing space
        return ''.join(word + ' ' for word in words if word is not None)

    return [decode(sentence) for sentence in sequences]

from keras.models import load_model
# NOTE(review): this loads "../fr-model-csanad.h5", not the "../fr-model.h5"
# file that train() writes by default — confirm which checkpoint is intended.
model = load_model("../fr-model-csanad.h5")
# Predict on the first 100 encoded training sequences
prediction = model.predict(X_train_encoded[0:100])
# Decode the per-timestep predictions back into French sentences
prediction_sentences = get_sentences(prediction, fr_tokenizer)
# Wrap each decoded sentence in its own single-element list
candidate_translations = [[sentence] for sentence in prediction_sentences]
print(candidate_translations)

In [None]:
from pickle import dump

# Persist the fitted French tokenizer; the context manager guarantees the
# file handle is flushed and closed (the bare open() call never closed it)
with open("../fr_tokenizer.pkl", "wb") as tokenizer_file:
    dump(fr_tokenizer, tokenizer_file)