In [1]:
import pandas as pd
import numpy as np
from nltk.translate.bleu_score import corpus_bleu
from sklearn.utils import shuffle
import nltk
import gensim
from gensim.models import Word2Vec
from tensorflow import keras
from keras.layers import Input, LSTM, Dense
from keras.models import Model

2023-06-22 17:59:47.759189: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-06-22 17:59:48.542996: E tensorflow/stream_executor/cuda/cuda_blas.cc:2981] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2023-06-22 17:59:49.992700: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2023-06-22 17:59:49.992959: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or 

### Building the Vocabulary

First, we build the vocabulary dictionaries for the source and target. 
The vocabulary is read from the file `data/vocab.30K.txt` (generated in the other script), one word per line.

In [2]:
# Word string -> ID mapping. IDs are assigned in file order, starting at 0.
dictionary = dict()

with open('data/vocab.30K.txt', encoding='utf-8') as f:
    for line in f:
        # Strip only the trailing newline. Slicing with [:-1] would cut the
        # last character of the final word when the file does not end with '\n'.
        dictionary[line.rstrip('\n')] = len(dictionary)

# ID -> word string mapping (inverse of `dictionary`).
reverse_dictionary = dict(zip(dictionary.values(), dictionary.keys()))

print('Dictionary:', list(dictionary.items())[:10], end = '\n')
print('Reverse dictionary:', list(reverse_dictionary.items())[:10], end = '\n')
print('Vocabulary size: ', len(dictionary), end = '\n')


Dictionary: [('<unk>', 0), ('<s>', 1), ('</s>', 2), ('.', 3), ('the', 4), (',', 5), ('a', 6), ('?', 7), ('to', 8), ('you', 9)]
Reverse dictionary: [(0, '<unk>'), (1, '<s>'), (2, '</s>'), (3, '.'), (4, 'the'), (5, ','), (6, 'a'), (7, '?'), (8, 'to'), (9, 'you')]
Vocabulary size:  30000


### Loading the Dataset
Here we load the data from the `dataset.csv` file (generated in the other script)

In [3]:
# Read the CSV into a DataFrame; later cells use its 'question' and 'answer' columns.
dataset = pd.read_csv('data/dataset.csv')

### Data pre-processing
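Convert the text to lowercase, remove brackets, parentheses, double quotes, exclamation marks and `/r/` markers, put spaces around the remaining punctuation (`,`, `.`, `?`) so that it is tokenized separately, and replace newlines with spaces.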

In [4]:
def lowerDataset(data):
    """Return the string Series ``data`` with every value converted to lowercase."""
    lowered = data.str.lower()
    return lowered
    
def cleanDataset(data):
    """Remove unwanted substrings from every value of a string Series.

    The substrings '/r/', ')', '(', ']', '[', '!' and '"' are deleted, in
    that order.

    Parameters
    ----------
    data : pandas.Series
        Series of strings to clean.

    Returns
    -------
    pandas.Series
        A new Series with the substrings removed.
    """
    # All patterns are plain substrings. Passing regex=False explicitly keeps
    # the behaviour independent of the pandas version (the default of `regex`
    # changed between releases) and avoids the related FutureWarning.
    for fragment in ('/r/', ')', '(', ']', '[', '!', '"'):
        data = data.str.replace(fragment, '', regex=False)
    return data
    
def paddDataset(data):
    """Put spaces around punctuation so it can be split off by whitespace.

    Applied in order: ',' -> ' ,', '.' -> ' . ', '?' -> ' ?' and
    newline -> ' '.

    Parameters
    ----------
    data : pandas.Series
        Series of strings to process.

    Returns
    -------
    pandas.Series
        A new Series with the replacements applied.
    """
    # Literal replacements only: regex=False is explicit so that '.' and '?'
    # are never interpreted as regular-expression metacharacters, whatever
    # the pandas default for `regex` is.
    substitutions = ((',', ' ,'), ('.', ' . '), ('?', ' ?'), ('\n', ' '))
    for pattern, padded in substitutions:
        data = data.str.replace(pattern, padded, regex=False)
    return data


In [5]:
# Punctuation was already separated by spaces in paddDataset, so splitting on
# whitespace is enough to obtain the tokens.
wt = nltk.tokenize.WhitespaceTokenizer()
for column in dataset.columns:
    dataset[column] = (
        dataset[column]
        .pipe(lowerDataset)
        .pipe(cleanDataset)
        .pipe(paddDataset)
        .apply(wt.tokenize)
    )
# A fixed seed keeps the row order (and everything derived from it) identical
# across runs of the notebook.
dataset = shuffle(dataset, random_state=42)

In [6]:
# Preview the first rows of the dataset after tokenization and shuffling.
dataset.head()

Unnamed: 0,question,answer
868670,"[you, are, opening, a, history-themed, 24, hou...","[manifest, breakfesty, -, build, your, own, br..."
40509,"[what, short, joke, is, so, stupid, but, funny...","[shout, out, to, all, the, people, who, don't,..."
415514,"[a, nuclear, meltdown, enables, everyone, on, ...","[raging, boner, ., ., ., it, will, give, me, t..."
354298,"[what, invention, is, way, older, than, people...","[the, ancient, romans, well, ,, the, wealthy, ..."
676325,"[what, made, the, difference, in, your, weight...","[realising, that, the, only, reason, i, was, w..."


### Data analysis
Mean sentence length and standard deviation of sentence length

In [15]:
# Summary statistics of the per-row length of the 'question' column,
# formatted as fixed-point strings.
print('Central tendency, dispersion and shape of questions’s distribution')
print(
    dataset['question']
    .str.len()
    .describe()
    .apply(lambda stat: f'{stat:f}')
)

Central tendency, dispersion and shape of questions’s distribution
count    1149819.000000
mean          17.113906
std            9.139078
min            1.000000
25%           11.000000
50%           15.000000
75%           21.000000
max           82.000000
Name: question, dtype: object


In [16]:
# Summary statistics of the per-row length of the 'answer' column,
# formatted as fixed-point strings.
print('Central tendency, dispersion and shape of answers’s distribution')
print(
    dataset['answer']
    .str.len()
    .describe()
    .apply(lambda stat: f'{stat:f}')
)

Central tendency, dispersion and shape of answers’s distribution
count    1149819.000000
mean          54.452828
std          844.371854
min            0.000000
25%           10.000000
50%           22.000000
75%           53.000000
max       563680.000000
Name: answer, dtype: object


### Update the sentences to fixed length
Pad or truncate every sentence to a fixed length so that the sentences can be processed in batches.

In [7]:
# Fixed number of tokens per sentence (start/end markers included), per column.
max_sent_length = {'question' : 30, 'answer': 60}

def padding_sent(source, data=None):
    """Return the sentences of one column padded/truncated to a fixed length.

    Every sentence gets a leading '<s>' token. Sentences that are too long
    are cut so that the last token is '</s>'; shorter ones are filled with
    '</s>' tokens. Each result has exactly ``max_sent_length[source]`` tokens.

    Parameters
    ----------
    source : str
        Key into ``max_sent_length`` and into ``data``
        (e.g. 'question' or 'answer').
    data : mapping of str -> iterable of token lists, optional
        Where to read the token lists from. Defaults to the module-level
        ``dataset``.

    Returns
    -------
    list of list of str
        Newly created token lists; the token lists in ``data`` are left
        untouched.
    """
    if data is None:
        data = dataset
    limit = max_sent_length[source]

    padded = []
    for tokens in data[source]:
        # Build a new list instead of inserting into `tokens`: mutating the
        # stored lists would add another '<s>' on every re-run of the cell.
        sentence = ['<s>'] + list(tokens)

        if len(sentence) >= limit:
            # Keep room for the end marker.
            sentence = sentence[:limit - 1] + ['</s>']
        else:
            sentence += ['</s>'] * (limit - len(sentence))

        padded.append(sentence)
    return padded

In [8]:
# Fixed-length token lists for each column of the dataset.
questions = padding_sent(source='question')
answers = padding_sent(source='answer')

### Create the reverse dataset

In [9]:
def create_reverse_dataset(source, vocab=None):
    """Convert tokenized sentences into lists of vocabulary ids.

    Parameters
    ----------
    source : iterable of iterable of str
        Tokenized sentences.
    vocab : dict of str -> int, optional
        Word-to-id mapping; must contain '<unk>' if any token is missing
        from it. Defaults to the module-level ``dictionary``.

    Returns
    -------
    list of list of int
        One list of ids per sentence, in the same order. Tokens that are not
        in the vocabulary are mapped to the id of '<unk>'.
    """
    if vocab is None:
        vocab = dictionary
    return [
        [vocab[token] if token in vocab else vocab['<unk>'] for token in tokens]
        for tokens in source
    ]

# Integer id matrices built from the fixed-length token lists.
inputs_indexes = np.array(
    create_reverse_dataset(source=questions),
    dtype=np.int32,
)
outputs_indexes = np.array(
    create_reverse_dataset(source=answers),
    dtype=np.int32,
)

### Word Embedding

In [11]:
# Train the embeddings on all padded sentences. The extra ['<unk>'] sentence
# adds the '<unk>' token to the training corpus; with min_count=1 every token
# that occurs at least once is kept.
model = Word2Vec(
    sentences=questions + answers + [['<unk>']],
    vector_size=100,
    window=5,
    min_count=1,
    workers=4,
)
model.save("word2vec.model")

In [None]:
model = Word2Vec.load("word2vec.model")

# Length of each one-hot vector: one slot per vocabulary id.
vocabulary_size = len(dictionary)

train_inputs = list()
# Must exist before the loop appends to it (it was previously never created).
train_outputs = list()

# NOTE(review): every output sentence becomes a list of dense float32 vectors
# of length `vocabulary_size`; for a large dataset this may not fit in
# memory -- consider integer targets with a sparse loss instead.
for input_index, output_index in zip(inputs_indexes, outputs_indexes):

    # Formats the input to the word2vec encoded format
    train_input = [model.wv[reverse_dictionary[index]] for index in input_index]
    train_inputs.append(train_input)

    # Formats the output to the one-hot-encoded format
    train_output = list()
    for index in output_index:
        output_encoded = np.zeros((vocabulary_size), dtype=np.float32)
        output_encoded[index] = 1
        train_output.append(output_encoded)
    train_outputs.append(train_output)