In [46]:
import math
import pandas as pd
import numpy as np
from nltk.translate.bleu_score import corpus_bleu
from sklearn.utils import shuffle
import nltk
import tensorflow as tf
from tensorflow import keras
from keras.layers import Input, LSTM, Dense
from keras.models import Model
from sklearn.model_selection import train_test_split

### Loading the Dataset
Here we load the data from the `dataset.csv` file (generated in the other script)

In [32]:
def load_data(path='data/dataset.csv'):
    """Read the question/answer dataset from a CSV file.

    Args:
        path: Location of the CSV file. Defaults to ``data/dataset.csv``,
            the file this notebook reads when called without arguments.

    Returns:
        pandas.DataFrame holding the columns found in the CSV.
    """
    return pd.read_csv(path)

### Data pre-processing
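Lowercase the text, put spaces around `,`, `.` and `?` so they become separate tokens, and replace every other run of non-alphanumeric characters (including newlines) with a single space. Apostrophes are kept.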

In [33]:
def lower_data(data):
    """Return ``data`` (a Series of strings) with every value lower-cased."""
    lowered = data.str.lower()
    return lowered
    
def clean_data(data):
    """Normalise punctuation and whitespace in a Series of strings.

    Steps, in order:
      1. ``,`` becomes `` ,`` (space before the comma).
      2. ``.`` becomes `` . `` (space on both sides).
      3. ``?`` becomes `` ?`` (space before the question mark).
      4. Every run of characters other than letters, digits, ``?``, ``'``,
         ``.`` and ``,`` is replaced by a single space. This also collapses
         repeated whitespace and removes newlines.

    Args:
        data: pandas Series of strings.

    Returns:
        A new Series with the replacements applied.
    """
    # All three punctuation replacements are literal. ``regex=False`` is
    # passed explicitly on each one so the result does not depend on the
    # pandas default for ``regex``.
    spaced = (
        data.str.replace(',', ' ,', regex=False)
            .str.replace('.', ' . ', regex=False)
            .str.replace('?', ' ?', regex=False)
    )
    return spaced.str.replace(r"[^a-zA-Z0-9?'.,]+", ' ', regex=True)

def get_data():
    """Load the dataset, normalise every column and return it shuffled.

    Each column is lower-cased with ``lower_data`` and then passed through
    ``clean_data``. The rows are finally reordered with
    ``sklearn.utils.shuffle``; no seed is passed to it here.
    """
    frame = load_data()
    for name in frame.columns:
        frame[name] = clean_data(lower_data(frame[name]))
    return shuffle(frame)

### Data analysis
Mean sentence length and standard deviation of sentence length

In [35]:
def print_data_analysis(data):
    """Print ``describe()`` statistics of the string lengths of both columns.

    Args:
        data: DataFrame with string columns ``question`` and ``answer``.

    The question summary is printed first, then a 100-character dashed
    separator, then the answer summary.
    """
    def length_summary(column):
        # Character count per row, summarised, then rendered in fixed-point
        # notation via format(value, 'f').
        stats = data[column].str.len().describe()
        return stats.apply(lambda value: format(value, 'f'))

    print('Central tendency, dispersion and shape of questions’s distribution')
    print(length_summary('question'))
    print('-'*100)
    print('Central tendency, dispersion and shape of answers’s distribution')
    print(length_summary('answer'))

In [36]:
#print_data_analysis(dataset)

In [37]:
def remove_outliers(data, max_question_len=100, max_answer_len=200):
    """Keep only the rows whose question and answer are short enough.

    Args:
        data: DataFrame with string columns ``question`` and ``answer``.
        max_question_len: Exclusive upper bound on the question length in
            characters. Defaults to 100.
        max_answer_len: Exclusive upper bound on the answer length in
            characters. Defaults to 200.

    Returns:
        A new DataFrame (a copy, not a view of ``data``) containing the rows
        that satisfy both bounds.
    """
    short_enough = (
        (data['question'].str.len() < max_question_len)
        & (data['answer'].str.len() < max_answer_len)
    )
    # .copy() so that later column assignments on the result do not write
    # into a slice of ``data`` (avoids pandas' SettingWithCopyWarning).
    return data[short_enough].copy()

def padd_data(data):
    """Wrap every question and answer in ``<start>`` / ``<end>`` markers.

    Args:
        data: DataFrame with columns ``question`` and ``answer``.

    Returns:
        A new DataFrame in which both columns read ``<start>...<end>``.
        The input frame is left unmodified.
    """
    # assign() builds a new frame instead of writing into ``data``, so the
    # caller's frame is not altered as a side effect.
    return data.assign(
        question=data['question'].apply(lambda text: f"<start>{text}<end>"),
        answer=data['answer'].apply(lambda text: f"<start>{text}<end>"),
    )

### Creating the dataset
Remove the outliers and wrap every question and answer in `<start>` and `<end>` markers.

In [38]:
def create_dataset(num_examples):
    """Build the lists of marked-up questions and answers.

    Args:
        num_examples: Maximum number of question/answer pairs to return, or
            ``None`` to return all of them.

    Returns:
        Tuple ``(questions, answers)`` of two equally long lists of strings.
    """
    dataset = remove_outliers(get_data())
    if num_examples is not None:
        # get_data() returns the rows already shuffled, so taking the first
        # num_examples rows does not favour the start of the CSV.
        dataset = dataset.head(num_examples)
    dataset = padd_data(dataset)
    return dataset['question'].tolist(), dataset['answer'].tolist()

### Tokenizing 
Tokenize the data, pad the sequences and create the vocabulary

In [39]:
def tokenize(text):
    """Fit a Keras tokenizer on ``text`` and encode ``text`` with it.

    Args:
        text: Collection of strings used both to build the vocabulary and
            as the input to encode.

    Returns:
        Tuple ``(padded, vocab)``: the matrix of integer ids produced by
        ``pad_sequences`` and the fitted ``Tokenizer``.
    """
    # The filter list leaves out ',', '.', '?' and "'", the punctuation
    # clean_data keeps as separate tokens.
    # NOTE(review): it does contain '<' and '>', so the <start>/<end> markers
    # presumably enter the vocabulary as plain "start"/"end" - confirm.
    vocab = tf.keras.preprocessing.text.Tokenizer(filters='!"#$%&()*+-:;<=>@[\\]^_{|}~\t')

    # Learn the word -> id mapping, then encode each string as a list of ids.
    vocab.fit_on_texts(text)
    encoded = vocab.texts_to_sequences(text)

    # maxlen=None: no explicit length cap; padding is added at the end.
    padded = tf.keras.preprocessing.sequence.pad_sequences(
        encoded, maxlen=None, padding='post', truncating='post'
    )
    return padded, vocab

### Load the clean and formatted data

In [40]:
def load_dataset(num_examples=None):
    """Create the dataset and tokenize questions and answers separately.

    Args:
        num_examples: Forwarded to ``create_dataset``. ``None`` (the
            default) requests the full dataset.

    Returns:
        Tuple ``(questions_tensor, answers_tensor, questions_tokenizer,
        answers_tokenizer)``.
    """
    # Forward the caller's value instead of a hard-coded None.
    questions, answers = create_dataset(num_examples=num_examples)

    # Questions and answers each get their own tokenizer and vocabulary.
    questions_tensor, questions_tokenizer = tokenize(questions)
    answers_tensor, answers_tokenizer = tokenize(answers)

    return questions_tensor, answers_tensor, questions_tokenizer, answers_tokenizer

In [41]:
# Build the encoded question/answer matrices and their tokenizers.
# load_dataset() is called with its default num_examples=None.
questions_tensor, answers_tensor, questions_tokenizer, answers_tokenizer = load_dataset()

### Split in train and test
Split 80% of the data to train and 20% for testing

In [52]:
# Second dimension of each encoded matrix, i.e. the padded sequence length.
max_length_input, max_length_target = questions_tensor.shape[1], answers_tensor.shape[1]
# test_size=0.2 holds out 20% of the pairs; the remaining 80% are for training.
input_train, input_test, target_train, target_test = train_test_split(questions_tensor, answers_tensor, test_size=0.2)

# Each label matches the split it reports (they were previously swapped).
print("Train count:", len(input_train))
print("Test count:", len(input_test))


Test count: 464364
Train count: 116092


In [50]:
def convert(text, tensor):
    """Print each non-zero id in ``tensor`` next to the word it maps to.

    Args:
        text: Object exposing an ``index_word`` mapping from id to word
            (the notebook passes its fitted tokenizers).
        tensor: Iterable of integer ids; entries equal to 0 are skipped.
    """
    for token_id in tensor:
        if token_id == 0:
            continue
        print ("%d ----> %s" % (token_id, text.index_word[token_id]))
      
# Show the id -> word mapping for the first training question...
print("Input question; index to word mapping")
convert(questions_tokenizer, input_train[0])
print()
# ...and for the first training answer, using the answers' own tokenizer.
print("Target awnser; index to word mapping")
convert(answers_tokenizer, target_train[0])

Input question; index to word mapping
3334 ----> represent
8 ----> your
2488 ----> nationality
920 ----> race
33 ----> who
6 ----> is
4 ----> the
25 ----> most
569 ----> beautiful
636 ----> woman
48 ----> from
8 ----> your
352 ----> part
10 ----> of
4 ----> the
110 ----> world
1 ----> ?

Target awnser; index to word mapping
416 ----> may
5 ----> i
2309 ----> present
3 ----> ,
722 ----> miss
38687 ----> huntington
76692 ----> whiteley
1 ----> .
2500 ----> nsfw


### Setting the hyperparameters

In [None]:
# Shuffle buffer size: the number of training examples.
BUFFER_SIZE = len(input_train)
# Number of examples per batch.
BATCH_SIZE = 64
# Number of full batches in the training set (integer division).
steps_per_epoch = len(input_train)//BATCH_SIZE
# NOTE(review): embedding_dim and units are not used in the cells visible
# here - presumably the embedding width and recurrent layer size of the
# model defined later; confirm.
embedding_dim = 256
units = 1024
# Vocabulary sizes: number of distinct words known to each tokenizer, plus one.
# NOTE(review): the +1 presumably accounts for id 0 (padding) - confirm.
vocab_input_size = len(questions_tokenizer.word_index) + 1
vocab_target_size = len(answers_tokenizer.word_index) + 1

In [None]:
# Training pipeline: pair inputs with targets, shuffle with a buffer of
# BUFFER_SIZE elements, then group into batches of BATCH_SIZE, dropping the
# final batch if it is incomplete.
dataset = (
    tf.data.Dataset.from_tensor_slices((input_train, target_train))
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE, drop_remainder=True)
)

In [None]:
# Pull one batch from the pipeline and display the shapes of its input and
# target parts as a sanity check.
example_input_batch, example_target_batch = next(iter(dataset))
example_input_batch.shape, example_target_batch.shape