# Data Preprocessing
Cleaning the corpus and building the dictionaries that the Seq2Seq model will use.

In [1]:
# Importing the Libraries
import re, time, os
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
%matplotlib inline
os.environ["TF_CPP_MIN_LOG_LEVEL"]="3"

  from ._conv import register_converters as _register_converters


In [2]:
def con(var,num):
    """Print the size of a sequence followed by a short sample of it.

    Args:
        var: Any object supporting ``len()`` and slicing (list, str, ...).
        num: Number of leading elements to show in the sample.

    The sample starts at the first element; the earlier version sliced
    ``var[1:num]``, which silently skipped element 0 and showed one item
    fewer than requested.
    """
    print("Size: {}".format(len(var)))
    print("Sample:"+"\n"+"{}".format(var[:num]))

In [3]:
# Shared notebook state. Every container starts empty and is filled in by the
# cells below; each one is a distinct object (no aliasing between them).

# --- Hyper parameters and vocabulary settings ---
threshold = 24                                   # compared against word counts
tokens = ['<PAD>','<EOS>','<OUT>','<SOS>']       # special tokens
word_num = 0                                     # running id counter

# --- Raw corpus lookups ---
id2line = {}        # line id -> line text
conv_ids = []       # one list of line ids per conversation

# --- Question / answer pairs, raw and cleaned ---
questions = []
answers = []
clean_q = []
clean_a = []

# --- Vocabulary ---
w2c = {}            # word -> occurrence count
qw2int = {}         # question word -> integer id
aw2int = {}         # answer word -> integer id

# --- Integer-encoded sentences, unsorted and sorted by length ---
q2int = []
a2int = []
s_clean_q = []
s_clean_a = []

In [4]:
# Importing Cornell Movie Data
def _read_corpus_lines(path):
    """Return the contents of `path` split on newlines.

    The file is decoded as UTF-8 with undecodable bytes dropped, and is
    closed as soon as it has been read (the previous version left both
    file handles open for the lifetime of the kernel).
    """
    with open(path, encoding='utf-8', errors='ignore') as handle:
        return handle.read().split('\n')


lines = _read_corpus_lines('data/movie_lines.txt')
conversations = _read_corpus_lines('data/movie_conversations.txt')

In [5]:
# Build the line-id -> utterance lookup.
# A record is only used when it splits into exactly five fields; the first
# field is the id and the fifth is the spoken text.
for raw_record in lines:
    fields = raw_record.split(' +++$+++ ')
    if len(fields) != 5:
        continue
    line_id, utterance = fields[0], fields[4]
    id2line[line_id] = utterance

In [6]:
# Collect, for every conversation, the ordered list of its line ids.
# The final entry of `conversations` is skipped (it is what follows the last
# newline of the file).
for record in conversations[:-1]:
    id_field = record.split(' +++$+++ ')[-1]
    # Drop the first and last character of the field, then quotes and spaces,
    # leaving a plain comma-separated list of ids.
    bare_ids = id_field[1:-1].replace("'","").replace(" ","")
    conv_ids.append(bare_ids.split(','))

In [7]:
# Pair every line of a conversation with the line that follows it:
# the earlier line becomes the question, the later one the answer.
for conv in conv_ids:
    for asked_id, replied_id in zip(conv, conv[1:]):
        questions.append(id2line[asked_id])
        answers.append(id2line[replied_id])

In [8]:
# Cleaning the text of the questions and answers
# Contraction rewrites, applied in this order to the lower-cased text.
_CONTRACTION_RULES = (
    (r"i'm", "i am"),
    (r"he's", "he is"),
    (r"she's", "she is"),
    (r"that's", "that is"),
    (r"what's", "what is"),
    (r"where's", "where is"),
    (r"y'know", "you know"),
    (r"\'ll", " will"),
    (r"\'ve", " have"),
    (r"\'re", " are"),
    (r"\'d", " would"),
    (r"won't", "will not"),
    (r"can't", "cannot"),
)

# A literal three-dot ellipsis. The previous pattern r"\..." meant "a dot
# followed by ANY two characters" and therefore deleted real text.
_ELLIPSIS = r"\.\.\."

# The punctuation characters to strip, as a character class. The previous
# pattern listed them as one literal sequence (containing regex
# metacharacters), so it never removed anything.
_PUNCTUATION = r"[!@#$%^&*()_+\-=`~;',./:<>?]"

# Any single digit. The previous pattern only matched the literal string
# "0123456789".
_DIGITS = r"[0-9]"


def clean_text(text):
    """Normalise one utterance for vocabulary building.

    Steps, in order: lower-case the text, expand the contractions listed in
    ``_CONTRACTION_RULES``, delete literal ``...`` ellipses, delete the
    punctuation characters in ``_PUNCTUATION``, and delete digits.
    Whitespace is left untouched.

    Args:
        text: The raw utterance (str).

    Returns:
        The cleaned string.
    """
    text = text.lower()
    for pattern, replacement in _CONTRACTION_RULES:
        text = re.sub(pattern, replacement, text)
    text = re.sub(_ELLIPSIS, "", text)
    text = re.sub(_PUNCTUATION, "", text)
    text = re.sub(_DIGITS, "", text)
    return text

In [9]:
# Run every raw question and answer through clean_text, appending the results
# to clean_q / clean_a in the same order as the inputs.
# NOTE: both lists are appended to, so re-running this cell without resetting
# them adds a second copy of every entry.
for question in questions:
    clean_q.append(clean_text(question))
for answer in answers:
    clean_a.append(clean_text(answer))

In [10]:
# Count how often each whitespace-separated word occurs across the cleaned
# questions first and then the cleaned answers; totals accumulate in w2c.
for corpus in (clean_q, clean_a):
    for sentence in corpus:
        for word in sentence.split():
            w2c[word] = w2c.get(word, 0) + 1

In [11]:
# Give every word that occurs at least `threshold` times an integer id.
# Each dictionary gets its own id space starting at 0. The previous version
# kept incrementing one shared counter across both loops, so the answer ids
# started where the question ids ended (and kept growing on every re-run).
frequent_words = [word for word, count in w2c.items() if count >= threshold]
for word_id, word in enumerate(frequent_words):
    qw2int[word] = word_id
    aw2int[word] = word_id

# word_num now holds the number of words that passed the threshold.
word_num = len(frequent_words)

In [12]:
# Adding the special tokens to the two word -> id dictionaries.
# Each token gets the next id above the largest id already in that
# dictionary. The previous version used len(dict) + 1, which can land on an
# id that a word already owns whenever the existing ids do not start at 0;
# the inverse id -> word map then silently loses those words.
for vocab in (qw2int, aw2int):
    next_id = max(vocab.values(), default=-1) + 1
    for token in tokens:
        # Skip tokens that are already present so that re-running the cell
        # does not move them to new ids.
        if token not in vocab:
            vocab[token] = next_id
            next_id += 1

In [13]:
# Invert aw2int so that an integer id can be turned back into its word.
# If two words share an id, the one visited last wins.
ai2word = {}
for answer_word, answer_id in aw2int.items():
    ai2word[answer_id] = answer_word

In [14]:
# Adding the End Of String token to the end of every answer.
# The token is separated from the last word by a space: without it the text
# ends in e.g. "you<EOS>", which str.split() returns as a single unknown word,
# so <EOS> is never seen as a token of its own.
# Answers that already end in " <EOS>" are left alone, so re-running the cell
# does not append the token twice. Slice assignment keeps clean_a the same
# list object.
clean_a[:] = [
    text if text.endswith(' <EOS>') else text + ' <EOS>'
    for text in clean_a
]

In [15]:
# Translating all the questions and the answers into integers
# and replacing all the words that were filtered out by <OUT>
def _encode_sentence(sentence, vocab):
    """Map each whitespace-separated word of `sentence` to its id in `vocab`.

    Words that are not keys of `vocab` are mapped to the id of '<OUT>'.
    Raises KeyError if `vocab` has no '<OUT>' entry.
    """
    out_id = vocab['<OUT>']
    return [vocab.get(word, out_id) for word in sentence.split()]


# Encode the sentence currently being iterated over. The previous version
# looped over `q` / `a` but split the unrelated module-level names
# `question` / `answer`, so every row was an encoding of the same sentence.
q2int.extend(_encode_sentence(q, qw2int) for q in clean_q)
a2int.extend(_encode_sentence(a, aw2int) for a in clean_a)

In [16]:
# Order the encoded pairs by question length, shortest first.
# Only questions whose encoded length is between 1 and 25 are kept; pairs
# with the same length stay in their original relative order.
for length in range(1, 26):
    for position, encoded_question in enumerate(q2int):
        if len(encoded_question) != length:
            continue
        s_clean_q.append(encoded_question)
        s_clean_a.append(a2int[position])

# Seq2Seq Model
Functions that will define the architecture of the Seq2Seq Model

Here we are making the Tensorflow Placeholders, Variables, and Graphs needed for the Training

In [17]:
# Creating placeholders for the inputs and targets
def model_inputs():
    """Create the placeholders that feed the graph.

    Returns:
        A tuple ``(inputs, targets, lr, keep_prob)``:
        ``inputs`` and ``targets`` are int32 placeholders with two
        unspecified dimensions (named 'input' and 'target'); ``lr`` and
        ``keep_prob`` are float32 placeholders with no declared shape (named
        'learning_rate' and 'keep_prob').
    """
    two_dims = [None,None]

    # Sequence data
    inputs = tf.placeholder(tf.int32, two_dims, name='input')
    targets = tf.placeholder(tf.int32, two_dims, name='target')

    # Hyper parameters supplied at run time
    lr = tf.placeholder(tf.float32, name='learning_rate')
    keep_prob = tf.placeholder(tf.float32, name='keep_prob')

    return inputs, targets, lr, keep_prob

In [23]:
# Preprocessing the targets
def preprocess_targets(targets,w2c,batch_size):
    """Prepend the <SOS> id to each target row and drop each row's last column.

    Args:
        targets: Two-dimensional tensor of target ids.
        w2c: Mapping that contains a '<SOS>' key; its value fills the new
            first column.
        batch_size: Number of rows to take from `targets`.

    Returns:
        The `[batch_size, 1]` column of <SOS> ids concatenated, along axis 1,
        with `targets` minus its final column.
    """
    sos_column = tf.fill([batch_size,1], w2c['<SOS>'])

    # Rows 0..batch_size, every column except the last one.
    without_last = tf.strided_slice(targets, [0,0], [batch_size,-1], [1,1])

    return tf.concat([sos_column, without_last], 1)

In [27]:
# creating the Encoder RNN Layer
# Creating the LSTM Cell with Dropout
def encoder_rnn_layer(rnn_inputs, rnn_size, num_layers, keep_prob, seq_len):
    """Build a bidirectional multi-layer LSTM encoder and return its final state.

    Args:
        rnn_inputs: Input tensor passed to the RNN as `inputs`.
        rnn_size: Number of units in each LSTM cell.
        num_layers: Number of stacked LSTM layers per direction.
        keep_prob: Keep probability applied to each cell's input (dropout).
        seq_len: Per-example sequence lengths, passed as `sequence_length`.

    Returns:
        The state element returned by `tf.nn.bidirectional_dynamic_rnn`
        (its outputs are discarded).
    """
    def build_stack():
        # One fresh LSTM + dropout wrapper per layer. Repeating a single cell
        # object with `[cell] * num_layers` would reuse the same instance
        # (and its variables) for every layer.
        layers = [
            tf.contrib.rnn.DropoutWrapper(tf.contrib.rnn.BasicLSTMCell(rnn_size),
                                          input_keep_prob = keep_prob)
            for _ in range(num_layers)
        ]
        return tf.contrib.rnn.MultiRNNCell(layers)

    # Separate stacks for the forward and backward directions, and the
    # `seq_len` argument is what is forwarded (the previous version referred
    # to an undefined name, `sequence_length`).
    _, encoder_state = tf.nn.bidirectional_dynamic_rnn(cell_fw= build_stack(),
                                                       cell_bw= build_stack(),
                                                       sequence_length= seq_len,
                                                       inputs= rnn_inputs,
                                                       dtype=tf.float32)
    return encoder_state

#### Cell Below needs updating for Tensorflow 1.7 API 
[Stack Overflow Question for Above](https://stackoverflow.com/questions/50518926/ideas-for-upgrading-code-from-tensorflow-v1-0-12-to-1-7)

In [32]:
# Decoding the Training set
def decode_training_set(encoder_state, decoder_cell, decoder_embedded_input, sequence_length, decoding_scope, output_func, keep_prob, batch_size):
    """Run the attention decoder over the training inputs.

    Args:
        encoder_state: Encoder final state; its first element initialises
            the decoder.
        decoder_cell: RNN cell used for decoding; its `output_size` sizes the
            attention states and the attention units.
        decoder_embedded_input: Embedded decoder inputs.
        sequence_length: Sequence lengths passed to the dynamic decoder.
        decoding_scope: Variable scope passed to the dynamic decoder.
        output_func: Callable applied to the dropped-out decoder output.
        keep_prob: Keep probability for dropout on the decoder output.
        batch_size: Number of sequences per batch.

    Returns:
        `output_func` applied to the decoder output after dropout.

    NOTE(review): `prepare_attention`, `attention_decoder_fn_train` and
    `dynamic_rnn_decoder` are the legacy `tf.contrib.seq2seq` entry points;
    the notebook's own note says this cell still needs porting to the
    TensorFlow 1.7 API. This update only repairs the name errors below.
    """
    # All-zero attention states of shape [batch_size, 1, output_size].
    attention_states = tf.zeros([batch_size,1,decoder_cell.output_size])

    # Fixes: the tensor above used to be bound to `attn_state` but read as
    # `attention_states`; the keyword was misspelled `atention_option`; and
    # `decoder_cell.output` is now `decoder_cell.output_size`, the attribute
    # already used for the attention states above.
    attn_keys, attn_values, attn_score_func, attn_construct_func = tf.contrib.seq2seq.prepare_attention(
        attention_states,
        attention_option='bahdanau',
        num_units=decoder_cell.output_size)

    training_decoder_func = tf.contrib.seq2seq.attention_decoder_fn_train(encoder_state[0],
                                                                          attn_keys,
                                                                          attn_values,
                                                                          attn_score_func,
                                                                          attn_construct_func,
                                                                          name = "attn_dec_train")

    # Only the decoder outputs are used; the two state results are discarded.
    decoder_output, _, _ = tf.contrib.seq2seq.dynamic_rnn_decoder(decoder_cell,
                                                                  training_decoder_func,
                                                                  decoder_embedded_input,
                                                                  sequence_length,
                                                                  scope= decoding_scope)
    decoder_output_dropout = tf.nn.dropout(decoder_output,keep_prob)
    return output_func(decoder_output_dropout)