# Quora Question Pairs Extensions

### Objective: 

Determine if MIMICK model improve performance on classification task

### Approach

Create two models, each with an identical architecture but different embedding layer.  In the baseline we will use Google's word2vec embedding if it exists, and map everything else to a trainable <UNK> token.  In the second model, we will use Google's word2vec if it exists, the predicted embedding from MIMICK if the word fits our initial training criteria, and <UNK> otherwise.  
    
### Model

We build two separate models, each of which are identical.  They consist of an embedding layer, a LSTM layer w/ attention, and finally a second LSTM layer.  The final state output from both models is concatenated and passed through a fully connected layer with Softmax activation.  We use categorical-crossentropy as our loss function.

In [26]:
# Imports necessary for notebook

import pandas as pd
from string import ascii_lowercase, ascii_uppercase

import gensim
import numpy as np

from statistics import mean 

from keras import optimizers, regularizers
from keras.models import load_model, Model
from keras.utils import CustomObjectScope, to_categorical
from keras.initializers import glorot_uniform
from keras.layers import Dropout, Embedding, Dense, LSTM, Bidirectional, Input, concatenate
from keras.preprocessing import sequence 
from keras.preprocessing.text import Tokenizer

from keras_self_attention import SeqSelfAttention

# Set known dimension of word embeddings from word2vec
H=300

## Load pretrained word2vec embeddings and MIMICK model

In [2]:
# Use gensim to load pre-trained google word2vec embeddings
model = gensim.models.KeyedVectors.load_word2vec_format(
    '../word2vec_model/GoogleNews-vectors-negative300.bin', binary=True) 


# Load our previously trained embedding model; we use our complex model
lstm = load_model("../Saved_Models/mimic_model_complex_attention.h5"
                  , custom_objects=SeqSelfAttention.get_custom_objects())

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
Instructions for updating:
Use tf.cast instead.
Instructions for updating:
Deprecated in favor of operator or tf.math.divide.


## Generate character index lookup which was used in MIMICK

In [3]:
def create_char_dicts(non_letter_chars, lower_case=True, upper_case=True):
    """
    Create dictionary mapping characters to indices
    :param non_letter_chars: list of characters which should be supported other than letters
    :param lower_case: Should set of english lowercase letters be included; default True
    :param upper_case: Should set of english uppercase letters be included; default True
    """
    lower_case_letter_dict={}
    upper_case_letter_dict={}
    index_count = 0
    # Create a dictionary with upper and lower case letters and associated index
    # Note: We include underscores, hyphens, and apostrophes but ignore other characters
    # found in word2vec model, including chinese symbols, emojis, etc
    if lower_case:
        lower_case_letter_dict = {letter: int(index)+index_count for index, letter in enumerate(ascii_lowercase, start=1)}
        index_count += 26
    if upper_case:
        upper_case_letter_dict = {letter: int(index)+index_count for index, letter in enumerate(ascii_uppercase, start=1)} 
        index_count += 26
        
    chardict = {**lower_case_letter_dict, **upper_case_letter_dict}
    
    for char in non_letter_chars:
        chardict[char] = index_count
        index_count += 1

    # Creation of reverse character lookup for debugging and word creation
    reverse_chardict = {}
    for k,v in chardict.items():
        reverse_chardict[v] = k
    
    return chardict, reverse_chardict

supported_non_letter_characters = ['-','\'']
chardict, reverse_chardict = create_char_dicts(supported_non_letter_characters)


## Read in quora data, and perform preprocessing

In [4]:
# Read in data using Pandas, and fill NaNs with blank text
data = pd.read_csv("quora_train.csv")
data['question1'] = data['question1'].fillna('')
data['question2'] = data['question2'].fillna('')


In [12]:
def tokenize(line):
    """
    Helper function to take in string and return list
    of words with key characters removed
    """
    chars_to_remove=['?','!','_','(',')','[',']','..',':']
    sc = set(chars_to_remove)
    return_list = []
    words = line.split()
    for word in words:
        word = ''.join([c for c in word if c not in sc])
        word = word.replace("/"," ")
        word = word.replace("...", " ")
        words=word.split()
        for word in words:
            return_list.append(word)
    return return_list

In [51]:
# Continue preprocessing data by applying 'tokenize' function to all questions
q1 = np.array(data['question1'].apply(tokenize))
q2 = np.array(data['question2'].apply(tokenize))

# Define our target outcome and convert to categorical response
answers = np.array(data['is_duplicate'])
answers = to_categorical(answers)

# Create master list of all questions
question_pairs = q1+q2

In [14]:
# Create a word dictionary which will hold all the valid words we encounter

word_dict = {}

for question in question_pairs:
    try:
        for word in question:
            word_dict[word] = 1
    except:
        print(question)


In [15]:
# Clipping and padding functions in order to prepare a word for MIMICK prediction
def get_padding(sequence, max_padding = 25):
    sequence = sequence[:max_padding] if len(sequence)>max_padding else sequence
    result = np.zeros(max_padding)
    result[max_padding-len(sequence):]=sequence
    return result

def get_padded_sequence(token, max_padding=25):
    sequence = [chardict[c] if c in chardict else 0 for c in token]
    return np.reshape(get_padding(sequence), (1,-1))

## Create an embedding dictionary which has word and embedding.  
Note: One dictionary will generate MIMICK predictions if the word is not avialable

In [16]:
# Loop through all known words and store word2vec embedding if it exists.  If it does not, and the
# word meets the MIMICK training criteria then use the predicted embedding but only in emb_dict_model
emb_dict_model = {}
emb_dict_none = {}
use_counter = 0
words_needing_prediction = []

for k,v in word_dict.items():
    if model.vocab.get(k):
        emb_dict_model[k] = model[k]
        emb_dict_none[k] = model[k]
    else:
        if all(char in chardict.keys() for char in k) & (len(k)<=25):
            use_counter += 1
            words_needing_prediction.append(k)
            emb_dict_model[k] = lstm.predict(get_padded_sequence(k))
        
print("Made predictions for %s tokens" %use_counter)

Made predictions for 34005 tokens


# Tokenize

Create two tokenizers, one for our dictionary with MIMICK and one for baseline

In [17]:
# Create tokenizer based on our embedding dictionary, making sure we specify we want an OOV token
tok_model = Tokenizer(num_words=len(emb_dict_model), oov_token='<UNK>')
tok_model.fit_on_texts(emb_dict_model.keys())

# Convert our questions into word sequences
fq_seq_model = tok_model.texts_to_sequences(q1)
sq_seq_model = tok_model.texts_to_sequences(q2)

In [18]:
# Create tokenizer based on our embedding dictionary, making sure we specify we want an OOV token
tok = Tokenizer(num_words=len(emb_dict_none), oov_token='<UNK>')
tok.fit_on_texts(emb_dict_none.keys())

# Convert our questions into word sequences
fq_seq_none = tok.texts_to_sequences(q1)
sq_seq_none = tok.texts_to_sequences(q2)

In [24]:
# Pad sequences
max_question_length = 0

for question in q1:
    if len(question) > max_question_length:
        max_question_length=len(question)
        
for question in q2:
    if len(question) > max_question_length:
        max_question_length=len(question)
        
fq_seq_model = sequence.pad_sequences(fq_seq_model, maxlen=max_question_length)
sq_seq_model = sequence.pad_sequences(sq_seq_model, maxlen=max_question_length)
fq_seq_none = sequence.pad_sequences(fq_seq_none, maxlen=max_question_length)
sq_seq_none = sequence.pad_sequences(sq_seq_none, maxlen=max_question_length)

## Model Creation


In [31]:
# Create embedding layer which uses Google word2vec if known or Mimick if unknown
word_index_model = tok_model.word_index

embedding_matrix_model = np.zeros((len(word_index_model) + 1, H))
for word, i in word_index_model.items():
    embedding_vector = emb_dict_model.get(word)
    if embedding_vector is not None:
        embedding_matrix_model[i] = embedding_vector
        
embedding_layer_model = Embedding(len(word_index_model) + 1,
                            H,
                            weights=[embedding_matrix_model],
                            input_length=max_question_length,
                            trainable=True,
                            mask_zero=True)

#--------------------------------------------------------------------------------------------------------

# Create embedding layer which uses Google word2vec if known 
word_index = tok.word_index

embedding_matrix = np.zeros((len(word_index) + 1, H))
for word, i in word_index.items():
    embedding_vector = emb_dict_none.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector
        
embedding_layer = Embedding(len(word_index) + 1,
                            H,
                            weights=[embedding_matrix],
                            input_length=max_question_length,
                            trainable=True,
                            mask_zero=True)

In [122]:
def train_quora(embedding, path):
    # Build Q1 stacked w/attention LSTM

    model_1_input = Input(shape=(max_question_length,))
    embeddings_1 = embedding(model_1_input)
    model_1 = LSTM(50, return_sequences=True)(embeddings_1)
    model_1 = SeqSelfAttention(attention_activation='sigmoid',
                                                 kernel_regularizer=regularizers.l2(1e-5))(model_1)
    model_1 = LSTM(50, return_sequences=False)(model_1)


    # Build Q2 stacked w/attention LSTM
    model_2_input = Input(shape=(max_question_length,))
    embeddings_2 = embedding(model_2_input)
    model_2 = LSTM(50, return_sequences=True)(embeddings_2)
    model_2 = SeqSelfAttention(attention_activation='sigmoid',
                                                 kernel_regularizer=regularizers.l2(1e-5))(model_2)
    model_2 = LSTM(50, return_sequences=False)(model_2)

    # Merge output of two models
    joined = concatenate([model_1, model_2], axis = 1)

    # Predict probabilities for our binary classification
    outputs = Dense(2, activation='softmax')(joined)

    # Define model as set of two inputs, and final output
    final = Model(inputs=[model_1_input, model_2_input], output=outputs)

    # Use cross-entropy as loss as this is a classification task
    final.compile(loss='categorical_crossentropy', optimizer='adam')

    # View the model so we can bask in Keras' glory!
    final.summary()

    # Fit our model on input questions and known target
    final.fit([fq_seq_model, sq_seq_model], answers,
              epochs=1, batch_size=250)

    final.save(path)
    
    return

In [123]:
train_quora(embedding_layer_model,'quora_model_w_mimick.h5')



__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_95 (InputLayer)           (None, 237)          0                                            
__________________________________________________________________________________________________
input_96 (InputLayer)           (None, 237)          0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 237, 300)     24372000    input_95[0][0]                   
                                                                 input_96[0][0]                   
__________________________________________________________________________________________________
lstm_176 (LSTM)                 (None, 237, 50)      70200       embedding_1[90][0]               
__________

KeyboardInterrupt: 

In [None]:
train_quora(embedding_layer,'quora_model_no_mimick.h5')