In [2]:
import gensim
import multiprocessing
import os

# Create Cleaned Data from raw .tsv file

In [171]:
# File and folder names used throughout the notebook. If the cleaned-data folder or the
# labels file is missing, the data-preparation cell downloads and rebuilds everything.
raw_file_name = "quora_duplicate_questions.tsv"  # raw download is written here
q1_file_name = "cleaned_q1.txt"  # cleaned first questions, one per line
q2_file_name = "cleaned_q2.txt"  # cleaned second questions, one per line
dup_file_name = "is_duplicate.txt"  # duplicate labels, one per line
complete_data_dataframe = "complete_data_dataframe.csv"  # full cleaned dataframe (raw + cleaned text)
questions_folder_name = "cleaned_data"  # folder that holds the two cleaned question files

# file to store hyperparameter tuning numbers (new runs are appended to it)
parameters_and_errors_name = "parameters_and_errors.csv"

# number of question pairs to use in training doc2vec
# total number of pairs is currently 404288
num_question_pairs = 404288

In [172]:
import pandas as pd
import os
import string
import requests

# check whether downloaded data already exists with names provided above
if not os.path.isdir(questions_folder_name) or not os.path.isfile(dup_file_name):
    
    # Download questions file - data set has been changed since first release
    url = 'http://qim.ec.quoracdn.net/quora_duplicate_questions.tsv'
    r = requests.get(url)
    with open(raw_file_name, 'wb') as f:
        f.write(r.content)
    
    # DATA CLEANING WITH PANDAS
    
    # read in file into dataframe
    data = pd.read_csv(raw_file_name, sep='\t')
    # drop rows with null value
    data.dropna(inplace=True)
    # make columns of lower cased words
    data["cleaned_q1"] = data.text1.str.lower()
    data["cleaned_q2"] = data.text2.str.lower()
    # remove punctuation from lower-cased words columns
    data['cleaned_q1'] = data['cleaned_q1'].apply(lambda x:''.join([i for i in x if i not in string.punctuation]))
    data['cleaned_q2'] = data['cleaned_q2'].apply(lambda x:''.join([i for i in x if i not in string.punctuation]))
    # remove the character "\n", which messes up the line delimiters in txt file
    # these only occur ~20 times in the questions
    data["cleaned_q1"] = data['cleaned_q1'].str.replace("\n", "")
    data["cleaned_q2"] = data['cleaned_q2'].str.replace("\n", "")
    # shuffle data before writing to file - this way random sample can be taken from file 
    # simply by choosing first n rows of file
    data = data.sample(frac=1)

    # create directory to hold question data
    if not os.path.exists(questions_folder_name):
        os.makedirs(questions_folder_name)

    # write cleaned text rows to txt files, one line for each sentence
    data["cleaned_q1"].to_csv(questions_folder_name + "/" + q1_file_name, sep='\n', header=False, index=False)
    data["cleaned_q2"].to_csv(questions_folder_name + "/" + q2_file_name, sep='\n', header=False, index=False)
    # write dup values to txt file, one line for each value
    data["duplicate"].to_csv(dup_file_name, sep='\n', header=False, index=False)
    # write complete cleaning dataframe to text files as well - this can be reloaded 
    # to look at raw questions that get misclassified, etc.
    data.to_csv(complete_data_dataframe, sep=',', header=True, index=False)

    print "Saved file with", len(data), "rows at", raw_file_name
else: 
    print "Directory that should contain data already exists."

Directory that should contain data already exists.


# Load in data

## Input parameters

In [173]:
# notebook assumes you have a directory "questions_directory" that contains two text files, one for all
# first questions and one for all second questions, and a file is_dup_file that has one boolean value (0 or 1)
# on each line, one for all ~400,000 document pairs that tells whether questions are duplicates or not

# path to questions directory (contains sorted question text files) and
# is_dup_file (contains sorted answers about whether question pairs are duplicates)

# These files and directories are created automatically by the above cell
questions_directory = questions_folder_name
is_dup_file = dup_file_name
# os.listdir returns entries in arbitrary order, but later cells treat index 0 as the
# first-question file and index 1 as the second-question file, so sort the names.
questions_file_names = sorted(os.listdir(questions_directory))

In [174]:
# display the file names found in questions_directory
questions_file_names

['cleaned_q1.txt', 'cleaned_q2.txt']

## Load in data

In [175]:
# produce logs during training
import logging
# Configure the root logger so progress messages are shown at INFO level.
# The level is also set directly on the root logger, independently of basicConfig.
rootLogger = logging.getLogger()
logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s : %(levelname)s : %(message)s')
rootLogger.setLevel(logging.INFO)

In [176]:
# create iterator to run through directory of text files with one sentence per line
from itertools import izip, count

# class that iterates through first "rows" lines of questions list ("rows" is integer)
class LabeledLineSentence(object):
    """Iterate over every file in a directory, yielding one labelled document per line.

    Each line becomes a ``LabeledSentence`` whose ``words`` are the whitespace-separated
    tokens of the line and whose single tag is ``"<file name>_<zero-based line number>"``.

    dirname -- directory whose entries are each opened and read line by line
    rows    -- if not None, at most this many leading lines are read from each file
               (``rows=0`` therefore yields nothing)
    """
    def __init__(self, dirname, rows=None):
        self.dirname = dirname
        self.rows = rows

    def __iter__(self):
        for filename in os.listdir(self.dirname):
            # the with-block closes each file once it has been read
            with open(os.path.join(self.dirname, filename)) as text_file:
                for uid, text_line in enumerate(text_file):
                    if self.rows is not None and uid >= self.rows:
                        break
                    yield gensim.models.doc2vec.LabeledSentence(
                        words=text_line.split(),
                        tags=[os.path.basename(filename) + '_%s' % uid])

# refuse to continue unless gensim reports its fast doc2vec implementation is available
assert gensim.models.doc2vec.FAST_VERSION >= 0
# number of cores on computer to use for computations
cores = multiprocessing.cpu_count()

In [177]:
# Read every document into an in-memory list rather than streaming from disk; the
# combined data should only be 200-300 megabytes and a list is easy to shuffle.
sentences = LabeledLineSentence(questions_directory, rows=num_question_pairs)
all_docs = list(sentences)

print('%d question pairs to train (%d documents total)' % (num_question_pairs, len(all_docs)))

404288 question pairs to train (808576 documents total)


In [178]:
# make list of tuples (document1, document2, is_dup) for all num_question_pairs
# the document names come from naming scheme used in LabeledLineSentence class 

doc_names_and_duplicate_class = []
for i, line in enumerate(open(is_dup_file)):
    if i >= num_question_pairs:
        break
    doc_tup = (questions_file_names[0] + "_" + str(i), questions_file_names[1] + "_" + str(i), int(line.strip("\n")))
    doc_names_and_duplicate_class.append(doc_tup)

print len(doc_names_and_duplicate_class), "document pairs to classify"
print "Document pair names and labels contained in doc_names_and_duplicate_class"

404288 document pairs to classify
Document pair names and labels contained in doc_names_and_duplicate_class


In [179]:
# spot check: first (question-1 tag, question-2 tag, duplicate label) tuple
doc_names_and_duplicate_class[0]

('cleaned_q1.txt_0', 'cleaned_q2.txt_0', 0)

# Evaluate Model

In [15]:
import numpy as np
from sklearn.metrics import roc_auc_score

def calculate_AUC(model, doc_names_and_duplicate_class): 
    """ Return the area under the ROC curve obtained by scoring every document pair
        with the cosine similarity of its two document vectors.

        model -- object whose ``docvecs`` maps a document tag to its vector
        doc_names_and_duplicate_class -- sequence of (tag 1, tag 2, duplicate label)
    """
    # one similarity score per pair, looked up by document tag
    similarities = np.array([
        cosine_similarity(model.docvecs[pair[0]], model.docvecs[pair[1]])
        for pair in doc_names_and_duplicate_class
    ])
    labels = np.array([pair[2] for pair in doc_names_and_duplicate_class])

    return roc_auc_score(labels, similarities)

def cosine_similarity(vec1, vec2): 
    """Return the cosine of the angle between numpy vectors vec1 and vec2.

    Returns 0.0 when either vector has zero length: the angle is undefined there, and
    dividing by the zero norm would otherwise produce NaN.
    """
    norm1, norm2 = np.linalg.norm(vec1), np.linalg.norm(vec2)
    if norm1 == 0 or norm2 == 0:
        return 0.0
    # dot product of the two unit vectors
    return np.dot(vec1 / norm1, vec2 / norm2)

# for timing
from contextlib import contextmanager
from timeit import default_timer
import time 

@contextmanager
def elapsed_timer():
    """Context manager that yields a zero-argument function returning elapsed seconds.

    Inside the with-block the function returns the time since entry; once the block
    has finished normally it keeps returning the block's total duration.
    """
    started = default_timer()
    finished = []  # receives the stop time when the with-block completes

    def elapsed():
        stop = finished[0] if finished else default_timer()
        return stop - started

    yield elapsed
    finished.append(default_timer())

# Train model

In [None]:
# initialize Doc2vec model parameters
"""
The documents iterable can be simply a list of TaggedDocument elements, 
but for larger corpora, consider an iterable that streams the documents 
directly from disk/network.

If you don’t supply documents, the model is left uninitialized – 
use if you plan to initialize it in some other way.



dm defines the training algorithm. By default (dm=1), 
    ‘distributed memory’ (PV-DM) is used. Otherwise, distributed bag of words (PV-DBOW) is employed.

size is the dimensionality of the feature vectors.

window is the maximum distance between the predicted word and context words used for prediction within a document.

alpha is the initial learning rate (will linearly drop to zero as training progresses).

seed = for the random number generator. Note that for a fully deterministically-reproducible run, 
    you must also limit the model to a single worker thread, to eliminate ordering jitter from 
    OS thread scheduling. (In Python 3, reproducibility between interpreter launches also 
    requires use of the PYTHONHASHSEED environment variable to control hash randomization.)

min_count = ignore all words with total frequency lower than this.

max_vocab_size = limit RAM during vocabulary building; if there are more unique words 
                 than this, then prune the infrequent ones. Every 10 million word types 
                 need about 1GB of RAM. Set to None for no limit (default).

sample = threshold for configuring which higher-frequency words are randomly downsampled;
        default is 0 (off), useful value is 1e-5.
        
workers = use this many worker threads to train the model (=faster training with multicore machines).

iter = number of iterations (epochs) over the corpus. The default inherited from Word2Vec is 5, 
        but values of 10 or 20 are common in published ‘Paragraph Vector’ experiments.

hs = if 1 (default), hierarchical softmax will be used for model training (else set to 0).

negative = if > 0, negative sampling will be used, the int for negative specifies how many 
            “noise words” should be drawn (usually between 5-20).

dm_mean = if 0 (default), use the sum of the context word vectors. If 1, use the mean. 
            Only applies when dm is used in non-concatenative mode.

dm_concat = if 1, use concatenation of context vectors rather than sum/average; default is 0 (off). 
                Note concatenation results in a much-larger model, as the input is no longer the size of 
                one (sampled or arithmetically combined) word vector, but the size of the tag(s) and all 
                words in the context strung together.

dm_tag_count = expected constant number of document tags per document, when using dm_concat mode; default is 1.

dbow_words if set to 1 trains word-vectors (in skip-gram fashion) simultaneous with DBOW doc-vector training; 
            default is 0 (faster training of doc-vectors only).

"""


# Doc2Vec keyword arguments for a single configuration

parameters_dict = dict(
    documents=all_docs,
    dm=0,  # use bag-of-words (dbow) model; 1 uses embedding (dmpv) model
    size=200,  # size of word/doc vectors
    window=15,  # max distance between word and neighbor word for word embeddings
    alpha=.025,  # learning rate - use rate in paper
    min_alpha=0.0001,  # rate from paper
    min_count=5,  # ignore words with count less than this
    sample=1e-5,  # how to configure downsampling for high frequency words
    workers=cores,  # number of cores to use
    hs=0,  # use negative sampling
    negative=5,  # used in negative sampling
    dbow_words=1,  # trains word vectors in addition to document vectors in dbow model
    iter=3,  # recommended number of epochs is ~20 for dbow model on question comparison
)

## Create parameters for grid search

In [None]:
# Candidate values for each Doc2Vec parameter. The grid search trains one model for
# every combination, so each extra value in a list multiplies the number of runs.
dms = [0]  # training algorithm selector (Doc2Vec `dm`)
sizes = [300]  # vector dimensionality
windows = [5, 15]  # context window
alphas = [0.025]  # initial learning rate
min_alphas = [0.0001]  # final learning rate
min_counts = [1, 5]  # minimum word frequency
samples = [1e-5, 5e-5, 1e-4]  # downsampling threshold for frequent words
workers_s = [cores]  # worker threads
hs_s = [0]  # Doc2Vec `hs` flag
negatives = [5]  # number of negative samples
dbow_words_s = [1]  # Doc2Vec `dbow_words` flag
iters = [150]  # training epochs

In [None]:
# run through all parameters and record error rate of each one
from itertools import product

# collects one (parameters, AUC, time string) entry per completed run
params_and_errors = []
# every combination of the candidate values, one tuple per run; the argument order
# fixes the position of each parameter inside the tuple
params_product = product(
    dms, sizes, windows, alphas, min_alphas, min_counts,
    samples, workers_s, hs_s, negatives, dbow_words_s, iters,
)
parameters = list(params_product)

In [None]:
print "Starting first run of", len(parameters), "runs"
total_time = 0
for run_number, pars in enumerate(parameters): 
    params = {'dm':pars[0], 'size':pars[1], 'window':pars[2], 
              'alpha':pars[3], 'min_alpha':pars[4], 'min_count':pars[5],
              'sample':pars[6], 'workers':pars[7], 'hs':pars[8],
              'negative':pars[9], 'dbow_words':pars[10], 'iter':pars[11]}
    with elapsed_timer() as elapsed:
        model = gensim.models.doc2vec.Doc2Vec(documents=all_docs, **params)
        AUC_value = calculate_AUC(model, doc_names_and_duplicate_class)
        duration = '%.1f' % elapsed()
        # save time to complete computation
        m, s = divmod(float(duration), 60)
        h, m = divmod(m, 60)
        time_string = "%dh %02dm %02ds" % (h, m, s)
        params_and_errors.append((params, AUC_value, time_string))
        total_time += float(duration)
        print 
        print "Completed run number", run_number + 1, "of", len(parameters), "runs total"
        print "AUC score:", round(AUC_value, 4)
        print "Training for this run took", round(float(duration)/60.,1), "minutes"

best_AUC = max([x[1] for x in params_and_errors])
print
print
print "Total training time for all runs:", round(float(total_time)/3600.,2), "hours"
print "Best AUC value:", round(best_AUC, 6)
print "Paramters for best AUC value:", [x[0] for x in params_and_errors if x[1] == best_AUC][0]

# convert params and errors into easy-to-read pandas dataframe
params_df = pd.DataFrame([x[0] for x in params_and_errors])
params_df["AUC"] = pd.Series([x[1] for x in params_and_errors])
params_df["num_doc_pairs"] = pd.Series([len(all_docs) for _ in range(len(params_and_errors))])
params_df["compute_time"] = pd.Series([x[2] for x in params_and_errors])
params_df.sort_values("AUC", ascending=False, inplace=True)

# write parameter values to csv - append if this csv already exists
header=True
if os.path.isfile(parameters_and_errors_name):
    header=False
params_df.to_csv(parameters_and_errors_name, header=header, index=False, mode="a")

In [None]:
# grid-search results for this session, best AUC first
params_df

In [None]:
# show only the columns that changed in value
# (a column is kept when at least one row differs from the first row; .iloc replaces
# the deprecated .ix indexer)
params_df.loc[:, (params_df != params_df.iloc[0]).any()]

In [None]:
""" PARAMETER TIPS: 

sample: CRITICAL: 1e-6 terrible, 5e-6 and 1e-5 better - try 5e-5 
min_count: 1 works well with small data set
window: 5 and 15 both worked reasonably well
size: 300 works well
iter: 150 seems ok, try more - sentence similarity in paper used ~400 to converge

"""



# Look at previously computed runs and their AUC scores

All runs (parameters and scores) are stored externally in the CSV file named by `parameters_and_errors_name`.

In [12]:
# Columns that identify a run. AUC is deliberately left out because it might vary
# slightly between otherwise identical runs due to randomness.
dup_columns = [u'alpha', u'dbow_words', u'dm', u'hs', u'iter', u'min_alpha',
       u'min_count', u'negative', u'sample', u'size', u'window', u'workers',
       u'num_doc_pairs']
# read in parameter file to see what has been done before: sort best AUC first, then
# keep only the first (best) row of each distinct configuration
all_params_df = (
    pd.read_csv(parameters_and_errors_name)
    .sort_values("AUC", ascending=False)
    .drop_duplicates(subset=dup_columns)
)

In [13]:
# previously recorded runs, best AUC first, one row per distinct configuration
all_params_df

Unnamed: 0,alpha,dbow_words,dm,hs,iter,min_alpha,min_count,negative,sample,size,window,workers,AUC,num_doc_pairs,compute_time
0,0.025,1,0,0,150,0.0001,1,5,5e-05,300,5,4,0.659777,200,0h 00m 03s
1,0.025,1,0,0,150,0.0001,1,5,0.0001,300,5,4,0.648615,200,0h 00m 05s
2,0.025,1,0,0,150,0.0001,1,5,5e-05,300,15,4,0.640967,200,0h 00m 03s
3,0.025,1,0,0,150,0.0001,1,5,0.0001,300,15,4,0.618437,200,0h 00m 03s
4,0.025,1,0,0,150,0.0001,5,5,1e-05,300,5,4,0.526251,200,0h 00m 02s
5,0.025,1,0,0,150,0.0001,5,5,1e-05,300,15,4,0.517156,200,0h 00m 02s
6,0.025,1,0,0,150,0.0001,5,5,5e-05,300,15,4,0.407606,200,0h 00m 02s
7,0.025,1,0,0,150,0.0001,5,5,0.0001,300,5,4,0.399339,200,0h 00m 02s
8,0.025,1,0,0,150,0.0001,5,5,0.0001,300,15,4,0.399339,200,0h 00m 03s
9,0.025,1,0,0,150,0.0001,1,5,1e-05,300,15,4,0.399339,200,0h 00m 03s


In [None]:
# 0.672 with 100-length vectors and 20 epochs
# 0.671 with 300-length vectors and 20 epochs
# 0.74 with 200-length vectors and 200 epochs
# 0.74 with 200-length vectors and 100 epochs

# model run with all documents got 0.75 with 300 epochs, 300-size vectors, window=5, 
# min_count=1 and samples=5e-5

# model seems to tail off in accuracy around 100-150 epochs

# Choose specific model to maximize accuracy 

The model can simply be a threshold choice (given the cosine similarities of the document vectors) or a logistic regression classifier. The two shouldn't differ much, but the logistic regression classifier might be faster; this needs to be tested against the threshold-choosing code below.

In [None]:
# function to find best accuracy threshold given the cosine similarities between document vectors
# function to use is aggregate function report_accuracy_prec_recall_F1

# also includes function to get document predictions and document scores given some threshold

import numpy as np
from sklearn.metrics import precision_recall_fscore_support

def max_accuracy(y_target, y_pred, thresh_number=5000):
    """Find the decision threshold on y_pred that gives the highest accuracy.

    thresh_number evenly spaced thresholds between min(y_pred) and max(y_pred) are
    tried; a score below the threshold is predicted as 0, anything else as 1. When
    several thresholds tie, the smallest one is kept.

    y_target      -- sequence of true 0/1 labels
    y_pred        -- sequence of scores, same length as y_target
    thresh_number -- number of candidate thresholds to evaluate

    Prints the best accuracy and returns
    (max_accuracy rounded to 4 places, max_accuracy_threshold, max_accuracy_predictions).
    """
    targets = np.asarray(y_target)
    scores = np.asarray(y_pred)
    assert len(targets) == len(scores)
    n_samples = float(len(targets))

    thresholds = np.linspace(scores.min(), scores.max(), thresh_number)
    best_thresh, best_acc = 0, 0
    best_preds = y_pred
    for thresh in thresholds: 
        # vectorised prediction and accuracy: one numpy pass per threshold instead of
        # two Python-level loops over every sample
        y_pred_vals = np.where(scores < thresh, 0, 1)
        acc = np.count_nonzero(targets == y_pred_vals) / n_samples
        if acc > best_acc:
            best_thresh, best_acc = thresh, acc
            best_preds = y_pred_vals
    print("Best accuracy: %s" % round(best_acc, 4))
    return (round(best_acc,4), best_thresh, best_preds)
    
def get_accuracy(y_target, y_pred_vals): 
    """Return the fraction of positions at which y_target and y_pred_vals are equal.

    Both arguments must have the same length and support integer indexing.
    """
    total = len(y_target)
    assert total == len(y_pred_vals)
    matches = sum(1 for i in range(total) if y_target[i] == y_pred_vals[i])
    return float(matches)/float(total)

from sklearn.metrics import precision_recall_fscore_support

def report_accuracy_prec_recall_F1(y_target, y_pred):
    (best_acc, best_thresh, best_preds) = max_accuracy(y_target, y_pred)
    (precision, recall, F1, support) = precision_recall_fscore_support(y_target, best_preds, average='binary')
    print "Precision:", precision
    print "Recall:", recall
    print "F1-score:", round(F1, 4)    

In [None]:
# create model with best parameters found so far
best_params = {'dm':0, 'size':300, 'window':5, 
              'alpha':.025, 'min_alpha':.0001, 'min_count':1,
              'sample':5e-5, 'workers':cores, 'hs':0,
              'negative':5, 'dbow_words':1, 'iter':100}
with elapsed_timer() as elapsed:
    model = gensim.models.doc2vec.Doc2Vec(documents=all_docs, **best_params)
    AUC_value = calculate_AUC(model, doc_names_and_duplicate_class)
    duration = '%.1f' % elapsed()
    print "AUC score:", round(AUC_value, 4)
    print "Training for this run took", round(float(duration)/60.,1), "minutes"  

In [None]:
# get document distances (y_pred) and actual scores (y_target)

def get_model_distances_and_scores(model, doc_names_and_duplicate_class): 
    """ Return (y_target, y_pred) for model and given documents.

    y_target is the array of duplicate labels; y_pred is the array of cosine
    similarities between each pair's document vectors (a number between -1 and 1).
    """
    # one similarity per pair, looked up by document tag
    similarities = np.array([
        cosine_similarity(model.docvecs[pair[0]], model.docvecs[pair[1]])
        for pair in doc_names_and_duplicate_class
    ])
    labels = np.array([pair[2] for pair in doc_names_and_duplicate_class])

    return (labels, similarities)

In [None]:
# get y_target and y_pred for model
# y_target: duplicate labels; y_pred: cosine similarity of each pair's document vectors
y_target, y_pred = get_model_distances_and_scores(model, doc_names_and_duplicate_class)

In [None]:
# get max accuracy for model (i.e., choose decision threshold for max accuracy)
# takes about 15 minutes to calculate for 50,000 document pairs
# prints accuracy, precision, recall and F1; the call itself returns nothing
report_accuracy_prec_recall_F1(y_target, y_pred)

# Manual runs with accuracy check and shuffle at each epoch

In [None]:
# try a run with manual epochs and recording accuracy after each epoch
# The model is created without documents and with iter=1 so that each train() call in
# the next cell performs a single pass. The remaining settings are taken from
# best_params; the bare names used before (dm, size, window, ...) were never defined
# and raised NameError on a fresh kernel.
manual_params = dict(best_params, iter=1)
model = gensim.models.doc2vec.Doc2Vec(documents=None, **manual_params)

In [None]:
from random import shuffle

# number of epochs to train
# need to manually decrease alpha in this case
epochs = 100
alpha = 0.025
min_alpha = 0.0001
# amount by which the learning rate is lowered after every epoch
alpha_delta = (alpha - min_alpha) / epochs

model.build_vocab(all_docs)
with elapsed_timer() as elapsed:
    for epoch in range(epochs): 
        # hold the learning rate fixed within this pass
        model.alpha, model.min_alpha = alpha, alpha
        # shuffle documents (in place; documents are looked up by tag, not by position)
        shuffle(all_docs)
        # train model
        model.train(all_docs)
        # evaluate model (the value is an AUC score despite the variable name)
        error = calculate_AUC(model, doc_names_and_duplicate_class)
        alpha -= alpha_delta
        print "AUC for epoch", epoch, ":", error
    # keep the duration numeric so it can be converted to minutes
    duration = elapsed()
    print("completed training in %.1f minutes" % (duration/60.))

In [None]:
seconds = 9000
m, s = divmod(seconds, 60)
h, m = divmod(m, 60)
print "%dh %02dm %02ds" % (h, m, s)

In [None]:
# same formatting as above, stored in a variable (uses h, m, s from the previous cell)
time_string = "%dh %02dm %02ds" % (h, m, s)

In [None]:
# display the formatted duration
time_string

# Use Keras to train model based on document vectors

### Train doc2vec model

In [None]:
# create model with best parameters found so far
best_params = {'dm':0, 'size':300, 'window':5, 
              'alpha':.025, 'min_alpha':.0001, 'min_count':1,
              'sample':5e-5, 'workers':cores, 'hs':0,
              'negative':5, 'dbow_words':1, 'iter':100}
with elapsed_timer() as elapsed:
    model = gensim.models.doc2vec.Doc2Vec(documents=all_docs, **best_params)
    AUC_value = calculate_AUC(model, doc_names_and_duplicate_class)
    duration = '%.1f' % elapsed()
    print "AUC score:", round(AUC_value, 4)
    print "Training for this run took", round(float(duration)/60.,1), "minutes"  

First need to output document vectors into data frame. Pandas can store a complete numpy array as an element in a dataframe.

In [147]:
# load in dataframe containing data - need only as many question pairs as used in training
# this dataframe can be useful for generating further features to be used for training
# (the csv is written by the data-preparation cell after shuffling, so reading the first
# num_question_pairs rows mirrors how the question and label files are truncated)
current_data = pd.read_csv(complete_data_dataframe, nrows=num_question_pairs)

In [148]:
# inspect the loaded rows (raw and cleaned question text plus duplicate label)
current_data

Unnamed: 0,id,qid1,qid2,text1,text2,duplicate,cleaned_q1,cleaned_q2
0,158184,247173,247174,When will Kancolle season 2 be on Crunchyroll?,How do I download tv series 'the flash' season 2?,0,when will kancolle season 2 be on crunchyroll,how do i download tv series the flash season 2
1,82924,140523,140524,Do you want to be a celebrity?,Would you want to be a celebrity in the future?,1,do you want to be a celebrity,would you want to be a celebrity in the future
2,191299,12544,93146,How will Trump's presidency affect prospective...,Will Trump's win affect the matriculation of s...,1,how will trumps presidency affect prospective ...,will trumps win affect the matriculation of st...
3,239666,351295,11879,What are the reasons why land breeze occurs?,When does land breeze occur?,1,what are the reasons why land breeze occurs,when does land breeze occur
4,180192,276312,276313,What is mean by proportional circuits in hydra...,Has comedy on social media gone too far?,0,what is mean by proportional circuits in hydra...,has comedy on social media gone too far
5,295314,34248,176816,What are the top sex movies?,What is best sex movie?,1,what are the top sex movies,what is best sex movie
6,197989,299078,299079,"Guys..i am interested in cars, bikes and even ...","I have been working for Mu Sigma, Bangalore as...",0,guysi am interested in cars bikes and even in ...,i have been working for mu sigma bangalore as ...
7,196055,84493,296672,Why does my penis shrinks after masturbation?,Why and how does a penis shrink?,0,why does my penis shrinks after masturbation,why and how does a penis shrink
8,254302,368998,368999,What are the best practices for Hadoop benchma...,What are the best practices for Hadoop from Ho...,0,what are the best practices for hadoop benchma...,what are the best practices for hadoop from ho...
9,276775,395761,395762,What is it like to have a stepsibling of a dif...,"When small air bubbles enter an IV line, how d...",0,what is it like to have a stepsibling of a dif...,when small air bubbles enter an iv line how do...


### Extract document vectors from model

In [149]:
# add numpy doc2vec representation for each question (2 questions per row)
# the documents in doc_names_and_duplicate_class are in same order as documents in current_data dataframe
# (and current_data is ready from complete_data_dataframe csv file)
q1_vecs = []
q2_vecs = []
is_dup = []

for i in range(len(doc_names_and_duplicate_class)):
    # get word vectors for each question and add to current_data dataframe
    vec1_name = doc_names_and_duplicate_class[i][0]
    vec2_name = doc_names_and_duplicate_class[i][1]
    vec1 = model.docvecs[vec1_name]         
    vec2 = model.docvecs[vec2_name] 
    q1_vecs.append(vec1)
    q2_vecs.append(vec2)
    is_dup.append(doc_names_and_duplicate_class[i][2])

# sanity check that ensures document vectors match their text in dataframe - compare the duplicate 
# tags between current_data and doc_names_and_duplicate_class

for i in range(len(doc_names_and_duplicate_class)):
    assert doc_names_and_duplicate_class[i][2] == current_data["duplicate"][i], "Error in aligning document \
                                                                              vectors with their text at index %d" % i

current_data["q1_vecs"] = pd.Series(q1_vecs)
current_data["q2_vecs"] = pd.Series(q2_vecs)
print "Document vectors added to current_data dataframe"

Document vectors added to current_data dataframe


In [150]:
# inspect the dataframe again, now with the q1_vecs / q2_vecs columns added
current_data

Unnamed: 0,id,qid1,qid2,text1,text2,duplicate,cleaned_q1,cleaned_q2,q1_vecs,q2_vecs
0,158184,247173,247174,When will Kancolle season 2 be on Crunchyroll?,How do I download tv series 'the flash' season 2?,0,when will kancolle season 2 be on crunchyroll,how do i download tv series the flash season 2,"[0.113089, -0.0236182, -0.146961, 0.173789, -0...","[0.0640692, 0.116034, -0.294704, 0.0784339, -0..."
1,82924,140523,140524,Do you want to be a celebrity?,Would you want to be a celebrity in the future?,1,do you want to be a celebrity,would you want to be a celebrity in the future,"[-0.0378412, 0.0948879, -0.102228, -0.134602, ...","[0.0799231, 0.0595522, 0.0171781, -0.199275, 0..."
2,191299,12544,93146,How will Trump's presidency affect prospective...,Will Trump's win affect the matriculation of s...,1,how will trumps presidency affect prospective ...,will trumps win affect the matriculation of st...,"[-0.0373748, 0.195244, -0.15544, -0.175518, 0....","[-0.20581, 0.191862, -0.0444463, 0.0238138, 0...."
3,239666,351295,11879,What are the reasons why land breeze occurs?,When does land breeze occur?,1,what are the reasons why land breeze occurs,when does land breeze occur,"[-0.0997776, 0.145314, 0.0853042, -0.131476, 0...","[0.0257296, 0.020773, -0.0240053, -0.120982, 0..."
4,180192,276312,276313,What is mean by proportional circuits in hydra...,Has comedy on social media gone too far?,0,what is mean by proportional circuits in hydra...,has comedy on social media gone too far,"[0.16521, 0.0237708, -0.0441134, -0.114565, -0...","[0.146409, 0.213066, -0.152847, -0.0077061, 0...."
5,295314,34248,176816,What are the top sex movies?,What is best sex movie?,1,what are the top sex movies,what is best sex movie,"[-0.0177868, 0.083759, -0.0749799, -0.0359831,...","[-0.02737, 0.117701, -0.11589, -0.0408971, 0.0..."
6,197989,299078,299079,"Guys..i am interested in cars, bikes and even ...","I have been working for Mu Sigma, Bangalore as...",0,guysi am interested in cars bikes and even in ...,i have been working for mu sigma bangalore as ...,"[0.12482, -0.187078, -0.205443, -0.118652, -0....","[0.256855, 0.520362, -0.253016, -0.330339, 0.2..."
7,196055,84493,296672,Why does my penis shrinks after masturbation?,Why and how does a penis shrink?,0,why does my penis shrinks after masturbation,why and how does a penis shrink,"[0.0803748, -0.0175071, -0.11632, -0.0825567, ...","[0.0447627, 0.0486946, -0.13276, -0.196069, 0...."
8,254302,368998,368999,What are the best practices for Hadoop benchma...,What are the best practices for Hadoop from Ho...,0,what are the best practices for hadoop benchma...,what are the best practices for hadoop from ho...,"[0.23143, -0.0617862, -0.079843, -0.1213, 0.14...","[0.226094, -0.0688689, -0.0805669, -0.141731, ..."
9,276775,395761,395762,What is it like to have a stepsibling of a dif...,"When small air bubbles enter an IV line, how d...",0,what is it like to have a stepsibling of a dif...,when small air bubbles enter an iv line how do...,"[0.142606, 0.0467626, -0.0903129, -0.11932, 0....","[0.192009, -0.241993, -0.0433549, -0.276916, 0..."


In [151]:
# Build the network inputs and targets from the lists collected above:
#   X_1 / X_2 -- numpy arrays of the question-1 / question-2 document vectors
#   y         -- duplicate labels converted to categorical (one column per class) form
from keras.utils import np_utils

X_1, X_2 = np.array(q1_vecs), np.array(q2_vecs)

num_classes = 2
labels_array = np.array(is_dup)
y = np_utils.to_categorical(labels_array, num_classes)

### Make train/test split

In [152]:
# MAKE TRAIN/TEST SPLIT
# the rows are already in random order, so the first (1 - test_portion) of them form
# the training set and the remainder the test set - no further shuffle needed
test_portion = 0.2
split_index = int((1-test_portion)*len(X_1))
train_rows, test_rows = slice(0, split_index), slice(split_index, None)

X_1_train, X_1_test = X_1[train_rows], X_1[test_rows]
X_2_train, X_2_test = X_2[train_rows], X_2[test_rows]
y_train, y_test = y[train_rows], y[test_rows]

### Design neural network

In [153]:
from keras.models import Sequential
from keras.layers import Merge, Dense, Activation, Dropout

In [158]:
def _question_branch():
    # sub-network applied to one question's 300-dimensional document vector
    branch = Sequential()
    branch.add(Dense(1000, input_shape=(300,), activation='relu'))
    branch.add(Dropout(0.2))
    return branch

# one branch per question; their outputs are concatenated before the shared layers
q1_branch = _question_branch()
q2_branch = _question_branch()

merged = Merge([q1_branch, q2_branch], mode='concat')

# shared layers on top of the concatenated branches, ending in a 2-class softmax
final_model = Sequential()
for layer in (merged,
              Dense(1000, activation='relu'),
              Dropout(0.2),
              Dense(2, activation='softmax')):
    final_model.add(layer)

In [159]:
# compile model: categorical cross-entropy loss, RMSprop optimizer, accuracy reported
final_model.compile(loss='categorical_crossentropy', optimizer='rmsprop', metrics=['accuracy'])

In [160]:
# train model - we pass one data array per model input, in the order the branches
# were merged; the held-out split is used as validation data
final_model.fit([X_1_train, X_2_train], y_train,
                validation_data=([X_1_test, X_2_test], y_test),
                batch_size=100, nb_epoch=10, verbose=1)

Train on 40000 samples, validate on 10000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x134f6a290>

In [161]:
# evaluate model
from sklearn.metrics import precision_recall_fscore_support, accuracy_score

prediction_probs = final_model.predict([X_1_test, X_2_test])
# predictions is list of definite [0,1] predictions as extracted from predicted probabilities
predictions = [np.array([1,0]) if x[0]>x[1] else np.array([0,1]) for x in prediction_probs]

# convert predictions and y_test back into regular list of 0s and 1s for sklearn functions
predictions_labels, y_test_labels = [], []
for i in range(len(y_test)):
    if predictions[i][1] == 1: 
        predictions_labels.append(1)
    else: 
        predictions_labels.append(0)
for i in range(len(y_test)):
    if y_test[i][1] == 1: 
        y_test_labels.append(1)
    else: 
        y_test_labels.append(0)

prfs = precision_recall_fscore_support(y_test_labels, predictions_labels, average='binary')
acc = accuracy_score(y_test_labels, predictions_labels)
print "Accuracy:", acc
print "Precision:", round(prfs[0], 4)
print "Recall:", round(prfs[1], 4)
print "F1 Score:", round(prfs[2], 4)

Accuracy: 0.7527
Precision: 0.6884
Recall: 0.5882
F1 Score: 0.6343
