In [1]:
import gensim
import multiprocessing
import os

# Create Cleaned Data from raw .tsv file

In [2]:
# files names - if these files and folders don't exist then they are downloaded 
raw_file_name = "quora_duplicate_questions.tsv"
q1_file_name = "cleaned_q1.txt"
q2_file_name = "cleaned_q2.txt"
dup_file_name = "is_duplicate.txt"
questions_folder_name = "cleaned_data"

# file to store hyperparameter tuning numbers
parameters_and_errors_name = "parameters_and_errors.csv"# "parameters_and_errors.csv"

In [3]:
import pandas as pd
import os
import string
import requests

# check whether downloaded data already exists with names provided above
if not os.path.isdir(questions_folder_name) or not os.path.isfile(dup_file_name):
    
    # Download questions file - data set has been changed since first release
    url = 'http://qim.ec.quoracdn.net/quora_duplicate_questions.tsv'
    r = requests.get(url)
    with open(raw_file_name, 'wb') as f:
        f.write(r.content)
    
    # DATA CLEANING WITH PANDAS
    
    # read in file into dataframe
    data = pd.read_csv(raw_file_name, sep='\t')
    # drop rows with null value
    data.dropna(inplace=True)
    # make columns of lower cased words
    data["cleaned_q1"] = data.text1.str.lower()
    data["cleaned_q2"] = data.text2.str.lower()
    # remove punctuation from lower-cased words columns
    data['cleaned_q1'] = data['cleaned_q1'].apply(lambda x:''.join([i for i in x if i not in string.punctuation]))
    data['cleaned_q2'] = data['cleaned_q2'].apply(lambda x:''.join([i for i in x if i not in string.punctuation]))
    # remove the character "\n", which messes up the line delimiters in txt file
    # these only occur ~20 times in the questions
    data["cleaned_q1"] = data['cleaned_q1'].str.replace("\n", "")
    data["cleaned_q2"] = data['cleaned_q2'].str.replace("\n", "")
    # shuffle data before writing to file - this way random sample can be taken from file 
    # simply by choosing first n rows of file
    data = data.sample(frac=1)

    # create directory to hold question data
    if not os.path.exists(questions_folder_name):
        os.makedirs(questions_folder_name)

    # write cleaned text rows to txt files, one line for each sentence
    data["cleaned_q1"].to_csv(questions_folder_name + "/" + q1_file_name, sep='\n', header=False, index=False)
    data["cleaned_q2"].to_csv(questions_folder_name + "/" + q2_file_name, sep='\n', header=False, index=False)
    # write dup values to txt file, one line for each value
    data["duplicate"].to_csv(dup_file_name, sep='\n', header=False, index=False)

    print "Saved file with", len(data), "rows at", raw_file_name
else: 
    print "Directory that should contain data already exists."

Saved file with 404288 rows at quora_duplicate_questions.tsv


# Load in data

## Input parameters

In [4]:
# notebook assumes you have a directory "questions_directory" that contains two text files, one for all 
# first questions and one for all second questions, and a file is_dup_file that has one boolean value (0 or 1)
# on each line, one for all ~400,000 document pairs that tells whether questions are duplicates or not

# path to questions directory (contains sorted question text files) and 
# is_dup_file (contains sorted answers about whether question pairs are duplicates)

# These files and directories are created automatically by the above cell
questions_directory = questions_folder_name
is_dup_file = dup_file_name
questions_file_names = [os.path.basename(filename) for filename in os.listdir(questions_directory)]

In [5]:
questions_file_names

['cleaned_q1.txt', 'cleaned_q2.txt']

## Load in data

In [20]:
# set number of rows (question pairs) to train on
num_question_pairs = 100

In [21]:
# produce logs during training
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
rootLogger = logging.getLogger()
rootLogger.setLevel(logging.INFO)

In [22]:
# create iterator to run through directory of text files with one sentence per line
from itertools import izip, count

# class that iterates through first "rows" lines of questions list ("rows" is integer)
class LabeledLineSentence(object):
    def __init__(self, dirname, rows=None):
        self.dirname = dirname
        self.rows = rows
    def __iter__(self):
        for filename in os.listdir(self.dirname):
            for uid, text_line in enumerate(open(os.path.join(self.dirname, filename))):
                if self.rows:
                    if uid >= self.rows: 
                        break
            # for uid, line in enumerate(open(os.path.join(self.dirname, filename))):
                yield gensim.models.doc2vec.LabeledSentence(words=text_line.split(), tags=[os.path.basename(filename) + '_%s' % uid])

# make sure using fast version
assert gensim.models.doc2vec.FAST_VERSION > -1
cores = multiprocessing.cpu_count() # number of cores on computer to use for computations

In [23]:
# load in data into memory - all data combined should only be 200-300 megabytes
# this is done instead of using iterator - makes doing shuffles of data easier

all_docs = []
sentences = LabeledLineSentence(questions_directory, rows=num_question_pairs)
for sentence in sentences:
    all_docs.append(sentence)

print('%d question pairs to train (%d documents total)' % (num_question_pairs, len(all_docs)))

100 question pairs to train (200 documents total)


In [24]:
# make list of tuples (document1, document2, is_dup) for all num_question_pairs
# the document names come from naming scheme used in LabeledLineSentence class 

doc_names_and_duplicate_class = []
for i, line in enumerate(open(is_dup_file)):
    if i >= num_question_pairs:
        break
    doc_tup = (questions_file_names[0] + "_" + str(i), questions_file_names[1] + "_" + str(i), int(line.strip("\n")))
    doc_names_and_duplicate_class.append(doc_tup)

print len(doc_names_and_duplicate_class), "document pairs to classify"
print "Document pair names and labels contained in doc_names_and_duplicate_class"

100 document pairs to classify
Document pair names and labels contained in doc_names_and_duplicate_class


In [25]:
doc_names_and_duplicate_class[0]

('cleaned_q1.txt_0', 'cleaned_q2.txt_0', 0)

# Evaluate Model

In [16]:
import numpy as np
from sklearn.metrics import roc_auc_score

def calculate_AUC(model, doc_names_and_duplicate_class): 
    """ Return area under ROC curve for model. This is done by simply taking cosine similarity between 
        document vectors to predict whether they are duplicate questions or not.
    """
    doc_distances = []

    for i in range(len(doc_names_and_duplicate_class)):
        # get word vectors for given pair
        vec1_name = doc_names_and_duplicate_class[i][0]
        vec2_name = doc_names_and_duplicate_class[i][1]
        vec1 = model.docvecs[vec1_name]         
        vec2 = model.docvecs[vec2_name]       
        # take cosine distance between them
        distance = cosine_similarity(vec1, vec2)
        doc_distances.append(distance)

    doc_distances = np.array(doc_distances)
    doc_scores = np.array([x[2] for x in doc_names_and_duplicate_class])
    
    return roc_auc_score(doc_scores, doc_distances)

def cosine_similarity(vec1, vec2): 
    # return cosine angle between numpy vectors v1 and v2
    def unit_vector(vec):
        return vec/np.linalg.norm(vec)
    vec1_u, vec2_u = unit_vector(vec1), unit_vector(vec2)
    return np.dot(vec1_u, vec2_u)

# for timing
from contextlib import contextmanager
from timeit import default_timer
import time 

@contextmanager
def elapsed_timer():
    start = default_timer()
    elapser = lambda: default_timer() - start
    yield lambda: elapser()
    end = default_timer()
    elapser = lambda: end-start

# Train model

In [None]:
# initialize Doc2vec model parameters
"""
The documents iterable can be simply a list of TaggedDocument elements, 
but for larger corpora, consider an iterable that streams the documents 
directly from disk/network.

If you don’t supply documents, the model is left uninitialized – 
use if you plan to initialize it in some other way.



dm defines the training algorithm. By default (dm=1), 
    ‘distributed memory’ (PV-DM) is used. Otherwise, distributed bag of words (PV-DBOW) is employed.

size is the dimensionality of the feature vectors.

window is the maximum distance between the predicted word and context words used for prediction within a document.

alpha is the initial learning rate (will linearly drop to zero as training progresses).

seed = for the random number generator. Note that for a fully deterministically-reproducible run, 
    you must also limit the model to a single worker thread, to eliminate ordering jitter from 
    OS thread scheduling. (In Python 3, reproducibility between interpreter launches also 
    requires use of the PYTHONHASHSEED environment variable to control hash randomization.)

min_count = ignore all words with total frequency lower than this.

max_vocab_size = limit RAM during vocabulary building; if there are more unique words 
                 than this, then prune the infrequent ones. Every 10 million word types 
                 need about 1GB of RAM. Set to None for no limit (default).

sample = threshold for configuring which higher-frequency words are randomly downsampled;
        default is 0 (off), useful value is 1e-5.
        
workers = use this many worker threads to train the model (=faster training with multicore machines).

iter = number of iterations (epochs) over the corpus. The default inherited from Word2Vec is 5, 
        but values of 10 or 20 are common in published ‘Paragraph Vector’ experiments.

hs = if 1 (default), hierarchical sampling will be used for model training (else set to 0).

negative = if > 0, negative sampling will be used, the int for negative specifies how many 
            “noise words” should be drawn (usually between 5-20).

dm_mean = if 0 (default), use the sum of the context word vectors. If 1, use the mean. 
            Only applies when dm is used in non-concatenative mode.

dm_concat = if 1, use concatenation of context vectors rather than sum/average; default is 0 (off). 
                Note concatenation results in a much-larger model, as the input is no longer the size of 
                one (sampled or arithmatically combined) word vector, but the size of the tag(s) and all 
                words in the context strung together.

dm_tag_count = expected constant number of document tags per document, when using dm_concat mode; default is 1.

dbow_words if set to 1 trains word-vectors (in skip-gram fashion) simultaneous with DBOW doc-vector training; 
            default is 0 (faster training of doc-vectors only).

"""


# set model parameters parameters

parameters_dict = {

'documents' : all_docs,
'dm' : 0, # use bag-of-words (dbow) model; 1 uses embedding (dmpv) model
'size' : 200, # size of word/doc vectors
'window' : 15, # # max distance between word and neighbor word for word embeddings
'alpha' : .025, # learning rate - use rate in paper
'min_alpha' : 0.0001, # rate from paper
'min_count' : 5, # ignore words with count less than this
'sample' : 1e-5, # how to configure downsampling for high frequency words
'workers' : cores, # number of cores to use
'hs' : 0, # use negative sampling
'negative' : 5, # used in negative sampling
'dbow_words' : 1, # trains word vectors in addition to document vectors in dbow model
'iter' : 3 # recommended number of epochs is ~20 for dbow model on question comparison   

}

## Create parameters for grid search

In [26]:
# create list of parameters to use in model
dms = [0]
sizes = [300]
windows = [5, 15]
alphas = [0.025]
min_alphas = [0.0001]
min_counts = [1, 5]
samples = [1e-5, 5e-5, 1e-4]
workers_s = [cores]
hs_s = [0]
negatives = [5]
dbow_words_s = [1]
iters = [150]

In [27]:
# run through all parameters and record error rate of each one
from itertools import product

# create list to score ROC AUC scores and their model parameters
params_and_errors = []
# create iterable of all combinations of parameters
params_product = product(dms, sizes, windows, alphas, min_alphas, 
                        min_counts, samples, workers_s, hs_s, negatives, 
                        dbow_words_s, iters)
parameters = [x for x in params_product]

In [31]:
print "Starting first run of", len(parameters), "runs"
total_time = 0
for run_number, pars in enumerate(parameters): 
    params = {'dm':pars[0], 'size':pars[1], 'window':pars[2], 
              'alpha':pars[3], 'min_alpha':pars[4], 'min_count':pars[5],
              'sample':pars[6], 'workers':pars[7], 'hs':pars[8],
              'negative':pars[9], 'dbow_words':pars[10], 'iter':pars[11]}
    with elapsed_timer() as elapsed:
        model = gensim.models.doc2vec.Doc2Vec(documents=all_docs, **params)
        AUC_value = calculate_AUC(model, doc_names_and_duplicate_class)
        params_and_errors.append((params, AUC_value))
        duration = '%.1f' % elapsed()
        total_time += float(duration)
        print 
        print "Completed run number", run_number + 1, "of", len(parameters), "runs total"
        print "AUC score:", round(AUC_value, 4)
        print "Training for this run took", round(float(duration)/60.,1), "minutes"

best_AUC = max([x[1] for x in params_and_errors])
print
print
print "Total training time for all runs:", round(float(total_time)/3600.,2), "hours"
print "Best AUC value:", round(best_AUC, 6)
print "Paramters for best AUC value:", [x[0] for x in params_and_errors if x[1] == best_AUC][0]

# convert params and errors into easy-to-read pandas dataframe
params_df = pd.DataFrame([x[0] for x in params_and_errors])
params_df["AUC"] = pd.Series([x[1] for x in params_and_errors])
params_df["num_doc_pairs"] = pd.Series([len(all_docs) for _ in range(len(params_and_errors))])
params_df.sort_values("AUC", ascending=False, inplace=True)

# write parameter values to csv - append if this csv already exists
header=True
if os.path.isfile(parameters_and_errors_name):
    header=False
params_df.to_csv(parameters_and_errors_name, header=header, index=False, mode="a")

Starting first run of 12 runs

Completed run number 1 of 12 runs total
AUC score: 0.3437
Training for this run took 0.0 minutes

Completed run number 2 of 12 runs total
AUC score: 0.6061
Training for this run took 0.0 minutes

Completed run number 3 of 12 runs total
AUC score: 0.7381
Training for this run took 0.0 minutes

Completed run number 4 of 12 runs total
AUC score: 0.4251
Training for this run took 0.0 minutes

Completed run number 5 of 12 runs total
AUC score: 0.4496
Training for this run took 0.0 minutes

Completed run number 6 of 12 runs total
AUC score: 0.479
Training for this run took 0.0 minutes

Completed run number 7 of 12 runs total
AUC score: 0.3148
Training for this run took 0.0 minutes

Completed run number 8 of 12 runs total
AUC score: 0.6298
Training for this run took 0.0 minutes

Completed run number 9 of 12 runs total
AUC score: 0.7259
Training for this run took 0.0 minutes

Completed run number 10 of 12 runs total
AUC score: 0.4274
Training for this run took 0.

In [32]:
params_df

Unnamed: 0,alpha,dbow_words,dm,hs,iter,min_alpha,min_count,negative,sample,size,window,workers,AUC,num_doc_pairs
14,0.025,1,0,0,150,0.0001,1,5,0.0001,300,5,8,0.763003,200
2,0.025,1,0,0,150,0.0001,1,5,0.0001,300,5,8,0.741746,200
26,0.025,1,0,0,150,0.0001,1,5,0.0001,300,5,8,0.738128,200
20,0.025,1,0,0,150,0.0001,1,5,0.0001,300,15,8,0.730891,200
32,0.025,1,0,0,150,0.0001,1,5,0.0001,300,15,8,0.725916,200
8,0.025,1,0,0,150,0.0001,1,5,0.0001,300,15,8,0.715061,200
7,0.025,1,0,0,150,0.0001,1,5,5e-05,300,15,8,0.634555,200
31,0.025,1,0,0,150,0.0001,1,5,5e-05,300,15,8,0.629806,200
19,0.025,1,0,0,150,0.0001,1,5,5e-05,300,15,8,0.626866,200
25,0.025,1,0,0,150,0.0001,1,5,5e-05,300,5,8,0.606061,200


In [None]:
# show only the columns that changed in value 
params_df.loc[:, (params_df != params_df.ix[0]).any()]

In [None]:
""" PARAMETER TIPS: 

sample: CRITICAL: 1e-6 terrible, 5e-6 and 1e-5 better - try 5e-5 
min_count: 1 works well with small data set
window: 5 and 15 both worked reasonably well
size: 300 works well
iter: 150 seems ok, try more - sentence similarity in paper used ~400 to converge






"""



In [None]:
# read in paramater file to see what has been done before
all_params_df = pd.read_csv(parameters_and_errors_name)

In [None]:
all_params_df

In [None]:
# 0.672 with 100-length vectors and 20 epochs
# 0.671 with 300-length vectors and 20 epochs
# 0.74 with 200-length vectors and 200 epochs
# 0.74 with 200-length vectors and 100 epochs

# model seems to tail off in accuracy around 100 epochs

# Manual runs with accuracy check at each epoch

In [None]:
# try a run with manual epochs and recording accuracy after each epoch
model = gensim.models.doc2vec.Doc2Vec(documents=None, dm=dm, size=size, window=window, 
                                      alpha=alpha, min_alpha=min_alpha, min_count=min_count, 
                                      sample=sample, workers=workers, hs=hs, negative=negative, 
                                      dbow_words=dbow_words, iter=1)

In [None]:
from random import shuffle

# number of epochs to train
# need to manually decrease alpha in this case
epochs = 100
alpha = 0.025
min_alpha = 0.0001
alpha_delta = (alpha - min_alpha) / epochs

model.build_vocab(all_docs)
with elapsed_timer() as elapsed:
    for epoch in range(epochs): 
        model.alpha, model.min_alpha = alpha, alpha
        # shuffle documents
        shuffle(all_docs)
        # train model
        model.train(all_docs)
        # evaluate model 
        error = calculate_AUC(model, doc_names_and_duplicate_class)
        alpha -= alpha_delta
        print "AUC for epoch", epoch, ":", error
    duration = '%.1f' % elapsed()
    print("completed training in %s minutes" % (duration/60.))