In [1]:
import gensim
import multiprocessing
import os

# Create Cleaned Data from raw .tsv file

In [109]:
# File and folder names shared by the cells below. The download/cleaning cell
# regenerates the data whenever the folder `questions_folder_name` or the file
# `dup_file_name` is missing.
raw_file_name = "quora_duplicate_questions.tsv"  # raw TSV exactly as downloaded
q1_file_name = "cleaned_q1.txt"  # cleaned first questions, written inside questions_folder_name
q2_file_name = "cleaned_q2.txt"  # cleaned second questions, written inside questions_folder_name
dup_file_name = "is_duplicate.txt"  # one duplicate label per line, written next to the notebook
questions_folder_name = "cleaned_data"  # folder holding the two cleaned question files

# file to store hyperparameter tuning numbers (grid-search results are appended to it)
parameters_and_errors_name = "test_parameters_and_errors.csv"# alternative name, presumably for non-test runs: "parameters_and_errors.csv"

In [3]:
import pandas as pd
import os
import string
import requests

# Download and clean the question-pair data, unless a previous run already
# produced the cleaned folder and the label file named in the config cell.
if not os.path.isdir(questions_folder_name) or not os.path.isfile(dup_file_name):

    # Download questions file - data set has been changed since first release
    url = 'http://qim.ec.quoracdn.net/quora_duplicate_questions.tsv'
    r = requests.get(url)
    # Stop on an HTTP error instead of saving the error page as if it were the dataset
    r.raise_for_status()
    with open(raw_file_name, 'wb') as f:
        f.write(r.content)

    # DATA CLEANING WITH PANDAS

    # read in file into dataframe
    # NOTE(review): the cleaning below assumes the TSV header provides columns
    # named text1, text2 and duplicate - confirm against the downloaded file
    data = pd.read_csv(raw_file_name, sep='\t')
    # drop rows with null value
    data.dropna(inplace=True)

    # apply the same three cleaning steps to both question columns
    for raw_column, cleaned_column in (("text1", "cleaned_q1"), ("text2", "cleaned_q2")):
        # lower-case the question text
        cleaned = data[raw_column].str.lower()
        # remove punctuation characters
        cleaned = cleaned.apply(lambda x: ''.join([i for i in x if i not in string.punctuation]))
        # remove the character "\n", which messes up the line delimiters in txt file
        # these only occur ~20 times in the questions
        data[cleaned_column] = cleaned.str.replace("\n", "")

    # shuffle data before writing to file - this way random sample can be taken from file
    # simply by choosing first n rows of file
    data = data.sample(frac=1)

    # create directory to hold question data
    if not os.path.exists(questions_folder_name):
        os.makedirs(questions_folder_name)

    # write cleaned text rows to txt files, one line for each sentence
    data["cleaned_q1"].to_csv(os.path.join(questions_folder_name, q1_file_name), sep='\n', header=False, index=False)
    data["cleaned_q2"].to_csv(os.path.join(questions_folder_name, q2_file_name), sep='\n', header=False, index=False)
    # write dup values to txt file, one line for each value
    # (same row order as the two question files, so line i of every file describes pair i)
    data["duplicate"].to_csv(dup_file_name, sep='\n', header=False, index=False)

    # single-argument print(...) emits the same text under Python 2 and Python 3
    print("Saved file with %d rows at %s" % (len(data), raw_file_name))
else:
    print("Directory that should contain data already exists.")

Directory that should contain data already exists.


# Load in data

## Input parameters

In [4]:
# notebook assumes you have a directory "questions_directory" that contains two text files, one for all
# first questions and one for all second questions, and a file is_dup_file that has one boolean value (0 or 1)
# on each line, one for all ~400,000 document pairs that tells whether questions are duplicates or not

# path to questions directory (contains sorted question text files) and
# is_dup_file (contains sorted answers about whether question pairs are duplicates)

# These files and directories are created automatically by the above cell
questions_directory = questions_folder_name
is_dup_file = dup_file_name
# os.listdir returns entries in arbitrary order; sort them so that index 0 and
# index 1 (used later to build document tag names) are stable across runs and machines
questions_file_names = sorted(os.path.basename(filename) for filename in os.listdir(questions_directory))

In [5]:
questions_file_names

['cleaned_q1.txt', 'cleaned_q2.txt']

## Load in data

In [110]:
# set number of rows (question pairs) to train on
# only the first num_question_pairs lines of each question file and of the
# label file are read by the loading cells below
num_question_pairs = 5000

In [101]:
# produce logs during training
import logging
# timestamp : level : message, at INFO verbosity
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
# the level is also set directly on the root logger
# NOTE(review): presumably needed because basicConfig has no effect when the
# kernel has already attached a handler to the root logger - confirm
rootLogger = logging.getLogger()
rootLogger.setLevel(logging.INFO)

In [119]:
# create iterator to run through directory of text files with one sentence per line
from itertools import izip, count

# class that iterates through first "rows" lines of questions list ("rows" is integer)
class LabeledLineSentence(object):
    """Stream every file of a directory as tagged documents, one per text line.

    Each yielded document carries the whitespace-split words of one line and a
    single tag of the form "<file name>_<line index>", which is the naming
    scheme the evaluation code uses to look up document vectors.

    Parameters
    ----------
    dirname : str
        Directory whose files are read; every entry is treated as a text file.
    rows : int or None
        When truthy, only the first `rows` lines of each file are yielded.
        None (and 0) mean "no limit".
    """

    def __init__(self, dirname, rows=None):
        self.dirname = dirname
        self.rows = rows

    def __iter__(self):
        # sorted(): os.listdir order is arbitrary, and a stable document order
        # keeps repeated runs comparable
        for filename in sorted(os.listdir(self.dirname)):
            # `with` guarantees the handle is closed, including when the row
            # limit cuts the file short
            with open(os.path.join(self.dirname, filename)) as text_file:
                for uid, text_line in enumerate(text_file):
                    if self.rows and uid >= self.rows:
                        break
                    yield gensim.models.doc2vec.LabeledSentence(words=text_line.split(), tags=[os.path.basename(filename) + '_%s' % uid])

# make sure using fast version
# NOTE(review): presumably FAST_VERSION is -1 when gensim falls back to its
# slow pure-Python training code - confirm against the gensim documentation
assert gensim.models.doc2vec.FAST_VERSION > -1
cores = multiprocessing.cpu_count() # number of cores on computer to use for computations

In [111]:
# Materialise the tagged documents in a plain list rather than streaming them
# from disk on every pass: the combined data is only 200-300 megabytes, and an
# in-memory list makes shuffling the documents straightforward.
sentences = LabeledLineSentence(questions_directory, rows=num_question_pairs)
all_docs = list(sentences)

print('%d question pairs to train (%d documents total)' % (num_question_pairs, len(all_docs)))

5000 question pairs to train (10000 documents total)


In [112]:
# make list of tuples (document1, document2, is_dup) for all num_question_pairs
# the document names come from naming scheme used in LabeledLineSentence class

doc_names_and_duplicate_class = []
# `with` closes the label file once the needed rows have been read
with open(is_dup_file) as dup_file:
    for i, line in enumerate(dup_file):
        # only the first num_question_pairs labels have matching documents
        if i >= num_question_pairs:
            break
        doc_tup = (questions_file_names[0] + "_" + str(i), questions_file_names[1] + "_" + str(i), int(line.strip("\n")))
        doc_names_and_duplicate_class.append(doc_tup)

# single-argument print(...) emits the same text under Python 2 and Python 3
print("%d document pairs to classify" % len(doc_names_and_duplicate_class))
print("Document pair names and labels contained in doc_names_and_duplicate_class")

5000 document pairs to classify
Document pair names and labels contained in doc_names_and_duplicate_class


In [113]:
doc_names_and_duplicate_class[0]

('cleaned_q1.txt_0', 'cleaned_q2.txt_0', 1)

# Evaluate Model

In [114]:
import numpy as np
from sklearn.metrics import roc_auc_score

def calculate_AUC(model, doc_names_and_duplicate_class): 
    """ Score a trained model by the area under the ROC curve.

        For every (name1, name2, label) entry the two document vectors are looked
        up in model.docvecs and their cosine similarity is used as the predicted
        score for "these two questions are duplicates"; the labels are the ground
        truth passed to roc_auc_score.

        model: object exposing document vectors through model.docvecs[name]
        doc_names_and_duplicate_class: sequence of (name1, name2, label) entries
        returns: the value computed by roc_auc_score(labels, similarities)
    """
    # one similarity per question pair, in input order
    doc_distances = np.array([
        cosine_similarity(model.docvecs[pair[0]], model.docvecs[pair[1]])
        for pair in doc_names_and_duplicate_class
    ])
    # ground-truth labels sit in the third slot of every entry
    doc_scores = np.array([pair[2] for pair in doc_names_and_duplicate_class])

    return roc_auc_score(doc_scores, doc_distances)

def cosine_similarity(vec1, vec2): 
    """Return the cosine of the angle between numpy vectors vec1 and vec2.

    Both vectors are scaled to unit length and their dot product is returned.
    If either vector has zero length the angle is undefined; 0.0 is returned
    in that case instead of the NaN a division by a zero norm would produce.
    """
    norm1, norm2 = np.linalg.norm(vec1), np.linalg.norm(vec2)
    # guard against dividing by a zero norm
    if norm1 == 0 or norm2 == 0:
        return 0.0
    return np.dot(vec1 / norm1, vec2 / norm2)

# for timing
from contextlib import contextmanager
from timeit import default_timer
import time 

@contextmanager
def elapsed_timer():
    """Context manager yielding a zero-argument callable that reports elapsed seconds.

    Inside the `with` block the callable returns the time since the block was
    entered. Once the block has been left - normally or through an exception -
    the callable keeps returning the block's total duration.
    """
    start = default_timer()
    final_duration = []  # receives the total duration when the block exits

    def elapsed():
        if final_duration:
            return final_duration[0]
        return default_timer() - start

    try:
        yield elapsed
    finally:
        # freeze the reading even when the body raised; the exception still propagates
        final_duration.append(default_timer() - start)

# Train model

In [37]:
# initialize Doc2vec model parameters
"""
The documents iterable can be simply a list of TaggedDocument elements, 
but for larger corpora, consider an iterable that streams the documents 
directly from disk/network.

If you don’t supply documents, the model is left uninitialized – 
use if you plan to initialize it in some other way.



dm defines the training algorithm. By default (dm=1), 
    ‘distributed memory’ (PV-DM) is used. Otherwise, distributed bag of words (PV-DBOW) is employed.

size is the dimensionality of the feature vectors.

window is the maximum distance between the predicted word and context words used for prediction within a document.

alpha is the initial learning rate (will linearly drop to zero as training progresses).

seed = for the random number generator. Note that for a fully deterministically-reproducible run, 
    you must also limit the model to a single worker thread, to eliminate ordering jitter from 
    OS thread scheduling. (In Python 3, reproducibility between interpreter launches also 
    requires use of the PYTHONHASHSEED environment variable to control hash randomization.)

min_count = ignore all words with total frequency lower than this.

max_vocab_size = limit RAM during vocabulary building; if there are more unique words 
                 than this, then prune the infrequent ones. Every 10 million word types 
                 need about 1GB of RAM. Set to None for no limit (default).

sample = threshold for configuring which higher-frequency words are randomly downsampled;
        default is 0 (off), useful value is 1e-5.
        
workers = use this many worker threads to train the model (=faster training with multicore machines).

iter = number of iterations (epochs) over the corpus. The default inherited from Word2Vec is 5, 
        but values of 10 or 20 are common in published ‘Paragraph Vector’ experiments.

hs = if 1 (default), hierarchical softmax will be used for model training (else set to 0).

negative = if > 0, negative sampling will be used, the int for negative specifies how many 
            “noise words” should be drawn (usually between 5-20).

dm_mean = if 0 (default), use the sum of the context word vectors. If 1, use the mean. 
            Only applies when dm is used in non-concatenative mode.

dm_concat = if 1, use concatenation of context vectors rather than sum/average; default is 0 (off). 
                Note concatenation results in a much-larger model, as the input is no longer the size of 
                one (sampled or arithmetically combined) word vector, but the size of the tag(s) and all 
                words in the context strung together.

dm_tag_count = expected constant number of document tags per document, when using dm_concat mode; default is 1.

dbow_words if set to 1 trains word-vectors (in skip-gram fashion) simultaneous with DBOW doc-vector training; 
            default is 0 (faster training of doc-vectors only).

"""


# set model parameters
# (keys are the Doc2Vec keyword arguments described in the notes above)

parameters_dict = {

'documents' : all_docs,
'dm' : 0, # 0 selects PV-DBOW (distributed bag of words); 1 would select PV-DM (distributed memory)
'size' : 200, # size of word/doc vectors
'window' : 15, # max distance between word and neighbor word for word embeddings
'alpha' : .025, # learning rate - use rate in paper
'min_alpha' : 0.0001, # rate from paper
'min_count' : 5, # ignore words with count less than this
'sample' : 1e-5, # how to configure downsampling for high frequency words
'workers' : cores, # number of cores to use
'hs' : 0, # use negative sampling
'negative' : 5, # used in negative sampling
'dbow_words' : 1, # trains word vectors in addition to document vectors in dbow model
'iter' : 3 # recommended number of epochs is ~20 for dbow model on question comparison   

}

## Create parameters for grid search

In [123]:
# create list of parameters to use in model
# Each list holds the candidate values for one Doc2Vec argument; the grid search
# trains one model per combination of one value from every list. With the values
# below only windows, min_counts and samples vary: 2 * 2 * 3 = 12 runs.
dms = [0]
sizes = [300]
windows = [5, 15]
alphas = [0.025]
min_alphas = [0.0001]
min_counts = [1, 5]
samples = [1e-5, 0.5e-5, 1e-6]
workers_s = [cores]
hs_s = [0]
negatives = [5]
dbow_words_s = [1]
iters = [150]

In [124]:
# run through all parameters and record error rate of each one
from itertools import product

# accumulates one (parameter dict, ROC AUC) tuple per trained model
params_and_errors = []
# every combination of the candidate values; positions inside each tuple follow
# the argument order used here
params_product = product(
    dms, sizes, windows, alphas, min_alphas, min_counts,
    samples, workers_s, hs_s, negatives, dbow_words_s, iters
)
parameters = list(params_product)

In [125]:
print "Starting first run of", len(parameters), "runs"
total_time = 0
for run_number, pars in enumerate(parameters): 
    params = {'dm':pars[0], 'size':pars[1], 'window':pars[2], 
              'alpha':pars[3], 'min_alpha':pars[4], 'min_count':pars[5],
              'sample':pars[6], 'workers':pars[7], 'hs':pars[8],
              'negative':pars[9], 'dbow_words':pars[10], 'iter':pars[11]}
    with elapsed_timer() as elapsed:
        model = gensim.models.doc2vec.Doc2Vec(documents=all_docs, **params)
        AUC_value = calculate_AUC(model, doc_names_and_duplicate_class)
        params_and_errors.append((params, AUC_value))
        duration = '%.1f' % elapsed()
        total_time += float(duration)
        print 
        print "Completed run number", run_number + 1, "of", len(parameters), "runs total"
        print "AUC score:", round(AUC_value, 4)
        print "Training for this run took", round(float(duration)/60.,1), "minutes"

best_AUC = max([x[1] for x in params_and_errors])
print
print
print "Total training time for all runs:", round(float(total_time)/3600.,2), "hours"
print "Best AUC value:", round(best_AUC, 6)
print "Paramters for best AUC value:", [x[0] for x in params_and_errors if x[1] == best_AUC][0]

# convert params and errors into easy-to-read pandas dataframe
params_df = pd.DataFrame([x[0] for x in params_and_errors])
params_df["AUC"] = pd.Series([x[1] for x in params_and_errors])
params_df.sort_values("AUC", ascending=False, inplace=True)

# write parameter values to csv - append if this csv already exists
header=True
if os.path.isfile(parameters_and_errors_name):
    header=False
params_df.to_csv(parameters_and_errors_name, header=header, index=False, mode="a")

2017-02-23 14:14:12,858 : INFO : collecting all words and their counts
2017-02-23 14:14:12,883 : INFO : PROGRESS: at example #0, processed 0 words (0/s), 0 word types, 0 tags
2017-02-23 14:14:13,062 : INFO : collected 10735 word types and 10000 unique tags from a corpus of 10000 examples and 108824 words
2017-02-23 14:14:13,067 : INFO : Loading a fresh vocabulary


Starting first run of 12 runs


2017-02-23 14:14:13,161 : INFO : min_count=1 retains 10735 unique words (100% of original 10735, drops 0)
2017-02-23 14:14:13,162 : INFO : min_count=1 leaves 108824 word corpus (100% of original 108824, drops 0)
2017-02-23 14:14:13,279 : INFO : deleting the raw counts dictionary of 10735 items
2017-02-23 14:14:13,280 : INFO : sample=1e-05 downsamples 3746 most-common words
2017-02-23 14:14:13,282 : INFO : downsampling leaves estimated 26577 word corpus (24.4% of prior 108824)
2017-02-23 14:14:13,286 : INFO : estimated required memory for 10735 words and 300 dimensions: 45131500 bytes
2017-02-23 14:14:13,373 : INFO : resetting layer weights
2017-02-23 14:14:14,156 : INFO : training model with 4 workers on 10735 vocabulary and 300 features, using sg=1 hs=0 sample=1e-05 negative=5 window=5
2017-02-23 14:14:14,157 : INFO : expecting 10000 sentences, matching count from corpus used for vocabulary survey
2017-02-23 14:14:15,401 : INFO : PROGRESS: at 0.55% examples, 24621 words/s, in_qsize 7,


Completed run number 1 of 12 runs total
AUC score: 0.7127
Training for this run took 2.2 minutes


2017-02-23 14:16:26,502 : INFO : min_count=1 retains 10735 unique words (100% of original 10735, drops 0)
2017-02-23 14:16:26,504 : INFO : min_count=1 leaves 108824 word corpus (100% of original 108824, drops 0)
2017-02-23 14:16:26,592 : INFO : deleting the raw counts dictionary of 10735 items
2017-02-23 14:16:26,594 : INFO : sample=5e-06 downsamples 5947 most-common words
2017-02-23 14:16:26,594 : INFO : downsampling leaves estimated 19732 word corpus (18.1% of prior 108824)
2017-02-23 14:16:26,596 : INFO : estimated required memory for 10735 words and 300 dimensions: 45131500 bytes
2017-02-23 14:16:26,663 : INFO : resetting layer weights
2017-02-23 14:16:27,291 : INFO : training model with 4 workers on 10735 vocabulary and 300 features, using sg=1 hs=0 sample=5e-06 negative=5 window=5
2017-02-23 14:16:27,292 : INFO : expecting 10000 sentences, matching count from corpus used for vocabulary survey
2017-02-23 14:16:28,538 : INFO : PROGRESS: at 0.80% examples, 28686 words/s, in_qsize 7,


Completed run number 2 of 12 runs total
AUC score: 0.7052
Training for this run took 2.1 minutes


2017-02-23 14:18:32,299 : INFO : min_count=1 retains 10735 unique words (100% of original 10735, drops 0)
2017-02-23 14:18:32,300 : INFO : min_count=1 leaves 108824 word corpus (100% of original 108824, drops 0)
2017-02-23 14:18:32,384 : INFO : deleting the raw counts dictionary of 10735 items
2017-02-23 14:18:32,385 : INFO : sample=1e-06 downsamples 10735 most-common words
2017-02-23 14:18:32,387 : INFO : downsampling leaves estimated 7983 word corpus (7.3% of prior 108824)
2017-02-23 14:18:32,388 : INFO : estimated required memory for 10735 words and 300 dimensions: 45131500 bytes
2017-02-23 14:18:32,453 : INFO : resetting layer weights
2017-02-23 14:18:33,062 : INFO : training model with 4 workers on 10735 vocabulary and 300 features, using sg=1 hs=0 sample=1e-06 negative=5 window=5
2017-02-23 14:18:33,063 : INFO : expecting 10000 sentences, matching count from corpus used for vocabulary survey
2017-02-23 14:18:34,168 : INFO : PROGRESS: at 0.80% examples, 19614 words/s, in_qsize 7, 


Completed run number 3 of 12 runs total
AUC score: 0.3493
Training for this run took 2.1 minutes


2017-02-23 14:20:39,980 : INFO : resetting layer weights
2017-02-23 14:20:40,379 : INFO : training model with 4 workers on 2343 vocabulary and 300 features, using sg=1 hs=0 sample=1e-05 negative=5 window=5
2017-02-23 14:20:40,381 : INFO : expecting 10000 sentences, matching count from corpus used for vocabulary survey
2017-02-23 14:20:41,538 : INFO : PROGRESS: at 0.80% examples, 23341 words/s, in_qsize 7, out_qsize 1
2017-02-23 14:20:42,649 : INFO : PROGRESS: at 1.78% examples, 26257 words/s, in_qsize 8, out_qsize 0
2017-02-23 14:20:43,782 : INFO : PROGRESS: at 2.76% examples, 27043 words/s, in_qsize 7, out_qsize 0
2017-02-23 14:20:44,905 : INFO : PROGRESS: at 3.74% examples, 27498 words/s, in_qsize 8, out_qsize 0
2017-02-23 14:20:46,026 : INFO : PROGRESS: at 4.72% examples, 27802 words/s, in_qsize 8, out_qsize 0
2017-02-23 14:20:47,139 : INFO : PROGRESS: at 5.70% examples, 28032 words/s, in_qsize 7, out_qsize 0
2017-02-23 14:20:48,244 : INFO : PROGRESS: at 6.67% examples, 28226 words/


Completed run number 4 of 12 runs total
AUC score: 0.642
Training for this run took 1.9 minutes


2017-02-23 14:22:34,880 : INFO : deleting the raw counts dictionary of 10735 items
2017-02-23 14:22:34,882 : INFO : sample=5e-06 downsamples 2343 most-common words
2017-02-23 14:22:34,883 : INFO : downsampling leaves estimated 8125 word corpus (8.6% of prior 94878)
2017-02-23 14:22:34,884 : INFO : estimated required memory for 2343 words and 300 dimensions: 20794700 bytes
2017-02-23 14:22:34,904 : INFO : resetting layer weights
2017-02-23 14:22:35,284 : INFO : training model with 4 workers on 2343 vocabulary and 300 features, using sg=1 hs=0 sample=5e-06 negative=5 window=5
2017-02-23 14:22:35,285 : INFO : expecting 10000 sentences, matching count from corpus used for vocabulary survey
2017-02-23 14:22:36,377 : INFO : PROGRESS: at 0.80% examples, 19851 words/s, in_qsize 7, out_qsize 0
2017-02-23 14:22:37,451 : INFO : PROGRESS: at 1.78% examples, 22303 words/s, in_qsize 8, out_qsize 0
2017-02-23 14:22:38,506 : INFO : PROGRESS: at 2.76% examples, 23262 words/s, in_qsize 8, out_qsize 0
20


Completed run number 5 of 12 runs total
AUC score: 0.5569
Training for this run took 2.0 minutes


2017-02-23 14:24:36,404 : INFO : deleting the raw counts dictionary of 10735 items
2017-02-23 14:24:36,407 : INFO : sample=1e-06 downsamples 2343 most-common words
2017-02-23 14:24:36,409 : INFO : downsampling leaves estimated 3358 word corpus (3.5% of prior 94878)
2017-02-23 14:24:36,410 : INFO : estimated required memory for 2343 words and 300 dimensions: 20794700 bytes
2017-02-23 14:24:36,431 : INFO : resetting layer weights
2017-02-23 14:24:36,893 : INFO : training model with 4 workers on 2343 vocabulary and 300 features, using sg=1 hs=0 sample=1e-06 negative=5 window=5
2017-02-23 14:24:36,895 : INFO : expecting 10000 sentences, matching count from corpus used for vocabulary survey
2017-02-23 14:24:38,081 : INFO : PROGRESS: at 0.80% examples, 13595 words/s, in_qsize 8, out_qsize 0
2017-02-23 14:24:39,211 : INFO : PROGRESS: at 1.78% examples, 15423 words/s, in_qsize 8, out_qsize 0
2017-02-23 14:24:40,231 : INFO : PROGRESS: at 2.51% examples, 15099 words/s, in_qsize 7, out_qsize 0
20


Completed run number 6 of 12 runs total
AUC score: 0.4626
Training for this run took 1.9 minutes


2017-02-23 14:26:33,223 : INFO : min_count=1 retains 10735 unique words (100% of original 10735, drops 0)
2017-02-23 14:26:33,225 : INFO : min_count=1 leaves 108824 word corpus (100% of original 108824, drops 0)
2017-02-23 14:26:33,295 : INFO : deleting the raw counts dictionary of 10735 items
2017-02-23 14:26:33,297 : INFO : sample=1e-05 downsamples 3746 most-common words
2017-02-23 14:26:33,298 : INFO : downsampling leaves estimated 26577 word corpus (24.4% of prior 108824)
2017-02-23 14:26:33,300 : INFO : estimated required memory for 10735 words and 300 dimensions: 45131500 bytes
2017-02-23 14:26:33,354 : INFO : resetting layer weights
2017-02-23 14:26:34,080 : INFO : training model with 4 workers on 10735 vocabulary and 300 features, using sg=1 hs=0 sample=1e-05 negative=5 window=15
2017-02-23 14:26:34,081 : INFO : expecting 10000 sentences, matching count from corpus used for vocabulary survey
2017-02-23 14:26:35,149 : INFO : PROGRESS: at 0.55% examples, 28593 words/s, in_qsize 7


Completed run number 7 of 12 runs total
AUC score: 0.6987
Training for this run took 2.4 minutes


2017-02-23 14:28:58,791 : INFO : min_count=1 leaves 108824 word corpus (100% of original 108824, drops 0)
2017-02-23 14:28:58,890 : INFO : deleting the raw counts dictionary of 10735 items
2017-02-23 14:28:58,891 : INFO : sample=5e-06 downsamples 5947 most-common words
2017-02-23 14:28:58,893 : INFO : downsampling leaves estimated 19732 word corpus (18.1% of prior 108824)
2017-02-23 14:28:58,894 : INFO : estimated required memory for 10735 words and 300 dimensions: 45131500 bytes
2017-02-23 14:28:58,973 : INFO : resetting layer weights
2017-02-23 14:28:59,654 : INFO : training model with 4 workers on 10735 vocabulary and 300 features, using sg=1 hs=0 sample=5e-06 negative=5 window=15
2017-02-23 14:28:59,655 : INFO : expecting 10000 sentences, matching count from corpus used for vocabulary survey
2017-02-23 14:29:00,664 : INFO : PROGRESS: at 0.73% examples, 32826 words/s, in_qsize 8, out_qsize 0
2017-02-23 14:29:01,878 : INFO : PROGRESS: at 1.53% examples, 30908 words/s, in_qsize 8, out


Completed run number 8 of 12 runs total
AUC score: 0.6974
Training for this run took 2.3 minutes


2017-02-23 14:31:15,716 : INFO : min_count=1 retains 10735 unique words (100% of original 10735, drops 0)
2017-02-23 14:31:15,717 : INFO : min_count=1 leaves 108824 word corpus (100% of original 108824, drops 0)
2017-02-23 14:31:15,793 : INFO : deleting the raw counts dictionary of 10735 items
2017-02-23 14:31:15,794 : INFO : sample=1e-06 downsamples 10735 most-common words
2017-02-23 14:31:15,795 : INFO : downsampling leaves estimated 7983 word corpus (7.3% of prior 108824)
2017-02-23 14:31:15,796 : INFO : estimated required memory for 10735 words and 300 dimensions: 45131500 bytes
2017-02-23 14:31:15,858 : INFO : resetting layer weights
2017-02-23 14:31:16,563 : INFO : training model with 4 workers on 10735 vocabulary and 300 features, using sg=1 hs=0 sample=1e-06 negative=5 window=15
2017-02-23 14:31:16,564 : INFO : expecting 10000 sentences, matching count from corpus used for vocabulary survey
2017-02-23 14:31:17,752 : INFO : PROGRESS: at 0.80% examples, 18318 words/s, in_qsize 7,


Completed run number 9 of 12 runs total
AUC score: 0.3474
Training for this run took 2.0 minutes


2017-02-23 14:33:19,049 : INFO : training model with 4 workers on 2343 vocabulary and 300 features, using sg=1 hs=0 sample=1e-05 negative=5 window=15
2017-02-23 14:33:19,050 : INFO : expecting 10000 sentences, matching count from corpus used for vocabulary survey
2017-02-23 14:33:20,288 : INFO : PROGRESS: at 0.80% examples, 21547 words/s, in_qsize 7, out_qsize 0
2017-02-23 14:33:21,526 : INFO : PROGRESS: at 1.78% examples, 23957 words/s, in_qsize 7, out_qsize 0
2017-02-23 14:33:22,713 : INFO : PROGRESS: at 2.76% examples, 25061 words/s, in_qsize 7, out_qsize 0
2017-02-23 14:33:23,718 : INFO : PROGRESS: at 3.62% examples, 25785 words/s, in_qsize 7, out_qsize 0
2017-02-23 14:33:24,857 : INFO : PROGRESS: at 4.47% examples, 25645 words/s, in_qsize 7, out_qsize 0
2017-02-23 14:33:25,891 : INFO : PROGRESS: at 5.20% examples, 25292 words/s, in_qsize 7, out_qsize 0
2017-02-23 14:33:26,957 : INFO : PROGRESS: at 6.12% examples, 25769 words/s, in_qsize 7, out_qsize 0
2017-02-23 14:33:28,156 : INF


Completed run number 10 of 12 runs total
AUC score: 0.6555
Training for this run took 2.2 minutes


2017-02-23 14:35:29,108 : INFO : min_count=5 leaves 94878 word corpus (87% of original 108824, drops 13946)
2017-02-23 14:35:29,141 : INFO : deleting the raw counts dictionary of 10735 items
2017-02-23 14:35:29,143 : INFO : sample=5e-06 downsamples 2343 most-common words
2017-02-23 14:35:29,149 : INFO : downsampling leaves estimated 8125 word corpus (8.6% of prior 94878)
2017-02-23 14:35:29,151 : INFO : estimated required memory for 2343 words and 300 dimensions: 20794700 bytes
2017-02-23 14:35:29,179 : INFO : resetting layer weights
2017-02-23 14:35:29,746 : INFO : training model with 4 workers on 2343 vocabulary and 300 features, using sg=1 hs=0 sample=5e-06 negative=5 window=15
2017-02-23 14:35:29,748 : INFO : expecting 10000 sentences, matching count from corpus used for vocabulary survey
2017-02-23 14:35:30,810 : INFO : PROGRESS: at 0.55% examples, 14459 words/s, in_qsize 8, out_qsize 0
2017-02-23 14:35:31,811 : INFO : PROGRESS: at 1.17% examples, 15573 words/s, in_qsize 8, out_qs


Completed run number 11 of 12 runs total
AUC score: 0.5567
Training for this run took 2.2 minutes


2017-02-23 14:37:38,829 : INFO : min_count=5 leaves 94878 word corpus (87% of original 108824, drops 13946)
2017-02-23 14:37:38,856 : INFO : deleting the raw counts dictionary of 10735 items
2017-02-23 14:37:38,858 : INFO : sample=1e-06 downsamples 2343 most-common words
2017-02-23 14:37:38,860 : INFO : downsampling leaves estimated 3358 word corpus (3.5% of prior 94878)
2017-02-23 14:37:38,861 : INFO : estimated required memory for 2343 words and 300 dimensions: 20794700 bytes
2017-02-23 14:37:38,884 : INFO : resetting layer weights
2017-02-23 14:37:39,268 : INFO : training model with 4 workers on 2343 vocabulary and 300 features, using sg=1 hs=0 sample=1e-06 negative=5 window=15
2017-02-23 14:37:39,269 : INFO : expecting 10000 sentences, matching count from corpus used for vocabulary survey
2017-02-23 14:37:40,338 : INFO : PROGRESS: at 0.80% examples, 15046 words/s, in_qsize 8, out_qsize 0
2017-02-23 14:37:41,379 : INFO : PROGRESS: at 1.78% examples, 16979 words/s, in_qsize 7, out_qs


Completed run number 12 of 12 runs total
AUC score: 0.4654
Training for this run took 1.9 minutes


Total training time for all runs: 0.03 hours
Best AUC value: 0.712672
Paramters for best AUC value: {'dm': 0, 'hs': 0, 'sample': 1e-05, 'dbow_words': 1, 'alpha': 0.025, 'min_count': 1, 'size': 300, 'workers': 4, 'negative': 5, 'iter': 150, 'window': 5, 'min_alpha': 0.0001}


In [128]:
params_df

Unnamed: 0,alpha,dbow_words,dm,hs,iter,min_alpha,min_count,negative,sample,size,window,workers,AUC
0,0.025,1,0,0,150,0.0001,1,5,1e-05,300,5,4,0.712672
1,0.025,1,0,0,150,0.0001,1,5,5e-06,300,5,4,0.705181
6,0.025,1,0,0,150,0.0001,1,5,1e-05,300,15,4,0.698711
7,0.025,1,0,0,150,0.0001,1,5,5e-06,300,15,4,0.69739
9,0.025,1,0,0,150,0.0001,5,5,1e-05,300,15,4,0.65548
3,0.025,1,0,0,150,0.0001,5,5,1e-05,300,5,4,0.641993
4,0.025,1,0,0,150,0.0001,5,5,5e-06,300,5,4,0.556895
10,0.025,1,0,0,150,0.0001,5,5,5e-06,300,15,4,0.556657
11,0.025,1,0,0,150,0.0001,5,5,1e-06,300,15,4,0.465403
5,0.025,1,0,0,150,0.0001,5,5,1e-06,300,5,4,0.462598


In [129]:
# show only the columns that changed in value
# every row is compared with the row labelled 0; a column is kept when any row differs.
# .loc[0] is the label-based replacement for the deprecated .ix[0]
params_df.loc[:, (params_df != params_df.loc[0]).any()]

Unnamed: 0,min_count,sample,window,AUC
0,1,1e-05,5,0.712672
1,1,5e-06,5,0.705181
6,1,1e-05,15,0.698711
7,1,5e-06,15,0.69739
9,5,1e-05,15,0.65548
3,5,1e-05,5,0.641993
4,5,5e-06,5,0.556895
10,5,5e-06,15,0.556657
11,5,1e-06,15,0.465403
5,5,1e-06,5,0.462598


In [None]:
""" PARAMETER TIPS: 

sample: CRITICAL: 1e-6 terrible, 5e-6 and 1e-5 better - try 5e-5 
min_count: 1 works well with small data set
window: 5 and 15 both worked reasonably well
size: 300 works well
iter: 150 seems ok, try more - sentence similarity in paper used ~400 to converge






"""



In [92]:
# read in the parameter file to see which runs have been recorded before
all_params_df = pd.read_csv(parameters_and_errors_name)

In [93]:
all_params_df

Unnamed: 0,alpha,dbow_words,dm,hs,iter,min_alpha,min_count,negative,sample,size,window,workers,AUC
0,0.025,1,0,0,100,0.0001,10,5,1e-05,300,15,4,0.737806
1,0.025,1,0,0,100,0.0001,10,5,1e-05,100,15,4,0.737249
2,0.025,1,0,0,100,0.0001,5,5,1e-05,100,15,4,0.737077
3,0.025,1,0,0,100,0.0001,10,5,1e-05,200,15,4,0.737076
4,0.025,1,0,0,100,0.0001,5,5,1e-05,200,15,4,0.736904
5,0.025,1,0,0,100,0.0001,5,5,1e-05,300,15,4,0.736056


In [None]:
# 0.672 with 100-length vectors and 20 epochs
# 0.671 with 300-length vectors and 20 epochs
# 0.74 with 200-length vectors and 200 epochs
# 0.74 with 200-length vectors and 100 epochs

# model seems to tail off in accuracy around 100 epochs

# Manual runs with accuracy check at each epoch

In [None]:
# try a run with manual epochs and recording accuracy after each epoch
# The settings come from parameters_dict: the bare names dm, size, window, ...
# are never assigned anywhere in this notebook, so referring to them directly
# raises NameError on a fresh kernel.
manual_params = dict(parameters_dict)
# no corpus at construction time (the vocabulary is built explicitly in the
# next cell) and a single pass per train() call
manual_params.update({'documents': None, 'iter': 1})
model = gensim.models.doc2vec.Doc2Vec(**manual_params)

In [None]:
from random import shuffle

# number of epochs to train
# need to manually decrease alpha in this case
epochs = 100
alpha = 0.025
min_alpha = 0.0001
# equal step subtracted after every epoch
alpha_delta = (alpha - min_alpha) / epochs

model.build_vocab(all_docs)
# shuffle a private copy so the order of all_docs seen by other cells is left untouched
epoch_docs = list(all_docs)
with elapsed_timer() as elapsed:
    for epoch in range(epochs):
        # pin the learning rate for this pass (start and end rate are the same)
        model.alpha, model.min_alpha = alpha, alpha
        # shuffle documents
        shuffle(epoch_docs)
        # train model
        model.train(epoch_docs)
        # evaluate model (the value is a ROC AUC, higher is better)
        error = calculate_AUC(model, doc_names_and_duplicate_class)
        alpha -= alpha_delta
        print("AUC for epoch %s : %s" % (epoch, error))
    # keep the elapsed seconds numeric: formatting them to a string first and
    # then dividing by 60. raises TypeError
    duration = elapsed()
    print("completed training in %.1f minutes" % (duration / 60.))