In [1]:
import gensim
import pickle
from nltk.tokenize import casual_tokenize, TreebankWordTokenizer
import re

In [4]:
def is_english(word):
    #not quite, it accepts latin characters
    try:
        word.encode('ISO-8859-1')
    except UnicodeEncodeError:
        return False
    return True

def is_number(word):
    #plus hexadecimal numbers and 1 letter + numbers
    return re.search(r'(^0x)|(^[0-9\-\.:xv]+$)|(^\w\d+$)', word)

def text_from_number(word):
    return re.search(r'^\d+([a-z]+)$', word)

def remove_punctuation(word):
    return re.sub(r'[\./]$', '', word)

def tokenize(doc):
    prog = re.compile(r'^[!#\$%`~<>\'\(\)*+\^,\./\+_\-\?]+$')
    words = [remove_punctuation(word) for word in TreebankWordTokenizer().tokenize(doc) 
             if not prog.search(word) and is_english(word)]
    words = [word for word in words if not is_number(word)]
    return words

## Generate pickle file

Execute if files need to be generated, otherwise, just load the pickles 🥒

In [None]:
question_pairs = pickle.load(open('data/question_pairs.list.pkl', 'rb'))

In [54]:
questions = []
qids = set()
for pair in question_pairs:
    qid1, qid2, question1, question2 = pair[1:5]

    if qid1 not in qids:
        qids.add(qid1)
        questions.append((question1, qid1))

    if qid2 not in qids:
        qids.add(qid2)
        questions.append((question2, qid2))

#should this be saved?
outfile = open('data/question_ids.list.pkl','wb')
pickle.dump(questions, outfile)
outfile.close()

You can skip this if you generated the pickle file in the cell before

In [3]:
questions = pickle.load(open('data/question_ids.list.pkl', 'rb'))

### Creating tagged documents for the Doc2Vec model

In [None]:
documents = [gensim.models.doc2vec.TaggedDocument(tokenize(question), [qid]) for question, qid in questions]
picfile = open('data/d2v_tagged_documents.list.pkl', 'wb')
pickle.dump(documents, picfile)
picfile.close()

## Training model

In [8]:
import multiprocessing
print(gensim.models.doc2vec.FAST_VERSION)
assert gensim.models.doc2vec.FAST_VERSION > -1, "This will be painfully slow otherwise"

In [11]:
#example
#no hierarchical softmax
#no negative sampling
#cores = multiprocessing.cpu_count() #for some reason 1 worker is faster
model = gensim.models.Doc2Vec(vector_size=100, window=2, hs=0, min_count=2, workers=1, epochs=50)
model.build_vocab(documents[:1000])

In [12]:
print(len(model.wv.vocab))
model.wv.vocab

44410


{'effort': <gensim.models.keyedvectors.Vocab at 0x7f9cb694e160>,
 'karela': <gensim.models.keyedvectors.Vocab at 0x7f9c890a8198>,
 'espncricinfo': <gensim.models.keyedvectors.Vocab at 0x7f9cb656dd30>,
 'insult': <gensim.models.keyedvectors.Vocab at 0x7f9c890a82b0>,
 'idea': <gensim.models.keyedvectors.Vocab at 0x7f9c890a8ba8>,
 'juno': <gensim.models.keyedvectors.Vocab at 0x7f9c890a8fd0>,
 'categories': <gensim.models.keyedvectors.Vocab at 0x7f9c890a8048>,
 'ghats': <gensim.models.keyedvectors.Vocab at 0x7f9c890a8128>,
 'gown': <gensim.models.keyedvectors.Vocab at 0x7f9cb694e208>,
 'reversing': <gensim.models.keyedvectors.Vocab at 0x7f9cb6c299b0>,
 'intrested': <gensim.models.keyedvectors.Vocab at 0x7f9c890a8a58>,
 '5+': <gensim.models.keyedvectors.Vocab at 0x7f9c890a8470>,
 'manually': <gensim.models.keyedvectors.Vocab at 0x7f9cb694e240>,
 'cry': <gensim.models.keyedvectors.Vocab at 0x7f9cb6781dd8>,
 'attaining': <gensim.models.keyedvectors.Vocab at 0x7f9c86a9a710>,
 'reis': <gensim.m

### Examples

In [13]:
%time model.train(train_data, total_examples=model.corpus_count, epochs=model.epochs)

CPU times: user 37min 53s, sys: 10min 20s, total: 48min 13s
Wall time: 30min 49s


In [15]:
#no hierarchical softmax
#no negative sampling
model = gensim.models.Doc2Vec(vector_size=100, window=2, hs=0, min_count=2, workers=1, epochs=50)
model.build_vocab(documents)

1


In [16]:
%time model.train(documents, total_examples=model.corpus_count, epochs=model.epochs)

CPU times: user 11min 59s, sys: 0 ns, total: 11min 59s
Wall time: 11min 57s


In [18]:
%time model2.train(documents, total_examples=model.corpus_count, epochs=model.epochs)

CPU times: user 17min 34s, sys: 1min 18s, total: 18min 52s
Wall time: 12min 2s


### Actual training

In [7]:
from time import gmtime, strftime
#just to log time

In [15]:
#train doc2vec with all the corpus
#cores = multiprocessing.cpu_count()
cores = 1
word_window = 2
min_count = 2 #ignore words with less than this frequency
epochs = 100 #training cycles
hierarchical_softmax = 0
sampling = 0.00001 #1e-5
ns = 5 #number of negative sample (5-20), 0: not used

for ndim in [10, 100, 200, 300]:
    print('{} - model with {} dimensions'.format(strftime('%H:%M:%S'), ndim))
    model = gensim.models.Doc2Vec(vector_size=ndim, window=word_window, hs=0, sample=sampling, negative=ns
                                  min_count=min_count, workers=cores)
    print('{} - Building vocabulary...'.format(strftime('%H:%M:%S')))
    model.build_vocab(documents)
    print('{} - Training model...'.format(strftime('%H:%M:%S')))
    %time model.train(documents, total_examples=model.corpus_count, epochs=epochs)
    print('{} - Done.\n'.format(strftime('%H:%M:%S')))
    model.save('data/questions.{}d.d2v'.format(ndim))

17:45:26 - model with 200 dimensions
17:45:26 - Building vocabulary...
17:45:44 - Training model...
CPU times: user 24min 28s, sys: 13.2 s, total: 24min 42s
Wall time: 25min 16s
18:11:01 - Done.
18:11:11 - model with 300 dimensions
18:11:11 - Building vocabulary...
18:11:32 - Training model...
CPU times: user 26min 39s, sys: 12.7 s, total: 26min 52s
Wall time: 27min 25s
18:38:58 - Done.


### Evaluation

Remember to initialize tokenizer at the beginning

In [5]:
#this comes for the preprocessing ipython notebook
question_pairs = pickle.load(open('data/question_pairs.pkl', 'rb'))

In [1]:
from math import sqrt, acos, pi
def cosine_similarity(v1, v2):
    #numpy arrays
    vec_sum = (v1*v2).sum()
    v1_sum = (v1**2).sum()
    v2_sum = (v2**2).sum()
    
    similarity = vec_sum/(sqrt(v1_sum)*sqrt(v2_sum))
    
    return 1 - (acos(similarity)/pi)

In [9]:
for ndim in [10, 100, 200, 300]:
    print('Evaluating {} dimensions'.format(ndim))
    outfile = open('data/d2v_similarities_{}d.txt'.format(ndim), 'w')
    print('{} - Loading model...'.format(strftime('%H:%M:%S')))
    model = gensim.models.Doc2Vec.load('data/questions.{}d.d2v'.format(ndim))
    print('{} - Starting evaluation...'.format(strftime('%H:%M:%S')))
    for line in question_pairs:
        pair_id = line[0]
        question1, question2, label = line[3:6]

        tokenized_question1 = tokenize(question1)
        tokenized_question2 = tokenize(question2)

        vec_q1 = model.infer_vector(tokenized_question1)
        vec_q2 = model.infer_vector(tokenized_question2)

        similarity = cosine_similarity(vec_q1, vec_q2)

        outfile.write("{}|{}|{}\n".format(pair_id, label, similarity))
        
    print('{} - Evaluation finished.'.format(strftime('%H:%M:%S')))
    outfile.close()



Evaluating 10 dimensions
12:41:43 - Loading model...
12:41:44 - Starting evaluation...
12:46:00 - Evaluation finished.


In [None]:
#write 10d, 200d and 300d