In [None]:
import gensim
import pickle
from nltk.tokenize import casual_tokenize
from nltk.corpus import stopwords
import re

In [None]:
# English stop words from NLTK, kept in a set so membership checks are fast.
english_stop_words = stopwords.words('english')
stoplist = set(english_stop_words)
def is_english(word):
    """Return True when ``word`` can be encoded as ISO-8859-1 (Latin-1).

    This is only a rough proxy for "English": accented Latin characters
    (for example ``é``) also pass, while anything outside Latin-1 is rejected.
    """
    try:
        word.encode('ISO-8859-1')
    except UnicodeEncodeError:
        encodable = False
    else:
        encodable = True
    return encodable

def is_number(word):
    """Return a regex match if ``word`` looks numeric, otherwise ``None``.

    Three shapes are accepted: anything starting with ``0x`` (hexadecimal),
    a token made only of digits and the characters ``- . : x v``, and a single
    word character followed by digits (for example ``a12``).
    """
    numeric_pattern = r'(^0x)|(^[0-9\-\.:xv]+$)|(^\w\d+$)'
    return re.search(numeric_pattern, word)

def text_from_number(word):
    """Match tokens made of digits followed by lowercase letters (e.g. ``10kg``).

    Returns the match object, whose first group holds the trailing letters,
    or ``None`` when ``word`` does not have that shape.
    """
    digits_then_letters = r'^\d+([a-z]+)$'
    return re.search(digits_then_letters, word)

def remove_punctuation(word):
    """Clean punctuation from a single token and return the resulting pieces.

    Tokens containing a web-address suffix (``.co``/``.com``, ``.in``, ``.org``,
    ``.be``, ``.xyz``, ``.net``, ``.us``) are kept in one piece: only leading
    ``/``, ``'`` and ``+`` characters and a single trailing ``/`` are removed.

    Any other token is split on ``/``; in each fragment the characters
    ``!#$%&'()*+^,`` are deleted, leading/trailing runs of ``+ _ - .`` are
    stripped, and a leading ``¿`` directly before a letter or digit is dropped.
    Fragments that end up empty are discarded.

    Returns:
        list of str: the cleaned fragments (may be empty for non-URL tokens).
    """
    if re.search(r'\.com?\b|\.in\b|\.org\b|\.be\b|\.xyz\b|\.net\b|\.us\b', word):
        return [re.sub(r'^[/\'\+]+|/$', '', word)]

    # Loop invariants: build them once per call instead of once per fragment.
    drop_table = {ord(char): None for char in "!#$%&'()*+^,"}
    # Raw string: "\-" and "\." are invalid escape sequences in a plain literal
    # and trigger a warning on recent Python versions. The value is unchanged.
    delete_chars = r"+_\-\."
    edge_pattern = r'^[{}]+|[{}]+$'.format(delete_chars, delete_chars)

    new_words = []
    for term in word.split('/'):
        new_word = term.translate(drop_table)
        new_word = re.sub(edge_pattern, '', new_word)
        new_word = re.sub(r'^¿([^\W_])', r'\1', new_word) #deletes '¿' at the beginning of the word
        if new_word:
            new_words.append(new_word)
    return new_words

def tokenize(doc, remove_stopwords=True):
    """Turn a raw question string into a list of cleaned tokens.

    Steps, in order: tokenize with NLTK's ``casual_tokenize``; optionally drop
    stop words; clean/split each token with ``remove_punctuation``; discard
    tokens of length <= 1, number-like tokens and tokens that are not Latin-1
    encodable; finally reduce ``<digits><letters>`` tokens to their letters.

    Args:
        doc (str): text to tokenize.
        remove_stopwords (bool): when True, tokens found in ``stoplist`` are
            removed before punctuation cleaning.

    Returns:
        list of str: the cleaned tokens.
    """
    # Both modes share one tokenizer. The non-stop-word path previously called
    # ``word_tokenize``, which is not imported in this notebook (NameError).
    words = [word.strip() for word in casual_tokenize(doc)]
    if remove_stopwords:
        words = [word for word in words if word not in stoplist]

    words = [token for word in words for token in remove_punctuation(word)]
    words = [word for word in words if len(word) > 1 and not is_number(word) and is_english(word)]

    tokens = []
    for word in words:
        # Run the regex once per word instead of twice.
        match = text_from_number(word)
        tokens.append(match.groups()[0] if match else word)
    return tokens

In [None]:
# Load the pickled question pairs; the context manager closes the file handle.
# Only unpickle files you produced yourself: pickle can execute arbitrary code.
with open('data/question_pairs.list.pkl', 'rb') as infile:
    question_pairs = pickle.load(infile)

## Generate pickle file

Run the next cell only if the pickle file needs to be (re)generated; otherwise skip it and just load the existing pickle 🥒

In [None]:
# Collect every distinct question as a (text, id) tuple, keeping the first
# occurrence of each id in the order the pairs are visited.
questions = []
qids = set()
for pair in question_pairs:
    qid1, qid2, question1, question2 = pair[1:5]

    for qid, question in ((qid1, question1), (qid2, question2)):
        if qid not in qids:
            qids.add(qid)
            questions.append((question, qid))

#should this be saved?
# The context manager closes the file even if pickling fails.
with open('data/question_ids.list.pkl', 'wb') as outfile:
    pickle.dump(questions, outfile)

You can skip the next cell if you just generated the pickle file in the previous cell.

In [None]:
# Load the (question, qid) list; the context manager closes the file handle.
with open('data/question_ids.list.pkl', 'rb') as infile:
    questions = pickle.load(infile)

### Creating tagged documents for the Doc2Vec model

In [None]:
# One TaggedDocument per question: the tokenized text tagged with its question id.
documents = [
    gensim.models.doc2vec.TaggedDocument(tokenize(question), [qid])
    for question, qid in questions
]
# Persist the tagged documents; the context manager closes the file even on error.
with open('data/d2v_tagged_documents.list.pkl', 'wb') as picfile:
    pickle.dump(documents, picfile)

## Training model

In [None]:
import multiprocessing

# Show gensim's FAST_VERSION flag and stop early when it is not above -1.
fast_version = gensim.models.doc2vec.FAST_VERSION
print(fast_version)
assert fast_version > -1, "This will be painfully slow otherwise"

### Examples

In [None]:
# NOTE(review): in top-to-bottom order `model` is only created in a later cell and
# `train_data` is not defined anywhere in the visible notebook. This cell
# presumably relies on leftover kernel state; confirm, then fix or remove it.
%time model.train(train_data, total_examples=model.corpus_count, epochs=model.epochs)

In [None]:
# Small demo model with hierarchical softmax disabled (hs=0).
# NOTE(review): `negative` is not passed, so the library default applies. The
# earlier note "no negative sampling" only holds if that default is 0; verify
# against the gensim documentation.
#cores = multiprocessing.cpu_count() #for some reason 1 worker is faster
model = gensim.models.Doc2Vec(vector_size=100, window=2, hs=0, min_count=2, workers=1, epochs=50)
# The vocabulary is built from the first 10 documents only.
model.build_vocab(documents[:10])

In [None]:
# Print the vocabulary size, then show the whole vocabulary as the cell output.
# NOTE(review): `wv.vocab` is believed to be unavailable from gensim 4 onwards
# (replaced by `wv.key_to_index`); confirm the gensim version this notebook targets.
print(len(model.wv.vocab))
model.wv.vocab

In [None]:
# Time one training run of the demo model over all tagged documents.
# NOTE(review): the vocabulary was built from documents[:10], so `model.corpus_count`
# presumably reflects those 10 documents while the full `documents` list is
# passed here; confirm that `total_examples` is what is intended.
%time model.train(documents, total_examples=model.corpus_count, epochs=model.epochs)

In [None]:
# NOTE(review): `model2` is not defined in any visible cell, and the arguments are
# taken from `model` rather than `model2`. This is probably a leftover
# experiment; confirm, then define `model2` or remove the cell.
%time model2.train(documents, total_examples=model.corpus_count, epochs=model.epochs)

### Actual training

In [None]:
from time import gmtime, strftime
#just to log time

In [None]:
#train doc2vec with all the corpus
#cores = multiprocessing.cpu_count()
cores = 1
word_window = 2
min_count = 2 #ignore words with less than this frequency
epochs = 100 #training cycles
hierarchical_softmax = 0
sampling = 0.00001 #1e-5
ns = 5 #number of negative sample (5-20), 0: not used

for ndim in [10, 100, 200, 300]:
    print('{} - model with {} dimensions'.format(strftime('%H:%M:%S'), ndim))
    model = gensim.models.Doc2Vec(vector_size=ndim, window=word_window, hs=0, sample=sampling, negative=ns,
                                  min_count=min_count, workers=cores)
    print('{} - Building vocabulary...'.format(strftime('%H:%M:%S')))
    model.build_vocab(documents)
    print('{} - Training model...'.format(strftime('%H:%M:%S')))
    %time model.train(documents, total_examples=model.corpus_count, epochs=epochs)
    print('{} - Done.\n'.format(strftime('%H:%M:%S')))
    model.save('data/questions.{}d.d2v'.format(ndim))

### Evaluation

Remember to run the tokenizer cells at the beginning of the notebook first: the evaluation below calls `tokenize`.

In [None]:
#this comes from the preprocessing ipython notebook
# The context manager closes the file handle once the pickle is loaded.
with open('data/question_pairs.list.pkl', 'rb') as infile:
    question_pairs = pickle.load(infile)

In [None]:
from math import sqrt, acos, pi
def cosine_similarity(v1, v2):
    """Angular similarity of two vectors, derived from their cosine.

    The cosine of the angle between ``v1`` and ``v2`` is converted to
    ``1 - angle/pi``: 1.0 for the same direction, 0.5 for orthogonal vectors
    and 0.0 for opposite directions.

    Args:
        v1, v2: numpy arrays of equal length.

    Returns:
        float: the angular similarity.

    Zero-length vectors are not handled specially.
    """
    #numpy arrays
    vec_sum = (v1*v2).sum()
    v1_sum = (v1**2).sum()
    v2_sum = (v2**2).sum()

    similarity = vec_sum/(sqrt(v1_sum)*sqrt(v2_sum))

    # Rounding can push the cosine slightly outside [-1, 1], where acos raises
    # ValueError. Clamp both ends, not only the upper one.
    if similarity > 1:
        similarity = 1
    elif similarity < -1:
        similarity = -1

    return 1 - (acos(similarity)/pi)

In [None]:
# For each trained model, infer a vector for both questions of every pair and
# write one "pair_id|label|similarity" line per pair.
# NOTE(review): vectors are inferred afresh on each run, so the scores may vary
# between runs unless gensim is seeded; confirm if reproducibility matters.
for ndim in [10, 100, 200, 300]:
    print('Evaluating {} dimensions'.format(ndim))
    # The context manager closes the output file even if evaluation fails midway.
    with open('data/d2v_similarities_{}d.txt'.format(ndim), 'w') as outfile:
        print('{} - Loading model...'.format(strftime('%H:%M:%S')))
        model = gensim.models.Doc2Vec.load('data/questions.{}d.d2v'.format(ndim))
        print('{} - Starting evaluation...'.format(strftime('%H:%M:%S')))
        for line in question_pairs:
            pair_id = line[0]
            question1, question2, label = line[3:6]

            tokenized_question1 = tokenize(question1)
            tokenized_question2 = tokenize(question2)

            vec_q1 = model.infer_vector(tokenized_question1)
            vec_q2 = model.infer_vector(tokenized_question2)

            similarity = cosine_similarity(vec_q1, vec_q2)

            outfile.write("{}|{}|{}\n".format(pair_id, label, similarity))

        print('{} - Evaluation finished.'.format(strftime('%H:%M:%S')))

## Results

In [None]:
# Similarity at or above this value is predicted as label 1, below it as label 0.
SIMILARITY_THRESHOLD = 0.7

# For every model size: read its similarity file, write a per-pair dump, and
# append the confusion matrix plus accuracy/precision/recall to the results file.
with open('data/d2v_results.txt', 'w') as outfile:
    for ndim in [10, 100, 200, 300]:
        filename = 'data/d2v_similarities_{}d.txt'.format(ndim)
        values_filename = 'data/d2v_{}d_values.txt'.format(ndim)

        tp = fp = tn = fn = 0
        # Both files are closed when the block exits; the values file was
        # previously left open.
        with open(values_filename, 'w') as values_file, open(filename) as infile:
            for line in infile:
                pair_id, label, similarity = line.split('|')
                pair_id = int(pair_id)
                label = int(label)
                similarity = float(similarity)

                predicted_label = 0 if similarity < SIMILARITY_THRESHOLD else 1

                if label == 1: #positive
                    if label == predicted_label: #true positive
                        tp += 1
                    else: #false negative
                        fn += 1
                else: #negatives
                    if label == predicted_label: #true negative
                        tn += 1
                    else: #false positive
                        fp += 1
                # NOTE(review): assumes pair_id is also the position of the pair
                # inside question_pairs; verify against the preprocessing step.
                q1 = question_pairs[pair_id][3]
                q2 = question_pairs[pair_id][4]
                values_file.write('{} - {} - {} - {}\n{}\n{}\n==================\n'.format(
                    pair_id, label, predicted_label, similarity, q1, q2))

        wstr = '\nD2V - {} dimensions\n'.format(ndim) +\
               '\t\t\tpredicted_no\t\tpredicted_yes\n'+\
               'actual_no\t\t    {}\t\t    {}\n'.format(tn, fp)+\
               'actual_yes\t\t    {}\t\t    {}\n\n'.format(fn, tp)
        outfile.write(wstr)

        # Guard the denominators so an empty file, or a model that never
        # predicts/sees a positive, reports 0.0 instead of raising ZeroDivisionError.
        total = tp + tn + fp + fn
        accuracy = (tp + tn)/total if total else 0.0
        precision = tp/(tp + fp) if (tp + fp) else 0.0
        recall = tp/(tp + fn) if (tp + fn) else 0.0

        # Each metric needs its own positional index; `{0}` three times printed
        # the accuracy for all three lines.
        outfile.write('accuracy: {0:.3f}\nprecision: {1:.3f}\nrecall: {2:.3f}\n'.format(accuracy, precision, recall))