## Getting Started

In [1]:
import gensim
import os
import collections
import random

In [2]:
train_file = "/home/ubuntu/summarization_query_oriented/data/train_20k.txt"
test_file = "/home/ubuntu/summarization_query_oriented/data/test_20k_500.txt"

## Preprocess Text for Doc2Vec

In [3]:
def read_corpus(fname, tokens_only=False):
#    with open(fname, encoding="iso-8859-1") as f:
    with open(fname) as f:        
        for i, line in enumerate(f):
            if tokens_only:
                yield gensim.utils.simple_preprocess(line, deacc=True)
            else:
                # For training data, add tags
                yield gensim.models.doc2vec.TaggedDocument(gensim.utils.simple_preprocess(line, deacc=True), [i])

In [4]:
train_corpus = list(read_corpus(train_file))
test_corpus = list(read_corpus(test_file, tokens_only=True))

In [5]:
print test_corpus[:5]

[[u'bishop', u'of', u'ross', u'ireland', u'pre', u'and', u'post', u'reformation', u'bishops'], [u'cathedral', u'church', u'of', u'st', u'fachtna', u'ross', u'cathedral'], [u'dean', u'of', u'ross', u'ireland', u'list', u'of', u'church', u'of', u'ireland', u'deans'], [u'diocese', u'of', u'cork', u'cloyne', u'and', u'ross', u'current', u'church', u'of', u'ireland', u'diocese'], [u'diocese', u'of', u'cork', u'and', u'ross', u'current', u'roman', u'catholic', u'diocese', u'catholic', u'encyclopedia', u'article']]


In [7]:
print len(train_corpus)

322329


## Training the model

In [8]:
dm = 0
min_count = 5
window = 10
size = 200
sample = 1e-4
negative = 5
workers = 4


In [9]:
model = gensim.models.doc2vec.Doc2Vec(dm=dm,min_count=min_count, window=window, size=size, sample=sample, negative=negative, workers=workers,iter = 10)

In [10]:
model.build_vocab(train_corpus)

In [11]:
%time model.train(train_corpus)

CPU times: user 13min 51s, sys: 12.2 s, total: 14min 3s
Wall time: 4min 20s


89592315

In [13]:
epoch = 10
model_name ="dm_"+str(dm)+"_mc_"+str(min_count)+"_w_"+str(window)+"_size_"+str(size)+"_neg_"+str(negative)+"_ep_"+str(epoch)
model.save_word2vec_format("./models/"+model_name+".txt",binary=False)
model.save_word2vec_format("./models/"+model_name+".bin",binary=True)
model.save("./models/"+model_name+".d2v")
print("model saved")

model saved


In [48]:
model = gensim.models.doc2vec.Doc2Vec.load("./models/dm_0_mc_5_w_10_size_200_neg_5_ep_10.d2v")

In [49]:
model.infer_vector(['only', 'you', 'can', 'prevent', 'forrest', 'fires'])

array([-0.5137133 , -0.51006556, -0.11830132, -0.30620816, -0.07370269,
       -0.32979652,  0.10358199,  0.14556497, -0.20586514,  0.0707668 ,
        0.06466345, -0.0965753 , -0.15249442,  0.34066898,  0.28361437,
        0.21483296, -0.32304013,  0.27307925, -0.13661683, -0.25461715,
       -0.04815507,  0.17275754,  0.20011021, -0.18823542,  0.17159139,
        0.31891128, -0.02880051,  0.20645294, -0.52760053, -0.23112002,
       -0.00798294,  0.2309345 ,  0.03766512, -0.08731538, -0.26094475,
       -0.12472125, -0.28028807,  0.00601941,  0.06894776, -0.07222554,
        0.06686044,  0.14920135, -0.22315767,  0.47051835, -0.11467268,
       -0.0226957 , -0.33209822, -0.59473658, -0.05181553,  0.19567379,
       -0.21873623,  0.0087072 , -0.00172378,  0.00622102,  0.09201023,
       -0.09075417, -0.06632642,  0.26457009, -0.07466321, -0.42977872,
       -0.029328  ,  0.02617765,  0.05255985, -0.2704019 , -0.2180271 ,
       -0.19279453, -0.09608238,  0.15916196,  0.36908406,  0.07

In [21]:
ranks = []
second_ranks = []
for doc_id in range(len(train_corpus)):
    inferred_vector = model.infer_vector(train_corpus[doc_id].words)
    sims = model.docvecs.most_similar([inferred_vector], topn=len(model.docvecs))
    rank = [docid for docid, sim in sims].index(doc_id)
    ranks.append(rank)
    
    second_ranks.append(sims[1])

KeyboardInterrupt: 

In [None]:
model.save("/home/ubuntu/summarization_query_oriented/models/100.model")

In [39]:
# Pick a random document from the test corpus and infer a vector from the model
doc_id = random.randint(0, len(test_corpus))
inferred_vector = model.infer_vector(test_corpus[doc_id])
sims = model.docvecs.most_similar([inferred_vector], topn=len(model.docvecs))

# Compare and print the most/median/least similar documents from the train corpus
print(u'Test Document ({}): «{}»\n'.format(doc_id, ' '.join(test_corpus[doc_id])))
print(u'SIMILAR/DISSIMILAR DOCS PER MODEL %s:\n' % model)
for label, index in [('MOST', 0), ('MEDIAN', len(sims)//2), ('LEAST', len(sims) - 1)]:
    print(u'%s %s: «%s»\n' % (label, sims[index], ' '.join(train_corpus[sims[index][0]].words)))

Test Document (44): «charles james branch»

SIMILAR/DISSIMILAR DOCS PER MODEL Doc2Vec(dbow,d200,n5,hs,mc5,s0.0001,t4):

MOST (38480, 0.603034257888794): «multron charles»

MEDIAN (122111, 0.1656966358423233): «mitty offers the following honors ap and accelerated courses nils allesson nicolaus allonius»

LEAST (47316, -0.2878539562225342): «»

