## Getting Started

In [7]:
import gensim
import os
import collections
import random

In [8]:
# Directory holding the corpus files. It defaults to the original location;
# set the SUMMARIZATION_DATA_DIR environment variable to run the notebook on
# another machine without editing this cell.
DATA_DIR = os.environ.get("SUMMARIZATION_DATA_DIR",
                          "/home/ubuntu/summarization_query_oriented/data")

# Corpus files consumed line by line by read_corpus().
train_file = os.path.join(DATA_DIR, "train_20k.txt")
test_file = os.path.join(DATA_DIR, "test_20k_500.txt")

## Preprocess Text for Doc2Vec

In [9]:
def read_corpus(fname, tokens_only=False):
    """Lazily yield one preprocessed document per line of ``fname``.

    Every line is tokenised with ``gensim.utils.simple_preprocess`` (called
    with ``deacc=True``).

    Parameters
    ----------
    fname : str
        Path of the text file to read; each line is treated as one document.
    tokens_only : bool, optional
        If True, yield the bare token list of each line. If False (default),
        yield a ``TaggedDocument`` whose single tag is the zero-based index
        of the line in the file.
    """
    with open(fname) as handle:
        for line_no, raw_line in enumerate(handle):
            tokens = gensim.utils.simple_preprocess(raw_line, deacc=True)
            if not tokens_only:
                # Training data: attach the line index as the document tag.
                yield gensim.models.doc2vec.TaggedDocument(tokens, [line_no])
            else:
                yield tokens

In [10]:
# Materialise both generators as lists so the corpora can be indexed and
# iterated more than once by the cells below.
# train_corpus holds TaggedDocument objects (tag = line index in the file);
# test_corpus holds plain token lists.
train_corpus = list(read_corpus(train_file))
test_corpus = list(read_corpus(test_file, tokens_only=True))

In [21]:
# Show the tokens of the first test document. The function-call form of print
# matches the other print(...) calls in this notebook and is valid on both
# Python 2 and Python 3.
print(test_corpus[0])

[u'bishop', u'of', u'ross', u'ireland', u'pre', u'and', u'post', u'reformation', u'bishops']


In [12]:
# Number of training documents (one per line of the training file). The
# function-call form of print is valid on both Python 2 and Python 3.
print(len(train_corpus))

322329


## Training the model

In [13]:
# Doc2Vec hyperparameters. Each one is forwarded to the Doc2Vec constructor
# under the same keyword, except `epoch`, which is passed as `iter`.
# dm, min_count, window, size, negative and epoch are also embedded in the
# file name of the saved model.
dm = 0            # the model repr reports this configuration as "dbow"
size = 400        # shown as "d400" in the model repr
window = 10
min_count = 5
negative = 5
sample = 0.0001
workers = 4
epoch = 20

In [14]:
# Instantiate the Doc2Vec model from the hyperparameters defined above.
# No corpus is passed here: the vocabulary is built and the model is trained
# in the following cells.
model = gensim.models.doc2vec.Doc2Vec(
    dm=dm,
    min_count=min_count,
    window=window,
    size=size,
    sample=sample,
    negative=negative,
    workers=workers,
    iter=epoch,
)

In [15]:
# Build the model's vocabulary from the tagged training corpus before training.
model.build_vocab(train_corpus)

### Training the model (this may take a long time!)

In [10]:
# Train the model on the tagged training corpus; the %time magic reports the
# CPU and wall-clock time of this single call.
# NOTE(review): this call passes only the corpus; newer gensim releases may
# require explicit example/epoch counts — confirm before upgrading gensim.
%time model.train(train_corpus)

CPU times: user 36min 34s, sys: 30.2 s, total: 37min 4s
Wall time: 11min 14s


179178523

In [11]:
# Encode the hyperparameters in the file name so that models trained with
# different settings end up in different files.
model_name = ("dm_" + str(dm) + "_mc_" + str(min_count) + "_w_" + str(window)
              + "_size_" + str(size) + "_neg_" + str(negative) + "_ep_" + str(epoch))

# Create the output directory if needed so the save does not fail on a fresh
# checkout where ./models is absent.
models_dir = "./models"
if not os.path.isdir(models_dir):
    os.makedirs(models_dir)

model.save("./models/" + model_name + ".d2v")
print("model saved")

model saved


### Loading the saved model

In [16]:
# Rebuild the file name from the hyperparameters (same scheme as the save cell).
model_name = ("dm_" + str(dm) + "_mc_" + str(min_count) + "_w_" + str(window)
              + "_size_" + str(size) + "_neg_" + str(negative) + "_ep_" + str(epoch))

# load() is called on the Doc2Vec class rather than on the in-memory `model`
# instance, so this cell only needs the hyperparameters and the file on disk.
model_d2v = gensim.models.doc2vec.Doc2Vec.load("./models/" + model_name + ".d2v")
print("model loaded")

model loaded


## Testing the model

In [17]:
# Infer a document vector for a hand-written token list with the loaded model.
# NOTE(review): 'forrest' looks like a misspelling of 'forest' — confirm it is
# intentional before changing it, since the tokens determine the inferred vector.
model_d2v.infer_vector(['only', 'you', 'can', 'prevent', 'forrest', 'fires'])


array([  1.94291547e-02,  -6.70507848e-01,  -2.26440653e-01,
        -1.37340426e-01,  -8.35170820e-02,   1.42552927e-01,
        -1.29473403e-01,   1.92456543e-01,  -4.53324825e-01,
        -1.17267422e-01,   4.75911461e-02,   9.05732960e-02,
        -4.17214222e-02,  -3.94426733e-01,   2.02206358e-01,
         1.09554358e-01,   1.40604600e-01,   5.75654134e-02,
         3.40830684e-01,  -4.91876334e-01,  -3.38415116e-01,
         9.42548364e-02,   1.15097314e-02,  -3.56676608e-01,
         6.34246230e-01,   1.39211323e-02,  -1.67301565e-01,
         2.17303842e-01,   1.49504036e-01,   1.06433123e-01,
         1.23297833e-01,   7.93358833e-02,   7.62027428e-02,
         3.01561892e-01,  -5.39335430e-01,  -1.75229162e-02,
        -5.59485657e-03,   1.30450845e-01,   5.04945330e-02,
         3.42386849e-02,  -9.35370177e-02,  -1.54827684e-01,
        -1.75499935e-02,   1.13537386e-01,  -2.83598959e-01,
        -5.21475673e-02,   2.95050889e-01,  -6.58204556e-01,
         2.75894493e-01,

In [19]:
# Sanity check: re-infer a vector for each of the first 100 training documents
# and record where the document itself lands when all training documents are
# ranked by similarity to that vector (rank 0 = the document is its own
# nearest neighbour).
#
# The loaded model `model_d2v` is used throughout, consistent with the
# inference cell above. The in-memory `model` is only trained if the training
# cell was executed in the current kernel session.
ranks = []
second_ranks = []
for doc_id in range(100):
    inferred_vector = model_d2v.infer_vector(train_corpus[doc_id].words)
    sims = model_d2v.docvecs.most_similar([inferred_vector], topn=len(model_d2v.docvecs))
    rank = [docid for docid, sim in sims].index(doc_id)
    ranks.append(rank)

    # Keep the second entry of the ranking as a (tag, similarity) pair.
    second_ranks.append(sims[1])

In [20]:
# Pick a random document from the test corpus and infer a vector from the model.
# randint() includes both bounds, so the upper bound is the last valid index;
# this covers the whole corpus and cannot index past its end.
doc_id = random.randint(0, len(test_corpus) - 1)

# Use the loaded model `model_d2v`: the in-memory `model` is only trained if
# the training cell was executed in the current kernel session.
inferred_vector = model_d2v.infer_vector(test_corpus[doc_id])
sims = model_d2v.docvecs.most_similar([inferred_vector], topn=len(model_d2v.docvecs))

# Compare and print the most/median/least similar documents from the train corpus
print(u'Test Document ({}): «{}»\n'.format(doc_id, ' '.join(test_corpus[doc_id])))
print(u'SIMILAR/DISSIMILAR DOCS PER MODEL %s:\n' % model_d2v)
for label, index in [('MOST', 0), ('MEDIAN', len(sims)//2), ('LEAST', len(sims) - 1)]:
    print(u'%s %s: «%s»\n' % (label, sims[index], ' '.join(train_corpus[sims[index][0]].words)))

Test Document (55): «talks about people stomachs»

SIMILAR/DISSIMILAR DOCS PER MODEL Doc2Vec(dbow,d400,n5,hs,mc5,s0.0001,t4):

MOST (55564, 0.22469821572303772): «gratian is assassinated and constantine iii general magister militum declares himself roman emperor to extend his dominion over gaul and spain he takes practically all the roman garrisons from britain and crosses the english channel constantine occupies arles and established tenuous authority over gaul sharing control with marauding barbarians this is generally seen as the beginning of rome withdrawal from britain»

MEDIAN (166894, -5.2908435463905334e-05): «beechcraft queen air»

LEAST (210436, -0.21172496676445007): «with no constitutional amendment done annadurai declared january the th republic day of india and also the day the constitution which in essence enshrined hindi as the official language of india came into practice as day of mourning this move was opposed by the then chief minister of madras state as blasphemous