## Getting Started

In [18]:
import gensim
import os
import collections
import random
from hard_coded import lang_model_dir 

In [1]:
# Training text, read line by line (each line becomes one document).
# The default is a machine-specific absolute path, so it can be overridden through
# the TRAIN_FILE environment variable without editing the notebook.
train_file = os.environ.get(
    "TRAIN_FILE",
    "/home/ubuntu/summarization_query_oriented/data/wikipedia/txt/td_qfs_rank_1_all.txt",
)


## Preprocess Text for Doc2Vec

In [12]:
def read_corpus(fname, tokens_only=False):
    """Lazily yield one preprocessed document per line of ``fname``.

    Each line is tokenized with ``gensim.utils.simple_preprocess`` (``deacc=True``)
    and the tokens are then passed through ``remove_stopwords`` together with the
    value returned by ``stop_words()``.

    Args:
        fname: path of the text file to read.
        tokens_only: when true, yield the filtered tokens as-is; otherwise wrap
            them in a ``TaggedDocument`` tagged with the line's zero-based index.

    Note:
        ``stop_words`` and ``remove_stopwords`` are not defined in this part of
        the notebook; they must already exist in the kernel (TODO: confirm where
        they come from and import them explicitly).
        The file is opened without an explicit encoding; a commented-out earlier
        variant passed ``encoding="iso-8859-1"``.
    """
    stopword_list = stop_words()

    with open(fname) as handle:
        for line_no, raw_line in enumerate(handle):
            tokens = gensim.utils.simple_preprocess(raw_line, deacc=True)
            kept = remove_stopwords(tokens, stopword_list)
            if tokens_only:
                yield kept
            else:
                # Training documents carry their line number as the only tag.
                yield gensim.models.doc2vec.TaggedDocument(kept, [line_no])

In [13]:
# Both corpora are built from the same file: tagged documents for training,
# bare token lists for the inference checks later in the notebook.
train_corpus = list(read_corpus(train_file, tokens_only=False))
test_corpus = list(read_corpus(train_file, tokens_only=True))

In [14]:
# Inspect the first tagged training document (words plus its tag).
train_corpus[0]

TaggedDocument(words=[u'bauxite', u'pneumoconiosis', u'shaver', u'disease', u'corundum', u'smelter', u'lung', u'bauxite', u'lung', u'bauxite', u'smelters', u'disease', u'progressive', u'form', u'pneumoconiosis', u'caused', u'exposure', u'bauxite', u'fumes', u'aluminium', u'silica', u'particulates'], tags=[0])

In [6]:
# Show the first tokens-only document. Written as a call so it matches the other
# print(...) cells in this notebook and also parses under Python 3.
print(test_corpus[0])

[u'bauxite', u'pneumoconiosis', u'also', u'known', u'as', u'shaver', u'disease', u'corundum', u'smelter', u'lung', u'bauxite', u'lung', u'or', u'bauxite', u'smelters', u'disease', u'is', u'progressive', u'form', u'of', u'pneumoconiosis', u'caused', u'by', u'exposure', u'to', u'bauxite', u'fumes', u'which', u'contain', u'aluminium', u'and', u'silica', u'particulates']


In [8]:
# Number of documents in the training corpus. Written as a call so it matches the
# other print(...) cells in this notebook and also parses under Python 3.
print(len(train_corpus))

62695


## Training the model

In [27]:
# Doc2Vec hyper-parameters. Most of them are also baked into the file name
# under which the trained model is saved and reloaded.
dm, min_count, window = 0, 5, 10       # dm=0 shows up as "dbow" in the model repr
size, sample, negative = 400, 1e-4, 5
workers, epoch = 4, 100                # epoch is handed to Doc2Vec as `iter`

In [28]:
# Instantiate Doc2Vec from the hyper-parameters defined in the previous cell;
# vocabulary building and training happen in the following cells.
model = gensim.models.doc2vec.Doc2Vec(
    dm=dm,
    min_count=min_count,
    window=window,
    size=size,
    sample=sample,
    negative=negative,
    workers=workers,
    iter=epoch,  # the epoch count goes in through the `iter` keyword
)

In [29]:
# Build the model's vocabulary from the tagged training corpus before training.
model.build_vocab(train_corpus)

### Running the training (this may take a long time!)

In [30]:
# Train on the tagged corpus; %time reports the cost
# (just under 12 minutes of wall time in the recorded run).
%time model.train(train_corpus)

CPU times: user 29min 45s, sys: 27.2 s, total: 30min 12s
Wall time: 11min 46s


164696722

In [31]:

# Encode the hyper-parameters in the file name so differently configured models
# do not overwrite each other.
model_name = (
    "dm_" + str(dm)
    + "_mc_" + str(min_count)
    + "_w_" + str(window)
    + "_size_" + str(size)
    + "_neg_" + str(negative)
    + "_ep_" + str(epoch)
    + "_wosw"
)
# lang_model_dir is concatenated as-is, so it has to end with a path separator.
model.save(lang_model_dir + model_name + ".d2v")
print("model saved")

model saved


### Loading it

In [32]:
# Rebuild the file name from the same hyper-parameters that were used when saving.
model_name ="dm_"+str(dm)+"_mc_"+str(min_count)+"_w_"+str(window)+"_size_"+str(size)+"_neg_"+str(negative)+"_ep_"+str(epoch)+"_wosw"
# load() is a classmethod: call it on the Doc2Vec class so this cell works without
# an already-trained `model` object in the kernel (e.g. after skipping training).
model_d2v = gensim.models.doc2vec.Doc2Vec.load(lang_model_dir+model_name+".d2v")
print("model loaded")

model loaded


## Testing it!

In [33]:
# Smoke test: infer a vector for a hand-written token list with the loaded model.
model_d2v.infer_vector(['only', 'you', 'can', 'prevent', 'forrest', 'fires'])

array([  1.47184506e-01,  -1.95598930e-01,  -3.19777653e-02,
        -1.03911087e-01,   2.32490450e-01,  -6.91882819e-02,
         6.19045757e-02,   1.15574608e-02,  -1.80576012e-01,
         1.41080201e-01,  -1.44291282e-01,  -5.73484264e-02,
        -5.07302806e-02,   7.89929181e-03,  -3.26590464e-02,
         2.20667765e-01,   2.65102815e-02,   9.24123004e-02,
         7.66479373e-02,  -3.17525044e-02,   2.60160148e-01,
         8.81015807e-02,  -1.36549443e-01,   2.89000627e-02,
         2.06925552e-02,  -1.86722070e-01,   1.09193765e-01,
        -3.67489108e-03,   1.06802985e-01,  -1.51116684e-01,
         8.86295512e-02,  -3.20413619e-01,   2.15845376e-01,
        -1.21278584e-01,   9.86410212e-03,   2.29954138e-01,
         2.41720065e-01,  -1.47043809e-01,  -3.24985199e-02,
         3.38195153e-02,  -2.95624256e-01,   2.67844826e-01,
         1.48904566e-02,  -9.95730702e-03,  -1.09819502e-01,
        -1.51106283e-01,  -1.35333404e-01,   4.37500514e-02,
         2.41985396e-01,

In [34]:
# Sanity check: re-infer a vector for each of the first (up to 100) training
# documents and record where the document itself lands among all stored doc vectors.
# The loaded `model_d2v` is used, like in the infer_vector smoke test, so this cell
# also works when training was skipped and the model was only loaded from disk.
ranks = []
second_ranks = []
# Bounded by the corpus size so a corpus shorter than 100 documents does not raise IndexError.
for doc_id in range(min(100, len(train_corpus))):
    inferred_vector = model_d2v.infer_vector(train_corpus[doc_id].words)
    sims = model_d2v.docvecs.most_similar([inferred_vector], topn=len(model_d2v.docvecs))
    # Position of the document's own tag in the similarity ranking.
    rank = [docid for docid, sim in sims].index(doc_id)
    ranks.append(rank)

    # Keep the second entry of the ranking as a (tag, similarity) pair.
    second_ranks.append(sims[1])

In [38]:
# Pick a random document among the first 101 of the test corpus (randint is
# inclusive on both ends), never past the end of a shorter corpus, and infer a
# vector for it. The loaded `model_d2v` is used, like in the infer_vector smoke
# test, so this cell also works when the model was only loaded from disk.
doc_id = random.randint(0, min(100, len(test_corpus) - 1))
inferred_vector = model_d2v.infer_vector(test_corpus[doc_id])
sims = model_d2v.docvecs.most_similar([inferred_vector], topn=len(model_d2v.docvecs))

# Compare and print the most/median/least similar documents from the train corpus
print(u'Test Document ({}): «{}»\n'.format(doc_id, ' '.join(test_corpus[doc_id])))
print(u'SIMILAR/DISSIMILAR DOCS PER MODEL %s:\n' % model_d2v)
# Index 0 is skipped on purpose, presumably because the test text is also a training
# document and would match itself first — TODO confirm.
for label, index in [('MOST', 1),('MOST', 2),('MOST', 3), ('MEDIAN', len(sims)//2), ('LEAST', len(sims) - 1)]:
    print(u'%s %s: «%s»\n' % (label, sims[index], ' '.join(train_corpus[sims[index][0]].words)))

Test Document (48): «great famine major food crisis strike europe century millions northern europe died extended number years marking clear earlier period growth prosperity centuries starting bad weather spring widespread crop failures lasted summer europe fully recover period marked extreme levels criminal activity disease mass death infanticide cannibalism consequences church european society future calamities follow century famines medieval britain medieval france england population people died famine»

SIMILAR/DISSIMILAR DOCS PER MODEL Doc2Vec(dbow,d400,n5,hs,mc5,s0.0001,t4):

MOST (6591, 0.3839375376701355): «great famine crisis strike europe late middle ages period witnessed heaviest loss population france reduced half medieval britain afflicted famines france suffered effects period europe devastated mid century black death deadly pandemics human history killed estimated people europe third european population time»

MOST (67, 0.34114229679107666): «great famine lasted killed te