## Getting Started

In [1]:
import gensim
import os
import collections
import random

In [2]:
# Absolute locations of the corpora read by this notebook:
# the first path is the training set, the second the held-out test set.
train_file, test_file = (
    "/home/ubuntu/summarization_query_oriented/data/train_20k.txt",
    "/home/ubuntu/summarization_query_oriented/data/test_20k_500.txt",
)

## Preprocess Text for Doc2Vec

In [3]:
def read_corpus(fname, tokens_only=False):
    """Lazily yield one preprocessed document per line of ``fname``.

    Every line is tokenised with ``gensim.utils.simple_preprocess`` using
    ``deacc=True``.

    Args:
        fname: Path of the text file to read; each line is one document.
        tokens_only: When true, yield the bare token list for each line.
            Otherwise yield a ``TaggedDocument`` whose single tag is the
            zero-based line index.

    Yields:
        A list of tokens, or a ``TaggedDocument`` wrapping those tokens.
    """
    # NOTE(review): the file is opened with the default encoding; an explicit
    # iso-8859-1 encoding was tried here previously -- confirm which one the
    # corpus actually needs.
    with open(fname) as corpus_file:
        for line_index, raw_line in enumerate(corpus_file):
            tokens = gensim.utils.simple_preprocess(raw_line, deacc=True)
            if not tokens_only:
                # Training documents carry their line index as their only tag.
                yield gensim.models.doc2vec.TaggedDocument(tokens, [line_index])
            else:
                yield tokens

In [4]:
# Materialise both corpora in memory: the training set as tagged documents,
# the test set as plain token lists (tokens_only=True).
train_corpus = list(read_corpus(train_file))
test_corpus = list(read_corpus(test_file, tokens_only=True))

In [None]:
# Spot-check the preprocessing: show the token at index 40 of the second test
# document. Written as a call so the cell runs on both Python 2 and Python 3,
# consistent with the print(...) calls in the training loop.
print(test_corpus[1][40])

## Training the Model

In [5]:
# Doc2Vec hyperparameters; they are passed to the model constructor and also
# embedded in the checkpoint file names written during training.
min_count = 5    # ignore words whose total frequency is below this
window = 10      # context window size
size = 200       # dimensionality of the learned vectors
sample = 1e-4    # down-sampling threshold for high-frequency words
negative = 5     # number of noise words drawn for negative sampling
workers = 4      # number of worker threads used for training

In [6]:
# Create the (still untrained) Doc2Vec model from the module-level
# hyperparameters; one keyword per line keeps the mapping easy to scan.
model = gensim.models.doc2vec.Doc2Vec(
    size=size,
    window=window,
    min_count=min_count,
    sample=sample,
    negative=negative,
    workers=workers,
)

In [7]:
# Build the model's vocabulary from the tagged training documents; this must
# happen before any call to model.train().
model.build_vocab(train_corpus)

In [None]:
# Train for 25 manually driven passes, writing a checkpoint after passes
# 0, 5, 10, 15 and 20 (every pass whose index is a multiple of five).
# NOTE(review): recent gensim releases expect total_examples/epochs to be
# passed to train() and discourage calling it repeatedly in a loop -- confirm
# against the installed gensim version before changing this.
models_dir = "/home/ubuntu/summarization_query_oriented/models/"
name_template = "mc_{}_w_{}_size_{}_spl_{}_neg_{}_ep_{}"
for epoch in range(25):
    print(epoch)
    model.train(train_corpus)
    if epoch % 5 != 0:
        continue
    # The checkpoint name records the hyperparameters and the pass index.
    model_name = name_template.format(
        min_count, window, size, sample, negative, epoch
    )
    model.save(models_dir + model_name + ".model")
    print("model saved")

In [None]:
# Smoke test: infer a vector for a short, hand-written list of tokens.
# The tokens are passed to the model exactly as spelled here.
model.infer_vector(['only', 'you', 'can', 'prevent', 'forrest', 'fires'])

In [None]:
# Self-similarity check: re-infer a vector for every training document and
# record at which position the document's own tag appears when all trained
# document vectors are ordered by similarity to that inferred vector.
ranks = []
second_ranks = []
docvec_count = len(model.docvecs)  # same for every iteration
for doc_id, document in enumerate(train_corpus):
    inferred_vector = model.infer_vector(document.words)
    sims = model.docvecs.most_similar([inferred_vector], topn=docvec_count)
    ordered_ids = [docid for docid, sim in sims]
    rank = ordered_ids.index(doc_id)
    ranks.append(rank)
    # Keep the runner-up (docid, similarity) pair as well.
    second_ranks.append(sims[1])

In [None]:
# Tally how many documents landed at each rank; rank 0 means the document's
# own tag was the most similar one to its re-inferred vector.
collections.Counter(ranks)  # author's recorded result: 96% accuracy (presumably the share at rank 0)

In [None]:
# Export the model in the word2vec binary format (binary=True).
# NOTE(review): this export presumably holds only the word vectors, not the
# per-document vectors -- confirm before relying on it for document lookup.
model.save_word2vec_format("/home/ubuntu/summarization_query_oriented/models/100_bin.model",binary=True)

In [None]:
# Persist the model with gensim's own save() alongside the checkpoints.
model.save("/home/ubuntu/summarization_query_oriented/models/100.model")