In [None]:
# SET UP
import pyndri
import pyndri.compat
import gensim
import copy
import logging
import time
import numpy as np
import logging
import sys
import collections
import io
from scipy import spatial
import pickle
tfidf_rankings = pickle.load(open("./tfidf_top.p", "rb")) # top 1000 rankings per query with tf-idf from Task 1
index = pyndri.Index('index/')

### Parsing the query file
You can parse the query file (`ap_88_89/topics_title`) using the following snippet:

In [10]:
def parse_topics(file_or_files,
                 max_topics=sys.maxsize, delimiter=';'):
    assert max_topics >= 0 or max_topics is None

    topics = collections.OrderedDict()

    if not isinstance(file_or_files, list) and \
            not isinstance(file_or_files, tuple):
        if hasattr(file_or_files, '__iter__'):
            file_or_files = list(file_or_files)
        else:
            file_or_files = [file_or_files]

    for f in file_or_files:
        assert isinstance(f, io.IOBase)

        for line in f:
            assert(isinstance(line, str))

            line = line.strip()

            if not line:
                continue

            topic_id, terms = line.split(delimiter, 1)

            if topic_id in topics and (topics[topic_id] != terms):
                    logging.error('Duplicate topic "%s" (%s vs. %s).',
                                  topic_id,
                                  topics[topic_id],
                                  terms)

            topics[topic_id] = terms

            if max_topics > 0 and len(topics) >= max_topics:
                break

    return topics

# with open('./ap_88_89/topics_title', 'r') as f_topics:
#     print(parse_topics([f_topics]))

# Information Retrieval I #
## Answers for Task 2: retrieval models [100 points + 10 bonus points] ##
**TA**: Christophe Van Gysel (cvangysel@uva.nl; C3.258B, Science Park 904)

**Students: Fije van Overeem (10373535) and Diede Rusticus (1090486)**

To (re-)train models, uncomment the line of code where model is called.

Table Of Contents 

1) Word2Vec 

a) connector class
b) representation of query and documents
c) rerank task 1's tf-idf ranking based on cosine similarity

2) LSI 

a) connector class
b) representation of query and documents
c) rerank task 1's tf-idf ranking based on cosine similarity

3) LDA 

a) connector class
b) representation of query and documents
c) rerank task 1's tf-idf ranking based on cosine similarity

4) Doc2Vec

a) connector class
b) representation of query and documents
c) rerank task 1's tf-idf ranking based on cosine similarity

NB: t-testing is done in significance testing iPython-file.

In [125]:
token2id, id2token, _ = index.get_dictionary()
#print(list(id2token.items())[:15])

In [126]:
query_tokens = index.tokenize("University of Massachusetts")
print("Query by tokens:", query_tokens)
query_id_tokens = [token2id.get(query_token,0) for query_token in query_tokens]
print("Query by ids with stopwords:", query_id_tokens)
query_id_tokens = [word_id for word_id in query_id_tokens if word_id > 0]
print("Query by ids without stopwords:", query_id_tokens)
print(token2id.get('university'))

Query by tokens: ['university', '', 'massachusetts']
Query by ids with stopwords: [200, 0, 894]
Query by ids without stopwords: [200, 894]
200


## Word2Vec ##

### Connector class for word2vec as provided by assignment.###

In [128]:
word2vec_init = gensim.models.Word2Vec(
    size=64,  # Embedding size - initially 300
    window=5,  # One-sided window size, init was 5
    sg=True,  # Skip-gram.
    min_count=1,  # Minimum word frequency.
    sample=1e-3,  # Sub-sample threshold.
    hs=False,  # Hierarchical softmax.
    negative=10,  # Number of negative examples.
    iter=1,  # Number of iterations.
    workers=8,  # Number of workers.
)

dictionary = pyndri.extract_dictionary(index)
start = time.time()
sentences = pyndri.compat.IndriSentences(index, dictionary, max_documents=50000) # max_documents due to time complexity

logging.info('Constructing word2vec vocabulary.')

# Build vocab.
word2vec_init.build_vocab(sentences, trim_rule=None)

models = [word2vec_init]
print('going in epoch')
for epoch in range(1, 5+1):#range(1, 5 + 1):
    start = time.time()
    logging.info('Epoch %d', epoch)
    model = copy.deepcopy(models[-1])
    model.train(sentences)

    models.append(model)
    print(time.time()-start)
logging.info('Trained models: %s', models)
#print(models[-1]) # last model is most advanced
models[-1].save('Small_Morning_Run')


INFO:root:Constructing word2vec vocabulary.
INFO:gensim.models.word2vec:collecting all words and their counts
INFO:gensim.models.word2vec:PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #10000, processed 2607270 words, keeping 73866 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #20000, processed 5208413 words, keeping 101162 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #30000, processed 7779447 words, keeping 121185 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #40000, processed 10402346 words, keeping 138774 word types
INFO:gensim.models.word2vec:collected 153153 word types from a corpus of 12981963 raw words and 50000 sentences
INFO:gensim.models.word2vec:Loading a fresh vocabulary
INFO:gensim.models.word2vec:min_count=1 retains 153153 unique words (100% of original 153153, drops 0)
INFO:gensim.models.word2vec:min_count=1 leaves 12981963 word corpus (100% of original 1298

going in epoch


INFO:gensim.models.word2vec:training model with 8 workers on 153153 vocabulary and 64 features, using sg=1 hs=False sample=0.001 negative=10 window=5
INFO:gensim.models.word2vec:expecting 50000 sentences, matching count from corpus used for vocabulary survey
INFO:gensim.models.word2vec:PROGRESS: at 0.07% examples, 7241 words/s, in_qsize 2, out_qsize 0
INFO:gensim.models.word2vec:PROGRESS: at 0.54% examples, 26169 words/s, in_qsize 0, out_qsize 0
INFO:gensim.models.word2vec:PROGRESS: at 0.92% examples, 29066 words/s, in_qsize 3, out_qsize 0
INFO:gensim.models.word2vec:PROGRESS: at 1.33% examples, 32711 words/s, in_qsize 4, out_qsize 0
INFO:gensim.models.word2vec:PROGRESS: at 1.64% examples, 30163 words/s, in_qsize 5, out_qsize 0
INFO:gensim.models.word2vec:PROGRESS: at 2.21% examples, 33532 words/s, in_qsize 8, out_qsize 0
INFO:gensim.models.word2vec:PROGRESS: at 2.84% examples, 35338 words/s, in_qsize 12, out_qsize 0
INFO:gensim.models.word2vec:PROGRESS: at 3.44% examples, 36384 words/

384.2206084728241


INFO:gensim.models.word2vec:training model with 8 workers on 153153 vocabulary and 64 features, using sg=1 hs=False sample=0.001 negative=10 window=5
INFO:gensim.models.word2vec:expecting 50000 sentences, matching count from corpus used for vocabulary survey
INFO:gensim.models.word2vec:PROGRESS: at 0.07% examples, 5117 words/s, in_qsize 2, out_qsize 0
INFO:gensim.models.word2vec:PROGRESS: at 0.47% examples, 19394 words/s, in_qsize 2, out_qsize 0
INFO:gensim.models.word2vec:PROGRESS: at 0.69% examples, 21101 words/s, in_qsize 5, out_qsize 0
INFO:gensim.models.word2vec:PROGRESS: at 1.18% examples, 27177 words/s, in_qsize 7, out_qsize 0
INFO:gensim.models.word2vec:PROGRESS: at 1.49% examples, 28906 words/s, in_qsize 9, out_qsize 0
INFO:gensim.models.word2vec:PROGRESS: at 1.82% examples, 31121 words/s, in_qsize 11, out_qsize 0
INFO:gensim.models.word2vec:PROGRESS: at 2.21% examples, 33197 words/s, in_qsize 14, out_qsize 0
INFO:gensim.models.word2vec:PROGRESS: at 2.50% examples, 32834 words

301.8993101119995


INFO:gensim.models.word2vec:training model with 8 workers on 153153 vocabulary and 64 features, using sg=1 hs=False sample=0.001 negative=10 window=5
INFO:gensim.models.word2vec:expecting 50000 sentences, matching count from corpus used for vocabulary survey
INFO:gensim.models.word2vec:PROGRESS: at 0.07% examples, 6864 words/s, in_qsize 5, out_qsize 0
INFO:gensim.models.word2vec:PROGRESS: at 0.69% examples, 27710 words/s, in_qsize 10, out_qsize 0
INFO:gensim.models.word2vec:PROGRESS: at 1.33% examples, 34145 words/s, in_qsize 15, out_qsize 0
INFO:gensim.models.word2vec:PROGRESS: at 1.90% examples, 37284 words/s, in_qsize 15, out_qsize 0
INFO:gensim.models.word2vec:PROGRESS: at 2.50% examples, 39456 words/s, in_qsize 16, out_qsize 0
INFO:gensim.models.word2vec:PROGRESS: at 3.14% examples, 41020 words/s, in_qsize 15, out_qsize 0
INFO:gensim.models.word2vec:PROGRESS: at 3.72% examples, 42121 words/s, in_qsize 15, out_qsize 0
INFO:gensim.models.word2vec:PROGRESS: at 4.32% examples, 42914 w

295.6640512943268


INFO:gensim.models.word2vec:training model with 8 workers on 153153 vocabulary and 64 features, using sg=1 hs=False sample=0.001 negative=10 window=5
INFO:gensim.models.word2vec:expecting 50000 sentences, matching count from corpus used for vocabulary survey
INFO:gensim.models.word2vec:PROGRESS: at 0.07% examples, 6992 words/s, in_qsize 6, out_qsize 0
INFO:gensim.models.word2vec:PROGRESS: at 0.68% examples, 27714 words/s, in_qsize 10, out_qsize 1
INFO:gensim.models.word2vec:PROGRESS: at 1.33% examples, 34386 words/s, in_qsize 15, out_qsize 0
INFO:gensim.models.word2vec:PROGRESS: at 1.90% examples, 37345 words/s, in_qsize 15, out_qsize 0
INFO:gensim.models.word2vec:PROGRESS: at 2.50% examples, 39561 words/s, in_qsize 15, out_qsize 0
INFO:gensim.models.word2vec:PROGRESS: at 3.14% examples, 40481 words/s, in_qsize 16, out_qsize 0
INFO:gensim.models.word2vec:PROGRESS: at 3.72% examples, 41807 words/s, in_qsize 15, out_qsize 0
INFO:gensim.models.word2vec:PROGRESS: at 4.32% examples, 42669 w

341.5130100250244


INFO:gensim.models.word2vec:training model with 8 workers on 153153 vocabulary and 64 features, using sg=1 hs=False sample=0.001 negative=10 window=5
INFO:gensim.models.word2vec:expecting 50000 sentences, matching count from corpus used for vocabulary survey
INFO:gensim.models.word2vec:PROGRESS: at 0.07% examples, 6748 words/s, in_qsize 6, out_qsize 0
INFO:gensim.models.word2vec:PROGRESS: at 0.68% examples, 26710 words/s, in_qsize 13, out_qsize 0
INFO:gensim.models.word2vec:PROGRESS: at 1.33% examples, 33138 words/s, in_qsize 15, out_qsize 0
INFO:gensim.models.word2vec:PROGRESS: at 1.90% examples, 36284 words/s, in_qsize 15, out_qsize 0
INFO:gensim.models.word2vec:PROGRESS: at 2.50% examples, 38147 words/s, in_qsize 15, out_qsize 1
INFO:gensim.models.word2vec:PROGRESS: at 3.14% examples, 39581 words/s, in_qsize 15, out_qsize 0
INFO:gensim.models.word2vec:PROGRESS: at 3.72% examples, 40045 words/s, in_qsize 16, out_qsize 0
INFO:gensim.models.word2vec:PROGRESS: at 4.32% examples, 42214 w

356.14865922927856


INFO:gensim.utils:saved Small_Morning_Run


In [120]:
# Load your Word2Vec model
savedW2Vmodel = gensim.models.Word2Vec.load('Small_Morning_Run')


INFO:gensim.utils:loading Word2Vec object from Small_Morning_Run
INFO:gensim.utils:loading wv recursively from Small_Morning_Run.wv.* with mmap=None
INFO:gensim.utils:loading docvecs recursively from Small_Morning_Run.docvecs.* with mmap=None
INFO:gensim.utils:setting ignored attribute cum_table to None
INFO:gensim.utils:setting ignored attribute syn0norm to None
INFO:gensim.utils:loaded Small_Morning_Run


Doc2Vec(dbow,d100,n10,s0.001,t8)


### Build representation for q and d ###

In [139]:
# Takes average of all query terms in 1 query, so the query is represented by 1 vector
def get_q_rep(index, query_str, w2vmodel):
    query_tokens = index.tokenize(query_str)
    vectors = []
    for token in query_tokens:
        vector = w2vmodel['token']
        vectors.append(vector)

    avg_query = np.mean(np.array(vectors), axis=0)
    return avg_query

# Takes average of all terms in 1 document, so the document is represented by 1 vector
def get_d_rep(document, w2vmodel):
    wordIDs = document[1][:]
    vectors = []
    for wordID in wordIDs:
        if wordID > 0:
            wordstring = id2token[wordID]
            if wordstring in w2vmodel:
                vector = w2vmodel[wordstring]
                vectors.append(vector)

    avg_doc = np.mean(np.array(vectors), axis=0)
    return avg_doc

#avg_d = get_d_rep(index.document(1),savedW2Vmodel)
#mutual_info_score(avg_q, avg_d)

0.348099537217


0.65190053

### Check cosine similarity between a word2vec-query representation and a (word2vec-representation) of set of documents.###

In [130]:
def check_cosine_given_query(avg_q, queryID, internal_docIDs):
    word2vec_dict = {}
    for internal_docID in internal_docIDs: # amount of documents
        avg_d = get_d_rep(index.document(internal_docID),savedW2Vmodel)
        cosine = cosine_similarity(numpy.reshape(avg_q, (1, -1)),  numpy.reshape(avg_d, (1, -1)))[0][0] 
        external_ID = index.document(internal_docID)[0]
        word2vec_dict[external_ID] = cosine
    return word2vec_dict

In [12]:
def make_dict_format(dic):
    results = []
    for doc, score in dic.items():
        results.append((score, doc))
    return tuple(results)

### Rerank documents for queries of TF-IDFs with word2vec model. ###

In [133]:
def check_cosine_all_queries(tfidf_rankings):
    dic = {}
    for query_id, ranked_docs in tfidf_rankings.items():
        query_string = query_dict[str(query_id)]
        avg_q = get_q_rep(index, query_string, savedW2Vmodel)
        cosines_per_q = check_cosine_given_query(avg_q, query_id, ranked_docs)
        dic[query_id] = make_dict_format(cosines_per_q)
    return dic

new_rankings = check_cosine_all_queries(tfidf_rankings)

KeyError: (3066, 1)

 ### Connector Class between Gensim and Pyndri for LSI and LDA model ###

In [34]:
# Connector class for LSI and LDA model
import logging
logging.basicConfig(level=logging.INFO)
class IndriBOW(gensim.interfaces.CorpusABC):

    def __init__(self, index, dictionary, max_documents=None):
        assert isinstance(index, pyndri.Index)

        self.index = index
        self.dictionary = dictionary

        self.max_documents = max_documents

    def _maximum_document(self):
        if self.max_documents is None:
            return self.index.maximum_document()
        else:
            return min(
                self.max_documents + self.index.document_base(),
                self.index.maximum_document())

    def __iter__(self): # both used for LSI and LDI
        for int_doc_id in range(self.index.document_base(),
                                self._maximum_document()):
            ext_doc_id, tokens = self.index.document(int_doc_id)

            yield self.dictionary.doc2bow(
                self.dictionary[token_id]
                for token_id in tokens
                if token_id > 0 and token_id in self.dictionary)

    def __len__(self):
        return self._maximum_document() - self.index.document_base()

### Train LSI Model ###

In [134]:
## Build LSI Model ##

'''LSI compares how often words appear together in the same document
and compares how often those occurences happen in all of the documents 
that the dictionary has in its index.'''

logging.getLogger().setLevel(logging.INFO)
def train_lsi():
    sentences = IndriBOW(index, dictionary)
    lsi = gensim.models.lsimodel.LsiModel(
        corpus = sentences,
        id2word = dictionary.id2token
    )
    return lsi
#lsimodel = train_lsi()
#lsimodel.save('LSI_model')


#### q and d representation with the lsi model so we can rank documents based on query. ####

In [135]:
lsimodel = gensim.models.LsiModel.load('LSI_model')

def get_q_rep(index, query_str, lsimodel):
    pairs = lsimodel[dictionary.doc2bow(index.tokenize(query_str))]
    weights = []
    for pair in pairs:
        weights.append(pair[1])
    return weights
    
__, id2token, _ = index.get_dictionary()
def get_docrep(index, doc_id):
    docrep = [id2token[word] for word in index.document(doc_id)[1] if word > 0]
    drep = lsimodel[dictionary.doc2bow(docrep)]
    weights = []
    for pair in drep:
        weights.append(pair[1])
    return weights

INFO:gensim.utils:loading LsiModel object from LSI_model
INFO:gensim.utils:setting ignored attribute dispatcher to None
INFO:gensim.utils:setting ignored attribute projection to None
INFO:gensim.utils:loaded LSI_model
INFO:gensim.utils:loading LsiModel object from LSI_model.projection
INFO:gensim.utils:loading u from LSI_model.projection.u.npy with mmap=None
INFO:gensim.utils:loaded LSI_model.projection


### Check cosine similarity between query-representation for LSI and documents ###

In [84]:
def check_cosine_given_query_lsi(avg_q, queryID, internal_docIDs):
    lsi_dict = {}
    for internal_docID in internal_docIDs: # amount of documents
        avg_d = get_docrep(index, internal_docID)
        cosine = cosine_similarity(numpy.reshape(avg_q, (1, -1)),  numpy.reshape(avg_d, (1, -1)))[0][0] 
        external_ID = index.document(internal_docID)[0]
        lsi_dict[external_ID] = cosine
    return lsi_dict

### Rerank the tf-idf 1000 rankings per query with LSI model ###

In [85]:
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.cluster import mutual_info_score #is equivalent to KL divergence
import numpy
def check_cosine_all_queries_lsi(tfidf_rankings):
    dic = {}
    for query_id, ranked_docs in tfidf_rankings.items():
        query_string = query_dict[str(query_id)]
        avg_q = get_q_rep(index, query_string, lsimodel)
        cosines_per_q = check_cosine_given_query_lsi(avg_q, query_id, ranked_docs)
        dic[query_id] = make_dict_format(cosines_per_q)
    return dic

#new_rankings_lsi = check_cosine_all_queries_lsi(tfidf_rankings)

### Train LDA Model (uses same connector class as LSI) ###

In [136]:
## Latent Dirichlet Allocation ##
# Uses same connector class as LSI (doc2bow)
logging.getLogger().setLevel(logging.INFO)
def train_lda():
    sentences = IndriBOW(index, dictionary)
    lda = gensim.models.ldamodel.LdaModel(
        corpus = sentences,
        id2word = dictionary.id2token
    )
    return lda
#ldamodel = train_lda()
ldamodel.save('LDA_overnight_model')

INFO:gensim.utils:saving LdaState object under LDA_overnight_model.state, separately None
INFO:gensim.utils:storing np array 'sstats' to LDA_overnight_model.state.sstats.npy
INFO:gensim.utils:saved LDA_overnight_model.state
INFO:gensim.utils:saving LdaModel object under LDA_overnight_model, separately ['expElogbeta', 'sstats']
INFO:gensim.utils:not storing attribute state
INFO:gensim.utils:not storing attribute dispatcher
INFO:gensim.utils:not storing attribute id2word
INFO:gensim.utils:storing np array 'expElogbeta' to LDA_overnight_model.expElogbeta.npy
INFO:gensim.utils:saved LDA_overnight_model


### Get query representation and document representation for the LDA model ###

In [112]:
# LDA only takes the topics where there's a probability, so the vector isn't the goal size of 200.
# We fill up the vectors with 0- probabilities for the rest of the topics 
def get_q_rep_lda(index, query_str, ldamodel):
    pairs = ldamodel[dictionary.doc2bow(index.tokenize(query_str))]
    topicnrs = []
    weights = []
    count = 0
    for pair in pairs:
        topicnrs.append(pair[0])
    for i in range(200):# goal vector size
        if i in topicnrs:
            weights.append(pairs[count][1])
            count =+ 1
        else:
            weights.append(0.0)
    return weights

__, id2token, _ = index.get_dictionary()
def get_docrep_lda(index, doc_id):
    docrep = [id2token[word] for word in index.document(doc_id)[1] if word > 0]
    drep = ldamodel[dictionary.doc2bow(docrep)]
    weights = []
    for pair in drep:
        weights.append(pair[1])
    return weights

 ### Check cosine similarity between query and documents for LDA representations ###

In [113]:
def check_cosine_given_query_lda(avg_q, queryID, internal_docIDs):
    lda_dict = {}
    for internal_docID in internal_docIDs: # amount of documents
        avg_d = get_docrep(index, internal_docID)
        cosine = cosine_similarity(numpy.reshape(avg_q, (1, -1)),  numpy.reshape(avg_d, (1, -1)))[0][0] 
        external_ID = index.document(internal_docID)[0]
        lda_dict[external_ID] = cosine
    return lda_dict

### Rerank the tf-idf rankings per query based on LDA model ###

In [114]:
# q and d representation with the lda model so we can rank documents based on query.
ldamodel = gensim.models.LdaModel.load('LDA_overnight_model')
def check_cosine_all_queries_lda(tfidf_rankings):
    dic = {}
    for query_id, ranked_docs in tfidf_rankings.items():
        query_string = query_dict[str(query_id)]
        avg_q = get_q_rep_lda(index, query_string, ldamodel)
        cosines_per_q = check_cosine_given_query_lda(avg_q, query_id, ranked_docs)
        dic[query_id] = make_dict_format(cosines_per_q)
    return dic
Lda_rerank = check_cosine_all_queries_lda(tfidf_rankings)

INFO:gensim.utils:loading LdaModel object from LDA_overnight_model
INFO:gensim.utils:loading expElogbeta from LDA_overnight_model.expElogbeta.npy with mmap=None
INFO:gensim.utils:setting ignored attribute state to None
INFO:gensim.utils:setting ignored attribute dispatcher to None
INFO:gensim.utils:setting ignored attribute id2word to None
INFO:gensim.utils:loaded LDA_overnight_model
INFO:gensim.utils:loading LdaModel object from LDA_overnight_model.state
INFO:gensim.utils:loading sstats from LDA_overnight_model.state.sstats.npy with mmap=None
INFO:gensim.utils:loaded LDA_overnight_model.state


## Doc2 Vec ##

## Doc2Vec Connector Class##

In [None]:
class IndriDoc2Vec(gensim.interfaces.CorpusABC):

    def __init__(self, index, dictionary, max_documents=None):
        assert isinstance(index, pyndri.Index)

        self.index = index
        self.dictionary = dictionary

        self.max_documents = max_documents

    def _maximum_document(self):
        if self.max_documents is None:
            return self.index.maximum_document()
        else:
            return min(
                self.max_documents + self.index.document_base(),
                self.index.maximum_document())

    def __iter__(self): # Doc2Vec
        for int_doc_id in range(self.index.document_base(),
                                self._maximum_document()):
            ext_doc_id, tokens = self.index.document(int_doc_id)

            yield gensim.models.doc2vec.TaggedDocument(words =[self.dictionary[token_id]
                                                              for token_id in tokens if token_id >0
                                                               and token_id in self.dictionary],
                                                      tags = ['doc_'+str( int_doc_id) ])

    def __len__(self):
        return self._maximum_document() - self.index.document_base()


## Build Doc2Vec model ##

In [19]:
doc2vec_init = gensim.models.doc2vec.Doc2Vec(
 #   size=size,  # Embedding size - initially 300
  #  window=window,  # One-sided window size, init was 5
    #sg=True,  # Skip-gram.
    min_count=1,  # Minimum word frequency.
    sample=1e-3,  # Sub-sample threshold.
    hs=False,  # Hierarchical softmax.
    negative=10,  # Number of negative examples.
    iter=1,  # Number of iterations.
    workers=8,  # Number of workers.
    dm_mean=None, 
    dm=10, 
    dbow_words=0, 
    dm_concat=0, 
)

dictionary = pyndri.extract_dictionary(index)
start = time.time()
sentences = IndriDoc2Vec(index, dictionary) # max_documents due to time complexity
#print('initialised sentences')
#print(time.time()-start)

logging.info('Constructing doc2vec vocabulary.')

# Build vocab.
doc2vec_init.build_vocab(sentences, trim_rule=None)

models = [doc2vec_init]

for epoch in range(1, 3+1):# init range(1, 5 + 1). Takes much time. Better would be at least 15 epochs.
    logging.info('Epoch %d', epoch)
    model = copy.deepcopy(models[-1])
    model.train(sentences)
    models.append(model)
    
logging.info('Trained models: %s', models)
#print(models[-1]) # last model is most advanced
models[-1].save('doc2vec_Alldocs_32LV')
savedD2Vmodel = gensim.models.Doc2Vec.load('doc2vec_Alldocs_32LV')
#print(savedD2Vmodel)

#Doc2Vec_model = gensim.models.Doc2Vec(sentences, size=32, window=8, min_count=5, workers=4)
#savedD2Vmodel.syn0[0]
#savedD2Vmodel.most_similar()

967.7279672622681




array([-0.00374502,  0.00308837, -0.00472419, -0.00193337, -0.00035506,
       -0.00016074,  0.00098103,  0.00092244, -0.00118338, -0.00332338,
       -0.0013285 ,  0.00133657,  0.00192708,  0.00192939,  0.00342257,
       -0.00439199,  0.00283154,  0.00358476, -0.00412811, -0.00191401,
       -0.00284428,  0.00235222, -0.00356911,  0.0018685 , -0.00344937,
       -0.00368249,  0.00289228,  0.00340447,  0.00430199,  0.00477111,
        0.00151259, -0.00288733, -0.00170883, -0.00153607,  0.00459886,
       -0.00419869,  0.0047371 , -0.00303401, -0.00194053,  0.00410392,
        0.00484982, -0.00267162,  0.0042858 ,  0.00018068, -0.00444503,
       -0.00353812,  0.00084476,  0.00319608, -0.0039742 , -0.0008061 ,
        0.00390346,  0.00032077,  0.00438578, -0.00069475, -0.00272398,
       -0.00450479, -0.0021173 , -0.00016915,  0.00370967,  0.00117157,
        0.00237067,  0.00239566,  0.00377652, -0.0041114 , -0.00108679,
       -0.00439702, -0.00441423,  0.00058906,  0.00354364,  0.00

### Get query rep for doc2vec ###

In [137]:
def get_q_rep(index, query_str, d2vmodel):
    query_tokens = index.tokenize(query_str) # " hoi ik ben diede " = "hoi", "ik...
    vectors = []
    for token in query_tokens:
        vector = d2vmodel['token']
        vectors.append(vector)

    avg_query = np.mean(np.array(vectors), axis=0)
    return avg_query

### Get cosine similarity between query and document ###

In [23]:
def check_cosine_given_query(avg_q, queryID, internal_docIDs):
    word2vec_dict = {}
    for internal_docID in internal_docIDs: # amount of documents
        avg_d = savedD2Vmodel.docvecs[internal_docID]
        cosine = cosine_similarity(numpy.reshape(avg_q, (1, -1)),  numpy.reshape(avg_d, (1, -1)))[0][0] 
        external_ID = index.document(internal_docID)[0]
        doc2vec_dict[external_ID] = cosine
    return doc2vec_dict

### Rerank TFIDF with doc2vec ###

In [30]:
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.cluster import mutual_info_score #is equivalent to KL divergence
import numpy
def check_cosine_all_queries(tfidf_rankings):
    dic = {}
    for query_id, ranked_docs in tfidf_rankings.items():
        query_string = query_dict[str(query_id)]
        avg_q = get_q_rep(index, query_string, savedD2Vmodel)
        cosines_per_q = check_cosine_given_query(avg_q, query_id, ranked_docs)
        dic[query_id] = make_dict_format(cosines_per_q)
    return dic

new_rankings_doc2vec = check_cosine_all_queries(tfidf_rankings)

### With this code we took all rerankings and write run-files so we can evaluate. ###

In [121]:
def write_run(model_name, data, out_f,
              max_objects_per_query=sys.maxsize,
              skip_sorting=False):
    """
    Write a run to an output file.
    Parameters:
        - model_name: identifier of run.
        - data: dictionary mapping topic_id to object_assesments;
            object_assesments is an iterable (list or tuple) of
            (relevance, object_id) pairs.
            The object_assesments iterable is sorted by decreasing order.
        - out_f: output file stream.
        - max_objects_per_query: cut-off for number of objects per query.
    """
    for subject_id, object_assesments in data.items():
        if not object_assesments:
            logging.warning('Received empty ranking for %s; ignoring.',
                            subject_id)

            continue

        # Probe types, to make sure everything goes alright.
        # assert isinstance(object_assesments[0][0], float) or \
        #     isinstance(object_assesments[0][0], np.float32)
        assert isinstance(object_assesments[0][1], str) or \
            isinstance(object_assesments[0][1], bytes)

        if not skip_sorting:
            object_assesments = sorted(object_assesments, reverse=True)

        if max_objects_per_query < sys.maxsize:
            object_assesments = object_assesments[:max_objects_per_query]

        if isinstance(subject_id, bytes):
            subject_id = subject_id.decode('utf8')

        for rank, (relevance, object_id) in enumerate(object_assesments):
            if isinstance(object_id, bytes):
                object_id = object_id.decode('utf8')

            out_f.write(
                '{subject} Q0 {object} {rank} {relevance} '
                '{model_name}\n'.format(
                    subject=subject_id,
                    object=object_id,
                    rank=rank + 1,
                    relevance=relevance,
                    model_name=model_name))
            
# The following writes the run to standard output.
# In your code, you should write the runs to local
# storage in order to pass them to trec_eval.
# write_run(
#     model_name='lda',
#     data=Lda_rerank,
#     out_f=open('tfidf_lda.run', 'w'),
#     max_objects_per_query=1000)