In [None]:
from gensim.models import KeyedVectors
# Load the KeyedVectors stored in ./doc2vec.wv (path is relative to the working directory).
wv = KeyedVectors.load(
    "./doc2vec.wv",
)

In [None]:
# Quick look at the loaded vectors: nearest neighbours of the word "travel".
wv.most_similar(positive=["travel"])

In [None]:
from gensim.models.doc2vec import Doc2Vec
# Load the Doc2Vec model stored in ./doc2vec.model (path is relative to the working directory).
model = Doc2Vec.load(
    "./doc2vec.model",
)

In [None]:
import simplejson


def json_load(filename):
    """Read the UTF-8 encoded JSON file ``filename`` and return the parsed content."""
    with open(filename, mode='r', encoding='utf-8') as handle:
        parsed = simplejson.load(handle)
    return parsed

In [None]:
# Parsed content of the training corpus file; converted to TaggedDocument objects later.
train_corpus_untagged = json_load(filename="./doc2vec.corpus.json")

In [None]:
from gensim.models.doc2vec import TaggedDocument

def gen_tagged_docs(corpus_untagged):
    """Wrap every entry of ``corpus_untagged`` in a ``TaggedDocument``.

    Each entry is indexed with the keys ``"words"`` and ``"tags"``. The result
    is a list holding one ``TaggedDocument`` per entry, in input order.
    """
    tagged_docs = []
    for entry in corpus_untagged:
        tagged_docs.append(TaggedDocument(entry["words"], entry["tags"]))
    return tagged_docs

In [None]:
# Training corpus as a list of TaggedDocument objects.
train_corpus = gen_tagged_docs(corpus_untagged=train_corpus_untagged)

In [None]:
# Mapping stored alongside the corpus; it is inverted below to translate model ids back.
# NOTE(review): presumably maps tag name -> numeric id; confirm against the export step.
tag_id_mapping = json_load(filename='./doc2vec.tag_id_mapping.json')

In [None]:
# Inverse of tag_id_mapping: its values become the keys, so ids returned by
# model.dv.most_similar can be looked up.
id_tag_mapping = dict(
    (value, key) for key, value in tag_id_mapping.items()
)

In [None]:
# Sanity check on the first training document: query model.dv in two ways and
# print the ten nearest entries for each query.
first_doc = train_corpus[0]
doc = first_doc.words
print(doc)

# 1) Query with a vector inferred from the document's words.
inferred_vector = model.infer_vector(doc)
sims = model.dv.most_similar(positive=[inferred_vector], topn=10)
for doc_id, factor in sims:
    tag_name = id_tag_mapping[doc_id]
    print(factor, tag_name)

print("************")

# 2) Query with the vector stored in the model for the document's first tag.
inferred_vector = model.dv[first_doc.tags[0]]
sims = model.dv.most_similar(positive=[inferred_vector], topn=10)
for doc_id, factor in sims:
    tag_name = id_tag_mapping[doc_id]
    print(factor, tag_name)

In [None]:
import random

# Fixed seed so the shuffle below selects the same documents on every run.
random.seed(42)


# ranks: best (lowest) position reached by any of a document's own tags.
# second_ranks: first position after the worst-ranked own tag.
ranks = []
second_ranks = []

# Shuffle a copy so that train_corpus itself keeps its order.
train_corpus_copy = train_corpus.copy()
random.shuffle(train_corpus_copy)
sample_train_corpus = train_corpus_copy[:50]
for sample_doc in sample_train_corpus:
    inferred_vector = model.infer_vector(sample_doc.words)
    sims = model.dv.most_similar([inferred_vector], topn=len(id_tag_mapping))
    # Build the ranked id list once per document instead of once per tag.
    ranked_doc_ids = [docid for docid, sim in sims]
    most_similar_tag_indices = [ranked_doc_ids.index(tag) for tag in sample_doc.tags]
    rank = min(most_similar_tag_indices)
    second_rank = max(most_similar_tag_indices) + 1
    ranks.append(rank)
    second_ranks.append(second_rank)

In [None]:
%matplotlib inline

import collections
import matplotlib.pyplot as plt


# Split the sampled documents into "an own tag is the top hit" (rank 0) and the rest.
counter = collections.Counter(ranks)
sum_0 = sum(v for k, v in counter.items() if k <= 0)
sum_all_else = sum(v for k, v in counter.items() if k > 0)
# Label the bars so the figure can be read without looking at the code.
plt.bar([0, 1], [sum_0, sum_all_else], tick_label=['rank 0', 'rank > 0'])
plt.ylabel('Number of sampled documents')
plt.title('Training sample: best rank of a document\'s own tags')
print([sum_0, sum_all_else])

In [None]:
# Share of sampled documents with rank 0 versus the rest, in percent.
matched_total = sum_0 + sum_all_else
print('Training example correctly matched (%): ', 100 * sum_0 / matched_total)
print('Training example incorrectly matched (%): ', 100 * sum_all_else / matched_total)

In [None]:
# Inspect one training document in detail: print it, then print the entries
# found at a few characteristic positions of its similarity ranking.
sent_id = 42
sentence = train_corpus[sent_id]
article_tag_id = sentence.tags[0]
inferred_vector = model.infer_vector(sentence.words)
# Request one entry per known tag, as the other evaluation cells do. Sizing by
# len(train_corpus) truncates the ranking when there are more tags than documents.
sims = model.dv.most_similar([inferred_vector], topn=len(id_tag_mapping))
print('Document ({}): «{}»\n'.format(id_tag_mapping[article_tag_id], ' '.join(sentence.words)))
print(u'SIMILAR/DISSIMILAR DOCS PER MODEL %s:\n' % model)

# Position of each of the document's own tags in the ranking (list built once).
ranked_doc_ids = [docid for docid, sim in sims]
most_similar_tag_indices = [ranked_doc_ids.index(tag) for tag in sentence.tags]

checkpoints = [
    ('MOST', 0),
    ('SECOND-MOST', 1),
    ('THIRD-MOST', 2),
    ('JUST-OUTSIDE-TAGS', max(most_similar_tag_indices) + 1),
    ('MEDIAN', len(sims)//2),
    ('LEAST', len(sims) - 1),
]
for label, index in checkpoints:
    # Skip positions that do not exist, e.g. "just outside" when an own tag is
    # ranked last, or ranks 1/2 in a very small model.
    if not 0 <= index < len(sims):
        continue
    print(u'%s %s: «%s»\n' % (label, sims[index], id_tag_mapping[sims[index][0]]))

In [None]:
# Parsed content of the test corpus file; converted to TaggedDocument objects later.
corpus_test_untagged = json_load(filename='./doc2vec.corpus.test.json')

In [None]:
# Test corpus as a list of TaggedDocument objects.
corpus_test = gen_tagged_docs(corpus_untagged=corpus_test_untagged)

In [None]:
# Words-based query for the last document of the test corpus.
doc = corpus_test[-1].words
print(doc)

# Infer a vector from the words and list the ten closest entries of model.dv.
inferred_vector = model.infer_vector(doc)
sims = model.dv.most_similar(positive=[inferred_vector], topn=10)
for doc_id, factor in sims:
    tag_name = id_tag_mapping[doc_id]
    print(factor, tag_name)


In [None]:
import random

# Fixed seed so the shuffle below selects the same documents on every run.
random.seed(42)


# ranks: best (lowest) position reached by any of a document's own tags.
# second_ranks: first position after the worst-ranked own tag.
ranks = []
second_ranks = []

# Shuffle a copy so that corpus_test itself keeps its order.
corpus_test_copy = corpus_test.copy()
random.shuffle(corpus_test_copy)
sample_corpus_test = corpus_test_copy[:50]
for sample_doc in sample_corpus_test:
    inferred_vector = model.infer_vector(sample_doc.words)
    sims = model.dv.most_similar([inferred_vector], topn=len(id_tag_mapping))
    # Position lookup built once per document instead of one list scan per tag.
    position_by_doc_id = {docid: position for position, (docid, sim) in enumerate(sims)}
    # Keep only tags that appear in the ranking. A plain truthiness test
    # (`if tag`) would also discard a legitimate tag id of 0.
    # NOTE(review): assumes unusable test tags are None or otherwise absent
    # from the model; confirm against the corpus export.
    most_similar_tag_indices = [
        position_by_doc_id[tag]
        for tag in sample_doc.tags
        if tag in position_by_doc_id
    ]
    if not most_similar_tag_indices:
        # No tag of this document appears in the ranking, so it cannot be
        # ranked; skip it instead of failing on min() of an empty list.
        continue
    rank = min(most_similar_tag_indices)
    second_rank = max(most_similar_tag_indices) + 1
    ranks.append(rank)
    second_ranks.append(second_rank)

In [None]:
%matplotlib inline

import collections
import matplotlib.pyplot as plt


# Split the sampled documents into "an own tag is the top hit" (rank 0) and the rest.
counter = collections.Counter(ranks)
sum_0 = sum(v for k, v in counter.items() if k <= 0)
sum_all_else = sum(v for k, v in counter.items() if k > 0)
# Label the bars so the figure can be read without looking at the code.
plt.bar([0, 1], [sum_0, sum_all_else], tick_label=['rank 0', 'rank > 0'])
plt.ylabel('Number of sampled documents')
plt.title('Test sample: best rank of a document\'s own tags')
print([sum_0, sum_all_else])

In [None]:
# Share of sampled documents with rank 0 versus the rest, in percent.
matched_total = sum_0 + sum_all_else
print('Test example correctly matched (%): ', 100 * sum_0 / matched_total)
print('Test example incorrectly matched (%): ', 100 * sum_all_else / matched_total)

In [None]:
# Inspect one test document in detail: print it, then print the entries found
# at a few characteristic positions of its similarity ranking.
sent_id = 42
sentence = corpus_test[sent_id]
inferred_vector = model.infer_vector(sentence.words)
sims = model.dv.most_similar([inferred_vector], topn=len(id_tag_mapping))
print('Document: «{}»\n'.format(' '.join(sentence.words)))
print(u'SIMILAR/DISSIMILAR DOCS PER MODEL %s:\n' % model)

# Position of each usable tag of the document in the ranking. Membership in the
# ranking is the filter; a truthiness test (`if tag`) would drop a tag id of 0.
# NOTE(review): assumes unusable test tags are None or otherwise absent from
# the model; confirm against the corpus export.
position_by_doc_id = {docid: position for position, (docid, sim) in enumerate(sims)}
most_similar_tag_indices = [
    position_by_doc_id[tag]
    for tag in sentence.tags
    if tag in position_by_doc_id
]

checkpoints = [('MOST', 0), ('SECOND-MOST', 1), ('THIRD-MOST', 2)]
if most_similar_tag_indices:
    # Only defined when at least one tag of the document appears in the ranking.
    checkpoints.append(('JUST-OUTSIDE-TAGS', max(most_similar_tag_indices) + 1))
checkpoints += [('MEDIAN', len(sims)//2), ('LEAST', len(sims) - 1)]

for label, index in checkpoints:
    # Skip positions that do not exist, e.g. "just outside" when a tag is
    # ranked last, or ranks 1/2 in a very small model.
    if not 0 <= index < len(sims):
        continue
    print(u'%s %s: «%s»\n' % (label, sims[index], id_tag_mapping[sims[index][0]]))