In [None]:
import pandas as pd
from nltk.tokenize.casual import TweetTokenizer
import pickle
from gensim.models.doc2vec import Doc2Vec
import numpy as np
import os

## Load Doc2Vec

In [None]:
# File name of the trained Doc2Vec model analysed in this notebook; the name presumably
# encodes the training hyper-parameters (vector size, min count, dm, epochs) — TODO confirm.
best_doc2vec_file = "d2v_200vecsize_1mincount_1dm_10epochs.model"
# Load the model from the relative models/doc2vec/ directory.
doc2vec = Doc2Vec.load('models/doc2vec/' + best_doc2vec_file)

## Word Similarity

In [None]:
# Each cell in this section lists the nearest neighbours of one query word in the
# model's word-vector space (one query per cell so every result is displayed).
doc2vec.wv.most_similar(positive=['film'], topn=5)

In [None]:
doc2vec.wv.most_similar(positive=['astonishing'], topn=5)

In [None]:
doc2vec.wv.most_similar(positive=['actor'], topn=5)

In [None]:
doc2vec.wv.most_similar(positive=['thriller'], topn=5)

In [None]:
doc2vec.wv.most_similar(positive=['fish'], topn=5)

In [None]:
doc2vec.wv.most_similar(positive=['inference'], topn=7)

In [None]:
# NOTE(review): the query token is upper-case and spelled 'FABOULOUS'; presumably a
# deliberate probe of a rare/misspelled token — confirm.
doc2vec.wv.most_similar(positive=['FABOULOUS'], topn=6)

## Docs Similarity

In [None]:
# Scan training documents 0..999 and remember the one whose closest neighbour has
# the highest similarity that is still strictly below 0.8.
max_sim = 0
idx = 0

for doc_id in range(1000):
    neighbours = doc2vec.docvecs.most_similar(positive=[doc_id], topn=5)
    top_score = neighbours[0][1]  # similarity of the single closest document
    # Strict comparison: on equal scores the earliest document id is kept.
    if max_sim < top_score < 0.8:
        max_sim = top_score
        idx = doc_id

idx

In [None]:
# Nearest training documents to document 20 (top 50) and, in the next cell,
# to document 98085 (top 5).
doc2vec.docvecs.most_similar(positive=[20], topn=50)

In [None]:
doc2vec.docvecs.most_similar(positive=[98085], topn=5)

In [None]:
# NOTE(review): within this notebook `reviews` is first assigned in the "Load Data"
# section further down, so these lookups only work after that section has been run.
reviews[473]

In [None]:
reviews[80661]

In [None]:
reviews[20]

In [None]:
reviews[17691]

In [None]:
# Document ids used for the pairwise comparison; show each review's text,
# separated by a blank line.
rs = [20, 473, 37232, 98085]
for doc_id in rs:
    print(reviews[doc_id], end='\n\n')

In [None]:
# Pairwise similarity matrix between the documents listed in `rs`.
# The matrix is sized from `rs` instead of a hard-coded 4x4, and each similarity is
# computed once and reused for both the printout and the matrix entry.
data = np.zeros((len(rs), len(rs)))
for i, r in enumerate(rs):
    for j, r2 in enumerate(rs):
        pair_sim = doc2vec.docvecs.similarity(r, r2)
        print("Similarity between {} and {}: {}".format(r, r2, pair_sim))
        data[i, j] = pair_sim

In [None]:
import seaborn as sns
import pylab

# Apply seaborn's default theme before drawing.
sns.set()

# Previously hard-coded matrix, kept commented out:
# data = np.array([[0.99, 0.50, 0.44, 0.39],
#         [0.50, 1.00, 0.33, 0.31], 
#         [0.44, 0.33, 1.00, 0.46], 
#         [0.39, 0.31, 0.46, 0.99]])

# Same letter labels on both axes, one per compared document.
tick_labels = ['A', 'B', 'C', 'D']
ax = sns.heatmap(
    data,
    vmin=0,
    vmax=1,
    cmap='RdBu_r',
    annot=True,
    xticklabels=tick_labels,
    yticklabels=tick_labels,
)
# Write the current figure to disk as a PDF.
pylab.savefig('plots/docs_similarity_heatmap.pdf')

## Doc-Word Similarity

In [None]:
# For a few short reviews, print the text and then the five words whose vectors
# are closest to that review's document vector.
short_reviews = [196, 296, 99052, 99021]

for doc_id in short_reviews:
    print()
    print(reviews[doc_id])
    print()
    nearest_words = doc2vec.wv.most_similar(positive=[doc2vec.docvecs[doc_id]], topn=5)
    print(nearest_words)

## Algebraic Operations

In [None]:
def compute_analogy(word1, word2, word3, topn=5):
    """Return the words closest to ``word1 - word2 + word3`` in word-vector space.

    The three words are looked up in the global ``doc2vec`` model, combined with
    plain vector arithmetic, and the resulting vector is passed to
    ``doc2vec.wv.similar_by_vector``.

    Parameters
    ----------
    word1, word2, word3 : str
        Words to look up in ``doc2vec.wv``. ``word2`` is subtracted; ``word1``
        and ``word3`` are added.
    topn : int, optional
        Number of neighbours to return. Defaults to 5, the value that was
        previously hard-coded.

    Returns
    -------
    The result of ``doc2vec.wv.similar_by_vector`` for the combined vector.
    The query words are not filtered out here, so they may appear in the result.
    """
    target = doc2vec.wv[word1] - doc2vec.wv[word2] + doc2vec.wv[word3]
    return doc2vec.wv.similar_by_vector(target, topn=topn)

### Movies Related

In [None]:
# Evaluates actor - man + woman.
compute_analogy('actor', 'man', 'woman')

In [None]:
# Evaluates film - movie + horror.
compute_analogy('film', 'movie', 'horror')

In [None]:
# Evaluates hero - man + woman.
compute_analogy('hero', 'man', 'woman')

### Adjectives

In [None]:
# Evaluates big - bigger + good.
# NOTE(review): if the intent is "big is to bigger as good is to ?", the usual form
# would be bigger - big + good — confirm the argument order.
compute_analogy('big', 'bigger', 'good')

In [None]:
# Evaluates worse - bad + good.
compute_analogy('worse', 'bad', 'good')

In [None]:
# Evaluates fast - faster + slow (same argument-order question as the first cell of this section).
compute_analogy('fast', 'faster', 'slow')

### Capitals

In [None]:
# Evaluates France - Paris + Japan.
# NOTE(review): to look for the capital of Japan the usual form would be
# Paris - France + Japan — confirm the argument order.
compute_analogy('France', 'Paris', 'Japan')

In [None]:
# Evaluates France - Paris + Germany.
compute_analogy('France', 'Paris', 'Germany')

### Famous People Roles

In [None]:
# Evaluates Einstein - scientist + Mozart.
# NOTE(review): to look for Mozart's role the usual form would be
# scientist - Einstein + Mozart — confirm the argument order.
compute_analogy('Einstein', 'scientist', 'Mozart')

In [None]:
# Evaluates Einstein - scientist + Picasso.
compute_analogy('Einstein', 'scientist', 'Picasso')

## Load Data

In [None]:
# NOTE(review): this value is not read before the `for data_path in data_paths` loop
# in the next cell rebinds the name, so the assignment currently has no effect.
data_path = 'datasets/unsup'

In [None]:
# Directories whose files are loaded, in this order, into `reviews`.
# A review's position in `reviews` is therefore determined by the directory order
# and by the sorted file names inside each directory.
data_paths = [
    'datasets/unsupervised/aclImdb/train/neg',
    'datasets/unsupervised/aclImdb/train/pos',
    'datasets/unsupervised/aclImdb/train/unsup',
    'datasets/unsupervised/aclImdb/test/neg',
    'datasets/unsupervised/aclImdb/test/pos'
]

reviews = []

# The loop variable keeps the name `data_path`, so the name ends up bound to the
# last directory exactly as before.
for data_path in data_paths:
    for f in sorted(os.listdir(data_path)):
        # Read with an explicit encoding so the result does not depend on the
        # platform's default locale (assumes the corpus files are UTF-8 — TODO confirm).
        with open(os.path.join(data_path, f), 'r', encoding='utf-8') as file:
            reviews.append(file.read())

In [None]:
# Print every review shorter than 250 characters among the last 1000 reviews,
# preceded by its position in `reviews`.
# The position comes from the enumeration offset rather than `reviews.index(review)`,
# which returns the first occurrence (wrong for duplicated texts) and rescans the list.
tail_start = max(len(reviews) - 1000, 0)
for i, review in enumerate(reviews[-1000:], start=tail_start):
    if len(review) < 250:
        print(i)
        print(review)
        print()
        print()

In [None]:
# Print every review shorter than 400 characters among the last 2000 reviews,
# preceded by its position in `reviews`.
# The position comes from the enumeration offset rather than `reviews.index(review)`,
# which returns the first occurrence (wrong for duplicated texts) and rescans the list.
tail_start = max(len(reviews) - 2000, 0)
for i, review in enumerate(reviews[-2000:], start=tail_start):
    if len(review) < 400:
        print(i)
        print(review)
        print()
        print()

In [None]:
# Display individual reviews by their position in `reviews`.
reviews[196]

In [None]:
reviews[220]

In [None]:
reviews[99021]

In [None]:
reviews[99125]

In [None]:
# Similarity between the trained document vectors of two documents; the cells
# below repeat the query for other document pairs.
doc2vec.docvecs.similarity(26, 297)

In [None]:
doc2vec.docvecs.similarity(416, 99021)

In [None]:
doc2vec.docvecs.similarity(416, 99125)

In [None]:
doc2vec.docvecs.similarity(99021, 99125)

In [None]:
doc2vec.docvecs.similarity(297, 216)

In [None]:
# Load the IMDB master CSV, decoded as ISO-8859-1.
# NOTE(review): this rebinds `data`, which earlier in the notebook holds the NumPy
# similarity matrix plotted in the heatmap cell.
data = pd.read_csv('datasets/imdb/imdb_master.csv', encoding = "ISO-8859-1")

In [None]:
# Among the first 1000 CSV reviews, print the position and text of every review
# shorter than 500 characters, each followed by two blank lines.
# Positions equal the row labels because the frame is read with the default integer index.
for i, text in enumerate(data['review'][:1000]):
    if len(text) >= 500:
        continue
    print(i)
    print(text)
    print()
    print()

In [None]:
# Tokenize the first 1000 CSV reviews with NLTK's TweetTokenizer.
# Note that `reviews` now refers to this Series of token lists instead of the
# list of raw review strings built from the corpus directories.
tknzr = TweetTokenizer()

first_reviews = data['review'][:1000]
reviews = first_reviews.apply(tknzr.tokenize)

In [None]:
53, 80, 71 # Bad, Bad, Good

In [None]:
# Infer document vectors for the tokenized reviews at entries 53, 80 and 71.
vector1 = doc2vec.infer_vector(reviews[53])
vector2 = doc2vec.infer_vector(reviews[80])
vector3 = doc2vec.infer_vector(reviews[71])

In [None]:
# NOTE(review): this compares the trained vectors of documents 1 and 2; the inferred
# vector1/vector2/vector3 are not used here — confirm whether they were meant to be compared.
doc2vec.docvecs.similarity(1, 2)

In [None]:
# Infer a vector for an ad-hoc, whitespace-tokenized query and retrieve the
# training documents most similar to it (library default number of results).
query_text = "words associated with my research questions"
token = query_text.split()
new_vector = doc2vec.infer_vector(token)
sims = doc2vec.docvecs.most_similar(positive=[new_vector])

In [None]:
sims