In [1]:
'''
Run a hyperparameter search over the doc2vec model! 
'''

from Corpora import MovieReviewCorpus
from Lexicon import SentimentLexicon
from Statistics import SignTest
from Classifiers import NaiveBayesText, SVMText
from Extensions import SVMDoc2Vec, DocFeaturizer


from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
from gensim.models.doc2vec import Doc2Vec
from sklearn.decomposition import PCA

import seaborn as sns
import numpy as np

# Font sizes shared by every figure in this notebook.
ax_size = 16
title_size = 18

# Directory for saved doc2vec models.
save_dir = "doc2vec_models/"
import os
# exist_ok=True creates the directory only when it is missing, without the
# check-then-create race of testing os.path.exists() first.
os.makedirs(save_dir, exist_ok=True)

# Probe words for the word-similarity heatmap, grouped by the role they play.
good = ["good", "great", "amazing", "best"]
bad = ["avoid", "boring", "terrible", "bad"]
extreme = ["incredibly", "particularly", "very"]  # intensifiers
neutral = ["the", "a", "but"]

# model_pth= f"{save_dir}sample_model.pkl"
# DF.model.save(model_pth)

# `corpus.unsup_train` feeds doc2vec training in the search cells below, while
# `mini_corpus.train` / `mini_corpus.test` feed the SVM. The recorded output
# reports 25000/25000 train/test reviews for the first corpus and 1800/200 for
# the second.
corpus = MovieReviewCorpus(stemming=False, pos=False, use_imdb=True)
mini_corpus = MovieReviewCorpus(stemming=False, pos=False, use_imdb=False)

num train: 25000, num test: 25000
tot num reviews: 50000
num train: 1800, num test: 200
tot num reviews: 2000


In [2]:
# feature_dims = [10, 50, 100, 200, 400]
# windows=[3,5,10]
# accs = np.zeros([len(feature_dims), len(windows)])
# np.savetxt("currentState.txt", accs)
# accs2 = np.loadtxt("currentState.txt")
# accs2

In [None]:
# Grid search over vector size x context window, with dm=1 passed to DocFeaturizer.
# Every cell of the grid trains a fresh featurizer on corpus.unsup_train, fits
# the SVM on mini_corpus.train and scores it on mini_corpus.test.

# Grid axes.
feature_dims = [10, 50, 100, 200, 400]
windows = [3, 5, 10]  # original paper looked at 10 and 400
# min_counts=[2,5]
# dm_concats=[0,1]

# Settings held fixed across the grid.
min_count = 2
dm_concat = 1
epochs = 30  # for hyperparam search
inf_epochs = 10  # forwarded to SVMDoc2Vec as inf_epochs

# accs[i, j] is the accuracy for feature_dims[i] combined with windows[j].
accs = np.zeros((len(feature_dims), len(windows)))

# np.ndindex walks the grid row by row, i.e. the same order as a nested loop
# over feature_dims (outer) and windows (inner).
for i, j in np.ndindex(*accs.shape):
    feature_dim, window = feature_dims[i], windows[j]
    print("features: ", feature_dim, " window: ", window)

    DF = DocFeaturizer(
        feature_dim,
        window,
        dm=1,
        dbow_words=0,
        dm_concat=dm_concat,
        min_count=min_count,
    )
    DF.train_model(corpus.unsup_train, epochs)

    SVMDV = SVMDoc2Vec(
        model=DF,
        bigrams=False,
        trigrams=False,
        discard_closed_class=False,
        inf_epochs=inf_epochs,
    )
    SVMDV.train(mini_corpus.train)
    SVMDV.test(mini_corpus.test)

    acc = SVMDV.getAccuracy()
    accs[i, j] = acc
    print("Acc: ", acc)

    # Checkpoint after every grid cell so an interrupted run keeps its results.
    np.savetxt("currentState.txt", accs)

features:  10  window:  3
kernel:  rbf
kernel:  rbf
Acc:  0.705
features:  10  window:  5
kernel:  rbf
kernel:  rbf
Acc:  0.615
features:  10  window:  10
kernel:  rbf
kernel:  rbf
Acc:  0.685
features:  50  window:  3
kernel:  rbf
kernel:  rbf
Acc:  0.67
features:  50  window:  5
kernel:  rbf
kernel:  rbf
Acc:  0.63
features:  50  window:  10
kernel:  rbf
kernel:  rbf
Acc:  0.695
features:  100  window:  3
kernel:  rbf
kernel:  rbf
Acc:  0.62
features:  100  window:  5
kernel:  rbf
kernel:  rbf
Acc:  0.645
features:  100  window:  10
kernel:  rbf
kernel:  rbf
Acc:  0.66
features:  200  window:  3
kernel:  rbf
kernel:  rbf
Acc:  0.615
features:  200  window:  5


In [None]:
# Heatmap of the grid-search accuracies in `accs`:
# one row per feature dimensionality, one column per window size.
fig, ax = plt.subplots()
window_ticks = list(map(str, windows))
feature_dim_ticks = list(map(str, feature_dims))
sns.heatmap(
    accs,
    xticklabels=window_ticks,
    yticklabels=feature_dim_ticks,
    cmap="coolwarm",
    ax=ax,
)
ax.set_xlabel("Window Size", fontsize=ax_size)
ax.set_ylabel("Feature Dimensionality", fontsize=ax_size)
ax.set_title("PV-DM: Feature Dimensionality vs. Window Size", fontsize=title_size)
plt.savefig("featureDimWindowSize.png", dpi=400, bbox_inches="tight")

In [None]:
# Search over dm_concat x dbow_words at a fixed vector size and window, with
# dm=0 passed to DocFeaturizer.
#
# This cell defines every setting it reads. It previously assigned `windows = 5`
# (clobbering the list the heatmap cells iterate over) and then read `window`
# and `min_count` left behind by an earlier cell.
feature_dim = 200
window = 5
min_count = 2
dm_concats = [0, 1]
dbow_words_vals = [0, 1]
# dm_means = [0,1]
epochs = 30  # for hyperparam search
inf_epochs = 10  # forwarded to SVMDoc2Vec as inf_epochs

# NOTE(review): if DocFeaturizer forwards these arguments to gensim's Doc2Vec,
# dm_concat is presumably ignored when dm=0, which would make the two rows of
# accs2 differ only by training noise -- confirm before reporting them.

# accs2[i, j] is the accuracy for dm_concats[i] combined with dbow_words_vals[j].
accs2 = np.zeros([len(dm_concats), len(dbow_words_vals)])
for i, dm_concat in enumerate(dm_concats):
    for j, dbow_words in enumerate(dbow_words_vals):
        # Report the two settings that actually vary in this search.
        print("dm concat: ", dm_concat, " dbow words: ", dbow_words)
        DF = DocFeaturizer(feature_dim, window, dm=0, dbow_words=dbow_words,
                           dm_concat=dm_concat, min_count=min_count)
        DF.train_model(corpus.unsup_train, epochs)
        SVMDV = SVMDoc2Vec(model=DF, bigrams=False, trigrams=False, discard_closed_class=False,
                           inf_epochs=inf_epochs)
        SVMDV.train(mini_corpus.train)
        SVMDV.test(mini_corpus.test)
        acc = SVMDV.getAccuracy()
        accs2[i, j] = acc
        # Print this run's accuracy, not the whole partially filled matrix.
        print("Acc: ", acc)
        # Checkpoint after every run so an interrupted search keeps its results.
        np.savetxt("currentState_search2.txt", accs2)

# TODO: port this data into a table!!

In [None]:
# Repeat of the vector size x window grid search (dm=1) with longer training:
# 50 training epochs and inf_epochs=30.
# NOTE(review): this reuses both the `accs` array and the "currentState.txt"
# checkpoint of the 30-epoch search, so running it replaces those results --
# confirm that is intended.

# Grid axes.
feature_dims = [10, 50, 100, 200, 400]
windows = [3, 5, 10]  # original paper looked at 10 and 400
# min_counts=[2,5]
# dm_concats=[0,1]

# Settings held fixed across the grid.
min_count = 2
dm_concat = 1
epochs = 50  # for hyperparam search
inf_epochs = 30  # forwarded to SVMDoc2Vec as inf_epochs

# accs[i, j] is the accuracy for feature_dims[i] combined with windows[j].
accs = np.zeros((len(feature_dims), len(windows)))

# np.ndindex walks the grid row by row, i.e. the same order as a nested loop
# over feature_dims (outer) and windows (inner).
for i, j in np.ndindex(*accs.shape):
    feature_dim, window = feature_dims[i], windows[j]
    print("features: ", feature_dim, " window: ", window)

    DF = DocFeaturizer(
        feature_dim,
        window,
        dm=1,
        dbow_words=0,
        dm_concat=dm_concat,
        min_count=min_count,
    )
    DF.train_model(corpus.unsup_train, epochs)

    SVMDV = SVMDoc2Vec(
        model=DF,
        bigrams=False,
        trigrams=False,
        discard_closed_class=False,
        inf_epochs=inf_epochs,
    )
    SVMDV.train(mini_corpus.train)
    SVMDV.test(mini_corpus.test)

    acc = SVMDV.getAccuracy()
    accs[i, j] = acc
    print("Acc: ", acc)

    # Checkpoint after every grid cell so an interrupted run keeps its results.
    np.savetxt("currentState.txt", accs)

In [None]:
# Heatmap of the accuracies currently held in `accs`:
# one row per feature dimensionality, one column per window size.
fig, ax = plt.subplots()
window_ticks = list(map(str, windows))
feature_dim_ticks = list(map(str, feature_dims))
sns.heatmap(
    accs,
    xticklabels=window_ticks,
    yticklabels=feature_dim_ticks,
    cmap="coolwarm",
    ax=ax,
)
ax.set_xlabel("Window Size", fontsize=ax_size)
ax.set_ylabel("Feature Dimensionality", fontsize=ax_size)
ax.set_title("PV-DM: Feature Dimensionality vs. Window Size", fontsize=title_size)
plt.savefig("featureDimWindowSize2.png", dpi=400, bbox_inches="tight")

In [None]:
# Grid search over vector size x context window for the DBOW configuration
# (dm=0, dbow_words=1), followed by a heatmap of the resulting accuracies.
feature_dims = [10, 50, 100, 200, 400]
windows = [3, 5, 10]  # original paper looked at 10 and 400
# min_counts=[2,5]
# dm_concats=[0,1]
min_count = 2
# NOTE(review): if DocFeaturizer forwards this to gensim's Doc2Vec, dm_concat is
# presumably ignored when dm=0 -- confirm.
dm_concat = 1
epochs = 50  # for hyperparam search
inf_epochs = 30  # forwarded to SVMDoc2Vec as inf_epochs

# accs3[i, j] is the accuracy for feature_dims[i] combined with windows[j].
accs3 = np.zeros([len(feature_dims), len(windows)])
for i, feature_dim in enumerate(feature_dims):
    for j, window in enumerate(windows):
        print("features: ", feature_dim, " window: ", window)
        DF = DocFeaturizer(feature_dim, window, dm=0, dbow_words=1,
                           dm_concat=dm_concat, min_count=min_count)
        DF.train_model(corpus.unsup_train, epochs)
        SVMDV = SVMDoc2Vec(model=DF, bigrams=False, trigrams=False, discard_closed_class=False,
                           inf_epochs=inf_epochs)
        SVMDV.train(mini_corpus.train)
        SVMDV.test(mini_corpus.test)
        acc = SVMDV.getAccuracy()
        accs3[i, j] = acc
        print("Acc: ", acc)
        # Checkpoint after every grid cell so an interrupted run keeps its results.
        np.savetxt("currentState_dbow.txt", accs3)

fig, ax = plt.subplots()
sns.heatmap(accs3, xticklabels=[str(window) for window in windows],
            yticklabels=[str(feature_dim) for feature_dim in feature_dims],
            cmap="coolwarm", ax=ax)
ax.set_xlabel("Window Size", fontsize=ax_size)
ax.set_ylabel("Feature Dimensionality", fontsize=ax_size)
# The models above are built with dm=0 (distributed bag of words), so the
# figure is labelled PV-DBOW rather than PV-DM.
ax.set_title("PV-DBOW: Feature Dimensionality vs. Window Size", fontsize=title_size)
plt.savefig("featureDimWindowSize3.png", dpi=400, bbox_inches="tight")

In [None]:

# feature_dim = 100
# window = 3
# epochs = 50
# DF = DocFeaturizer(feature_dim, window, dm=0, dbow_words=1, dm_concat=0)
# # DF.train_model(corpus.train, epochs)
# # SVMDV=SVMDoc2Vec(model=DF,bigrams=False,trigrams=False,discard_closed_class=False)
# # SVMDV.train(corpus.train)

# model = Doc2Vec.load(f"{save_dir}mini_data_model.pkl")
# wv = model.wv
# dv= model.dv
# vocab = list(model.wv.index_to_key)
# X = wv[vocab]

In [None]:
# !pip3 install testfixtures


In [None]:
# help from: https://github.com/RaRe-Technologies/gensim/blob/3c3506d51a2caf6b890de3b1b32a8b85f7566ca5/docs/notebooks/doc2vec-IMDB.ipynb
# cite original paper showing combined = better
from gensim.test.test_doc2vec import ConcatenatedDoc2Vec


In [None]:
'''
analyses to run 
vector addition (see pg 4 of orig doc2vec paper -- analogies) ex. pv(lady gaga) - wv(american) + wv(japanese) = japanese version of lady gaga

'''



In [None]:
# Display the document vectors held in `dv`.
# NOTE(review): in top-to-bottom order `dv` is first assigned in the next cell
# (`dv = model.dv`); the only earlier assignment is commented out, so this cell
# presumably fails on a fresh kernel -- confirm the intended cell order.
dv

In [None]:
def _scatter_by_class(ax, points, is_neg, xlabel, ylabel, title):
    """Scatter 2-D `points` on `ax`, drawing non-NEG rows as "POS" and NEG rows as "NEG".

    `is_neg` is a boolean array with one entry per row of `points`.
    """
    ax.scatter(points[~is_neg, 0], points[~is_neg, 1], label="POS")
    ax.scatter(points[is_neg, 0], points[is_neg, 1], label="NEG")
    ax.set_xlabel(xlabel, fontsize=ax_size)
    ax.set_ylabel(ylabel, fontsize=ax_size)
    ax.set_title(title, fontsize=title_size)
    ax.legend()


# Two-panel figure: PCA and t-SNE projections of the document vectors,
# coloured by sentiment label.
# NOTE(review): `model` is not assigned by any live cell above (only by a
# commented-out Doc2Vec.load), so it must already exist in the kernel.
fig, axes = plt.subplots(1, 2, figsize=(14, 6))
fig.suptitle('Class Separation of Documents', fontsize=title_size + 3)

corpus_to_plot = mini_corpus  # TODO: change to be the IMDB corpus!!!!

labels = [label for label, _ in corpus_to_plot.train]
dv = model.dv
doc_ids = list(range(len(dv)))
X_docs = dv[doc_ids]

# Assumes document vector k belongs to corpus_to_plot.train[k] -- TODO confirm
# against how the model was trained.
if len(labels) != len(X_docs):
    raise ValueError(
        f"{len(X_docs)} document vectors but {len(labels)} labels; "
        "the model and corpus_to_plot do not describe the same documents"
    )

# A per-document mask colours each point by its own label, so the plot stays
# correct even when the corpus is not ordered with every POS review first.
is_neg = np.array(labels) == "NEG"

pca_docs = PCA(n_components=2)
X_pca_docs = pca_docs.fit_transform(X_docs)
_scatter_by_class(axes[0], X_pca_docs, is_neg, "PC 1", "PC 2", "PCA")

# Fixed seed so the t-SNE layout is reproducible across runs.
tsne_docs = TSNE(n_components=2, random_state=0)
X_tsne_docs = tsne_docs.fit_transform(X_docs)
_scatter_by_class(axes[1], X_tsne_docs, is_neg, "Dim 1", "Dim 2", "t-SNE")

plt.savefig("dimRedIMDB.png", dpi=400, bbox_inches="tight")

In [None]:
import numpy as np

# Pairwise similarity heatmap for the probe words defined in the first cell.
# Bind `wv` here: in top-to-bottom order it was otherwise first assigned in a
# later cell, so this cell only worked after running the notebook out of order.
wv = model.wv

sample_words = good + neutral + extreme + bad
V = len(sample_words)

# sims[i, j] is the similarity between sample_words[i] and sample_words[j].
sims = np.zeros([V, V])
for i, w_i in enumerate(sample_words):
    for j, w_j in enumerate(sample_words):
        sims[i, j] = wv.similarity(w_i, w_j)

fig, ax = plt.subplots()
sns.heatmap(sims, xticklabels=sample_words, yticklabels=sample_words, cmap="coolwarm", ax=ax)
# ax.set_xlabel("Word",fontsize=ax_size)
# ax.set_ylabel("Word",fontsize=ax_size)
ax.set_title("Word Cosine-Similarity", fontsize=title_size)
plt.savefig("wordSim.png", dpi=400, bbox_inches="tight")  # TODO: repeat this for the original training set case too!!

In [None]:
# Two-panel figure: PCA and t-SNE projections of every word vector in the model.
# NOTE(review): `model` is not assigned by any live cell above (only by a
# commented-out Doc2Vec.load), so it must already exist in the kernel.
fig, axes = plt.subplots(1, 2, figsize=(14, 6))
fig.suptitle('Word Embedding Dimensionality Reduction', fontsize=title_size + 3)

# Not read by this cell; kept so the name stays bound as before.
corpus_to_plot = mini_corpus  # TODO: change to be the IMDB corpus!!!!

wv = model.wv
word_ids = list(range(len(wv)))
X_words = wv[word_ids]

# Left panel: PCA. Words carry no class label, so all points share one colour.
ax = axes[0]
pca_words = PCA(n_components=2)
X_pca_words = pca_words.fit_transform(X_words)
ax.scatter(X_pca_words[:, 0], X_pca_words[:, 1])
ax.set_xlabel("PC 1", fontsize=ax_size)
ax.set_ylabel("PC 2", fontsize=ax_size)
ax.set_title("PCA", fontsize=title_size)

# Right panel: t-SNE, with a fixed seed so the layout is reproducible across runs.
ax = axes[1]
tsne_words = TSNE(n_components=2, random_state=0)
X_tsne_words = tsne_words.fit_transform(X_words)
ax.scatter(X_tsne_words[:, 0], X_tsne_words[:, 1])
ax.set_xlabel("Dim 1", fontsize=ax_size)
ax.set_ylabel("Dim 2", fontsize=ax_size)
ax.set_title("t-SNE", fontsize=title_size)

plt.savefig("dimRedIMDB_words.png", dpi=400, bbox_inches="tight")

In [None]:


# Pair each training document with its position in mini_corpus.train, then
# order the pairs from shortest to longest document.
sorted_docs = sorted(
    ((position, tokens) for position, (_, tokens) in enumerate(mini_corpus.train)),
    key=lambda pair: len(pair[1]),
)

# Pick the second-shortest document as the sample.
idx = 1
doc_id, doc_tokens = sorted_docs[idx]

print("Doc length: ", len(doc_tokens))
doc = ' '.join(doc_tokens)
print("Sample doc: ", doc)

# NOTE(review): `reviews` is not defined in any visible cell, and `doc_id` is
# computed above but never used -- presumably the query was meant to be the
# sample document; confirm.
dv.most_similar(reviews[100])

In [None]:
# evaluate_word_analogies expects the path of an analogy file (the
# questions-words format), not a list of words. gensim ships the standard file
# with its test data, and datapath() resolves it.
from gensim.test.utils import datapath

analogy_score, analogy_sections = wv.evaluate_word_analogies(datapath("questions-words.txt"))

# Show only the overall score; the per-section breakdown stays available in
# `analogy_sections`.
analogy_score

In [None]:
# Preview the first five reviews, each joined back into one space-separated string.
# NOTE(review): `reviews` is not defined in any visible cell -- confirm where it comes from.
list(map(' '.join, reviews[:5]))