In [None]:
import pandas as pd
from nltk.tokenize.casual import TweetTokenizer
import pickle
from gensim.models.doc2vec import Doc2Vec
import numpy as np

# Model

In [None]:
# Load the pickled classifier (the file name suggests an SVM fitted on doc2vec vectors).
# The context manager closes the file handle as soon as unpickling finishes.
# NOTE(security): pickle.load can execute arbitrary code stored in the file;
# only load model dumps that come from a trusted source.
with open("models/svm/best_doc2vec+svm_model.dump", "rb") as model_file:
    model = pickle.load(model_file)

In [None]:
# Name of the saved Doc2Vec model file, resolved relative to models/doc2vec/.
best_doc2vec_file = "d2v_200vecsize_1mincount_0dm_10epochs.model"
doc2vec_path = 'models/doc2vec/' + best_doc2vec_file
doc2vec = Doc2Vec.load(doc2vec_path)

# IMDB Dataset

In [None]:
# Read the IMDB master table; the file is decoded as ISO-8859-1 rather than
# pandas' default UTF-8.
data = pd.read_csv(
    'datasets/imdb/imdb_master.csv',
    encoding="ISO-8859-1",
)

In [None]:
# Keep only the rows flagged as the test split.
# .copy() makes data_test an independent frame, so the column assignments
# performed on it later do not write into a slice of `data`
# (which would raise SettingWithCopyWarning).
data_test = data[data['type'] == 'test'].copy()

In [None]:
# Split every review into tokens with NLTK's TweetTokenizer, and encode the
# sentiment column as integers: 'pos' -> 1, anything else -> 0.
tknzr = TweetTokenizer()

data_test['review'] = data_test['review'].apply(tknzr.tokenize)
data_test['label'] = data_test['label'].apply(lambda sentiment: int(sentiment == 'pos'))

In [None]:
# Pull the token lists (inputs) and integer labels (targets) out of the frame
# as plain NumPy arrays.
X_test, y_test = data_test['review'].values, data_test['label'].values

In [None]:
# Infer one doc2vec vector per tokenised review and collect them in an array.
X_test_feat = np.array([
    doc2vec.infer_vector(tokens)
    for tokens in X_test
])

In [None]:
# Predict a label for every feature vector and report the share of
# predictions that match the true labels.
y_pred = model.predict(X_test_feat)
n_correct = sum(
    1
    for predicted, actual in zip(y_pred, y_test)
    if predicted == actual
)

accuracy_pct = n_correct * 100 / len(X_test_feat)
print("{0:.2f}% of sentences are correctly classified".format(accuracy_pct))

# 2018 Selected Reviews

In [None]:
# Rebind data_test to the 2018 review file; the cells of this section
# operate on this frame instead of the IMDB master test split.
data_test = pd.read_csv(
    'datasets/imdb/imdb_2018.csv',
)

In [None]:
# Split every review into tokens with NLTK's TweetTokenizer, and encode the
# sentiment column as integers: 'pos' -> 1, anything else -> 0.
tknzr = TweetTokenizer()

data_test['review'] = data_test['review'].apply(tknzr.tokenize)
data_test['label'] = data_test['label'].apply(lambda sentiment: int(sentiment == 'pos'))

In [None]:
# Pull the token lists (inputs) and integer labels (targets) out of the frame
# as plain NumPy arrays.
X_test, y_test = data_test['review'].values, data_test['label'].values

In [None]:
# Mean number of tokens per review in the current test set.
avg_len = sum(map(len, X_test)) / len(X_test)
avg_len

In [None]:
# Infer one doc2vec vector per tokenised review and collect them in an array.
X_test_feat = np.array([
    doc2vec.infer_vector(tokens)
    for tokens in X_test
])

In [None]:
# Predict a label for every feature vector and report the share of
# predictions that match the true labels.
y_pred = model.predict(X_test_feat)
n_correct = sum(
    1
    for predicted, actual in zip(y_pred, y_test)
    if predicted == actual
)

accuracy_pct = n_correct * 100 / len(X_test_feat)
print("{0:.2f}% of sentences are correctly classified".format(accuracy_pct))

In [None]:
# Print every review whose predicted label differs from the true label,
# together with its star rating and the re-joined token text.
for i, (actual, predicted) in enumerate(zip(y_test, y_pred)):
    if actual != predicted:
        print("Review number {}: model predicted {} but correct label was {}".format(i, predicted, actual))
        # i is a position in the label/prediction arrays, so the matching row
        # is fetched positionally; .iloc stays correct even when the frame's
        # index labels are not 0..n-1.
        print("The review has {} stars".format(data_test['stars'].iloc[i]))
        print()
        print(' '.join(data_test['review'].iloc[i]))
        print()
        print()

Compute the decision-function values for the misclassified reviews and compare them with those of the correctly classified reviews.

In [None]:
# Decision-function score for review 0. The index is hard-coded; presumably it
# is one of the misclassified reviews listed earlier — re-check after a re-run.
# The single vector is reshaped to one row because a 2-D input is passed.
single_review = X_test_feat[0].reshape(1, -1)
model.decision_function(single_review)

In [None]:
# Decision-function score for review 13. The index is hard-coded; presumably it
# is one of the misclassified reviews listed earlier — re-check after a re-run.
# The single vector is reshaped to one row because a 2-D input is passed.
single_review = X_test_feat[13].reshape(1, -1)
model.decision_function(single_review)

In [None]:
# Decision-function score for review 23. The index is hard-coded; presumably it
# is one of the misclassified reviews listed earlier — re-check after a re-run.
# The single vector is reshaped to one row because a 2-D input is passed.
single_review = X_test_feat[23].reshape(1, -1)
model.decision_function(single_review)

In [None]:
# Mean absolute decision-function score over the correctly classified reviews,
# for comparison with the scores of the individual reviews inspected above.
# The selection of correctly classified rows is built in this cell so that it
# runs on a fresh kernel (Restart & Run All) without raising NameError.
X_test_feat_correct = X_test_feat[y_pred == y_test]
np.mean(np.abs(model.decision_function(X_test_feat_correct)))

In [None]:
# Mean token count of the three hard-coded reviews (indices 13, 0 and 23).
sum(len(X_test[idx]) for idx in (13, 0, 23)) / 3

In [None]:
# Keep only the feature rows whose predicted label equals the true label.
is_correct = (y_test == y_pred)
X_test_feat_correct = X_test_feat[is_correct]