In [None]:
%load_ext autoreload
%autoreload 2


import numpy as np
import math
from sklearn.model_selection import train_test_split
from multinomial_naive_bayes import MultinomialNaiveBayes
from data_processing import preprocess_data, get_dictionary, featurize_data
from statistical_tests import sign_test, permutation_test
from cross_validation import cross_validation, evaluate_classifier
from sklearn.svm import SVC
from gensim.models.doc2vec import Doc2Vec
import pickle

## Read Data

In [None]:
# Directory of the tagged dataset; handed to preprocess_data in the next cell.
data_path = 'datasets/data-tagged/'
# The two label values — presumably 0/1 for NEG/POS; TODO confirm against
# preprocess_data. Not referenced again in the visible cells.
classes = [0, 1]

In [None]:
# Load both halves of the corpus in one statement. The generator is consumed
# in order by the unpacking, so 'POS' is processed before 'NEG'.
(X_pos, y_pos), (X_neg, y_neg) = (
    preprocess_data(data_path, tag) for tag in ('POS', 'NEG')
)

## Model Selection

In [None]:
# Join the POS and NEG examples (POS first, assuming preprocess_data returns
# lists so that `+` concatenates) and wrap them in arrays so they can be
# indexed with the fold index lists built below.
# NOTE(review): if the documents are variable-length token lists, recent NumPy
# releases refuse to build a ragged array without dtype=object — confirm what
# preprocess_data returns.
X = np.array(X_pos + X_neg)
y = np.array(y_pos + y_neg)

In [None]:
# Round-robin fold assignment: sample i belongs to fold i % k.
k = 10
idxs = np.array(range(len(y)))

# A stride-k slice starting at `fold` selects exactly the indices whose
# remainder modulo k equals `fold`, in ascending order.
folds_idxs = [list(idxs[fold::k]) for fold in range(k)]

In [None]:
# Hold out one fold for validation and train on the remaining k-1 folds.
val_fold = 0
val_idxs = folds_idxs[val_fold]
# Sort explicitly: set iteration order is an implementation detail, and the
# row order of X_train / y_train (which later cells feed to cross-validation)
# should not depend on it.
train_idxs = sorted(set(np.concatenate(folds_idxs)) - set(val_idxs))

X_train = X[train_idxs]
y_train = y[train_idxs]

X_val = X[val_idxs]
y_val = y[val_idxs]

In [None]:
# Linear-kernel SVM with the shrinking heuristic switched off.
# This same estimator object is passed to every evaluate_classifier call in
# the "Models" section below.
model = SVC(kernel='linear', shrinking=False)

### Models

In [None]:
%%time
# Candidate doc2vec model — file name suggests vector size 5, min_count 1,
# dm=1, 10 epochs (unverified). Scored on the held-out validation fold.
evaluate_classifier(model, X_train, y_train, X_val, y_val, doc2vec_file="d2v_5vecsize_1mincount_1dm_10epochs.model")

In [None]:
%%time
# Candidate doc2vec model — file name suggests vector size 50, min_count 1,
# dm=1, 10 epochs (unverified).
evaluate_classifier(model, X_train, y_train, X_val, y_val, doc2vec_file="d2v_50vecsize_1mincount_1dm_10epochs.model")

In [None]:
%%time
# Candidate doc2vec model — file name suggests vector size 100, min_count 1,
# dm=1, 10 epochs (unverified).
evaluate_classifier(model, X_train, y_train, X_val, y_val, doc2vec_file="d2v_100vecsize_1mincount_1dm_10epochs.model")

In [None]:
%%time
# Candidate doc2vec model — file name suggests vector size 200, min_count 1,
# dm=1, 10 epochs (unverified).
evaluate_classifier(model, X_train, y_train, X_val, y_val, doc2vec_file="d2v_200vecsize_1mincount_1dm_10epochs.model")

In [None]:
%%time
# Candidate doc2vec model — file name suggests vector size 100, min_count 3,
# dm=1, 10 epochs (unverified).
evaluate_classifier(model, X_train, y_train, X_val, y_val, doc2vec_file="d2v_100vecsize_3mincount_1dm_10epochs.model")

In [None]:
%%time
# Candidate doc2vec model — file name suggests vector size 200, min_count 3,
# dm=1, 10 epochs (unverified).
evaluate_classifier(model, X_train, y_train, X_val, y_val, doc2vec_file="d2v_200vecsize_3mincount_1dm_10epochs.model")

In [None]:
%%time
# Candidate doc2vec model — file name suggests vector size 100, min_count 1,
# dm=0, 10 epochs (unverified).
evaluate_classifier(model, X_train, y_train, X_val, y_val, doc2vec_file="d2v_100vecsize_1mincount_0dm_10epochs.model")

In [None]:
%%time
# Candidate doc2vec model — file name suggests vector size 200, min_count 1,
# dm=0, 10 epochs (unverified). This is the file the later sections load as
# best_doc2vec_file.
evaluate_classifier(model, X_train, y_train, X_val, y_val, doc2vec_file="d2v_200vecsize_1mincount_0dm_10epochs.model")

In [None]:
%%time
# Candidate doc2vec model — file name suggests vector size 100, min_count 3,
# dm=0, 10 epochs (unverified).
evaluate_classifier(model, X_train, y_train, X_val, y_val, doc2vec_file="d2v_100vecsize_3mincount_0dm_10epochs.model")

In [None]:
%%time
# NOTE(review): this passes the same doc2vec file (100vecsize_1mincount_0dm)
# as an earlier cell in this section, so it re-runs an identical
# configuration. Likely a leftover duplicate — confirm and drop, or point it
# at the configuration that was actually intended.
evaluate_classifier(model, X_train, y_train, X_val, y_val, doc2vec_file="d2v_100vecsize_1mincount_0dm_10epochs.model")

In [None]:
%%time
# Candidate doc2vec model — file name suggests vector size 200, min_count 3,
# dm=0, 10 epochs (unverified).
evaluate_classifier(model, X_train, y_train, X_val, y_val, doc2vec_file="d2v_200vecsize_3mincount_0dm_10epochs.model")

## Cross Validation

In [None]:
# Doc2Vec model used for the rest of the notebook — presumably the best
# performer from the comparison above.
best_doc2vec_file = "d2v_200vecsize_1mincount_0dm_10epochs.model"
doc2vec = Doc2Vec.load(f'models/doc2vec/{best_doc2vec_file}')

In [None]:
# Embed every training document with the loaded doc2vec model.
X_feat = np.array(list(map(doc2vec.infer_vector, X_train)))

In [None]:
# Fresh, unfitted linear SVM for the cross-validation run (same settings as
# in model selection).
model = SVC(kernel='linear', shrinking=False)

In [None]:
# Cross-validate on the pre-computed doc2vec vectors of the training split.
# featurized=True presumably tells cross_validation that X_feat is already
# numeric and needs no further featurization — TODO confirm.
cross_validation(model, X_feat, y_train, featurized=True)

## Train Best Model

In [None]:
# Load the chosen doc2vec model again so this section can run on its own,
# then embed the full dataset X (not just the training split).
best_doc2vec_file = "d2v_200vecsize_1mincount_0dm_10epochs.model"
doc2vec = Doc2Vec.load(f'models/doc2vec/{best_doc2vec_file}')
X_feat = np.array(list(map(doc2vec.infer_vector, X)))

In [None]:
# Final classifier: fit a fresh linear SVM on the vectors of the whole dataset.
model = SVC(kernel='linear', shrinking=False)
model.fit(X_feat, y)

In [None]:
# Persist the fitted classifier. The context manager guarantees the file is
# flushed and closed even if pickling raises, instead of leaving the handle
# open until garbage collection.
with open('models/best_doc2vec+svm_model.dump', 'wb') as model_file:
    pickle.dump(model, model_file)

## Statistical Test

In [None]:
best_doc2vec_file = "d2v_200vecsize_1mincount_0dm_10epochs.model"
# Load from 'models/doc2vec/', the directory the cross-validation and training
# sections use for this same file; the bare 'doc2vec/' prefix pointed at a
# different location.
doc2vec = Doc2Vec.load('models/doc2vec/' + best_doc2vec_file)
# Embed the training split with the best model for the first CV run.
X_feat = np.array([doc2vec.infer_vector(x) for x in X_train])

In [None]:
# Cross-validate the SVM on the best model's vectors. cross_validation returns
# a pair; only the first element is kept (used as system 1's predictions), the
# second is discarded here.
model = SVC(kernel='linear', shrinking=False)
y1_pred, _ = cross_validation(model, X_feat, y_train, featurized=True)

In [None]:
other_doc2vec_file = "d2v_100vecsize_1mincount_0dm_10epochs.model"
# Load from 'models/doc2vec/', the directory the cross-validation and training
# sections use for doc2vec models; the bare 'doc2vec/' prefix pointed at a
# different location.
other_doc2vec = Doc2Vec.load('models/doc2vec/' + other_doc2vec_file)
# X_feat is overwritten with the comparison model's vectors, so the CV run for
# the best model must already have consumed the previous X_feat.
X_feat = np.array([other_doc2vec.infer_vector(x) for x in X_train])

In [None]:
# Cross-validate a fresh SVM on the comparison model's vectors. Both elements
# of the returned pair are kept: system 2's predictions and, presumably, the
# matching true labels — TODO confirm the return order in cross_validation.
model = SVC(kernel='linear', shrinking=False)
y2_pred, y_true = cross_validation(model, X_feat, y_train, featurized=True)

In [None]:
# Permutation test comparing the two systems' predictions against the labels.
# NOTE(review): y_true comes from the second cross_validation call only; this
# assumes both runs return predictions in the same sample order — confirm.
permutation_test(y1_pred, y2_pred, y_true)