In [None]:
%load_ext autoreload
%autoreload 2


import numpy as np
import math
from sklearn.model_selection import train_test_split
from multinomial_naive_bayes import MultinomialNaiveBayes
from data_processing import preprocess_data, get_dictionary, featurize_data
from statistical_tests import sign_test, permutation_test
from cross_validation import cross_validation, evaluate_classifier
from sklearn.svm import SVC
from gensim.models.doc2vec import Doc2Vec
import pickle

# Data

In [None]:
# Notebook-level configuration.
# `classes` lists the two label ids; it is not referenced again in the visible
# cells (presumably 0/1 map to the NEG/POS reviews -- TODO confirm).
classes = [0, 1]
# Directory handed to preprocess_data() when loading the tagged reviews.
data_path = 'datasets/data-tagged/'

In [None]:
# Load both sentiment subsets from `data_path`: first 'POS', then 'NEG'.
# Each preprocess_data() call returns a (documents, labels) pair.
(X_pos, y_pos), (X_neg, y_neg) = [
    preprocess_data(data_path, subset) for subset in ('POS', 'NEG')
]

In [None]:
# Concatenate the positive and negative subsets (positives first) into arrays
# so they can be fancy-indexed with lists of fold indices further down.
#
# dtype=object keeps exactly one array slot per document. The documents are
# presumably variable-length token sequences (TODO confirm against
# preprocess_data); building such a ragged array implicitly only warns on
# older NumPy releases and raises ValueError on NumPy >= 1.24.
X = np.array(X_pos + X_neg, dtype=object)
y = np.array(y_pos + y_neg)

In [None]:
# Round-robin fold assignment: sample i is placed in fold i % k.
k = 10
idxs = np.array(range(len(y)))

# Taking every k-th index starting at `start` yields exactly the indices whose
# remainder modulo k equals `start`, in ascending order.
folds_idxs = [list(idxs[start::k]) for start in range(k)]

In [None]:
# Hold out one of the round-robin folds for validation and train on the rest.
val_fold = 0
val_idxs = folds_idxs[val_fold]

# Every index that is not in the held-out fold. The set difference removes the
# validation indices; sorted() then pins the order, because set iteration order
# is not guaranteed and the row order of X_train / y_train should be the same
# on every run.
train_idxs = sorted(set(np.concatenate(folds_idxs)) - set(val_idxs))

X_train = X[train_idxs]
y_train = y[train_idxs]

X_val = X[val_idxs]
y_val = y[val_idxs]

# SVM

In [None]:
# Linear-kernel SVM with the shrinking heuristic disabled. This one estimator
# object is passed to every cross_validation() call in the cells below.
svm = SVC(
    shrinking=False,
    kernel='linear',
)

# DBOW Model (Doc2Vec, dm=0)

In [None]:
# Load the saved Doc2Vec checkpoint (its file name encodes vector size 200,
# min count 1, dm=0, 10 epochs) and infer one vector per training document.
# NOTE(review): infer_vector() may not be deterministic across runs -- confirm
# whether a seed needs to be fixed for reproducible features.
best_doc2vec_file = "d2v_200vecsize_1mincount_0dm_10epochs.model"
doc2vec = Doc2Vec.load(f'models/doc2vec/{best_doc2vec_file}')
X_train_feat = np.array(list(map(doc2vec.infer_vector, X_train)))

In [None]:
# Cross-validate the SVM on the features currently held in X_train_feat
# (the dm=0 model when the cells are run top to bottom).
# featurized=True is passed because the inputs are already numeric vectors.
y1_pred, y_true = cross_validation(
    svm,
    X_train_feat,
    y_train,
    featurized=True,
)

# DMPV Model (Doc2Vec, dm=1)

In [None]:
# Load the saved Doc2Vec checkpoint (its file name encodes vector size 200,
# min count 1, dm=1, 10 epochs) and infer one vector per training document.
# This rebinds `doc2vec` and `X_train_feat`, replacing the values from the
# previous section.
best_doc2vec_file = "d2v_200vecsize_1mincount_1dm_10epochs.model"
doc2vec = Doc2Vec.load(f'models/doc2vec/{best_doc2vec_file}')
X_train_feat = np.array(list(map(doc2vec.infer_vector, X_train)))

In [None]:
# Cross-validate the SVM on the features currently held in X_train_feat
# (the dm=1 model when the cells are run top to bottom).
# `y_true` is rebound by this call.
y2_pred, y_true = cross_validation(
    svm,
    X_train_feat,
    y_train,
    featurized=True,
)

# BoW Model

In [None]:
# Bag-of-words baseline: pass the raw training documents and let
# cross_validation() handle them with the unigram flag on and bigram flag off.
# `y_true` is rebound by this call.
y3_pred, y_true = cross_validation(
    svm,
    X_train,
    y_train,
    unigram=True,
    bigram=False,
)

# Statistical Test

In [None]:
# Permutation test between the dm=0 predictions (y1_pred) and the dm=1
# predictions (y2_pred). `y_true` comes from the most recent
# cross_validation() call -- assumes every call returns labels in the same
# order (TODO confirm).
# Left as a bare expression so the cell displays the returned value.
permutation_test(
    y1_pred,
    y2_pred,
    y_true,
)

In [None]:
# Permutation test between the dm=0 predictions (y1_pred) and the
# bag-of-words predictions (y3_pred), scored against `y_true`.
# Left as a bare expression so the cell displays the returned value.
permutation_test(
    y1_pred,
    y3_pred,
    y_true,
)