In [None]:
%load_ext autoreload
%autoreload 2

import numpy as np
import math
from sklearn.model_selection import train_test_split
from multinomial_naive_bayes import MultinomialNaiveBayes
from data_processing import preprocess_data, get_dictionary, featurize_data
from statistical_tests import sign_test, permutation_test
from cross_validation import cross_validation, evaluate_classifier
from sklearn.svm import SVC

## Read Data

In [None]:
# Class label values (presumably one per sentiment class — confirm against preprocess_data).
classes = [0, 1]
# Root folder of the tagged corpus; handed to preprocess_data when the data is read.
data_path = 'datasets/data-tagged/'

In [None]:
# Read both tagged subsets with the same call: 'POS' first, then 'NEG'.
(X_pos, y_pos), (X_neg, y_neg) = (
    preprocess_data(data_path, tag) for tag in ('POS', 'NEG')
)

## Held-out Test Set

In [None]:
# Join the POS data with the NEG data (POS operand first) and wrap each result in a NumPy array.
# NOTE(review): if preprocess_data returns variable-length token lists, recent NumPy
# versions require dtype=object here — confirm what the documents look like.
X, y = np.array(X_pos + X_neg), np.array(y_pos + y_neg)

In [None]:
# Round-robin fold assignment: sample i goes to fold i % k.
k = 10
idxs = np.array(range(len(y)))

# A stride-k slice starting at offset `fold` picks exactly the indices with
# idx % k == fold, in ascending order.
folds_idxs = [list(idxs[fold::k]) for fold in range(k)]

In [None]:
# Hold out one fold for validation and train on all the others.
val_fold = 0
val_idxs = folds_idxs[val_fold]

# Sets are unordered, so sort the difference explicitly: the training rows then
# come out in ascending index order on every run and every interpreter instead
# of depending on set iteration order.
train_idxs = sorted(set(np.concatenate(folds_idxs)) - set(val_idxs))

# Index the arrays with the index lists to materialise the two splits.
X_train, y_train = X[train_idxs], y[train_idxs]
X_val, y_val = X[val_idxs], y[val_idxs]

### Models

In [None]:
# Linear-kernel SVC with the shrinking heuristic switched off. This single
# instance is what gets passed to the evaluate_classifier calls in this section.
model = SVC(
    shrinking=False,
    kernel='linear',
)

In [None]:
%%time
# Unigram features only. The result is later passed to permutation_test.
y1_pred = evaluate_classifier(
    model,
    X_train, y_train,
    X_val, y_val,
    unigram=True,
    bigram=False,
)


In [None]:
%%time
# Bigram features only. The result is later passed to permutation_test.
y2_pred = evaluate_classifier(
    model,
    X_train, y_train,
    X_val, y_val,
    unigram=False,
    bigram=True,
)


In [None]:
%%time
# Unigram and bigram features together. The result is later passed to permutation_test.
y3_pred = evaluate_classifier(
    model,
    X_train, y_train,
    X_val, y_val,
    unigram=True,
    bigram=True,
)


## Cross Validation

In [None]:
def run_full_cv(unigram_cutoff=4, bigram_cutoff=7):
    """Cross-validate a linear SVC for each unigram/bigram feature combination.

    Runs three configurations in this order: unigrams only, unigrams plus
    bigrams, bigrams only. For each one the settings are printed and a fresh
    ``SVC`` is handed to ``cross_validation`` together with the full dataset.

    Reads the notebook-level ``X_pos``, ``X_neg``, ``y_pos`` and ``y_neg``.

    Parameters
    ----------
    unigram_cutoff : int, default 4
        Forwarded unchanged to ``cross_validation`` as ``unigram_cutoff``.
    bigram_cutoff : int, default 7
        Forwarded unchanged to ``cross_validation`` as ``bigram_cutoff``.

    Returns
    -------
    None
        Whatever ``cross_validation`` returns is discarded.
    """
    X = np.array(X_pos + X_neg)
    y = np.array(y_pos + y_neg)

    # (unigram, bigram) flag pairs. The all-False pair is left out, presumably
    # because it would enable no feature type at all.
    feature_configs = [(True, False), (True, True), (False, True)]

    for unigram, bigram in feature_configs:
        print("unigram: {}, bigram: {}, unigram_cutoff: {}, bigram_cutoff: {}".format(unigram, bigram, unigram_cutoff, bigram_cutoff))
        # A new estimator per configuration, so no instance is shared between runs.
        model = SVC(kernel='linear', shrinking=False)
        cross_validation(
            model, X, y,
            unigram=unigram,
            bigram=bigram,
            unigram_cutoff=unigram_cutoff,
            bigram_cutoff=bigram_cutoff,
        )

## Monte Carlo Permutation Test

In [None]:
# Compare the unigram-only results (y1_pred) with the bigram-only results (y2_pred),
# passing y_val as the third argument.
# NOTE(review): no random seed is set in the cells shown here; if permutation_test
# samples randomly, its output will vary between runs — confirm.
permutation_test(y1_pred, y2_pred, y_val)

In [None]:
# Compare the unigram-only results (y1_pred) with the unigram+bigram results (y3_pred),
# passing y_val as the third argument.
# NOTE(review): no random seed is set in the cells shown here; if permutation_test
# samples randomly, its output will vary between runs — confirm.
permutation_test(y1_pred, y3_pred, y_val)