# Sentiment Classification with Multinomial Naive Bayes

In [None]:
%load_ext autoreload
%autoreload 2


import numpy as np
import math
from sklearn.model_selection import train_test_split
from multinomial_naive_bayes import MultinomialNaiveBayes
from util import preprocess_data, get_dictionary, featurize_data, sign_test, cross_validation
import scipy

## Read Data

In [None]:
# Label set handed to the classifier, and the root folder that preprocess_data reads from.
classes = [0, 1]
data_path = 'datasets/data-tagged/'

In [None]:
# Load the 'POS' documents first, then the 'NEG' ones; each call yields (documents, labels).
(X_pos, y_pos), (X_neg, y_neg) = (
    preprocess_data(data_path, tag) for tag in ('POS', 'NEG')
)

In [None]:
# Held-out split: the first 900 examples of each class are used for training,
# everything after index 900 is kept for testing (positives first, then negatives).
X_train, y_train = X_pos[:900] + X_neg[:900], y_pos[:900] + y_neg[:900]
X_test, y_test = X_pos[900:] + X_neg[900:], y_pos[900:] + y_neg[900:]

## Test Model with Held Out Set

In [None]:
# Baseline configuration: unigram features only, no smoothing.
# bigram_cutoff is still defined because get_dictionary is always passed both cutoffs.
unigram, bigram = True, False
unigram_cutoff, bigram_cutoff = 1, 7
smoothing = 0

In [None]:
# Build the token-to-index dictionary from the training split only, then map
# both splits through that same dictionary.
token_to_idx = get_dictionary(
    X_train,
    unigram=unigram,
    bigram=bigram,
    unigram_cutoff=unigram_cutoff,
    bigram_cutoff=bigram_cutoff,
)

X_feat_train, X_feat_test = (
    featurize_data(split, token_to_idx) for split in (X_train, X_test)
)

In [None]:
# The model is sized by the length of the first featurized training example.
model = MultinomialNaiveBayes(classes, len(X_feat_train[0]), smoothing_value=smoothing)

# Train on the featurized training split.
model.fit(X_feat_train, y_train)

In [None]:
%%time
# Predict the held-out split and count the positions where prediction and label agree.
y_pred = model.predict(X_feat_test)
n_correct = sum(1 for predicted, expected in zip(y_pred, y_test) if predicted == expected)

# Accuracy is reported as a percentage of the held-out examples.
print("{0:.2f}% of sentences are correctly classified".format(n_correct * 100 / len(X_test)))

In [None]:
def run():
    """Evaluate every model configuration on the held-out test set.

    Reads the notebook globals ``X_pos``, ``X_neg``, ``y_pos``, ``y_neg`` and
    ``classes``.  The first 900 examples of each class form the training
    split and the remaining examples form the test split.  For every feature
    setting (unigrams, bigrams or both, with the listed cutoffs) and every
    smoothing value, a ``MultinomialNaiveBayes`` model is trained and the
    configuration together with its test accuracy is printed.

    A cutoff that belongs to a disabled feature type is only tried at its
    first listed value (presumably because it cannot affect the dictionary),
    so such configurations are not reported twice.

    Returns:
        None. Results are printed.
    """
    from itertools import product

    smoothings = [0, 1]
    unigrams = [True, False]
    bigrams = [False, True]
    unigram_cutoffs = [1, 4]
    bigram_cutoffs = [1, 7]

    # The raw split is identical for every configuration, so slice it once
    # instead of once per innermost iteration.
    raw_train = X_pos[:900] + X_neg[:900]
    y_train = y_pos[:900] + y_neg[:900]
    raw_test = X_pos[900:] + X_neg[900:]
    y_test = y_pos[900:] + y_neg[900:]

    header = "unigram: {}, bigram: {}, unigram_cutoff: {}, bigram_cutoff: {}, smoothing: {}"

    # product() iterates in the same order as the original nested loops
    # (the rightmost iterable varies fastest).
    for unigram, bigram, unigram_cutoff, bigram_cutoff in product(
        unigrams, bigrams, unigram_cutoffs, bigram_cutoffs
    ):
        if not unigram and not bigram:
            # A model needs at least one feature type.
            continue
        if not unigram and unigram_cutoff != unigram_cutoffs[0]:
            continue
        if not bigram and bigram_cutoff != bigram_cutoffs[0]:
            continue

        # The dictionary and the features do not depend on the smoothing
        # value, so build them once and share them across smoothing settings.
        token_to_idx = get_dictionary(
            raw_train,
            unigram_cutoff=unigram_cutoff,
            bigram_cutoff=bigram_cutoff,
            unigram=unigram,
            bigram=bigram,
        )
        feat_train = featurize_data(raw_train, token_to_idx)
        feat_test = featurize_data(raw_test, token_to_idx)

        for smoothing in smoothings:
            print(header.format(unigram, bigram, unigram_cutoff, bigram_cutoff, smoothing))

            model = MultinomialNaiveBayes(classes, len(feat_train[0]), smoothing_value=smoothing)
            model.fit(feat_train, y_train)
            y_pred = model.predict(feat_test)
            n_correct = sum(1 for predicted, expected in zip(y_pred, y_test) if predicted == expected)

            print("{0:.2f}% of sentences are correctly classified \n".format(n_correct * 100 / len(feat_test)))

In [None]:
# Print the held-out accuracy of every configuration enumerated by run().
run()

## Sign Test on Held Out Test Set

Let's compare each smoothed model with its unsmoothed counterpart, trained and tested on the same features.

### Unigrams

In [None]:
# Unigram-only features with unigram cutoff 4.
# bigram_cutoff is still defined because get_dictionary is always passed both cutoffs.
unigram, bigram = True, False
unigram_cutoff, bigram_cutoff = 4, 1

In [None]:
# Build the token-to-index dictionary from the training split only, then map
# both splits through that same dictionary.
token_to_idx = get_dictionary(
    X_train,
    unigram=unigram,
    bigram=bigram,
    unigram_cutoff=unigram_cutoff,
    bigram_cutoff=bigram_cutoff,
)

X_feat_train, X_feat_test = (
    featurize_data(split, token_to_idx) for split in (X_train, X_test)
)

In [None]:
# Two models over identical features: model1 is smoothed (smoothing_value=1),
# model2 is not (smoothing_value=0).
model1, model2 = (
    MultinomialNaiveBayes(classes, len(X_feat_train[0]), smoothing_value=value)
    for value in (1, 0)
)

# Fit both on the same training features so smoothing is the only difference.
model1.fit(X_feat_train, y_train)
model2.fit(X_feat_train, y_train)

# Score the same held-out examples with each model.
y1_pred, y2_pred = (clf.predict(X_feat_test) for clf in (model1, model2))

# Paired comparison of the two prediction lists against the held-out labels.
sign_test(y1_pred, y2_pred, y_test)

### Bigrams

In [None]:
# Bigram-only features with bigram cutoff 7.
# unigram_cutoff is still defined because get_dictionary is always passed both cutoffs.
unigram, bigram = False, True
unigram_cutoff, bigram_cutoff = 1, 7

In [None]:
# Build the token-to-index dictionary from the training split only, then map
# both splits through that same dictionary.
token_to_idx = get_dictionary(
    X_train,
    unigram=unigram,
    bigram=bigram,
    unigram_cutoff=unigram_cutoff,
    bigram_cutoff=bigram_cutoff,
)

X_feat_train, X_feat_test = (
    featurize_data(split, token_to_idx) for split in (X_train, X_test)
)

In [None]:
# Two models over identical features: model1 is smoothed (smoothing_value=1),
# model2 is not (smoothing_value=0).
model1, model2 = (
    MultinomialNaiveBayes(classes, len(X_feat_train[0]), smoothing_value=value)
    for value in (1, 0)
)

# Fit both on the same training features so smoothing is the only difference.
model1.fit(X_feat_train, y_train)
model2.fit(X_feat_train, y_train)

# Score the same held-out examples with each model.
y1_pred, y2_pred = (clf.predict(X_feat_test) for clf in (model1, model2))

# Paired comparison of the two prediction lists against the held-out labels.
sign_test(y1_pred, y2_pred, y_test)

### Unigrams + Bigrams

In [None]:
# Unigram and bigram features together, with cutoffs 4 and 7 respectively.
unigram, bigram = True, True
unigram_cutoff, bigram_cutoff = 4, 7

In [None]:
# Build the token-to-index dictionary from the training split only, then map
# both splits through that same dictionary.
token_to_idx = get_dictionary(
    X_train,
    unigram=unigram,
    bigram=bigram,
    unigram_cutoff=unigram_cutoff,
    bigram_cutoff=bigram_cutoff,
)

X_feat_train, X_feat_test = (
    featurize_data(split, token_to_idx) for split in (X_train, X_test)
)

In [None]:
# Two models over identical features: model1 is smoothed (smoothing_value=1),
# model2 is not (smoothing_value=0).
model1, model2 = (
    MultinomialNaiveBayes(classes, len(X_feat_train[0]), smoothing_value=value)
    for value in (1, 0)
)

# Fit both on the same training features so smoothing is the only difference.
model1.fit(X_feat_train, y_train)
model2.fit(X_feat_train, y_train)

# Score the same held-out examples with each model.
y1_pred, y2_pred = (clf.predict(X_feat_test) for clf in (model1, model2))

# Paired comparison of the two prediction lists against the held-out labels.
sign_test(y1_pred, y2_pred, y_test)

### Unigrams vs Bigrams both with smoothing

In [None]:
# First model of the comparison: unigram-only features with unigram cutoff 4.
# bigram_cutoff is still defined because get_dictionary is always passed both cutoffs.
unigram, bigram = True, False
unigram_cutoff, bigram_cutoff = 4, 1

In [None]:
# Build the token-to-index dictionary from the training split only, then map
# both splits through that same dictionary.
token_to_idx = get_dictionary(
    X_train,
    unigram=unigram,
    bigram=bigram,
    unigram_cutoff=unigram_cutoff,
    bigram_cutoff=bigram_cutoff,
)

X_feat_train, X_feat_test = (
    featurize_data(split, token_to_idx) for split in (X_train, X_test)
)

In [None]:
# Smoothed (smoothing_value=1) model on the unigram features built in the previous cell.
model1 = MultinomialNaiveBayes(classes, len(X_feat_train[0]), smoothing_value=1)
model1.fit(X_feat_train, y_train)
# Keep the predictions now: X_feat_test is overwritten when the bigram features are built.
y1_pred = model1.predict(X_feat_test)

In [None]:
# Second model of the comparison: bigram-only features with bigram cutoff 7.
# unigram_cutoff is still defined because get_dictionary is always passed both cutoffs.
unigram, bigram = False, True
unigram_cutoff, bigram_cutoff = 1, 7

In [None]:
# Build the token-to-index dictionary from the training split only, then map
# both splits through that same dictionary.
token_to_idx = get_dictionary(
    X_train,
    unigram=unigram,
    bigram=bigram,
    unigram_cutoff=unigram_cutoff,
    bigram_cutoff=bigram_cutoff,
)

X_feat_train, X_feat_test = (
    featurize_data(split, token_to_idx) for split in (X_train, X_test)
)

In [None]:
# Smoothed (smoothing_value=1) model on the bigram features built in the previous cell.
model2 = MultinomialNaiveBayes(classes, len(X_feat_train[0]), smoothing_value=1)
model2.fit(X_feat_train, y_train)
# Predictions on the same held-out examples that model1 was scored on.
y2_pred = model2.predict(X_feat_test)

In [None]:
# y1_pred comes from the unigram model and y2_pred from the bigram model; both are
# compared against the same held-out labels.
sign_test(y1_pred, y2_pred, y_test)

### Unigrams + Bigrams vs Unigrams both with smoothing

In [None]:
# First model of the comparison: unigram and bigram features with cutoffs 4 and 7.
unigram, bigram = True, True
unigram_cutoff, bigram_cutoff = 4, 7

In [None]:
# Build the token-to-index dictionary from the training split only, then map
# both splits through that same dictionary.
token_to_idx = get_dictionary(
    X_train,
    unigram=unigram,
    bigram=bigram,
    unigram_cutoff=unigram_cutoff,
    bigram_cutoff=bigram_cutoff,
)

X_feat_train, X_feat_test = (
    featurize_data(split, token_to_idx) for split in (X_train, X_test)
)

In [None]:
# Smoothed (smoothing_value=1) model on the unigram + bigram features built in the previous cell.
model1 = MultinomialNaiveBayes(classes, len(X_feat_train[0]), smoothing_value=1)
model1.fit(X_feat_train, y_train)
# Keep the predictions now: X_feat_test is overwritten when the unigram-only features are built.
y1_pred = model1.predict(X_feat_test)

In [None]:
# Second model of the comparison: unigram-only features with unigram cutoff 4.
# bigram_cutoff is still defined because get_dictionary is always passed both cutoffs.
unigram, bigram = True, False
unigram_cutoff, bigram_cutoff = 4, 1

In [None]:
# Build the token-to-index dictionary from the training split only, then map
# both splits through that same dictionary.
token_to_idx = get_dictionary(
    X_train,
    unigram=unigram,
    bigram=bigram,
    unigram_cutoff=unigram_cutoff,
    bigram_cutoff=bigram_cutoff,
)

X_feat_train, X_feat_test = (
    featurize_data(split, token_to_idx) for split in (X_train, X_test)
)

In [None]:
# Smoothed (smoothing_value=1) model on the unigram-only features built in the previous cell.
model2 = MultinomialNaiveBayes(classes, len(X_feat_train[0]), smoothing_value=1)
model2.fit(X_feat_train, y_train)
# Predictions on the same held-out examples that model1 was scored on.
y2_pred = model2.predict(X_feat_test)

In [None]:
# y1_pred comes from the unigram + bigram model and y2_pred from the unigram-only model;
# both are compared against the same held-out labels.
sign_test(y1_pred, y2_pred, y_test)

## Cross Validation

### Evaluate all models using cross validation

In [None]:
def run_full_cv(X_pos, X_neg, y_pos, y_neg):
    """Cross-validate every feature-type / smoothing combination.

    The positive and negative examples are concatenated (positives first)
    and converted to NumPy arrays.  For each combination that enables at
    least one of unigrams / bigrams, and for each smoothing value, the
    configuration is printed and ``cross_validation`` is called with a fresh
    ``MultinomialNaiveBayes`` model.  The cutoffs are fixed at 4 (unigram)
    and 7 (bigram).  The value returned by ``cross_validation`` is discarded.

    Args:
        X_pos: Positive-class examples (a list, concatenated with ``X_neg``).
        X_neg: Negative-class examples.
        y_pos: Labels for ``X_pos``.
        y_neg: Labels for ``X_neg``.
    """
    unigram_cutoff, bigram_cutoff = 4, 7

    # NOTE(review): if the examples are variable-length sequences, recent NumPy
    # releases reject this conversion without dtype=object -- confirm against
    # what preprocess_data returns.
    X = np.array(X_pos + X_neg)
    y = np.array(y_pos + y_neg)

    # Same visiting order as three nested loops: unigram, then bigram, then smoothing.
    settings = [
        (use_unigram, use_bigram, alpha)
        for use_unigram in (True, False)
        for use_bigram in (False, True)
        for alpha in (0, 1)
        if use_unigram or use_bigram
    ]

    header = "unigram: {}, bigram: {}, unigram_cutoff: {}, bigram_cutoff: {}, smoothing: {}"
    for use_unigram, use_bigram, alpha in settings:
        print(header.format(use_unigram, use_bigram, unigram_cutoff, bigram_cutoff, alpha))

        # NOTE(review): len(X[0]) is the length of the first raw example, not a
        # featurized row -- confirm that cross_validation does not rely on the
        # model's initial feature count.
        model = MultinomialNaiveBayes(classes, len(X[0]), smoothing_value=alpha)
        cross_validation(
            model,
            X,
            y,
            unigram=use_unigram,
            bigram=use_bigram,
            unigram_cutoff=unigram_cutoff,
            bigram_cutoff=bigram_cutoff,
        )

In [None]:
# Cross-validate every feature-type / smoothing combination on the full corpus.
run_full_cv(X_pos, X_neg, y_pos, y_neg)

## Sign Test
Run the sign test on the cross-validation results.

### Unigrams

In [None]:
# Full corpus (positives followed by negatives) as NumPy arrays for cross-validation.
# NOTE(review): if the examples are variable-length sequences, recent NumPy releases
# reject this conversion without dtype=object -- confirm against preprocess_data's output.
X, y = np.array(X_pos + X_neg), np.array(y_pos + y_neg)

In [None]:
# Unigram-only features with unigram cutoff 4 for the cross-validated comparison.
# bigram_cutoff is still defined because cross_validation is passed both cutoffs.
unigram, bigram = True, False
unigram_cutoff, bigram_cutoff = 4, 1

In [None]:
# model1 is smoothed (smoothing_value=1), model2 is not (smoothing_value=0).
# NOTE(review): the feature count is taken from X_feat_train, which is left over from
# the held-out experiments earlier in the notebook rather than derived from X --
# confirm that cross_validation does not rely on it.
model1, model2 = (
    MultinomialNaiveBayes(classes, len(X_feat_train[0]), smoothing_value=value)
    for value in (1, 0)
)

# Cross-validate both models with the same feature settings; each call is unpacked
# into predictions and the labels they were scored against.
y1_pred, y1_test = cross_validation(
    model1,
    X,
    y,
    unigram=unigram,
    bigram=bigram,
    unigram_cutoff=unigram_cutoff,
    bigram_cutoff=bigram_cutoff,
)
y2_pred, y2_test = cross_validation(
    model2,
    X,
    y,
    unigram=unigram,
    bigram=bigram,
    unigram_cutoff=unigram_cutoff,
    bigram_cutoff=bigram_cutoff,
)

# The paired test only makes sense if both runs were scored against identical labels.
assert np.array_equal(y1_test, y2_test)
sign_test(y1_pred, y2_pred, y1_test)