# Sentiment Classification with Multinomial Naive Bayes

In [50]:
%load_ext autoreload
%autoreload 2


import numpy as np
import math
from sklearn.model_selection import train_test_split
from multinomial_naive_bayes import MultinomialNaiveBayes
from util import preprocess_data, get_dictionary, featurize_data, sign_test, cross_validation
from sklearn.svm import SVC

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Read Data

In [51]:
# Label set handed to the classifier, and the corpus directory read by preprocess_data.
# NOTE(review): presumably 0 = negative and 1 = positive — confirm against preprocess_data.
classes = [0, 1]
data_path = 'datasets/data-tagged/'

In [52]:
# Load each tag's documents and labels in turn: 'POS' first, then 'NEG'.
(X_pos, y_pos), (X_neg, y_neg) = (
    preprocess_data(data_path, tag) for tag in ('POS', 'NEG')
)

In [53]:
# token_to_idx = get_dictionary(X_train, unigram_cutoff=unigram_cutoff, bigram_cutoff=bigram_cutoff, unigram=unigram, bigram=bigram)

# X_train = featurize_data(X_train, token_to_idx)
# X_test = featurize_data(X_test, token_to_idx)

## Test Model with Held Out Set

In [54]:
# Number of documents per class used for training; the remainder of each class is held out.
N_TRAIN_PER_CLASS = 900

# Positives first, then negatives, in both the features and the labels so they stay aligned.
X_train = X_pos[:N_TRAIN_PER_CLASS] + X_neg[:N_TRAIN_PER_CLASS]
y_train = y_pos[:N_TRAIN_PER_CLASS] + y_neg[:N_TRAIN_PER_CLASS]

X_test = X_pos[N_TRAIN_PER_CLASS:] + X_neg[N_TRAIN_PER_CLASS:]
y_test = y_pos[N_TRAIN_PER_CLASS:] + y_neg[N_TRAIN_PER_CLASS:]

# Every later cell indexes documents and labels in lockstep, so fail early on a mismatch.
assert len(X_train) == len(y_train), "training documents and labels are misaligned"
assert len(X_test) == len(y_test), "held-out documents and labels are misaligned"

In [55]:
# Hyper-parameters for the held-out run: smoothing value plus the n-gram feature options.
# Only unigram features are enabled here.
smoothing = 1
unigram_cutoff, bigram_cutoff = 1, 7
unigram, bigram = True, False

In [56]:
# Build the token index from the training documents only, then encode both splits with it.
token_to_idx = get_dictionary(
    X_train,
    unigram=unigram,
    bigram=bigram,
    unigram_cutoff=unigram_cutoff,
    bigram_cutoff=bigram_cutoff,
)

X_feat_train, X_feat_test = (
    featurize_data(docs, token_to_idx) for docs in (X_train, X_test)
)

Generated 52555 features


### Multinomial Naive Bayes

In [57]:
# Size the classifier by the width of one feature vector, then train it on the featurized split.
model = MultinomialNaiveBayes(
    classes,
    len(X_feat_train[0]),
    smoothing_value=smoothing,
)
model.fit(X_feat_train, y_train)

In [58]:
%%time
y_pred = model.predict(X_feat_test)
n_correct = sum(1 for i, _ in enumerate(y_pred) if y_pred[i] == y_test[i])

print("{0:.2f}% of sentences are correctly classified".format(n_correct * 100 / len(X_test)))

82.50% of sentences are correctly classified
CPU times: user 151 ms, sys: 89.9 ms, total: 241 ms
Wall time: 280 ms


## Sign Test

Let's use a sign test to compare models trained with smoothing against models trained without it.

### Unigrams

In [14]:
# Unigram-only feature set.
# NOTE(review): with bigrams disabled the bigram cutoff is presumably ignored — confirm in get_dictionary.
unigram_cutoff, bigram_cutoff = 4, 1
unigram, bigram = True, False

In [15]:
# Build the token index from the training documents only, then encode both splits with it.
token_to_idx = get_dictionary(
    X_train,
    unigram=unigram,
    bigram=bigram,
    unigram_cutoff=unigram_cutoff,
    bigram_cutoff=bigram_cutoff,
)

X_feat_train, X_feat_test = (
    featurize_data(docs, token_to_idx) for docs in (X_train, X_test)
)

Generated 18333 features


In [16]:
# Two classifiers over the same feature space that differ only in smoothing (1 vs 0).
model1, model2 = (
    MultinomialNaiveBayes(classes, len(X_feat_train[0]), smoothing_value=alpha)
    for alpha in (1, 0)
)

# Fit both on identical training data
model1.fit(X_feat_train, y_train)
model2.fit(X_feat_train, y_train)

# Predict on the shared held-out split
y1_pred, y2_pred = model1.predict(X_feat_test), model2.predict(X_feat_test)

# Compare both prediction sets against the gold labels; sign_test reports a p-value.
sign_test(y1_pred, y2_pred, y_test)

  log_cond_prob = np.log(self.class_to_feature_to_cond_prob[c])


p: 0.0003943508751024046


### Bigrams

In [17]:
# Bigram-only feature set.
# NOTE(review): with unigrams disabled the unigram cutoff is presumably ignored — confirm in get_dictionary.
unigram_cutoff, bigram_cutoff = 1, 7
unigram, bigram = False, True

In [18]:
# Build the token index from the training documents only, then encode both splits with it.
token_to_idx = get_dictionary(
    X_train,
    unigram=unigram,
    bigram=bigram,
    unigram_cutoff=unigram_cutoff,
    bigram_cutoff=bigram_cutoff,
)

X_feat_train, X_feat_test = (
    featurize_data(docs, token_to_idx) for docs in (X_train, X_test)
)

Generated 24990 features


In [19]:
# Two classifiers over the same feature space that differ only in smoothing (1 vs 0).
model1, model2 = (
    MultinomialNaiveBayes(classes, len(X_feat_train[0]), smoothing_value=alpha)
    for alpha in (1, 0)
)

# Fit both on identical training data
model1.fit(X_feat_train, y_train)
model2.fit(X_feat_train, y_train)

# Predict on the shared held-out split
y1_pred, y2_pred = model1.predict(X_feat_test), model2.predict(X_feat_test)

# Compare both prediction sets against the gold labels; sign_test reports a p-value.
sign_test(y1_pred, y2_pred, y_test)

  log_cond_prob = np.log(self.class_to_feature_to_cond_prob[c])


p: 0.04800146605383206


### Unigrams + Bigrams

In [20]:
# Combined feature set: unigrams and bigrams are both enabled, each with its own cutoff.
unigram_cutoff, bigram_cutoff = 4, 7
unigram, bigram = True, True

In [21]:
# Build the token index from the training documents only, then encode both splits with it.
token_to_idx = get_dictionary(
    X_train,
    unigram=unigram,
    bigram=bigram,
    unigram_cutoff=unigram_cutoff,
    bigram_cutoff=bigram_cutoff,
)

X_feat_train, X_feat_test = (
    featurize_data(docs, token_to_idx) for docs in (X_train, X_test)
)

Generated 43323 features


In [22]:
# Two classifiers over the same feature space that differ only in smoothing (1 vs 0).
model1, model2 = (
    MultinomialNaiveBayes(classes, len(X_feat_train[0]), smoothing_value=alpha)
    for alpha in (1, 0)
)

# Fit both on identical training data
model1.fit(X_feat_train, y_train)
model2.fit(X_feat_train, y_train)

# Predict on the shared held-out split
y1_pred, y2_pred = model1.predict(X_feat_test), model2.predict(X_feat_test)

# Compare both prediction sets against the gold labels; sign_test reports a p-value.
sign_test(y1_pred, y2_pred, y_test)

  log_cond_prob = np.log(self.class_to_feature_to_cond_prob[c])


p: 7.051394532266118e-05


### Unigrams vs Bigrams (both with smoothing)

In [54]:
# First contender: unigram-only features.
# NOTE(review): with bigrams disabled the bigram cutoff is presumably ignored — confirm in get_dictionary.
unigram_cutoff, bigram_cutoff = 4, 1
unigram, bigram = True, False

In [55]:
# Build the token index from the training documents only, then encode both splits with it.
token_to_idx = get_dictionary(
    X_train,
    unigram=unigram,
    bigram=bigram,
    unigram_cutoff=unigram_cutoff,
    bigram_cutoff=bigram_cutoff,
)

X_feat_train, X_feat_test = (
    featurize_data(docs, token_to_idx) for docs in (X_train, X_test)
)

Generated 18333 features


In [56]:
# Smoothed classifier (smoothing_value=1) over the feature vectors currently in X_feat_train.
model1 = MultinomialNaiveBayes(
    classes,
    len(X_feat_train[0]),
    smoothing_value=1,
)
model1.fit(X_feat_train, y_train)

# Keep these predictions now: X_feat_test is rebuilt for the second model further down.
y1_pred = model1.predict(X_feat_test)

In [57]:
# Second contender: bigram-only features.
# NOTE(review): with unigrams disabled the unigram cutoff is presumably ignored — confirm in get_dictionary.
unigram_cutoff, bigram_cutoff = 1, 7
unigram, bigram = False, True

In [58]:
# Build the token index from the training documents only, then encode both splits with it.
token_to_idx = get_dictionary(
    X_train,
    unigram=unigram,
    bigram=bigram,
    unigram_cutoff=unigram_cutoff,
    bigram_cutoff=bigram_cutoff,
)

X_feat_train, X_feat_test = (
    featurize_data(docs, token_to_idx) for docs in (X_train, X_test)
)

Generated 24990 features


In [59]:
# Smoothed classifier (smoothing_value=1) over the feature vectors currently in X_feat_train.
model2 = MultinomialNaiveBayes(
    classes,
    len(X_feat_train[0]),
    smoothing_value=1,
)
model2.fit(X_feat_train, y_train)

# Predictions of the second contender on the same held-out documents.
y2_pred = model2.predict(X_feat_test)

In [60]:
# Compare the two models' held-out predictions against the gold labels; sign_test reports a p-value.
sign_test(y1_pred, y2_pred, y_test)

p: 0.7779207003752835


### Unigrams + Bigrams vs Unigrams (both with smoothing)

In [45]:
# First contender: unigrams and bigrams combined, each with its own cutoff.
unigram_cutoff, bigram_cutoff = 4, 7
unigram, bigram = True, True

In [46]:
# Build the token index from the training documents only, then encode both splits with it.
token_to_idx = get_dictionary(
    X_train,
    unigram=unigram,
    bigram=bigram,
    unigram_cutoff=unigram_cutoff,
    bigram_cutoff=bigram_cutoff,
)

X_feat_train, X_feat_test = (
    featurize_data(docs, token_to_idx) for docs in (X_train, X_test)
)

Generated 43323 features


In [47]:
# Smoothed classifier (smoothing_value=1) over the feature vectors currently in X_feat_train.
model1 = MultinomialNaiveBayes(
    classes,
    len(X_feat_train[0]),
    smoothing_value=1,
)
model1.fit(X_feat_train, y_train)

# Keep these predictions now: X_feat_test is rebuilt for the second model further down.
y1_pred = model1.predict(X_feat_test)

In [48]:
# Second contender: unigram-only features.
# NOTE(review): with bigrams disabled the bigram cutoff is presumably ignored — confirm in get_dictionary.
unigram_cutoff, bigram_cutoff = 4, 1
unigram, bigram = True, False

In [49]:
# Build the token index from the training documents only, then encode both splits with it.
token_to_idx = get_dictionary(
    X_train,
    unigram=unigram,
    bigram=bigram,
    unigram_cutoff=unigram_cutoff,
    bigram_cutoff=bigram_cutoff,
)

X_feat_train, X_feat_test = (
    featurize_data(docs, token_to_idx) for docs in (X_train, X_test)
)

Generated 18333 features


In [52]:
# Smoothed classifier (smoothing_value=1) over the feature vectors currently in X_feat_train.
model2 = MultinomialNaiveBayes(
    classes,
    len(X_feat_train[0]),
    smoothing_value=1,
)
model2.fit(X_feat_train, y_train)

# Predictions of the second contender on the same held-out documents.
y2_pred = model2.predict(X_feat_test)

In [53]:
# Compare the two models' held-out predictions against the gold labels; sign_test reports a p-value.
sign_test(y1_pred, y2_pred, y_test)

p: 0.5726756419095728


## Cross Validation

In [32]:
# Full corpus for cross-validation: positives first, then negatives, so X and y stay index-aligned.
X, y = np.array(X_pos + X_neg), np.array(y_pos + y_neg)

In [33]:
# Cross-validation runs on unigram-only features.
# NOTE(review): with bigrams disabled the bigram cutoff is presumably ignored — confirm in get_dictionary.
unigram_cutoff, bigram_cutoff = 4, 1
unigram, bigram = True, False

In [36]:
# Smoothed (1) vs unsmoothed (0) classifier, each evaluated by cross-validation on the full corpus.
# NOTE(review): both models are sized from X_feat_train, which an earlier cell built, while the
# recorded cross_validation log shows a different feature count on every fold — confirm that
# cross_validation re-sizes the model per fold.
model1, model2 = (
    MultinomialNaiveBayes(classes, len(X_feat_train[0]), smoothing_value=alpha)
    for alpha in (1, 0)
)

y1_pred = cross_validation(
    model1, X, y,
    unigram=unigram,
    bigram=bigram,
    unigram_cutoff=unigram_cutoff,
    bigram_cutoff=bigram_cutoff,
)
y2_pred = cross_validation(
    model2, X, y,
    unigram=unigram,
    bigram=bigram,
    unigram_cutoff=unigram_cutoff,
    bigram_cutoff=bigram_cutoff,
)

# NOTE(review): the saved output of this cell ends in
# "OverflowError: integer division result too large for a float". The raising call is not
# visible in this notebook (possibly sign_test on the full-corpus predictions) — confirm.
sign_test(y1_pred, y2_pred, y)

Running iteration 1 out of 10 of cross validation
Generated 18403 features
78.50% of sentences are correctly classified 

Running iteration 2 out of 10 of cross validation
Generated 18309 features
83.00% of sentences are correctly classified 

Running iteration 3 out of 10 of cross validation
Generated 18353 features
82.50% of sentences are correctly classified 

Running iteration 4 out of 10 of cross validation
Generated 18338 features
83.50% of sentences are correctly classified 

Running iteration 5 out of 10 of cross validation
Generated 18350 features
79.00% of sentences are correctly classified 

Running iteration 6 out of 10 of cross validation
Generated 18285 features
81.50% of sentences are correctly classified 

Running iteration 7 out of 10 of cross validation
Generated 18265 features
83.50% of sentences are correctly classified 

Running iteration 8 out of 10 of cross validation
Generated 18367 features
79.50% of sentences are correctly classified 

Running iteration 9 out 

  log_cond_prob = np.log(self.class_to_feature_to_cond_prob[c])


57.00% of sentences are correctly classified 

Running iteration 2 out of 10 of cross validation
Generated 18309 features
58.50% of sentences are correctly classified 

Running iteration 3 out of 10 of cross validation
Generated 18353 features
63.00% of sentences are correctly classified 

Running iteration 4 out of 10 of cross validation
Generated 18338 features
62.50% of sentences are correctly classified 

Running iteration 5 out of 10 of cross validation
Generated 18350 features
61.50% of sentences are correctly classified 

Running iteration 6 out of 10 of cross validation
Generated 18285 features
61.50% of sentences are correctly classified 

Running iteration 7 out of 10 of cross validation
Generated 18265 features
60.50% of sentences are correctly classified 

Running iteration 8 out of 10 of cross validation
Generated 18367 features
59.50% of sentences are correctly classified 

Running iteration 9 out of 10 of cross validation
Generated 18398 features
60.50% of sentences are 

OverflowError: integer division result too large for a float

In [None]:
# token_to_idx = get_dictionary(X_train, unigram_cutoff=unigram_cutoff, bigram_cutoff=bigram_cutoff, unigram=unigram, bigram=bigram)

# X_feat_train = featurize_data(X_train, token_to_idx)
# X_feat_test = featurize_data(X_test, token_to_idx)

In [22]:
# NOTE(review): this cell appears to duplicate the "Unigrams + Bigrams" sign-test cell (its
# recorded p-value is identical); run top to bottom it would use whatever X_feat_train holds
# at this point instead — confirm whether it is still needed.
# Two classifiers over the same feature space that differ only in smoothing (1 vs 0).
model1, model2 = (
    MultinomialNaiveBayes(classes, len(X_feat_train[0]), smoothing_value=alpha)
    for alpha in (1, 0)
)

# Fit both on identical training data
model1.fit(X_feat_train, y_train)
model2.fit(X_feat_train, y_train)

# Predict on the shared held-out split
y1_pred, y2_pred = model1.predict(X_feat_test), model2.predict(X_feat_test)

# Compare both prediction sets against the gold labels; sign_test reports a p-value.
sign_test(y1_pred, y2_pred, y_test)

  log_cond_prob = np.log(self.class_to_feature_to_cond_prob[c])


p: 7.051394532266118e-05


In [None]:
def run_full_cv():
    """Cross-validate every n-gram / smoothing combination on the full corpus.

    Visits the unigram-only, unigram+bigram and bigram-only feature sets (in
    that order), each with smoothing values 0 and 1, prints the configuration
    and runs ``cross_validation`` for it. Nothing is returned; results are
    whatever ``cross_validation`` reports.

    Reads the notebook globals ``X_pos``, ``X_neg``, ``y_pos``, ``y_neg``,
    ``X_train`` and ``classes``.
    """
    smoothings = [0, 1]
    # Same visiting order as nested True/False loops, minus the empty
    # (unigram=False, bigram=False) feature set.
    ngram_configs = [(True, False), (True, True), (False, True)]
    unigram_cutoff = 4
    bigram_cutoff = 7

    X = np.array(X_pos + X_neg)
    y = np.array(y_pos + y_neg)

    for unigram, bigram in ngram_configs:
        # Size the model by the width of a feature vector for this n-gram
        # configuration, as the other cells do, not by the length of a raw
        # document. Encoding a single document is enough to learn the width.
        # NOTE(review): the recorded cross_validation log shows a different
        # feature count per fold — confirm that cross_validation re-sizes the
        # model per fold.
        token_to_idx = get_dictionary(
            X_train,
            unigram_cutoff=unigram_cutoff,
            bigram_cutoff=bigram_cutoff,
            unigram=unigram,
            bigram=bigram,
        )
        n_features = len(featurize_data(X_train[:1], token_to_idx)[0])

        for smoothing in smoothings:
            print("unigram: {}, bigram: {}, unigram_cutoff: {}, bigram_cutoff: {}, smoothing: {}".format(unigram, bigram, unigram_cutoff, bigram_cutoff, smoothing))

            model = MultinomialNaiveBayes(classes, n_features, smoothing_value=smoothing)
            cross_validation(model, X, y, unigram=unigram, bigram=bigram, unigram_cutoff=unigram_cutoff, bigram_cutoff=bigram_cutoff)

In [None]:
# Run the grid of cross-validation experiments defined in run_full_cv.
run_full_cv()