# Naive Bayes Text Classification
In this notebook, you will first see a simple Naive Bayes (NB) classifier, which is trained on a tiny toy corpus to classify texts into categories of 'sports' and 'not sports'. Then you are asked to apply and adjust the NB classifier to perform sentiment analysis. 

In [1]:
# toy tiny corpus: sports and non-sports sentences
sports = ['A great game', 'Very clean match','A clean but forgettable game']
non_sports = ['The election was over','It was a close election']

In [2]:
# build vocabulary
all_words = []
sport_words = []
non_sport_words = []
for sent in sports:
    sport_words += [ww.lower() for ww in sent.split()]
for sent in non_sports:
    non_sport_words += [ww.lower() for ww in sent.split()]

all_words = sport_words + non_sport_words
vocab = list(set(all_words))

print(all_words)
print(len(vocab), vocab)

print('sport token nums', len(sport_words))
print('sport type nums', len(set(sport_words)))
print('non-sport token nums', len(non_sport_words))
print('non-sport type nums', len(set(non_sport_words)))

['a', 'great', 'game', 'very', 'clean', 'match', 'a', 'clean', 'but', 'forgettable', 'game', 'the', 'election', 'was', 'over', 'it', 'was', 'a', 'close', 'election']
14 ['was', 'it', 'great', 'but', 'very', 'clean', 'forgettable', 'election', 'match', 'over', 'game', 'a', 'the', 'close']
sport token nums 11
sport type nums 8
non-sport token nums 9
non-sport type nums 7


In [3]:
# get the prior distribution
prior_sport = len(sports)*1./(len(sports)+len(non_sports))
prior_non_sport = len(non_sports)*1./(len(sports)+len(non_sports))

In [7]:
# get the word frequencies, which will be later used to compute likelihood
from nltk import FreqDist
sport_fd = FreqDist(sport_words)
non_sport_fd = FreqDist(non_sport_words)

print(sport_fd['close'])

0


In [11]:
# NB classifier
import numpy as np
def predict_class(words):
    sport_likelihood = []
    non_likelihood = []
    for ww in words:
        sport_likelihood.append((sport_fd[ww]+1.)/(len(sport_words)+len(vocab)))
        non_likelihood.append((non_sport_fd[ww]+1.)/(len(non_sport_words)+len(vocab)))
    print(sport_likelihood)
    print(non_likelihood)
    s_loglhd = np.sum([np.log(l) for l in sport_likelihood])
    n_loglhd = np.sum([np.log(l) for l in non_likelihood])
    print(s_loglhd, n_loglhd)
    sprob = np.log(prior_sport)+s_loglhd
    nprob = np.log(prior_non_sport)+n_loglhd
    if sprob > nprob: return 'sport'
    else: return 'non_sport'
    
print(predict_class('a very interesting game'.split()))

[0.12, 0.08, 0.04, 0.12]
[0.08695652173913043, 0.043478260869565216, 0.043478260869565216, 0.043478260869565216]
-9.985131541576639 -11.848829683156653
sport


## Exercise: NB-based Sentiment Analysis
*Sentiment analysis* is probably the most commerical application of text classification. It takes a customer review and checks the overall sentiment of the review. Here we use the movie review corpus to train a NB-based sentiment analyzer. 

In [12]:
# obtain the data
from nltk.corpus import movie_reviews
import random
documents = [(list(movie_reviews.words(fileid)), category)
             for category in movie_reviews.categories()
             for fileid in movie_reviews.fileids(category)]
random.shuffle(documents)

print('document num', len(documents))
print('labels:', set([dd[1] for dd in documents]))
print(documents[0][0], documents[0][1])

document num 2000
labels: {'neg', 'pos'}
['ugh', '.', 'that', 'about', 'sums', 'this', 'movie', 'up', '.', 'just', ',', 'ugh', '.', 'the', 'original', 'godzilla', 'movies', 'are', 'somewhat', 'of', 'a', 'cult', 'classic', ',', 'and', 'when', 'reviewing', 'the', 'previous', 'films', ',', 'each', 'film', 'had', 'a', 'certain', 'degree', 'of', 'intelligence', '.', 'and', 'that', 'was', 'the', 'reason', 'they', 'found', 'such', 'an', 'enviable', 'cult', 'following', ';', 'in', 'spite', 'of', 'bad', 'special', 'effects', ',', 'horrible', 'dubbing', ',', 'and', 'a', 'man', 'in', 'a', 'lizard', 'suit', ',', 'they', 'maintained', 'a', 'certain', 'degree', 'of', '.', '.', '.', 'how', 'to', 'put', 'this', '?', '.', '.', '.', 'dignity', '?', 'not', 'quite', 'the', 'word', 'i', "'", 'm', 'looking', 'for', '.', 'you', 'understand', ',', 'right', '?', 'and', 'in', '50', 'years', ',', 'godzilla', 'has', 'maintained', 'that', 'degree', 'of', "'", 'whatever', "'", '.', 'leave', 'it', 'to', 'america', '

In [15]:
# split the data into train, dev-test and test

train_data = documents[:1200]
dev_data = documents[1200:1600]
test_data = documents[1600:]

In [32]:
neg_corpus = [train_data[i][0] for i,v in enumerate(train_data) for j,k in enumerate(v) if k=='neg']
pos_corpus = [train_data[i][0] for i,v in enumerate(train_data) for j,k in enumerate(v) if k=='pos']

In [36]:
pos_corpus = [v[0] for i,v in enumerate(train_data) if v[-1]=='pos']
len(pos_corpus)

601

In [33]:
len(pos_corpus)

601

In [28]:

# build the prior probability of pos and neg (based on train_data)
prior_pos = len(pos_corpus)*1./(len(pos_corpus)+len(neg_corpus))
prior_neg = len(neg_corpus)*1./(len(pos_corpus)+len(neg_corpus))

In [30]:
print(prior_pos, prior_neg)

0.5008333333333334 0.49916666666666665


In [49]:
# build vocabulary based on train_data
# you may investigate whether to remove stopwords and punctuations and 
# whether to apply lemmatization/stemming, and compare their performance on dev-test set 

all_words = []
pos_words = []
neg_words = []
for sent in pos_corpus:
    pos_words += [ww.lower() for ww in sent]
for sent in neg_corpus:
    neg_words += [ww.lower() for ww in sent]

all_words = pos_words + neg_words
vocab = list(set(all_words))

#print(all_words)
#print(len(vocab), vocab)
print(len(vocab))
print('sport token nums', len(pos_words))
print('sport type nums', len(set(pos_words)))
print('non-sport token nums', len(neg_words))
print('non-sport type nums', len(set(neg_words)))

32362
sport token nums 498108
sport type nums 24308
non-sport token nums 453816
non-sport type nums 22919


In [42]:
# for each class (pos and neg), maintain the frequency of each type, so as to compute likelihood
from nltk import FreqDist
pos_fd = FreqDist(pos_words)
neg_fd = FreqDist(neg_words)

In [50]:
print(neg_fd['interested'])

39


In [54]:
# build the class prediction function
def predict_sentiment(words):
    pos_likelihood = []
    neg_likelihood = []
    for ww in words:
        pos_likelihood.append((pos_fd[ww]+1.)/(len(pos_words)+len(vocab)))
        neg_likelihood.append((neg_fd[ww]+1.)/(len(neg_words)+len(vocab)))
    #print(pos_likelihood)
    #print(neg_likelihood)
    s_loglhd = np.sum([np.log(l) for l in pos_likelihood])
    n_loglhd = np.sum([np.log(l) for l in neg_likelihood])
    #print(s_loglhd, n_loglhd)
    sprob = np.log(prior_pos)+s_loglhd
    nprob = np.log(prior_neg)+n_loglhd
    if sprob > nprob: 
        return 'pos'
    else: 
        return 'neg'
    
# evaluate your model's performance on the dev-test set
dev_pred_labels = []
dev_true_labels = [ll for (dd,ll) in dev_data]
for tt,_ in dev_data:
    dev_pred_labels.append(predict_sentiment(tt))

from sklearn.metrics import accuracy_score, precision_recall_fscore_support
print('acc', accuracy_score(dev_true_labels, dev_pred_labels))
print(precision_recall_fscore_support(dev_true_labels, dev_pred_labels, average=None, labels=['pos', 'neg']))

# develop different models (with and without stopwords/punctuations/stemming/lemmatization),
# and select the best model by its performance on the dev-test set;
# the selected best model will be applied to test data in the next step

acc 0.825
(array([0.82741117, 0.8226601 ]), array([0.81909548, 0.83084577]), array([0.82323232, 0.82673267]), array([199, 201]))


In [55]:
# test the performance of the best model on test set
test_pred_labels = []
test_true_labels = [ll for (dd,ll) in test_data]
for tt,_ in test_data:
    test_pred_labels.append(predict_sentiment(tt))

from sklearn.metrics import accuracy_score, precision_recall_fscore_support
print('acc', accuracy_score(test_true_labels, test_pred_labels))
print(precision_recall_fscore_support(test_true_labels, test_pred_labels, average=None, labels=['pos', 'neg']))


acc 0.8075
(array([0.82887701, 0.78873239]), array([0.775, 0.84 ]), array([0.80103359, 0.81355932]), array([200, 200]))
