# Niave Bayes for Movie Reviews
* Make sure you have the file movie.csv in the same directory as this ipython notebook file

In [8]:
import pandas as pd #To prepare the data
import numpy as np #For the log function

### loading the data

In [9]:
#import the data using pandas
movie_data = pd.read_csv('movie.csv')
print("There are {} rows in the data set".format(len(movie_data)))
movie_data.head()

There are 2000 rows in the data set


Unnamed: 0,class,text
0,Pos,films adapted from comic books have had plenty...
1,Pos,every now and then a movie comes along from a ...
2,Pos,you ve got mail works alot better than it dese...
3,Pos,jaws is a rare film that grabs your attentio...
4,Pos,moviemaking is a lot like being the general ma...


### Divide the data into training and testing data

In [10]:
#First randomly select 20% of the data (400 rows)
movie_sample = movie_data.sample(400)
#The training data and the testing data MUST be seperated 
training_frame = movie_data[~movie_data.index.isin(movie_sample.index)]
testing_frame = movie_sample

In [11]:
def preprocessing(s):
    s = s.lower().replace('.',' .').replace(',',' ,').replace(';',' ;').replace('"',' " ')
    return s

### Define the dictionaries C, D

In [12]:
C = set(training_frame['class']) # Neg, Pos
D = dict()
for i in range(len(training_frame)):
    D[training_frame.iloc[i,1]] = training_frame.iloc[i,0]

### The Naive Bayes algorithm

In [13]:
def train_NB(C,D):
    V = set([word for doc in D.keys() for word in preprocessing(doc).split()])
    N = len(D)
    prior = dict()
    cond_prob = dict()
    N_c = dict()
    T = dict()
    text_c = dict()
    for c in C:
        text_c[c] = []
        for doc in D.items():
            if doc[1] == c:
                for word in doc[0].split():
                    text_c[c].append(word)
        N_c[c] = len([doc for doc in D.items() if doc[1] == c])
        prior[c] = float(N_c[c])/N
        cond_prob[c] = dict()
        T[c] = dict()
        for term in V:
            T[c][term] = text_c[c].count(term)
        for term in V:
            cond_prob[c][term] = float(T[c][term] + 1)/(sum(T[c].values()) + len(V))
 
    return V, prior, cond_prob

In [14]:
def test_NB(C,V,prior,cond_prob,d):
    W = []
    for word in d.split():
        if word in V:
            W.append(word)
    score = dict()
    for c in C:
        score[c] = np.log(prior[c])
        for term in W:
            score[c] += np.log(cond_prob[c][term])
    max_category = sorted(score.items(),key=lambda x: x[1],reverse= True)[0][0]
    return max_category

### Training
* on my computer this took about 10 minutes.  

In [None]:
V, prior, cond_prob = train_NB(C,D)

In [12]:
test_NB(C,V,prior,cond_prob,test_doc)

'china'

In [12]:
correct = 0
incorrect = 0
for i in range(len(testing_frame)): 
    if test_NB(C,V,prior,cond_prob,testing_frame.iloc[i,1]) == testing_frame.iloc[i,0]:
        correct += 1
    else:
        incorrect += 1
    print (test_NB(C,V,prior,cond_prob,testing_frame.iloc[i,1]), testing_frame.iloc[i,0], correct/(correct+incorrect))
accuracy = correct/(correct + incorrect)
print("Accuracy = {} %".format(accuracy))

Neg Pos 0.0
Neg Neg 0.5
Pos Neg 0.3333333333333333
Pos Pos 0.5
Neg Neg 0.6
Pos Neg 0.5
Pos Pos 0.5714285714285714
Neg Neg 0.625
Neg Pos 0.5555555555555556
Pos Pos 0.6
Pos Pos 0.6363636363636364
Neg Neg 0.6666666666666666
Pos Neg 0.6153846153846154
Pos Pos 0.6428571428571429
Pos Pos 0.6666666666666666
Neg Neg 0.6875
Pos Pos 0.7058823529411765
Neg Pos 0.6666666666666666
Pos Neg 0.631578947368421
Pos Pos 0.65
Neg Neg 0.6666666666666666
Neg Neg 0.6818181818181818
Neg Pos 0.6521739130434783
Neg Neg 0.6666666666666666
Pos Pos 0.68
Neg Neg 0.6923076923076923
Neg Neg 0.7037037037037037
Neg Neg 0.7142857142857143
Pos Pos 0.7241379310344828
Pos Pos 0.7333333333333333
Pos Pos 0.7419354838709677
Pos Pos 0.75
Neg Neg 0.7575757575757576
Neg Neg 0.7647058823529411
Pos Pos 0.7714285714285715
Neg Neg 0.7777777777777778
Neg Neg 0.7837837837837838
Neg Neg 0.7894736842105263
Neg Neg 0.7948717948717948
Neg Neg 0.8
Neg Neg 0.8048780487804879
Neg Pos 0.7857142857142857
Pos Neg 0.7674418604651163
Neg Neg 0.77

Neg Neg 0.8224543080939948
Pos Pos 0.8229166666666666
Neg Neg 0.8233766233766234
Pos Pos 0.8238341968911918
Pos Pos 0.8242894056847545
Pos Neg 0.8221649484536082
Pos Pos 0.8226221079691517
Pos Pos 0.823076923076923
Neg Pos 0.8209718670076727
Pos Pos 0.8214285714285714
Neg Neg 0.821882951653944
Pos Pos 0.8223350253807107
Pos Pos 0.8227848101265823
Pos Pos 0.8232323232323232
Pos Pos 0.8236775818639799
Neg Neg 0.8241206030150754
Neg Pos 0.8220551378446115
Neg Neg 0.8225
Accuracy = 0.8225 %


# About 80% accuracy