# Movie Reviews: Positive? Negative?

In [1]:
from nltk.corpus import movie_reviews
import nltk
import numpy as np
import pandas as pd
import random
import itertools
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn import tree
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix

In [2]:
# Load every review in the NLTK movie_reviews corpus as a (tokens, category)
# pair; category is one of movie_reviews.categories().
documents = [(list(movie_reviews.words(fileid)), category)
             for category in movie_reviews.categories()
             for fileid in movie_reviews.fileids(category)]

# Shuffle with a dedicated, seeded generator so the document order (and with
# it the train/test split and every score further down) is identical on each
# Restart & Run All, without touching the global `random` state.
random.Random(0).shuffle(documents)

In [24]:
# Keep only the token list of each (tokens, category) pair, in document order.
features = [tokens for tokens, _category in documents]

In [7]:
# Re-join each review's tokens into one space-separated string for the
# vectorizer. The strings are built from `documents` rather than from
# `features` itself, so re-running this cell is safe: joining an
# already-joined string would put a space between every character.
features = [' '.join(tokens) for tokens, _category in documents]
features[0]

'" you \' ve got mail " is the very definition of a " cute " movie . it \' s got cute stars , a cute story - line , and even cute pets . despite this over - whelming cuteness ( or maybe because of it ) , i loved " you \' ve got mail " . i know , i know . i \' m supposed to be a film critic . i \' m not supposed to love transparent and formula pictures like this . but darn it , somewhere along the way , " you \' ve got mail " reeled me in , hook , line , and sinker . the movie stars tom hanks and meg ryan as two somewhat unhappy new yorkers who meet in an aol online chat room and quickly become close friends , without ever knowing each others identity . as the movie progresses , though , they begin to discover that they have a lot in common with each other . meanwhile , in " real - life " , they are bitter competitors . she owns a quaint little children \' s bookstore , he owns one of those gigantic super - bookstores that offer impersonal service and discount prices . while his store t

In [8]:
# Binary bag-of-words: binary=True records whether a term occurs in a review,
# not how many times it occurs.
vec = CountVectorizer(binary=True)
# fit_transform returns a SciPy sparse matrix; densify it with .toarray().
# The `.A` shorthand used previously was deprecated and then removed from
# SciPy sparse matrices, so it raises AttributeError on current releases.
X = vec.fit_transform(features).toarray()
X.shape

(2000, 39659)

In [11]:
# Category string of each review, aligned row-for-row with `documents`.
labels = np.array([category for _tokens, category in documents])
labels[:5]

array(['pos', 'pos', 'neg', 'neg', 'neg'], dtype='<U3')

In [12]:
# Map the corpus category names to integer class ids (fit() returns the
# encoder itself, so construction and fitting can be chained).
le = LabelEncoder().fit(movie_reviews.categories())
y = le.transform(labels)
y[:5]

array([1, 1, 0, 0, 0])

In [13]:
# Hold out 30% of the reviews for evaluation; random_state pins the split.
Xtrain, Xtest, ytrain, ytest = train_test_split(
    X, y,
    test_size=0.3,
    random_state=0,
)

In [15]:
# Decision Tree
# random_state is fixed because the tree randomly permutes the candidate
# features at every split; without a seed the fitted tree, and therefore the
# accuracy and confusion matrix reported below, change from run to run.
clf_tree = tree.DecisionTreeClassifier(random_state=0)
clf_tree.fit(Xtrain, ytrain)
pred_tree = clf_tree.predict(Xtest)

In [16]:
# Share of held-out reviews the decision tree labels correctly, to 3 decimals.
acc_tree = accuracy_score(y_true=ytest, y_pred=pred_tree)
print('Accuracy:', round(acc_tree, 3))

Accuracy: 0.598


In [17]:
# Confusion matrix for the decision tree: rows are the true classes, columns
# are the predicted classes.
cm_tree = pd.DataFrame(
    confusion_matrix(y_true=ytest, y_pred=pred_tree),
    index=['neg', 'pos'],
    columns=['predict neg', 'predict pos'],
)
print(cm_tree)

     predict neg  predict pos
neg          165          125
pos          116          194


In [18]:
# Naive Bayes
# Multinomial naive Bayes over the binary term-presence features; fit()
# returns the estimator, so it is chained onto the constructor.
clf_nb = MultinomialNB().fit(Xtrain, ytrain)
pred_nb = clf_nb.predict(Xtest)

In [19]:
# Share of held-out reviews naive Bayes labels correctly, to 3 decimals.
acc_nb = accuracy_score(y_true=ytest, y_pred=pred_nb)
print('Accuracy:', round(acc_nb, 3))

Accuracy: 0.818


In [20]:
# Confusion matrix for naive Bayes: rows are the true classes, columns are
# the predicted classes.
cm_nb = pd.DataFrame(
    confusion_matrix(y_true=ytest, y_pred=pred_nb),
    index=['neg', 'pos'],
    columns=['predict neg', 'predict pos'],
)
print(cm_nb)

     predict neg  predict pos
neg          250           40
pos           69          241


In [21]:
# Maximum Entropy
# Implemented with scikit-learn's LogisticRegression at its default settings;
# fit() returns the estimator, so it is chained onto the constructor.
clf_mx = LogisticRegression().fit(Xtrain, ytrain)
pred_mx = clf_mx.predict(Xtest)

In [22]:
# Share of held-out reviews logistic regression labels correctly, to 3 decimals.
acc_mx = accuracy_score(y_true=ytest, y_pred=pred_mx)
print('Accuracy:', round(acc_mx, 3))

Accuracy: 0.852


In [23]:
# Confusion matrix for logistic regression: rows are the true classes,
# columns are the predicted classes.
cm_mx = pd.DataFrame(
    confusion_matrix(y_true=ytest, y_pred=pred_mx),
    index=['neg', 'pos'],
    columns=['predict neg', 'predict pos'],
)
print(cm_mx)

     predict neg  predict pos
neg          248           42
pos           47          263
