In [2]:
import pandas as pd
import numpy as np
import os

In [12]:
# Gather the review file names found in each split/sentiment directory.
train_pos_files, train_neg_files = (
    os.listdir(folder) for folder in ("train/pos/", "train/neg")
)
test_pos_files, test_neg_files = (
    os.listdir(folder) for folder in ("test/pos/", "test/neg")
)

In [13]:
# Read every training review into memory: all positive files first, then all
# negative files, in the order the directory listings returned them.
train_reviews = []
for folder, file_names in (
    ("train/pos/", train_pos_files),
    ("train/neg/", train_neg_files),
):
    for file_name in file_names:
        with open(folder + file_name, encoding="latin1") as handle:
            train_reviews.append(handle.read())

In [14]:
# Peek at the first three raw training reviews.
train_reviews[:3]

['For a movie that gets no respect there sure are a lot of memorable quotes listed for this gem. Imagine a movie where Joe Piscopo is actually funny! Maureen Stapleton is a scene stealer. The Moroni character is an absolute scream. Watch for Alan "The Skipper" Hale jr. as a police Sgt.',
 'Bizarre horror movie filled with famous faces but stolen by Cristina Raines (later of TV\'s "Flamingo Road") as a pretty but somewhat unstable model with a gummy smile who is slated to pay for her attempted suicides by guarding the Gateway to Hell! The scenes with Raines modeling are very well captured, the mood music is perfect, Deborah Raffin is charming as Cristina\'s pal, but when Raines moves into a creepy Brooklyn Heights brownstone (inhabited by a blind priest on the top floor), things really start cooking. The neighbors, including a fantastically wicked Burgess Meredith and kinky couple Sylvia Miles & Beverly D\'Angelo, are a diabolical lot, and Eli Wallach is great fun as a wily police detec

In [15]:
# Read every test review into memory: all positive files first, then all
# negative files, in the order the directory listings returned them.
test_reviews = []
for folder, file_names in (
    ("test/pos/", test_pos_files),
    ("test/neg/", test_neg_files),
):
    for file_name in file_names:
        with open(folder + file_name, encoding="latin1") as handle:
            test_reviews.append(handle.read())

In [16]:
# Sanity check: number of reviews loaded for the train and test splits.
for split_reviews in (train_reviews, test_reviews):
    print(len(split_reviews))

25000
25000


In [33]:
import re

# Punctuation that is deleted outright (no space is left in its place).
# A raw string keeps the regex escapes (\[ and \]) away from Python's own
# string-escape processing, and `+` avoids matching the empty string at every
# position of the text.
REPLACE_NO_SPACE = re.compile(r"[.;:!'?,\"()\[\]]+")


def preprocess_reviews(reviews):
    """Lower-case each review and strip basic punctuation.

    Parameters
    ----------
    reviews : iterable of str
        Raw review texts.

    Returns
    -------
    list of str
        One cleaned string per input review, in the same order. The
        characters ``. ; : ! ' ? , " ( ) [ ]`` are removed; everything else
        (including whitespace) is kept as is.
    """
    return [REPLACE_NO_SPACE.sub("", line.lower()) for line in reviews]

# Run both splits through the same cleaning routine.
reviews_train_clean, reviews_test_clean = (
    preprocess_reviews(raw_reviews) for raw_reviews in (train_reviews, test_reviews)
)

In [34]:
# Peek at the first three cleaned training reviews.
reviews_train_clean[:3]

['for a movie that gets no respect there sure are a lot of memorable quotes listed for this gem imagine a movie where joe piscopo is actually funny maureen stapleton is a scene stealer the moroni character is an absolute scream watch for alan the skipper hale jr as a police sgt',
 'bizarre horror movie filled with famous faces but stolen by cristina raines later of tvs flamingo road as a pretty but somewhat unstable model with a gummy smile who is slated to pay for her attempted suicides by guarding the gateway to hell the scenes with raines modeling are very well captured the mood music is perfect deborah raffin is charming as cristinas pal but when raines moves into a creepy brooklyn heights brownstone inhabited by a blind priest on the top floor things really start cooking the neighbors including a fantastically wicked burgess meredith and kinky couple sylvia miles & beverly dangelo are a diabolical lot and eli wallach is great fun as a wily police detective the movie is nearly a cr

In [35]:
from sklearn.feature_extraction.text import CountVectorizer

# Binary bag-of-words: each feature records whether a word occurs in a review,
# not how many times.
cv = CountVectorizer(binary=True)

# Learn the vocabulary from the training reviews and encode them in one pass
# (equivalent to fit() followed by transform(), without tokenizing twice).
X = cv.fit_transform(reviews_train_clean)

# The test reviews are encoded with the vocabulary learned from training only.
X_test = cv.transform(reviews_test_clean)

In [36]:
# Dimensions of the training feature matrix produced by the vectorizer.
X.shape

(25000, 93422)

In [37]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

# Label vector: 1 for every positive review, 0 for every negative one.
# train_reviews was filled positives-first, so the labels are sized from the
# same file listings instead of hard-coded counts.
target = [1] * len(train_pos_files) + [0] * len(train_neg_files)

# Hold out 25% of the training data for validation. The fixed seed makes the
# split, and therefore the accuracies printed below, reproducible.
X_train, X_val, y_train, y_val = train_test_split(
    X, target, train_size=0.75, random_state=42
)

# Sweep the regularisation parameter C and report validation accuracy for each.
for c in [0.01, 0.05, 0.25, 0.5, 1]:
    lr = LogisticRegression(C=c)
    lr.fit(X_train, y_train)
    print("Accuracy for C=%s: %s"
          % (c, accuracy_score(y_val, lr.predict(X_val))))



Accuracy for C=0.01: 0.87776
Accuracy for C=0.05: 0.88624
Accuracy for C=0.25: 0.88528
Accuracy for C=0.5: 0.88368
Accuracy for C=1: 0.88208


The best validation accuracy (about 0.886) is obtained with C=0.05, so that value is used for the final model.

In [39]:
# Ground-truth labels for the test split. test_reviews was filled
# positives-first, so the labels follow the test file listings rather than
# reusing the training label vector.
test_target = [1] * len(test_pos_files) + [0] * len(test_neg_files)

# Retrain on the full training set with the C chosen on the validation split,
# then score once on the held-out test reviews.
final_model = LogisticRegression(C=0.05)
final_model.fit(X, target)
print("Final Accuracy: %s"
      % accuracy_score(test_target, final_model.predict(X_test)))



Final Accuracy: 0.88192


Let’s look at the five most discriminating words for positive and for negative reviews. We’ll do this by looking at the largest and the smallest coefficients, respectively.

In [40]:

# CountVectorizer.get_feature_names() was removed in scikit-learn 1.2; use its
# replacement get_feature_names_out() and fall back only on older releases.
if hasattr(cv, "get_feature_names_out"):
    feature_names = cv.get_feature_names_out()
else:
    feature_names = cv.get_feature_names()

# Map every vocabulary word to its coefficient in the fitted model. Converting
# to plain str/float keeps the printed tuples free of NumPy scalar wrappers.
feature_to_coef = {
    str(word): float(coef)
    for word, coef in zip(feature_names, final_model.coef_[0])
}

# Five largest coefficients: words that push hardest towards the positive class.
for best_positive in sorted(
        feature_to_coef.items(),
        key=lambda item: item[1],
        reverse=True)[:5]:
    print(best_positive)

# Five smallest coefficients: words that push hardest towards the negative class.
for best_negative in sorted(
        feature_to_coef.items(),
        key=lambda item: item[1])[:5]:
    print(best_negative)

('excellent', 0.9325214611945213)
('perfect', 0.7908008581797604)
('great', 0.6750726790852292)
('amazing', 0.6171374315767796)
('superb', 0.603478574271153)
('worst', -1.3648307920035132)
('waste', -1.1706093997182965)
('awful', -1.027488757989607)
('poorly', -0.8748598541893737)
('boring', -0.8563512732338213)
