In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
import pandas as pd
import numpy as np
import os
import re

In [6]:
review_train = []
for line in open('../IMDB/movie_data/full_train.txt', 'r', encoding="utf8"):
    review_train.append(line.strip())
    
review_test = []
for line in open('../IMDB/movie_data/full_test.txt', 'r', encoding="utf8"):
    review_test.append(line.strip())

In [7]:
review_train[5]

"This isn't the comedic Robin Williams, nor is it the quirky/insane Robin Williams of recent thriller fame. This is a hybrid of the classic drama without over-dramatization, mixed with Robin's new love of the thriller. But this isn't a thriller, per se. This is more a mystery/suspense vehicle through which Williams attempts to locate a sick boy and his keeper.<br /><br />Also starring Sandra Oh and Rory Culkin, this Suspense Drama plays pretty much like a news report, until William's character gets close to achieving his goal.<br /><br />I must say that I was highly entertained, though this movie fails to teach, guide, inspect, or amuse. It felt more like I was watching a guy (Williams), as he was actually performing the actions, from a third person perspective. In other words, it felt real, and I was able to subscribe to the premise of the story.<br /><br />All in all, it's worth a watch, though it's definitely not Friday/Saturday night fare.<br /><br />It rates a 7.7/10 from...<br />

In [11]:
import re

REPLACE_NO_SPACE = re.compile("[.;:!\'?,\"()\[\]]")
REPACE_WITH_SPACE = re.compile("(<br\s*/><br\s*/>)|(\-)|(\/)")

def preprocess_reviews(reviews):
    
    reviews = [REPLACE_NO_SPACE.sub("", line.lower()) for line in reviews]
    reviews = [REPACE_WITH_SPACE.sub(" ", line) for line in reviews]
    
    return reviews

review_train_clean = preprocess_reviews(review_train)
review_test_clean = preprocess_reviews(review_test)

In [13]:
review_train_clean[5]

'this isnt the comedic robin williams nor is it the quirky insane robin williams of recent thriller fame this is a hybrid of the classic drama without over dramatization mixed with robins new love of the thriller but this isnt a thriller per se this is more a mystery suspense vehicle through which williams attempts to locate a sick boy and his keeper also starring sandra oh and rory culkin this suspense drama plays pretty much like a news report until williams character gets close to achieving his goal i must say that i was highly entertained though this movie fails to teach guide inspect or amuse it felt more like i was watching a guy williams as he was actually performing the actions from a third person perspective in other words it felt real and i was able to subscribe to the premise of the story all in all its worth a watch though its definitely not friday saturday night fare it rates a 77 10 from the fiend '

In [16]:
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer(binary=True)
cv.fit(review_train_clean)
X = cv.transform(review_train_clean)
X_test = cv.transform(review_test_clean)

In [18]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

In [19]:
target = [1 if i < 12500 else 0 for i in range(25000)]

In [23]:
X_train, X_val, y_train, y_val = train_test_split(X, target, train_size=0.75)



In [24]:
for c in [0.01, 0.05, 0.25, 0.5, 1]:
    lr = LogisticRegression(C=c)
    lr.fit(X_train, y_train)
    print ("Accuracy for C=%s: %s" 
           % (c, accuracy_score(y_val, lr.predict(X_val))))    



Accuracy for C=0.01: 0.87376
Accuracy for C=0.05: 0.8784
Accuracy for C=0.25: 0.87824
Accuracy for C=0.5: 0.87568
Accuracy for C=1: 0.87376


In [26]:
final_model = LogisticRegression(C=0.05)
final_model.fit(X, target)
print ("Final Accuracy: %s" 
       % accuracy_score(target, final_model.predict(X_test)))



Final Accuracy: 0.88152


In [27]:
feature_to_coef = {word: coef for word, coef in zip(cv.get_feature_names(), final_model.coef_[0])}

In [28]:
for best_positive in sorted(feature_to_coef.items(), key=lambda x: x[1], reverse=True)[:5]:
    print(best_positive)

('excellent', 0.9292548987041048)
('perfect', 0.7907005549883321)
('great', 0.674532352444648)
('amazing', 0.6127039793857285)
('superb', 0.6019368114073794)


In [29]:
for best_negative in sorted(feature_to_coef.items(), key=lambda x: x[1], reverse=False)[:5]:
    print(best_negative)

('worst', -1.3645958742800888)
('waste', -1.1664242375718927)
('awful', -1.0324190112779585)
('poorly', -0.8752018717969205)
('boring', -0.8563543399032894)


https://towardsdatascience.com/sentiment-analysis-with-python-part-1-5ce197074184