In [1]:
# Importing the text files and appending the reviews in lists

def load_reviews(path):
    """Read a UTF-8 text file and return its lines as a list of reviews.

    Each line of the file becomes one list entry, with leading and trailing
    whitespace (including the newline) stripped.

    Parameters
    ----------
    path : str
        Location of the text file to read.

    Returns
    -------
    list of str
        The stripped lines, in file order.
    """
    # The context manager closes the handle even if reading fails part-way.
    with open(path, 'r', encoding="utf8") as review_file:
        return [line.strip() for line in review_file]

reviews_train = load_reviews('movie_data/full_train.txt')
reviews_test = load_reviews('movie_data/full_test.txt')

# Assigning the Labels for the train and test data
# The first 12,500 entries are labelled 1 and the remaining 12,500 are labelled 0.
target = [1] * 12500 + [0] * 12500

In [2]:
# Removing unnecessary symbols and converting the reviews to lowercase

import re

# Raw strings keep the regex escapes (\d, \s, \[ ...) away from Python's own
# string-escape processing, which warns about unrecognised sequences.
# Characters that are deleted outright: . ; : ! ? , " ( ) [ ] and runs of digits.
REPLACE_NO_SPACE = re.compile(r'[.;:!?,"()\[\]]|\d+')
# Sequences that are turned into a single space: a doubled <br /> tag,
# a hyphen, or a forward slash.
REPLACE_WITH_SPACE = re.compile(r"<br\s*/><br\s*/>|[-/]")
NO_SPACE = ""
SPACE = " "

def preprocess_reviews(reviews):
    """Lower-case each review and strip or space-out punctuation.

    For every review the text is lower-cased, then everything matched by
    ``REPLACE_NO_SPACE`` is removed, and finally everything matched by
    ``REPLACE_WITH_SPACE`` is replaced with one space.

    Parameters
    ----------
    reviews : iterable of str
        Raw review texts.

    Returns
    -------
    list of str
        The cleaned reviews, in the same order as the input.
    """
    return [
        REPLACE_WITH_SPACE.sub(SPACE, REPLACE_NO_SPACE.sub(NO_SPACE, line.lower()))
        for line in reviews
    ]

# Run the same cleaning step over the train and test reviews.
reviews_train_clean, reviews_test_clean = map(
    preprocess_reviews, (reviews_train, reviews_test)
)

In [3]:
# Vectorizing the data and removing stop words

from sklearn.feature_extraction.text import CountVectorizer

# Small hand-picked stop-word list passed to the vectorizer.
stop_words = ['in', 'of', 'at', 'a', 'the']
# binary=True records presence/absence rather than counts;
# ngram_range=(1, 3) extracts unigrams, bigrams and trigrams.
ngram_vectorizer = CountVectorizer(binary=True, ngram_range=(1, 3), stop_words=stop_words)
# fit_transform learns the vocabulary and builds the training matrix in one
# pass instead of tokenizing the training reviews twice (fit, then transform).
X = ngram_vectorizer.fit_transform(reviews_train_clean)
# The test reviews are only transformed, so they are encoded with the
# vocabulary learned from the training reviews.
X_test = ngram_vectorizer.transform(reviews_test_clean)

In [5]:
# Finding the correct C parameter for the SVC

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.svm import LinearSVC

# Hold out 25% of the training reviews for validation. The fixed seed makes
# the split, and therefore the accuracy printed for each C, repeatable across
# kernel restarts; without it every run scores a different random split.
X_train, X_val, y_train, y_val = train_test_split(
    X, target, train_size=0.75, random_state=42
)
for c in [0.001, 0.005, 0.01, 0.05, 0.1]:
    # Seed the solver as well so a given C always yields the same model.
    svm = LinearSVC(C=c, random_state=42)
    svm.fit(X_train, y_train)
    print ("Accuracy for C=%s: %s" % (c, accuracy_score(y_val, svm.predict(X_val))))

Accuracy for C=0.001: 0.8824
Accuracy for C=0.005: 0.88752
Accuracy for C=0.01: 0.888
Accuracy for C=0.05: 0.8864
Accuracy for C=0.1: 0.88656


In [6]:
# Final Model

# Refit on the full training matrix with C=0.01.
final = LinearSVC(C=0.01).fit(X, target)
# NOTE(review): `target` is reused as the test labels, which assumes the test
# file has the same size and label ordering as the training file -- confirm.
test_predictions = final.predict(X_test)
final_accuracy = accuracy_score(target, test_predictions)
print ("Final Accuracy: %s" % final_accuracy)

Final Accuracy: 0.90064


In [7]:
# Top Ten Features

# get_feature_names() was removed in scikit-learn 1.2; prefer its replacement
# get_feature_names_out() and fall back only on releases that predate it.
_get_feature_names = getattr(ngram_vectorizer, "get_feature_names_out", None)
if _get_feature_names is None:
    _get_feature_names = ngram_vectorizer.get_feature_names

# Map every n-gram to its learned weight. Casting to plain str/float keeps the
# printed tuples in the form ('word', 0.123) regardless of the NumPy scalar repr.
feature_to_coef = {
    str(word): float(coef) for word, coef in zip(_get_feature_names(), final.coef_[0])
}
print("Top 10 Best Positive Words")
# Largest coefficients first.
for best_positive in sorted(feature_to_coef.items(), key=lambda x: x[1], reverse=True)[:10]:
    print (best_positive)

print("\n\n")
print("Top 10 Best Negative Words")
# Smallest (most negative) coefficients first.
for best_negative in sorted(feature_to_coef.items(), key=lambda x: x[1])[:10]:
    print (best_negative)

Top 10 Best Positive Words
('excellent', 0.2293215003000768)
('perfect', 0.18456042139913043)
('great', 0.17897485734680837)
('wonderful', 0.16014961375172793)
('amazing', 0.1541167865466618)
('superb', 0.14690756754242126)
('enjoyable', 0.14346762985178346)
('best', 0.13042554793274896)
('today', 0.12939426866280332)
('fun', 0.12682167448162868)



Top 10 Best Negative Words
('worst', -0.35899086740800534)
('awful', -0.255057522591135)
('boring', -0.24068185067131959)
('waste', -0.23683697684851077)
('bad', -0.22181964820606456)
('poor', -0.20193935428363136)
('terrible', -0.19984465028953458)
('dull', -0.18413720898632774)
('poorly', -0.1753406869434352)
('disappointment', -0.1748853421303067)
