# Large Movie Review Dataset Sentiment Analysis

In [1]:
import numpy as np
import pandas as pd
import pickle

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split

import warnings
warnings.filterwarnings('ignore')

In [2]:
# read in the review text into lists

reviews_train = []
for line in open('/content/drive/MyDrive/NLP_movie_review/aclImdb/movie_data/full_train.txt', 'r'):
    
    reviews_train.append(line.strip())
    
reviews_test = []
for line in open('/content/drive/MyDrive/NLP_movie_review/aclImdb/movie_data/full_test.txt', 'r'):
    
    reviews_test.append(line.strip())

In [3]:
print("Length of Train set: ", len(reviews_train))
print("Length of Test set: ", len(reviews_test))

Length of Train set:  25000
Length of Test set:  25000


In [4]:
# prepare the target: 1 for positive and -1 for negative

target = [1 if i < 12500 else -1 for i in range(25000)]

In [5]:
reviews_train[0]

'Bromwell High is a cartoon comedy. It ran at the same time as some other programs about school life, such as "Teachers". My 35 years in the teaching profession lead me to believe that Bromwell High\'s satire is much closer to reality than is "Teachers". The scramble to survive financially, the insightful students who can see right through their pathetic teachers\' pomp, the pettiness of the whole situation, all remind me of the schools I knew and their students. When I saw the episode in which a student repeatedly tried to burn down the school, I immediately recalled ......... at .......... High. A classic line: INSPECTOR: I\'m here to sack one of your teachers. STUDENT: Welcome to Bromwell High. I expect that many adults of my age think that Bromwell High is far fetched. What a pity that it isn\'t!'

In [6]:
reviews_test[88]

"I am a huge fan of Harald Zwart, and I just knew that I had to see this movie, even though I can't say I'm a soccer fan. But watching this just filled my heart with joy, and I had a great time in the movies watching it.<br /><br />Bjørn Fast Nagell does a tremendous job directing this movie, and even though you notice the main characters are new at acting, they grow with the movie and makes it what it is. Even though it is supposed to be a soccer movie, there is surprisingly little soccer in it. The whole idea is to show the six guys making up the word N O R W A Y on their trip to the World Cup in soccer playing in Germany this year. <br /><br />If you're only gonna see one Norwegian movie this year, this is the one.."

In [7]:
# regex for preprocessing the text, removing the space and punctuations, also turning words to lower case

import re

REPLACE_NO_SPACE = re.compile("(\.)|(\;)|(\:)|(\!)|(\?)|(\,)|(\")|(\()|(\))|(\[)|(\])|(\d+)")
REPLACE_WITH_SPACE = re.compile("(<br\s*/><br\s*/>)|(\-)|(\/)")
NO_SPACE = ""
SPACE = " "

def preprocess_reviews(reviews):
    
    reviews = [REPLACE_NO_SPACE.sub(NO_SPACE, line.lower()) for line in reviews]
    reviews = [REPLACE_WITH_SPACE.sub(SPACE, line) for line in reviews]
    
    return reviews

reviews_train_clean = preprocess_reviews(reviews_train)
reviews_test_clean = preprocess_reviews(reviews_test)

# Logistic Regression 1
Using:
1. 1 to 3 ngram
2. a simplier list of stop words
3. Logistic Regression

In [8]:
from sklearn.linear_model import LogisticRegression

stop_words = ['in', 'of', 'at', 'a', 'the'] # a more simple list of stop words
ngram_vectorizer = CountVectorizer(binary=True, ngram_range=(1, 3), stop_words=stop_words) #using 1 to 3 ngrams
ngram_vectorizer.fit(reviews_train_clean)
train_set = ngram_vectorizer.transform(reviews_train_clean)
test_set = ngram_vectorizer.transform(reviews_test_clean)

X_train = train_set
y_train = target
X_test, X_val, y_test, y_val = train_test_split(
      test_set, target, train_size = 0.5
)

for c in [0.01, 0.05, 0.25, 0.5, 1]:
    
    lr = LogisticRegression(C=c)
    lr.fit(X_train, y_train)
    print ("Accuracy for C=%s: %s" 
           % (c, accuracy_score(y_val, lr.predict(X_val))))


Accuracy for C=0.01: 0.88576
Accuracy for C=0.05: 0.89512
Accuracy for C=0.25: 0.89624
Accuracy for C=0.5: 0.8964
Accuracy for C=1: 0.89656


In [9]:
final_baseline = LogisticRegression(C=1) # taking c with highest accuracy
final_baseline.fit(X_train, y_train)
print ("Final Accuracy of Best Model: %s" 
       % accuracy_score(y_test, final_baseline.predict(X_test)))

print("Confusion Matrix of Best Model:")
print(confusion_matrix(y_test, final_baseline.predict(X_test)))

Final Accuracy of Best Model: 0.9024
Confusion Matrix of Best Model:
[[5627  634]
 [ 586 5653]]


# Logistic Regression 2
Using:
1. 1 to 3 ngram
2. a simplier list of stop words
3. Lemmatization
4. Logistic Regression

In [10]:
# lemmatization for the corpus

def get_lemmatized_text(corpus):
    
    import nltk
    nltk.download('wordnet')
    from nltk.stem import WordNetLemmatizer
    lemmatizer = WordNetLemmatizer()
    return [' '.join([lemmatizer.lemmatize(word) for word in review.split()]) for review in corpus]

lemmatized_reviews_train = get_lemmatized_text(reviews_train_clean)
lemmatized_reviews_test = get_lemmatized_text(reviews_test_clean)

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [11]:
from sklearn.linear_model import LogisticRegression

stop_words = ['in', 'of', 'at', 'a', 'the'] # a more simple list of stop words
ngram_vectorizer = CountVectorizer(binary=True, ngram_range=(1, 3), stop_words=stop_words) #using 1 to 3 ngrams
ngram_vectorizer.fit(lemmatized_reviews_train)
train_set = ngram_vectorizer.transform(lemmatized_reviews_train)
test_set = ngram_vectorizer.transform(lemmatized_reviews_test)

X_train = train_set
y_train = target
X_test, X_val, y_test, y_val = train_test_split(
      test_set, target, train_size = 0.5
)

for c in [0.01, 0.05, 0.25, 0.5, 1]:
    
    lr = LogisticRegression(C=c)
    lr.fit(X_train, y_train)
    print ("Accuracy for C=%s: %s" 
           % (c, accuracy_score(y_val, lr.predict(X_val))))


Accuracy for C=0.01: 0.88656
Accuracy for C=0.05: 0.8936
Accuracy for C=0.25: 0.89632
Accuracy for C=0.5: 0.8968
Accuracy for C=1: 0.89648


In [12]:
final_baseline2 = LogisticRegression(C=0.5) # taking c with highest accuracy
final_baseline2.fit(X_train, y_train)
print ("Final Accuracy of Best Model: %s" 
       % accuracy_score(y_test, final_baseline2.predict(X_test)))

print("Confusion Matrix of Best Model:")
print(confusion_matrix(y_test, final_baseline2.predict(X_test)))

Final Accuracy of Best Model: 0.90224
Confusion Matrix of Best Model:
[[5562  662]
 [ 560 5716]]


# Linear Support Vector Classification 1
Using:
1. 1 to 3 ngram
2. a simplier list of stop words
3. Linear Support Vector  

In [14]:
from sklearn.svm import LinearSVC

stop_words = ['in', 'of', 'at', 'a', 'the'] # a more simple list of stop words
ngram_vectorizer = CountVectorizer(binary=True, ngram_range=(1, 3), stop_words=stop_words) #using 1 to 3 ngrams
ngram_vectorizer.fit(reviews_train_clean)
train_set = ngram_vectorizer.transform(reviews_train_clean)
test_set = ngram_vectorizer.transform(reviews_test_clean)

X_train = train_set
y_train = target
X_test, X_val, y_test, y_val = train_test_split(
      test_set, target, train_size = 0.5
)

for c in [0.001, 0.005, 0.01, 0.05, 0.1]:
    
    svm = LinearSVC(C=c)
    svm.fit(X_train, y_train)
    print ("Accuracy for C=%s: %s" 
           % (c, accuracy_score(y_val, svm.predict(X_val))))


Accuracy for C=0.001: 0.89584
Accuracy for C=0.005: 0.90112
Accuracy for C=0.01: 0.9016
Accuracy for C=0.05: 0.9
Accuracy for C=0.1: 0.89928


In [15]:
final_svm = LinearSVC(C=0.001) # taking c with highest accuracy
final_svm.fit(X_train, y_train)
print ("Final Accuracy: %s" 
       % accuracy_score(y_test, final_svm.predict(X_test)))

print("Confusion Matrix of Best Model:")
print(confusion_matrix(y_test, final_svm.predict(X_test)))

Final Accuracy: 0.89576
Confusion Matrix of Best Model:
[[5539  689]
 [ 614 5658]]


# Linear Support Vector Classification 2
Using:
1. 1 to 3 ngram
2. a simplier list of stop words
3. Lemmatization
4. Linear Support Vector  

In [16]:
from sklearn.svm import LinearSVC

stop_words = ['in', 'of', 'at', 'a', 'the'] # a more simple list of stop words
ngram_vectorizer = CountVectorizer(binary=True, ngram_range=(1, 3), stop_words=stop_words) #using 1 to 3 ngrams
ngram_vectorizer.fit(lemmatized_reviews_train)
train_set = ngram_vectorizer.transform(lemmatized_reviews_train)
test_set = ngram_vectorizer.transform(lemmatized_reviews_test)

X_train = train_set
y_train = target
X_test, X_val, y_test, y_val = train_test_split(
      test_set, target, train_size = 0.5
)

for c in [0.001, 0.005, 0.01, 0.05, 0.1]:
    
    svm = LinearSVC(C=c)
    svm.fit(X_train, y_train)
    print ("Accuracy for C=%s: %s" 
           % (c, accuracy_score(y_val, svm.predict(X_val))))

Accuracy for C=0.001: 0.89808
Accuracy for C=0.005: 0.902
Accuracy for C=0.01: 0.9024
Accuracy for C=0.05: 0.9032
Accuracy for C=0.1: 0.9024


In [17]:
final_svm2 = LinearSVC(C=0.05) # taking c=0.05 for highest accuracy
final_svm2.fit(X_train, y_train)
print ("Final Accuracy: %s" 
       % accuracy_score(y_test, final_svm2.predict(X_test)))

print("Confusion Matrix of Best Model:")
print(confusion_matrix(y_test, final_svm2.predict(X_test)))

Final Accuracy: 0.898
Confusion Matrix of Best Model:
[[5529  688]
 [ 587 5696]]


Experimented with linear models of logistic regression and Linear Support Vector Classification, both models gave similar performance, with accuracy close to 90%.

Also, tried out both linear models with features after lemmatization. The results did not show lemmatization helps in improving accuracy greatly.

In [21]:
# save "final_baseline" logistic regression model to disk

Pkl_Filename = "/content/drive/MyDrive/NLP_movie_review/final_baseline_lr.pkl"  

with open(Pkl_Filename, 'wb') as file:  
    pickle.dump(final_baseline, file)