In [1]:
# Directory holding the movie-review corpus (machine-specific absolute path).
pathname = "/Users/Admin/Documents/Study/studypython/movie_data"

# Open the training file first, then the test file, both as UTF-8 text.
# The handles are left open here because the next cell iterates over them.
file1, file2 = (
    open(pathname + suffix, "r", encoding="utf-8")
    for suffix in ("/full_train.txt", "/full_test.txt")
)


In [2]:
# Load every review (one per line) into memory with surrounding whitespace
# stripped, then close the handle so the file descriptor is not leaked for the
# lifetime of the kernel. Closing also turns an accidental re-run of this cell
# into an explicit error instead of silently producing empty lists from an
# already-exhausted file object.
try:
    train_reviews = [line.strip() for line in file1]
finally:
    file1.close()

try:
    test_reviews = [line.strip() for line in file2]
finally:
    file2.close()

In [3]:
# Sanity check: show the first training review as loaded (whitespace-stripped line).
print(train_reviews[0])

Bromwell High is a cartoon comedy. It ran at the same time as some other programs about school life, such as "Teachers". My 35 years in the teaching profession lead me to believe that Bromwell High's satire is much closer to reality than is "Teachers". The scramble to survive financially, the insightful students who can see right through their pathetic teachers' pomp, the pettiness of the whole situation, all remind me of the schools I knew and their students. When I saw the episode in which a student repeatedly tried to burn down the school, I immediately recalled ......... at .......... High. A classic line: INSPECTOR: I'm here to sack one of your teachers. STUDENT: Welcome to Bromwell High. I expect that many adults of my age think that Bromwell High is far fetched. What a pity that it isn't!


In [4]:
### Data Preprocessing ##
import re

# 1. Remove punctuation and special characters
# The patterns are raw strings so that regex escapes such as \s and \[ are not
# parsed as (invalid) Python string escapes, which newer interpreters warn about.

# Characters deleted outright: . ; : ! ' ? , " ( ) [ ]
pattern1 = re.compile(r"""[.;:!'?,"()\[\]]""")

# Sequences replaced by a single space: a doubled HTML line break
# (<br /><br />), a hyphen, or a slash.
# NOTE(review): a lone <br /> is not matched by the first alternative; only its
# "/" is replaced, leaving "<br  >" in the text.
pattern2 = re.compile(r"<br\s*/><br\s*/>|[-/]")


def _clean_review(text):
    """Lower-case *text*, delete pattern1 matches, then blank out pattern2 matches."""
    return pattern2.sub(" ", pattern1.sub("", text.lower()))


train_reviews = [_clean_review(item) for item in train_reviews]
test_reviews = [_clean_review(item) for item in test_reviews]


In [5]:
# Re-inspect the first training review after lower-casing and punctuation removal.
print(train_reviews[0])

bromwell high is a cartoon comedy it ran at the same time as some other programs about school life such as teachers my 35 years in the teaching profession lead me to believe that bromwell highs satire is much closer to reality than is teachers the scramble to survive financially the insightful students who can see right through their pathetic teachers pomp the pettiness of the whole situation all remind me of the schools i knew and their students when i saw the episode in which a student repeatedly tried to burn down the school i immediately recalled  at  high a classic line inspector im here to sack one of your teachers student welcome to bromwell high i expect that many adults of my age think that bromwell high is far fetched what a pity that it isnt


In [6]:
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize

# 2. Remove stop words
# A set gives constant-time membership checks while filtering tokens.
stop_words = set(stopwords.words('english'))


def _strip_stop_words(documents):
    """Tokenize each document and re-join it without the tokens found in stop_words."""
    return [
        ' '.join(token for token in word_tokenize(doc) if token not in stop_words)
        for doc in documents
    ]


train_reviews_wo_stop = _strip_stop_words(train_reviews)
test_reviews_wo_stop = _strip_stop_words(test_reviews)
    


In [7]:
# First training review after the stop-word filter.
print(train_reviews_wo_stop[0])

bromwell high cartoon comedy ran time programs school life teachers 35 years teaching profession lead believe bromwell highs satire much closer reality teachers scramble survive financially insightful students see right pathetic teachers pomp pettiness whole situation remind schools knew students saw episode student repeatedly tried burn school immediately recalled high classic line inspector im sack one teachers student welcome bromwell high expect many adults age think bromwell high far fetched pity isnt


In [8]:
# 3. a. Stemming

from nltk.stem.porter import PorterStemmer
porter_stemmer = PorterStemmer()


def _stem_documents(documents):
    """Re-tokenize each document and replace every token with its Porter stem."""
    stemmed = []
    for doc in documents:
        stems = map(porter_stemmer.stem, word_tokenize(doc))
        stemmed.append(' '.join(stems))
    return stemmed


# Stem the stop-word-free text of both splits.
train_reviews_wostop_stemmed = _stem_documents(train_reviews_wo_stop)
test_reviews_wostop_stemmed = _stem_documents(test_reviews_wo_stop)



In [9]:
# First test review after stop-word removal and Porter stemming.
print(test_reviews_wostop_stemmed[0])

went saw movi last night coax friend mine ill admit reluct see knew ashton kutcher abl comedi wrong kutcher play charact jake fischer well kevin costner play ben randal profession sign good movi toy emot one exactli entir theater sold overcom laughter first half movi move tear second half exit theater saw mani women tear mani full grown men well tri desper let anyon see cri movi great suggest go see judg


In [10]:
### Feature Engineering - Vectorization ##

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

# Term counts: the vocabulary is learned from the stemmed training text only and
# then reused, unchanged, to encode the test text.
count_vect = CountVectorizer()
train_reviews_counts = count_vect.fit_transform(train_reviews_wostop_stemmed)
test_reviews_counts = count_vect.transform(test_reviews_wostop_stemmed)

# TF-IDF re-weighting: fitted on the training counts, applied to both splits.
tfidf_transformer = TfidfTransformer()
train_reviews_tfidf = tfidf_transformer.fit_transform(train_reviews_counts)
test_reviews_tfidf = tfidf_transformer.transform(test_reviews_counts)

In [11]:
# Shape of the test TF-IDF matrix produced from the stemmed text; its columns
# come from the vocabulary fitted on the training split.
test_reviews_tfidf.shape

(25000, 65269)

In [12]:
### Building a Classifier ##
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier

# Target labels for the training and test data: the first 12.5k reviews are
# positive (1) and the last 12.5k are negative (0).
train_target = [1] * 12500 + [0] * 12500
test_target = [1] * 12500 + [0] * 12500

# Train the classifiers on the same TF-IDF features.
# fit() returns the estimator itself, so construction and training are chained.
# 1. Naive Bayes:
cl1 = MultinomialNB().fit(train_reviews_tfidf, train_target)

# 2. Logistic Regression:
cl2 = LogisticRegression(C = 0.05).fit(train_reviews_tfidf, train_target)

# 3. SVM:
cl3 = SGDClassifier(
    loss='hinge', penalty = 'l2', alpha=1e-3, max_iter=5, random_state=42
).fit(train_reviews_tfidf, train_target)

# Keep the fitted SGD model as the cell's last expression so its repr is displayed.
cl3



SGDClassifier(alpha=0.001, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='hinge', max_iter=5, n_iter=None,
       n_jobs=1, penalty='l2', power_t=0.5, random_state=42, shuffle=True,
       tol=None, verbose=0, warm_start=False)

In [13]:
# Predict the test data labels with each fitted classifier, in the order
# Naive Bayes, Logistic Regression, SVM.
pr1, pr2, pr3 = (clf.predict(test_reviews_tfidf) for clf in (cl1, cl2, cl3))


In [14]:
# Report the test-set accuracy of each classifier.

from sklearn.metrics import accuracy_score

for label, predicted in (
    ("Naive Bayes:", pr1),
    ("Logistic Regression:", pr2),
    ("SVM:", pr3),
):
    print(label, accuracy_score(test_target, predicted))


Naive Bayes: 0.82064
Logistic Regression: 0.84772
SVM: 0.85288


In [16]:
## Lookup table from the first letter of a pos_tag tag to a WordNet POS constant ##
from collections import defaultdict
from nltk.corpus import wordnet

# J / V / R map to adjective / verb / adverb; any other key falls back to noun
# through the defaultdict factory.
tag_map = defaultdict(
    lambda : wordnet.NOUN,
    {'J': wordnet.ADJ, 'V': wordnet.VERB, 'R': wordnet.ADV},
)
    

In [17]:
####### With Lemmatization #######

# 3. b. Lemmatization

# POS Tagging: (In order to lemmatize we need the POS tags)

import nltk
from nltk.stem import WordNetLemmatizer
 
lemmatizer = WordNetLemmatizer()


def _lemmatize_documents(documents):
    """Return each document with every token replaced by its WordNet lemma.

    Each document is tokenized and POS-tagged; the first letter of a token's
    tag selects the WordNet part of speech through ``tag_map``.
    A document that yields no tokens becomes an empty string.
    """
    lemmatized = []
    for doc in documents:
        tagged = nltk.pos_tag(word_tokenize(doc))
        # Iterate the (token, tag) pairs directly: unpacking zip(*tagged) into
        # two names raises ValueError when a document has no tokens.
        lemmas = [
            lemmatizer.lemmatize(token, pos = tag_map[tag[0]])
            for token, tag in tagged
        ]
        lemmatized.append(' '.join(lemmas))
    return lemmatized


train_reviews_wostop_lemma = _lemmatize_documents(train_reviews_wo_stop)
test_reviews_wostop_lemma = _lemmatize_documents(test_reviews_wo_stop)

In [18]:
# First test review after stop-word removal and WordNet lemmatization.
print(test_reviews_wostop_lemma[0])

go saw movie last night coax friend mine ill admit reluctant see knew ashton kutcher able comedy wrong kutcher played character jake fischer well kevin costner play ben randall professionalism sign good movie toy emotion one exactly entire theater sell overcome laughter first half movie move tear second half exit theater saw many woman tear many full grow men well try desperately let anyone see crying movie great suggest go see judge


In [19]:
### Feature Engineering - Vectorization ##

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

# Term counts: the vocabulary is learned from the lemmatized training text only
# and then reused, unchanged, to encode the test text.
# These assignments rebind the feature variables used by the earlier stemming run.
count_vect = CountVectorizer()
train_reviews_counts = count_vect.fit_transform(train_reviews_wostop_lemma)
test_reviews_counts = count_vect.transform(test_reviews_wostop_lemma)

# TF-IDF re-weighting: fitted on the training counts, applied to both splits.
tfidf_transformer = TfidfTransformer()
train_reviews_tfidf = tfidf_transformer.fit_transform(train_reviews_counts)
test_reviews_tfidf = tfidf_transformer.transform(test_reviews_counts)

In [21]:
# Shape of the training TF-IDF matrix built from the lemmatized text.
train_reviews_tfidf.shape

(25000, 82993)

In [22]:
### Building a Classifier ##
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier

# Train the classifiers on the current TF-IDF features.
# fit() returns the estimator itself, so construction and training are chained.
# 1. Naive Bayes:
cl1 = MultinomialNB().fit(train_reviews_tfidf, train_target)

# 2. Logistic Regression:
cl2 = LogisticRegression(C = 0.05).fit(train_reviews_tfidf, train_target)

# 3. SVM:
cl3 = SGDClassifier(
    loss='hinge', penalty = 'l2', alpha=1e-3, max_iter=5, random_state=42
).fit(train_reviews_tfidf, train_target)

# Keep the fitted SGD model as the cell's last expression so its repr is displayed.
cl3



SGDClassifier(alpha=0.001, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='hinge', max_iter=5, n_iter=None,
       n_jobs=1, penalty='l2', power_t=0.5, random_state=42, shuffle=True,
       tol=None, verbose=0, warm_start=False)

In [23]:
# Predict the test data labels with each fitted classifier, in the order
# Naive Bayes, Logistic Regression, SVM.
pr1, pr2, pr3 = (clf.predict(test_reviews_tfidf) for clf in (cl1, cl2, cl3))

In [24]:
# Report the test-set accuracy of each classifier.

from sklearn.metrics import accuracy_score

for label, predicted in (
    ("Naive Bayes:", pr1),
    ("Logistic Regression:", pr2),
    ("SVM:", pr3),
):
    print(label, accuracy_score(test_target, predicted))


Naive Bayes: 0.82536
Logistic Regression: 0.84236
SVM: 0.84464


In [28]:
#### Lemmatizing barely changed the accuracy compared with stemming: it rose slightly for
# Naive Bayes but dropped for Logistic Regression and SVM. ####

## Next, vectorize with n-grams (the runs above were unigram only): build a document-term
# TF-IDF matrix over unigrams and bigrams from the cleaned text, applying stop-word removal
# only and NOT stemming/lemmatization.
from sklearn.feature_extraction.text import TfidfVectorizer

# stop_words='english' makes the vectorizer drop its own built-in English stop words.
tfidf = TfidfVectorizer(ngram_range=(1,2), stop_words = 'english')
train_reviews_tfidf = tfidf.fit_transform(train_reviews)
test_reviews_tfidf = tfidf.transform(test_reviews)


In [29]:
### Building a Classifier ##
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier

# Train the classifiers on the current TF-IDF features.
# fit() returns the estimator itself, so construction and training are chained.
# 1. Naive Bayes:
cl1 = MultinomialNB().fit(train_reviews_tfidf, train_target)

# 2. Logistic Regression:
cl2 = LogisticRegression(C = 0.05).fit(train_reviews_tfidf, train_target)

# 3. SVM:
cl3 = SGDClassifier(
    loss='hinge', penalty = 'l2', alpha=1e-3, max_iter=5, random_state=42
).fit(train_reviews_tfidf, train_target)

# Keep the fitted SGD model as the cell's last expression so its repr is displayed.
cl3


SGDClassifier(alpha=0.001, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='hinge', max_iter=5, n_iter=None,
       n_jobs=1, penalty='l2', power_t=0.5, random_state=42, shuffle=True,
       tol=None, verbose=0, warm_start=False)

In [30]:
# Predict the test data labels with each fitted classifier, in the order
# Naive Bayes, Logistic Regression, SVM.
pr1, pr2, pr3 = (clf.predict(test_reviews_tfidf) for clf in (cl1, cl2, cl3))

In [31]:
# Report the test-set accuracy of each classifier.

from sklearn.metrics import accuracy_score

for label, predicted in (
    ("Naive Bayes:", pr1),
    ("Logistic Regression:", pr2),
    ("SVM:", pr3),
):
    print(label, accuracy_score(test_target, predicted))


Naive Bayes: 0.8548
Logistic Regression: 0.83268
SVM: 0.79808


In [32]:
## Unigram + bigram TF-IDF (ngram_range=(1,2)) on the text that already had
# stop words removed and was lemmatized, followed by training and scoring the
# same three classifiers.
# Imports are grouped here so the cell does not depend on an import made in
# another cell.
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import accuracy_score

# The vocabulary and IDF weights are learned from the training split only.
tfidf = TfidfVectorizer(ngram_range=(1,2))
train_reviews_tfidf = tfidf.fit_transform(train_reviews_wostop_lemma)
test_reviews_tfidf = tfidf.transform(test_reviews_wostop_lemma)

### Building a Classifier ##

# 1. Naive Bayes, 2. Logistic Regression, 3. SVM
cl1 = MultinomialNB()
cl2 = LogisticRegression(C = 0.05)
cl3 = SGDClassifier(loss='hinge', penalty = 'l2', alpha=1e-3, max_iter=5, random_state=42)

# Train the classifiers, in order, on the same features and labels.
for clf in (cl1, cl2, cl3):
    clf.fit(train_reviews_tfidf, train_target)

# Predict the test data labels:
pr1, pr2, pr3 = (clf.predict(test_reviews_tfidf) for clf in (cl1, cl2, cl3))

# Find the accuracy of the various classifiers
for label, predicted in (
    ("Naive Bayes:", pr1),
    ("Logistic Regression:", pr2),
    ("SVM:", pr3),
):
    print(label, accuracy_score(test_target, predicted))


Naive Bayes: 0.85592
Logistic Regression: 0.8338
SVM: 0.80456


In [33]:
## Unigram + bigram + trigram TF-IDF (ngram_range=(1,3)) on the text that
# already had stop words removed and was lemmatized, followed by training and
# scoring the same three classifiers.
# Imports are grouped here so the cell does not depend on an import made in
# another cell.
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import accuracy_score

# The vocabulary and IDF weights are learned from the training split only.
tfidf = TfidfVectorizer(ngram_range=(1,3))
train_reviews_tfidf = tfidf.fit_transform(train_reviews_wostop_lemma)
test_reviews_tfidf = tfidf.transform(test_reviews_wostop_lemma)

### Building a Classifier ##

# 1. Naive Bayes, 2. Logistic Regression, 3. SVM
cl1 = MultinomialNB()
cl2 = LogisticRegression(C = 0.05)
cl3 = SGDClassifier(loss='hinge', penalty = 'l2', alpha=1e-3, max_iter=5, random_state=42)

# Train the classifiers, in order, on the same features and labels.
for clf in (cl1, cl2, cl3):
    clf.fit(train_reviews_tfidf, train_target)

# Predict the test data labels:
pr1, pr2, pr3 = (clf.predict(test_reviews_tfidf) for clf in (cl1, cl2, cl3))

# Find the accuracy of the various classifiers
for label, predicted in (
    ("Naive Bayes:", pr1),
    ("Logistic Regression:", pr2),
    ("SVM:", pr3),
):
    print(label, accuracy_score(test_target, predicted))


Naive Bayes: 0.85928
Logistic Regression: 0.82944
SVM: 0.82068


# Comparison of the Results:


## Remove Stop Words + Stemming (Porter) + Unigram TF-IDF

### Naive Bayes: 0.82064     Logistic Regression: 0.84772    SVM: 0.85288



## Remove Stop Words + Lemmatization (Wordnet) + Unigram TF-IDF

### Naive Bayes: 0.82536    Logistic Regression: 0.84236    SVM: 0.84464



## Remove Stop Words + Unigram-and-Bigram TF-IDF (ngram_range=(1,2))

### Naive Bayes: 0.8548    Logistic Regression: 0.83268    SVM: 0.79808



## Remove Stop Words + Lemmatization (Wordnet) + Unigram-and-Bigram TF-IDF (ngram_range=(1,2))

### Naive Bayes: 0.85592    Logistic Regression: 0.8338    SVM: 0.80456



## Remove Stop Words + Lemmatization (Wordnet) + Unigram-to-Trigram TF-IDF (ngram_range=(1,3))

### Naive Bayes: 0.85928    Logistic Regression: 0.82944    SVM: 0.82068