## importing libraries

In [1]:
import sys
import nltk
import sklearn
import pandas as pd
import numpy as np
from tqdm import tqdm
from bs4 import BeautifulSoup
import re

'''
some important links
https://www.kaggle.com/vladislavslevin/nltk-word2vec-bag-of-words-bags-of-popcorn
https://www.kaggle.com/sameerdev7/93-f-score-bag-of-words-m-bags-of-popcorn-with-rf
'''

## load the dataset

In [2]:
df = pd.read_csv('/users/dharamvir/Downloads/homl2020imdb/train.csv')

In [3]:
df.shape

(35000, 2)

In [4]:
df = df.drop_duplicates()

In [5]:
df.shape

(34793, 2)

In [6]:
#checking class distrubution
df.sentiment.value_counts() #the data seems balanced

positive    17435
negative    17358
Name: sentiment, dtype: int64

## preprocess the data

In [7]:
from sklearn.preprocessing import LabelEncoder

In [8]:
encoder = LabelEncoder()
df.sentiment = encoder.fit_transform(df.sentiment)

df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,1
1,A wonderful little production. <br /><br />The...,1
2,I thought this was a wonderful way to spend ti...,1
3,Basically there's a family where a little boy ...,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",1


In [9]:
#regex expressions
def prepro_review(review):
    # remove all html tags
    review = BeautifulSoup(review,'html.parser').get_text()
    
    # Replace email addresses with 'email'
    review = re.sub(r'^.+@[^\.].*\.[a-z]{2,}$',' ',review)
    
    # Replace URLs with 'webaddress'
    review = re.sub(r'^http\://[a-zA-Z0-9\-\.]+\.[a-zA-Z]{2,3}(/\S*)?$',' ', review)
    
    # Replace money symbols with 'moneysymb'
    review = re.sub(r'£|\$', ' ', review)
    
    # Replace 10 digit phone numbers (formats include paranthesis, spaces, no spaces, dashes) with 'phonenumber'
    review = re.sub(r'^\(?[\d]{3}\)?[\s-]?[\d]{3}[\s-]?[\d]{4}$',' ', review)
    
    # remove number
    review = re.sub(r'\d+(\.\d+)?', ' ', review)
    
    # Remove punctuation
    review = re.sub(r'[^\w\d\s]', ' ', review)
    
    # Replace whitespace between terms with a single space
    review = re.sub(r'\s+', ' ', review)
    
    # Remove leading and trailing whitespace
    review = re.sub(r'^\s+|\s+?$', '', review)
    
    # change words to lower case
    review = review.lower()
    
    return review

In [10]:
df['processed_review'] = df['review'].apply(prepro_review)

In [11]:
df['processed_review'][0]

'one of the other reviewers has mentioned that after watching just oz episode you ll be hooked they are right as this is exactly what happened with me the first thing that struck me about oz was its brutality and unflinching scenes of violence which set in right from the word go trust me this is not a show for the faint hearted or timid this show pulls no punches with regards to drugs sex or violence its is hardcore in the classic use of the word it is called oz as that is the nickname given to the oswald maximum security state penitentary it focuses mainly on emerald city an experimental section of the prison where all the cells have glass fronts and face inwards so privacy is not high on the agenda em city is home to many aryans muslims gangstas latinos christians italians irish and more so scuffles death stares dodgy dealings and shady agreements are never far away i would say the main appeal of the show is due to the fact that it goes where other shows wouldn t dare forget pretty p

In [12]:
from nltk.corpus import stopwords

# remove stop words from text messages
stop_words = set(stopwords.words('english'))

df['processed_review'] = df['processed_review'].apply(lambda x: ' '.join(term for term in x.split() if term not in stop_words))
   

In [13]:
df['processed_review'][0]

'one reviewers mentioned watching oz episode hooked right exactly happened first thing struck oz brutality unflinching scenes violence set right word go trust show faint hearted timid show pulls punches regards drugs sex violence hardcore classic use word called oz nickname given oswald maximum security state penitentary focuses mainly emerald city experimental section prison cells glass fronts face inwards privacy high agenda em city home many aryans muslims gangstas latinos christians italians irish scuffles death stares dodgy dealings shady agreements never far away would say main appeal show due fact goes shows dare forget pretty pictures painted mainstream audiences forget charm forget romance oz mess around first episode ever saw struck nasty surreal say ready watched developed taste oz got accustomed high levels graphic violence violence injustice crooked guards sold nickel inmates kill order get away well mannered middle class inmates turned prison bitches due lack street skill

In [14]:
from nltk.corpus import wordnet

def get_wordnet_pos(word):
    """Map POS tag to first character lemmatize() accepts"""
    tag = nltk.pos_tag([word])[0][1][0].upper()
    tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}

    return tag_dict.get(tag, wordnet.NOUN)

In [15]:
# Remove word stems using a Porter stemmer
#lemmetizing each word
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()
df['processed_review'] =  df['processed_review'].apply(lambda x: ' '.join(lemmatizer.lemmatize(term, get_wordnet_pos(term)) for term in x.split()))


In [16]:
df['processed_review'][0]

'one reviewer mention watch oz episode hooked right exactly happen first thing struck oz brutality unflinching scene violence set right word go trust show faint hearted timid show pull punch regard drug sex violence hardcore classic use word call oz nickname give oswald maximum security state penitentary focus mainly emerald city experimental section prison cell glass front face inwards privacy high agenda em city home many aryan muslim gangsta latino christian italian irish scuffle death stare dodgy dealing shady agreement never far away would say main appeal show due fact go show dare forget pretty picture paint mainstream audience forget charm forget romance oz mess around first episode ever saw struck nasty surreal say ready watch developed taste oz get accustom high level graphic violence violence injustice crooked guard sell nickel inmate kill order get away well mannered middle class inmate turn prison bitch due lack street skill prison experience watch oz may become comfortable

## Generating Features

In [17]:
# use CountVectorizer scikit-learn object to create bag of words
from sklearn.feature_extraction.text import CountVectorizer 
vectorizer = CountVectorizer(analyzer = "word", tokenizer = None, preprocessor = None, stop_words = None, max_features = 5000) 

In [18]:
clean_train_reviews = []
for review in df['processed_review']:
    clean_train_reviews.append(review)

In [20]:
len(clean_train_reviews)

34793

In [21]:
# fit-transform learns the vocabulary and transforms training data into feature vectors
train_data_features = vectorizer.fit_transform(clean_train_reviews)

# transform list of strings to numpy array for more efficiency
train_data_features = train_data_features.toarray()

In [22]:
train_data_features.shape

(34793, 5000)

In [23]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(train_data_features,df["sentiment"],test_size=0.2,stratify=df["sentiment"],random_state=60616)

In [29]:
X_train.shape

(27834, 5000)

## Scikit-Learn Classifiers

In [30]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, f1_score

# Define models to train
names = ["K Nearest Neighbors", "Decision Tree", "Random Forest", "Logistic Regression", "SGD Classifier",
         "Naive Bayes"]

classifiers = [
    KNeighborsClassifier(n_neighbors=2),
    DecisionTreeClassifier(random_state=60616),
    RandomForestClassifier(n_estimators = 100),
    LogisticRegression(random_state=60616),
    SGDClassifier(max_iter = 100, tol=1e-3),
    MultinomialNB()
]

for name, classifier in zip(names, classifiers):
    model = classifier
    model.fit(X_train, y_train)
    result = model.predict(X_test)
    accuracy = accuracy_score(y_test,result)
    f1=f1_score(y_test,result)
    print("{} Accuracy: {}".format(name, accuracy*100))
    print("{} F1 Score: {}\n".format(name, f1*100))

K Nearest Neighbors Accuracy: 61.8623365426067
K Nearest Neighbors F1 Score: 56.42036124794745

Decision Tree Accuracy: 71.05906020980026
Decision Tree F1 Score: 70.88753975137323

Random Forest Accuracy: 83.90573358241127
Random Forest F1 Score: 83.6924868957484



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Logistic Regression Accuracy: 86.90903865497917
Logistic Regression F1 Score: 86.90527526232572

SGD Classifier Accuracy: 86.33424342577956
SGD Classifier F1 Score: 86.1228658981468

Naive Bayes Accuracy: 85.12717344446041
Naive Bayes F1 Score: 84.95858160151141



In [38]:
mymodel = LogisticRegression(random_state=60616, max_iter=350)
mymodel.fit(X_train, y_train)
result = mymodel.predict(X_test)

accuracy = accuracy_score(y_test,result)
f1=f1_score(y_test,result)

print("Accuracy: {}".format(accuracy*100))
print("F1 Score: {}\n".format(f1*100))

Accuracy: 86.8659290127892
F1 Score: 86.86026451983899



## getting the prediction of test.csv

In [39]:
test_df = pd.read_csv('/users/dharamvir/Downloads/homl2020imdb/test.csv')

In [40]:
test_df.head()

Unnamed: 0,id,review
0,1,Surviving Christmas is a surprisingly funny mo...
1,2,"Actually this movie has silly moments, both in..."
2,3,If ever there were an inspiring story that cou...
3,4,Hong Kong filmmaker Chang Chang Ho's 1972 mart...
4,5,There was no characterization in this movie an...


In [65]:
test_df.head()

Unnamed: 0,id,review
0,1,Surviving Christmas is a surprisingly funny mo...
1,2,"Actually this movie has silly moments, both in..."
2,3,If ever there were an inspiring story that cou...
3,4,Hong Kong filmmaker Chang Chang Ho's 1972 mart...
4,5,There was no characterization in this movie an...


In [68]:
#preprocessing the test data

test_clean_review = []
for review in test_df['review']:
    review = prepro_review(review)
    
    split_review1 = review.split()
    meaningful_words = [w for w in split_review1 if w not in stop_words]
    sentencewithoutstopword = " ".join(meaningful_words)
    
    split_review2 = sentencewithoutstopword.split()
    lemmetize_words = [lemmatizer.lemmatize(term, get_wordnet_pos(term)) for term in split_review2]
    lemmetizedsentence = " ".join(lemmetize_words)

    test_clean_review.append(lemmetizedsentence)

In [71]:
# get a bag of words of test data
test_data_features = vectorizer.transform(test_clean_review)

# transform to numpy array for more efficiency
test_data_features = test_data_features.toarray()

In [77]:
testresult = mymodel.predict(test_data_features)

In [78]:
output = pd.DataFrame(data={"id":test_df["id"], "sentiment":testresult})

In [81]:
output.to_csv("/users/dharamvir/Downloads/homl2020imdb/test_result.csv", index=False)