In [9]:
import pandas as pd
import numpy as np
import re
from bs4 import BeautifulSoup

from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD as TSVD
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.linear_model import LogisticRegression

import spacy
from spacy.lang.en.stop_words import STOP_WORDS as stopwords

import dhrv04_preprocess as pp

In [10]:
data = pd.read_csv('IMDB Dataset.csv')

In [11]:
data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [12]:
data.shape

(50000, 2)

In [13]:
# Converting the text to lowercase

data['review'] = data['review'].apply(lambda x: str(x).lower())

### Preprocessing the data

In [14]:
def preprocess_data(df, col):
    
    df[col] = df[col].apply(lambda x: str(x).lower())
    df[col] = df[col].apply(lambda x: pp.getContractionExpansion(x))
    df[col] = df[col].apply(lambda x: pp.RemoveEmails(x))
    df[col] = df[col].apply(lambda x: pp.RemoveHTMLTags(x))
    df[col] = df[col].apply(lambda x: pp.RemoveSpecialChars(x))
    df[col] = df[col].apply(lambda x: pp.RootWord(x))
    df[col] = df[col].apply(lambda x: pp.RemoveStopWords(x))
    

In [15]:
%%time

preprocess_data(data, 'review')

Wall time: 27min 25s


In [16]:
data.sample(5)

Unnamed: 0,review,sentiment
8915,new year 2006 m watch glimmer man like find st...,negative
20903,man love new dvd universal sentinel look good ...,positive
19221,radio good movie honestly cry movie pretty clo...,positive
28395,read book movie care find somewhat trashy tras...,negative
39569,quagmire mediocrity thissimply frostbite worth...,negative


In [17]:
text = ' '.join(data['review'])

In [18]:
len(text)

34625356

In [19]:
text_series = pd.Series(text.split())

In [20]:
freq_comm = text_series.value_counts()

In [33]:
rare_words = freq_comm[-134000:-1]

In [34]:
rare_words

siri                 1
unexpressive         1
onegrante            1
hippocrates          1
lifesomethe          1
impertinent          1
kubricksigne         1
dancingwho           1
interrogationit      1
damagedreminde       1
pathetichave         1
engagingi            1
deservesdeserve      1
ceosee               1
showrequire          1
changedbefore        1
nearturkey           1
bucksbut             1
brownvelvet          1
quotekisse           1
indiemania           1
silencejame          1
drsuess              1
anymorea             1
hispaniclatin        1
witgensteins         1
geothermic           1
lives1               1
pestario             1
tombhis              1
                    ..
finaleany            1
advanceother         1
yearsacting          1
upbig                1
performancebehind    1
fa18s                1
itviewer             1
effectswrong         1
yearsgood            1
frontierof           1
yakkityyak           1
playedand            1
unpowered  

In [23]:
data['review'] = data['review'].apply(lambda x: ' '.join([word for word in x.split() if word not in rare_words]))

### Converting the data into TF-IDF vector

In [24]:
X = data['review']
y = data['sentiment']

In [25]:
data['sentiment'].value_counts()

negative    25000
positive    25000
Name: sentiment, dtype: int64

In [26]:
tfidf = TfidfVectorizer()

In [27]:
X_vector = tfidf.fit_transform(X)

In [28]:
X_vector

<50000x72122 sparse matrix of type '<class 'numpy.float64'>'
	with 3888715 stored elements in Compressed Sparse Row format>

### Splitting the data into Test and Train

In [35]:
X_train, X_test, y_train, y_test = train_test_split(X_vector, y, test_size=0.5, random_state = 0, stratify = y)

### Deploying Logistic Regression with Default Parameters

In [36]:
clf_lr = LogisticRegression()

In [37]:
clf_lr.fit(X_train, y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [38]:
y_pred = clf_lr.predict(X_test)

In [39]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

    negative       0.90      0.87      0.88     12500
    positive       0.87      0.90      0.88     12500

    accuracy                           0.88     25000
   macro avg       0.88      0.88      0.88     25000
weighted avg       0.88      0.88      0.88     25000



In [40]:
clf_lr.predict(tfidf.transform(['That was a really bad movie']))

array(['negative'], dtype=object)

In [41]:
clf_lr.predict(tfidf.transform(['American Psycho deserved an Oscar, they were robbed']))

array(['positive'], dtype=object)