# IMDB Movie Reviews Sentiment Analysis and Model Deployment with Flask

In [1]:
import pandas as pd
import numpy as np

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [3]:
import preprocess_smz as ps

In [8]:
df = pd.read_csv('data/imdb_reviews.txt', sep = '\t', header = None, names=['reviews','sentiment'])

In [9]:
df.head()

Unnamed: 0,reviews,sentiment
0,"A very, very, very slow-moving, aimless movie ...",0
1,Not sure who was more lost - the flat characte...,0
2,Attempting artiness with black & white and cle...,0
3,Very little music or anything to speak of.,0
4,The best scene in the movie was when Gerardo i...,1


# Text Cleaning

In [12]:
df['reviews'] = df['reviews'].apply(lambda x: ps.cont_exp(x))
df['reviews'] = df['reviews'].apply(lambda x: ps.remove_special_chars(x))

df['reviews'] = df['reviews'].apply(lambda x: ps.remove_accented_chars(x))
df['reviews'] = df['reviews'].apply(lambda x: ps.remove_emails(x))
df['reviews'] = df['reviews'].apply(lambda x: ps.remove_html_tags(x))
df['reviews'] = df['reviews'].apply(lambda x: ps.remove_urls(x))
df['reviews'] = df['reviews'].apply(lambda x: ps.make_base(x))
df['reviews'] = df['reviews'].apply(lambda x: str(x).lower())


df


Unnamed: 0,reviews,sentiment
0,a very very very slowmoving aimless movie abou...,0
1,not sure who was more lose the flat character ...,0
2,attempt artiness with black white and clever c...,0
3,very little music or anything to speak of,0
4,the good scene in the movie was when gerardo i...,1
...,...,...
743,i just get bored watch jessice lange take her ...,0
744,unfortunately any virtue in this film producti...,0
745,in a word it is embarrassing,0
746,exceptionally bad,0


## Data Preparation for Model Training

In [13]:
X = df['reviews']
y = df['sentiment']

In [14]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0, stratify = y)

In [15]:
X_train.shape, X_test.shape

((598,), (150,))

## ML Logistic Regression Model Building 

In [16]:
tfidf_obj = TfidfVectorizer()
lr_obj = LogisticRegression(solver = 'liblinear')
pipe = Pipeline([
    ('tfidf', tfidf_obj ),
    ('clf', lr_obj)
])

In [17]:
hyperparameters = {
    'tfidf__max_df': (0.5, 1.0),
    'tfidf__ngram_range': ((1,1), (1,2)),
    'tfidf__use_idf': (True, False),
    'tfidf__analyzer': ('word', 'char', 'char_wb'),
    'clf__penalty': ('l2', 'l1'),
    'clf__C': (1,2)
}

In [18]:
clf = GridSearchCV(pipe, hyperparameters, n_jobs=-1, cv = None)

In [19]:
%%time
clf.fit(X_train, y_train)

Wall time: 8.54 s


GridSearchCV(estimator=Pipeline(steps=[('tfidf', TfidfVectorizer()),
                                       ('clf',
                                        LogisticRegression(solver='liblinear'))]),
             n_jobs=-1,
             param_grid={'clf__C': (1, 2), 'clf__penalty': ('l2', 'l1'),
                         'tfidf__analyzer': ('word', 'char', 'char_wb'),
                         'tfidf__max_df': (0.5, 1.0),
                         'tfidf__ngram_range': ((1, 1), (1, 2)),
                         'tfidf__use_idf': (True, False)})

In [20]:
clf.best_estimator_

Pipeline(steps=[('tfidf', TfidfVectorizer(max_df=0.5)),
                ('clf', LogisticRegression(C=2, solver='liblinear'))])

In [21]:
clf.best_params_

{'clf__C': 2,
 'clf__penalty': 'l2',
 'tfidf__analyzer': 'word',
 'tfidf__max_df': 0.5,
 'tfidf__ngram_range': (1, 1),
 'tfidf__use_idf': True}

In [22]:
clf.best_score_

0.7692016806722689

In [23]:
y_pred = clf.predict(X_test)

In [24]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.71      0.68      0.70        73
           1       0.71      0.74      0.73        77

    accuracy                           0.71       150
   macro avg       0.71      0.71      0.71       150
weighted avg       0.71      0.71      0.71       150



In [28]:
clf.best_estimator_

Pipeline(steps=[('tfidf', TfidfVectorizer(max_df=0.5)),
                ('clf', LogisticRegression(C=2, solver='liblinear'))])

In [50]:
x = ['Jown wick ... he is best as always', 'you can watch yourself in mirror rather than watching this bad planned silly movie']
clf.predict(x)

array([1, 0], dtype=int64)

## SVM Model 

In [29]:
from sklearn.svm import LinearSVC

In [31]:
pipe = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('clf', LinearSVC())
])

In [32]:
hyperparameters = {
    'tfidf__max_df': [0.5, 1.0],
    'tfidf__ngram_range': [(1,1), (1,2)],
    'tfidf__use_idf': [True, False],
    'tfidf__analyzer': ['word', 'char', 'char_wb'],
    'clf__C': [1,2,2.5,3]
}

In [33]:
clf = GridSearchCV(pipe, hyperparameters, n_jobs=-1, cv = 5)

In [34]:
%%time
clf.fit(X_train, y_train)

Wall time: 8.45 s


GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('tfidf', TfidfVectorizer()),
                                       ('clf', LinearSVC())]),
             n_jobs=-1,
             param_grid={'clf__C': [1, 2, 2.5, 3],
                         'tfidf__analyzer': ['word', 'char', 'char_wb'],
                         'tfidf__max_df': [0.5, 1.0],
                         'tfidf__ngram_range': [(1, 1), (1, 2)],
                         'tfidf__use_idf': [True, False]})

In [35]:
clf.best_estimator_

Pipeline(steps=[('tfidf', TfidfVectorizer(max_df=0.5)),
                ('clf', LinearSVC(C=1))])

In [36]:
clf.best_params_

{'clf__C': 1,
 'tfidf__analyzer': 'word',
 'tfidf__max_df': 0.5,
 'tfidf__ngram_range': (1, 1),
 'tfidf__use_idf': True}

In [37]:
clf.best_score_

0.7808823529411765

In [38]:
y_pred = clf.predict(X_test)

In [39]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.69      0.71      0.70        73
           1       0.72      0.70      0.71        77

    accuracy                           0.71       150
   macro avg       0.71      0.71      0.71       150
weighted avg       0.71      0.71      0.71       150



## Model Testing and Saving 

In [46]:
x = ['Jown wick ... he is best as always', 'you can watch yourself in mirror rather than watching this bad planned silly movie']

In [54]:
clf.predict(["overrated movie"])

array([1], dtype=int64)

In [48]:
import pickle as pkl

In [49]:
pkl.dump(clf, open('model.pkl', 'wb'))