In [1]:
# Importing the necessary packages
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelBinarizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import roc_auc_score, accuracy_score
import pickle

In [2]:
nltk.download("stopwords")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\praab\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
# Importing the IMDB dataset 
dataset = pd.read_csv('imdb_data.csv')
dataset

Unnamed: 0,review,sentiment
0,one reviewers mention watch 1 oz episode youll...,positive
1,wonderful little production film technique una...,positive
2,think wonderful way spend time hot summer week...,positive
3,basically theres family little boy jake think ...,negative
4,petter matteis love time money visually stun f...,positive
...,...,...
49995,think movie right good job wasnt creative orig...,positive
49996,bad plot bad dialogue bad act idiotic direct a...,negative
49997,catholic teach parochial elementary school nun...,negative
49998,im go disagree previous comment side maltin on...,negative


## Labelling the sentiments

In [4]:
# Labeling the sentiment data
lb = LabelBinarizer()

# Transformed sentiment data
sentiment_data = lb.fit_transform(dataset['sentiment'])
print(sentiment_data.shape)
print(sentiment_data)

(50000, 1)
[[1]
 [1]
 [1]
 ...
 [0]
 [0]
 [0]]


## Splitting the dataset

In [5]:
X_train_reviews, X_test_reviews, y_train_sentiments, y_test_sentiments = train_test_split(
    dataset['review'], sentiment_data, test_size = 0.20, random_state = 42)

In [6]:
print(X_train_reviews)
print(y_train_sentiments)

39087    thats keep ask many fight scream match swear g...
30893    watch entire movie could watch entire movie st...
45278    touch love story reminiscent mood love draw he...
16398    latterday fulci schlocker totally abysmal conc...
13653    first firmly believe norwegian movies continua...
                               ...                        
11284    shadow magic recapture joy amazement first mov...
44732    find movie quite enjoyable fairly entertain go...
38158    avoid one terrible movie excite pointless murd...
860      production quite surprise absolutely love obsc...
15795    decent movie although little bite short time p...
Name: review, Length: 40000, dtype: object
[[0]
 [0]
 [1]
 ...
 [0]
 [1]
 [1]]


## Vectorizing the reviews text for both train and test datasets using TF-IDF as a feature extraction

In [7]:
# Tfidf vectorizer
tv = TfidfVectorizer(ngram_range = (1,3))

In [8]:
# Transformed train reviews
tv_train_reviews = tv.fit_transform(X_train_reviews)

# Transformed test reviews
tv_test_reviews = tv.transform(X_test_reviews)

In [9]:
print('Tfidf_train_reviews:', tv_train_reviews.shape)
print('Tfidf_test_reviews:', tv_test_reviews.shape)

Tfidf_train_reviews: (40000, 6984669)
Tfidf_test_reviews: (10000, 6984669)


## Training and fitting the dataset using Multinomial Naive Bayes model 
From our earlier sentiment analysis, it is evident that Multinomial Naive Bayes Classifier has better accuracy in all evaluation metrics compared to other along with minimum training time.

In [11]:
# Training the Multinomial Naive Bayes Classifier
mnb = MultinomialNB()

# Fitting the Multinomial Naive Bayes Classifier for Tfidf features
mnb.fit(tv_train_reviews, y_train_sentiments.ravel())

MultinomialNB()

In [12]:
# Predicting the model for Tfidf features
mnb_tfidf_predictions = mnb.predict(tv_test_reviews)
print(mnb_tfidf_predictions)

[1 1 0 ... 1 0 0]


In [13]:
# Accuracy score for Tfidf features
mnb_tfidf_score = accuracy_score(y_test_sentiments, mnb_tfidf_predictions)
print("mnb_tfidf_score:", mnb_tfidf_score)

mnb_tfidf_score: 0.8903


In [14]:
# Serializing the Multinomial Naive Bayes Classifier using pickle and saving it for our final implementation
pickle.dump(mnb, open('pickle/classifier.pkl', 'wb'))