In [20]:
import pickle
import re
from functools import lru_cache

import nltk
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from sklearn import metrics
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

In [4]:
# Load the labelled reviews from a tab-separated file whose first row is the header.
# quoting=3 is csv.QUOTE_NONE: quote characters are kept as literal text.
data_review = pd.read_csv(
    "data/labeledTrainData.tsv",
    header=0,
    delimiter="\t",
    quoting=3,
)

In [5]:
# Preview the first rows to check that the id, sentiment and review columns loaded.
data_review.head()

Unnamed: 0,id,sentiment,review
0,"""5814_8""",1,"""With all this stuff going down at the moment ..."
1,"""2381_9""",1,"""\""The Classic War of the Worlds\"" by Timothy ..."
2,"""7759_3""",0,"""The film starts with a manager (Nicholas Bell..."
3,"""3630_4""",0,"""It must be assumed that those who praised thi..."
4,"""9495_8""",1,"""Superbly trashy and wondrously unpretentious ..."


In [7]:
# Fetch the NLTK stopwords corpus, which review_to_words reads via stopwords.words("english").
nltk.download("stopwords")

@lru_cache(maxsize=1)
def _english_stopwords():
    """Return the NLTK English stopwords as a frozenset, built once and cached."""
    return frozenset(stopwords.words("english"))


def review_to_words(raw_review):
    """Convert one raw review into a cleaned, space-separated string of words.

    Processing steps, in order:
      1. Strip HTML markup and keep only the text content.
      2. Replace every character that is not an ASCII letter with a space.
      3. Lower-case the text and split it on whitespace.
      4. Drop NLTK English stopwords.

    Args:
        raw_review: The review text, possibly containing HTML markup.

    Returns:
        A single string holding the remaining words joined by one space.
    """
    review_text = BeautifulSoup(raw_review, 'html.parser').get_text()

    letters_only = re.sub("[^a-zA-Z]", " ", review_text)

    words = letters_only.lower().split()

    # The stopword set is identical for every review, so it is fetched from a
    # cached helper instead of being re-read and rebuilt on each call.
    stops = _english_stopwords()

    return " ".join(w for w in words if w not in stops)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\muher\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [11]:
# Pull the raw review texts and their sentiment labels out of the frame.
x_not_clean = data_review["review"].values
y = data_review["sentiment"]

# Clean every raw review; X keeps the same order as x_not_clean.
X = [review_to_words(raw_review) for raw_review in x_not_clean]



In [14]:
# Hold out 20% of the reviews for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=26,
)


In [15]:
# Fit the vectorizer on the training reviews only; the test reviews are then
# transformed with that already-fitted vectorizer rather than refitting on them.
tf_idf = TfidfVectorizer()
X_train_tf = tf_idf.fit_transform(X_train)
X_test_tf = tf_idf.transform(X_test)

In [17]:
# Train a multinomial Naive Bayes classifier, with default settings, on the
# vectorized training reviews and their sentiment labels.
naive_bayes_classifier = MultinomialNB()
naive_bayes_classifier.fit(X_train_tf, y_train)

In [18]:
# Predict sentiment labels for the held-out, vectorized test reviews.
y_pred = naive_bayes_classifier.predict(X_test_tf)

In [21]:
# classification_report returns plain text, so print() is needed to render the table layout.
print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

           0       0.84      0.88      0.86      2484
           1       0.87      0.84      0.86      2516

    accuracy                           0.86      5000
   macro avg       0.86      0.86      0.86      5000
weighted avg       0.86      0.86      0.86      5000



In [5]:
# Persist the fitted classifier and the fitted vectorizer to disk.
# Each file is opened in a `with` block so the handle is closed (and the data
# flushed) as soon as the dump finishes, even if pickling raises.
filename = 'model.sav'
with open(filename, 'wb') as model_file:
    pickle.dump(naive_bayes_classifier, model_file)

filename = 'tfidf.sav'
with open(filename, 'wb') as vectorizer_file:
    pickle.dump(tf_idf, vectorizer_file)