In [1]:
import pandas as pd
import numpy as np 

# Directory that holds the input TSV files. The trailing slash is kept so the
# value can still be used with plain string concatenation.
data_dir = "naive_bayes/movie/input/"

# data_dir already ends with '/', so the file names are appended without a
# leading separator (the previous form produced '.../input//<file>').
train = pd.read_csv(data_dir + 'labeledTrainData.tsv', delimiter='\t')
test = pd.read_csv(data_dir + 'testData.tsv', delimiter='\t')

# Show the first rows of the labelled data as the cell output.
train.head()

Unnamed: 0,id,sentiment,review
0,5814_8,1,With all this stuff going down at the moment w...
1,2381_9,1,"\The Classic War of the Worlds\"" by Timothy Hi..."
2,7759_3,0,The film starts with a manager (Nicholas Bell)...
3,3630_4,0,It must be assumed that those who praised this...
4,9495_8,1,Superbly trashy and wondrously unpretentious 8...


In [2]:
# Report (rows, columns) for the labelled frame and the test frame, one per line.
print(train.shape, test.shape, sep="\n")

(25000, 3)
(25000, 2)


In [4]:
import re

def review_processing(review):
    """Normalise one review string for bag-of-words vectorisation.

    Every character that is not an ASCII letter (a-z, A-Z) is replaced by a
    single space, and the result is lower-cased. Replaced characters are not
    collapsed, so the output has the same length as the input.

    Parameters
    ----------
    review : str
        Raw review text.

    Returns
    -------
    str
        The cleaned, lower-cased text.
    """
    # Substitute first, then lower-case: the character class is matched
    # against the original (mixed-case) text.
    return re.sub("[^a-zA-Z]", " ", review).lower()

# Labels are taken straight from the 'sentiment' column.
full_train_y = train['sentiment']

# Clean every labelled review with review_processing.
# dtype=object keeps each element as an ordinary Python string. Without it,
# np.array() over a list of strings creates a fixed-width unicode array in
# which every element is padded to the length of the longest review, which
# costs a lot of memory when a few reviews are very long.
full_train_x = np.array(
    [review_processing(review) for review in train['review']],
    dtype=object,
)

# The test reviews go through exactly the same cleaning step.
test_data = np.array(
    [review_processing(review) for review in test['review']],
    dtype=object,
)

# Both arrays are 1-D with one cleaned string per review.
print(full_train_x.shape)
print(test_data.shape)

(25000,)
(25000,)


In [6]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the labelled reviews for validation. random_state is fixed
# so that the same split is produced on every run.
(data_train, data_validation,
 labels_train, labels_validation) = train_test_split(
    full_train_x,       # cleaned review text
    full_train_y,       # sentiment labels
    test_size=0.2,
    random_state=0,
)

In [7]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

# Two text representations are built side by side: raw term counts and TF-IDF
# weights. Both drop scikit-learn's built-in English stop words.
vectorizer = CountVectorizer(stop_words='english')
tfidf_vectorizer = TfidfVectorizer(stop_words='english')

# The vocabulary is fitted on the training split only ...
data_train_count = vectorizer.fit_transform(data_train)
tfidf_data_train_count = tfidf_vectorizer.fit_transform(data_train)

# ... and the validation split is only transformed with the fitted vocabulary.
data_validation_count = vectorizer.transform(data_validation)
tfidf_data_validation_count = tfidf_vectorizer.transform(data_validation)

In [8]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

# Multinomial naive Bayes trained on the raw term counts, scored on the
# validation split.
clf = MultinomialNB().fit(data_train_count, labels_train)
predictions = clf.predict(data_validation_count)
print(f"{accuracy_score(labels_validation, predictions)=}")

# Same classifier and default settings, trained on the TF-IDF features.
tfidf_clf = MultinomialNB().fit(tfidf_data_train_count, labels_train)
tfidf_predictions = tfidf_clf.predict(tfidf_data_validation_count)
print(f"{accuracy_score(labels_validation, tfidf_predictions)=}")

accuracy_score(labels_validation, predictions)=0.8678
accuracy_score(labels_validation, tfidf_predictions)=0.8738
