# Importing Libraries

In [24]:
import pandas as pd
import sklearn
from bs4 import BeautifulSoup
import nltk
from nltk.stem import PorterStemmer

# Loading Dataset

In [56]:
# Load the tab-separated review file into a DataFrame with explicit column names.
# header=None means the file's first line comes back as an ordinary data row; the
# pre-processing step drops that row afterwards.
# read_csv(sep='\t') parses exactly like read_table, without the deprecation
# warning that read_table raises on some pandas releases.
# dtype=str keeps every column as text, so labels are always the strings '0'/'1'
# that the pre-processing label map looks up, regardless of type inference.
df = pd.read_csv('Question2 Dataset.tsv',
                 sep='\t',
                 header=None,
                 names=['id', 'label', 'review'],
                 dtype=str)

  after removing the cwd from sys.path.


# Pre-Processing

In [57]:
# Drop the first row (presumably the file's own header line, which was loaded as
# data because the reader uses header=None with explicit names).
# NOTE(review): re-running this cell drops one more row each time.
df = df.drop(df.index[0])

# Labels were read as text; convert '0'/'1' to integers.
df['label'] = df['label'].map({'0': 0, '1': 1})

# Normalise the review text: lowercase, strip HTML markup, remove punctuation.
df['review'] = df['review'].str.lower()
# Name the parser explicitly so the extracted text does not depend on which
# optional parser libraries happen to be installed (and to avoid bs4's warning).
df['review'] = df['review'].map(lambda html: BeautifulSoup(html, 'html.parser').get_text())
# regex=True is required: pandas >= 2.0 treats the pattern as a literal string by
# default, which would leave all punctuation in place. The raw string avoids
# invalid-escape warnings for \w and \s.
df['review'] = df['review'].str.replace(r'[^\w\s]', '', regex=True)

# Split each review into word tokens, then reduce every token to its Porter stem.
df['review'] = df['review'].apply(nltk.word_tokenize)

stemmer = PorterStemmer()

df['review'] = df['review'].apply(lambda tokens: [stemmer.stem(token) for token in tokens])

# Raw Count Vectorizer

In [58]:
from sklearn.feature_extraction.text import CountVectorizer

# Re-join each review's token list into one space-separated string, the input
# format CountVectorizer expects. Values that are already strings are left
# untouched, so re-running this cell does not split a joined review into
# single characters.
df['review'] = df['review'].apply(
    lambda tokens: tokens if isinstance(tokens, str) else ' '.join(tokens)
)

# Learn the vocabulary and build the document-term matrix of raw term counts.
# NOTE(review): the vocabulary is fitted on every row, i.e. before the
# train/test split performed in the training cell.
count_vect = CountVectorizer()
counts = count_vect.fit_transform(df['review'])

# Training Naive Bayes Classifier on raw counts

In [59]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

# Hold out 25% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    counts,
    df['label'],
    test_size=0.25,
    random_state=69,
)

# Fit a multinomial Naive Bayes classifier on the training portion.
model = MultinomialNB()
model.fit(X_train, y_train)

# Evaluation

In [60]:
import numpy as np

# Predict labels for the held-out rows, then report accuracy as the fraction
# of predictions that equal the true label.
predicted = model.predict(X_test)
accuracy = np.mean(predicted == y_test)
print(accuracy)

0.84064


In [61]:
from sklearn.metrics import confusion_matrix

# Rows correspond to true labels and columns to predicted labels.
cm = confusion_matrix(y_true=y_test, y_pred=predicted)
print(cm)

[[2709  364]
 [ 632 2545]]


# Tf-Idf Transformer

In [62]:
from sklearn.feature_extraction.text import TfidfTransformer

# Learn inverse-document-frequency weights from the count matrix.
transformer = TfidfTransformer()
transformer.fit(counts)

# Re-weight the same matrix with the learned weights.
# NOTE(review): `counts` is rebound to the TF-IDF matrix here, so the raw
# counts are no longer reachable under this name, and re-running this cell
# would re-weight the already-weighted matrix.
counts = transformer.transform(counts)

# Training Naive Bayes Classifier on Tf-Idf weights

In [63]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

# Same 75/25 split and seed as the raw-count experiment; `counts` holds the
# TF-IDF weighted matrix once the transformer cell has run.
X_train, X_test, y_train, y_test = train_test_split(
    counts,
    df['label'],
    test_size=0.25,
    random_state=69,
)

# Fit a fresh multinomial Naive Bayes classifier on the weighted features.
model = MultinomialNB()
model.fit(X_train, y_train)

# Evaluation

In [64]:
import numpy as np

# Score the most recently fitted model on the held-out rows: accuracy is the
# share of predictions that match the true label.
predicted = model.predict(X_test)
accuracy = np.mean(predicted == y_test)
print(accuracy)

0.85248


In [65]:
from sklearn.metrics import confusion_matrix

# Rows correspond to true labels and columns to predicted labels.
cm = confusion_matrix(y_true=y_test, y_pred=predicted)
print(cm)

[[2767  306]
 [ 616 2561]]
