# 0. Import Packages

In [1]:
import tensorflow as tf
from tensorflow import keras
import pandas as pd
import numpy as np
import nltk
from nltk import NaiveBayesClassifier
from nltk import classify
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.metrics import confusion_matrix as cm
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /home/eesha/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

# 1. Building the Dataset

### 1.1 Importing CSV

In [2]:
data = pd.read_csv('imdb.csv', encoding='latin-1', index_col = 0)
data = data.drop(columns = ['type', 'file'])
data = data[data.label != 'unsup']

data = data.sample(frac=1)

data.head()

Unnamed: 0,review,label
24982,I am not a golf fan by any means. On May 26 ab...,pos
15700,I was at the premier of the movie last night i...,pos
36141,As a cinema fan White Noise was an utter disap...,neg
49858,"In this 1943 film, Judy Garland is deemed not ...",pos
33979,I would of enjoyed this film but Van Damme jus...,neg


### 1.2 Splitting data into X and y

In [3]:
X = data['review'].values
y = data['label'].values

labels = ['neg', 'pos']

### 1.3 Preprocessing - cleaning data and forming a Bag of Words

In [4]:
def preprocess(X,y):
    le = LabelEncoder()
    le.fit(y)
    y = le.transform(y)
    lemm = WordNetLemmatizer()
    stop_words = set(stopwords.words('english'))
    Xnew = []
    for text in X:
        text = [i.lower() for i in text if i.isalpha() or i == " "]
        newtext = "".join(text)
        newtext = [lemm.lemmatize(i) for i in newtext.split() if i not in stop_words]
        Xnew.append(" ".join(newtext))
    
    return Xnew, y

def bag_of_words(reviews):
    words_dict = dict([r, True] for r in reviews)
    return words_dict

### 1.4 Build the Datest

In [5]:
def buildDataset(X,y):
    X, y = preprocess(X,y)

    revs = []
    for i in range(len(X)):
        text = X[i]
        revs.append((bag_of_words(text.split()), labels[y[i]] ))

    train, test = revs[:int(len(revs)*0.8)], revs[int(len(revs)*0.8):]
    
    print('Train Length: {}\nTest Length: {}'.format(len(train), len(test)))
    
    return train, test

train, test = buildDataset(X,y)

Train Length: 40000
Test Length: 10000


# 2. Training

In [6]:
clf = NaiveBayesClassifier.train(train)
accuracy = classify.accuracy(clf, test)

print('Accuracy: {}'.format(accuracy*100))

Accuracy: 85.8


# 3. Confusion Matrix

In [8]:
ypred = [clf.classify(t[0]) for t in test]
ytest = y[int(len(y)*0.8):]

matrix = cm(ytest, ypred, labels = ['pos','neg'])

print('Confusion Matrix\n', matrix)

Confusion Matrix
 [[4281  687]
 [ 733 4299]]
