# Import Libraries

In [1]:
import pandas as pd
import numpy as np
import re
import random
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn import svm

# Read Data

In [2]:
df = pd.read_json("./reviews_Amazon_Instant_Video_5.json", lines=True, dtype={'overall': np.int64}) # cast "overall" (rating) column to float)

# Pre-process

## Class balance

In [3]:
val_counts = df['overall'].value_counts()
min_val = val_counts.min()
print(val_counts)
X = []
y = []
for i in range(1, 6):
    vals = df[df['overall'] == i][:min_val]
    X += list(vals['reviewText'])
    y += list(vals['overall'])
print(len(X), len(y))

5    20890
4     8446
3     4187
2     1885
1     1718
Name: overall, dtype: int64
8590 8590


## Convert star ratings

In [4]:
mapped_x = []
mapped_y = []
for i in range(len(y)):
    if y[i] > 3:
        mapped_y.append("POSITIVE")
        mapped_x.append(X[i])
    elif y[i] < 3:
        mapped_y.append("NEGATIVE")
        mapped_x.append(X[i])

print(len(mapped_x), len(mapped_y))

6872 6872


## Train Test Split

In [5]:
X_train, X_test, y_train, y_test = train_test_split(mapped_x, mapped_y, train_size=0.67) # train test split with 2/3 train

In [6]:
len(list(filter(lambda x: x == 'POSITIVE', y_train)))
len(list(filter(lambda x: x == 'NEGATIVE', y_train)))

2316

In [7]:
def remove_nums(text):
    text = text.lower()
    text = re.sub(r'\d+', '', text) # remove numbers
    text = re.sub(r'_', '', text) # remove underscores
    text = re.sub(r'[?,.!@#$%^&*()_+]', '', text)
    return text

In [8]:
vectorizer = TfidfVectorizer(preprocessor=remove_nums, stop_words='english', ngram_range=(1,2))
train_x_vectorized = vectorizer.fit_transform(X_train)
test_x_vectorized = vectorizer.transform(X_test)

# Classify

## Naive Bayes

In [9]:
from sklearn.naive_bayes import GaussianNB
nb_classifier = GaussianNB()
nb_classifier.fit(train_x_vectorized.toarray(), y_train) 

GaussianNB()

In [10]:
i = 17
print(nb_classifier.predict(test_x_vectorized[i].toarray()))
print(y_test[i])

['POSITIVE']
NEGATIVE


## Random Forest

In [11]:
from sklearn.ensemble import RandomForestClassifier
rf_classifier = RandomForestClassifier(max_depth = 6)
rf_classifier.fit(train_x_vectorized.toarray(), y_train)


RandomForestClassifier(max_depth=6)

## Neighbors

In [12]:
from sklearn.neighbors import KNeighborsClassifier

n_classifier = KNeighborsClassifier(n_neighbors = 10)
n_classifier.fit(train_x_vectorized.toarray(), y_train)

KNeighborsClassifier(n_neighbors=10)

## Logistic Regression (does not converge)

In [13]:
from sklearn.linear_model import LogisticRegression
lr_classifier = LogisticRegression(random_state = 0)
lr_classifier.fit(train_x_vectorized.toarray(), y_train)

LogisticRegression(random_state=0)

# Evaluate

In [14]:
print(nb_classifier.score(test_x_vectorized.toarray(), y_test))

0.7574955908289241


In [15]:
print(rf_classifier.score(test_x_vectorized.toarray(), y_test))

0.7605820105820106


In [16]:
print(n_classifier.score(test_x_vectorized.toarray(), y_test))

0.8328924162257496


In [17]:
print(lr_classifier.score(test_x_vectorized.toarray(), y_test))

0.8597883597883598
