# Import Libraries

In [1]:
import pandas as pd
import numpy as np
import re
import random
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import svm

# Read Data

In [2]:
df = pd.read_json("./reviews_Amazon_Instant_Video_5.json", lines=True, dtype={'overall': np.int64}) # cast "overall" (rating) column to float)

# Pre-process

## Class Balance

In [3]:
val_counts = df['overall'].value_counts()
min_val = val_counts.min()

In [4]:
X = []
y = []
for i in range(1, 6):
    vals = df[df['overall'] == i][:min_val]
    X += list(vals['reviewText'])
    y += list(vals['overall'])

## Train Test Split

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.67) # train test split with 2/3 train

In [6]:
def remove_nums(text):
    text = text.lower()
    text = re.sub(r'\d+', '', text) # remove numbers
    text = re.sub(r'_', '', text) # remove underscores
    return text

In [7]:
vectorizer = CountVectorizer(preprocessor=remove_nums, stop_words='english')
train_x_vectorized = vectorizer.fit_transform(X_train)
test_x_vectorized = vectorizer.transform(X_test)

# Classify

## Naive Bayes

In [9]:
from sklearn.naive_bayes import GaussianNB
nb_classifier = GaussianNB()
nb_classifier.fit(train_x_vectorized.toarray(), y_train) 

GaussianNB()

In [9]:
i = 14
print(nb_classifier.predict(test_x_vectorized[i].toarray()))
print(y_test[i])
print(X_test[i])

[1]
3
just finished watching the UK version..... still good acting and lots of action,but the story is full of big holes and the ending is very weak......very disappointing after a great season one!


## Decision Trees

In [15]:
from sklearn.tree import DecisionTreeClassifier
dt_classifier = DecisionTreeClassifier()
dt_classifier.fit(train_x_vectorized.toarray(), y_train)

DecisionTreeClassifier()

## Random Forest

In [24]:
from sklearn.ensemble import RandomForestClassifier
rf_classifier = RandomForestClassifier(max_depth = 6)
rf_classifier.fit(train_x_vectorized.toarray(), y_train)


RandomForestClassifier(max_depth=6)

## Gaussian Process (takes too long)

In [11]:
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF

kernel = 1.0 * RBF(1.0)

gp_classifier = GaussianProcessClassifier(kernel=kernel, random_state=0)
gp_classifier.fit(train_x_vectorized.toarray(), y_train)



## Neighbors

In [10]:
from sklearn.neighbors import KNeighborsClassifier

n_classifier = KNeighborsClassifier(n_neighbors = 10)
n_classifier.fit(train_x_vectorized.toarray(), y_train)

KNeighborsClassifier(n_neighbors=10)

## Logistic Regression (does not converge)

In [None]:
from sklearn.linear_model import LogisticRegression
lr_classifier = LogisticRegression(random_state = 0)
lr_classifier.fit(train_x_vectorized.toarray(), y_train)

# Evaluate

In [10]:
print(nb_classifier.score(test_x_vectorized.toarray(), y_test))

0.31040564373897706


In [25]:
print(rf_classifier.score(test_x_vectorized.toarray(), y_test))

0.41552028218694886


In [20]:
print(dt_classifier.score(test_x_vectorized.toarray(), y_test))

0.3255731922398589


In [11]:
print(n_classifier.score(test_x_vectorized.toarray(), y_test))

0.31146384479717815
