# Prerequisites

In [None]:
# Load dataset from Github
# Downloads the zipped course dataset and unpacks it into the current
# working directory.
import requests, zipfile, io
r = requests.get(
    'https://github.com/charliecarver/cosc247/blob/master/datasets.zip?raw=true',
    timeout=60,  # seconds; without it a stalled connection blocks the cell forever
)
# Fail here on an HTTP error (404, rate limit, ...) rather than passing an
# error page to ZipFile and getting a confusing BadZipFile later.
r.raise_for_status()
# The archive is held in memory only; the context manager closes it once the
# members have been written to disk.
with zipfile.ZipFile(io.BytesIO(r.content)) as z:
    z.extractall()
# CSV paths used by the rest of the notebook (presumably both files are
# members of the archive extracted above -- verify against its contents).
testPath = 'Test.csv'
trainPath = 'Train.csv'

In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import nltk
import nltk.tokenize
import math
from nltk.corpus import stopwords
from collections import defaultdict
import sklearn.model_selection
import sklearn.metrics
import sklearn.naive_bayes
import sklearn.tree
# Fetch the NLTK resources this notebook relies on: the English stop-word
# list and the tokenizer models behind nltk.tokenize.word_tokenize.
nltk.download('stopwords')
# Recent NLTK releases load the tokenizer from the 'punkt_tab' package rather
# than 'punkt'; fetching both keeps word_tokenize working with either version.
nltk.download('punkt_tab')
# Kept as the last expression so the cell still displays the download status.
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

# Preprocessing

In [None]:
def tokenize_without_stopwords(review, stop_words):
    """Tokenize a review and drop every token contained in ``stop_words``.

    Args:
        review: The review text. Any value that is not a string (for example
            a float NaN standing in for a missing review) is treated as an
            empty review.
        stop_words: Container of tokens to exclude; only membership tests
            (``token in stop_words``) are performed on it.

    Returns:
        list: The remaining tokens in their original order, or an empty list
        when ``review`` is not a string.
    """
    # isinstance, unlike an exact type comparison, also accepts str
    # subclasses, so such values are tokenized instead of silently dropped.
    if not isinstance(review, str):
        return []

    tokens = nltk.tokenize.word_tokenize(review)
    return [token for token in tokens if token not in stop_words]

# Goal: a product counts as "awesome" when its reviews are, on average,
# awesome, so the first step is to decide whether a single review is awesome.
# Planned features: frequency counts of the non-stop-word tokens in each
# review (2-gram features were also considered but are not built in this cell).
training_data = pd.read_csv(trainPath)
print(training_data.columns)
print(training_data.iloc[0]['overall'])

# Lower-case every review so that token counts are case-insensitive.
# Non-string cells (e.g. the float NaN of a missing review) pass through
# unchanged; checking for str directly means any other non-string value is
# also left alone instead of raising AttributeError on .lower().
print("Mapping reviews to lower case")
training_data['reviewText'] = training_data['reviewText'].apply(
    lambda text: text.lower() if isinstance(text, str) else text
)

# Bag-of-words features: one row per review, one column per frequent token,
# each cell holding how many times that token occurs in that review.

# Corpus-wide occurrence count of every non-stop-word token.
word_frequency = defaultdict(int)

stop_words_set = set(stopwords.words('english'))

print("Building word frequency dictionary...")
for review in training_data['reviewText']:
    for word in tokenize_without_stopwords(review, stop_words_set):
        word_frequency[word] += 1

# Keep only tokens seen more than `filter_count` times, most frequent first.
filter_count = 400
common_words = sorted(
    (word for word, freq in word_frequency.items() if freq > filter_count),
    key=word_frequency.get,
    reverse=True,
)

print(common_words)

print("Number of unique words", len(word_frequency))
print("Number words that appear more than {} times".format(filter_count), len(
    common_words
))

# Column index of each kept token in the feature matrix.
print("Getting unique id for each word...")
index_by_word = {word: index for index, word in enumerate(common_words)}

# Target label: 1 when the review's rating is at least 4, otherwise 0.
predicted_variable = (training_data['overall'] >= 4).to_numpy().astype(np.int8)

print("Creating traning matrix")
data = np.zeros((len(training_data), len(common_words)), dtype=float)
# Rows are addressed by position (enumerate) rather than by DataFrame index
# label, so the matrix stays aligned even if the frame's index is not 0..n-1.
# Reviews are tokenized a second time here on purpose: caching every token
# list from the first pass would trade the extra CPU time for a large amount
# of memory.
for row_idx, review in enumerate(training_data['reviewText']):
    for word in tokenize_without_stopwords(review, stop_words_set):
        word_idx = index_by_word.get(word)
        if word_idx is not None:
            data[row_idx, word_idx] += 1
print("Done!")

Index(['reviewerID', 'amazon-id', 'helpful', 'unixReviewTime', 'reviewText',
       'overall', 'reviewTime', 'summary', 'price', 'categories', 'root-genre',
       'title', 'artist', 'label', 'first-release-year', 'songs', 'salesRank',
       'related'],
      dtype='object')
4
Mapping reviews to lower case
Building word frequency dictionary...
[',', '.', "'s", ';', '&', "''", '``', '!', 'album', ')', 'cd', 'music', 'quot', '(', 'one', "n't", 'songs', 'like', 'song', 'great', 'love', '...', 'good', '-', 'best', "'", 'would', 'time', 'first', 'really', ':', 'sound', 'get', 'well', '?', 'much', '#', 'listen', 'even', 'new', 'also', 'beatles', 'voice', 'track', 'still', 'many', "'m", 'heard', 'better', 'think', 'albums', 'tracks', 'band', 'way', 'two', 'could', "'ve", 'back', 'years', 'never', 'ever', 'movie', 'every', 'know', 'work', '--', 'fan', 'say', 'recording', '34', 'listening', 'favorite', 'buy', 'hear', 'sounds', 'rock', 'version', 'beautiful', 'original', 'another', 'lyrics', 'l

# Training + Models

In [None]:
# Train data
print("Doing some machine learning...")
# Hold out 40% of the reviews for evaluation. The fixed random_state makes the
# shuffle -- and therefore the scores computed from this split -- repeatable
# across kernel restarts.
X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(
    data, predicted_variable, test_size=0.4, random_state=42
)

Doing some machine learning...


In [None]:
# GNB
# Fit a Gaussian naive Bayes classifier on the training split, then report
# F1 and accuracy of its predictions on the held-out split.
print("Trying gaussian naive bayes")
gnb = sklearn.naive_bayes.GaussianNB()
gnb.fit(X_train, y_train)
y_pred = gnb.predict(X_test)
for template, metric in (
    ("F1 score: {}", sklearn.metrics.f1_score),
    ("accuracy score: {}", sklearn.metrics.accuracy_score),
):
    print(template.format(metric(y_test, y_pred)))

Trying gaussian naive bayes
F1 score: 0.8743999446634848
accuracy score: 0.7957020702070207


In [None]:
# Decision tree
# Fit a depth-limited decision tree on the training split and report F1 and
# accuracy of its predictions on the held-out split.
print("Trying a decision tree")
# random_state pins how ties between equally good splits are broken, so the
# fitted tree is the same on every run.
clf = sklearn.tree.DecisionTreeClassifier(max_depth=4, random_state=42)
y_pred = clf.fit(X_train, y_train).predict(X_test)
print("F1 score: {}".format(sklearn.metrics.f1_score(y_test, y_pred)))
print("accuracy score: {}".format(sklearn.metrics.accuracy_score(y_test, y_pred)))

Trying a decision tree
F1 score: 0.9296372165185877
accuracy score: 0.8693744374437443
