## Run in Google Colab:
https://colab.research.google.com/github/charliecarver/cosc247/blob/master/deliverable-3/deliverable-3.ipynb

## Load Data

In [None]:
# Load CSV files from remote repo: download the datasets archive and unpack it
# into the current working directory.
import requests
import zipfile
import io

# Bound the wait so a stalled connection cannot hang the notebook, and fail
# fast on HTTP errors so an error page is never handed to ZipFile (which would
# otherwise surface as a confusing BadZipFile).
r = requests.get(
    'https://github.com/charliecarver/cosc247/blob/master/datasets.zip?raw=true',
    timeout=60,
)
r.raise_for_status()

# The context manager closes the archive handle once extraction finishes
with zipfile.ZipFile(io.BytesIO(r.content)) as z:
    z.extractall()

# CSV paths used by the deliverable cell (expected to come from the archive)
testPath = 'Test.csv'
trainPath = 'Train.csv'

## Single-File Deliverable

In [None]:
# Import required libraries
import pandas as pd
import numpy as np
from ast import literal_eval
import sklearn.metrics
import sklearn.naive_bayes
from sklearn import model_selection
from sklearn.preprocessing import OneHotEncoder
from sklearn import preprocessing
from sklearn import datasets
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
import sklearn.model_selection
import sklearn.metrics
import sklearn.naive_bayes
import sklearn.tree
import sklearn.linear_model
from sklearn.metrics import classification_report, confusion_matrix
from timeit import default_timer as timer
from matplotlib import pyplot as plt
from matplotlib import ticker
from sklearn.tree import export_graphviz
import graphviz

# Train text classifier
def trainTextPredictor(df):
    """Placeholder text-classifier trainer; the input frame is not used.

    Returns:
        A ``(classifier, wordIndices)`` tuple in which both entries are None.
    """
    classifier = None
    wordIndices = None
    return classifier, wordIndices

# Predict sentiment
def processTextual(classifier, wordIndices, df):
    """Return *df* with placeholder sentiment columns appended.

    Args:
        classifier: Unused by this placeholder implementation.
        wordIndices: Unused by this placeholder implementation.
        df: Input DataFrame; it is not modified.

    Returns:
        A new DataFrame equal to *df* plus integer columns
        'summary-positive' and 'review-positive', both filled with 0.
    """
    # Build the placeholder on df's own index: join() aligns on index labels,
    # so a fresh 0..n-1 index would yield NaN for any frame that has been
    # filtered, sorted or otherwise re-indexed.
    sentiment = pd.DataFrame(0, index=df.index, columns=['summary-positive', 'review-positive'])
    return df.join(sentiment)

# function for normalization
def normalize_column_data(input_data):
    """Min-max scale the numerical feature columns of *input_data* in place.

    The columns 'unixReviewTime', 'price', 'salesRank' and 'helpful' are each
    rescaled to (value - min) / (max - min). A column whose values are all
    equal is set to 0.0 instead of being divided by a zero range.

    Args:
        input_data: DataFrame containing the four columns above; mutated.

    Returns:
        None. The result is written back into *input_data*.
    """
    numerical_features = ['unixReviewTime', 'price', 'salesRank', 'helpful']
    for feature in numerical_features:
        column = input_data[feature]
        lowest = column.min()
        span = column.max() - lowest
        if span == 0:
            # Constant column: plain min-max would compute 0/0 and turn every
            # value into NaN. Subtracting the min gives zeros and keeps any
            # missing entries missing.
            input_data[feature] = (column - lowest).astype(float)
        else:
            input_data[feature] = (column - lowest) / span

# Process numerical data
def processNumerical(df):
    """Reduce a raw review DataFrame to its numerical feature columns.

    Drops the text and not-yet-processed columns, converts the 'helpful'
    string (a two-element list literal) into the ratio first/second, and
    fills rows whose second element is 0 with the median ratio.

    Args:
        df: DataFrame holding at least the columns dropped below plus
            'helpful'. The caller's frame is not modified.

    Returns:
        A new DataFrame without the dropped columns and with a float
        'helpful' column.
    """
    # Drop text data
    df = df.drop(columns=['title', 'reviewText', 'summary', 'categories', 'songs', 'related', 'reviewTime'])

    # Drop columns that need more time to process
    df = df.drop(columns=['label', 'first-release-year'])

    # Transform helpful into "ratio" of being helpful
    df['helpful'] = df['helpful'].apply(_helpfulRatio)
    # Assign the filled column back rather than calling fillna(inplace=True)
    # on the df['helpful'] selection: that chained in-place form is deprecated
    # and does not update df under pandas Copy-on-Write.
    df['helpful'] = df['helpful'].fillna(df['helpful'].median())

    # Convert categorical data to their own features (currently disabled)
    # df = df.join(pd.get_dummies(df['root-genre']))
    df = df.drop(columns=['root-genre'])

    # Return processed data
    return df


def _helpfulRatio(raw):
    """Parse one 'helpful' literal once and return first/second, or NaN if second is 0."""
    votes = literal_eval(raw)
    if votes[1] == 0:
        return np.nan
    return votes[0] / votes[1]

# Flag to set mode: when True, Test.csv is loaded and processed as well;
# when False only the training data is used
useTestCSV = False

# Load data: fall back to the default CSV locations when testPath has not
# been defined yet (e.g. the download cell was not run)
try:
    testPath
except NameError:
    print('Loading files from default locations')
    testPath = 'Test.csv'
    trainPath = 'Train.csv'

# Training frame: load, train the text classifier on it, then add the
# textual and numerical features
dfTrain = pd.read_csv(trainPath)
textClassifier, wordIndices = trainTextPredictor(dfTrain)
dfTrain = processNumerical(processTextual(textClassifier, wordIndices, dfTrain))

# Test frame: same feature processing, reusing the classifier trained above
if useTestCSV:
    dfTest = pd.read_csv(testPath)
    dfTest = processNumerical(processTextual(textClassifier, wordIndices, dfTest))

# Aggregate training: collapse the per-review rows into one row per product
def isAwesome(x):
    """Return 1 when the mean of the ratings in *x* exceeds 4.5, else 0."""
    return 1 if np.mean(x) > 4.5 else 0


# Per-column reducers; key order fixes the column order of the result
_trainAggregations = {
    'unixReviewTime': 'mean',
    'price': 'mean',
    'overall': isAwesome,
    'salesRank': 'mean',
    'helpful': 'mean',
    'summary-positive': 'sum',
    'review-positive': 'sum',
}
trainData = dfTrain.groupby('amazon-id').agg(_trainAggregations)

# normalization for numerical features
normalize_column_data(trainData)

# Aggregate testing data and split into dependent/independent vars
if useTestCSV:
    # Same reducers as for training, minus the 'overall' target
    _testAggregations = {
        column: reducer
        for column, reducer in _trainAggregations.items()
        if column != 'overall'
    }
    testData = dfTest.groupby('amazon-id').agg(_testAggregations)
    normalize_column_data(testData)
    Xtest, ytest = testData, []
else:
    # No test file: hold out 30% of the aggregated products for evaluation
    trainData, testData = sklearn.model_selection.train_test_split(trainData, test_size=0.3, shuffle=True)
    Xtest, ytest = testData.drop(columns='overall'), testData['overall']

# Both modes take the training features/target from trainData
Xtrain, ytrain = trainData.drop(columns='overall'), trainData['overall']

# Run ML
"""
gnb = sklearn.naive_bayes.GaussianNB()
gnbTrained = gnb.fit(Xtrain, ytrain)
preds = gnbTrained.predict(Xtest)
"""

# Best numerical-only model: a depth-4 decision tree fitted on the single
# 'helpful' column. Double-bracket selection keeps it as an (n, 1) matrix.
Xnumerical_best_train = Xtrain[['helpful']].to_numpy()
Xnumerical_best_test = Xtest[['helpful']].to_numpy()

tree_clf = sklearn.tree.DecisionTreeClassifier(max_depth=4)
tree_trained = tree_clf.fit(Xnumerical_best_train, ytrain)
ypreds = tree_trained.predict(Xnumerical_best_test)

# Testing: labels for the test rows are only available in hold-out mode
if not useTestCSV:
    _weightedF1 = sklearn.metrics.f1_score(ytest, ypreds, average='weighted')
    print(_weightedF1)

# Output CSV file with predictions
if useTestCSV:

    # Output predictions for deliverable. The decision-tree predictions live
    # in `ypreds`; no variable named `preds` is ever assigned by this script,
    # so referencing it here would raise NameError.
    output = pd.DataFrame({'amazon-id': Xtest.index, 'Awesome': ypreds})
    # NOTE(review): to_csv also writes the positional index as a first
    # column - confirm the expected submission format.
    output.to_csv('./Product_Predictions.csv')

In [76]:
# Decision Tree with K fold for numerical data performance only
Xnumerical_best_train = Xtrain['helpful']
Xnumerical_best_train = Xnumerical_best_train.to_numpy().reshape(-1,1)
# NOTE(review): Xnumerical_best_train ('helpful' only) is prepared above, but
# the scores below are computed on the full Xtrain - confirm which feature
# set this evaluation is meant to cover.
kfold = model_selection.KFold(n_splits=5, shuffle=True)
tree_clf = sklearn.tree.DecisionTreeClassifier(max_depth=4)

# Score both metrics in one cross_validate pass so accuracy and F1 are
# measured on the same five folds. Two separate cross_val_score calls on an
# unseeded shuffling KFold each draw a different split (and fit twice).
cv_results = model_selection.cross_validate(
    tree_clf,
    Xtrain,
    ytrain.values.ravel(),
    cv=kfold,
    scoring=['accuracy', 'f1_weighted'],
)
cv_results_acc = cv_results['test_accuracy']
cv_results_f1 = cv_results['test_f1_weighted']

print("f1_weighted score list: {}".format(cv_results_f1))
print("f1_weighted score average: {}".format(np.mean(cv_results_f1)))
print("accuracy score list: {}".format(cv_results_acc))
print("accuracy score average: {}".format(np.mean(cv_results_acc)))

f1_weighted score list: [0.60724904 0.59836392 0.64437124 0.62662451 0.62482607]
f1_weighted score average: 0.6202869571318353
accuracy score list: [0.63550136 0.61517615 0.63685637 0.62940379 0.64701897]
accuracy score average: 0.6327913279132791
