## Run in Google Colab:
https://colab.research.google.com/github/charliecarver/cosc247/blob/master/deliverable-3/deliverable-3.ipynb

## Load Data

In [1]:
# Load CSV files from remote repo
import requests
import zipfile
import io
# Download the zipped datasets and unpack them into the working directory.
DATASETS_URL = 'https://github.com/charliecarver/cosc247/blob/master/datasets.zip?raw=true'
r = requests.get(DATASETS_URL, timeout=60)
# Fail here on an HTTP error; otherwise the error page would be handed to
# ZipFile and surface as a confusing BadZipFile exception.
r.raise_for_status()
with zipfile.ZipFile(io.BytesIO(r.content)) as z:
    z.extractall()

# CSV paths consumed by the single-file deliverable cell below
testPath = 'Test.csv'
trainPath = 'Train.csv'

## Single-File Deliverable

In [58]:
# Import required libraries
import pandas as pd
import numpy as np
from ast import literal_eval
import sklearn.metrics
import sklearn.naive_bayes
from sklearn import model_selection

# Text classifier training (not implemented yet)
def trainTextPredictor(df):
    """Placeholder for training a text classifier.

    No model is trained at the moment and ``df`` is ignored.

    Args:
        df: Training data (currently unused).

    Returns:
        tuple: ``(classifier, wordIndices)``; both are ``None`` for now.
    """
    classifier, wordIndices = None, None
    return classifier, wordIndices

# Predict sentiment
def processTextual(classifier, wordIndices, df):
    """Append placeholder sentiment columns to ``df``.

    Sentiment prediction is not implemented yet: ``classifier`` and
    ``wordIndices`` are accepted but unused, and both new columns are
    filled with 0 for every row.

    Args:
        classifier: Unused.
        wordIndices: Unused.
        df (pd.DataFrame): Frame to extend; it is not modified.

    Returns:
        pd.DataFrame: A new frame with 'summary-positive' and
        'review-positive' appended after the existing columns.

    Raises:
        ValueError: If ``df`` already contains one of the sentiment columns.
    """
    sentimentColumns = ['summary-positive', 'review-positive']

    # Refuse to overwrite existing columns rather than replacing them silently.
    overlap = [column for column in sentimentColumns if column in df.columns]
    if overlap:
        raise ValueError(f'columns overlap: {overlap}')

    # assign() works row-for-row regardless of the frame's index, so frames
    # with a filtered, shuffled or duplicated index still get 0 in every row
    # instead of NaN from a misaligned index join.
    return df.assign(**{column: 0 for column in sentimentColumns})

# Process numerical data
def _helpfulRatio(raw):
    """Convert one raw 'helpful' cell into a ratio, or NaN when undefined.

    ``raw`` is normally the string form of a two-item list such as
    ``'[3, 4]'``; the result is the first item divided by the second. A zero
    second item, or a cell that is neither a string nor a list/tuple (for
    example a missing value), yields NaN.
    """
    if isinstance(raw, str):
        raw = literal_eval(raw)  # parse once instead of once per access
    elif not isinstance(raw, (list, tuple)):
        return np.nan
    numerator, denominator = raw[0], raw[1]
    return np.nan if denominator == 0 else numerator / denominator


def processNumerical(df):
    """Reduce ``df`` to its numerical feature columns.

    Drops the text columns, the columns that are not processed yet
    ('label', 'first-release-year', 'root-genre'), and replaces 'helpful'
    with a ratio. Rows whose ratio is undefined receive the median of the
    defined ratios.

    Args:
        df (pd.DataFrame): Frame containing every dropped column plus
            'helpful'; it is not modified.

    Returns:
        pd.DataFrame: A new frame with the remaining columns and a float
        'helpful' column.

    Raises:
        KeyError: If any column that should be dropped is missing.
    """
    # Text data is not used as a numerical feature
    textColumns = ['title', 'reviewText', 'summary', 'categories', 'songs', 'related', 'reviewTime']
    # Columns that need more time to process; 'root-genre' is categorical and
    # is not converted to features yet, so it is dropped as well
    deferredColumns = ['label', 'first-release-year', 'root-genre']

    # drop() returns a new frame, so the caller's frame is left untouched
    numeric = df.drop(columns=textColumns + deferredColumns)

    # Transform helpful into "ratio" of being helpful, then fill the
    # undefined ratios with the median of the defined ones
    ratios = numeric['helpful'].apply(_helpfulRatio).astype(float)
    numeric['helpful'] = ratios.fillna(ratios.median())

    # Return processed data
    return numeric

# Flag to set mode: when True, Test.csv is loaded as well as Train.csv
useTestCSV = False

# Resolve the CSV paths. Each path is checked on its own, so a notebook that
# defined only one of them keeps that value and gets the default for the other
# (probing only testPath left trainPath undefined in that case).
usingDefaultPaths = False
try:
    trainPath
except NameError:
    trainPath = 'Train.csv'
    usingDefaultPaths = True
try:
    testPath
except NameError:
    testPath = 'Test.csv'
    usingDefaultPaths = True
if usingDefaultPaths:
    print('Loading files from default locations')

# Load dataframes
dfTrain = pd.read_csv(trainPath)
if useTestCSV:
    dfTest = pd.read_csv(testPath)

# Fit the text classifier on the training frame only
textClassifier, wordIndices = trainTextPredictor(dfTrain)

# For each frame: add the textual features first, then reduce the result to
# its numerical columns
dfTrain = processNumerical(processTextual(textClassifier, wordIndices, dfTrain))
if useTestCSV:
    dfTest = processNumerical(processTextual(textClassifier, wordIndices, dfTest))

# Seed for the hold-out split so the validation score is reproducible
RANDOM_SEED = 42


def isAwesome(ratings):
    """Return 1 when the mean of ``ratings`` is above 4.5, otherwise 0."""
    return 1 if np.mean(ratings) > 4.5 else 0


# Per-product feature aggregation shared by the training and testing frames
featureAggregations = {
    'unixReviewTime': 'mean',
    'price': 'mean',
    'salesRank': 'mean',
    'helpful': 'mean',
    'summary-positive': 'sum',
    'review-positive': 'sum',
}

# Aggregate training data per 'amazon-id'; 'overall' is collapsed into the
# binary target by isAwesome
trainData = dfTrain.groupby('amazon-id').agg({**featureAggregations, 'overall': isAwesome})

# Pick the evaluation set: the real test CSV (no 'overall' column is
# aggregated, so there are no labels), or a 40% hold-out of the training products
if useTestCSV:
    testData = dfTest.groupby('amazon-id').agg(featureAggregations)
    Xtest, ytest = testData, []
else:
    trainData, testData = sklearn.model_selection.train_test_split(
        trainData, test_size=0.4, random_state=RANDOM_SEED)
    Xtest, ytest = testData.drop(columns='overall'), testData['overall']

# Split training data into independent/dependent vars
Xtrain, ytrain = trainData.drop(columns='overall'), trainData['overall']

# Run ML: fit Gaussian Naive Bayes on the training products and predict the
# evaluation products once; the predictions are reused below
gnb = sklearn.naive_bayes.GaussianNB()
gnbTrained = gnb.fit(Xtrain, ytrain)
preds = gnbTrained.predict(Xtest)

if useTestCSV:
    # Output predictions for deliverable
    output = pd.DataFrame({'amazon-id': Xtest.index, 'Awesome': preds})
    # NOTE(review): to_csv also writes the frame's integer index as an unnamed
    # first column; confirm the expected file format before adding index=False
    output.to_csv('./Product_Predictions.csv')
else:
    # Testing: weighted F1 of the predictions against the hold-out labels
    print(sklearn.metrics.f1_score(ytest, preds, average='weighted'))

0.46079658211280966
