<a href="https://colab.research.google.com/github/charliecarver/cosc247/blob/master/deliverable-3/deliverable-3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Load Data

In [1]:
# Load CSV files from remote repo
import requests
import zipfile
import io

# Download the dataset archive. A timeout keeps the cell from hanging forever,
# and raise_for_status() fails fast on an HTTP error instead of handing an
# error page to zipfile.
r = requests.get(
    'https://github.com/charliecarver/cosc247/blob/master/datasets.zip?raw=true',
    timeout=60,
)
r.raise_for_status()

# Extract everything into the working directory; the context manager closes
# the in-memory archive once extraction is done.
with zipfile.ZipFile(io.BytesIO(r.content)) as z:
    z.extractall()

# Paths of the CSV files read by the later cells (presumably the files that
# were just extracted -- verify against the archive contents).
testPath = 'Test.csv'
trainPath = 'Train.csv'

## Single-File Deliverable

In [2]:
# Import required libraries
import pandas as pd
import numpy as np
import scipy.sparse
import string
from ast import literal_eval
import nltk.tokenize
import nltk.stem.porter
import math
from nltk.corpus import stopwords
import sklearn.metrics
import statistics
import sklearn.naive_bayes
import sklearn.feature_extraction.text
from sklearn import model_selection
from sklearn.preprocessing import OneHotEncoder
from sklearn import preprocessing
from sklearn import datasets
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
import sklearn.model_selection
import sklearn.metrics
import sklearn.naive_bayes
import sklearn.tree
import sklearn.linear_model
from sklearn.metrics import classification_report, confusion_matrix
from timeit import default_timer as timer
from matplotlib import pyplot as plt
from matplotlib import ticker
from sklearn.tree import export_graphviz
import graphviz
from collections import defaultdict
from sklearn.model_selection import StratifiedKFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
import sklearn.linear_model
import sklearn.model_selection
import statistics
import nltk
# Fetch the NLTK resources, in the same order as before
for nltkPackage in ('stopwords', 'wordnet', 'punkt', 'averaged_perceptron_tagger'):
    nltk.download(nltkPackage)

# Flags
# useTestCSV: when True the later cells also load the test CSV and write a
# predictions file; when False they cross-validate on the training data.
useTestCSV = False
# Upper bound of the n-gram range used by the TF-IDF vectorizer
NGRAM_SIZE = 2
# Not referenced by the code visible in this notebook
COMMON_WORD_THRESHOLD = 10
# Porter stemmer instance; not referenced by the code visible in this notebook
stemmer = nltk.stem.porter.PorterStemmer()

def preprocessForTextClassification(df):
    """Build one text document per product from its reviews.

    Missing ``reviewText`` / ``summary`` values are treated as empty strings,
    rows are grouped by ``amazon-id`` and the texts of each group are joined
    with single spaces.  The caller's frame is NOT modified: the fill is done
    on a copy of the three columns that are needed.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'amazon-id', 'reviewText' and 'summary'.

    Returns
    -------
    pandas.DataFrame
        Indexed by 'amazon-id' (in groupby order) with two columns:
        'reviewText' (all review texts of the product, then a space, then all
        of its summaries) and 'summary' (the joined summaries alone).
    """
    # Work on a filled copy so the input frame keeps its original NaN values
    text = df[['amazon-id', 'reviewText', 'summary']].fillna(
        {'reviewText': "", 'summary': ""}
    )

    P = text.groupby('amazon-id').agg({
        'reviewText': ' '.join,
        'summary': ' '.join,
    })

    # Append the summaries to the review text so both feed the vectorizer
    P['reviewText'] = P['reviewText'] + " " + P['summary']

    return P


# Train text classifier
def trainTextFrequency(df):
    """Fit a TF-IDF vectorizer on the per-product review documents.

    The documents come from ``preprocessForTextClassification(df)``; the
    vectorizer uses n-grams from 1 up to ``NGRAM_SIZE`` words.

    Returns a ``(matrix, vectorizer)`` pair: the TF-IDF matrix of the
    documents and the fitted vectorizer that produced it.
    """
    documents = preprocessForTextClassification(df)['reviewText']
    tfidf = sklearn.feature_extraction.text.TfidfVectorizer(
        ngram_range=(1, NGRAM_SIZE)
    )
    return tfidf.fit_transform(documents), tfidf

def getTextMatrix(df, word_indices):
    """Vectorize the per-product review documents of ``df``.

    ``word_indices`` is an already fitted vectorizer (anything exposing
    ``transform``); it is applied, without refitting, to the 'reviewText'
    column produced by ``preprocessForTextClassification(df)``.
    """
    documents = preprocessForTextClassification(df)['reviewText']
    return word_indices.transform(documents)

# function for normalization
def normalize_column_data(input_data):
    """Min-max scale every column of ``input_data`` to [0, 1], in place.

    Each column becomes ``(x - min) / (max - min)``.  A column whose values
    are all equal has a zero range; it is set to 0.0 instead of being turned
    into NaN by a 0/0 division.

    Parameters
    ----------
    input_data : pandas.DataFrame
        Frame with numeric columns; it is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same (mutated) frame, returned for convenience.  Existing callers
        that ignore the return value keep working.
    """
    for feature in input_data:
        column = input_data[feature]
        low = column.min()
        span = column.max() - low
        if not span > 0:
            # Zero range (constant column) or NaN range (no valid values):
            # dividing would produce NaN/inf, so only shift the column.
            input_data[feature] = (column - low).astype(float)
        else:
            input_data[feature] = (column - low) / span
    return input_data

# Process numerical data
def _helpfulRatio(raw):
    """Convert a ``"[helpful, total]"`` string into helpful / total.

    Returns NaN when ``total`` is 0 so the value can be imputed later.
    """
    votes = literal_eval(raw)  # parse once instead of once per access
    if votes[1] == 0:
        return np.nan
    return votes[0] / votes[1]


def processNumerical(df):
    """Reduce a raw review frame to its numerical feature columns.

    Steps, applied to a copy (the input frame is left untouched):
      * drop the columns 'title', 'categories', 'songs', 'related',
        'reviewTime', 'label' and 'root-genre';
      * fill missing 'first-release-year' values with the column median and
        add 'first-release-year-trans' (1 if the year is > 2000, else 0);
      * replace 'helpful' by the helpful/total vote ratio, filling rows with
        zero total votes with the median ratio.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain every column named above; 'helpful' holds strings of
        the form "[helpful, total]".

    Returns
    -------
    pandas.DataFrame
        The processed copy.
    """
    # Drop text data, the label column and the (unused) categorical genre
    df = df.drop(columns=[
        'title', 'categories', 'songs', 'related', 'reviewTime',
        'label', 'root-genre',
    ])

    # Assign the filled column back instead of calling fillna(inplace=True)
    # on df[...]: chained in-place updates are deprecated and do not modify
    # the frame under pandas copy-on-write.
    releaseYear = df['first-release-year']
    df['first-release-year'] = releaseYear.fillna(releaseYear.median())
    df['first-release-year-trans'] = (df['first-release-year'] > 2000).astype('int64')

    # Transform helpful into "ratio" of being helpful
    helpful = df['helpful'].apply(_helpfulRatio)
    df['helpful'] = helpful.fillna(helpful.median())

    # Return processed data
    return df

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\lupan\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\lupan\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\lupan\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\lupan\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [3]:
# Load data
try:
    testPath
except NameError:
    # The path variables were never set: fall back to the default CSV files
    print('Loading files from default locations')
    testPath = 'Test.csv'
    trainPath = 'Train.csv'

# Training data: fit the text vectorizer on the raw frame first, then reduce
# the frame to its numerical columns
dfTrain = pd.read_csv(trainPath)
trainingTextMatrix, wordIndices = trainTextFrequency(dfTrain)
dfTrain = processNumerical(dfTrain)

# Test data (optional): reuse the vectorizer fitted on the training text
if useTestCSV:
    dfTest = pd.read_csv(testPath)
    testTextMatrix = getTextMatrix(dfTest, wordIndices)
    dfTest = processNumerical(dfTest)

In [4]:
# Display the training frame as returned by processNumerical
dfTrain

Unnamed: 0,reviewerID,amazon-id,helpful,unixReviewTime,reviewText,overall,summary,price,artist,first-release-year,salesRank,first-release-year-trans
0,-4984057859803657856,1877521326299865484,1.000000,1302739200,Very nice music for practicing my Tai Chi. I d...,4,Beautiful,16.47,-7180760356347753735,2003.0,27222,1
1,9136764282801708742,1877521326299865484,1.000000,1180396800,I recently starting doing Tai Chi which I love...,5,Tranquillity In Motion !!!,16.47,-7180760356347753735,2003.0,27222,1
2,2164551966908582519,1877521326299865484,0.875000,1361404800,My wife uses it for her class room the kids lo...,5,Great Stuff,16.47,-7180760356347753735,2003.0,27222,1
3,-7309200698931694843,1877521326299865484,1.000000,1338163200,We bought this music to go Dr Lam DVD. The mus...,5,Beautiful,16.47,-7180760356347753735,2003.0,27222,1
4,-4461682407031037732,1877521326299865484,0.875000,1396310400,It helps me do my exercise because it sets the...,5,tai chi music,16.47,-7180760356347753735,2003.0,27222,1
...,...,...,...,...,...,...,...,...,...,...,...,...
111093,-508419005999372045,-272019625357917459,0.875000,1405900800,Nice soundtrack and I was pleasantly surprised...,4,Four Stars,33.76,-3758738156872779256,2014.0,6,1
111094,4690686471314282919,-272019625357917459,0.923077,1405209600,I'd you are looking for the music that they sa...,5,I'd you are looking for the music that they sa...,33.76,-3758738156872779256,2014.0,6,1
111095,-6735807132142826990,-272019625357917459,0.750000,1404259200,FANTASTIC FILM! Loved this movie and the musi...,5,FANTASTIC FILM! Loved this movie and the music,33.76,-3758738156872779256,2014.0,6,1
111096,6536263939078780437,2197509461459270640,0.875000,1404518400,"A great new CD with uptempo, funky guitar. Thi...",5,A great new CD with uptempo,32.98,2800811401610696293,2014.0,24972,1


In [11]:
# Aggregate training
def isAwesome(x):
    """Label a product 1 when its mean review rating is above 4.5, else 0."""
    return 1 if np.mean(x) > 4.5 else 0


# One row per product: mean of each numerical feature plus the binary label
trainData = dfTrain.groupby('amazon-id').agg({
    'unixReviewTime': 'mean',
    'price': 'mean',
    'overall': isAwesome,
    'salesRank': 'mean',
    'helpful': 'mean',
    'first-release-year-trans': 'mean'
})

# normalization for numerical features
normalize_column_data(trainData)

# Aggregate testing data and split into dependent/independent vars
if useTestCSV:
    # Same features, in the same order, as the training frame minus 'overall'.
    # Leaving 'first-release-year-trans' out would give Xtest one column fewer
    # than Xtrain and make predict() fail.
    testData = dfTest.groupby('amazon-id').agg({
        'unixReviewTime': 'mean',
        'price': 'mean',
        'salesRank': 'mean',
        'helpful': 'mean',
        'first-release-year-trans': 'mean'
    })
    # NOTE(review): the test features are scaled with the test set's own
    # min/max rather than the training set's -- confirm this is intended.
    normalize_column_data(testData)

    trainFeatures = trainData.drop(columns='overall')
    assert list(testData.columns) == list(trainFeatures.columns), \
        "train and test feature columns must match"

    ytrain = trainData['overall'].to_numpy()
    # TODO: this branch uses review time, price and the other features, while
    # the validation branch below uses 'helpful' only; make them consistent.
    Xtrain = scipy.sparse.csr_matrix(scipy.sparse.hstack(
        (trainingTextMatrix, scipy.sparse.csr_matrix(trainFeatures.to_numpy()))
    ))
    testIndex = testData.index
    Xtest = scipy.sparse.csr_matrix(scipy.sparse.hstack(
        (testTextMatrix, scipy.sparse.csr_matrix(testData.to_numpy()))
    ))
else:
    # Validation run: text features plus the 'helpful' ratio only.
    # (Alternative tried earlier: also append 'first-release-year-trans'.)
    Xtrain = scipy.sparse.csr_matrix(scipy.sparse.hstack(
        (trainingTextMatrix, scipy.sparse.csr_matrix(trainData['helpful'].to_numpy().reshape(-1, 1)))
    ))
    ytrain = trainData['overall'].to_numpy()

    # Hold out 30% of the products; the fixed seed makes the split (and the
    # scores computed on it) reproducible across kernel restarts.
    Xtrain, Xtest, ytrain, ytest = sklearn.model_selection.train_test_split(
        Xtrain, ytrain, test_size=0.3, shuffle=True, random_state=42
    )

In [12]:
# Show the two aggregated per-product columns 'helpful' and 'first-release-year-trans'
trainData.loc[:, ['helpful', 'first-release-year-trans']]

Unnamed: 0_level_0,helpful,first-release-year-trans
amazon-id,Unnamed: 1_level_1,Unnamed: 2_level_1
-9217723718720870868,0.620370,0.0
-9215746463819797371,0.937500,1.0
-9213978596308513604,0.666667,0.0
-9211290576571923870,1.000000,0.0
-9208769561690910545,0.812500,1.0
...,...,...
9218870320655141661,0.500000,1.0
9221578337502519209,0.921875,1.0
9221615570697142155,1.000000,1.0
9221801008952598876,0.710648,1.0


In [13]:
# Testing
if not useTestCSV:
    # 10-fold cross-validation over the training split, with a fixed seed
    kf = sklearn.model_selection.KFold(n_splits=10, random_state=42, shuffle=True)

    # Per-fold weighted F1 scores, one list per classifier
    f1_vals_log = []
    f1_vals_rnd = []
    f1_vals_svm = []
    f1_vals_gnb = []  # stays empty: GaussianNB is not part of the evaluated set
    f1_vals_voting = []

    log_clf = sklearn.linear_model.LogisticRegression(max_iter=100000, class_weight='balanced')
    # Linear-kernel SVM (the variable name is historical). Hard voting only
    # needs class predictions; soft voting would need SVC(probability=True).
    svm_rbf_clf = SVC(kernel='linear')
    # Seeded so the forest, and therefore the vote, is reproducible
    rnd_clf = RandomForestClassifier(n_estimators=300, n_jobs=-1, random_state=42)
    # Defined but not evaluated below
    MNBclf = MultinomialNB()
    gnb_clf = sklearn.naive_bayes.GaussianNB()
    voting_clf = VotingClassifier(
        estimators=[('lr', log_clf), ('rf', rnd_clf), ('svm', svm_rbf_clf)],
        voting='hard',
    )

    # (classifier, printed label, score list). An explicit table replaces the
    # chain of `clf == ...` comparisons and its unreachable GaussianNB branch.
    evaluated = [
        (log_clf, "logistic regression", f1_vals_log),
        (rnd_clf, "random forest", f1_vals_rnd),
        (svm_rbf_clf, "SVM", f1_vals_svm),
        (voting_clf, "voting", f1_vals_voting),
    ]
    clf_list = [entry[0] for entry in evaluated]

    for train_index, test_index in kf.split(Xtrain):
        x_train, x_test = Xtrain[train_index], Xtrain[test_index]
        y_train, y_test = ytrain[train_index], ytrain[test_index]

        print("==================================================")
        for clf, label, scores in evaluated:
            clt = clf.fit(x_train, y_train)
            f1 = sklearn.metrics.f1_score(y_test, clt.predict(x_test), average='weighted')

            print(label)
            scores.append(f1)
            print("F1 {}".format(f1))

    # Mean score over the folds for every evaluated classifier
    print("Mean F1 LR: ", statistics.mean(f1_vals_log))
    print("Mean F1 RF: ", statistics.mean(f1_vals_rnd))
    print("Mean F1 SVM: ", statistics.mean(f1_vals_svm))
    print("Mean F1 voting: ", statistics.mean(f1_vals_voting))

# Output CSV file with predictions
if useTestCSV:
    # Fit a class-balanced logistic regression on the training matrix and
    # predict a label for every row of the test matrix
    LR = sklearn.linear_model.LogisticRegression(
        class_weight='balanced',
        max_iter=100000,
    )
    LRTrained = LR.fit(Xtrain, ytrain)
    ypreds = LRTrained.predict(Xtest)

    # Output predictions for deliverable: one row per product id
    output = pd.DataFrame({'amazon-id': testIndex, 'Awesome': ypreds})
    output.to_csv('./Product_Predictions.csv')
    print("Output to ./Product_Predictions.csv")

logistic regression
F1 0.6793839645713531
random forest
F1 0.6294023449460747
SVM
F1 0.7071492045883598
voting
F1 0.7041053875624396
logistic regression
F1 0.6961778955650336
random forest
F1 0.5910588431520448
SVM
F1 0.7302876059876479
voting
F1 0.7238389351894724
logistic regression
F1 0.7320677911708595
random forest
F1 0.5488360814323342
SVM
F1 0.7172536044487263
voting
F1 0.7050941068105934
logistic regression
F1 0.7386525053817983
random forest
F1 0.6541794498178712
SVM
F1 0.7524070245348022
voting
F1 0.7517936820601601
logistic regression
F1 0.7534725173035702
random forest
F1 0.6501612484918192
SVM
F1 0.7457355190611799
voting
F1 0.7480248359590271
logistic regression
F1 0.7264940764352151
random forest
F1 0.6220850760583253
SVM
F1 0.7381671100779127
voting
F1 0.7341224455109914
logistic regression
F1 0.7061669641086656
random forest
F1 0.6111206841557638
SVM
F1 0.7195827857575877
voting
F1 0.7172509374416344
logistic regression
F1 0.6960743452317582
random forest
F1 0.60441739

In [None]:
# Mean of the per-fold weighted F1 scores collected for the voting classifier
# (f1_vals_voting is only defined when useTestCSV is False)
statistics.mean(f1_vals_voting)

In [None]:
# 2000 year: 0.718
# only helpful: 0.720

# hard: increase
# soft: decrease
# without estimation decrease 0.726