<a href="https://colab.research.google.com/github/charliecarver/cosc247/blob/master/deliverable-3/deliverable-3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Load Data

In [None]:
# Load CSV files from remote repo
import requests
import zipfile
import io

r = requests.get('https://github.com/charliecarver/cosc247/blob/master/datasets.zip?raw=true')
z = zipfile.ZipFile(io.BytesIO(r.content))
z.extractall()
testPath = 'Test.csv'
trainPath = 'Train.csv'

## Single-File Deliverable

In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import scipy.sparse
import string
from ast import literal_eval
import nltk.tokenize
import nltk.stem.porter
import math
from nltk.corpus import stopwords
import sklearn.metrics
import statistics
import sklearn.naive_bayes
import sklearn.feature_extraction.text
from sklearn import model_selection
from sklearn.preprocessing import OneHotEncoder
from sklearn import preprocessing
from sklearn import datasets
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
import sklearn.model_selection
import sklearn.metrics
import sklearn.naive_bayes
import sklearn.tree
import sklearn.linear_model
from sklearn.metrics import classification_report, confusion_matrix
from timeit import default_timer as timer
from matplotlib import pyplot as plt
from matplotlib import ticker
from sklearn.tree import export_graphviz
import graphviz
from collections import defaultdict
from sklearn.model_selection import StratifiedKFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier, VotingClassifier, BaggingClassifier
import sklearn.linear_model
import sklearn.model_selection
import statistics
import nltk
import sklearn.feature_selection
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

# Flags
useTestCSV = False
#NGRAM_SIZE = 2
NGRAM_SIZE = 4
#COMMON_WORD_THRESHOLD = 10
COMMON_WORD_THRESHOLD = 4
stemmer = nltk.stem.porter.PorterStemmer()

def preprocessForTextClassification(df):
    df['reviewText'] = df['reviewText'].fillna("")
    df['summary'] = df['summary'].fillna("")

    P = df.groupby('amazon-id').agg({
        'reviewText': ' '.join,
        'summary': ' '.join,
    })

    P['reviewText'] = P['reviewText'] + " " + P['summary']

    return P


# Train text classifier
def trainTextFrequency(df):
    P = preprocessForTextClassification(df)

    #vectorizer = sklearn.feature_extraction.text.TfidfVectorizer(ngram_range=(1,NGRAM_SIZE))
    vectorizer = sklearn.feature_extraction.text.TfidfVectorizer(ngram_range=(1,NGRAM_SIZE), min_df=COMMON_WORD_THRESHOLD, preprocessor=lambda token: stemmer.stem(token))
    X1 = vectorizer.fit_transform(P['reviewText'])

    return X1, vectorizer

def getTextMatrix(df, word_indices):
    P = preprocessForTextClassification(df)

    X1 = word_indices.transform(P['reviewText'])
    return X1

# function for normalization
def normalize_column_data(input_data):
    for feature in input_data:
        input_data[feature] = (input_data[feature]-input_data[feature].min())/(input_data[feature].max()-input_data[feature].min())

# Process numerical data
def processNumerical(df):

    # Drop text data
    df = df.drop(columns=['title', 'categories', 'songs', 'related', 'reviewTime'])

    # Drop columns that need more time to process
    #df = df.drop(columns=['label', 'first-release-year'])
    df = df.drop(columns=['label'])
    df['first-release-year'].fillna((df['first-release-year'].median()), inplace=True)
    df['first-release-year'] = df['first-release-year'].apply(lambda x: 1 if x > 1990 else 0)

    # Transform helpful into "ratio" of being helpful
    df['helpful'] = df['helpful'].apply(lambda x: np.nan if literal_eval(x)[1]== 0 else literal_eval(x)[0]/literal_eval(x)[1])
    df['helpful'].fillna((df['helpful'].median()), inplace=True)

    # review counter for each review
    df['review_count'] = 1

    # Convert categorical data to their own features
    # df = df.join(pd.get_dummies(df['root-genre']))
    df = df.drop(columns=['root-genre'])

    # Return processed data
    return df

In [None]:
# Load data
try:
    testPath
except NameError:
    # Default paths of CSV files
    print('Loading files from default locations')
    testPath = 'Test.csv'
    trainPath = 'Train.csv'

# Load dataframes
dfTrain = pd.read_csv(trainPath)
if useTestCSV: dfTest = pd.read_csv(testPath)

# Train text classifier on training data
trainingTextMatrix, wordIndices = trainTextFrequency(dfTrain)

# Process textual data
if useTestCSV:
    testTextMatrix = getTextMatrix(dfTest, wordIndices)

# Process numerical data
dfTrain = processNumerical(dfTrain)
if useTestCSV: dfTest = processNumerical(dfTest)

In [None]:
dfTrain

In [None]:
# Aggregate training
isAwesome = lambda x: 1 if np.mean(x) > 4.5 else 0
trainData = dfTrain.groupby('amazon-id').agg({
    'unixReviewTime': 'mean',
    'price': 'mean',
    'overall': isAwesome,
    'salesRank': 'mean',
    'helpful': 'mean',
    'first-release-year': 'mean',
    'review_count': 'sum'
})

# normalization for numerical features
normalize_column_data(trainData)

# Aggregate testing data and split into dependent/independent vars
if useTestCSV:
    testData = dfTest.groupby('amazon-id').agg({
        'unixReviewTime': 'mean',
        'price': 'mean',
        'salesRank': 'mean',
        'helpful': 'mean',

        # TODO first release year
    })
    normalize_column_data(testData)

    ytrain = trainData['overall'].to_numpy()
    # Todo here using review time, price and others. Let's make consistenet with "else" part for 'helpful only'
    Xtrain = scipy.sparse.hstack(
        (trainingTextMatrix, scipy.sparse.csr_matrix(trainData.drop(columns='overall').to_numpy()))
    )
    Xtrain = scipy.sparse.csr_matrix(Xtrain)
    testIndex = testData.index
    Xtest = scipy.sparse.hstack(
        (testTextMatrix, scipy.sparse.csr_matrix(testData.to_numpy()))
    )
    Xtest = scipy.sparse.csr_matrix(Xtest)
else:

    ablation_columns = ['unixReviewTime', 'price', 'salesRank', 'helpful', 'first-release-year', 'review_count']
    Xtrain = scipy.sparse.csr_matrix(scipy.sparse.hstack(
        (trainingTextMatrix, scipy.sparse.csr_matrix(trainData[ablation_columns].to_numpy()))
    ))

    ytrain = trainData['overall'].to_numpy()
    selector = sklearn.feature_selection.SelectKBest(sklearn.feature_selection.chi2, k=10000)
    Xtrain = selector.fit_transform(Xtrain, ytrain)
    Xtrain, Xtest, ytrain, ytest = sklearn.model_selection.train_test_split(Xtrain, trainData['overall'].to_numpy(), test_size=0.3, shuffle=True)

## Voting

In [9]:
voting_flag = False

# Testing
if not useTestCSV:
    # Run ML
    kf = sklearn.model_selection.KFold(n_splits=10, random_state=42, shuffle=True)
    #kf = sklearn.model_selection.KFold(n_splits=10, shuffle=True)
    f1_vals_log = []
    f1_vals_rnd = []
    f1_vals_svm = []
    f1_vals_gnb = []
    f1_vals_voting = []

    #log_clf = sklearn.linear_model.LogisticRegression(max_iter=100000, class_weight='balanced')
    log_clf = sklearn.linear_model.LogisticRegression(max_iter=100000, multi_class='multinomial', C=1.0, class_weight='balanced', penalty='l2')
    # for soft
    #svm_rbf_clf = SVC(kernel='linear', probability=True)
    # for hard
    #svm_rbf_clf = SVC(kernel='linear')
    svm_rbf_clf = SVC(C=1.1, class_weight='balanced', kernel='linear', max_iter=5000000)
    rnd_clf = RandomForestClassifier(n_estimators=300, n_jobs=-1)
    MNBclf = MultinomialNB()
    gnb_clf = sklearn.naive_bayes.GaussianNB()
    #voting_clf = VotingClassifier(estimators=[('lr', log_clf), ('rf', rnd_clf), ('svm', svm_rbf_clf)], voting='hard')
    voting_clf = VotingClassifier(estimators=[('lr', log_clf), ('svm', svm_rbf_clf)], voting='hard')

    # ('gnb', gnb_clf)
    #clf_list = [log_clf, rnd_clf, svm_rbf_clf, voting_clf]
    clf_list = [log_clf, svm_rbf_clf, voting_clf]

    for train_index, test_index in kf.split(Xtrain):
        x_train, x_test = Xtrain[train_index], Xtrain[test_index]
        y_train, y_test = ytrain[train_index], ytrain[test_index]

        if voting_flag:
            print("==================================================")
            for clf in clf_list:
                clt = clf.fit(x_train, y_train)
                f1 = sklearn.metrics.f1_score(y_test, clt.predict(x_test), average='weighted')

                if clf == log_clf:
                    print("logistic regression")
                    f1_vals_log.append(f1)
                elif clf == rnd_clf:
                    print("random forest")
                    f1_vals_rnd.append(f1)
                elif clf == gnb_clf:
                    print("Gaussian Naive")
                    f1_vals_gnb.append(f1)
                elif clf == svm_rbf_clf:
                    print("SVM")
                    f1_vals_svm.append(f1)
                else:
                    print("voting")
                    f1_vals_voting.append(f1)
                print("F1 {}".format(f1))

        else:
            clt = log_clf.fit(x_train, y_train)
            f1 = sklearn.metrics.f1_score(y_test, clt.predict(x_test), average='weighted')
            f1_vals_log.append(f1)
            print("F1 {}".format(f1))

    if voting_flag:
        print("Mean F1 LR: ", statistics.mean(f1_vals_log))
        #print("Mean F1 RF: ", statistics.mean(f1_vals_rnd))
        print("Mean F1 SVM: ", statistics.mean(f1_vals_svm))
        print("Mean F1 voting: ", statistics.mean(f1_vals_voting))
    else:
        print("Mean F1 LR: ", statistics.mean(f1_vals_log))
    # print(sklearn.metrics.f1_score(ytest, ypreds, average='weighted'))

# Output CSV file with predictions
if useTestCSV:

    LR = sklearn.linear_model.LogisticRegression(max_iter=100000, class_weight='balanced')
    LRTrained = LR.fit(Xtrain, ytrain)
    ypreds = LRTrained.predict(Xtest)
    # Output predictions for deliverable
    output = pd.DataFrame({'amazon-id': testIndex, 'Awesome': ypreds})
    output.to_csv('./Product_Predictions.csv')
    print("Output to ./Product_Predictions.csv")

F1 0.7753359448376416
F1 0.7397688171764085
F1 0.7645731221068192
F1 0.7523254770522712
F1 0.7579087696251389
F1 0.773240265094886
F1 0.7353427192808513
F1 0.7645025005682047
F1 0.737904417153558
F1 0.7589015659520509
Mean F1 LR:  0.755980359884783


### Bagging

In [None]:
"""
# Testing
if not useTestCSV:
    # Run ML
    kf = sklearn.model_selection.KFold(n_splits=10, random_state=42, shuffle=True)
    #kf = sklearn.model_selection.KFold(n_splits=10, shuffle=True)
    f1_vals_log = []
    f1_vals_rnd = []
    f1_vals_svm = []
    f1_vals_gnb = []
    f1_vals_voting = []
    f1_vals_bagging = []

    log_clf = sklearn.linear_model.LogisticRegression(max_iter=100000, class_weight='balanced')
    # for soft
    #svm_rbf_clf = SVC(kernel='linear', probability=True)
    # for hard
    svm_rbf_clf = SVC(kernel='linear')
    rnd_clf = RandomForestClassifier(n_estimators=300, n_jobs=-1)
    MNBclf = MultinomialNB()
    gnb_clf = sklearn.naive_bayes.GaussianNB()
    #voting_clf = VotingClassifier(estimators=[('lr', log_clf), ('rf', rnd_clf), ('svm', svm_rbf_clf)], voting='hard')
    voting_clf = VotingClassifier(estimators=[('lr', log_clf), ('rf', rnd_clf), ('svm', svm_rbf_clf)], voting='hard')

    # Bagging classifier
    bag_clf = BaggingClassifier(
        rnd_clf, n_estimators=500, max_samples=3000, bootstrap=True, n_jobs=-1)

    for train_index, test_index in kf.split(Xtrain):
        x_train, x_test = Xtrain[train_index], Xtrain[test_index]
        y_train, y_test = ytrain[train_index], ytrain[test_index]

        print("==================================================")
        bag_clf.fit(x_train, y_train)
        f1 = sklearn.metrics.f1_score(y_test, bag_clf.predict(x_test), average='weighted')
        f1_vals_bagging.append(f1)
        print("F1 {}".format(f1))

    print("Mean F1 bagging: ", statistics.mean(f1_vals_bagging))
    #print("Mean F1 LR: ", statistics.mean(f1_vals_log))
    #print("Mean F1 RF: ", statistics.mean(f1_vals_rnd))
    #print("Mean F1 SVM: ", statistics.mean(f1_vals_svm))
    #print("Mean F1 voting: ", statistics.mean(f1_vals_voting))
"""

In [None]:
# 2000 year: 0.718
# only helpful: 0.720

# hard: increase
# soft: decrease
# without estimation decrease 0.726