## Load Data

In [17]:
# Load CSV files from remote repo
import requests
import zipfile
import io

r = requests.get('https://github.com/charliecarver/cosc247/blob/master/datasets.zip?raw=true')
z = zipfile.ZipFile(io.BytesIO(r.content))
z.extractall()
testPath = 'Test.csv'
trainPath = 'Train.csv'

## Single-File Deliverable

In [2]:
# Import required libraries
import pandas as pd
import numpy as np
import scipy.sparse
import string
from ast import literal_eval
import nltk.tokenize
import nltk.stem.porter
import math
from nltk.corpus import stopwords
import sklearn.metrics
import statistics
import sklearn.naive_bayes
import sklearn.feature_extraction.text
from sklearn import model_selection
from sklearn.preprocessing import OneHotEncoder
from sklearn import preprocessing
from sklearn import datasets
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
import sklearn.model_selection
import sklearn.metrics
import sklearn.naive_bayes
import sklearn.tree
import sklearn.linear_model
from sklearn.metrics import classification_report, confusion_matrix
from timeit import default_timer as timer
from matplotlib import pyplot as plt
from matplotlib import ticker
from sklearn.tree import export_graphviz
import graphviz
from collections import defaultdict
from sklearn.model_selection import StratifiedKFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
import sklearn.linear_model
import sklearn.model_selection
import statistics
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

# Flags
useTestCSV = False
NGRAM_SIZE = 4
COMMON_WORD_THRESHOLD = 2
stemmer = nltk.stem.porter.PorterStemmer()

def preprocessForTextClassification(df):
    df['reviewText'] = df['reviewText'].fillna("")
    df['summary'] = df['summary'].fillna("")

    P = df.groupby('amazon-id').agg({
        'reviewText': ' '.join,
        'summary': ' '.join,
    })

    P['reviewText'] = P['reviewText'] + " " + P['summary']

    return P


# Train text classifier
def trainTextFrequency(df):
    P = preprocessForTextClassification(df)

    vectorizer = sklearn.feature_extraction.text.TfidfVectorizer(ngram_range=(1,NGRAM_SIZE), min_df=COMMON_WORD_THRESHOLD, preprocessor=lambda token: stemmer.stem(token))
    X1 = vectorizer.fit_transform(P['reviewText'])

    return X1, vectorizer

def getTextMatrix(df, word_indices):
    P = preprocessForTextClassification(df)

    X1 = word_indices.transform(P['reviewText'])
    return X1

# function for normalization
def normalize_column_data(input_data):
    for feature in input_data:    
        input_data[feature] = (input_data[feature]-input_data[feature].min())/(input_data[feature].max()-input_data[feature].min())

# Process numerical data
def processNumerical(df):

    # Drop text data
    df = df.drop(columns=['title', 'categories', 'songs', 'related', 'reviewTime'])

    # Drop columns that need more time to process
    df = df.drop(columns=['label'])

    # Transform helpful into "ratio" of being helpful
    df['helpful'] = df['helpful'].apply(lambda x: np.nan if literal_eval(x)[1]== 0 else literal_eval(x)[0]/literal_eval(x)[1])
    df['helpful'].fillna((df['helpful'].median()), inplace=True)

    # Convert categorical data to their own features
    # df = df.join(pd.get_dummies(df['root-genre']))
    df = df.drop(columns=['root-genre'])

    # Return processed data
    return df

# Load data
try:
    testPath
except NameError:
    # Default paths of CSV files
    print('Loading files from default locations')
    testPath = 'Test.csv'
    trainPath = 'Train.csv'

# Load dataframes
print("Loading test data")
dfTrain = pd.read_csv(trainPath)
if useTestCSV: dfTest = pd.read_csv(testPath)

# Train text classifier on training data
print("Constructing text frequency data")
trainingTextMatrix, wordIndices = trainTextFrequency(dfTrain)

# Process textual data
if useTestCSV:
    testTextMatrix = getTextMatrix(dfTest, wordIndices)

# Process numerical data
print("Annotating with numerical data")
dfTrain = processNumerical(dfTrain)
if useTestCSV: dfTest = processNumerical(dfTest)
print("Done!")

[nltk_data] Downloading package stopwords to /home/sam/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/sam/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /home/sam/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/sam/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


Loading files from default locations
Loading test data
Constructing text frequency data
Annotating with numerical data
Done!


In [5]:
dfTrain.columns

Index(['reviewerID', 'amazon-id', 'helpful', 'unixReviewTime', 'reviewText',
       'overall', 'summary', 'price', 'artist', 'salesRank'],
      dtype='object')

In [3]:
# Aggregate training
isAwesome = lambda x: 1 if np.mean(x) > 4.5 else 0
trainData = dfTrain.groupby('amazon-id').agg({
    'overall': isAwesome,
    'helpful': 'mean',
    'reviewText': 'count',
})

# normalization for numerical features
normalize_column_data(trainData)

# Aggregate testing data and split into dependent/independent vars
if useTestCSV:
    testData = dfTest.groupby('amazon-id').agg({
        'price': 'mean',
        'salesRank': 'mean',
        'helpful': 'mean',
    })
    normalize_column_data(testData)

    ytrain = trainData['overall'].to_numpy()
    Xtrain = scipy.sparse.hstack(
        (trainingTextMatrix, scipy.sparse.csr_matrix(trainData.drop(columns='overall').to_numpy()))
    )
    Xtrain = scipy.sparse.csr_matrix(Xtrain)
    testIndex = testData.index
    Xtest = scipy.sparse.hstack(
        (testTextMatrix, scipy.sparse.csr_matrix(testData.to_numpy()))
    )
    Xtest = scipy.sparse.csr_matrix(Xtest)
else:
    Xtrain = scipy.sparse.csr_matrix(scipy.sparse.hstack(
        (
            trainingTextMatrix,
            scipy.sparse.csr_matrix(trainData['helpful'].to_numpy().reshape(-1,1))
        )
    ))
    import sklearn.feature_selection

    ytrain = trainData['overall'].to_numpy()
    Xtrain = sklearn.feature_selection.SelectKBest(sklearn.feature_selection.chi2, k=7000).fit_transform(Xtrain, ytrain)

    Xtrain, Xtest, ytrain, ytest = sklearn.model_selection.train_test_split(Xtrain, trainData['overall'].to_numpy(), test_size=0.3, shuffle=True)

In [141]:
Xtrain.shape

(7380, 6000)

In [103]:


#if not useTestCSV:
    # Run ML

#    f1_vals = []
#    for train_index, test_index in kf.split(Xtrain):
#        x_train, x_test = Xtrain[train_index], Xtrain[test_index]
#        y_train, y_test = ytrain[train_index], ytrain[test_index]

#        clf = sklearn.linear_model.LogisticRegression(max_iter=100000, class_weight='balanced')
#        clt = clf.fit(x_train, y_train)

#        f1 = sklearn.metrics.f1_score(y_test, clt.predict(x_test), average='weighted')
#        print("F1 {}".format(f1))
#        f1_vals.append(f1)

#    print("Mean F1: ", statistics.mean(f1_vals))
    # print(sklearn.metrics.f1_score(ytest, ypreds, average='weighted'))

# Output CSV file with predictions
if useTestCSV:

    LR = sklearn.linear_model.LogisticRegression(max_iter=100000, class_weight='balanced')
    LRTrained = LR.fit(Xtrain, ytrain)
    ypreds = LRTrained.predict(Xtest)
    # Output predictions for deliverable
    output = pd.DataFrame({'amazon-id': testIndex, 'Awesome': ypreds})
    output.to_csv('./Product_Predictions.csv')
    print("Output to ./Product_Predictions.csv")

In [4]:
# Testing
useTestCSV = False

all_logistic_regression_params = [
    {
        "penalty": ["l2"],
        "C": [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0, 1.1, 2.0, 2.5, 3.0, 3.5, 4.0, 4.5, 5.0, 5.5, 6.0, 6.9, 6.95, 6.96, 6.97, 6.98, 6.99, 7.0],
        "class_weight": ["balanced"],
        "max_iter": [100000],
        "multi_class": ["multinomial"]
    },
    {
        "penalty": ["elasticnet", "l1"],
        "solver": ["saga"],
        "class_weight": ["balanced"],
        "max_iter": [100000],
        "multi_class": ["auto", "multinomial"],
        "C": [1.0, 1.1, 1.2, 1.3, 1.4, 1.5, 1.6, 1.7, 1.8, 1.9, 2.0, 2.1, 2.2, 2.3, 2.4, 2.5, 2.6, 2.7, 2.8, 2.9, 3.0, 3.5, 4.0],
        "l1_ratio": [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
    }
]

kf = sklearn.model_selection.KFold(n_splits=10, random_state=42, shuffle=True)

sklearn.linear_model.LogisticRegression()
logistic_regression = sklearn.linear_model.LogisticRegression()
print("Building gridd")
grid_search_logistic_regression = sklearn.model_selection.GridSearchCV(logistic_regression, param_grid=all_logistic_regression_params, cv=10, verbose=10, scoring="f1_weighted", n_jobs=31)

print("Fitting")
grid_search_logistic_regression.fit(Xtrain, ytrain)
grid_search_logistic_regression.best_score_


Building gridd
Fitting
Fitting 10 folds for each of 1039 candidates, totalling 10390 fits


KeyboardInterrupt: 

In [177]:
grid_search_logistic_regression.best_estimator_

LogisticRegression(C=6.97, class_weight='balanced', max_iter=100000,
                   multi_class='multinomial')

In [5]:
all_gradient_boost_params = [
    {"n_estimators": [10, 20, 30, 40, 50, 60, 60, 80, 90, 110, 120, 130, 140, 150, 160], "learning_rate": [0.3, 0.2, 0.1, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0, 1.1, 2.0]}
]

grad_boost = sklearn.ensemble.GradientBoostingClassifier()
grid_search_gradient_boost = sklearn.model_selection.GridSearchCV(grad_boost, param_grid=all_gradient_boost_params, cv=10, verbose=10, scoring="f1_weighted", n_jobs=28)

grid_search_gradient_boost.fit(Xtrain, ytrain)
grid_search_gradient_boost.best_score_

Fitting 10 folds for each of 180 candidates, totalling 1800 fits


KeyboardInterrupt: 

In [151]:
grid_search_gradient_boost.best_estimator_

GradientBoostingClassifier(learning_rate=0.3, n_estimators=150)

In [14]:
#all_random_forest_params = [
#    {
#        "criterion": ["gini", "entropy"],
#        "n_estimators": list(range(50, 301)),
#        "min_samples_split": list(range(2,20)),
#        "min_samples_leaf": list(range(1,5)),
#        "max_features": ["auto", "sqrt", "log2"],
#        "min_impurity_decrease": [0.0, 0.005, 0.1, 0.15, 0.2],
#        "class_weight": ["balanced", "balanced_subsample"],
#        "max_samples": [None, 1, 2, 3, 4, 5]
#    }
#]

all_random_forest_params = [
    {
        "criterion": ["gini", "entropy"],
        "max_depth": [5, 6, 7, 8, 9, 10,None],
        "class_weight": ["balanced", "balanced_subsample"],
        "min_impurity_decrease": [0.0001, 0.005, 0.1, 0.15, 0.2],
        "max_samples": [None, 0.5, 0.75],
        "n_estimators": [100, 150, 200, 250, 300, 50]
    }
]

kf = sklearn.model_selection.KFold(n_splits=10, random_state=42, shuffle=True)
random_forest = sklearn.ensemble.RandomForestClassifier()
grid_search_random_forest = sklearn.model_selection.GridSearchCV(random_forest, param_grid=all_random_forest_params, cv=10, verbose=10, scoring="f1_weighted", n_jobs=31)
grid_search_random_forest.fit(Xtrain, ytrain)
grid_search_random_forest.best_score_

Fitting 10 folds for each of 2520 candidates, totalling 25200 fits


0.7368081605796963

In [6]:
all_linear_svm_parameters = [
    {
        "C": [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0, 1.5, 1.6, 1.7, 1.8, 1.9, 2.0, 2.1, 2.2, 2.3, 2.4, 2.5, 2.6, 2.7, 2.8, 2.9, 3.0, 3.1, 3.2, 3.3, 3.4, 3.5, 3.6, 3.7, 3.8, 3.9, 4.0, 4.1, 4.2, 4.3, 4.4, 4.5, 4.6, 4.7, 4.8, 4.9, 5.0, 5.5, 6.0],
        "class_weight": [None, "balanced"],
        "max_iter": [5000000],
        "kernel": ['linear', 'rbf', 'poly'],
        "probability": [False],
        "cache_size": [5000],
        "verbose": [True]
    }
]

svc = sklearn.svm.SVC()
grid_search_svc = sklearn.model_selection.GridSearchCV(svc, param_grid=all_linear_svm_parameters, cv=10, verbose=10, scoring="f1_weighted", n_jobs=31)
grid_search_svc.fit(Xtrain, ytrain)
grid_search_svc.best_score_

Fitting 10 folds for each of 288 candidates, totalling 2880 fits


KeyboardInterrupt: 

In [166]:
grid_search_svc.best_estimator_

SVC(C=5.5, cache_size=5000, class_weight='balanced', kernel='linear',
    max_iter=5000000, verbose=True)

In [113]:
all_voting_params = [
    {
        "voting": ['soft'],
        'weights': [None, [1, 2], [2, 1], [1.5, 1], [1, 1.5], [1.1, 1.0], [1.0, 1.1]]
    }
]

voting_classifier = sklearn.ensemble.VotingClassifier(estimators=[('svc', grid_search_svc.best_estimator_), ('lr', grid_search_logistic_regression.best_estimator_)])
grid_search_voting = sklearn.model_selection.GridSearchCV(voting_classifier, param_grid=all_voting_params, cv=10, verbose=10, scoring="f1_weighted", n_jobs=31)
grid_search_voting.fit(Xtrain, ytrain)

grid_search_voting.best_score_



Fitting 10 folds for each of 7 candidates, totalling 70 fits


KeyboardInterrupt: 