<a href="https://colab.research.google.com/github/charliecarver/cosc247/blob/master/deliverable-3/deliverable-3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Load Data

In [None]:
# Load CSV files from remote repo
import requests
import zipfile
import io

# Download the dataset archive and unpack it into the working directory.
# A timeout keeps the cell from hanging forever on a stalled connection.
r = requests.get('https://github.com/charliecarver/cosc247/blob/master/datasets.zip?raw=true', timeout=60)
# Fail with a clear HTTP error instead of handing an error page to ZipFile,
# which would otherwise surface as a confusing BadZipFile exception.
r.raise_for_status()
# The context manager closes the archive handle once extraction is finished.
with zipfile.ZipFile(io.BytesIO(r.content)) as z:
    z.extractall()
testPath = 'Test.csv'
trainPath = 'Train.csv'

## Single-File Deliverable

In [None]:
from ast import literal_eval
import pandas as pd
import numpy as np
import scipy.sparse
import sklearn.metrics
import sklearn.naive_bayes
import sklearn.feature_extraction.text
import sklearn.model_selection
import sklearn.metrics
import sklearn.naive_bayes
import sklearn.tree
import sklearn.linear_model
import sklearn.linear_model
import sklearn.model_selection
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.svm import SVC
from sklearn.ensemble import AdaBoostClassifier
import statistics
import nltk

# Fetch each NLTK resource, in the same order as before
for resourceName in ('stopwords', 'wordnet', 'punkt', 'averaged_perceptron_tagger'):
    nltk.download(resourceName)

"""
Flags
"""

# Set to True to load Test.csv and output a predictions CSV;
# leave False to validate the model using only the training data
useTestCSV = False

# File paths
testPath = 'Test.csv'
trainPath = 'Train.csv'

# NLP params
# New: Hyperparameter optimization
# Upper bound of the n-gram range handed to the TF-IDF vectorizer
NGRAM_SIZE = 4
# Passed to the TF-IDF vectorizer as its min_df setting
COMMON_WORD_THRESHOLD = 2
# Porter stemmer shared by the text-vectorization code
stemmer = nltk.stem.porter.PorterStemmer()

"""
Processing Funcs
"""


# Preprocess textual data
def preprocessForTextClassification(df):
    """Build one text document per product from its reviews.

    Missing ``reviewText`` / ``summary`` values are treated as empty strings,
    rows are grouped by ``amazon-id``, and each group's texts are joined with
    single spaces. The joined summaries are then appended to the joined review
    texts.

    The input frame is left untouched: the work happens on a copy of the three
    columns that are needed, so callers do not see their text columns silently
    rewritten.

    Args:
        df: DataFrame with ``amazon-id``, ``reviewText`` and ``summary`` columns.

    Returns:
        DataFrame indexed by ``amazon-id`` with columns ``reviewText`` (review
        text followed by summaries) and ``summary``.
    """
    # Copy only the required columns so the fills below never touch the caller's frame
    text = df[['amazon-id', 'reviewText', 'summary']].copy()
    text['reviewText'] = text['reviewText'].fillna("")
    text['summary'] = text['summary'].fillna("")
    p = text.groupby('amazon-id').agg({
        'reviewText': ' '.join,
        'summary': ' '.join,
    })
    p['reviewText'] = p['reviewText'] + " " + p['summary']
    return p


# Train text classifier
def trainTextFrequency(df):
    """Fit a TF-IDF vectorizer on the per-product review documents.

    Stemming is applied per token through the vectorizer's ``tokenizer`` hook.
    The vectorizer's ``preprocessor`` hook receives the whole document string,
    so calling the stemmer there stems the entire review as if it were a single
    word rather than stemming its tokens.

    NOTE(review): this changes the learned vocabulary, so downstream
    hyperparameters tuned against the old features may need re-tuning.

    Args:
        df: Raw review DataFrame accepted by ``preprocessForTextClassification``.

    Returns:
        Tuple ``(X1, vectorizer)``: the TF-IDF matrix for the documents and the
        fitted vectorizer, which can be reused to transform other data.
    """
    from functools import lru_cache

    P = preprocessForTextClassification(df)

    # Reuse scikit-learn's default word splitter so tokenization itself is unchanged
    splitIntoTokens = sklearn.feature_extraction.text.TfidfVectorizer().build_tokenizer()
    # The same words recur constantly across reviews; memoise their stems
    stemToken = lru_cache(maxsize=1 << 18)(stemmer.stem)

    def stemmingTokenizer(document):
        return [stemToken(token) for token in splitIntoTokens(document)]

    # New: Vectorizer change
    vectorizer = sklearn.feature_extraction.text.TfidfVectorizer(
        ngram_range=(1, NGRAM_SIZE),
        min_df=COMMON_WORD_THRESHOLD,
        tokenizer=stemmingTokenizer,
        # token_pattern is unused once a tokenizer is supplied; None silences the warning
        token_pattern=None,
    )
    X1 = vectorizer.fit_transform(P['reviewText'])
    return X1, vectorizer


# Create text matrix for NLP
def getTextMatrix(df, word_indices):
    """Vectorize a review DataFrame with an already-fitted vectorizer.

    Args:
        df: Raw review DataFrame accepted by ``preprocessForTextClassification``.
        word_indices: Fitted vectorizer exposing ``transform``.

    Returns:
        The matrix produced by ``word_indices.transform`` for the per-product
        ``reviewText`` documents.
    """
    documents = preprocessForTextClassification(df)['reviewText']
    return word_indices.transform(documents)


# Column normalization
def normalizeColumnData(input_data):
    """Min-max scale every column of ``input_data`` in place.

    Each column is rewritten as ``(value - min) / (max - min)``. A column whose
    values are all equal has a zero range; it is set to 0.0 (missing entries
    stay missing) instead of becoming all-NaN through a 0/0 division.

    Args:
        input_data: DataFrame of numeric columns; modified in place.

    Returns:
        None.
    """
    for feature in input_data:
        column = input_data[feature]
        low = column.min()
        span = column.max() - low
        shifted = column - low
        if span == 0:
            # Constant column: every present value equals the minimum, so shifted is all zeros
            input_data[feature] = shifted.astype(float)
        else:
            input_data[feature] = shifted / span


# Parse one raw 'helpful' cell into a ratio
def _helpfulRatio(raw):
    """Return helpful/total for a ``'[helpful, total]'`` string, or NaN.

    NaN is returned when the cell is not a string (e.g. a missing value) or
    when the total vote count is zero.
    """
    if not isinstance(raw, str):
        return np.nan
    # Parse once instead of re-evaluating the literal for every access
    votes = literal_eval(raw)
    if votes[1] == 0:
        return np.nan
    return votes[0] / votes[1]


# Process numerical data
def processNumerical(df):
    """Return a copy of ``df`` reduced to its numerical features.

    Steps:
      * drop the text/categorical columns ``title``, ``categories``, ``songs``,
        ``related``, ``reviewTime``, ``label`` and ``root-genre``;
      * fill missing ``first-release-year`` with the column median and add the
        indicator ``first-release-year-trans`` (1 if the year is after 1990);
      * turn ``helpful`` into a helpful/total ratio, filling undefined ratios
        with the column median;
      * add ``review_count`` = 1 per row so a later sum counts reviews.

    Fills are assigned back to the column rather than done with a chained
    ``inplace=True`` call, which does not update the frame under pandas
    Copy-on-Write.

    Args:
        df: Raw review DataFrame containing the columns named above.

    Returns:
        A new DataFrame; the input frame is not modified.
    """
    # Drop text data and the categorical root-genre column (drop returns a new frame)
    df = df.drop(columns=['title', 'categories', 'songs', 'related', 'reviewTime', 'label', 'root-genre'])

    # Process release year
    releaseYear = df['first-release-year']
    df['first-release-year'] = releaseYear.fillna(releaseYear.median())
    df['first-release-year-trans'] = (df['first-release-year'] > 1990).astype('int64')

    # Transform helpful into "ratio" of being helpful
    helpfulRatio = df['helpful'].apply(_helpfulRatio)
    df['helpful'] = helpfulRatio.fillna(helpfulRatio.median())

    # review counter for each review
    df['review_count'] = 1

    # Return processed data
    return df


"""
Main code
"""

# Announce which mode the script is running in and where its inputs live
if not useTestCSV:
    print("useTestCSV = False, using training data to validate model")
else:
    print("useTestCSV = True, using Test.csv to generate predictions")
print("Training file location: ", trainPath)
print("Testing file location: ", testPath)

# Read the CSV inputs; the testing file is only needed when predictions are requested
print("Loading training csv file...")
dfTrain = pd.read_csv(trainPath)
if useTestCSV:
    print("Loading testing csv file...")
    dfTest = pd.read_csv(testPath)

# Fit the text vectorizer on the training reviews
print("Training text classifier...")
trainingTextMatrix, wordIndices = trainTextFrequency(dfTrain)

# Vectorize the testing reviews with the vectorizer fitted above
if useTestCSV:
    print("Getting text matrix for testing data...")
    testTextMatrix = getTextMatrix(dfTest, wordIndices)

# Reduce each frame to its numerical features
print("Processing numerical data in training file...")
dfTrain = processNumerical(dfTrain)
if useTestCSV:
    print("Processing numerical data in testing file...")
    dfTest = processNumerical(dfTest)

# Aggregate training data and normalize
print("Grouping training data by amazon-id and aggregating...")
# New: Added additional numerical fields
# Per-product aggregation shared by the training and testing frames, so both
# feature matrices carry the same numerical columns in the same order.
numericAggregations = {
    'unixReviewTime': 'mean',
    'price': 'mean',
    'salesRank': 'mean',
    'helpful': 'mean',
    'first-release-year': 'mean',
    'review_count': 'sum',
}
trainData = dfTrain.groupby('amazon-id').agg({
    **numericAggregations,
    # Binary target: 1 when the product's mean rating exceeds 4.5, else 0
    'overall': lambda ratings: 1 if np.mean(ratings) > 4.5 else 0,
})
normalizeColumnData(trainData)

# Split data into dependent/independent vars
if useTestCSV:

    # Aggregate testing data from CSV file
    # New: Added additional numerical fields
    print("Grouping testing data by amazon-id and aggregating...")
    testData = dfTest.groupby('amazon-id').agg(numericAggregations)
    # NOTE(review): the testing frame is scaled with its own min/max, not the
    # training frame's, so the two feature scales can differ.
    normalizeColumnData(testData)

    # Split data
    print("Splitting testing/training data into dependent and indepedent variables...")
    ytrain = trainData['overall'].to_numpy()
    Xtrain = scipy.sparse.csr_matrix(scipy.sparse.hstack(
        (trainingTextMatrix, scipy.sparse.csr_matrix(trainData.drop(columns='overall').to_numpy()))
    ))
    testIndex = testData.index
    Xtest = scipy.sparse.csr_matrix(scipy.sparse.hstack(
        (testTextMatrix, scipy.sparse.csr_matrix(testData.to_numpy()))
    ))

else:

    # If we're just testing our classifier, split the training data into training + testing datasets
    # New: Ablation testing
    print("Splitting training data into testing and training sets...")
    ablation_columns = ['first-release-year', 'review_count']
    Xtrain = scipy.sparse.csr_matrix(scipy.sparse.hstack(
        (trainingTextMatrix, scipy.sparse.csr_matrix(trainData[ablation_columns].to_numpy()))
    ))
    ytrain = trainData['overall'].to_numpy()
    # Never ask for more features than exist: a small corpus can yield fewer
    # than 10000 columns, which SelectKBest rejects or warns about.
    # NOTE(review): the selector is fitted on all rows before the split below,
    # so held-out rows influence feature selection (optimistic scores).
    selector = SelectKBest(chi2, k=min(10000, Xtrain.shape[1]))
    Xtrain = selector.fit_transform(Xtrain, ytrain)
    # Seed the shuffle so validation scores are reproducible between runs
    Xtrain, Xtest, ytrain, ytest = sklearn.model_selection.train_test_split(
        Xtrain, ytrain, test_size=0.3, shuffle=True, random_state=42
    )

In [None]:
# Testing model: score the classifier with seeded 10-fold cross-validation
if not useTestCSV:
    print("Testing model with 10-fold cross-validation...")
    kf = sklearn.model_selection.KFold(n_splits=10, random_state=42, shuffle=True)
    f1_vals = []
    for fitRows, holdoutRows in kf.split(Xtrain):
        # New: Hyperparameter optimization for LR
        # A fresh classifier is built for every fold
        clf = sklearn.linear_model.LogisticRegression(
            max_iter=100000,
            multi_class='multinomial',
            C=6.97,
            class_weight='balanced',
            penalty='l2',
        )
        clt = clf.fit(Xtrain[fitRows], ytrain[fitRows])
        foldPredictions = clt.predict(Xtrain[holdoutRows])
        f1 = sklearn.metrics.f1_score(ytrain[holdoutRows], foldPredictions, average='weighted')
        print("\tF1 {}".format(f1))
        f1_vals.append(f1)
    print("Mean F1: ", statistics.mean(f1_vals))

# Fit on the full training set and write one prediction per product to CSV
if useTestCSV:
    print("Training model...")
    # New: Hyperparameter optimization for LR
    clf = sklearn.linear_model.LogisticRegression(
        max_iter=100000,
        multi_class='multinomial',
        C=6.97,
        class_weight='balanced',
        penalty='l2',
    )
    clt = clf.fit(Xtrain, ytrain)
    ypreds = clt.predict(Xtest)
    # Pair each product id from the testing frame's index with its prediction
    predictionColumns = {'amazon-id': testIndex, 'Awesome': ypreds}
    output = pd.DataFrame(predictionColumns)
    output.to_csv('./Product_Predictions.csv')
    print("Output predictions to './Product_Predictions.csv'")

print("Done!")