<a href="https://colab.research.google.com/github/charliecarver/cosc247/blob/master/deliverable-3/deliverable-3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Load Data

In [2]:
# Load CSV files from remote repo
import requests
import zipfile
import io

import nltk
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger') 

r = requests.get('https://github.com/charliecarver/cosc247/blob/master/datasets.zip?raw=true')
z = zipfile.ZipFile(io.BytesIO(r.content))
z.extractall()
testPath = 'Test.csv'
trainPath = 'Train.csv'

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


## Single-File Deliverable

In [45]:
# Import required libraries
import pandas as pd
import numpy as np
import scipy.sparse
import string
from ast import literal_eval
import nltk.tokenize
import nltk.stem.porter
import math
from nltk.corpus import stopwords
import sklearn.metrics
import statistics
import sklearn.naive_bayes
import sklearn.feature_extraction.text
from sklearn import model_selection
from sklearn.preprocessing import OneHotEncoder
from sklearn import preprocessing
from sklearn import datasets
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
import sklearn.model_selection
import sklearn.metrics
import sklearn.naive_bayes
import sklearn.tree
import sklearn.linear_model
from sklearn.metrics import classification_report, confusion_matrix
from timeit import default_timer as timer
from matplotlib import pyplot as plt
from matplotlib import ticker
from sklearn.tree import export_graphviz
import graphviz
from collections import defaultdict

NGRAM_SIZE = 1
COMMON_WORD_THRESHOLD = 10
stemmer = nltk.stem.porter.PorterStemmer()

def tokenize_without_stopwords(review, stop_words, ngram_size):
    if type(review) == str:
        tokenized = nltk.tokenize.word_tokenize(review)

        filtered_words = [word for word in tokenized if (word not in string.punctuation) and (word not in stop_words) and (word != 'quot') and (word != "''") and (word != "``") and (word != '---')]
        filtered_words = list(map(stemmer.stem, filtered_words))

        return nltk.ngrams(filtered_words, NGRAM_SIZE)
    else:
        return []

def format_data_for_review_sentiment_classification(dataframe, index_by_word, ngram_size, column_name):
    stop_words_set = set(stopwords.words('english'))
    common_words = [key for key in index_by_word.keys()]

    print("Creating traning matrix")
    data = np.zeros(shape=(len(dataframe), len(common_words)), dtype=np.float64)
    print("Data shape:", data.shape)
    row_number = 0
    for _, data_row in dataframe.iterrows():
        review_words = tokenize_without_stopwords(data_row[column_name], stop_words_set, ngram_size)

        for word in review_words:
            if word in index_by_word:
                word_idx = index_by_word[word]
                data[row_number, word_idx] = np.float64(data[row_number, word_idx] + 1)
        row_number = row_number + 1

    return scipy.sparse.csr_matrix(data, dtype=np.float64)


def get_common_word_indices(training_data, column_name, ngram_size, common_word_threshold):
    training_data[column_name] = training_data[column_name].apply(
        lambda x: x.lower() if not type(x) == float else x
    )
    word_frequency = defaultdict(lambda: 0)

    stop_words_set = set(stopwords.words('english'))

    print("Building word frequency dictionary...")
    for review in training_data[column_name]:
        review_words = tokenize_without_stopwords(review, stop_words_set, ngram_size)

        for word in review_words:
            word_frequency[word] = word_frequency[word] + 1

    common_words = [
        word for word, freq in word_frequency.items() if freq > common_word_threshold
    ]

    common_words.sort(key=lambda word: word_frequency[word], reverse=True)

    print("Common words: ", common_words[0:50])

    print("Number of unique words", len(word_frequency))
    print("Number words that appear more than {} times".format(common_word_threshold), len(
        common_words
    ))

    print("Getting unique id for each word...")
    index_by_word = {}

    for index, word in enumerate(common_words):
        index_by_word[word] = index

    return index_by_word

# Train text classifier
def trainTextFrequency(df):
    df['reviewText'] = df['reviewText'].fillna("")
    df['summary'] = df['summary'].fillna("")

    P = df.groupby('amazon-id').agg({
        'reviewText': ' '.join,
        'summary': ' '.join,
        'overall': 'mean',
    })

    P['awesome'] = np.where(P['overall'] > 4.5, True, False)
    P['reviewText'] = P['reviewText'] + " " + P['summary']

    #word_index1 = get_common_word_indices(P, 'reviewText', NGRAM_SIZE, COMMON_WORD_THRESHOLD)
    #X1 = format_data_for_review_sentiment_classification(P, word_index1, NGRAM_SIZE, 'reviewText')
    v = sklearn.feature_extraction.text.TfidfVectorizer(ngram_range=(1,2))
    X1 = v.fit_transform(P['reviewText'])
    print(X1)

    #return X1, word_index1
    return X1, "a"

def getTextMatrix(df, word_indices):
    df['reviewText'] = df['reviewText'].fillna("")
    df['summary'] = df['summary'].fillna("")

    P = df.groupby('amazon-id').agg({
        'reviewText': ' '.join,
        'summary': ' '.join
    })

    P['reviewText'] = P['reviewText'] + " " + P['summary']

    X1 = format_data_for_review_sentiment_classification(P, word_indices, NGRAM_SIZE, 'reviewText')

    return X1

# function for normalization
def normalize_column_data(input_data):
    numerical_feautures = ['unixReviewTime','price', 'salesRank', 'helpful']
    for feature in numerical_feautures:    
        input_data[feature] = (input_data[feature]-input_data[feature].min())/(input_data[feature].max()-input_data[feature].min())

# Process numerical data
def processNumerical(df):

    # Drop text data
    df = df.drop(columns=['title', 'categories', 'songs', 'related', 'reviewTime'])

    # Drop columns that need more time to process
    df = df.drop(columns=['label', 'first-release-year'])

    # Transform helpful into "ratio" of being helpful
    df['helpful'] = df['helpful'].apply(lambda x: np.nan if literal_eval(x)[1]== 0 else literal_eval(x)[0]/literal_eval(x)[1])
    df['helpful'].fillna((df['helpful'].median()), inplace=True)

    # Convert categorical data to their own features
    # df = df.join(pd.get_dummies(df['root-genre']))
    df = df.drop(columns=['root-genre'])

    # Return processed data
    return df

# Flag to set mode
useTestCSV = False

# Load data
try:
    testPath
except NameError:
    # Default paths of CSV files
    print('Loading files from default locations')
    testPath = 'Test.csv'
    trainPath = 'Train.csv'

# Load dataframes
dfTrain = pd.read_csv(trainPath)
if useTestCSV: dfTest = pd.read_csv(testPath)

# Train text classifier on training data
trainingTextMatrix, wordIndices = trainTextFrequency(dfTrain)

# Process textual data
if useTestCSV:
    testTextMatrix = getTextMatrix(dfTest, wordIndices)

# Process numerical data
dfTrain = processNumerical(dfTrain)
if useTestCSV: dfTest = processNumerical(dfTest)

  (0, 1640148)	0.02314594146566252
  (0, 143334)	0.022092300378772963
  (0, 2436648)	0.025061753941434315
  (0, 1120551)	0.019478005323008942
  (0, 1108367)	0.026851565196811828
  (0, 2152036)	0.01792600681092878
  (0, 202558)	0.01741632384372781
  (0, 2333838)	0.011291430705280749
  (0, 1179305)	0.009451989693031602
  (0, 2541864)	0.012659066168907477
  (0, 2115918)	0.018732929528016314
  (0, 390902)	0.012191142098364955
  (0, 702969)	0.020385777406389163
  (0, 336648)	0.03228020584068953
  (0, 2051831)	0.03228020584068953
  (0, 244675)	0.03228020584068953
  (0, 1423421)	0.03228020584068953
  (0, 296942)	0.021491068367127233
  (0, 1005060)	0.008143708631726154
  (0, 2555532)	0.017375438445206618
  (0, 205777)	0.03228020584068953
  (0, 2280488)	0.03228020584068953
  (0, 2383447)	0.02994221757316695
  (0, 1244719)	0.03228020584068953
  (0, 389040)	0.02243855325916562
  :	:
  (10542, 196509)	0.021909935228131873
  (10542, 1480364)	0.028387734010354874
  (10542, 109992)	0.0189232274104008

In [None]:
useTestCSV = False

# Aggregate training
isAwesome = lambda x: 1 if np.mean(x) > 4.5 else 0
trainData = dfTrain.groupby('amazon-id').agg({
    'unixReviewTime': 'mean',
    'price': 'mean',
    'overall': isAwesome,
    'salesRank': 'mean',
    'helpful': 'mean',
})

# normalization for numerical features
normalize_column_data(trainData)
trainData = trainData.drop(columns=['price', 'unixReviewTime'])


# Aggregate testing data and split into dependent/independent vars
if useTestCSV:
    testData = dfTest.groupby('amazon-id').agg({
        'unixReviewTime': 'mean',
        'price': 'mean',
        'salesRank': 'mean',
        'helpful': 'mean',
    })
    normalize_column_data(testData)

    ytrain = trainData['overall'].to_numpy()
    Xtrain = scipy.sparse.hstack(
        (trainingTextMatrix, scipy.sparse.csr_matrix(trainData.drop(columns='overall').to_numpy()))
    )
    Xtrain = scipy.sparse.csr_matrix(Xtrain)
    testIndex = testData.index
    Xtest = scipy.sparse.hstack(
        (testTextMatrix, scipy.sparse.csr_matrix(testData.to_numpy()))
    )
    Xtest = scipy.sparse.csr_matrix(Xtest)
else:
    Xtrain = scipy.sparse.csr_matrix(scipy.sparse.hstack(
        (trainingTextMatrix, scipy.sparse.csr_matrix(trainData.drop(columns='overall').to_numpy()))
    ))
    ytrain = trainData['overall'].to_numpy()
    #Xtrain, Xtest, ytrain, ytest = sklearn.model_selection.train_test_split(Xtrain, trainData['overall'].to_numpy(), test_size=0.3, shuffle=True)

# Run ML
import sklearn.linear_model
import sklearn.model_selection
import statistics
kf = sklearn.model_selection.KFold(n_splits=10, shuffle=True)
f1_vals = []

for train_index, test_index in kf.split(Xtrain):
    x_train, x_test = Xtrain[train_index], Xtrain[test_index]
    y_train, y_test = ytrain[train_index], ytrain[test_index]

    #clf = sklearn.naive_bayes.MultinomialNB()
    #clf = sklearn.tree.DecisionTreeClassifier(max_depth=10)
    clf = sklearn.linear_model.LogisticRegression(max_iter=100000, class_weight='balanced')
    clt = clf.fit(x_train, y_train)

    f1 = sklearn.metrics.f1_score(y_test, clt.predict(x_test), average='weighted')
    print("F1 {}".format(f1))
    f1_vals.append(f1)
 
print("Mean F1: ", statistics.mean(f1_vals))
gnb = sklearn.linear_model.LogisticRegression(max_iter=10000, class_weight='balanced')
gnbTrained = gnb.fit(Xtrain, ytrain)
ypreds = gnbTrained.predict(Xtest)

# only for numerical data best model

# Testing
if not useTestCSV:
    print(sklearn.metrics.f1_score(ytest, ypreds, average='weighted'))

# Output CSV file with predictions
if useTestCSV:

    # Output predictions for deliverable
    output = pd.DataFrame({'amazon-id': testIndex, 'Awesome': ypreds})
    output.to_csv('./Product_Predictions.csv')

F1 0.7603150772781635
F1 0.7198019724047143
F1 0.7402431264439867


In [None]:
# Decision Tree with K fold for numerical data performance only
Xnumerical_best_train = Xtrain['helpful']
Xnumerical_best_train = Xnumerical_best_train.to_numpy().reshape(-1,1)
kfold = model_selection.KFold(n_splits=10, shuffle=True)
tree_clf = sklearn.tree.DecisionTreeClassifier(max_depth=4)
cv_results_acc = model_selection.cross_val_score(tree_clf, Xtrain, ytrain.values.ravel(), cv=kfold, scoring='accuracy')
cv_results_f1 = model_selection.cross_val_score(tree_clf, Xtrain, ytrain.values.ravel(), cv=kfold, scoring='f1_weighted')
print("f1_weighted score list: {}".format(cv_results_f1))
print("f1_weighted score average: {}".format(np.mean(cv_results_f1)))  
print("accuracy score list: {}".format(cv_results_acc))
print("accuracy score average: {}".format(np.mean(cv_results_acc)))

f1_weighted score list: [0.60724904 0.59836392 0.64437124 0.62662451 0.62482607]
f1_weighted score average: 0.6202869571318353
accuracy score list: [0.63550136 0.61517615 0.63685637 0.62940379 0.64701897]
accuracy score average: 0.6327913279132791
