## Load Data

In [9]:
# Load CSV files from remote repo
import requests
import zipfile
import io

import nltk
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger') 

r = requests.get('https://github.com/charliecarver/cosc247/blob/master/datasets.zip?raw=true')
z = zipfile.ZipFile(io.BytesIO(r.content))
z.extractall()
testPath = 'Test.csv'
trainPath = 'Train.csv'

[nltk_data] Downloading package stopwords to /home/sam/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/sam/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /home/sam/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/sam/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


## Single-File Deliverable

In [13]:
# Import required libraries
import pandas as pd
import numpy as np
import scipy.sparse
import string
from ast import literal_eval
import nltk.tokenize
import math
from nltk.corpus import stopwords
import sklearn.metrics
import statistics
import sklearn.naive_bayes
from sklearn import model_selection
from sklearn.preprocessing import OneHotEncoder
from sklearn import preprocessing
from sklearn import datasets
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
import sklearn.model_selection
import sklearn.metrics
import sklearn.naive_bayes
import sklearn.tree
import sklearn.linear_model
from sklearn.metrics import classification_report, confusion_matrix
from timeit import default_timer as timer
from matplotlib import pyplot as plt
from matplotlib import ticker
from sklearn.tree import export_graphviz
import graphviz
from collections import defaultdict

NGRAM_SIZE = 1
COMMON_WORD_THRESHOLD = 10

def tokenize_without_stopwords(review, stop_words, ngram_size):
    if type(review) == str:
        tokenized = nltk.tokenize.word_tokenize(review)

        return nltk.ngrams([word for word in tokenized if (word not in string.punctuation) and (word not in stop_words) and (word != 'quot') and (word != "''") and (word != "``") and (word != '---')], ngram_size)
    else:
        return []

def format_data_for_review_sentiment_classification(dataframe, index_by_word, ngram_size, column_name):
    stop_words_set = set(stopwords.words('english'))
    common_words = [key for key in index_by_word.keys()]

    print("Creating traning matrix")
    data = np.zeros(shape=(len(dataframe), len(common_words)), dtype=np.float64)
    print("Data shape:", data.shape)
    row_number = 0
    for _, data_row in dataframe.iterrows():
        review_words = tokenize_without_stopwords(data_row[column_name], stop_words_set, ngram_size)

        for word in review_words:
            if word in index_by_word:
                word_idx = index_by_word[word]
                data[row_number, word_idx] = np.float64(data[row_number, word_idx] + 1)
        row_number = row_number + 1

    return scipy.sparse.csr_matrix(data, dtype=np.float64)


def get_common_word_indices(training_data, column_name, ngram_size, common_word_threshold):
    training_data[column_name] = training_data[column_name].apply(
        lambda x: x.lower() if not type(x) == float else x
    )
    word_frequency = defaultdict(lambda: 0)

    stop_words_set = set(stopwords.words('english'))

    print("Building word frequency dictionary...")
    for review in training_data[column_name]:
        review_words = tokenize_without_stopwords(review, stop_words_set, ngram_size)

        for word in review_words:
            word_frequency[word] = word_frequency[word] + 1

    common_words = [
        word for word, freq in word_frequency.items() if freq > common_word_threshold
    ]

    common_words.sort(key=lambda word: word_frequency[word], reverse=True)

    print("Common words: ", common_words[0:50])

    print("Number of unique words", len(word_frequency))
    print("Number words that appear more than {} times".format(common_word_threshold), len(
        common_words
    ))

    print("Getting unique id for each word...")
    index_by_word = {}

    for index, word in enumerate(common_words):
        index_by_word[word] = index

    return index_by_word

# Train text classifier
def trainTextFrequency(df):
    df['reviewText'] = df['reviewText'].fillna("")
    df['summary'] = df['summary'].fillna("")

    P = df.groupby('amazon-id').agg({
        'reviewText': ' '.join,
        'summary': ' '.join,
        'overall': 'mean',
    })

    P['awesome'] = np.where(P['overall'] >= 4.5, True, False)
    P['reviewText'] = P['reviewText'] + " " + P['summary']

    word_index1 = get_common_word_indices(P, 'reviewText', NGRAM_SIZE, COMMON_WORD_THRESHOLD)
    X1 = format_data_for_review_sentiment_classification(P, word_index1, NGRAM_SIZE, 'reviewText')

    return X1, word_index1

def getTextMatrix(df, word_indices):
    df['reviewText'] = df['reviewText'].fillna("")
    df['summary'] = df['summary'].fillna("")

    P = df.groupby('amazon-id').agg({
        'reviewText': ' '.join,
        'summary': ' '.join
    })

    P['reviewText'] = P['reviewText'] + " " + P['summary']

    X1 = format_data_for_review_sentiment_classification(P, word_index1, NGRAM_SIZE, 'reviewText')

    return X1

# function for normalization
def normalize_column_data(input_data):
    numerical_feautures = ['unixReviewTime','price', 'salesRank', 'helpful']
    for feature in numerical_feautures:    
        input_data[feature] = (input_data[feature]-input_data[feature].min())/(input_data[feature].max()-input_data[feature].min())

# Process numerical data
def processNumerical(df):

    # Drop text data
    df = df.drop(columns=['title', 'reviewText', 'summary', 'categories', 'songs', 'related', 'reviewTime'])

    # Drop columns that need more time to process
    df = df.drop(columns=['label', 'first-release-year'])

    # Transform helpful into "ratio" of being helpful
    df['helpful'] = df['helpful'].apply(lambda x: np.nan if literal_eval(x)[1]== 0 else literal_eval(x)[0]/literal_eval(x)[1])
    df['helpful'].fillna((df['helpful'].median()), inplace=True)

    # Convert categorical data to their own features
    # df = df.join(pd.get_dummies(df['root-genre']))
    df = df.drop(columns=['root-genre'])

    # Return processed data
    return df

# Flag to set mode
useTestCSV = False

# Load data
try:
    testPath
except NameError:
    # Default paths of CSV files
    print('Loading files from default locations')
    testPath = 'Test.csv'
    trainPath = 'Train.csv'

# Load dataframes
dfTrain = pd.read_csv(trainPath)
if useTestCSV: dfTest = pd.read_csv(testPath)

# Train text classifier on training data
textMatrix, wordIndices = trainTextFrequency(dfTrain)

# Process textual data
if useTestCSV:
    testTextMatrix = getTextMatrix(dfTest, wordIndices)

# Process numerical data
dfTrain = processNumerical(dfTrain)
if useTestCSV: dfTest = processNumerical(dfTest)

Building word frequency dictionary...
Common words:  [("'s",), ('album',), ('cd',), ('music',), ('one',), ("n't",), ('great',), ('songs',), ('like',), ('song',), ('love',), ('good',), ('best',), ('...',), ('time',), ('would',), ('first',), ('really',), ('sound',), ('get',), ('well',), ('much',), ('new',), ('listen',), ('even',), ('also',), ('beatles',), ('voice',), ('still',), ('track',), ('better',), ('many',), ('albums',), ('heard',), ("'m",), ('think',), ('tracks',), ('band',), ('ever',), ('way',), ('two',), ('could',), ('back',), ("'ve",), ('movie',), ('years',), ('never',), ('work',), ('--',), ('fan',)]
Number of unique words 253503
Number words that appear more than 10 times 26296
Getting unique id for each word...
Creating traning matrix
Data shape: (10543, 26296)


In [25]:
# Aggregate training
isAwesome = lambda x: 1 if np.mean(x) > 4.5 else 0
trainData = dfTrain.groupby('amazon-id').agg({
    'unixReviewTime': 'mean',
    'price': 'mean',
    'overall': isAwesome,
    'salesRank': 'mean',
    'helpful': 'mean'
})
print("number rows in train data: ", len(trainData))

# normalization for numerical features
normalize_column_data(trainData)

# Aggregate testing data and split into dependent/independent vars
if useTestCSV:
    testData = dfTest.groupby('amazon-id').agg({
        'unixReviewTime': 'mean',
        'price': 'mean',
        'salesRank': 'mean',
        'helpful': 'mean'
    })
    normalize_column_data(testData)
    Xtrain, ytrain = trainData.drop(columns='overall'), trainData['overall']
    print(Xtrain.shape)

    Xtrain = scipy.sparse.csr_matrix(
        scipy.sparse.hstack((textMatrix, Xtrain))
    )
    Xtest, ytest = scipy.sparse.csr_matrix(testData.to_numpy()), []

    Xtest = scipy.sparse.csr_matrix(scipy.sparse.hstack(
        (testTextMatrix, Xtest)
    ))
else:
    Xtrain, ytrain = trainData.drop(columns='overall'), trainData['overall']
    Xtest, ytest = testData.drop(columns='overall'), testData['overall']
    
    print(len(Xtrain))

    X = scipy.sparse.csr_matrix(
        scipy.sparse.hstack((textMatrix, Xtrain.to_numpy()))
    )
    y = trainData['overall'].to_numpy()


kf = sklearn.model_selection.KFold(n_splits=10, shuffle=True)
f1_vals = []

from sklearn.linear_model import LogisticRegression

for train_index, test_index in kf.split(X):
    x_train, x_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    #clf = sklearn.naive_bayes.MultinomialNB()
    #clf = sklearn.tree.DecisionTreeClassifier(max_depth=10)
    clf = LogisticRegression(max_iter=100000, class_weight='balanced')
    clt = clf.fit(x_train, y_train)

    f1 = sklearn.metrics.f1_score(y_test, clt.predict(x_test), average='weighted')
    print("F1 {}".format(f1))
    f1_vals.append(f1)

import statistics
print("Mean F1 value: {}".format(statistics.mean(f1_vals)))
# Run ML
"""
gnb = sklearn.naive_bayes.GaussianNB()
gnbTrained = gnb.fit(Xtrain, ytrain)
preds = gnbTrained.predict(Xtest)
"""

# Testing
if not useTestCSV:
    print(sklearn.metrics.f1_score(ytest, ypreds, average='weighted'))

# Output CSV file with predictions
if useTestCSV:

    # Output predictions for deliverable
    output = pd.DataFrame({'amazon-id': Xtest.index, 'Awesome': preds})
    output.to_csv('./Product_Predictions.csv')

number rows in train data:  10543
10543
F1 0.6845126457304492
F1 0.7126216835241035
F1 0.6887596340241856
F1 0.6800365013523982
F1 0.6890445445778232
F1 0.6790070779618123
F1 0.6931755572442905
F1 0.682722716743286
F1 0.686000178064957
F1 0.6911080273572953
Mean F1 value: 0.6886988566580601


NameError: name 'ypreds' is not defined

In [None]:
# Decision Tree with K fold for numerical data performance only
Xnumerical_best_train = Xtrain['helpful']
Xnumerical_best_train = Xnumerical_best_train.to_numpy().reshape(-1,1)
kfold = model_selection.KFold(n_splits=5, shuffle=True)
tree_clf = sklearn.tree.DecisionTreeClassifier(max_depth=4)
cv_results_acc = model_selection.cross_val_score(tree_clf, Xtrain, ytrain.values.ravel(), cv=kfold, scoring='accuracy')
cv_results_f1 = model_selection.cross_val_score(tree_clf, Xtrain, ytrain.values.ravel(), cv=kfold, scoring='f1_weighted')
print("f1_weighted score list: {}".format(cv_results_f1))
print("f1_weighted score average: {}".format(np.mean(cv_results_f1)))  
print("accuracy score list: {}".format(cv_results_acc))
print("accuracy score average: {}".format(np.mean(cv_results_acc)))