In [None]:
# Fetch the NLTK resources used by this notebook.
import nltk
# Stop-word list read through nltk.corpus.stopwords in the feature-building code.
nltk.download('stopwords')
# NOTE(review): wordnet and averaged_perceptron_tagger are not referenced by the
# code visible in this notebook — confirm they are still needed.
nltk.download('wordnet')
# Presumably the tokenizer model behind nltk.tokenize.word_tokenize — TODO confirm.
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger') 

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [None]:
import pandas as pd
import numpy as np
import nltk.tokenize
import math
from nltk.corpus import stopwords
from collections import defaultdict
import sklearn.model_selection
import sklearn.metrics
import sklearn.naive_bayes
import sklearn.tree 
import string

# An n-gram must occur strictly more than this many times in a column to be kept
# as a feature when the count matrix is built.
COMMON_WORD_THRESHOLD = 400

def tokenize_without_stopwords(review, stop_words, ngram_size):
    """Tokenize ``review`` and return its n-grams after dropping noise tokens.

    Args:
        review: The text to tokenize. Anything whose type is not exactly ``str``
            (presumably NaN floats for missing reviews) yields an empty list.
        stop_words: Container of tokens to discard.
        ngram_size: Length of the n-grams to produce.

    Returns:
        The iterable produced by ``nltk.ngrams`` over the kept tokens, or ``[]``
        for non-string input.
    """
    if type(review) is not str:
        return []

    # Tokens that are dropped in addition to stop words and punctuation.
    ignored_tokens = ('quot', "''", "``")

    kept_tokens = []
    for token in nltk.tokenize.word_tokenize(review):
        # string.punctuation is a str, so this is a substring test: single
        # punctuation characters match, but tokens such as "..." or "--" do not.
        if token in string.punctuation:
            continue
        if token in stop_words:
            continue
        if token in ignored_tokens:
            continue
        kept_tokens.append(token)

    return nltk.ngrams(kept_tokens, ngram_size)

def get_ground_truth_for_sentiment_classification_training(training_data, positive_or_negative):
    """Derive binary sentiment labels from the ``overall`` rating column.

    Args:
        training_data: DataFrame containing an ``overall`` rating column.
        positive_or_negative: ``'positive'`` marks rows with ``overall >= 4`` as 1;
            ``'negative'`` marks rows with ``overall <= 2`` as 1.

    Returns:
        A ``np.int8`` array with one 0/1 entry per row, in row order.

    Raises:
        ValueError: If ``positive_or_negative`` is neither ``'positive'`` nor
            ``'negative'``.
    """
    if positive_or_negative == 'positive':
        is_target = training_data['overall'] >= 4
    elif positive_or_negative == 'negative':
        is_target = training_data['overall'] <= 2
    else:
        raise ValueError("Unrecognized sentiment type!")

    # Convert by position rather than by index label, so frames whose index is
    # not 0..n-1 (e.g. after filtering or shuffling) still get correctly placed labels.
    return is_target.to_numpy().astype(np.int8)


def format_data_for_review_sentiment_classification(training_data, common_word_threshold, column_name, ngram_size):
    """Build an n-gram count matrix for one text column.

    The vocabulary is every n-gram that occurs strictly more than
    ``common_word_threshold`` times in ``training_data[column_name]``, ordered by
    descending frequency. Because the vocabulary is derived from the frame passed
    in, matrices built from different frames generally do not share a column layout.

    Side effect: ``training_data[column_name]`` is lowercased in place.

    Args:
        training_data: DataFrame holding the text column.
        common_word_threshold: Frequency an n-gram must exceed to become a feature.
        column_name: Name of the text column to featurize.
        ngram_size: Size of the n-grams passed to the tokenizer.

    Returns:
        A float array of shape ``(len(training_data), number_of_common_ngrams)``
        where entry ``[i, j]`` counts n-gram ``j`` in the ``i``-th row.
    """
    # Only real strings are lowercased; missing values (NaN) and any other
    # non-string cell pass through untouched instead of raising AttributeError.
    training_data[column_name] = training_data[column_name].apply(
        lambda x: x.lower() if isinstance(x, str) else x
    )
    word_frequency = defaultdict(int)

    stop_words_set = set(stopwords.words('english'))

    print("Building word frequency dictionary...")
    for review in training_data[column_name]:
        for word in tokenize_without_stopwords(review, stop_words_set, ngram_size):
            word_frequency[word] += 1

    common_words = [
        word for word, freq in word_frequency.items() if freq > common_word_threshold
    ]

    common_words.sort(key=lambda word: word_frequency[word], reverse=True)

    print("Common words: ", common_words[0:50])

    print("Number of unique words", len(word_frequency))
    print("Number words that appear more than {} times".format(common_word_threshold), len(
        common_words
    ))

    print("Getting unique id for each word...")
    index_by_word = {word: index for index, word in enumerate(common_words)}

    print("Creating traning matrix")
    data = np.zeros((len(training_data), len(common_words)), dtype=float)
    # enumerate() gives the row *position*; index labels are not used as offsets,
    # so frames with a non-default index fill the right rows.
    for row_position, review in enumerate(training_data[column_name]):
        for word in tokenize_without_stopwords(review, stop_words_set, ngram_size):
            word_idx = index_by_word.get(word)
            if word_idx is not None:
                data[row_position, word_idx] += 1

    return data


def train_review_sentiment_classifier(dataframe, common_word_threshold, column_name, ngram_size, positive_or_negative, random_state=0):
    """Fit a depth-limited decision tree that predicts sentiment from n-gram counts.

    Args:
        dataframe: DataFrame with the text column and an ``overall`` rating column.
        common_word_threshold: Frequency an n-gram must exceed to become a feature.
        column_name: Text column to featurize.
        ngram_size: Size of the n-grams used as features.
        positive_or_negative: ``'positive'`` or ``'negative'``; selects how labels
            are derived from the rating.
        random_state: Seed forwarded to the decision tree so repeated runs build
            the same tree.

    Returns:
        The fitted ``sklearn.tree.DecisionTreeClassifier``.
    """
    print("Training sentiment classifier on column {} with common word threshold {}".format(column_name, common_word_threshold))

    # Labels are cheap to compute, so derive them first: an invalid sentiment
    # type fails before the expensive tokenization pass.
    ground_truth = get_ground_truth_for_sentiment_classification_training(dataframe, positive_or_negative)

    data = format_data_for_review_sentiment_classification(
        dataframe, common_word_threshold, column_name, ngram_size
    )

    clf = sklearn.tree.DecisionTreeClassifier(max_depth=6, random_state=random_state)
    return clf.fit(data, ground_truth)


def add_review_sentiment_column_into_data(dataframe, trained_classifier, from_colname, to_colname, ngram_size, common_word_threshold=None):
    """Return a copy of ``dataframe`` with a column of classifier predictions.

    Args:
        dataframe: DataFrame holding the text column to classify.
        trained_classifier: Fitted estimator exposing ``predict``.
        from_colname: Text column the features are built from.
        to_colname: Name of the prediction column to add.
        ngram_size: Size of the n-grams used as features.
        common_word_threshold: Frequency threshold for the feature vocabulary.
            Defaults to the module-level ``COMMON_WORD_THRESHOLD``; pass the value
            the classifier was trained with if it differs.

    Returns:
        A new DataFrame (via ``DataFrame.assign``) containing ``to_colname``.
    """
    if common_word_threshold is None:
        # Resolved at call time so the current value of the constant is used.
        common_word_threshold = COMMON_WORD_THRESHOLD

    formatted_review_data = format_data_for_review_sentiment_classification(
        dataframe, common_word_threshold, from_colname, ngram_size
    )

    predictions = trained_classifier.predict(formatted_review_data)

    return dataframe.assign(**{to_colname: predictions})


In [None]:
# Read both CSV files from the current working directory.
# NOTE(review): testing_data is not used by any cell visible in this notebook.
training_data = pd.read_csv("Train.csv")
testing_data = pd.read_csv("Test.csv")

In [None]:
print("~~~~~~~~~~~~~~ POSITIVE SENTIMENT ANALYSIS~~~~~~~~~~~~~~~~~~~~~")
print("Training sentiment classifier on full text!")
# Bigram features (ngram_size=2) for the full review text.
trained_positive_review_classifier = train_review_sentiment_classifier(
    training_data,
    COMMON_WORD_THRESHOLD,
    'reviewText',
    2,
    'positive'
)
# Unigram features (ngram_size=1) for the short summary.
trained_positive_review_summary_classifier = train_review_sentiment_classifier(
    training_data,
    COMMON_WORD_THRESHOLD,
    'summary',
    1,
    'positive'
)
print("Annotating training data with positive column!")
training_data = add_review_sentiment_column_into_data(training_data, trained_positive_review_classifier, 'reviewText', 'positive_review_text', 2)
training_data = add_review_sentiment_column_into_data(training_data, trained_positive_review_summary_classifier, 'summary', 'positive_review_summary', 1)

# Score against the same 'positive' labels the classifiers were trained on.
# The metrics are computed on the training rows themselves, so they are optimistic.
positive_ground_truth = get_ground_truth_for_sentiment_classification_training(training_data, 'positive')
positive_predictions_from_text = training_data['positive_review_text'].to_numpy(dtype=np.int8)

positive_predictions_from_summary = training_data['positive_review_summary'].to_numpy(dtype=np.int8)

f1_text = sklearn.metrics.f1_score(positive_ground_truth, positive_predictions_from_text)
accuracy_text = sklearn.metrics.accuracy_score(positive_ground_truth, positive_predictions_from_text)

f1_summary = sklearn.metrics.f1_score(positive_ground_truth, positive_predictions_from_summary)
accuracy_summary = sklearn.metrics.accuracy_score(positive_ground_truth, positive_predictions_from_summary)

print("Evaluating positive sentiment classifier on full review text")
print("F1 {}, Accuracy {}".format(f1_text, accuracy_text))

print("performance on positive summary")
print("F1 {}, Accuracy {}".format(f1_summary, accuracy_summary))






~~~~~~~~~~~~~~ POSITIVE SENTIMENT ANALYSIS~~~~~~~~~~~~~~~~~~~~~
Training sentiment classifier on full text!
Training sentiment classifier on column reviewText with common word threshold 400
Building word frequency dictionary...




Common words:  [('ca', "n't"), ('one', 'best'), ('cd', "'s"), ('wo', "n't"), ('pink', 'floyd'), ('sounds', 'like'), ('every', 'song'), ("'s", 'voice'), ("'s", 'music'), ('album', "'s"), ("n't", 'know'), ('...', '...'), ("n't", 'get'), ('highly', 'recommend'), ("'s", 'best'), ('first', 'time'), ('great', 'album'), ('years', 'ago'), ('love', 'cd'), ("'s", 'great'), ('songs', 'like'), ('one', 'favorite'), ('abbey', 'road'), ('songs', 'album'), ('could', "n't"), ("n't", 'like'), ('sound', 'like'), ('great', 'cd'), ('ever', 'heard'), ('even', 'though'), ('title', 'track'), ('much', 'better'), ("'ve", 'heard'), ('long', 'time'), ('great', 'music'), ('song', 'album'), ('bought', 'cd'), ('sound', 'quality'), ('great', 'songs'), ("'s", 'good'), ('would', "n't"), ("n't", 'think'), ('white', 'album'), ('never', 'heard'), ('really', 'good'), ('buy', 'cd'), ('would', 'recommend'), ('--', '--'), ('best', 'album'), ("'s", 'one')]
Number of unique words 3113742
Number words that appear more than 400 t



Training sentiment classifier on column summary with common word threshold 400
Building word frequency dictionary...
Common words:  [('great',), ('...',), ('cd',), ('album',), ('music',), ('best',), ("'s",), ('good',), ('love',), ('one',), ('awesome',), ('excellent',), ('christmas',), ('beautiful',), ('soundtrack',), ("n't",), ('beatles',), ('amazing',), ('wonderful',), ('classic',), ('songs',), ('ever',), ('better',), ('like',), ('new',), ('stars',), ('time',), ('another',), ('movie',), ('voice',), ('favorite',), ('still',), ('sound',), ('fantastic',), ('collection',), ('score',), ('must',), ('nice',), ('back',), ('song',), ('rock',), ('buy',), ('fun',), ('get',), ('review',), ('greatest',), ('wow',), ('perfect',), ('band',), ('really',)]
Number of unique words 27690
Number words that appear more than 400 times 97
Getting unique id for each word...
Creating traning matrix
Annotating training data with positive column!
Building word frequency dictionary...




Common words:  [('ca', "n't"), ('one', 'best'), ('cd', "'s"), ('wo', "n't"), ('pink', 'floyd'), ('sounds', 'like'), ('every', 'song'), ("'s", 'voice'), ("'s", 'music'), ('album', "'s"), ("n't", 'know'), ('...', '...'), ("n't", 'get'), ('highly', 'recommend'), ("'s", 'best'), ('first', 'time'), ('great', 'album'), ('years', 'ago'), ('love', 'cd'), ("'s", 'great'), ('songs', 'like'), ('one', 'favorite'), ('abbey', 'road'), ('songs', 'album'), ('could', "n't"), ("n't", 'like'), ('sound', 'like'), ('great', 'cd'), ('ever', 'heard'), ('even', 'though'), ('title', 'track'), ('much', 'better'), ("'ve", 'heard'), ('long', 'time'), ('great', 'music'), ('song', 'album'), ('bought', 'cd'), ('sound', 'quality'), ('great', 'songs'), ("'s", 'good'), ('would', "n't"), ("n't", 'think'), ('white', 'album'), ('never', 'heard'), ('really', 'good'), ('buy', 'cd'), ('would', 'recommend'), ('--', '--'), ('best', 'album'), ("'s", 'one')]
Number of unique words 3113742
Number words that appear more than 400 t



Building word frequency dictionary...
Common words:  [('great',), ('...',), ('cd',), ('album',), ('music',), ('best',), ("'s",), ('good',), ('love',), ('one',), ('awesome',), ('excellent',), ('christmas',), ('beautiful',), ('soundtrack',), ("n't",), ('beatles',), ('amazing',), ('wonderful',), ('classic',), ('songs',), ('ever',), ('better',), ('like',), ('new',), ('stars',), ('time',), ('another',), ('movie',), ('voice',), ('favorite',), ('still',), ('sound',), ('fantastic',), ('collection',), ('score',), ('must',), ('nice',), ('back',), ('song',), ('rock',), ('buy',), ('fun',), ('get',), ('review',), ('greatest',), ('wow',), ('perfect',), ('band',), ('really',)]
Number of unique words 27690
Number words that appear more than 400 times 97
Getting unique id for each word...
Creating traning matrix
Evaluating positive sentiment classifier on full review text
F1 0.8335672127356533, Accuracy 0.7149633656771499
performance on positive summary
F1 0.8347126158254886, Accuracy 0.717573673693495



Common words:  [('ca', "n't"), ('one', 'best'), ('cd', "'s"), ('wo', "n't"), ('pink', 'floyd'), ('sounds', 'like'), ('every', 'song'), ("'s", 'voice'), ("'s", 'music'), ('album', "'s"), ("n't", 'know'), ('...', '...'), ("n't", 'get'), ('highly', 'recommend'), ("'s", 'best'), ('first', 'time'), ('great', 'album'), ('years', 'ago'), ('love', 'cd'), ("'s", 'great'), ('songs', 'like'), ('one', 'favorite'), ('abbey', 'road'), ('songs', 'album'), ('could', "n't"), ("n't", 'like'), ('sound', 'like'), ('great', 'cd'), ('ever', 'heard'), ('even', 'though'), ('title', 'track'), ('much', 'better'), ("'ve", 'heard'), ('long', 'time'), ('great', 'music'), ('song', 'album'), ('bought', 'cd'), ('sound', 'quality'), ('great', 'songs'), ("'s", 'good'), ('would', "n't"), ("n't", 'think'), ('white', 'album'), ('never', 'heard'), ('really', 'good'), ('buy', 'cd'), ('would', 'recommend'), ('--', '--'), ('best', 'album'), ("'s", 'one')]
Number of unique words 3113742
Number words that appear more than 400 t



Training sentiment classifier on column summary with common word threshold 400
Building word frequency dictionary...
Common words:  [('great',), ('...',), ('cd',), ('album',), ('music',), ('best',), ("'s",), ('good',), ('love',), ('one',), ('awesome',), ('excellent',), ('christmas',), ('beautiful',), ('soundtrack',), ("n't",), ('beatles',), ('amazing',), ('wonderful',), ('classic',), ('songs',), ('ever',), ('better',), ('like',), ('new',), ('stars',), ('time',), ('another',), ('movie',), ('voice',), ('favorite',), ('still',), ('sound',), ('fantastic',), ('collection',), ('score',), ('must',), ('nice',), ('back',), ('song',), ('rock',), ('buy',), ('fun',), ('get',), ('review',), ('greatest',), ('wow',), ('perfect',), ('band',), ('really',)]
Number of unique words 27690
Number words that appear more than 400 times 97
Getting unique id for each word...
Creating traning matrix
Annotating training data with positive column!
Building word frequency dictionary...




Common words:  [('ca', "n't"), ('one', 'best'), ('cd', "'s"), ('wo', "n't"), ('pink', 'floyd'), ('sounds', 'like'), ('every', 'song'), ("'s", 'voice'), ("'s", 'music'), ('album', "'s"), ("n't", 'know'), ('...', '...'), ("n't", 'get'), ('highly', 'recommend'), ("'s", 'best'), ('first', 'time'), ('great', 'album'), ('years', 'ago'), ('love', 'cd'), ("'s", 'great'), ('songs', 'like'), ('one', 'favorite'), ('abbey', 'road'), ('songs', 'album'), ('could', "n't"), ("n't", 'like'), ('sound', 'like'), ('great', 'cd'), ('ever', 'heard'), ('even', 'though'), ('title', 'track'), ('much', 'better'), ("'ve", 'heard'), ('long', 'time'), ('great', 'music'), ('song', 'album'), ('bought', 'cd'), ('sound', 'quality'), ('great', 'songs'), ("'s", 'good'), ('would', "n't"), ("n't", 'think'), ('white', 'album'), ('never', 'heard'), ('really', 'good'), ('buy', 'cd'), ('would', 'recommend'), ('--', '--'), ('best', 'album'), ("'s", 'one')]
Number of unique words 3113742
Number words that appear more than 400 t



Building word frequency dictionary...
Common words:  [('great',), ('...',), ('cd',), ('album',), ('music',), ('best',), ("'s",), ('good',), ('love',), ('one',), ('awesome',), ('excellent',), ('christmas',), ('beautiful',), ('soundtrack',), ("n't",), ('beatles',), ('amazing',), ('wonderful',), ('classic',), ('songs',), ('ever',), ('better',), ('like',), ('new',), ('stars',), ('time',), ('another',), ('movie',), ('voice',), ('favorite',), ('still',), ('sound',), ('fantastic',), ('collection',), ('score',), ('must',), ('nice',), ('back',), ('song',), ('rock',), ('buy',), ('fun',), ('get',), ('review',), ('greatest',), ('wow',), ('perfect',), ('band',), ('really',)]
Number of unique words 27690
Number words that appear more than 400 times 97
Getting unique id for each word...
Creating traning matrix
Evaluating negative sentiment classifier on full review text
F1 0.00017632241813602016, Accuracy 0.28544168211848997
performance on negative summary
F1 5.037402715160063e-05, Accuracy 0.2852976

In [None]:
print("~~~~~~~~~~~~~~~~~~~~~~ NEGATIVE SENTIMENT ANALYSIS~~~~~~~~~~~~~~~")
print("Training sentiment classifier on full text!")
# Bigram features (ngram_size=2) for the full review text.
trained_negative_review_classifier = train_review_sentiment_classifier(
    training_data,
    COMMON_WORD_THRESHOLD,
    'reviewText',
    2,
    'negative'
)
# Unigram features (ngram_size=1) for the short summary.
trained_negative_review_summary_classifier = train_review_sentiment_classifier(
    training_data,
    COMMON_WORD_THRESHOLD,
    'summary',
    1,
    'negative'
)
print("Annotating training data with negative column!")
training_data = add_review_sentiment_column_into_data(training_data, trained_negative_review_classifier, 'reviewText', 'negative_review_text', 2)
training_data = add_review_sentiment_column_into_data(training_data, trained_negative_review_summary_classifier, 'summary', 'negative_review_summary', 1)

# Both the text and the summary predictions are scored against the labels
# computed here, not against a variable left over from another cell.
# The metrics are computed on the training rows themselves, so they are optimistic.
negative_ground_truth = get_ground_truth_for_sentiment_classification_training(training_data, 'negative')
negative_predictions_from_text = training_data['negative_review_text'].to_numpy(dtype=np.int8)

negative_predictions_from_summary = training_data['negative_review_summary'].to_numpy(dtype=np.int8)

f1_text = sklearn.metrics.f1_score(negative_ground_truth, negative_predictions_from_text)
accuracy_text = sklearn.metrics.accuracy_score(negative_ground_truth, negative_predictions_from_text)

f1_summary = sklearn.metrics.f1_score(negative_ground_truth, negative_predictions_from_summary)
accuracy_summary = sklearn.metrics.accuracy_score(negative_ground_truth, negative_predictions_from_summary)

print("Evaluating negative sentiment classifier on full review text")
print("F1 {}, Accuracy {}".format(f1_text, accuracy_text))

print("performance on negative summary")
print("F1 {}, Accuracy {}".format(f1_summary, accuracy_summary))

~~~~~~~~~~~~~~~~~~~~~~ NEGATIVE SENTIMENT ANALYSIS~~~~~~~~~~~~~~~
Training sentiment classifier on full text!
Training sentiment classifier on column reviewText with common word threshold 400
Building word frequency dictionary...




Common words:  [('ca', "n't"), ('one', 'best'), ('cd', "'s"), ('wo', "n't"), ('pink', 'floyd'), ('sounds', 'like'), ('every', 'song'), ("'s", 'voice'), ("'s", 'music'), ('album', "'s"), ("n't", 'know'), ('...', '...'), ("n't", 'get'), ('highly', 'recommend'), ("'s", 'best'), ('first', 'time'), ('great', 'album'), ('years', 'ago'), ('love', 'cd'), ("'s", 'great'), ('songs', 'like'), ('one', 'favorite'), ('abbey', 'road'), ('songs', 'album'), ('could', "n't"), ("n't", 'like'), ('sound', 'like'), ('great', 'cd'), ('ever', 'heard'), ('even', 'though'), ('title', 'track'), ('much', 'better'), ("'ve", 'heard'), ('long', 'time'), ('great', 'music'), ('song', 'album'), ('bought', 'cd'), ('sound', 'quality'), ('great', 'songs'), ("'s", 'good'), ('would', "n't"), ("n't", 'think'), ('white', 'album'), ('never', 'heard'), ('really', 'good'), ('buy', 'cd'), ('would', 'recommend'), ('--', '--'), ('best', 'album'), ("'s", 'one')]
Number of unique words 3113742
Number words that appear more than 400 t



Training sentiment classifier on column summary with common word threshold 400
Building word frequency dictionary...
Common words:  [('great',), ('...',), ('cd',), ('album',), ('music',), ('best',), ("'s",), ('good',), ('love',), ('one',), ('awesome',), ('excellent',), ('christmas',), ('beautiful',), ('soundtrack',), ("n't",), ('beatles',), ('amazing',), ('wonderful',), ('classic',), ('songs',), ('ever',), ('better',), ('like',), ('new',), ('stars',), ('time',), ('another',), ('movie',), ('voice',), ('favorite',), ('still',), ('sound',), ('fantastic',), ('collection',), ('score',), ('must',), ('nice',), ('back',), ('song',), ('rock',), ('buy',), ('fun',), ('get',), ('review',), ('greatest',), ('wow',), ('perfect',), ('band',), ('really',)]
Number of unique words 27690
Number words that appear more than 400 times 97
Getting unique id for each word...
Creating traning matrix
Annotating training data with positive column!
Building word frequency dictionary...




Common words:  [('ca', "n't"), ('one', 'best'), ('cd', "'s"), ('wo', "n't"), ('pink', 'floyd'), ('sounds', 'like'), ('every', 'song'), ("'s", 'voice'), ("'s", 'music'), ('album', "'s"), ("n't", 'know'), ('...', '...'), ("n't", 'get'), ('highly', 'recommend'), ("'s", 'best'), ('first', 'time'), ('great', 'album'), ('years', 'ago'), ('love', 'cd'), ("'s", 'great'), ('songs', 'like'), ('one', 'favorite'), ('abbey', 'road'), ('songs', 'album'), ('could', "n't"), ("n't", 'like'), ('sound', 'like'), ('great', 'cd'), ('ever', 'heard'), ('even', 'though'), ('title', 'track'), ('much', 'better'), ("'ve", 'heard'), ('long', 'time'), ('great', 'music'), ('song', 'album'), ('bought', 'cd'), ('sound', 'quality'), ('great', 'songs'), ("'s", 'good'), ('would', "n't"), ("n't", 'think'), ('white', 'album'), ('never', 'heard'), ('really', 'good'), ('buy', 'cd'), ('would', 'recommend'), ('--', '--'), ('best', 'album'), ("'s", 'one')]
Number of unique words 3113742
Number words that appear more than 400 t



Building word frequency dictionary...
Common words:  [('great',), ('...',), ('cd',), ('album',), ('music',), ('best',), ("'s",), ('good',), ('love',), ('one',), ('awesome',), ('excellent',), ('christmas',), ('beautiful',), ('soundtrack',), ("n't",), ('beatles',), ('amazing',), ('wonderful',), ('classic',), ('songs',), ('ever',), ('better',), ('like',), ('new',), ('stars',), ('time',), ('another',), ('movie',), ('voice',), ('favorite',), ('still',), ('sound',), ('fantastic',), ('collection',), ('score',), ('must',), ('nice',), ('back',), ('song',), ('rock',), ('buy',), ('fun',), ('get',), ('review',), ('greatest',), ('wow',), ('perfect',), ('band',), ('really',)]
Number of unique words 27690
Number words that appear more than 400 times 97
Getting unique id for each word...
Creating traning matrix
Evaluating negative sentiment classifier on full review text
F1 0.014109347442680775, Accuracy 0.9295576878071612
performance on negative summary
F1 5.037402715160063e-05, Accuracy 0.2852976651

In [None]:
# Load the google.colab.data_table IPython extension (presumably Colab's
# interactive table renderer, so this cell likely only works on Colab — TODO
# confirm), then display the annotated frame.
%load_ext google.colab.data_table
training_data

In [None]:
import json
def clean_helpful_column(data):
  """Convert one raw ``helpful`` cell into a ratio.

  Args:
    data: Either a JSON string encoding a two-element list (numerator,
      denominator), or a float. Presumably the pair is (helpful votes,
      total votes) — TODO confirm against the dataset description.

  Returns:
    ``numerator / denominator`` as a float, ``0.0`` when the denominator is 0
    or the cell is NaN, and the value itself when it is already a non-NaN float.
  """
  if isinstance(data, float):
    # NaN marks a missing cell. Any other float is treated as an already
    # computed ratio, so re-running the cleaning step on a cleaned column is a
    # no-op instead of resetting every value to 0.0.
    return 0.0 if math.isnan(data) else float(data)

  parsed = json.loads(data)
  if parsed[1] == 0:
    # Avoid dividing by zero.
    return 0.0

  return float(parsed[0]) / float(parsed[1])

In [None]:
# Replace the raw 'helpful' cells with their cleaned ratio.
training_data['helpful'] = training_data['helpful'].apply(clean_helpful_column)
# Flag reviews whose ratio is above 0.1 and that were predicted positive from the text.
training_data['positive_helpful'] = (
    training_data['helpful'].gt(0.1) & training_data['positive_review_text'].eq(1)
).to_numpy()
training_data

Unnamed: 0,reviewerID,amazon-id,helpful,unixReviewTime,reviewText,overall,reviewTime,summary,price,categories,root-genre,title,artist,label,first-release-year,songs,salesRank,related,positive_review_text,positive_review_summary,positive_helpful
0,-4984057859803657856,1877521326299865484,0.0,1302739200,very nice music for practicing my tai chi. i d...,4,"04 14, 2011",beautiful,16.47,"['CDs & Vinyl', 'New Age']",New Age,-3267874170410107454,-7180760356347753735,Cdbaby/Cdbaby,,"[7058439142327364074, 6037075874942075284, 852...",27222,"{'also_bought': [-404470919165672227, 11968160...",1,1,False
1,9136764282801708742,1877521326299865484,0.0,1180396800,i recently starting doing tai chi which i love...,5,"05 29, 2007",tranquillity in motion !!!,16.47,"['CDs & Vinyl', 'New Age']",New Age,-3267874170410107454,-7180760356347753735,Cdbaby/Cdbaby,,"[7058439142327364074, 6037075874942075284, 852...",27222,"{'also_bought': [-404470919165672227, 11968160...",1,1,False
2,2164551966908582519,1877521326299865484,0.0,1361404800,my wife uses it for her class room the kids lo...,5,"02 21, 2013",great stuff,16.47,"['CDs & Vinyl', 'New Age']",New Age,-3267874170410107454,-7180760356347753735,Cdbaby/Cdbaby,,"[7058439142327364074, 6037075874942075284, 852...",27222,"{'also_bought': [-404470919165672227, 11968160...",1,1,False
3,-7309200698931694843,1877521326299865484,0.0,1338163200,we bought this music to go dr lam dvd. the mus...,5,"05 28, 2012",beautiful,16.47,"['CDs & Vinyl', 'New Age']",New Age,-3267874170410107454,-7180760356347753735,Cdbaby/Cdbaby,,"[7058439142327364074, 6037075874942075284, 852...",27222,"{'also_bought': [-404470919165672227, 11968160...",1,1,False
4,-4461682407031037732,1877521326299865484,0.0,1396310400,it helps me do my exercise because it sets the...,5,"04 1, 2014",tai chi music,16.47,"['CDs & Vinyl', 'New Age']",New Age,-3267874170410107454,-7180760356347753735,Cdbaby/Cdbaby,,"[7058439142327364074, 6037075874942075284, 852...",27222,"{'also_bought': [-404470919165672227, 11968160...",1,1,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
111093,-508419005999372045,-272019625357917459,0.0,1405900800,nice soundtrack and i was pleasantly surprised...,4,"07 21, 2014",four stars,33.76,"['CDs & Vinyl', 'Pop']",Pop,-4806325396575401230,-3758738156872779256,222 Records/Interscope,2014.0,"[-1473205402332670702, 4962016810493850163, -1...",6,"{'also_bought': [1108107143497525752, -5941116...",1,1,False
111094,4690686471314282919,-272019625357917459,0.0,1405209600,i'd you are looking for the music that they sa...,5,"07 13, 2014",i'd you are looking for the music that they sa...,33.76,"['CDs & Vinyl', 'Pop']",Pop,-4806325396575401230,-3758738156872779256,222 Records/Interscope,2014.0,"[-1473205402332670702, 4962016810493850163, -1...",6,"{'also_bought': [1108107143497525752, -5941116...",1,1,False
111095,-6735807132142826990,-272019625357917459,0.0,1404259200,fantastic film! loved this movie and the musi...,5,"07 2, 2014",fantastic film! loved this movie and the music,33.76,"['CDs & Vinyl', 'Pop']",Pop,-4806325396575401230,-3758738156872779256,222 Records/Interscope,2014.0,"[-1473205402332670702, 4962016810493850163, -1...",6,"{'also_bought': [1108107143497525752, -5941116...",1,1,False
111096,6536263939078780437,2197509461459270640,0.0,1404518400,"a great new cd with uptempo, funky guitar. thi...",5,"07 5, 2014",a great new cd with uptempo,32.98,"['CDs & Vinyl', 'Jazz', 'Smooth Jazz']",Jazz,5145278291721917176,2800811401610696293,Nuance Music Group,2014.0,"[4589355438812792687, -640221404018307482, 897...",24972,"{'also_bought': [314388363399769352, 111945196...",1,1,False


In [None]:
# Collapse the per-review frame to one row per product ('amazon-id').
# After aggregation 'reviewText' holds the number of non-null reviews, and the
# sentiment / helpful columns hold per-product sums of the per-review flags.
product_data = (
    training_data
    .groupby('amazon-id')
    .agg({
        'positive_review_text': 'sum',
        'overall': 'mean',
        'reviewText': 'count',
        'salesRank': 'mean',
        'positive_helpful': 'sum',
        'positive_review_summary': 'sum',
        'negative_review_summary': 'sum',
        'negative_review_text': 'sum',
    })
)
# Target label: a product counts as "awesome" when its mean rating is at least 4.5.
product_data['awesome'] = product_data['overall'].ge(4.5).to_numpy()
product_data

Unnamed: 0_level_0,positive_review_text,overall,reviewText,salesRank,positive_helpful,positive_review_summary,awesome
amazon-id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
-9217723718720870868,9.0,4.333333,9,1310516,0,9.0,False
-9215746463819797371,4.0,5.000000,4,309139,0,4.0,True
-9213978596308513604,1.0,3.000000,1,280309,0,0.0,False
-9211290576571923870,2.0,4.500000,2,321654,0,2.0,True
-9208769561690910545,16.0,4.500000,16,17515,0,16.0,True
...,...,...,...,...,...,...,...
9218870320655141661,1.0,3.500000,2,1760050,0,2.0,False
9221578337502519209,8.0,4.875000,8,149220,0,8.0,True
9221615570697142155,2.0,5.000000,2,600235,0,2.0,True
9221801008952598876,18.0,4.388889,18,271266,0,13.0,False


In [None]:
# Inspect the column names currently present on the per-review frame.
training_data.columns

In [None]:
# Feature matrix: per-product aggregates ('reviewText' is the review count here).
independent_variables = product_data.loc[
    :,
    [
        'positive_review_text',
        'positive_review_summary',
        'reviewText',
        'positive_helpful',
        'negative_review_text',
        'negative_review_summary',
    ],
].to_numpy()
# Target vector: the boolean 'awesome' flag encoded as 0/1.
dependent_variables = product_data['awesome'].to_numpy(dtype=np.int8)
independent_variables

array([[ 9.,  9.,  9.,  0.],
       [ 4.,  4.,  4.,  0.],
       [ 1.,  0.,  1.,  0.],
       ...,
       [ 2.,  2.,  2.,  0.],
       [18., 13., 18.,  0.],
       [12., 12., 12.,  0.]])

In [None]:
# Hold out 40% of the products for evaluation; the fixed seed makes the split
# (and therefore the reported metrics) reproducible across runs.
X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(independent_variables, dependent_variables, test_size=0.4, random_state=0)
clf = sklearn.naive_bayes.GaussianNB()
trained = clf.fit(X_train, y_train)
# Predict once and reuse the result for both metrics.
test_predictions = trained.predict(X_test)

f1_score = sklearn.metrics.f1_score(y_test, test_predictions)
accuracy = sklearn.metrics.accuracy_score(y_test, test_predictions)

# Argument order matches the labels in the format string.
print("Accuracy {} F1 {}".format(accuracy, f1_score))

Accuracy 0.7859741015568167 F1 0.6512565196775723
