# Sentiment Analysis of Amazon Book Reviews
- A short exercise of positive/negative sentiment prediction.
- comparison of approaches using:
  - CountVectorizer
  - Term Frequency-Inverse Document Frequency
  - n-grams of words
- text preprocessing:
  - since book reviews, we only take words, ignore symbols and digits, make lowercase
  - lemmatize using WordNet
  - drop stop words like: 'the', 'is', etc.

In [1]:
import pandas as pd
import numpy as np
import warnings
import nltk
import os
import re

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 50)
float_formatter = "{:.2f}".format
np.set_printoptions(formatter={'float_kind':float_formatter})

##? LOAD DATA, PREP RELEVANT COLS
reviews = pd.read_csv('./data/all_kindle_review.csv', index_col=[1])
reviews.drop(['Unnamed: 0.1','asin','unixReviewTime','reviewerName','reviewerID','summary','reviewTime','helpful'], axis=1, inplace=True)
reviews.index.name = 'id'

##? SET BOOK RATINGS 4 AND 5 TO BE POSITIVE SENTIMENT LABEL, 1 AND 2 NEGATIVE, DROP 3
reviews['label'] = reviews['rating'].apply(lambda x: 1 if x>3 else 0 if x<3 else -1)
reviews = reviews.drop(reviews[reviews['label']==-1].index, axis=0)
reviews.drop('rating', axis=1, inplace=True)
reviews.columns = ['text','label']
reviews.head()

Unnamed: 0_level_0,text,label
id,Unnamed: 1_level_1,Unnamed: 2_level_1
5957,Great short read. I didn't want to put it dow...,1
1776,I did not expect this type of book to be in li...,1
3744,Aislinn is a little girl with big dreams. Afte...,1
13641,This has the makings of a good story... unfort...,0
4448,I got this because I like collaborated short s...,1


In [2]:

##? LABEL CLASS PROPORTIONS
print(reviews['label'].value_counts())
# 1    6000
# 0    4000

##? MEAN LENGTH OF SPAM VS NON-SPAM TEXTS
positive = reviews[reviews['label']==1]
negative = reviews[reviews['label']==0]
positivemeanlen = positive.text.str.len().mean()
negativemeanlen = negative.text.str.len().mean()

print('\
Mean length of positive reviews : {:>8.3f}\n\
Mean length of negative reviews : {:>8.3f}\n\
'.format(positivemeanlen, negativemeanlen))
# Mean length of positive reviews :  598.007
# Mean length of negative reviews :  579.159

# No significant difference, so won't add as engineered feature.

1    6000
0    4000
Name: label, dtype: int64
Mean length of positive reviews :  598.007
Mean length of negative reviews :  579.159



# Baseline Models

### CountVectorizer bag-of-words
- ignore words that appear in less than 3 samples, ignore words that appear in more than half the samples.
- only accept alphabets, ignore digits, symbols.
- tokenize, lowercase, lemmatize, drop stop words.
- create a sparse vector array of words and their counts.
- run logistic regression and determine coefficients of features(i.e. words) that highly correlate with positive or negative labels.

In [3]:

##? DEFINE TEXT PREPROCESSING
from nltk.corpus import stopwords
sw_eng = set(stopwords.words('english'))

class LemmaTokenizer():
    def __init__(self):
        self.wnl = nltk.WordNetLemmatizer()
    def __call__(self, doc):
        # lowercase, tokenize
        words = nltk.word_tokenize(doc.lower())
        # drop stopwords english
        words = [ w for w in words if w not in sw_eng ]
        # drop symbols and numbers, only keep words
        words = [ self.wnl.lemmatize(t) for t in words if re.search(r'\b[A-Za-z\']+\b', t) ]
        words = [ w for w in words if w not in sw_eng ]
        return words

In [17]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(reviews['text'], 
                                                    reviews['label'], 
                                                    random_state=0)

##? Transform to Vect using CountVectorizer (simple bag-of-words)
# vect = TfidfVectorizer().fit(X_train)
# vect = CountVectorizer(min_df=3).fit(X_train)
vect_countvec = CountVectorizer(
    tokenizer=LemmaTokenizer(),
    strip_accents = 'unicode',
    lowercase = True,
    max_df = 0.5,
    min_df = 3
).fit(X_train)
X_train_vect = vect_countvec.transform(X_train)
X_test_vect  = vect_countvec.transform(X_test)

feature_names = np.array(vect_countvec.get_feature_names_out())
# [w for w in feature_names if re.search('[0-9]+', w)] # sanity check that CountVectorizer indeed ignored numbers etc
print('Feature Count: ', len(feature_names))

##? MultinomialNaiveBayes using CountVectorizer
# model_baseline_countvec = MultinomialNB(alpha=0.1).fit(X_train_vect, y_train)
model_baseline_countvec = LogisticRegression(max_iter=1000).fit(X_train_vect, y_train)
y_pred = model_baseline_countvec.predict(X_test_vect)
score = roc_auc_score(y_test, y_pred)

print('Model Test Score using CountVectorizer: {:.4f}\n'.format(score))

##? Smallest and Largest tfidfs (word importance)
# print as list
# feature_names = np.array(vect.get_feature_names_out())
# sorted_tfidf_index = X_train_vect.max(0).toarray()[0].argsort()
# sorted_tfidf_values = X_train_vect.max(0).toarray()[0].sort()
# print('Smallest tfidf:\n{}\n'.format(feature_names[sorted_tfidf_index[:10]]))
# print('Largest tfidf: \n{}'.format(feature_names[sorted_tfidf_index[:-11:-1]]))
# print()
#
# display as dataframe
train_vect_count = pd.DataFrame({'feature_name':feature_names, 'count':X_train_vect.max(0).toarray()[0]}).sort_values('count', ascending=False)
print('HIGH OCCURRENCE WORDS')
display(train_vect_count.head(10))
print('LOW OCCURRENCE WORDS')
display(train_vect_count.tail(10))

##? Smallest and Largest Coefs
# print as list
# sorted_coef_index = model_baseline_countvec.coef_[0].argsort()
# print('Smallest Coefs:\n{}\n'.format(feature_names[sorted_coef_index[:10]]))
# print('Largest Coefs: \n{}'.format(feature_names[sorted_coef_index[:-11:-1]]))
# print()
#
# display as dataframe
train_coefs = pd.DataFrame({'feature_name':feature_names, 'coef':model_baseline_countvec.coef_[0]}).sort_values('coef', ascending=False)
print('WORDS ASSOCIATED WITH POSITIVE SENTIMENT')
display(train_coefs.head(10))
print('WORDS ASSOCIATED WITH NEGATIVE SENTIMENT')
display(train_coefs.tail(10))



Feature Count:  9205
Model Test Score using CountVectorizer: 0.8685

HIGH OCCURRENCE WORDS


Unnamed: 0,feature_name,count
11,'s,26
7806,story,23
2045,dean,22
5861,parker,22
3676,h,21
7065,saul,17
5403,n't,17
263,alex,17
7472,slave,17
4428,jane,16


LOW OCCURRENCE WORDS


Unnamed: 0,feature_name,count
5609,objection,1
5599,nutshell,1
2156,denial,1
5607,obey,1
5605,o.,1
5604,o'kane,1
2158,denies,1
5602,nyc,1
5600,nutty,1
9204,~reviewed,1


WORDS ASSOCIATED WITH POSITIVE SENTIMENT


Unnamed: 0,feature_name,coef
2746,enjoyed,2.294729
4913,loved,2.06066
2744,enjoyable,1.72726
4794,liked,1.53797
3388,fun,1.336085
7575,solve,1.272378
3599,great,1.260932
8969,well-written,1.253244
6067,pleasantly,1.248235
8080,tame,1.207037


WORDS ASSOCIATED WITH NEGATIVE SENTIMENT


Unnamed: 0,feature_name,coef
6171,potential,-1.465237
8626,unfortunately,-1.478243
4040,idea,-1.502744
5663,okay,-1.547944
7800,stopped,-1.605004
3954,horrible,-1.611842
9116,worst,-1.661471
8914,waste,-1.745521
969,boring,-1.750775
7605,sorry,-1.771919


In [18]:
print('PREDICTIONS\n   -VE  +VE')
print(model_baseline_countvec.predict_proba(vect_countvec.transform([
    'the book is great',
    'it was really boring',
    'but',
    'don\'t normally enjoy',
    'don\'t enjoy',
])))

PREDICTIONS
   -VE  +VE
[[0.26 0.74]
 [0.91 0.09]
 [0.55 0.45]
 [0.55 0.45]
 [0.55 0.45]]


### tf-idf
- ignore words that appear in less than 3 samples, ignore words that appear in more than half the samples.
- only accept alphabets, ignore digits, symbols.
- tokenize, lowercase, lemmatize, drop stop words.
- create a sparse vector array of words and tf-idf values.
- (tf-idf is a numerical statistic that is intended to reflect how important a word is to a document in a collection or corpus.)(value will be larger for words that are deemed important, i.e they appear often enough, yet still have differentiating factor i.e not words like 'the' which will appear everywhere and have low informative value)
- run logistic regression and determine coefficients of features(i.e. words) that highly correlate with positive or negative labels.

In [19]:

##? Transform to Vect using tfidf

vect_tfidf = TfidfVectorizer(
    tokenizer=LemmaTokenizer(),
    strip_accents = 'unicode',
    lowercase = True,
    max_df = 0.5,
    min_df = 3
).fit(X_train)
X_train_vect = vect_tfidf.transform(X_train)
X_test_vect  = vect_tfidf.transform(X_test)

feature_names = np.array(vect_tfidf.get_feature_names_out())
print('Feature Count: ', len(feature_names))

##? MultinomialNaiveBayes using tf-idf Vectorizer
# model_baseline_tfidf = MultinomialNB(alpha=0.1).fit(X_train_vect, y_train)
model_baseline_tfidf = LogisticRegression(max_iter=1000).fit(X_train_vect, y_train)
y_pred = model_baseline_tfidf.predict(X_test_vect)
score = roc_auc_score(y_test, y_pred)
print('Model_baseline_tfidf Test Score using TfidfVectorizer: {:.4f}\n'.format(score))

##? Smallest and Largest tf-idfs
# display as dataframe
train_vect_tfidf = pd.DataFrame({'feature_name':feature_names, 'tfidf':X_train_vect.max(0).toarray()[0]}).sort_values('tfidf', ascending=False)
print('HIGH IMPORTANCE WORDS')
display(train_vect_tfidf.head(10))
print('LOW IMPORTANCE WORDS')
display(train_vect_tfidf.tail(10))


##? Smallest and Largest Coefs
# display as dataframe
train_coefs = pd.DataFrame({'feature_name':feature_names, 'coef':model_baseline_tfidf.coef_[0]}).sort_values('coef', ascending=False)
print('WORDS ASSOCIATED WITH POSITIVE SENTIMENT')
display(train_coefs.head(10))
print('WORDS ASSOCIATED WITH NEGATIVE SENTIMENT')
display(train_coefs.tail(10))




Feature Count:  9205
Model_baseline_tfidf Test Score using TfidfVectorizer: 0.8609

HIGH IMPORTANCE WORDS


Unnamed: 0,feature_name,tfidf
2746,enjoyed,1.0
2744,enjoyable,1.0
3599,great,1.0
4909,love,1.0
9095,word,0.988182
969,boring,0.921537
9187,yuck,0.885457
5384,must,0.879703
1146,cake,0.864934
2428,done,0.849839


LOW IMPORTANCE WORDS


Unnamed: 0,feature_name,tfidf
4918,lover-,0.075402
4560,kenyon'sfantasy,0.075402
3473,gennaro,0.074139
5770,overabundance,0.073362
2300,dionne,0.071214
7425,sister-in-law,0.070669
537,assign,0.070086
5725,organic,0.064474
589,attuned,0.061835
9204,~reviewed,0.060706


WORDS ASSOCIATED WITH POSITIVE SENTIMENT


Unnamed: 0,feature_name,coef
2746,enjoyed,6.203247
4913,loved,5.396721
3599,great,4.717249
4794,liked,3.531988
3388,fun,3.335159
4909,love,3.259024
8960,well,3.194049
3535,good,3.013144
2744,enjoyable,2.773525
3968,hot,2.736328


WORDS ASSOCIATED WITH NEGATIVE SENTIMENT


Unnamed: 0,feature_name,coef
7266,sex,-2.862084
657,bad,-2.88364
3155,finish,-2.907934
4040,idea,-2.992962
7605,sorry,-3.178622
3341,free,-3.224293
5571,nothing,-3.248386
969,boring,-3.501281
8914,waste,-3.655241
5403,n't,-4.363071


In [20]:
print('PREDICTIONS\n   -VE  +VE')
print(model_baseline_tfidf.predict_proba(vect_tfidf.transform([
    'the book is great',
    'it was really boring',
    'but',
    'don\'t normally enjoy',
    'don\'t enjoy',
])))

PREDICTIONS
   -VE  +VE
[[0.01 0.99]
 [0.97 0.03]
 [0.43 0.57]
 [0.49 0.51]
 [0.61 0.39]]


# Improved Model

Problem is, both of the above bag-of-word approaches do not take into account structure, like any word associations and orders. the last two reviews mean different things but have similar words, and don't have strong prediction probabilities.

We will use n-grams of words to improve the model.

### tf-idf & n-grams
- ignore words that appear in less than 5 samples, ignore words that appear in more than half the samples.
- only accept alphabets, ignore digits, symbols.
- tokenize, lowercase, lemmatize, drop stop words.
- create n-grams of words. e.g. in 'this is a sentence'
  - bigrams would be 'this is', 'is a', 'a sentence'
  - trigrams would be 'this is a', 'is a sentence'
  - and so on, n-grams being n order of word lengths.
  - by doing so we will capture a sense of word orders and distinguish between 'not bad, quite good' vs 'quite bad, not good'
  - of course, the amount of ngrams we use will make number of features explode significantly and increase computation. thus we increased min_df to 5 above.
- create a sparse vector array of words(and ngrams) and tf-idf values.
- run logistic regression and determine coefficients of features(i.e. words) that highly correlate with positive or negative labels.

In [21]:
##? Transform to Vect using tfidf, include up to 3-word phrases as features
vect_ngram_tfidf = TfidfVectorizer(
    tokenizer=LemmaTokenizer(),
    strip_accents = 'unicode',
    lowercase = True,
    max_df = 0.5,
    min_df = 5,
    ngram_range=(1,2),
).fit(X_train)
X_train_vect = vect_ngram_tfidf.transform(X_train)
X_test_vect  = vect_ngram_tfidf.transform(X_test)

feature_names = np.array(vect_ngram_tfidf.get_feature_names_out())
print('Feature Count: ', len(feature_names))

##? MultinomialNaiveBayes using tf-idf Vectorizer
# model_ngram_tfidf = MultinomialNB(alpha=0.1).fit(X_train_vect, y_train)
model_ngram_tfidf = LogisticRegression(max_iter=1000).fit(X_train_vect, y_train)
y_pred = model_ngram_tfidf.predict(X_test_vect)
score = roc_auc_score(y_test, y_pred)
print('Model_ngram_tfidf Test Score using TfidfVectorizer and trigrams of words: {:.4f}\n'.format(score))

##? Smallest and Largest tf-idfs
# display as dataframe
train_vect_tfidf = pd.DataFrame({'feature_name':feature_names, 'tfidf':X_train_vect.max(0).toarray()[0]}).sort_values('tfidf', ascending=False)
print('HIGH IMPORTANCE WORDS')
display(train_vect_tfidf.head(10))
print('LOW IMPORTANCE WORDS')
display(train_vect_tfidf.tail(10))

##? Smallest and Largest Coefs
# display as dataframe
train_coefs = pd.DataFrame({'feature_name':feature_names, 'coef':model_ngram_tfidf.coef_[0]}).sort_values('coef', ascending=False)
print('WORDS ASSOCIATED WITH POSITIVE SENTIMENT')
display(train_coefs.head(30))
print('WORDS ASSOCIATED WITH NEGATIVE SENTIMENT')
display(train_coefs.tail(30))




Feature Count:  13661
Model_ngram_tfidf Test Score using TfidfVectorizer and trigrams of words: 0.8699

HIGH IMPORTANCE WORDS


Unnamed: 0,feature_name,tfidf
6941,love,1.0
3672,enjoyable,1.0
13318,word word,0.884972
5157,great book,0.87601
1999,cake,0.847588
1825,boring,0.846285
6649,like like,0.838166
10861,sith,0.835648
3680,enjoyed book,0.834944
8666,passionate,0.828153


LOW IMPORTANCE WORDS


Unnamed: 0,feature_name,tfidf
13135,werewolf romanceher,0.063334
13259,wolf chronicle,0.063334
2813,courtesy romance,0.062973
3249,dionne courtesy,0.062973
3248,dionne,0.062973
903,arend'stidal wave,0.062133
12864,vivian arend'stidal,0.062133
7952,nature book,0.062133
902,arend'stidal,0.062133
9170,protective instinct,0.061302


WORDS ASSOCIATED WITH POSITIVE SENTIMENT


Unnamed: 0,feature_name,coef
3677,enjoyed,5.464269
7013,loved,5.281393
5154,great,4.613258
6941,love,3.566246
6701,liked,3.317127
4653,fun,3.291387
13073,well,2.981591
10493,series,2.909122
4986,good,2.776183
5608,hot,2.775221


WORDS ASSOCIATED WITH NEGATIVE SENTIMENT


Unnamed: 0,feature_name,coef
8917,point,-1.817347
12991,waste time,-1.879519
5596,horrible,-1.882501
8938,poor,-1.893254
7784,n't even,-1.916898
1335,better,-1.9303
8942,poorly,-1.93055
12688,unfortunately,-1.948736
3266,disappointed,-1.95705
5465,heroine,-2.026169


In [22]:
print('PREDICTIONS\n   -VE  +VE')
print(model_ngram_tfidf.predict_proba(vect_ngram_tfidf.transform([
    'the book is great',
    'it was really boring',
    'but',
    'don\'t normally enjoy',
    'don\'t enjoy',
])))

PREDICTIONS
   -VE  +VE
[[0.07 0.93]
 [0.87 0.13]
 [0.44 0.56]
 [0.44 0.56]
 [0.70 0.30]]


- with n-grams, the predictions probabilities are better for the last 2 test entries