In [13]:
import re
import os
import numpy as np
import pandas as pd

from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import f1_score, precision_score, recall_score, roc_auc_score, accuracy_score

from NB_Template import *
from LR_Template import *

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [9]:
# Location of the CSV splits, resolved relative to the notebook's working directory.
DATA_DIR = os.path.realpath('../../Data')

# Read the four splits in a fixed order so the unpacking below lines up.
X_train, X_val, Y_train, Y_val = (
    pd.read_csv(os.path.join(DATA_DIR, fn))
    for fn in ('X_train.csv', 'X_val.csv', 'Y_train.csv', 'Y_val.csv')
)

# Strip the 'ID' column wherever it is present.
for df in (X_train, Y_train, X_val, Y_val):
    if 'ID' not in df:
        continue
    df.drop(columns='ID', inplace=True)

# Keep only the label column; from here on Y_train / Y_val are Series.
Y_train = Y_train['Sentiment']
Y_val = Y_val['Sentiment']

In [8]:
# Preview the first rows of the training features.
# NOTE(review): the saved output below still lists an 'ID' column, but the
# loading cell drops 'ID' -- the output looks stale; re-run to refresh.
X_train.head()

Unnamed: 0,ID,Title,Review Text,Division Name,Department Name,Class Name,Age
0,2220,Comfort,Extremely comfortable t. fits tts. bought in b...,General,Tops,Knits,59
1,14590,Country classic,Perfect fall accessory and transitional piece ...,General,Jackets,Jackets,64
2,11990,Gorgeous but challenging for buxom beauties,"Love, love, love but alas had to return. gorge...",General,Dresses,Dresses,52
3,10036,"Again, my bad for not looking at description...",I did not like this top at all-but had i looke...,General,Tops,Blouses,53
4,8964,Coat,Omg. i also bought this years ago. i have worn...,General,Tops,Sweaters,63


In [9]:
# Preview the first rows of the validation features.
# NOTE(review): the saved output below still lists an 'ID' column, but the
# loading cell drops 'ID' -- the output looks stale; re-run to refresh.
X_val.head()

Unnamed: 0,ID,Title,Review Text,Division Name,Department Name,Class Name,Age
0,3069,,I purchased this sweater in the grey and loved...,General,Tops,Fine gauge,83
1,16840,Way too short,"I'm 5'7"", 130 lbs. i ordered a medium-regular....",General,Dresses,Dresses,57
2,1667,Fabulous top,"Fabric is soft, body runs a bit big but falls ...",General,Tops,Knits,70
3,10085,,Beautiful spring dress. perfect for luncheons ...,General,Dresses,Dresses,63
4,14360,Versatile & fun dress,Not sure why this tunic dress is rated only 3 ...,General,Dresses,Dresses,65


In [10]:
# Preview the first training labels.
# NOTE(review): the loading cell reduces Y_train to its 'Sentiment' column (a
# Series), yet the saved output below shows a frame with 'ID' and 'Sentiment'
# -- the output looks stale; re-run to refresh.
Y_train.head()

Unnamed: 0,ID,Sentiment
0,2220,Positive
1,14590,Positive
2,11990,Positive
3,10036,Negative
4,8964,Positive


In [11]:
# Preview the first validation labels.
# NOTE(review): the loading cell reduces Y_val to its 'Sentiment' column (a
# Series), yet the saved output below shows a frame with 'ID' and 'Sentiment'
# -- the output looks stale; re-run to refresh.
Y_val.head()

Unnamed: 0,ID,Sentiment
0,3069,Positive
1,16840,Negative
2,1667,Positive
3,10085,Positive
4,14360,Positive


In [14]:
# Make sure the IDs line up / aren't duplicated.
#
# The loading cell drops 'ID' from the in-memory frames and reduces Y_train /
# Y_val to the 'Sentiment' column, so on a top-to-bottom run the IDs are no
# longer reachable through X_train / Y_train here (indexing them with 'ID'
# raises KeyError). Re-read just the ID column of each split from disk so this
# check works regardless of what happened to the in-memory frames.
split_ids = {
    name: pd.read_csv(os.path.join(DATA_DIR, name + '.csv'), usecols=['ID'])['ID']
    for name in ('X_train', 'X_val', 'Y_train', 'Y_val')
}

# Number of rows whose feature ID differs from the label ID (expected: 0).
print((split_ids['X_train'] != split_ids['Y_train']).sum())
print((split_ids['X_val'] != split_ids['Y_val']).sum())

# True when no ID occurs more than once within a feature split.
print(split_ids['X_train'].is_unique)
print(split_ids['X_val'].is_unique)

0
0
True
True


In [18]:
# Prototype on a small sample: the first 100 training rows, review text only.
X1 = X_train.iloc[:100]
X1_text = X1['Review Text']
X1_text.head()

0    Extremely comfortable t. fits tts. bought in b...
1    Perfect fall accessory and transitional piece ...
2    Love, love, love but alas had to return. gorge...
3    I did not like this top at all-but had i looke...
4    Omg. i also bought this years ago. i have worn...
Name: Review Text, dtype: object

In [25]:
# Inventory of characters in the sample reviews.
# First collect every distinct character, then keep those that are not an
# ASCII letter or digit, joined into one string for easy inspection.
unique_chars = {ch for text in X1_text for ch in text}
punct = ''.join(
    ch for ch in unique_chars
    if re.search(r'[A-Za-z0-9]', ch) is None
)
punct

'.$-\n\r& ",!/?_(#;\':)*'

In [29]:
def preprocess(text):
    """Tokenize a review into a lowercase bag of words.

    The text is lower-cased and split on every run of non-word characters
    (punctuation, whitespace, symbols) and underscores. Splitting on the
    general class rather than a hand-picked list of punctuation means that
    characters which did not appear in the inspected sample (e.g. ``%``,
    ``+``, tabs, brackets) are also treated as delimiters instead of staying
    glued to a token.

    Parameters
    ----------
    text : str
        Raw review text. Any non-string value (``None``, or the float NaN
        pandas uses for a missing cell) is treated as an empty review.

    Returns
    -------
    list of str
        Tokens in order of appearance, duplicates kept. Empty strings and
        1-character tokens are dropped.
    """
    if not isinstance(text, str):
        # Missing reviews have no words; return an empty bag instead of
        # failing on `.lower()`.
        return []
    tokens = re.split(r'[\W_]+', text.lower())
    # remove empty strings and 1-letter words
    return [word for word in tokens if len(word) > 1]

In [37]:
# Build a toy vocabulary from the most frequent tokens in the sample reviews.
TOP_K_WORDS = 10  # how many of the most frequent tokens to keep

# Token frequencies over all sample reviews (tokens come from preprocess()).
all_word_counts = Counter(
    word for text in X1_text for word in preprocess(text)
)
# (word, count) pairs, most frequent first.
word_counts = all_word_counts.most_common(TOP_K_WORDS)
# Pull the word out of each pair directly. Unlike transposing with
# zip(*pairs)[0], this yields an empty list instead of raising IndexError
# when no tokens were found.
vocabulary = sorted(word for word, _ in word_counts)
vocabulary

['and', 'but', 'in', 'is', 'it', 'of', 'the', 'this', 'to', 'with']

In [41]:
# Let CountVectorizer build the vocabulary and the (sparse) document-term
# count matrix for the sample reviews.
vec = CountVectorizer()
doc_term_matrix = vec.fit_transform(X1_text)
# Show the first five documents; only those rows are converted to dense form.
pd.DataFrame(doc_term_matrix[:5].toarray())

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1137,1138,1139,1140,1141,1142,1143,1144,1145,1146
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,1,1,0,0,0,0,0,0,0,0


In [43]:
# List the tokens CountVectorizer learned from the sample reviews.
# NOTE(review): this prints every token (the matrix above has 1147 columns),
# which floods the output; consider showing only a slice.
vec.vocabulary_.keys()

dict_keys(['extremely', 'comfortable', 'fits', 'tts', 'bought', 'in', 'black', 'teal', 'can', 'be', 'dressed', 'up', 'with', 'scarf', 'or', 'worn', 'just', 'jewelry', 'fairly', 'lightweight', 'so', 'probably', 'not', 'good', 'choice', 'for', 'cold', 'winter', 'months', 'great', 'price', 'too', 'perfect', 'fall', 'accessory', 'and', 'transitional', 'piece', 'cooler', 'days', 'classic', 'red', 'check', 'fashion', 'flair', 'jeans', 'dress', 'it', 'find', 'love', 'but', 'alas', 'had', 'to', 'return', 'gorgeous', 'fabric', 'elegant', 'design', 'purchased', 'an', 'evening', 'summer', 'wedding', 'however', 'the', 'cut', 'of', 'this', 'is', 'more', 'suited', 'small', 'bust', 'line', 'short', 'waist', 'felt', 'like', 'hit', 'at', 'my', 'rib', 'cage', 'natural', 'top', 'half', 'was', 'spilling', 'out', 'neckline', 'slightly', 'loose', 'don', 'think', 'sizing', 'would', 'have', 'helped', 'maybe', 'needed', 'size', 'down', '145', 'lbs', '10', 'loved', 'didn', 'fit', 'me', 'properly', 'also', 'noti

In [45]:
# Display the training labels.
# NOTE(review): the saved output below shows 'ID' and 'Sentiment' columns, but
# the loading cell leaves Y_train as the 'Sentiment' Series only -- the output
# looks stale; re-run to refresh.
Y_train

Unnamed: 0,ID,Sentiment
0,2220,Positive
1,14590,Positive
2,11990,Positive
3,10036,Negative
4,8964,Positive
...,...,...
13188,16470,Positive
13189,20154,Negative
13190,14649,Positive
13191,19938,Positive


In [10]:
# Fit the Naive Bayes classifier on the training split.
# NaiveBayes is not defined in this notebook; presumably it comes from the
# star import of NB_Template -- confirm.
# NOTE(review): X_train is passed as the whole feature frame; which columns the
# model actually uses is decided inside NaiveBayes -- verify there.
nb = NaiveBayes()
nb.fit(X_train, Y_train)

In [19]:
# Evaluate the fitted Naive Bayes model on the validation split.

# Encode the string labels as integers with 'Positive' as the positive class.
# A single mapping plus an explicit astype(int) gives a true integer dtype
# without relying on pandas silently downcasting the object result of
# replace() (deprecated since pandas 2.2). astype(int) also fails loudly if
# an unexpected label is left over. Re-running the cell is safe: labels that
# are already 0/1 pass through unchanged.
LABEL_TO_INT = {'Positive': 1, 'Negative': 0}
Y_pred = nb.predict(X_val).replace(LABEL_TO_INT).astype(int)
Y_val = Y_val.replace(LABEL_TO_INT).astype(int)

print('Accuracy:', accuracy_score(Y_val, Y_pred))
print('Precision:', precision_score(Y_val, Y_pred))
print('Recall:', recall_score(Y_val, Y_pred))
# NOTE(review): Y_pred holds hard 0/1 predictions, not scores/probabilities,
# so this value reflects a single operating point rather than a full ROC
# curve. If NaiveBayes exposes class probabilities, pass those instead.
print('ROC/AUC:', roc_auc_score(Y_val, Y_pred))
print('F1:', f1_score(Y_val, Y_pred))

Accuracy: 0.9348287359806002
Precision: 0.9543010752688172
Recall: 0.9729359369647139
ROC/AUC: 0.8075206000613042
F1: 0.9635284139100933
