The goal of this assignment is to understand precision-recall in the context of classifiers.

Use Amazon review data in its entirety.
Train a logistic regression model.
Explore various evaluation metrics: accuracy, confusion matrix, precision, recall.
Explore how various metrics can be combined to produce a cost of making an error.
Explore precision-recall curves.

In [99]:
import pandas as pd
import numpy as np
import json
from sklearn.feature_extraction.text import CountVectorizer
from tqdm import tqdm

In [100]:
# Load the raw review table; the relative path means the CSV must sit in the
# notebook's working directory.
products = pd.read_csv("amazon_baby.csv")

In [101]:
def remove_punctuation(text):
    """Return ``text`` with every ASCII punctuation character deleted."""
    from string import punctuation
    deletion_table = str.maketrans('', '', punctuation)
    return text.translate(deletion_table)

In [102]:
def remove_numbers(text):
    """Return ``text`` with the ASCII digits 0-9 stripped out."""
    # A translate() table that maps a code point to None deletes that character.
    digit_table = dict.fromkeys(map(ord, '0123456789'))
    return text.translate(digit_table)

In [103]:
def clean_str(string):
    """
    Tokenize and normalize a raw text string.

    Adapted from https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py

    Processing steps, in order:
      1. Every character other than ASCII letters, digits and ( ) , ! ? ' `
         is replaced by a space.
      2. The clitics 's, 've, n't, 're, 'd and 'll are split from the word
         they are attached to by inserting a space in front of them.
      3. The marks , ! ( ) ? are padded with spaces so that each becomes a
         separate token.
      4. Runs of whitespace collapse to a single space; the result is
         stripped and lower-cased.

    Parameters
    ----------
    string : str
        Raw text to clean.

    Returns
    -------
    str
        The cleaned, lower-cased, space-separated text.
    """
    import re

    string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)

    # Order matters only in that each clitic is handled in sequence; these are
    # plain literal substitutions, so str.replace is sufficient.
    for clitic in ("'s", "'ve", "n't", "'re", "'d", "'ll"):
        string = string.replace(clitic, " " + clitic)

    # Pad punctuation with spaces. The replacement uses a back-reference, so
    # the emitted token is the bare mark itself: the previous " \( "-style
    # replacement strings were invalid escape sequences and leaked a literal
    # backslash into the output ("\(", "\)", "\?").
    string = re.sub(r"([,!()?])", r" \1 ", string)

    string = re.sub(r"\s{2,}", " ", string)
    return string.strip().lower()

In [104]:
# Drop the 3-star reviews. The explicit .copy() makes the filtered result an
# independent frame, so the columns assigned to it in later cells do not
# trigger pandas' SettingWithCopyWarning.
products = products[products["rating"] != 3].copy()

In [105]:
# Binary label: +1 for a rating of 4 or more, -1 for everything else.
products["sentiment"] = (products["rating"] >= 4).map({True: 1, False: -1})

In [106]:
# Class balance of the labelled reviews.
pos = products["sentiment"].eq(1).sum()
neg = products["sentiment"].eq(-1).sum()
print("positive review count: ", pos)
print("negative review count: ", neg)

positive review count:  140259
negative review count:  26493


In [107]:
# Rough review length: the number of pieces obtained by splitting the
# stringified review on single spaces.
products["word_counts"] = products["review"].map(lambda review: len(str(review).split(" ")))

In [108]:
def text_preprocessing(df,col_in,col_out):
    """
    Normalize the text in ``df[col_in]`` and store the result in ``df[col_out]``.

    The stages run in this order, each one showing its own tqdm progress bar:
      1. lower-casing (values are passed through ``str`` first);
      2. punctuation removal via ``remove_punctuation`` (defined elsewhere in
         this notebook);
      3. removal of NLTK English stop words;
      4. Porter stemming of every whitespace-separated token;
      5. TextBlob lemmatization of every token, treating it as a verb.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the text. It is modified in place: ``col_out`` is
        created or overwritten.
    col_in : str
        Name of the column to read the raw text from.
    col_out : str
        Name of the column that receives the processed text.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    from nltk.corpus import stopwords
    from nltk.stem import PorterStemmer
    from textblob import Word
    from tqdm import tqdm

    # A set gives constant-time membership tests; the stop-word list is
    # probed once per token of every review.
    stop = frozenset(stopwords.words('english'))
    st = PorterStemmer()

    # NOTE(review): punctuation is stripped before stop words are filtered, so
    # contractions such as "didn't" reach the stop-word stage as "didnt" and
    # are presumably no longer matched -- confirm whether that is intended.
    #
    # A spelling-correction stage used to sit between stop-word removal and
    # stemming and is intentionally left disabled:
    #   ("Spelling_correction", lambda x: str(TextBlob(x).correct()))
    # Re-enabling it also requires `from textblob import TextBlob`.
    stages = [
        ("lower_case", lambda x: str(x).lower()),
        ("Punctuation", lambda x: remove_punctuation(str(x))),
        ("Stop_Words", lambda x: " ".join(i for i in x.split() if i not in stop)),
        ("Stemming", lambda x: " ".join(st.stem(word) for word in x.split())),
        ("Lemmatization", lambda x: " ".join(Word(word).lemmatize("v") for word in x.split())),
    ]

    # The first stage reads the raw column; every later stage refines col_out.
    source = col_in
    for position, (desc, step) in enumerate(stages):
        if position:
            # Blank separator between consecutive progress bars.
            print("\n")
        tqdm.pandas(desc=desc)
        df[col_out] = df[source].progress_apply(step)
        source = col_out

    return df

In [109]:
# Disabled experiment: the code below is wrapped in a string literal, so none
# of it runs; the cell merely evaluates to (and displays) that string.
'''
import progressbar
from time import sleep
bar = progressbar.ProgressBar(maxval=20, \
    widgets=[progressbar.Bar('=', '[', ']'), ' ', progressbar.Percentage()])
bar.start()
products = text_preprocessing(products, "review", "review_processed")
bar.finish()
'''

'\nimport progressbar\nfrom time import sleep\nbar = progressbar.ProgressBar(maxval=20,     widgets=[progressbar.Bar(\'=\', \'[\', \']\'), \' \', progressbar.Percentage()])\nbar.start()\nproducts = text_preprocessing(products, "review", "review_processed")\nbar.finish()\n'

In [110]:
# Run the full cleaning pipeline on the "review" column, writing the result to
# "review_processed". In the recorded run below the five stages took roughly
# seven and a half minutes in total, dominated by stemming.
products = text_preprocessing(products, "review", "review_processed")

lower_case: 100%|██████████████████████████████████████████████████████████| 166752/166752 [00:01<00:00, 165563.53it/s]






Punctuation: 100%|██████████████████████████████████████████████████████████| 166752/166752 [00:02<00:00, 65325.34it/s]






Stop_Words: 100%|████████████████████████████████████████████████████████████| 166752/166752 [00:45<00:00, 3704.53it/s]






Stemming: 100%|███████████████████████████████████████████████████████████████| 166752/166752 [05:14<00:00, 529.81it/s]






Lemmatization: 100%|█████████████████████████████████████████████████████████| 166752/166752 [01:30<00:00, 1836.02it/s]


In [111]:
# Preview the first rows, including the derived sentiment, word_counts and
# review_processed columns.
products.head()

Unnamed: 0,name,review,rating,sentiment,word_counts,review_processed
1,Planetwise Wipe Pouch,it came early and was not disappointed. i love...,5,1,30,come earli disappoint love planet wise bag wip...
2,Annas Dream Full Quilt with 2 Shams,Very soft and comfortable and warmer than it l...,5,1,23,soft comfort warmer looksfit full size bed per...
3,Stop Pacifier Sucking without tears with Thumb...,This is a product well worth the purchase. I ...,5,1,78,product well worth purchas find anyth els like...
4,Stop Pacifier Sucking without tears with Thumb...,All of my kids have cried non-stop when I trie...,5,1,79,kid cri nonstop tri ween pacifi find thumbuddi...
5,Stop Pacifier Sucking without tears with Thumb...,"When the Binky Fairy came to our house, we did...",5,1,93,binki fairi come hous didnt special gift book ...


In [116]:
# The train/test split is predefined: each JSON file is parsed into the
# indices used for one side of the split.
with open("module-9-assignment-train-idx.json","r") as file:
    train_idx = json.loads(file.read())
with open("module-9-assignment-test-idx.json","r") as file:
    test_idx = json.loads(file.read())

In [118]:
# Materialise the predefined split by positional row lookup.
train_data, test_data = products.iloc[train_idx], products.iloc[test_idx]
print("shape of train:", len(train_data))
print("shape of test:", len(test_data))

shape of train: 133416
shape of test: 33336


In [119]:
from sklearn.feature_extraction.text import CountVectorizer

# Use this token pattern to keep single-letter words.
vectorizer = CountVectorizer(token_pattern=r'\b\w+\b')
# First, learn the vocabulary from the training data only and assign columns to
# words, then convert the training data into a sparse count matrix.
train_matrix = vectorizer.fit_transform(train_data['review_processed'])
# Second, convert the test data into a sparse matrix using the same word-column
# mapping, so both matrices share one set of columns.
test_matrix = vectorizer.transform(test_data['review_processed'])

In [120]:
# (number of training reviews, vocabulary size)
train_matrix.shape

(133416, 102457)

In [121]:
# (number of test reviews, vocabulary size) -- the column count matches the training matrix.
test_matrix.shape

(33336, 102457)

In [124]:
# Extract the +1/-1 labels as NumPy arrays. Series.as_matrix() was deprecated
# and has since been removed from pandas; .values returns the same array and
# is available in every pandas version.
y_train = train_data["sentiment"].values
y_test = test_data["sentiment"].values

  """Entry point for launching an IPython kernel.
  


In [126]:
# One label per training review.
y_train.shape

(133416,)

In [127]:
# One label per test review.
y_test.shape

(33336,)

In [123]:
from sklearn.linear_model import LogisticRegression

In [128]:
# Fit a logistic-regression classifier on the training count matrix, leaving
# every hyperparameter at the library default.
model = LogisticRegression().fit(train_matrix,y_train)

In [130]:
from sklearn.metrics import accuracy_score
# Fraction of test reviews whose predicted label matches the true label.
# .values replaces Series.as_matrix(), which was deprecated and has since been
# removed from pandas.
accuracy = accuracy_score(y_true=test_data['sentiment'].values, y_pred=model.predict(test_matrix))
print ("Test Accuracy: %s" % accuracy)

  


Test Accuracy: 0.9228761699064075
