## Load the Amazon review data-set

In [1]:
import sframe as sf
# Path of the on-disk SFrame holding the Amazon baby-product reviews.
AMAZON_BABY_PATH = "../data/amazon_baby.gl"
products = sf.SFrame(AMAZON_BABY_PATH)

[INFO] sframe.cython.cy_server: SFrame v2.1 started. Logging /tmp/sframe_server_1475129310.log


In [2]:
def remove_punctuation(text):
    """Return ``text`` with every ASCII punctuation character removed.

    Args:
        text: the string to clean (byte string or unicode/text string).

    Returns:
        A string of the same type as ``text`` without any character that
        appears in ``string.punctuation``.
    """
    import string
    try:
        # Fast path: the two-argument form exists only on Python 2 byte strings.
        return text.translate(None, string.punctuation)
    except TypeError:
        # Text strings (Python 2 ``unicode`` / Python 3 ``str``) take a single
        # mapping of code point -> replacement; ``None`` deletes the character.
        deletions = dict((ord(char), None) for char in string.punctuation)
        return text.translate(deletions)

# Keep a punctuation-free copy of every review next to the raw text.
raw_reviews = products['review']
products['review_clean'] = raw_reviews.apply(remove_punctuation)

In [3]:
def _rating_to_sentiment(rating):
    """Map a star rating to +1 (more than 3 stars) or -1 (anything else)."""
    if rating > 3:
        return +1
    return -1

# Neutral (3-star) reviews carry no clear sentiment, so drop them first.
is_not_neutral = products['rating'] != 3
products = products[is_not_neutral]

# Label what is left as positive (+1) or negative (-1).
products['sentiment'] = products['rating'].apply(_rating_to_sentiment)

# Split the data into training and test set

In [4]:
# 80% of the rows go to training, the rest to testing; the fixed seed keeps the split repeatable.
TRAIN_FRACTION = .8
SPLIT_SEED = 1
train_data, test_data = products.random_split(TRAIN_FRACTION, seed=SPLIT_SEED)

# Build the Word Vector

Now build the bag-of-words vector for each review. A bag-of-words vector is simply a set of word counts: for every word in the vocabulary, it records how many times that word appears in that particular review.

In [5]:
from sklearn.feature_extraction.text import CountVectorizer

# This token pattern keeps single-letter words.
vectorizer = CountVectorizer(token_pattern=r'\b\w+\b')

train_reviews = train_data['review_clean']
test_reviews = test_data['review_clean']

# Step 1: learn the vocabulary (word -> column) from the training reviews and
# encode them as a sparse count matrix in the same call.
train_matrix = vectorizer.fit_transform(train_reviews)
# Step 2: encode the test reviews with the word -> column mapping learned above.
test_matrix = vectorizer.transform(test_reviews)

In [6]:
from sklearn.linear_model import LogisticRegression
# Fit a logistic regression on the word counts; default hyper-parameters are used.
train_labels = train_data['sentiment']
model = LogisticRegression()
model.fit(train_matrix, train_labels)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [7]:
from sklearn.metrics import accuracy_score
# Compare the model's predictions on the held-out reviews with their true labels.
test_labels = test_data['sentiment'].to_numpy()
test_predictions = model.predict(test_matrix)
accuracy = accuracy_score(y_true=test_labels, y_pred=test_predictions)
print("Test Accuracy: %s" % accuracy)

Test Accuracy: 0.932175425966


In [8]:
from __future__ import division
# Accuracy of always predicting +1: the share of test reviews whose label is +1.
positive_test_reviews = test_data[test_data['sentiment'] == 1]
baseline = len(positive_test_reviews)/len(test_data)
print("Baseline accuracy (majority class classifier): %s" % baseline)

Baseline accuracy (majority class classifier): 0.842782577394


**Quiz question**: Using accuracy as the evaluation metric, was our logistic regression model better than the baseline (majority class classifier)? <span style="color:red">*YES*</span>

In [9]:
from sklearn.metrics import confusion_matrix
# Use the model's own class ordering for both the rows and the columns.
class_labels = model.classes_
cmat = confusion_matrix(y_true=test_data['sentiment'].to_numpy(),
                        y_pred=model.predict(test_matrix),
                        labels=class_labels)

# Print the confusion matrix, one (target, predicted) pair per line.
# NOTE: Your tool may arrange entries in a different order. Consult appropriate manuals.
print(' target_label | predicted_label | count ')
print('--------------+-----------------+-------')
row_format = '{0:^13} | {1:^15} | {2:5d}'
for row, target_label in enumerate(class_labels):
    for col, predicted_label in enumerate(class_labels):
        print(row_format.format(target_label, predicted_label, cmat[row, col]))

 target_label | predicted_label | count 
--------------+-----------------+-------
     -1       |       -1        |  3786
     -1       |        1        |  1455
      1       |       -1        |   806
      1       |        1        | 27289


In [10]:
# Read the counts from `cmat` (rows = true label, columns = predicted label,
# both in `model.classes_` order, i.e. -1 then +1 in the table printed above)
# instead of copying them by hand from the printed output.
false_positives = int(cmat[0, 1])
false_negatives = int(cmat[1, 0])
true_positives = int(cmat[1, 1])

totalPos = false_positives + true_positives     # every review predicted +1
actualPos = true_positives + false_negatives    # every review whose true label is +1
print("fractional: %s %s %s" % (false_positives/totalPos,
                                false_positives/actualPos,
                                true_positives/actualPos))

fractional: 0.0506192596716 0.0517885744794 0.971311621285


In [11]:
def print_cmat(y_true, y_pred, labels):
    """Print the confusion matrix of ``y_pred`` against ``y_true`` as a text table.

    Args:
        y_true: true labels, one per example.
        y_pred: predicted labels, aligned with ``y_true``.
        labels: class labels; they fix the order of both the rows (target)
            and the columns (predicted) of the matrix.

    Returns:
        None; the table is written to stdout.
    """
    from sklearn.metrics import confusion_matrix
    # Use the caller's y_true and labels, not notebook globals.
    cmat = confusion_matrix(y_true=y_true,
                            y_pred=y_pred,
                            labels=labels)
    print(' target_label | predicted_label | count ')
    print('--------------+-----------------+-------')
    # NOTE: Your tool may arrange entries in a different order. Consult appropriate manuals.
    for i, target_label in enumerate(labels):
        for j, predicted_label in enumerate(labels):
            print('{0:^13} | {1:^15} | {2:5d}'.format(target_label, predicted_label, cmat[i,j]))

In [12]:
# Take the four counts straight from `cmat` instead of copying them by hand.
# With rows = true label and columns = predicted label, both ordered -1 then +1
# (the order shown in the table printed above), the flattened matrix reads
# TN, FP, FN, TP. int() keeps them plain Python integers.
TN, FP, FN, TP = (int(count) for count in cmat.ravel())
TOTAL = TN + FP + FN + TP

**Quiz Question**: How many predicted values in the test set are false positives? <span style="color:blue">**1455**</span>

In [13]:
# Each false positive is charged 100 and each false negative 1.
COST_PER_FALSE_POSITIVE = 100
COST_PER_FALSE_NEGATIVE = 1
cost = COST_PER_FALSE_POSITIVE*FP + COST_PER_FALSE_NEGATIVE*FN
question = "Quiz Question: Given the stipulation, what is the cost associated with the logistic regression classifier's performance on the test set? "
print("%s %s" % (question, cost))

Quiz Question: Given the stipulation, what is the cost associated with the logistic regression classifier's performance on the test set?  146306


In [14]:
from sklearn.metrics import precision_score
# Precision of the model's predictions on the test reviews.
true_sentiment = test_data['sentiment'].to_numpy()
predicted_sentiment = model.predict(test_matrix)
precision = precision_score(y_true=true_sentiment, y_pred=predicted_sentiment)
print("Precision on test data: %s" % precision)

Precision on test data: 0.949380740328


In [15]:
# Share of the positive predictions that are wrong (FP out of FP + TP).
predicted_positives = FP + TP
print("Fraction of FP %s" % (FP/predicted_positives))

Fraction of FP 0.0506192596716


In [16]:
from sklearn.metrics import recall_score
# Recall of the model's predictions on the test reviews.
true_sentiment = test_data['sentiment'].to_numpy()
predicted_sentiment = model.predict(test_matrix)
recall = recall_score(y_true=true_sentiment, y_pred=predicted_sentiment)
print("Recall on test data: %s" % recall)

Recall on test data: 0.971311621285


In [18]:
# Recompute recall by hand from the confusion-matrix counts: TP over all actual positives.
actual_positives = TP + FN
recall = TP/actual_positives

In [19]:
# Display the hand-computed recall.
recall

0.9713116212849261

In [None]:
def apply_threshold(probabilities, threshold):
    """Turn scores into +1 / -1 labels by comparing them with ``threshold``.

    Args:
        probabilities: array-like of scores.
        threshold: cut-off; a score strictly greater than it becomes +1,
            anything else (including a score equal to it) becomes -1.

    Returns:
        numpy array of +1 / -1 with the broadcast shape of the inputs.
    """
    # Local import: this function is first called before the notebook's
    # `import numpy as np` cell, so it cannot rely on a global `np`.
    import numpy as np
    # One vectorised comparison replaces np.vectorize over a Python function.
    # It is faster and, unlike np.vectorize, also accepts empty input.
    return np.where(np.asarray(probabilities) > threshold, +1, -1)

In [None]:
# predict_proba returns one column per class; keep column 1
# (presumably the +1 class, given the -1, +1 class order printed earlier — confirm).
class_probabilities = model.predict_proba(test_matrix)
probabilities = class_probabilities[:, 1]

In [None]:
# Labels obtained with a 0.5 cut-off on the scores.
t_05 = apply_threshold(probabilities, threshold=0.5)

In [None]:
def print_results(array, label=None):
    """Print how many entries of ``array`` equal +1 and how many equal -1.

    Args:
        array: array of labels supporting element-wise ``==`` and ``.sum()``.
        label: optional text printed in front of the counts on the same line.

    Returns:
        None; the counts are written to stdout.
    """
    positives = (array == +1).sum()
    negatives = (array == -1).sum()
    prefix = "" if label is None else "%s " % label
    print("%sPositives: %s  and Negatives: %s" % (prefix, positives, negatives))

t_09 = apply_threshold(probabilities, 0.9)
# print_results returns None, so printing its return value added a stray "None"
# line; pass the label in instead.
print_results(test_data['sentiment'].to_numpy(), "Actual:")
print_results(t_05, "Predicted:")    # threshold 0.5
print_results(t_09, "Predicted:")    # threshold 0.9

Recall is the fraction of actual positives that are predicted positive, i.e. TP / (TP + FN). As we increase the threshold, TP decreases and FN increases, which brings the recall down. Also, as we increase the threshold, fewer reviews are predicted positive compared to the actual positives, i.e. FP decreases.

In [None]:
import numpy as np
# 100 evenly spaced cut-offs from 0.5 to 1 (both ends included).
NUM_THRESHOLDS = 100
threshold_values = np.linspace(0.5, 1, num=NUM_THRESHOLDS)
print(threshold_values)

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
def plot_pr_curve(precision, recall, title):
    """Plot recall (y-axis) against precision (x-axis) on the current figure.

    Args:
        precision: sequence of precision values, one per threshold.
        recall: sequence of recall values, aligned with ``precision``.
        title: text shown above the plot.

    Returns:
        None; the curve is drawn with pyplot.
    """
    # Set the global plot settings before anything is drawn, so the first call
    # uses them too (previously the font size was only set after plotting).
    plt.rcParams['figure.figsize'] = 7, 5
    plt.rcParams.update({'font.size': 16})
    plt.locator_params(axis = 'x', nbins = 5)
    # Solid line only in the format string; its old 'b' (blue) clashed with
    # the explicit `color` keyword.
    plt.plot(precision, recall, '-', linewidth=4.0, color = '#B0017F')
    plt.title(title)
    plt.xlabel('Precision')
    plt.ylabel('Recall')

In [None]:
y = test_data['sentiment'].to_numpy()
# Threshold the scores once per cut-off and reuse the labels for both metrics,
# instead of calling apply_threshold twice for every threshold.
predictions_all = [apply_threshold(probabilities, threshold) for threshold in threshold_values]
precision_all = [precision_score(y_true=y, y_pred=predicted) for predicted in predictions_all]
recall_all = [recall_score(y_true=y, y_pred=predicted) for predicted in predictions_all]

In [None]:
# Precision/recall trade-off over every threshold, for the full test set.
plot_pr_curve(precision=precision_all,
              recall=recall_all,
              title='Precision recall curve (all)')

In [None]:
# Confusion matrix obtained with a 0.98 cut-off on the scores.
y_pred = apply_threshold(probabilities, threshold=0.98)
print_cmat(y_true=y, y_pred=y_pred, labels=model.classes_)

In [None]:
# Display the precision obtained at each threshold in `threshold_values`.
precision_all

In [None]:
# Position of the threshold whose precision lies closest to 0.965.
# NOTE(review): "closest" may pick a precision slightly below 0.965 — confirm
# that this, rather than "first threshold reaching 0.965", is what is wanted.
TARGET_PRECISION = 0.965
precision_gap = np.abs(np.asarray(precision_all) - TARGET_PRECISION)
idx = np.argmin(precision_gap)

In [None]:
# Display the index selected above.
idx

In [None]:
# Display the threshold value stored at the selected index.
threshold_values[idx]

In [None]:
def _mentions_baby(product_name):
    """Return True when 'baby' occurs in the product name, ignoring case."""
    return 'baby' in product_name.lower()

# Keep only the test reviews whose product name contains "baby".
is_baby_product = test_data['name'].apply(_mentions_baby)
baby_reviews = test_data[is_baby_product]

In [None]:
# Encode the baby-product reviews with the already-fitted vectorizer and score them.
# NOTE: this rebinds `probabilities`, which until here held the scores of the full test set.
baby_matrix = vectorizer.transform(baby_reviews['review_clean'])
baby_class_probabilities = model.predict_proba(baby_matrix)
probabilities = baby_class_probabilities[:, 1]

In [None]:
# Build the label array once and threshold the scores once per cut-off. The old
# code rebuilt the labels and re-ran apply_threshold for each metric at every
# threshold.
# NOTE: this rebinds `precision_all` / `recall_all` to the baby-review subset.
baby_y = baby_reviews['sentiment'].to_numpy()
baby_predictions_all = [apply_threshold(probabilities, threshold) for threshold in threshold_values]
precision_all = [precision_score(y_true=baby_y, y_pred=predicted) for predicted in baby_predictions_all]
recall_all = [recall_score(y_true=baby_y, y_pred=predicted) for predicted in baby_predictions_all]