Implementing logistic regression from scratch

In [1]:
import pandas as pd
import numpy as np
import json
import string
import math
from math import sqrt

Import the amazon reviews dataset

In [2]:
products = pd.read_csv("amazon_baby_subset.csv")

display(products.head(1))

print("\n The shape of PRODUCTS dataset is {}".format(products.shape))

print("\n # of positive reviews {}".format(len(products[products["sentiment"] == 1  ])))
print("\n # of negative reviews {}".format(len(products[products["sentiment"] == -1 ])))

## This is not a class-imbalanced problem. 

Unnamed: 0,name,review,rating,sentiment
0,Stop Pacifier Sucking without tears with Thumb...,All of my kids have cried non-stop when I trie...,5,1



 The shape of PRODUCTS dataset is (53072, 4)

 # of positive reviews 26579

 # of negative reviews 26493


In [3]:
file_name_important_words = "important_words.json"

with open (file_name_important_words, "r") as data:
    important_words = json.load(data)
    
print("\n The lenght of important_words {} \n\n Head of important words {}".format(len(important_words), important_words[0:5]))



 The lenght of important_words 193 

 Head of important words ['baby', 'one', 'great', 'love', 'use']


In [4]:
## get cleaned reviews 

products["review_clean"] = products["review"].str.replace("[{}]".format(string.punctuation), "")

display(products.head(1))

Unnamed: 0,name,review,rating,sentiment,review_clean
0,Stop Pacifier Sucking without tears with Thumb...,All of my kids have cried non-stop when I trie...,5,1,All of my kids have cried nonstop when I tried...


Testing List Comprehension

In [23]:
list_comprehension = [1,2,3,4]

a = [x**2 for x in list_comprehension]
a

[1, 4, 9, 16]

Difference Between a Lambda Function and List Comprehension.

In [5]:
## takes couple of minutes
for word in important_words:
    products[word] = products['review_clean'].astype('U').apply(lambda s : s.split().count(word))
## astype("U") makes it unicode. 

In [6]:
products["contains_perfect"] = products["perfect"] >= 1

print("The # of data points that contains perfect are {}".format(np.sum(products.contains_perfect)))

The # of data points that contains perfect are 2955


In [7]:
def get_numpy_data(dataframe, features_, label_):
    
    dataframe["intercept"] = 1   ## the equation has an intercept, why is it kept to one?? 
    ## doesn't matter the co_ef starts with 0 so everything is zero
    features_all =   ["intercept"] + features_  ## all features ## why is intercept added at first 
    return_df = dataframe[features_all].to_numpy() ## filter them all
    dataframe_label = dataframe[label_].to_numpy() ## filter labels
    
    return return_df, dataframe_label

In [9]:
feature_matrix, sentiment = get_numpy_data(products, important_words, "sentiment")

print("\n Feature Matrix data \n {}".format(feature_matrix[:2]))
print("\n Sentiment data \n {}".format(sentiment[:2]))



 Feature Matrix data 
 [[1 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0
  0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 2 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0
  0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1
  0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 1 0 0 0 0 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0]]

 Sentiment data 
 [1 1]


The score is from the line(linear or polynomial) and then the score is passed through a sigmoid to get probability

In [10]:
print("The lenght of 1 instance of feature matrix : ",len(feature_matrix[0]))
print("The lenght of important words  : ", len(important_words))

The lenght of 1 instance of feature matrix :  194
The lenght of important words  :  193


In [11]:
print("\n Shape of feature matrix : {}".format(feature_matrix.shape))

print("\n Shape of Jth feature  : {}".format(feature_matrix[7].shape))


 Shape of feature matrix : (53072, 194)

 Shape of Jth feature  : (194,)


In [20]:
def predict_probability(feature_matrix, coefficients):
    '''
    produces probablistic estimate for P(y_i = +1 | x_i, w).
    estimate ranges between 0 and 1.
    '''
    score = np.dot(feature_matrix, coefficients)
    predinction = 1/(1+np.exp(-score))
    return predinction

>The derivate of log likelihood with respect to single co-efficent : that feature value * [labels - predictions]

In [68]:
def feature_derivative(errors, feature):  
    '''
    The derivatve wrt to each feature
    The will give the value delta step to be added to the existing derivative
    
    feature * error
    '''
    derivative = np.dot(np.transpose(errors), feature) ## feature or indicator
    return derivative
    


In [70]:
def compute_log_likelihood(feature_matrix, sentiment, coefficients):
    '''
    We calcualte the likelikehood - 
    
    score * ( indicator-1) - predictions
    '''
    indicator = (sentiment == +1) ## array with one row
    
    score = np.dot (feature_matrix, coefficients) ## on columns
    
    log_likelihood = np.log(1. + np.exp(-score)) ## one column
    indicator_     = (np.transpose(np.array([indicator]))-1)*score
    lp  = np.sum(indicator_ - log_likelihood)
    
    return lp
    

In [17]:
## Logistic Regression Initial Conditions
initial_coefficients = np.zeros((194,1))
step_size = 1e-7
max_iter = 301


In [54]:
score = np.dot (feature_matrix, coefficients)
print("shape of scores {}".format(score.shape))

left_side = (np.transpose(np.array([indicator]))-1)*score
print("shape of left_side {}".format(left_side.shape))

print("\nindicator")
display(indicator[0:5])
print("shape of indicator {}".format(indicator.shape))

print ('\nnp array ')
test_1 = np.array([indicator])
display(test_1)
display(test_1[:,4])
print("shape of test_1 {}".format(test_1.shape))


test_2 = np.transpose(np.array([indicator]))
#display(test_2)
display(test_2[:7])
print("shape of test_2 {}".format(test_2.shape))

## indicator True = +1 
False-1

display(sentiment [0:5])
display(indicator [0:5])

shape of scores (53072, 1)
shape of left_side (53072, 1)

indicator


array([ True,  True,  True,  True,  True])

shape of indicator (53072,)

np array 


array([[ True,  True,  True, ..., False, False, False]])

array([ True])

shape of test_1 (1, 53072)


array([[ True],
       [ True],
       [ True],
       [ True],
       [ True],
       [ True],
       [ True]])

shape of test_2 (53072, 1)


array([1, 1, 1, 1, 1], dtype=int64)

array([ True,  True,  True,  True,  True])

In [30]:
coefficients = np.array(initial_coefficients)
print("size of coefs {}".format(coefficients.shape))
predictions = predict_probability(feature_matrix, coefficients)

indicator = (sentiment==+1)

print("\nindicator")
display(indicator[0:5])

print("\npredictions")
display(predictions[0:5])

print("\nshape of is indicator {} ". format( indicator.shape))
print("\nshape of is predictions{} ". format( predictions.shape))

size of coefs (194, 1)

indicator


array([ True,  True,  True,  True,  True])


predictions


array([[0.5],
       [0.5],
       [0.5],
       [0.5],
       [0.5]])


shape of is indicator (53072,) 

shape of is predictions(53072, 1) 


In [79]:
#coefficients: D * 1
def logistic_regression(feature_matrix, sentiment, initial_coefficients, step_size, max_iter):
    '''
    input
    feature_matrix = numpy array for feature matrix 194 columns
    sentiment   - label for each review
    initial_coef - all np.zero(len(coeff,1)
    step_size  - heuristically 
    max_iter   - 
    '''
    coefficients = np.array(initial_coefficients) # make sure it's a numpy array
    # lplist = []
    for itr in range(max_iter):
        # Predict P(y_i = +1|x_1,w) using your predict_probability() function
        # YOUR CODE HERE
        # gives the scores
        predictions = predict_probability(feature_matrix, coefficients)

        # Compute indicator value for (y_i = +1)
        indicator = (sentiment==+1)

        # Compute the errors as indicator - predictions
        errors = np.transpose(np.array([indicator])) - predictions

        for j in range(len(coefficients)): # loop over each coefficient
            # Recall that feature_matrix[:,j] is the feature column associated with coefficients[j]
            # compute the derivative for coefficients[j]. Save it in a variable called derivative
            # YOUR CODE HERE
            derivative = feature_derivative(errors, feature_matrix[:,j])

            # add the step size times the derivative to the current coefficient
            # YOUR CODE HERE
            
            ## for each coeff the delta is added for hill climb
            coefficients[j] += step_size*derivative

        # Checking whether log likelihood is increasing
        if itr <= 15 or (itr <= 100 and itr % 10 == 0) or (itr <= 1000 and itr % 100 == 0) \
        or (itr <= 10000 and itr % 1000 == 0) or itr % 10000 == 0:
            lplist = []
            lplist.append(compute_log_likelihood(feature_matrix, sentiment, coefficients))
            lp = compute_log_likelihood(feature_matrix, sentiment, coefficients)
            #print ('iteration %*d: log likelihood of observed labels = %.8f' % \
            #   (int(np.ceil(np.log10(max_iter))), itr, lp))
    
    import matplotlib.pyplot as plt
    x= [i for i in range(len(lplist))]
    plt.plot(x,lplist,'ro')
    plt.show()
    
    return coefficients

In [71]:
coefficients = logistic_regression(feature_matrix, sentiment, initial_coefficients, step_size, max_iter)

iteration   0: log likelihood of observed labels = -36780.91768478
iteration   1: log likelihood of observed labels = -36775.13434712
iteration   2: log likelihood of observed labels = -36769.35713564
iteration   3: log likelihood of observed labels = -36763.58603240
iteration   4: log likelihood of observed labels = -36757.82101962
iteration   5: log likelihood of observed labels = -36752.06207964
iteration   6: log likelihood of observed labels = -36746.30919497
iteration   7: log likelihood of observed labels = -36740.56234821
iteration   8: log likelihood of observed labels = -36734.82152213
iteration   9: log likelihood of observed labels = -36729.08669961
iteration  10: log likelihood of observed labels = -36723.35786366
iteration  11: log likelihood of observed labels = -36717.63499744
iteration  12: log likelihood of observed labels = -36711.91808422
iteration  13: log likelihood of observed labels = -36706.20710739
iteration  14: log likelihood of observed labels = -36700.5020

In [82]:
predictions_ = predict_probability(feature_matrix, coefficients)

predictions_positive = (predictions_>0.5 ).sum()

print("Total # predictions_positive {} ".format(predictions_positive) )

Total # predictions_positive 25126 


In [83]:

print (np.transpose(predictions_.flatten()).shape)
print ((products['sentiment']).shape)

(53072,)
(53072,)


In [91]:
np.transpose(predictions_[:5].flatten())

array([0.51275866, 0.49265935, 0.50602867, 0.50196725, 0.53290719])

In [92]:
predictions_[:5].flatten()

array([0.51275866, 0.49265935, 0.50602867, 0.50196725, 0.53290719])

In [106]:
correct_num = np.sum((np.transpose(predictions_.flatten())> 0.5) == np.array(products['sentiment']>0))
total_num = len(products['sentiment'])
print ("correct_num: {}, total_num: {}".format(correct_num, total_num))
accuracy = correct_num * 1./ total_num
print (accuracy)

correct_num: 39903, total_num: 53072
0.7518653904130238


In [109]:
np.sum((np.transpose(predictions_.flatten() > 0.5)) == (products["sentiment"]>0))

39903

In [125]:
important_words

['baby',
 'one',
 'great',
 'love',
 'use',
 'would',
 'like',
 'easy',
 'little',
 'seat',
 'old',
 'well',
 'get',
 'also',
 'really',
 'son',
 'time',
 'bought',
 'product',
 'good',
 'daughter',
 'much',
 'loves',
 'stroller',
 'put',
 'months',
 'car',
 'still',
 'back',
 'used',
 'recommend',
 'first',
 'even',
 'perfect',
 'nice',
 'bag',
 'two',
 'using',
 'got',
 'fit',
 'around',
 'diaper',
 'enough',
 'month',
 'price',
 'go',
 'could',
 'soft',
 'since',
 'buy',
 'room',
 'works',
 'made',
 'child',
 'keep',
 'size',
 'small',
 'need',
 'year',
 'big',
 'make',
 'take',
 'easily',
 'think',
 'crib',
 'clean',
 'way',
 'quality',
 'thing',
 'better',
 'without',
 'set',
 'new',
 'every',
 'cute',
 'best',
 'bottles',
 'work',
 'purchased',
 'right',
 'lot',
 'side',
 'happy',
 'comfortable',
 'toy',
 'able',
 'kids',
 'bit',
 'night',
 'long',
 'fits',
 'see',
 'us',
 'another',
 'play',
 'day',
 'money',
 'monitor',
 'tried',
 'thought',
 'never',
 'item',
 'hard',
 'plasti

In [124]:
coefficients

[array([-0.00850205]),
 array([0.06654608]),
 array([0.06589076]),
 array([0.00501744]),
 array([-0.05386015]),
 array([-0.00350488]),
 array([0.06479459]),
 array([0.04543563]),
 array([0.00398353]),
 array([0.02007754]),
 array([0.030135]),
 array([-0.02871155]),
 array([0.0152162]),
 array([0.00027259]),
 array([0.01194482]),
 array([-0.01824619]),
 array([-0.01217064]),
 array([-0.04151103]),
 array([0.0027682]),
 array([0.0177032]),
 array([-0.004397]),
 array([0.0449764]),
 array([0.00990916]),
 array([0.00089924]),
 array([-0.0013622]),
 array([0.01268594]),
 array([0.00826467]),
 array([-0.0277427]),
 array([0.00061013]),
 array([0.01540845]),
 array([-0.01321348]),
 array([-0.03005125]),
 array([0.02973994]),
 array([0.01840871]),
 array([0.00286179]),
 array([-0.0105768]),
 array([-0.00065735]),
 array([-0.01014766]),
 array([-0.0047958]),
 array([0.00750892]),
 array([0.00427938]),
 array([0.00306786]),
 array([-0.00220318]),
 array([0.00957273]),
 array([9.91666827e-05]),
 

In [122]:
coefficient_ = list(coefficients[1:]) # exclude intercept
word_coefficient_tuples = [(word, coefficient) for word, coefficient in zip(important_words, coefficient_)]
word_coefficient_tuples = sorted(word_coefficient_tuples, key=lambda x:x[1], reverse=True) ## sorting is done based on key 
word_coefficient_tuples[-10:]

display(word_coefficient_tuples[:10])
display(word_coefficient_tuples[-10:])

[('one', array([0.06654608])),
 ('great', array([0.06589076])),
 ('like', array([0.06479459])),
 ('easy', array([0.04543563])),
 ('much', array([0.0449764])),
 ('old', array([0.030135])),
 ('even', array([0.02973994])),
 ('seat', array([0.02007754])),
 ('perfect', array([0.01840871])),
 ('good', array([0.0177032]))]

[('money', array([-0.0244821])),
 ('waste', array([-0.02659278])),
 ('still', array([-0.0277427])),
 ('well', array([-0.02871155])),
 ('however', array([-0.02897898])),
 ('first', array([-0.03005125])),
 ('bottles', array([-0.03306952])),
 ('day', array([-0.03898204])),
 ('bought', array([-0.04151103])),
 ('use', array([-0.05386015]))]