In [56]:
import pandas as pd
import numpy as np
import string
import json
import matplotlib.pyplot as plt
%matplotlib inline

### Loading and processing the data

In [57]:
# Load the review dataset from 'amazon_baby_subset.csv' into a DataFrame.
products = pd.read_csv('amazon_baby_subset.csv')

In [58]:
# Replace missing review text with an empty string so that the string
# operations applied to the 'review' column never receive NaN.
# fillna(..., inplace=True) returns None, so assigning its result would rebind
# `products` to None; the non-inplace form returns the filled frame instead.
products = products.fillna({'review': ''})

In [59]:
def remove_punctuation(text):
    """Return `text` with every character listed in string.punctuation deleted."""
    # Translation table that maps each punctuation character to "delete".
    strip_table = str.maketrans('', '', string.punctuation)
    return text.translate(strip_table)

# Store a punctuation-free copy of each review in a new 'review_clean' column.
products['review_clean'] = products['review'].apply(remove_punctuation)

In [62]:
# Limited vocabulary used as features, read from a JSON file.
with open('important_words.json') as words_file:
    raw_words = json.load(words_file)
# Convert every entry to a plain str before using it as a column name.
important_words = list(map(str, raw_words))
print('no of words', len(important_words))

no of words 193


In [None]:
# Create one word-count column per entry of important_words.
# Counts are taken from 'review_clean' (punctuation already stripped) rather
# than the raw 'review' text, so a token such as "great!" is counted as "great".
for word in important_words:
    # `w=word` binds the current word to the lambda explicitly.
    products[word] = products['review_clean'].apply(lambda text, w=word: text.split().count(w))

In [None]:
# Display the number of rows in the full dataset.
products.shape[0]

### Train-Validation split

In [None]:
# Row positions of the training examples, read from a JSON file.
with open('module-4-assignment-train-idx.json') as idx_file:
    train_indices = json.load(idx_file)
# Pick the training rows by integer position.
train_data = products.iloc[train_indices]

In [None]:
# Display the number of rows in the training split.
train_data.shape[0]

In [None]:
# Row positions of the validation examples, read from a JSON file.
with open('module-4-assignment-validation-idx.json') as idx_file:
    validation_indices = json.load(idx_file)
# Pick the validation rows by integer position.
validation_data = products.iloc[validation_indices]

In [None]:
# Display the number of rows in the validation split.
validation_data.shape[0]

In [None]:
# get feature matrix and label array
def get_numpy_data(dataframe, features, label_name):
    """Convert a DataFrame into a NumPy feature matrix and label array.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Input data containing the feature columns and the label column.
    features : list of str
        Names of the feature columns to extract.
    label_name : str
        Name of the label column.

    Returns
    -------
    feature_matrix : numpy.ndarray, 2-D
        A leading 'constant' column of ones (the intercept feature) followed by
        the requested feature columns, in the order given.
    label : numpy.ndarray, 1-D
        Values of the label column.
    """
    # Prepend the constant (intercept) feature to the requested columns.
    columns = ['constant'] + list(features)
    # assign() returns a new frame, so the caller's dataframe (often an .iloc
    # slice of a larger frame) is not modified and no SettingWithCopyWarning
    # is raised.
    # to_numpy() replaces DataFrame.as_matrix(), which was removed in pandas 1.0.
    feature_matrix = dataframe.assign(constant=1)[columns].to_numpy()
    label = dataframe[label_name].to_numpy()
    return feature_matrix, label

In [None]:
# Build the feature matrix and label array for the training split.
feature_matrix_train, sentiment_train = get_numpy_data(
    dataframe=train_data,
    features=important_words,
    label_name='sentiment',
)
# Build the feature matrix and label array for the validation split.
feature_matrix_valid, sentiment_valid = get_numpy_data(
    dataframe=validation_data,
    features=important_words,
    label_name='sentiment',
)

### Building on the logistic regression (no L2 penalty) assignment

In [None]:
def predict_probability(features, coefficient):
    """Apply the logistic (sigmoid) link to the linear score of each row.

    Parameters
    ----------
    features : numpy.ndarray, shape (N, D)
        Feature matrix.
    coefficient : numpy.ndarray, shape (D,)
        Coefficient vector.

    Returns
    -------
    numpy.ndarray, shape (N,)
        1 / (1 + exp(-score)) for each row, where score = features . coefficient.
    """
    score = np.dot(features, coefficient)
    # The original expression was missing its closing parenthesis (SyntaxError).
    return 1 / (1 + np.exp(-score))

In [None]:
def feature_derivative_with_L2(errors, feature, coefficient, l2_penalty, feature_is_constant):
    """Partial derivative of the L2-penalised log likelihood w.r.t. one coefficient.

    Parameters
    ----------
    errors : numpy.ndarray, shape (N,)
        Per-example error passed in by the caller (indicator minus prediction).
    feature : numpy.ndarray, shape (N,)
        Values of the j-th feature, i.e. the column matching `coefficient`.
    coefficient : float
        Current value of the j-th coefficient.
    l2_penalty : float
        L2 regularisation strength.
    feature_is_constant : bool
        True for the intercept feature (j == 0), which is not penalised.

    Returns
    -------
    float
        dot(errors, feature), minus 2 * l2_penalty * coefficient unless the
        feature is the constant one.
    """
    # Use the `errors` parameter; the original referenced an undefined `error`.
    derivative = np.dot(errors, feature)
    if not feature_is_constant:
        derivative = derivative - 2 * l2_penalty * coefficient
    return derivative

In [None]:
def compute_log_likelihood_with_L2(feature_matrix, sentiment, coefficients, l2_penalty):
    """L2-penalised log likelihood of the observed labels.

    Parameters
    ----------
    feature_matrix : numpy.ndarray, shape (N, D)
        Feature matrix whose first column is the constant feature.
    sentiment : array-like, shape (N,)
        Labels; +1 marks a positive example, any other value a negative one.
    coefficients : numpy.ndarray, shape (D,)
        Coefficient vector; coefficients[0] is the intercept.
    l2_penalty : float
        L2 regularisation strength.

    Returns
    -------
    float
        sum((indicator - 1) * score - log(1 + exp(-score)))
        - l2_penalty * sum(coefficients[1:] ** 2)
    """
    # 1 where the label is +1, otherwise 0 (vectorised).
    indicator = (np.asarray(sentiment) == +1).astype(int)
    score = np.dot(feature_matrix, coefficients)
    # logaddexp(0, -s) equals log(1 + exp(-s)) but does not overflow to inf
    # when s is very negative.
    log_term = np.logaddexp(0, -score)
    # The intercept (index 0) is excluded from the penalty.
    penalty = l2_penalty * np.sum(np.asarray(coefficients)[1:] ** 2)
    lp = np.sum((indicator - 1) * score - log_term) - penalty
    return lp

In [None]:
def logistic_regression_with_L2(feature_matrix, sentiment, initial_coefficients,step_size, l2_penalty, max_iter):
    """Fit logistic regression with an L2 penalty by gradient ascent.

    Parameters
    ----------
    feature_matrix : numpy.ndarray, shape (N, D)
        Feature matrix whose first column is the constant feature.
    sentiment : array-like, shape (N,)
        Labels; +1 marks a positive example.
    initial_coefficients : array-like, shape (D,)
        Starting coefficients. They are copied, never modified.
    step_size : float
        Gradient-ascent step size.
    l2_penalty : float
        L2 regularisation strength (not applied to the intercept).
    max_iter : int
        Number of gradient-ascent iterations.

    Returns
    -------
    numpy.ndarray, shape (D,)
        The fitted coefficients (a new array).
    """
    # Work on a private float copy. Without it every call would mutate the
    # caller's array, and successive runs sharing the same starting vector
    # would continue from (and alias) each other's results.
    coefficients = np.array(initial_coefficients, dtype=float)
    # The indicator does not change between iterations, so compute it once.
    indicator = (np.asarray(sentiment) == +1).astype(int)
    for itr in range(max_iter):
        predictions = predict_probability(feature_matrix, coefficients)
        errors = indicator - predictions
        # Update each coefficient from its own partial derivative.
        for j in range(len(coefficients)):
            is_intercept = (j == 0)
            # Column j of the feature matrix and the j-th coefficient.
            derivative = feature_derivative_with_L2(
                errors, feature_matrix[:, j], coefficients[j], l2_penalty, is_intercept)
            coefficients[j] = coefficients[j] + step_size * derivative
        # Report the log likelihood at progressively sparser intervals so the
        # reader can check that it is increasing.
        if itr <= 15 or (itr <= 100 and itr % 10 == 0) or (itr <= 1000 and itr % 100 == 0) \
        or (itr <= 10000 and itr % 1000 == 0) or itr % 10000 == 0:
            lp = compute_log_likelihood_with_L2(feature_matrix, sentiment, coefficients, l2_penalty)
            print ('iteration %*d: log likelihood of observed labels = %.8f' % \
                (int(np.ceil(np.log10(max_iter))), itr, lp))
    return coefficients

In [None]:
# L2 penalties to compare; one model is trained per value.
L2_penalty_list = [0, 4, 10, 1e2, 1e3, 1e5]
feature_matrix = feature_matrix_train
sentiment = sentiment_train
# One coefficient per column of the feature matrix (intercept + one per word),
# derived from the data instead of a hard-coded 194.
initial_coefficients = np.zeros(feature_matrix.shape[1])
step_size = 5e-6
max_iter = 501
# Each run receives its own copy of the starting vector, so no run can start
# from, or share its result array with, another run.
coefficients_list = [
    logistic_regression_with_L2(feature_matrix, sentiment, initial_coefficients.copy(),
                                step_size, l2_penalty, max_iter)
    for l2_penalty in L2_penalty_list
]