# Logistic Regression

## Overview
In this lab, we apply logistic regression to the "prosper-loan" data, an individual-loan data set provided by the P2P lending company Prosper.
Our goal is to predict, from an individual's credentials, whether or not that individual can get a loan (the `LoanStatus` column).


In [1]:
# Downloading S3 Data into Jupyter Notebook

# First, download the data file from Amazon S3 into our working directory by
# uncommenting the wget line below. If the download succeeds, wget prints a confirmation message.

# !wget 'http://datakmeans.s3.amazonaws.com/prosper-loan-data.csv'

In [2]:
## TODO : Modify these to your own bucket
# `bucket` is the S3 bucket name; `prefix` is the key prefix used inside that bucket.
# NOTE(review): `prefix` repeats the bucket name, so objects are stored under
# s3://<bucket>/<bucket>/prosper/... - confirm this nesting is intended.
bucket = "elephantscale-sagemaker"
prefix = "elephantscale-sagemaker/prosper"

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# URL of the Prosper loan CSV that is loaded below.
location = 'http://datakmeans.s3.amazonaws.com/prosper-loan-data.csv'
# Alternative source (presumably a smaller sample, judging by the file name):
# location = 'https://s3.amazonaws.com/elephantscale-public/data/prosper-loan/prosper-loan-data-sample.csv'
# header=0: the first line of the CSV supplies the column names.
prosper_data = pd.read_csv(location, header=0)
# Display the first rows of the loaded frame.
prosper_data.head()

Unnamed: 0,Term,LoanStatus,BorrowerRate,ProsperRating (numeric),ProsperScore,ListingCategory,BorrowerState,EmploymentStatus,EmploymentStatusDuration,IsBorrowerHomeowner,...,ProsperPaymentsOneMonthPlusLate,ProsperPrincipalBorrowed,ProsperPrincipalOutstanding,LoanOriginalAmount,MonthlyLoanPayment,Recommendations,InvestmentFromFriendsCount,InvestmentFromFriendsAmount,Investors,YearsWithCredit
0,36,1,0.158,4.0,6.0,Unknown,CO,Self-employed,2.0,True,...,0.0,0.0,0.0,9425,330.43,0,0,0.0,258,13
1,36,1,0.1325,4.0,6.0,Unknown,Unknown,Full-time,19.0,False,...,0.0,0.0,0.0,1000,33.81,0,0,0.0,53,14
2,36,0,0.1435,5.0,4.0,Debt,AL,Employed,1.0,False,...,0.0,0.0,0.0,4000,137.39,0,0,0.0,1,18
3,36,0,0.3177,1.0,5.0,Household,FL,Other,121.0,True,...,0.0,0.0,0.0,4000,173.71,0,0,0.0,10,15
4,36,1,0.2075,4.0,6.0,Unknown,MI,Full-time,36.0,False,...,0.0,0.0,0.0,3000,112.64,0,0,0.0,53,11


In [4]:
# Preparing the data for Training

# The data has more than 50 columns. To keep the lab simple we keep only six:
# the 'LoanStatus' column plus five borrower attributes.

prosper_final = prosper_data.loc[:, [
    'LoanStatus',
    'BorrowerRate',
    'EmploymentStatus',
    'CreditScore',
    'StatedMonthlyIncome',
    'IncomeVerifiable',
]]
prosper_final.head()

Unnamed: 0,LoanStatus,BorrowerRate,EmploymentStatus,CreditScore,StatedMonthlyIncome,IncomeVerifiable
0,1,0.158,Self-employed,640.0,3083.333333,True
1,1,0.1325,Full-time,640.0,1666.666667,True
2,0,0.1435,Employed,680.0,2416.666667,True
3,0,0.3177,Other,700.0,5500.0,True
4,1,0.2075,Full-time,620.0,3750.0,True


In [5]:
# Get the execution role for the notebook instance.
# This is the IAM role that you created when you created your notebook instance.
# The value is stored in `role` so it can be handed to the estimator later on.

from sagemaker import get_execution_role
role = get_execution_role()


In [6]:
# Next step is to convert the categorical values into a one-hot encoding.
# The pandas library provides a built-in function (get_dummies) for this.
# In our data, we convert the "EmploymentStatus" column: each distinct
# status value becomes its own indicator column.


EmpStatus_onehot = pd.get_dummies(prosper_final['EmploymentStatus'])

In [7]:
# Swap the text EmploymentStatus column for its one-hot indicator columns:
# drop the original column, then join the indicator frame on the row index.
prosper_final = (
    prosper_final
    .drop('EmploymentStatus', axis=1)
    .join(EmpStatus_onehot)
)

In [8]:
prosper_final.head()

Unnamed: 0,LoanStatus,BorrowerRate,CreditScore,StatedMonthlyIncome,IncomeVerifiable,Employed,Full-time,Not employed,Other,Part-time,Retired,Self-employed
0,1,0.158,640.0,3083.333333,True,0,0,0,0,0,0,1
1,1,0.1325,640.0,1666.666667,True,0,1,0,0,0,0,0
2,0,0.1435,680.0,2416.666667,True,1,0,0,0,0,0,0
3,0,0.3177,700.0,5500.0,True,0,0,0,1,0,0,0
4,1,0.2075,620.0,3750.0,True,0,1,0,0,0,0,0


In [9]:
len(prosper_final)

49724

In [10]:
# Next, we will convert the "IncomeVerifiable" column into integer values:
# 1 for True and 0 for False.
#
# The whole column is converted in one vectorised call instead of looking up
# every row by index label inside a Python loop; this is faster and does not
# rely on the frame having a 0..n-1 index. The result is still a plain list
# of Python ints, one per row, in row order.

IncomeVerifiable_int = prosper_final['IncomeVerifiable'].astype(int).tolist()
    

In [11]:
# Attach the converted 0/1 values to the frame as a new column.
prosper_final['IncomeVerifiable_int'] = IncomeVerifiable_int

In [12]:
# Drop the original boolean column; its integer copy stays in the new frame.
prosper_final1 = prosper_final.drop(labels='IncomeVerifiable', axis='columns')

In [13]:
prosper_final1.head()

Unnamed: 0,LoanStatus,BorrowerRate,CreditScore,StatedMonthlyIncome,Employed,Full-time,Not employed,Other,Part-time,Retired,Self-employed,IncomeVerifiable_int
0,1,0.158,640.0,3083.333333,0,0,0,0,0,0,1,1
1,1,0.1325,640.0,1666.666667,0,1,0,0,0,0,0,1
2,0,0.1435,680.0,2416.666667,1,0,0,0,0,0,0,1
3,0,0.3177,700.0,5500.0,0,0,0,1,0,0,0,1
4,1,0.2075,620.0,3750.0,0,1,0,0,0,0,0,1


In [14]:
# Use the first 20000 rows as the training set, converted to a float32 NumPy array.

import numpy as np
a = prosper_final1.iloc[:20000].values.astype('float32')

# Every column is now a float value, so we can proceed to training.

In [15]:
# Split the training array into labels and features.
# Labels: the column to predict - "LoanStatus", which is column 0 of `a`.
# Features: every remaining column of `a`.

labels, features = a[:, 0], a[:, 1:]

In [16]:
len(features)

20000

In [17]:
# Upload the training data to S3.
#
# Before running this cell, create a bucket in S3 that begins with the letters
# "sagemaker" and set `bucket` to its name. SageMaker will create the
# subfolders it needs automatically.
# It is important that the bucket is in the same Amazon region as the
# notebook; otherwise you will get an error saying it cannot find data.
#
# You should see an output displaying the location where the training data
# and artifacts will be stored.

import boto3
import sagemaker
import io
import sagemaker.amazon.common as smac
import os

sess = sagemaker.Session()

# Serialize the array and its labels into an in-memory buffer, then rewind
# the buffer so the upload reads it from the start.
# NOTE(review): `a` still contains the LoanStatus column (column 0), so the
# label is written as one of the features, while `features` (a[:, 1:]) is
# not used. Switching to `features` would also require changing feature_dim
# and the array sent for prediction - confirm the intent before changing.
buf = io.BytesIO()
smac.write_numpy_to_dense_tensor(buf, a, labels)
buf.seek(0)

key = 'linearlearner'
# S3 object keys always use '/', so the key is built explicitly rather than
# with os.path.join (which uses the operating system's path separator). The
# same key is reused for the URI so the two cannot drift apart.
train_key = '{}/train/{}'.format(prefix, key)
boto3.resource('s3').Bucket(bucket).Object(train_key).upload_fileobj(buf)
s3_train_data = 's3://{}/{}'.format(bucket, train_key)
print('uploaded training data location: {}'.format(s3_train_data))

output_location = 's3://{}/{}/output'.format(bucket, prefix)
print('training artifacts will be uploaded to: {}'.format(output_location))

uploaded training data location: s3://elephantscale-sagemaker/elephantscale-sagemaker/prosper/train/linearlearner
training artifacts will be uploaded to: s3://elephantscale-sagemaker/elephantscale-sagemaker/prosper/output


In [18]:
# Next, select the algorithm's Docker container image for the region where our notebook is created.
# In our case, it is "us-east-1".

# containers = {
#               'us-east-1': '382416733822.dkr.ecr.us-east-1.amazonaws.com/linear-learner:latest'
#               }

In [19]:
# Look up the linear-learner image for the region of the current boto3 session.
# NOTE(review): newer SageMaker SDK releases may expose this lookup under a
# different API - confirm against the installed SDK version.
from sagemaker.amazon.amazon_estimator import get_image_uri
container = get_image_uri(boto3.Session().region_name, 'linear-learner')

In [20]:
# containers[boto3.Session().region_name]

In [21]:
# Setting up the Logistic Regression Estimator

# Now we begin to set up the Estimator.
# SageMaker will not let you use any of their smaller (i.e. less expensive) images,
# so here we use a single virtual machine of size ml.c4.xlarge
# (the instance type requested in the call below).

# We set up the linear estimator: it trains with the container image and IAM
# role chosen earlier and writes its artifacts to output_location.
linear = sagemaker.estimator.Estimator(container,
                                       role=role, 
                                       train_instance_count=1, 
                                       train_instance_type='ml.c4.xlarge',
                                       output_path=output_location,
                                       sagemaker_session=sess)

In [22]:
%%time

# Now we provide hyperparameters. There are many, like which loss function to use.
# Here we set only the most important ones:
# feature_dim - the number of columns in the array uploaded for training.
#               The uploaded array `a` has 12 columns.
#               NOTE(review): that count includes the LoanStatus column; the
#               `features` array (label removed) has 11 columns - confirm which
#               one is intended.
# mini_batch_size - the number of records in each mini-batch (not the number of batches).
#                   This number should be smaller than the number of records in our training set.
#                   We only have 20000 records, so we take 2000 as batch size.
# predictor_type - we use binary_classifier, which means logistic regression.

linear.set_hyperparameters(feature_dim=12,
                           mini_batch_size=2000,
                           predictor_type='binary_classifier')

# When you run the fit() method Amazon will kick off this job. This will take several minutes to run.
# The 'train' channel points at the training data uploaded to S3.

linear.fit({'train': s3_train_data})

INFO:sagemaker:Creating training-job with name: linear-learner-2019-04-26-06-04-33-916


2019-04-26 06:04:34 Starting - Starting the training job...
2019-04-26 06:04:35 Starting - Launching requested ML instances......
2019-04-26 06:05:42 Starting - Preparing the instances for training.........
2019-04-26 06:07:31 Downloading - Downloading input data
2019-04-26 06:07:31 Training - Downloading the training image.
[31mDocker entrypoint called with argument(s): train[0m
[31m[04/26/2019 06:07:38 INFO 139657021953856] Reading default configuration from /opt/amazon/lib/python2.7/site-packages/algorithm/default-input.json: {u'loss_insensitivity': u'0.01', u'epochs': u'15', u'init_bias': u'0.0', u'lr_scheduler_factor': u'auto', u'num_calibration_samples': u'10000000', u'accuracy_top_k': u'3', u'_num_kv_servers': u'auto', u'use_bias': u'true', u'num_point_for_scaler': u'10000', u'_log_level': u'info', u'quantile': u'0.5', u'bias_lr_mult': u'auto', u'lr_scheduler_step': u'auto', u'init_method': u'uniform', u'init_sigma': u'0.01', u'lr_scheduler_minimum_lr': u'auto', u'target_reca

In [23]:
# Deploying the Model


# When the training job is done, deploy the trained model to an endpoint.
# Remember that Amazon is charging you money for the running endpoint,
# so when you are done, delete your endpoints unless you want to keep being charged.
# Deploying the linear model on a single ml.c4.xlarge instance:

linear_predictor = linear.deploy(initial_instance_count=1,
                                 instance_type='ml.c4.xlarge')

INFO:sagemaker:Creating model with name: linear-learner-2019-04-26-06-08-16-334
INFO:sagemaker:Creating endpoint with name linear-learner-2019-04-26-06-04-33-916


---------------------------------------------------------------------------------------!

In [24]:
# Configure how the predictor talks to the endpoint: requests are sent as
# CSV text and responses are parsed with the JSON deserializer.
from sagemaker.predictor import csv_serializer, json_deserializer

linear_predictor.content_type = 'text/csv'
linear_predictor.serializer = csv_serializer
linear_predictor.deserializer = json_deserializer

In [25]:
# Build the test array from rows 20000-20999 (1000 rows), as float32.
b = prosper_final1.iloc[20000:21000].values.astype('float32')

In [26]:
# Validating the Dataset

# Now we run the prediction. We take 1000 data points and run the prediction.

result = linear_predictor.predict(b[0:1000])

# Print the first 100 predictions. Iterating over a slice (instead of
# indexing with range(100)) also copes with a response that holds fewer
# than 100 entries.
for prediction in result['predictions'][:100]:
    print(prediction)


{'score': 0.9925729036331177, 'predicted_label': 1.0}
{'score': 0.9958631992340088, 'predicted_label': 1.0}
{'score': 0.9938401579856873, 'predicted_label': 1.0}
{'score': 0.9943941831588745, 'predicted_label': 1.0}
{'score': 0.993845522403717, 'predicted_label': 1.0}
{'score': 0.9945579767227173, 'predicted_label': 1.0}
{'score': 0.991969645023346, 'predicted_label': 1.0}
{'score': 0.01213435921818018, 'predicted_label': 0.0}
{'score': 0.9959169030189514, 'predicted_label': 1.0}
{'score': 0.016232261434197426, 'predicted_label': 0.0}
{'score': 0.9966867566108704, 'predicted_label': 1.0}
{'score': 0.9927384257316589, 'predicted_label': 1.0}
{'score': 0.9934595823287964, 'predicted_label': 1.0}
{'score': 0.9929485321044922, 'predicted_label': 1.0}
{'score': 0.015534740872681141, 'predicted_label': 0.0}
{'score': 0.9964721202850342, 'predicted_label': 1.0}
{'score': 0.9968417882919312, 'predicted_label': 1.0}
{'score': 0.007534547708928585, 'predicted_label': 0.0}
{'score': 0.99450832605

In [27]:
# ToDo 1
# To evaluate the model, compute two metrics:
#   1. Precision - the proportion of positive predictions that were actually correct.
#   2. Recall - the proportion of actual positives that were identified correctly.

# 1. Precision formula,
#     precision = TP/(TP+FP)

# 2. Recall formula,
#     recall = TP/(TP+FN)
# where,
# TP(True Positive): an outcome where the model correctly predicts the positive class.
# TN(True Negative): an outcome where the model correctly predicts the negative class.
# FP(False Positive): an outcome where the model incorrectly predicts the positive class.
# FN(False Negative): an outcome where the model incorrectly predicts the negative class.

        
# Counters for the four confusion-matrix outcomes.
# Correct predictions
true_positive = true_negative = 0
# Wrong predictions
false_positive = false_negative = 0

# Compare the first 500 predictions with the actual LoanStatus values; the
# i-th prediction is matched with the row labelled 20000 + i in prosper_final1.
for i in range(500):
    original = float(prosper_final1['LoanStatus'][20000 + i])
    predicted = result['predictions'][i]['predicted_label']
    if original == 1.0 and original == predicted:
        # Actual positive, predicted positive
        true_positive += 1
    elif original == 1.0:
        # Actual positive, predicted otherwise
        false_negative += 1
    elif original == 0.0 and original == predicted:
        # Actual negative, predicted negative
        true_negative += 1
    elif original == 0.0:
        # Actual negative, predicted otherwise
        false_positive += 1
            
    # print (original)        
    # print (predicted)

# print(result)
# print("TP "+str(true_positive) +" : TN "+str(true_negative))
# print("FP "+str(false_positive)+" : FN "+str(false_negative))

precision = ??? # Apply precision formula here and uncomment below line
# print("Precision "+str(precision))

# Output similar to below

# Precision 0.9916546414285714

SyntaxError: invalid syntax (<ipython-input-27-a4bb15e582e3>, line 49)

In [None]:
# ToDo 2

recall = ??? # Apply recall formula here and uncomment below line
# print("Recall "+str(recall))

# Output similar to below

# Recall 1.0