# XGBoost - Direct Marketing in Banking

### Step 1. Importing Packages

In [1]:
%%time

import os
import boto3
import re
import sagemaker
from sagemaker.predictor import csv_serializer

# Execution role as returned by the SageMaker SDK, and the region of the default boto3 session.
role = sagemaker.get_execution_role()
region = boto3.Session().region_name

# S3 bucket and key prefix for saving code and model artifacts.
# Feel free to specify a different bucket and prefix.
bucket = 'atl-gkrishna-ml'
prefix = 'aws-mlops-lunch-and-learn/part1-sagemaker/xgboost-sagemaker'
# Region-specific URL of the bucket above; it is only printed here for reference.
bucket_path = 'https://s3-{}.amazonaws.com/{}'.format(region, bucket)
print(bucket_path)

https://s3-us-east-1.amazonaws.com/atl-gkrishna-ml
CPU times: user 803 ms, sys: 116 ms, total: 918 ms
Wall time: 2.53 s


### Step 2. Fetching the dataset
The following cells split the data into train, test, and validation datasets and upload the resulting files to S3.

In [2]:
import io
import boto3
import random
import pandas as pd
import json
from sklearn.model_selection import train_test_split

# S3 locations. The bucket and key prefix are taken from the `bucket` / `prefix`
# values defined in the Step 1 cell, so changing them there relocates every
# path used below instead of leaving stale hard-coded copies behind.
s3_bucket = bucket
s3_path_training = '{}/data/training/banking_train.csv'.format(prefix)
s3_path_testing = '{}/data/testing/banking_test.csv'.format(prefix)
s3_path_val = '{}/data/validation/banking_validation.csv'.format(prefix)
s3_model_output = '{}/output'.format(prefix)
data = pd.read_csv('s3://{}/{}/data/raw/banking.csv'.format(s3_bucket, prefix))


## Bringing target column as the first column (requirement for algorithm)
cols = list(data)
cols.insert(0, cols.pop(cols.index('y')))
data = data.loc[:, cols]


## Feature Engineering (from local model development notebook)
categorical_variable_cols = ['job','marital','education','default','housing','loan','contact','month','day_of_week','poutcome']

# One-hot encode every categorical column and append the indicator columns
# (named "<column>_<value>") to the frame. The original categorical columns
# are still present at this point; they are filtered out just below.
for var in categorical_variable_cols:
    data = data.join(pd.get_dummies(data[var], prefix=var))

## Creating fields to keep: everything except the raw categorical columns
data_vars = data.columns.values.tolist()
to_keep = [i for i in data_vars if i not in categorical_variable_cols]

data_final = data[to_keep]
print("Fields - after categorical encoding:")
print(data_final.columns.values)

# Feature subset used for training.
columns_final = ["previous", "euribor3m", "job_blue-collar", "job_retired", "job_services", "job_student", "default_no",
      "month_aug", "month_dec", "month_jul", "month_nov", "month_oct", "month_sep", "day_of_week_fri", "day_of_week_wed",
      "poutcome_failure", "poutcome_nonexistent", "poutcome_success"]
col_dict = {}
col_dict['final_features'] = columns_final

# Persist the selected feature names next to the notebook as JSON.
with open('final_features.json', 'w') as fp:
    json.dump(col_dict, fp)

# Feature matrix and target column handed to data_split().
X = data_final[columns_final]
y = data_final['y']



Fields - after categorical encoding:
['y' 'age' 'duration' 'campaign' 'pdays' 'previous' 'emp_var_rate'
 'cons_price_idx' 'cons_conf_idx' 'euribor3m' 'nr_employed' 'job_admin.'
 'job_blue-collar' 'job_entrepreneur' 'job_housemaid' 'job_management'
 'job_retired' 'job_self-employed' 'job_services' 'job_student'
 'job_technician' 'job_unemployed' 'job_unknown' 'marital_divorced'
 'marital_married' 'marital_single' 'marital_unknown' 'education_basic.4y'
 'education_basic.6y' 'education_basic.9y' 'education_high.school'
 'education_illiterate' 'education_professional.course'
 'education_university.degree' 'education_unknown' 'default_no'
 'default_unknown' 'default_yes' 'housing_no' 'housing_unknown'
 'housing_yes' 'loan_no' 'loan_unknown' 'loan_yes' 'contact_cellular'
 'contact_telephone' 'month_apr' 'month_aug' 'month_dec' 'month_jul'
 'month_jun' 'month_mar' 'month_may' 'month_nov' 'month_oct' 'month_sep'
 'day_of_week_fri' 'day_of_week_mon' 'day_of_week_thu' 'day_of_week_tue'
 'day_of_

In [3]:
from io import StringIO # python3; python2: BytesIO 
import boto3
import s3fs

def data_split(X, y, test_size=0.1, val_size=0.125, random_state=123):
    """Split features and target into train, test and validation frames.

    The data is split twice with ``train_test_split``: first a test set is
    held out, then a validation set is carved out of what remains. Each
    returned frame has the target as its first column, followed by the
    feature columns.

    Args:
        X: Feature frame.
        y: Target series aligned with ``X``.
        test_size: ``test_size`` passed to the first split (applied to all rows).
        val_size: ``test_size`` passed to the second split. It applies to the
            rows left after the test split, not to the full data set.
        random_state: Seed passed to both splits so the result is repeatable.

    Returns:
        Tuple ``(data_train, data_test, data_val)``.
    """
    X_rest, X_test, y_rest, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state)
    X_train, X_val, y_train, y_val = train_test_split(
        X_rest, y_rest, test_size=val_size, random_state=random_state)

    # Target first, features after.
    data_train = pd.concat([y_train, X_train], axis=1, sort=False)
    data_test = pd.concat([y_test, X_test], axis=1, sort=False)
    data_val = pd.concat([y_val, X_val], axis=1, sort=False)
    return data_train, data_test, data_val



In [4]:
# Split X / y into the three frames; data_split returns them in train, test, validation order.
data_train,data_test,data_val = data_split(X, y)

In [5]:
# Show the (rows, columns) shape of the training and validation frames.
print(data_train.shape)
print(data_val.shape)

(32435, 19)
(4634, 19)


In [6]:
# Write each split to a local CSV file without a header row and without the index column.
data_train.to_csv('banking_train.csv', header=False, index=False)
data_test.to_csv('banking_test.csv', header=False, index=False)
data_val.to_csv('banking_validation.csv', header=False, index=False)

In [7]:
# Upload each local CSV split to its S3 key. upload_file() returns None, so the
# results are intentionally not bound to variables: the previous s3_input_*
# names held None and two of them were attached to the wrong split.
upload_bucket = boto3.Session().resource('s3').Bucket(s3_bucket)
upload_bucket.Object(s3_path_training).upload_file('banking_train.csv')
upload_bucket.Object(s3_path_testing).upload_file('banking_test.csv')
upload_bucket.Object(s3_path_val).upload_file('banking_validation.csv')

In [8]:
# Describe the training and validation inputs as CSV objects in S3.
# The validation input points at s3_path_val (the validation split, not the test split).
s3_input_train = sagemaker.s3_input(s3_data='s3://{}/{}'.format(s3_bucket, s3_path_training), content_type='csv')
s3_input_validation = sagemaker.s3_input(s3_data='s3://{}/{}'.format(s3_bucket, s3_path_val), content_type='csv')

### Step 3. Training the XGBoost model

In [9]:
from sagemaker.amazon.amazon_estimator import get_image_uri

# SageMaker session; it is passed to the Estimator as sagemaker_session.
sess = sagemaker.Session()

# Look up the XGBoost training image for this region. No version argument is passed.
# NOTE(review): the output recorded under this cell shows the form
# get_image_uri(region, 'xgboost', '0.90-1') - confirm whether a version should be pinned.
container = get_image_uri(region, 'xgboost')

	get_image_uri(region, 'xgboost', '0.90-1').


In [10]:
# Build the estimator from the container image and execution role: one
# ml.m4.xlarge training instance, with output_path under s3_model_output
# in s3_bucket.
xgb = sagemaker.estimator.Estimator(container,
                                    role, 
                                    train_instance_count=1, 
                                    train_instance_type='ml.m4.xlarge',
                                    output_path='s3://{}/{}'.format(s3_bucket, s3_model_output),
                                    sagemaker_session=sess)

In [11]:
# Hyperparameters for the training job: objective is binary:logistic, the
# evaluation metric is auc, and num_round is 100.
# NOTE(review): the remaining values look hand-tuned; their source is not shown here.
xgb.set_hyperparameters(max_depth=2,
                        eta=0.2,
                        gamma=4,
                        min_child_weight=6,
                        subsample=0.8,
                        silent=0,
                        objective="binary:logistic",
                        num_round=100,
                        eval_metric="auc")

In [12]:
# Start the training job with the 'train' and 'validation' channels defined above.
xgb.fit({'train': s3_input_train, 'validation': s3_input_validation})

2020-06-19 15:27:30 Starting - Starting the training job...
2020-06-19 15:27:32 Starting - Launching requested ML instances.........
2020-06-19 15:29:17 Starting - Preparing the instances for training......
2020-06-19 15:30:11 Downloading - Downloading input data...
2020-06-19 15:30:44 Training - Downloading the training image.[34mArguments: train[0m
[34m[2020-06-19:15:31:04:INFO] Running standalone xgboost training.[0m
[34m[2020-06-19:15:31:04:INFO] File size need to be processed in the node: 1.55mb. Available memory size in the node: 8496.57mb[0m
[34m[2020-06-19:15:31:04:INFO] Determined delimiter of CSV input is ','[0m
[34m[15:31:04] S3DistributionType set as FullyReplicated[0m
[34m[15:31:05] 32435x18 matrix with 583830 entries loaded from /opt/ml/input/data/train?format=csv&label_column=0&delimiter=,[0m
[34m[2020-06-19:15:31:05:INFO] Determined delimiter of CSV input is ','[0m
[34m[15:31:05] S3DistributionType set as FullyReplicated[0m
[34m[15:31:05] 4634x18 matrix


2020-06-19 15:31:17 Uploading - Uploading generated training model
2020-06-19 15:31:17 Completed - Training job completed
Training seconds: 66
Billable seconds: 66


### Step 4. Hosting Model to an Endpoint

In [13]:
# Deploy the trained model on one ml.m4.xlarge instance and keep the returned predictor.
xgb_predictor = xgb.deploy(initial_instance_count=1, instance_type='ml.m4.xlarge')

---------------!

### Step 5. Predictions using the hosted model

In [14]:
import numpy as np

# Send requests as CSV and keep the raw response body (no deserializer).
xgb_predictor.content_type = 'text/csv'
xgb_predictor.serializer = csv_serializer
xgb_predictor.deserializer = None


def predict(data, rows=500):
    """Score ``data`` against the hosted endpoint in batches.

    Args:
        data: 2-D array of feature rows (no label column).
        rows: Approximate batch size; the data is cut into
            ``int(len(data) / rows + 1)`` roughly equal pieces.

    Returns:
        1-D NumPy array with the values parsed from the comma-separated
        endpoint responses, in batch order.
    """
    split_array = np.array_split(data, int(data.shape[0] / float(rows) + 1))
    # Each response is a UTF-8 encoded, comma-separated string of scores.
    responses = [xgb_predictor.predict(array).decode('utf-8') for array in split_array]
    return np.fromstring(','.join(responses), sep=',')


# Drop the first column (the label 'y') before scoring. `.values` replaces
# DataFrame.as_matrix(), which is deprecated and no longer exists in pandas 1.0+.
predictions = predict(data_test.values[:, 1:])




In [18]:
# Turn the scores into class labels: 1 where the score is above 0.5, otherwise 0.
final_predictions = list(np.where(predictions > 0.5, 1, 0))
print("First 20 Final Predictions: ",final_predictions[:20])

First 20 Final Predictions:  [1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


In [19]:
# Actual labels: the 'y' column of the test frame, as a plain list.
Y_test_act = list(data_test['y'])

### Step 6. Measuring Model Performance

In [20]:
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score


In [21]:
# Compute the three binary-classification metrics on the test labels first ...
precision = precision_score(Y_test_act, final_predictions, average='binary')
recall = recall_score(Y_test_act, final_predictions, average='binary')
score = f1_score(Y_test_act, final_predictions, average='binary')

# ... then report them: precision and recall as percentages, F-measure to three decimals.
print('Precision: {}%'.format(precision*100))
print('Recall: {}%'.format(recall*100))
print('F-Measure: %.3f' % score)

Precision: 67.24137931034483%
Recall: 16.52542372881356%
F-Measure: 0.265


### Step 7. Deleting Endpoint

In [103]:
import boto3

# Delete the endpoint created by xgb.deploy() in Step 4. The name is read from
# the predictor rather than hard-coded, so a fresh run removes the endpoint it
# actually created instead of targeting one left over from an earlier session.
sm = boto3.Session(region_name=region).client('sagemaker')
sm.delete_endpoint(EndpointName=xgb_predictor.endpoint)

{'ResponseMetadata': {'RequestId': 'f5c70d17-05ef-4b9a-ab0b-af8fa9887080',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': 'f5c70d17-05ef-4b9a-ab0b-af8fa9887080',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '0',
   'date': 'Tue, 16 Jun 2020 16:55:30 GMT'},
  'RetryAttempts': 0}}