In [None]:
import pandas as pd
import matplotlib as plt
import urllib.request

In [None]:
# Remote location of the credit-default dataset and the local filename it is saved under.
datafile_src = "https://s3-us-west-2.amazonaws.com/ml-training-sfo/Credit_default_dataset.csv"
target_file = "credit_default_dataset.csv"

# Fetch the CSV into the working directory (it is downloaded again on every run).
urllib.request.urlretrieve(url=datafile_src, filename=target_file)

In [None]:
# Load the downloaded CSV into the `credit` DataFrame that the following cells work on.
credit = pd.read_csv(target_file)

In [None]:
# Preview the first rows of the dataset.
credit.head()

In [None]:
# Remove the ID column from the working frame.
# Re-running this cell raises KeyError because the column is already gone.
credit = credit.drop(columns=['ID'])

In [None]:
# Summary statistics for the columns of `credit`.
credit.describe()

In [None]:
# Number of rows for each distinct SEX value.
credit['SEX'].value_counts()

In [None]:
# Number of rows for each distinct MARRIAGE value (before recoding).
credit['MARRIAGE'].value_counts()

In [None]:
# Recode MARRIAGE: value 0 is folded into category 3; 1, 2 and 3 are kept unchanged.
# Values that are not keys of the mapping become NaN (Series.map behaviour).
marriage_recode = {0: 3, 1: 1, 2: 2, 3: 3}
credit["MARRIAGE"] = credit["MARRIAGE"].map(marriage_recode)

# Show the distribution after recoding.
credit["MARRIAGE"].value_counts()

In [None]:
# Distribution of the LIMIT_BAL column, drawn with 100 bins.
credit["LIMIT_BAL"].hist(color="orange", bins=100)

In [None]:


# Box plot of the AGE column.
credit.boxplot(column=['AGE'])



In [None]:
# Count missing values in every column.
credit.isnull().sum(axis=0)

# Understanding feature importance

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

# Separate the predictors from the target column.
X = credit.drop('default.payment.next.month', axis=1)
Y = credit['default.payment.next.month']

# Fit on the full dataset only to inspect feature importances.
# random_state is fixed (same seed value as the other models in this notebook)
# so the reported importances are reproducible across runs.
clf = GradientBoostingClassifier(random_state=10)
clf.fit(X, Y)

In [None]:
# Feature importance scores of the fitted classifier `clf`.
importances = clf.feature_importances_

In [None]:
import numpy as np

# Column names of X, and the positions of the importances sorted from highest to lowest.
features_label = X.columns
indices = np.argsort(importances)[::-1]

# Print rank, feature name and importance score.
# Both the name and the score are looked up through the sorted position; indexing
# the names by the loop counter would pair each score with the wrong column.
for rank, idx in enumerate(indices, start=1):
    print("%2d) %-*s %f" % (rank, 30, features_label[idx], importances[idx]))

In [None]:
# `plt` is bound to the top-level matplotlib package in this notebook, so
# `plt.pyplot` only resolves once the submodule has been imported; import it
# explicitly instead of relying on another library having done so.
import matplotlib.pyplot

plt.pyplot.title('Feature Importances')
plt.pyplot.bar(range(X.shape[1]), importances[indices], color="green", align="center")
# The bars are drawn in descending-importance order, so the tick labels must be
# reordered with the same `indices` to stay attached to the right bar.
plt.pyplot.xticks(range(X.shape[1]), features_label[indices], rotation=90)
plt.pyplot.xlim([-1, X.shape[1]])
plt.pyplot.show()

# Feature engineering and data preparation

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Scale every feature column to the [0, 1] range.
# fit_transform returns a new array and leaves its input untouched, so the result
# has to be written back into `credit` for the later cells to see scaled data.
# The target column is excluded so that it keeps its original values.
target_col = 'default.payment.next.month'
feature_cols = credit.columns.drop(target_col)

# NOTE(review): the scaler is fitted on all rows, before the train/test split;
# fit it on the training rows only if leakage into the test set matters.
scaler = MinMaxScaler()
credit[feature_cols] = scaler.fit_transform(credit[feature_cols])

credit.head()


In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing. random_state is fixed (same seed value
# as the classifiers in this notebook) so the split, and therefore every
# accuracy reported afterwards, is reproducible across runs.
train, test = train_test_split(credit, test_size=0.3, random_state=10)

# Split each partition into predictors (X_*) and target (y_*).
X_train = train.drop('default.payment.next.month', axis=1)
y_train = train['default.payment.next.month']

X_test = test.drop('default.payment.next.month', axis=1)
y_test = test['default.payment.next.month']

In [None]:
import os

# Create the output folders. makedirs also creates the missing parent `data`
# directory, and exist_ok=True keeps the cell re-runnable (os.mkdir fails in
# both situations).
os.makedirs('data/train', exist_ok=True)
os.makedirs('data/test', exist_ok=True)

# Persist both partitions as CSV files without the index column.
train.to_csv("data/train/train.csv", index=False)
test.to_csv("data/test/test.csv", index=False)

# Using Logistic Regression Algorithm

In [None]:
# Benchmark model: logistic regression trained on the training partition.
from sklearn.linear_model import LogisticRegression

clf0 = LogisticRegression(random_state=10)
clf0.fit(X=X_train, y=y_train)

In [None]:
from sklearn.metrics import accuracy_score

# Predict on the held-out rows and report the accuracy of the benchmark model.
y_pred = clf0.predict(X_test)
accuracy_regr = accuracy_score(y_true=y_test, y_pred=y_pred)
print("accuracy:", accuracy_regr)

# Using GradientBoosting algorithm

In [None]:
# Gradient boosting classifier trained on the same training partition.
from sklearn.ensemble import GradientBoostingClassifier

clf2 = GradientBoostingClassifier(random_state=10)
clf2.fit(X=X_train, y=y_train)

In [None]:
# Predict on the held-out rows and report the accuracy of the gradient boosting model.
y_pred2 = clf2.predict(X_test)
accuracy_gb = accuracy_score(y_true=y_test, y_pred=y_pred2)
print("accuracy:", accuracy_gb)

# Using MXNet MLP

In [None]:
import logging

import mxnet as mx
import numpy as np

# Set the root logger to DEBUG so that progress messages emitted during
# training are shown in the notebook.
logger = logging.getLogger()
logger.setLevel(logging.DEBUG)

In [None]:
# Number of rows per mini-batch for both iterators.
Batch_Size = 50

# Wrap the train and test partitions as shuffled MXNet data iterators.
trainIter = mx.io.NDArrayIter(
    data=X_train.values,
    label=y_train.values,
    batch_size=Batch_Size,
    shuffle=True,
)
testIter = mx.io.NDArrayIter(
    data=X_test.values,
    label=y_test.values,
    batch_size=Batch_Size,
    shuffle=True,
)

In [None]:
def build_graph():
    """Build the MLP symbol used for training.

    The network is two 250-unit fully connected layers, each followed by a
    ReLU activation, then a 2-unit fully connected layer feeding a softmax
    output.

    Returns:
        The SoftmaxOutput symbol (named 'softmax') at the head of the network.
    """
    features = mx.sym.Variable('data')
    labels = mx.sym.Variable('softmax_label')

    # Hidden stack: one FullyConnected + ReLU pair per listed width.
    hidden = features
    for width in (250, 250):
        hidden = mx.sym.FullyConnected(data=hidden, num_hidden=width)
        hidden = mx.sym.Activation(data=hidden, act_type="relu")

    # Two output units feeding the softmax.
    logits = mx.sym.FullyConnected(data=hidden, num_hidden=2)

    return mx.sym.SoftmaxOutput(data=logits, label=labels, name='softmax')


# Network symbol used by the following cells.
mlp = build_graph()

In [None]:
# Visualise the structure of the network symbol `mlp`.
mx.viz.plot_network(mlp) 

In [None]:
# Training configuration.
num_epoch = 20
device = mx.cpu()
optimizer = "adam"
eval_metric = "acc"

# Wrap the network symbol (mlp) in a Module that runs on the chosen device.
mlp_model = mx.mod.Module(symbol=mlp, context=device)

# Log progress every 100 batches.
progress_callback = mx.callback.Speedometer(Batch_Size, 100)

# Train for `num_epoch` passes over the training data, evaluating on the test
# iterator and reporting accuracy along the way.
mlp_model.fit(
    train_data=trainIter,
    eval_data=testIter,
    eval_metric=eval_metric,
    optimizer=optimizer,
    optimizer_params={'learning_rate': 0.01},
    batch_end_callback=progress_callback,
    num_epoch=num_epoch,
)
  

In [None]:
# Accuracy of the trained MLP on the test iterator.
metric = mx.metric.Accuracy()
print(mlp_model.score(eval_data=testIter, eval_metric=metric))

# Train using SageMaker training service - Local mode and SageMaker mode

- Download the SageMaker local mode setup file: https://s3-us-west-2.amazonaws.com/ml-training-sfo/setup.sh

- Upload it to the same directory as this notebook.


In [None]:
# Run the SageMaker local mode setup script; setup.sh must already be in this notebook's directory.
!/bin/bash ./setup.sh

In [None]:
import boto3
import sagemaker
from sagemaker import get_execution_role

# SageMaker session and the execution role used by the training jobs.
sagemaker_session = sagemaker.Session()
role = get_execution_role()

# S3 bucket that receives the training data (replace the placeholder).
bucket = '<your bucket name here>'

# S3 location the custom training code is uploaded to (replace the placeholder).
custom_code_upload_location = '< your S3 uri for custom code upload location>'

# S3 location where the results of model training are saved (replace the placeholder).
model_artifacts_location = '< your S3 uri for model artifacts>'

In [None]:
# Upload the local `data` directory to the bucket under the data/creditrisk prefix;
# `inputs` is what the estimators' fit() calls receive later.
inputs = sagemaker_session.upload_data(path='data', bucket=bucket, key_prefix='data/creditrisk')

In [None]:
import subprocess

# Default to CPU local mode.
instance_type = 'local'

# Probe for a GPU by running nvidia-smi. When the binary is not installed,
# subprocess.call raises an OSError (FileNotFoundError) instead of returning a
# non-zero code, so treat that case as "no GPU" rather than crashing the cell.
try:
    has_gpu = subprocess.call('nvidia-smi') == 0
except OSError:
    has_gpu = False

if has_gpu:
    ## Set type to GPU if one is present
    instance_type = 'local_gpu'

print("Instance type = " + instance_type)

# Same call as in the session setup cell; re-assigns `role`.
role = get_execution_role()

- Download the training script: https://s3-us-west-2.amazonaws.com/ml-training-sfo/creditrisk.py
- Upload it to the same directory as this notebook.

In [None]:
# Print the training script so its contents are visible in the notebook.
!cat creditrisk.py

In [None]:
from sagemaker.mxnet import MXNet

# Estimator for local mode: `instance_type` is 'local' or 'local_gpu'.
# Pass train_instance_type='ml.m4.xlarge' instead to train on a hosted instance.
cr_estimator = MXNet(
    entry_point='creditrisk.py',
    role=role,
    train_instance_count=1,
    train_instance_type=instance_type,
    code_location=custom_code_upload_location,
    output_path=model_artifacts_location,
    hyperparameters={'learning_rate': 0.1},
)


In [None]:
# Train with the local mode estimator on the uploaded data.
cr_estimator.fit(inputs)

In [None]:
from sagemaker.mxnet import MXNet

# Same estimator configuration, but training on one ml.m4.xlarge instance
# instead of in local mode.
cr_estimator = MXNet(
    entry_point='creditrisk.py',
    role=role,
    train_instance_count=1,
    train_instance_type='ml.m4.xlarge',
    code_location=custom_code_upload_location,
    output_path=model_artifacts_location,
    hyperparameters={'learning_rate': 0.1},
)


In [None]:
# Train with the ml.m4.xlarge estimator on the uploaded data.
cr_estimator.fit(inputs)

# Evaluating the model performance

In [None]:
import os

# Build the S3 key of the trained model archive: <prefix>/<job name>/output/model.tar.gz.
# NOTE(review): "artifacts" presumably has to match the key prefix used in
# model_artifacts_location - confirm against the value configured there.
prefix = "artifacts"
job_name = cr_estimator._current_job_name
model_path = os.path.join(prefix, job_name, 'output/model.tar.gz')

model_path

In [None]:
# Download the object at key `model_path` from the bucket to downloaded_model.tar.gz.
boto3.resource('s3').Bucket(bucket).download_file(model_path, 'downloaded_model.tar.gz')

In [None]:
# Unpack the downloaded model archive into the current directory.
!tar -xzvf 'downloaded_model.tar.gz'

In [None]:
# Load the checkpoint saved under prefix "model", epoch 0, as a Module.
# `mod` is assigned again in the next cell.
mod = mx.module.Module.load("model", 0)

In [None]:
# Load the symbol and parameters saved under prefix 'model', epoch 0.
sym, arg_params, aux_params = mx.model.load_checkpoint(prefix='model', epoch=0)

ctx = mx.cpu()

# Build a non-training module on the CPU, bind it to the shapes provided by
# the test iterator and load the checkpoint parameters into it.
mod = mx.mod.Module(symbol=sym, label_names=None, context=ctx)
mod.bind(
    data_shapes=testIter.provide_data,
    label_shapes=testIter.provide_label,
    for_training=False,
)
mod.set_params(arg_params=arg_params, aux_params=aux_params, allow_missing=True)

# Evaluate the downloaded model on the test iterator with the accuracy metric.
mod.score(eval_data=testIter, eval_metric=metric)