In [31]:
%load_ext sagemaker_studio_analytics_extension.magics
%sm_analytics emr connect --cluster-id --auth-type None --language python  
# Add the EMR cluster ID after "--cluster-id". The ID starts with "j-" and can be found via the Cluster button in the SageMaker Studio notebook or in the EMR console.

The sagemaker_studio_analytics_extension.magics extension is already loaded. To reload it, use:
  %reload_ext sagemaker_studio_analytics_extension.magics
Successfully read emr cluster(j-3IMB88WP82OKD) details
Initiating EMR connection..
Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,User,Current session?
0,application_1683677432071_0001,pyspark,idle,Link,Link,,✔


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

SparkSession available as 'spark'.
{"namespace": "sagemaker-analytics", "cluster_id": "j-3IMB88WP82OKD", "error_message": null, "success": true, "service": "emr", "operation": "connect"}


In [32]:
%%configure -f 

{
  "conf": {
    "spark.jars.packages": "io.delta:delta-core_2.12:2.0.0",
    "spark.sql.extensions": "io.delta.sql.DeltaSparkSessionExtension",
    "spark.sql.catalog.spark_catalog": "org.apache.spark.sql.delta.catalog.DeltaCatalog"
  }
}

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,User,Current session?
1,application_1683677432071_0002,pyspark,idle,Link,Link,,✔


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

SparkSession available as 'spark'.


ID,YARN Application ID,Kind,State,Spark UI,Driver log,User,Current session?
1,application_1683677432071_0002,pyspark,idle,Link,Link,,✔


# Training SageMaker XGBoost Model on Delta Lake (Loan Risk Data)

<img src="img/SageMaker-300x150.png" width=200/>

This companion notebook shows how to train a SageMaker model on data stored in Delta Lake, using the Lending Club loan data as the example.
* This notebook has been tested with *EMR 6.6.0, SparkMagic Kernel*

In [25]:
from pyspark.sql import SparkSession
import pandas as pd
import numpy as np

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [26]:
# Raise pandas' display limits so wide or long frames are not truncated when shown.
pd.set_option('display.max_columns', 100)  # show up to 100 columns; change the number to see more or fewer
pd.set_option('display.max_rows', 1000)  # show up to 1000 rows; change the number to see more or fewer

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

<img src="img/Delta-Lake-Multi-Hop-Architecture-Bronze.png"/>

In [27]:
# Location of the raw Delta table to read. This is a placeholder and must be
# filled in before the cell is run.
FILES = ""

# Fail fast with an actionable message when the placeholder was left empty,
# instead of letting the empty path surface as an opaque Spark error.
if not FILES:
    raise ValueError(
        "FILES is empty: set it to the location of the raw Delta table before running this cell."
    )

# Read the Delta table into a Spark DataFrame
data = spark.read.format("delta").load(FILES)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [28]:
# Print each column's name, type and nullability for the DataFrame loaded above.
data.printSchema()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

root
 |-- id: string (nullable = true)
 |-- member_id: string (nullable = true)
 |-- loan_amnt: float (nullable = true)
 |-- funded_amnt: integer (nullable = true)
 |-- funded_amnt_inv: double (nullable = true)
 |-- term: string (nullable = true)
 |-- int_rate: string (nullable = true)
 |-- installment: double (nullable = true)
 |-- grade: string (nullable = true)
 |-- sub_grade: string (nullable = true)
 |-- emp_title: string (nullable = true)
 |-- emp_length: string (nullable = true)
 |-- home_ownership: string (nullable = true)
 |-- annual_inc: float (nullable = true)
 |-- verification_status: string (nullable = true)
 |-- loan_status: string (nullable = true)
 |-- pymnt_plan: string (nullable = true)
 |-- url: string (nullable = true)
 |-- desc: string (nullable = true)
 |-- purpose: string (nullable = true)
 |-- title: string (nullable = true)
 |-- zip_code: string (nullable = true)
 |-- addr_state: string (nullable = true)
 |-- dti: float (nullable = true)
 |-- delinq_2yrs: float

## ![Delta Lake Tiny Logo](https://pages.databricks.com/rs/094-YMS-629/images/delta-lake-tiny-logo.png) Munge Data - create feature columns


In [10]:
# Import the functions module under an alias rather than `import *`: the star
# import would shadow Python builtins such as round/sum/min/max for every
# later cell in this session.
from pyspark.sql import functions as F

print("------------------------------------------------------------------------------------------------")
print("Create bad loan label, this will include charged off, defaulted, and late repayments on loans...")
# Keep only the three listed statuses; every kept row that is not "Fully Paid"
# is labelled as a bad loan. The boolean is stored as a string column.
# NOTE(review): the message above mentions late repayments, but the filter
# below keeps no late status -- confirm which of the two is intended.
data = (
    data.filter(F.col("loan_status").isin(["Default", "Charged Off", "Fully Paid"]))
    .withColumn("bad_loan", (~(F.col("loan_status") == "Fully Paid")).cast("string"))
)

print("------------------------------------------------------------------------------------------------")
print("Turning string interest rate and revoling util columns into numeric columns...")
# Strip the percent sign before casting. The year is taken as the 4 characters
# starting at position 5 -- assumes dates look like "Mon-YYYY"; TODO confirm.
data = (
    data.withColumn('int_rate', F.regexp_replace('int_rate', '%', '').cast('float'))
    .withColumn('revol_util', F.regexp_replace('revol_util', '%', '').cast('float'))
    .withColumn('issue_year', F.substring('issue_d', 5, 4).cast('double'))
    .withColumn('earliest_year', F.substring('earliest_cr_line', 5, 4).cast('double'))
)
data = data.withColumn('credit_length_in_years', F.col('issue_year') - F.col('earliest_year'))


print("------------------------------------------------------------------------------------------------")
print("Converting emp_length column into numeric...")
# Three passes: drop the trailing text (or "n/a"), map "< 1" to 0, map "10+" to 10.
data = data.withColumn('emp_length', F.trim(F.regexp_replace('emp_length', "([ ]*+[a-zA-Z].*)|(n/a)", "")))
data = data.withColumn('emp_length', F.trim(F.regexp_replace('emp_length', "< 1", "0")))
data = data.withColumn('emp_length', F.trim(F.regexp_replace('emp_length', "10\\+", "10")).cast('float'))

print("------------------------------------------------------------------------------------------------")
print("Map multiple levels into one factor level for verification_status...")
data = data.withColumn(
    'verification_status',
    F.trim(F.regexp_replace('verification_status', 'Source Verified', 'Verified')),
)

print("------------------------------------------------------------------------------------------------")
print("Calculate the total amount of money earned or lost per loan...")
# F.round is Spark's column-wise round, not Python's builtin.
data = data.withColumn('net', F.round(F.col('total_pymnt') - F.col('loan_amnt'), 2))
# Expose the munged frame to %%sql cells under the name rawDataView.
data.createOrReplaceTempView("rawDataView")

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

------------------------------------------------------------------------------------------------
Create bad loan label, this will include charged off, defaulted, and late repayments on loans...
------------------------------------------------------------------------------------------------
Turning string interest rate and revoling util columns into numeric columns...
------------------------------------------------------------------------------------------------
Converting emp_length column into numeric...
------------------------------------------------------------------------------------------------
Map multiple levels into one factor level for verification_status...
------------------------------------------------------------------------------------------------
Calculate the total amount of money earned or lost per loan...

#### You can run SparkSQL queries with the `%%sql` magic and save the results to a local DataFrame with `-o`. This allows for quick data exploration. By default at most 2,500 rows are returned; use the `-n` argument to change that limit.

In [11]:
%%sql -o raw_data
select * from rawDataView

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

VBox(children=(HBox(children=(HTML(value='Type:'), Button(description='Table', layout=Layout(width='70px'), st…

Output()

#### You can then access and explore the query result locally as a pandas DataFrame

In [12]:
%%local
raw_data[["net","verification_status","int_rate", "revol_util", "issue_year", "earliest_year", "bad_loan", "credit_length_in_years", "emp_length"]]

VBox(children=(HBox(children=(HTML(value='Type:'), Button(description='Table', layout=Layout(width='70px'), st…

Output()

### Set Response & Predictor Variables

In [13]:
print("------------------------------------------------------------------------------------------------")
print("Setting variables to predict bad loans")

# Response (label) column.
myY = "bad_loan"

# Predictor columns, kept in two lists because only the categorical ones are
# integer-encoded later on.
categoricals = [
    "term",
    "home_ownership",
    "purpose",
    "addr_state",
    "verification_status",
    "application_type",
]
numerics = [
    "loan_amnt",
    "emp_length",
    "annual_inc",
    "dti",
    "delinq_2yrs",
    "revol_util",
    "total_acc",
    "credit_length_in_years",
]
myX = [*categoricals, *numerics]

# Predictors, label and three extra columns, also exposed to %%sql as loanStatsView.
loan_stats = data.select(*myX, myY, "int_rate", "net", "issue_year")
loan_stats.createOrReplaceTempView("loanStatsView")

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

------------------------------------------------------------------------------------------------
Setting variables to predict bad loans

In [14]:
%%sql -o loan_stats
select * from loanStatsView

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

VBox(children=(HBox(children=(HTML(value='Type:'), Button(description='Table', layout=Layout(width='70px'), st…

Output()

### Create Encoded Train / Validation Split for SageMaker XGBoost Model

In [92]:
# Tunables for the split. A fixed seed makes the shuffle, and therefore the
# train/validation membership, reproducible across runs.
SPLIT_SEED = 42
TRAIN_FRACTION = 0.8

# SageMaker XGBoost has the convention of label in the first column, so select
# the label first and then only the declared predictors (myX).
# The other columns of loan_stats are deliberately left out of the training
# data: `net` is computed from total_pymnt, so training on it would leak the
# outcome into the features.
# The result goes into a new name so that loan_stats stays a Spark DataFrame
# and this cell can be re-run without failing on a second .toPandas().
loan_stats_pd = loan_stats.select([myY] + myX).toPandas()

# Integer-encode the categorical predictors and the label via pandas category codes.
loan_stats_pd[categoricals] = loan_stats_pd[categoricals].apply(
    lambda column: column.astype("category").cat.codes
)
loan_stats_pd[myY] = loan_stats_pd[myY].astype("category").cat.codes

# Shuffle all rows, then cut positionally into train / validation frames.
shuffled = loan_stats_pd.sample(frac=1, random_state=SPLIT_SEED)
n_train = int(TRAIN_FRACTION * len(shuffled))
train = shuffled.iloc[:n_train]
valid = shuffled.iloc[n_train:]

print("training data size = %d | validation data size = %d" % (len(train), len(valid)))
train_df = spark.createDataFrame(train)
valid_df = spark.createDataFrame(valid)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

### Write Train / Validation data to S3 in Parquet

In [107]:
# Destinations of the train and validation datasets. Both are placeholders and
# must be filled in with full S3 locations before running this cell.
# They must be two different locations (each write uses mode "overwrite"), and
# they need to be the locations that the tuning job's "train" and "validation"
# input channels read from.
TRAIN_OUTPUT_URI = "s3://"
VALID_OUTPUT_URI = "s3://"

# Fail fast on unfilled placeholders instead of handing Spark a bucket-less path.
if "s3://" in (TRAIN_OUTPUT_URI, VALID_OUTPUT_URI):
    raise ValueError(
        "Set TRAIN_OUTPUT_URI and VALID_OUTPUT_URI to full S3 locations before writing."
    )
# Writing both datasets to one location would silently replace train with validation.
if TRAIN_OUTPUT_URI == VALID_OUTPUT_URI:
    raise ValueError(
        "TRAIN_OUTPUT_URI and VALID_OUTPUT_URI must differ; "
        "otherwise the validation write overwrites the training data."
    )

# coalesce(1) collapses each dataset to a single partition before writing.
train_df.coalesce(1).write.mode("overwrite").parquet(TRAIN_OUTPUT_URI)
valid_df.coalesce(1).write.mode("overwrite").parquet(VALID_OUTPUT_URI)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

## ![SageMaker Tiny Logo](img/sagemaker_tiny.png) Train SageMaker XGBoost Model using Automated Model Tuning

To create a tuning job using the AWS SageMaker Automatic Model Tuning API, you need to define 3 attributes.

1. Tuning job name (string)

2. Tuning job config (to specify settings for the hyperparameter tuning job - JSON object)

3. Training job definition (to configure the training jobs that the tuning job launches - JSON object).

To learn more about that, refer to the [Configure and Launch a Hyperparameter Tuning Job documentation](https://docs.aws.amazon.com/sagemaker/latest/dg/automatic-model-tuning-ex-tuning-job.html).

To learn more about using XGBoost in SageMaker, refer to [Valid Inputs for SageMaker XGBoost](https://docs.aws.amazon.com/sagemaker/latest/dg/xgboost_hyperparameters.html)


#### Note that the tuning job will take between 7 and 10 minutes to complete.

In [27]:
%%local 

import io
import os
import boto3
import sagemaker

# Execution role passed to the SageMaker jobs, and the region of the current session.
role = sagemaker.get_execution_role()
region = boto3.Session().region_name

# S3 bucket for saving code and model artifacts.
# Feel free to specify a different bucket here if you wish.
bucket = sagemaker.Session().default_bucket()
# Key prefix inside the bucket; customize it to where you would like to store the data.
prefix = "vedjain-deltalake-2022/lending_club/sagemaker-xg-boost"
bucket_path = "https://s3-{}.amazonaws.com/{}".format(region, bucket)
# Resolve the XGBoost image for the session's region instead of a fixed
# "us-east-2", so the image matches the region that `client` and bucket_path use.
container = sagemaker.image_uris.retrieve("xgboost", region, "1.7-1")
client = boto3.client("sagemaker", region_name=region)

In [23]:
%%local
import time
from time import gmtime, strftime, sleep

# Fixed prefix plus a UTC day-hour-minute-second stamp, so each launch gets its own name.
tuning_job_name = f"DEMO-xgboost-parquet-{strftime('%d-%H-%M-%S', gmtime())}"

def _param_range(name, min_value, max_value):
    """Return one parameter-range entry; bounds are passed through unchanged (as strings here)."""
    return {"MaxValue": max_value, "MinValue": min_value, "Name": name}


# Settings of the hyperparameter tuning job: search space, resource limits,
# search strategy and the objective metric.
tuning_job_config = {
    "ParameterRanges": {
        "CategoricalParameterRanges": [],
        "ContinuousParameterRanges": [
            _param_range("eta", "0.1", "0.5"),
            _param_range("gamma", "0", "5"),
            _param_range("min_child_weight", "0", "120"),
            _param_range("subsample", "0.5", "1"),
            _param_range("alpha", "0", "2"),
        ],
        "IntegerParameterRanges": [
            _param_range("max_depth", "0", "10"),
            _param_range("num_round", "1", "4000"),
        ],
    },
    # SageMaker sets the following default limits for resources used by automatic model tuning:
    # https://docs.aws.amazon.com/sagemaker/latest/dg/automatic-model-tuning-limits.html
    "ResourceLimits": {
        # Increase the max number of training jobs for increased accuracy (and training time).
        "MaxNumberOfTrainingJobs": 8,
        # Change parallel training jobs run by AMT to reduce total training time. Constrained by your account limits.
        # if max_jobs=max_parallel_jobs then Bayesian search turns to Random.
        "MaxParallelTrainingJobs": 7,
    },
    "Strategy": "Bayesian",
    # Minimise the validation error reported by the training jobs.
    "HyperParameterTuningJobObjective": {
        "MetricName": "validation:error",
        "Type": "Minimize",
    },
}

def _parquet_channel(channel_name, object_name):
    """Return one input-channel entry for a Parquet dataset located under bucket_path/prefix."""
    return {
        "ChannelName": channel_name,
        "DataSource": {
            "S3DataSource": {
                "S3DataType": "S3Prefix",
                "S3Uri": f"{bucket_path}/{prefix}/{object_name}",
                "S3DataDistributionType": "FullyReplicated",
            }
        },
        "ContentType": "application/x-parquet",
        "CompressionType": "None",
    }


# Definition of the training jobs that the tuning job launches: image, input
# channels, output location, instances, role, fixed hyperparameters and time limit.
training_job_definition = {
    "AlgorithmSpecification": {
        "TrainingImage": container,
        "TrainingInputMode": "File",
    },
    "InputDataConfig": [
        _parquet_channel("train", "train.parquet"),
        _parquet_channel("validation", "validation.parquet"),
    ],
    "OutputDataConfig": {"S3OutputPath": f"{bucket_path}/{prefix}/single-xgboost"},
    "ResourceConfig": {
        "InstanceCount": 4,
        "InstanceType": "ml.m5.2xlarge",
        "VolumeSizeInGB": 5,
    },
    "RoleArn": role,
    # Hyperparameters held constant across all training jobs of the tuning job.
    "StaticHyperParameters": {
        "objective": "binary:logistic",
        "verbosity": "2",
    },
    "StoppingCondition": {"MaxRuntimeInSeconds": 43200},
}

print(
    f"Creating a tuning job with name: {tuning_job_name}. It will take between 7 and 10 minutes to complete."
)
client.create_hyper_parameter_tuning_job(
    HyperParameterTuningJobName=tuning_job_name,
    HyperParameterTuningJobConfig=tuning_job_config,
    TrainingJobDefinition=training_job_definition,
)

# Statuses in which the job is still running or winding down. Any other status
# ends the wait, so a stopped job no longer polls forever (previously only
# "Completed" and "Failed" ended the loop).
ACTIVE_TUNING_STATES = ("InProgress", "Stopping")
POLL_INTERVAL_SECONDS = 60

# Poll once immediately, then once per interval, printing every observed status.
while True:
    tuning_job_description = client.describe_hyper_parameter_tuning_job(
        HyperParameterTuningJobName=tuning_job_name
    )
    status = tuning_job_description["HyperParameterTuningJobStatus"]
    print(status)
    if status not in ACTIVE_TUNING_STATES:
        break
    time.sleep(POLL_INTERVAL_SECONDS)

# Stop here with the reason when the job did not succeed, rather than letting a
# later cell fail while looking up the best training job.
if status != "Completed":
    failure_reason = tuning_job_description.get("FailureReason", "no failure reason reported")
    raise RuntimeError(
        f"Tuning job {tuning_job_name} ended with status {status}: {failure_reason}"
    )

Creating a tuning job with name: DEMO-xgboost-parquet-03-18-18-53. It will take between 7 and 10 minutes to complete.
InProgress
InProgress
InProgress
InProgress
InProgress
InProgress
InProgress
InProgress
InProgress
InProgress
InProgress
InProgress
InProgress
InProgress
InProgress
InProgress
Completed


### Monitor Experiment Runs in SageMaker Training Console

<img src="img/sagemaker-training-xgboost.png"/>

### Plot Objective Metric

In [33]:
%%local 
from sagemaker.analytics import TrainingJobAnalytics

# Describe the tuning job and take the name of the training job it reports as best.
tuning_job_summary = client.describe_hyper_parameter_tuning_job(
    HyperParameterTuningJobName=tuning_job_name
)
training = tuning_job_summary["BestTrainingJob"]["TrainingJobName"]

# Metric to retrieve for that training job.
metric_name = "validation:error"

# Collect the recorded values of the metric into a DataFrame.
best_job_analytics = TrainingJobAnalytics(
    training_job_name=training,
    metric_names=[metric_name],
)
metrics_dataframe = best_job_analytics.dataframe()

# Last expression of the cell: display the metrics table.
metrics_dataframe

VBox(children=(HBox(children=(HTML(value='Type:'), Button(description='Table', layout=Layout(width='70px'), st…

Output()

## ![SageMaker Tiny Logo](img/sagemaker_tiny.png) Register SageMaker Model to Model Group

In [37]:
%%local
# S3 location of the trained model artifact to register. This is a placeholder:
# copy the path from the SageMaker training dashboard; the artifact file should
# be named model.tar.gz.
MODEL_OUTPUT_PATH = "s3://"

In [38]:
%%local 

# Specify the model source (the artifact location set in MODEL_OUTPUT_PATH)
model_url = MODEL_OUTPUT_PATH
model_package_group_name = "LendingClub-BadLoans-BinaryClassification"

# Inference specification of the model package.
# The image is taken from `container`, the URI already resolved for training,
# instead of repeating a hard-coded region-specific ECR URI that could drift
# from it.
# NOTE(review): confirm that the serving container accepts and returns
# application/x-parquet before relying on these declared content types.
modelpackage_inference_specification = {
    "InferenceSpecification": {
        "Containers": [
            {
                "Image": container,
                "ModelDataUrl": model_url,
            }
        ],
        "SupportedContentTypes": ["application/x-parquet"],
        "SupportedResponseMIMETypes": ["application/x-parquet"],
    }
}

# Request payload for create_model_package: group, description and approval
# status, merged with the inference specification above.
create_model_package_input_dict = {
    "ModelPackageGroupName": model_package_group_name,
    "ModelPackageDescription": "Model to predict if the loan will turn out bad or good",
    "ModelApprovalStatus": "PendingManualApproval",
}
create_model_package_input_dict.update(modelpackage_inference_specification)

In [39]:
%%local
# Register the model as a new version in the model package group and report its ARN.
create_model_package_response = client.create_model_package(
    **create_model_package_input_dict
)
model_package_arn = create_model_package_response["ModelPackageArn"]
print(f"ModelPackage Version ARN : {model_package_arn}")

ModelPackage Version ARN : arn:aws:sagemaker:us-east-2:850751315356:model-package/lendingclub-badloans-binaryclassification/5
