In [2]:
# Copyright 2020 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

In [2]:
# Upgrade scikit-learn to the latest release, then restart the kernel so the
# upgraded package is the one that gets imported.
# NOTE(review): the normalized confusion-matrix cell passes `normalize=` to
# sklearn.metrics.confusion_matrix, which is presumably why the upgrade is
# needed — confirm the minimum version and consider pinning it.
!pip install -U scikit-learn

Requirement already up-to-date: scikit-learn in /opt/conda/anaconda/lib/python3.6/site-packages (0.23.1)


# Mortgage XGBoost - GPUs

Based on notebooks from https://github.com/rapidsai/spark-examples

Learn more about RAPIDS-Spark XGBoost4J here: https://news.developer.nvidia.com/gpu-accelerated-spark-xgboost/

### Include pyspark methods used in notebook

In [3]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.feature import VectorAssembler
from pyspark.sql.types import FloatType, IntegerType, StructField, StructType
from pyspark.sql.functions import col
from time import time

### Read data using GPU data reader

This is a custom-built reader created by NVIDIA to read data using GPUs.

In Spark 2.4 you need to include the GpuDataReader method, but from Spark 3.0+ you will be able to use the native Spark read method.

In [4]:
from ml.dmlc.xgboost4j.scala.spark import XGBoostClassificationModel, XGBoostClassifier
from ml.dmlc.xgboost4j.scala.spark.rapids import GpuDataReader

### Set label used for model

In [5]:
# Name of the target column: it is excluded from `features` and set as the
# classifier's label column.
label = 'delinquency_12'

### Create a list of the features used for the model

In [6]:
# (column name, Spark type class) pairs; the label column is listed last.
column_types = [
    ('orig_channel', FloatType),
    ('first_home_buyer', FloatType),
    ('loan_purpose', FloatType),
    ('property_type', FloatType),
    ('occupancy_status', FloatType),
    ('property_state', FloatType),
    ('product_type', FloatType),
    ('relocation_mortgage_indicator', FloatType),
    ('seller_name', FloatType),
    ('mod_flag', FloatType),
    ('orig_interest_rate', FloatType),
    ('orig_upb', IntegerType),
    ('orig_loan_term', IntegerType),
    ('orig_ltv', FloatType),
    ('orig_cltv', FloatType),
    ('num_borrowers', FloatType),
    ('dti', FloatType),
    ('borrower_credit_score', FloatType),
    ('num_units', IntegerType),
    ('zip', IntegerType),
    ('mortgage_insurance_percent', FloatType),
    ('current_loan_delinquency_status', IntegerType),
    ('current_actual_upb', FloatType),
    ('interest_rate', FloatType),
    ('loan_age', FloatType),
    ('msa', FloatType),
    ('non_interest_bearing_upb', FloatType),
    (label, IntegerType),
]

schema = StructType([
    StructField(column_name, spark_type())
    for column_name, spark_type in column_types
])

# Every schema column except the label is used as a model feature.
features = [field.name for field in schema if field.name != label]
features

['orig_channel',
 'first_home_buyer',
 'loan_purpose',
 'property_type',
 'occupancy_status',
 'property_state',
 'product_type',
 'relocation_mortgage_indicator',
 'seller_name',
 'mod_flag',
 'orig_interest_rate',
 'orig_upb',
 'orig_loan_term',
 'orig_ltv',
 'orig_cltv',
 'num_borrowers',
 'dti',
 'borrower_credit_score',
 'num_units',
 'zip',
 'mortgage_insurance_percent',
 'current_loan_delinquency_status',
 'current_actual_upb',
 'interest_rate',
 'loan_age',
 'msa',
 'non_interest_bearing_upb']

### Set GCS bucket used for data and model

In [7]:
# Root GCS bucket; the parquet input paths and the saved-model path in this
# notebook are all built from it.
gcs_bucket = 'gs://cloudml-demo-datalake'

### Read training data parquet files

In [8]:
# Glob matching every parquet file of the training split.
train_url_parquet_file = f'{gcs_bucket}/processed/parquet/mortgage_small_train/*.parquet'

# Load the training split through the GPU reader.
train_reader = GpuDataReader(spark)
train_data = train_reader.parquet(train_url_parquet_file)
train_data

<ml.dmlc.xgboost4j.scala.spark.rapids.GpuDataset at 0x7fbe0de412e8>

### Read eval data parquet files

In [9]:
# Glob matching every parquet file of the evaluation split.
eval_url_parquet_file = f'{gcs_bucket}/processed/parquet/mortgage_small_eval/*.parquet'

# Load the evaluation split through the GPU reader.
eval_reader = GpuDataReader(spark)
eval_data = eval_reader.parquet(eval_url_parquet_file)
eval_data

<ml.dmlc.xgboost4j.scala.spark.rapids.GpuDataset at 0x7fbe0de41518>

### Create an XGBoostClassifier

Full list of available [XGBoost parameters](https://xgboost.readthedocs.io/en/latest/parameter.html)

In [10]:
# Reference only: a fuller set of tunable parameters. It is commented out and
# is NOT what the classifier below is built with.
# params = {
#     'eta': 0.1, #learning rate
#     'gamma': 0.1,
#     'missing': 0.0,
#     'treeMethod': 'gpu_hist',
#     'maxDepth': 10,
#     'maxLeaves': 256,
#     'growPolicy': 'depthwise',
#     'objective': 'binary:logistic',
#     'minChildWeight': 30.0,
#     'lambda_': 1.0,
#     'scalePosWeight': 2.0,
#     'subsample': 1.0,
#     'nthread': 1,
#     'numRound': 100,
#     'numWorkers': 1,
# }

# Only the tree method and the objective are set explicitly.
params = dict(treeMethod='gpu_hist', objective='binary:logistic')

# Build the estimator, then point it at the label column and feature columns.
classifier = XGBoostClassifier(**params).setLabelCol(label)
classifier = classifier.setFeaturesCols(features)

### Create reusable benchmark function

In [11]:
def with_benchmark(phrase, action):
    """Run ``action`` once and print how long it took.

    Args:
        phrase: Label placed at the start of the printed timing line.
        action: Zero-argument callable to execute and time.

    Returns:
        Whatever ``action()`` returns.

    Raises:
        Any exception raised by ``action`` propagates unchanged; no timing
        line is printed in that case.
    """
    # perf_counter is a monotonic, high-resolution clock, so the measured
    # duration cannot be skewed by system clock adjustments the way a
    # wall-clock time.time() difference can.
    from time import perf_counter

    start = perf_counter()
    result = action()
    elapsed = perf_counter() - start
    print('{} takes {} seconds'.format(phrase, round(elapsed, 2)))
    return result

### Train the model

In [12]:
# Fit the classifier on the training data; with_benchmark prints how long the
# fit took and returns the fitted model.
model = with_benchmark('Training', lambda: classifier.fit(train_data))

Training takes 12.7 seconds


### View model stats using PySpark

In [17]:
# Score the evaluation set with the model trained above, so this cell does not
# depend on `result`, which is only created by a cell further down the notebook
# (and therefore does not exist on a fresh top-to-bottom run).
eval_predictions = model.transform(eval_data)

# MulticlassClassificationEvaluator defaults to the "f1" metric, so the metric
# has to be selected explicitly for the printed value to really be accuracy.
accuracy_evaluator = (
    MulticlassClassificationEvaluator()
    .setLabelCol(label)
    .setMetricName('accuracy')
)

# NOTE: the predictions are not materialized beforehand, so the reported time
# presumably includes scoring the evaluation set as well as computing the metric.
accuracy = with_benchmark(
    'Evaluation',
    lambda: accuracy_evaluator.evaluate(eval_predictions))
print('Accuracy is ' + str(accuracy))

Evaluation takes 0.3 seconds
Accuracy is 0.9969984992496248


### View model stats using NumPy and scikit-learn

In [48]:
# Make predictions on the evaluation set, keeping only the true label and the
# predicted class. The label column is referenced through `label` so it stays
# in sync with the column the classifier was trained on.
predictions = model.transform(eval_data).select(label, "prediction")

In [49]:
import numpy as np
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score, f1_score

# Pull the (label, prediction) rows to the driver as a two-column array:
# column 0 is the true label, column 1 is the predicted class.
predictions_np = np.array(predictions.collect())
y_true = predictions_np[:, 0]
y_pred = predictions_np[:, 1]

# f1/precision/recall are called with scikit-learn's default averaging, i.e.
# they describe the positive (delinquent) class only, not a weighted average.
np_acc = accuracy_score(y_true, y_pred)
np_f1 = f1_score(y_true, y_pred)
np_precision = precision_score(y_true, y_pred)
np_recall = recall_score(y_true, y_pred)
# NOTE(review): AUC is computed from hard 0/1 predictions here; passing the
# positive-class probability instead would give the usual ranking-based AUC.
np_auc = roc_auc_score(y_true, y_pred)

print("accuracy:", np_acc)
print("f1:", np_f1)
print("precision:", np_precision)
print("recall:", np_recall)
print("auc:", np_auc)

accuracy: 0.9969984992496248
f1: 0.7692307692307693
weighted precision: 0.7692307692307693
weighted recall: 0.7692307692307693
auc: 0.8838600976063211


### Confusion Matrix

Based on code from @Zack Akil's [Precision/Recall Optimisation notebook](https://colab.sandbox.google.com/github/ZackAkil/Hands-on-ML-Precision-Recall-Predicting-Pneumonia-in-X-rays/blob/master/Precision_Recall_Optimisation.ipynb#scrollTo=I4Cg_tubgvUx)

In [44]:
# import package that will generate the confusion matrix scores
from sklearn.metrics import confusion_matrix
# import packages that will help display the scores
import pandas as pd

In [50]:
# Rows are the actual classes and columns the predicted classes;
# labels=[1, 0] puts the delinquent class first on both axes.
actual_labels = predictions_np[:, 0]
predicted_labels = predictions_np[:, 1]
confusion_matrix_scores = confusion_matrix(actual_labels, predicted_labels, labels=[1, 0])

# Wrap the raw counts in a labelled table for display.
df = pd.DataFrame(
    data=confusion_matrix_scores,
    index=["Actually Delinquent", "Actually Not Delinquent"],
    columns=["Predicted Delinquent", "Predicted Not Delinquent"],
)

df.head()

Unnamed: 0,Predicted Delinquent,Predicted Not Delinquent
Actually Delinquent,10,3
Actually Not Delinquent,3,1983


In [51]:
# Same matrix as the raw counts, but normalize='true' divides each row by the
# number of actual samples in that class, so every row sums to 1.
actual_labels = predictions_np[:, 0]
predicted_labels = predictions_np[:, 1]
confusion_matrix_scores_normalized = confusion_matrix(
    actual_labels,
    predicted_labels,
    labels=[1, 0],
    normalize='true',
)

# Wrap the per-class rates in a labelled table for display.
df_normalize = pd.DataFrame(
    data=confusion_matrix_scores_normalized,
    index=["Actually Delinquent", "Actually Not Delinquent"],
    columns=["Predicted Delinquent", "Predicted Not Delinquent"],
)

df_normalize.head()

Unnamed: 0,Predicted Delinquent,Predicted Not Delinquent
Actually Delinquent,0.769231,0.230769
Actually Not Delinquent,0.001511,0.998489


### Save the model

In [14]:
# Persist the trained model under the GCS bucket; overwrite() is requested so
# that a model already stored at this path is replaced.
model.write().overwrite().save(f'{gcs_bucket}/xgboost/spark/mortgage/model')

### Load model

In [15]:
# Reload the model from the same GCS path the save cell writes to.
loaded_model = XGBoostClassificationModel().load(f'{gcs_bucket}/xgboost/spark/mortgage/model')

### Run predictions on evaluation dataset

In [16]:
def transform():
    """Score ``eval_data`` with ``loaded_model`` and return the cached predictions."""
    scored = loaded_model.transform(eval_data).cache()
    # A no-op pass over every partition runs the job now, so the scoring work
    # happens inside this function rather than on first use of the result.
    scored.foreachPartition(lambda _partition: None)
    return scored

# Time the scoring of the evaluation set with the reloaded model.
result = with_benchmark('Transformation', transform)

# Show up to 50 rows whose true label is positive. The filter is built from
# `label` instead of repeating the column name as a hard-coded string.
(result
    .select(label, 'rawPrediction', 'probability', 'prediction')
    .where(col(label) > 0)
    .show(50))

Transformation takes 0.87 seconds
+--------------+--------------------+--------------------+----------+
|delinquency_12|       rawPrediction|         probability|prediction|
+--------------+--------------------+--------------------+----------+
|           1.0|[0.59753340482711...|[0.64509177207946...|       0.0|
|           1.0|[-0.4235294461250...|[0.39567250013351...|       1.0|
|           1.0|[-0.4235294461250...|[0.39567250013351...|       1.0|
|           1.0|[0.59753340482711...|[0.64509177207946...|       0.0|
|           1.0|[0.23389831185340...|[0.55820941925048...|       0.0|
|           1.0|[-0.4235294461250...|[0.39567250013351...|       1.0|
|           1.0|[-0.4235294461250...|[0.39567250013351...|       1.0|
|           1.0|[-0.4235294461250...|[0.39567250013351...|       1.0|
|           1.0|[-0.4235294461250...|[0.39567250013351...|       1.0|
|           1.0|[-0.4235294461250...|[0.39567250013351...|       1.0|
|           1.0|[-0.4235294461250...|[0.39567250013351..