In [3]:
# Copyright 2020 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Mortgage XGboost - GPUs 

Based on notebooks from https://github.com/rapidsai/spark-examples

Learn more about RAPIDS-Spark XGboost4j here https://news.developer.nvidia.com/gpu-accelerated-spark-xgboost/

### Include pyspark methods used in notebook

In [1]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.feature import VectorAssembler
from pyspark.sql.types import FloatType, IntegerType, StructField, StructType
from pyspark.sql.functions import col
from time import time

### Read data in using GPU data reader

This is a custom built reader created by Nvida to make use of GPUs to read data using GPUs. 

In Spark 2.4 you need include the GpuDataReader method but from Spark 3.0+ you will be able to use the native Spark read method.

In [3]:
from ml.dmlc.xgboost4j.scala.spark import XGBoostClassificationModel, XGBoostClassifier
from ml.dmlc.xgboost4j.scala.spark.rapids import GpuDataReader

### Set label used for model

In [9]:
label = 'delinquency_12'

### Create a list of the features used for the model

In [10]:
schema = StructType([
    StructField('orig_channel', FloatType()),
    StructField('first_home_buyer', FloatType()),
    StructField('loan_purpose', FloatType()),
    StructField('property_type', FloatType()),
    StructField('occupancy_status', FloatType()),
    StructField('property_state', FloatType()),
    StructField('product_type', FloatType()),
    StructField('relocation_mortgage_indicator', FloatType()),
    StructField('seller_name', FloatType()),
    StructField('mod_flag', FloatType()),
    StructField('orig_interest_rate', FloatType()),
    StructField('orig_upb', IntegerType()),
    StructField('orig_loan_term', IntegerType()),
    StructField('orig_ltv', FloatType()),
    StructField('orig_cltv', FloatType()),
    StructField('num_borrowers', FloatType()),
    StructField('dti', FloatType()),
    StructField('borrower_credit_score', FloatType()),
    StructField('num_units', IntegerType()),
    StructField('zip', IntegerType()),
    StructField('mortgage_insurance_percent', FloatType()),
    StructField('current_loan_delinquency_status', IntegerType()),
    StructField('current_actual_upb', FloatType()),
    StructField('interest_rate', FloatType()),
    StructField('loan_age', FloatType()),
    StructField('msa', FloatType()),
    StructField('non_interest_bearing_upb', FloatType()),
    StructField(label, IntegerType()),
])

features = [ x.name for x in schema if x.name != label ]
features

['orig_channel',
 'first_home_buyer',
 'loan_purpose',
 'property_type',
 'occupancy_status',
 'property_state',
 'product_type',
 'relocation_mortgage_indicator',
 'seller_name',
 'mod_flag',
 'orig_interest_rate',
 'orig_upb',
 'orig_loan_term',
 'orig_ltv',
 'orig_cltv',
 'num_borrowers',
 'dti',
 'borrower_credit_score',
 'num_units',
 'zip',
 'mortgage_insurance_percent',
 'current_loan_delinquency_status',
 'current_actual_upb',
 'interest_rate',
 'loan_age',
 'msa',
 'non_interest_bearing_upb']

### Read training data parquet files

In [6]:
!hdfs dfs -ls gs://dataproc-datalake-warehouse/datasets/mortgage_small_train

Found 2 items
-rwx------   3 root root          0 2020-05-12 12:01 gs://dataproc-datalake-warehouse/datasets/mortgage_small_train/_SUCCESS
-rwx------   3 root root      48716 2020-05-12 12:01 gs://dataproc-datalake-warehouse/datasets/mortgage_small_train/part-00000-52e89ff6-2296-4552-95f0-65923d730f3b-c000.snappy.parquet


In [None]:
train_url_parquet_file = 'gs://dataproc-datalake-warehouse/datasets/mortgage_small_train/part-00000-52e89ff6-2296-4552-95f0-65923d730f3b-c000.snappy.parquet'
train_data = GpuDataReader(spark).parquet(train_url_parquet_file)

### Read eval data parquet files

In [8]:
!hdfs dfs -ls gs://dataproc-datalake-warehouse/datasets/mortgage_small_eval

Found 2 items
-rwx------   3 root root          0 2020-05-12 12:01 gs://dataproc-datalake-warehouse/datasets/mortgage_small_eval/_SUCCESS
-rwx------   3 root root      25983 2020-05-12 12:01 gs://dataproc-datalake-warehouse/datasets/mortgage_small_eval/part-00000-f95c666a-2937-4a46-a240-8047d5a0ffe8-c000.snappy.parquet


In [12]:
eval_url_parquet_file = 'gs://dataproc-datalake-warehouse/datasets/mortgage_small_eval/part-00000-f95c666a-2937-4a46-a240-8047d5a0ffe8-c000.snappy.parquet'
eval_data = GpuDataReader(spark).parquet(eval_url_parquet_file)

### Create a XGBoostClassifier

In [18]:
# params = { 
#     'eta': 0.1, #learning rate 
#     'gamma': 0.1,
#     'missing': 0.0,
#     'treeMethod': 'gpu_hist',
#     'maxDepth': 10, 
#     'maxLeaves': 256,
#     'growPolicy': 'depthwise',
#     'objective': 'binary:logistic',
#     'minChildWeight': 30.0,
#     'lambda_': 1.0,
#     'scalePosWeight': 2.0,
#     'subsample': 1.0,
#     'nthread': 1,
#     'numRound': 100,
#     'numWorkers': 1,
# }

params = { 
    'treeMethod': 'gpu_hist',
    'objective': 'binary:logistic'
}

classifier = XGBoostClassifier(**params).setLabelCol(label).setFeaturesCols(features)

### Create reusable benchmark function

In [14]:
def with_benchmark(phrase, action):
    start = time()
    result = action()
    end = time()
    print('{} takes {} seconds'.format(phrase, round(end - start, 2)))
    return result

### Train the model

In [28]:
model = with_benchmark('Training', lambda: classifier.fit(train_data))

Training takes 0.71 seconds


### Save the model

In [24]:
model.write().overwrite().save('gs://dataproc-datalake-xgboost/mortgage/gpu-model')

### Load model

In [25]:
loaded_model = XGBoostClassificationModel().load('gs://dataproc-datalake-xgboost/mortgage/gpu-model')

### Run predictions

In [26]:
def transform():
    result = loaded_model.transform(eval_data).cache()
    result.foreachPartition(lambda _: None)
    return result

result = with_benchmark('Transformation', transform)
result.select(label, 'rawPrediction', 'probability', 'prediction').where("delinquency_12 > 0").show(50)

Transformation takes 4.76 seconds
+--------------+--------------------+--------------------+----------+
|delinquency_12|       rawPrediction|         probability|prediction|
+--------------+--------------------+--------------------+----------+
|           1.0|[0.59753340482711...|[0.64509177207946...|       0.0|
|           1.0|[-0.4235294461250...|[0.39567250013351...|       1.0|
|           1.0|[-0.4235294461250...|[0.39567250013351...|       1.0|
|           1.0|[0.59753340482711...|[0.64509177207946...|       0.0|
|           1.0|[0.23389831185340...|[0.55820941925048...|       0.0|
|           1.0|[-0.4235294461250...|[0.39567250013351...|       1.0|
|           1.0|[-0.4235294461250...|[0.39567250013351...|       1.0|
|           1.0|[-0.4235294461250...|[0.39567250013351...|       1.0|
|           1.0|[-0.4235294461250...|[0.39567250013351...|       1.0|
|           1.0|[-0.4235294461250...|[0.39567250013351...|       1.0|
|           1.0|[-0.4235294461250...|[0.39567250013351..

### See model stats

In [27]:
accuracy = with_benchmark(
    'Evaluation',
    lambda: MulticlassClassificationEvaluator().setLabelCol(label).evaluate(result))
print('Accuracy is ' + str(accuracy))

Evaluation takes 0.96 seconds
Accuracy is 0.9969984992496248
