In [1]:
# One-time setup: stage the mortgage-small dataset under /shared-data/datasets.
# Only needs to run the first time; the data-loading cell reads from
# /shared-data/datasets/mortgage-small.
# Create the target directory (-p: no error if it already exists).
!mkdir -p /shared-data/datasets
# Copy the archive from the notebook-relative datasets/ folder.
!cp datasets/mortgage-small.tar.gz /shared-data/datasets
# Unpack next to the archive; -v lists every extracted path in the cell output.
!tar xzvf /shared-data/datasets/mortgage-small.tar.gz -C /shared-data/datasets

mortgage-small/
mortgage-small/train/
mortgage-small/train/mortgage-small.csv
mortgage-small/trainWithEval/
mortgage-small/trainWithEval/test.csv
mortgage-small/eval/
mortgage-small/eval/mortgage-small.csv


In [2]:
# Adapted from the example notebook at:
# https://github.com/NVIDIA/spark-xgboost-examples/blob/spark-3/examples/notebooks/python/taxi-gpu.ipynb

In [3]:
import sys
# Archives that must be importable from Python, listed in insertion order.
# Each one is pushed to the front of sys.path, so the last archive listed ends
# up first on the search path.
# Note: any older 0.1.0 jars already on sys.path are left in place.
for archive_path in (
    '/usr/local/spark/jars/samples.zip',
    '/usr/local/spark/jars/xgboost4j-spark_3.0-1.0.0-0.2.0.jar',
    '/usr/local/spark/jars/xgboost4j-1.0.0-0.2.0.jar',
):
    sys.path.insert(0, archive_path)

In [4]:
from pyspark.sql import SparkSession
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.sql import SparkSession
from pyspark.sql.types import FloatType, IntegerType, StructField, StructType
from time import time

# Build (or reuse) the Spark session with the RAPIDS SQL plugin enabled.
spark = (
    SparkSession.builder
    .config('spark.executor.cores', '1')
    # 'driver-memory' is a spark-submit command-line flag, not a configuration
    # key, so it was ignored when passed to .config(); the matching property is
    # spark.driver.memory.
    # NOTE(review): this only takes effect if the driver JVM has not already
    # been started when this cell runs - confirm for this deployment.
    .config('spark.driver.memory', '10G')
    .config('spark.plugins', 'com.nvidia.spark.SQLPlugin')
    .config('spark.rapids.sql.enabled', 'true')
    .config('spark.rapids.sql.explain', 'ALL')
    .config('spark.rapids.sql.concurrentGpuTasks', '1')
    .config('spark.rapids.memory.pinnedPool.size', '8G')
    .getOrCreate()
)

In [5]:
reader = spark.read
label = 'delinquency_12'

# (column name, Spark type) pairs in schema order; the label column comes last.
column_types = [
    ('orig_channel', FloatType),
    ('first_home_buyer', FloatType),
    ('loan_purpose', FloatType),
    ('property_type', FloatType),
    ('occupancy_status', FloatType),
    ('property_state', FloatType),
    ('product_type', FloatType),
    ('relocation_mortgage_indicator', FloatType),
    ('seller_name', FloatType),
    ('mod_flag', FloatType),
    ('orig_interest_rate', FloatType),
    ('orig_upb', IntegerType),
    ('orig_loan_term', IntegerType),
    ('orig_ltv', FloatType),
    ('orig_cltv', FloatType),
    ('num_borrowers', FloatType),
    ('dti', FloatType),
    ('borrower_credit_score', FloatType),
    ('num_units', IntegerType),
    ('zip', IntegerType),
    ('mortgage_insurance_percent', FloatType),
    ('current_loan_delinquency_status', IntegerType),
    ('current_actual_upb', FloatType),
    ('interest_rate', FloatType),
    ('loan_age', FloatType),
    ('msa', FloatType),
    ('non_interest_bearing_upb', FloatType),
    (label, IntegerType),
]
schema = StructType([
    StructField(column_name, column_type())
    for column_name, column_type in column_types
])

# Every schema column except the label is used as a model feature.
features = [field.name for field in schema if field.name != label]

# Both datasets are CSV directories with a header row, read with the explicit schema.
train_data = (
    reader
    .schema(schema)
    .option('header', True)
    .csv('/shared-data/datasets/mortgage-small/train')
)
trans_data = (
    reader
    .schema(schema)
    .option('header', True)
    .csv('/shared-data/datasets/mortgage-small/trainWithEval')
)

In [6]:
# Sanity check: print the first five training rows.
train_data.show(n=5)

+------------+----------------+------------+-------------+----------------+--------------+------------+-----------------------------+-----------+--------+------------------+--------+--------------+--------+---------+-------------+----+---------------------+---------+---+--------------------------+-------------------------------+------------------+-------------+--------+-------+------------------------+--------------+
|orig_channel|first_home_buyer|loan_purpose|property_type|occupancy_status|property_state|product_type|relocation_mortgage_indicator|seller_name|mod_flag|orig_interest_rate|orig_upb|orig_loan_term|orig_ltv|orig_cltv|num_borrowers| dti|borrower_credit_score|num_units|zip|mortgage_insurance_percent|current_loan_delinquency_status|current_actual_upb|interest_rate|loan_age|    msa|non_interest_bearing_upb|delinquency_12|
+------------+----------------+------------+-------------+----------------+--------------+------------+-----------------------------+-----------+--------+----

In [7]:
from ml.dmlc.xgboost4j.scala.spark import XGBoostClassificationModel, XGBoostClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.sql import SparkSession
from pyspark.sql.types import FloatType, IntegerType, StructField, StructType
from time import time
# Keyword arguments forwarded to XGBoostClassifier.
params = dict(
    eta=0.1,
    gamma=0.1,
    missing=0.0,
    treeMethod='gpu_hist',
    maxDepth=10,
    maxLeaves=256,
    objective='binary:logistic',
    growPolicy='depthwise',
    minChildWeight=30.0,
    lambda_=1.0,
    scalePosWeight=2.0,
    subsample=1.0,
    nthread=1,
    numRound=100,
    numWorkers=1,
)

# Build the estimator, then point it at the label column and the feature columns.
classifier = XGBoostClassifier(**params)
classifier = classifier.setLabelCol(label)
classifier = classifier.setFeaturesCols(features)

In [8]:
def with_benchmark(phrase, action):
    """Run ``action`` once, print how long it took, and return its result.

    Args:
        phrase: Label placed at the start of the printed timing message.
        action: Zero-argument callable to execute and time.

    Returns:
        Whatever ``action()`` returned.
    """
    started_at = time()
    outcome = action()
    elapsed = time() - started_at
    # Elapsed wall-clock time is reported rounded to two decimal places.
    message = '{} takes {} seconds'.format(phrase, round(elapsed, 2))
    print(message)
    return outcome
def fit_classifier():
    """Fit the configured classifier on the training data and return the model."""
    return classifier.fit(train_data)


# Train once and report how long the fit took.
model = with_benchmark('Training', fit_classifier)

Training takes 3.36 seconds


In [9]:
# Round-trip the trained model through storage: save it (replacing any earlier
# copy at the same path), then load it back from that path.
model_path = '/shared-data/datasets/mortgage-small/data/new-model-path'
model_writer = model.write().overwrite()
model_writer.save(model_path)
loaded_model = XGBoostClassificationModel().load(model_path)

In [10]:
def transform():
    """Score ``trans_data`` with the reloaded model and return the cached result.

    Returns:
        The DataFrame produced by ``loaded_model.transform``, marked for caching.
    """
    predictions = loaded_model.transform(trans_data).cache()
    # Run a do-nothing action over every partition so the transformation is
    # actually executed inside this call rather than deferred to a later action.
    predictions.foreachPartition(lambda _rows: None)
    return predictions
# Time the scoring step, then preview the label next to the model's output columns.
result = with_benchmark('Transformation', transform)
preview_columns = [label, 'rawPrediction', 'probability', 'prediction']
result.select(*preview_columns).show(n=5)

Transformation takes 1.32 seconds
+--------------+--------------------+--------------------+----------+
|delinquency_12|       rawPrediction|         probability|prediction|
+--------------+--------------------+--------------------+----------+
|             0|[5.29092645645141...|[0.99498815322294...|       0.0|
|             0|[5.29092645645141...|[0.99498815322294...|       0.0|
|             0|[5.29092645645141...|[0.99498815322294...|       0.0|
|             0|[5.29092645645141...|[0.99498815322294...|       0.0|
|             0|[5.29092645645141...|[0.99498815322294...|       0.0|
+--------------+--------------------+--------------------+----------+
only showing top 5 rows



In [11]:
# MulticlassClassificationEvaluator reports the 'f1' metric by default, so the
# metric is set to 'accuracy' explicitly to match what is stored and printed
# here. Predictions are read from the evaluator's default prediction column.
accuracy = with_benchmark(
    'Evaluation',
    lambda: MulticlassClassificationEvaluator()
    .setLabelCol(label)
    .setMetricName('accuracy')
    .evaluate(result))
print('Accuracy is ' + str(accuracy))

Evaluation takes 0.4 seconds
Accuracy is 1.0


In [12]:
# Shut down the Spark session created earlier in this notebook.
spark.stop()