Grid Search in SparkMLib (PySpark ML)

In [3]:
import findspark
findspark.init()

print('Success!')

Success!


In [49]:
import pyspark
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.ml import Pipeline
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import *
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.sql import Row
from pyspark.sql.functions import *
from pyspark.sql.types import *
from sklearn.metrics import confusion_matrix, roc_auc_score

In [5]:
spark = SparkSession.builder.appName('grid-search').getOrCreate()
df = spark.read.csv('credit-card-full.csv', header=True, inferSchema = True)
df.printSchema()

root
 |-- ID: integer (nullable = true)
 |-- LIMIT_BAL: integer (nullable = true)
 |-- SEX: integer (nullable = true)
 |-- EDUCATION: integer (nullable = true)
 |-- MARRIAGE: integer (nullable = true)
 |-- AGE: integer (nullable = true)
 |-- PAY_0: integer (nullable = true)
 |-- PAY_2: integer (nullable = true)
 |-- PAY_3: integer (nullable = true)
 |-- PAY_4: integer (nullable = true)
 |-- PAY_5: integer (nullable = true)
 |-- PAY_6: integer (nullable = true)
 |-- BILL_AMT1: integer (nullable = true)
 |-- BILL_AMT2: integer (nullable = true)
 |-- BILL_AMT3: integer (nullable = true)
 |-- BILL_AMT4: integer (nullable = true)
 |-- BILL_AMT5: integer (nullable = true)
 |-- BILL_AMT6: integer (nullable = true)
 |-- PAY_AMT1: integer (nullable = true)
 |-- PAY_AMT2: integer (nullable = true)
 |-- PAY_AMT3: integer (nullable = true)
 |-- PAY_AMT4: integer (nullable = true)
 |-- PAY_AMT5: integer (nullable = true)
 |-- PAY_AMT6: integer (nullable = true)
 |-- default payment next month: inte

In [6]:
pd.DataFrame(df.take(2))

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,15,16,17,18,19,20,21,22,23,24
0,1,20000,2,2,1,24,2,2,-1,-1,...,0,0,0,0,689,0,0,0,0,1
1,2,120000,2,2,2,26,-1,2,0,0,...,3272,3455,3261,0,1000,1000,1000,0,2000,1


In [6]:
pd.DataFrame(df.take(2), columns=df.columns).transpose()

Unnamed: 0,0,1
ID,1,2
LIMIT_BAL,20000,120000
SEX,2,2
EDUCATION,2,2
MARRIAGE,1,2
AGE,24,26
PAY_0,2,-1
PAY_2,2,2
PAY_3,-1,0
PAY_4,-1,0


In [7]:
#Check if classes are perfectly balanced
df.groupby('default payment next month').count().toPandas()

Unnamed: 0,default payment next month,count
0,1,6636
1,0,23364


In [8]:
#df.rename(columns={"default payment next month": "label"})
#print('Success')

df_renamed = df.withColumnRenamed("default payment next month", "label")
#df2 = pd.DataFrame(df)
#df2.rename(index = {24: "label"})
#df2.head()

In [9]:
print(df_renamed)

DataFrame[ID: int, LIMIT_BAL: int, SEX: int, EDUCATION: int, MARRIAGE: int, AGE: int, PAY_0: int, PAY_2: int, PAY_3: int, PAY_4: int, PAY_5: int, PAY_6: int, BILL_AMT1: int, BILL_AMT2: int, BILL_AMT3: int, BILL_AMT4: int, BILL_AMT5: int, BILL_AMT6: int, PAY_AMT1: int, PAY_AMT2: int, PAY_AMT3: int, PAY_AMT4: int, PAY_AMT5: int, PAY_AMT6: int, label: int]


In [10]:
df = df_renamed.drop('ID')

In [11]:
column_list = df.columns
print(column_list)

['LIMIT_BAL', 'SEX', 'EDUCATION', 'MARRIAGE', 'AGE', 'PAY_0', 'PAY_2', 'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6', 'BILL_AMT1', 'BILL_AMT2', 'BILL_AMT3', 'BILL_AMT4', 'BILL_AMT5', 'BILL_AMT6', 'PAY_AMT1', 'PAY_AMT2', 'PAY_AMT3', 'PAY_AMT4', 'PAY_AMT5', 'PAY_AMT6', 'label']


In [12]:
# show summary statistics
numeric_features = [t[0] for t in df.dtypes if t[1] == 'int']
df.select(numeric_features).describe().toPandas().transpose()

Unnamed: 0,0,1,2,3,4
summary,count,mean,stddev,min,max
LIMIT_BAL,30000,167484.32266666667,129747.66156720246,10000,1000000
SEX,30000,1.6037333333333332,0.4891291960902602,1,2
EDUCATION,30000,1.8531333333333333,0.7903486597207269,0,6
MARRIAGE,30000,1.5518666666666667,0.5219696006132467,0,3
AGE,30000,35.4855,9.217904068090155,21,79
PAY_0,30000,-0.0167,1.1238015279973335,-2,8
PAY_2,30000,-0.13376666666666667,1.1971859730345495,-2,8
PAY_3,30000,-0.1662,1.1968675684465686,-2,8
PAY_4,30000,-0.22066666666666668,1.1691386224023357,-2,8


In [13]:
numeric_features_df=df.select(numeric_features)
numeric_features_df.toPandas().head()

Unnamed: 0,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,label
0,20000,2,2,1,24,2,2,-1,-1,-2,...,0,0,0,0,689,0,0,0,0,1
1,120000,2,2,2,26,-1,2,0,0,0,...,3272,3455,3261,0,1000,1000,1000,0,2000,1
2,90000,2,2,2,34,0,0,0,0,0,...,14331,14948,15549,1518,1500,1000,1000,1000,5000,0
3,50000,2,2,1,37,0,0,0,0,0,...,28314,28959,29547,2000,2019,1200,1100,1069,1000,0
4,50000,1,2,1,57,-1,0,-1,0,0,...,20940,19146,19131,2000,36681,10000,9000,689,679,0


In [14]:
column_list = df.columns
print(column_list)

['LIMIT_BAL', 'SEX', 'EDUCATION', 'MARRIAGE', 'AGE', 'PAY_0', 'PAY_2', 'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6', 'BILL_AMT1', 'BILL_AMT2', 'BILL_AMT3', 'BILL_AMT4', 'BILL_AMT5', 'BILL_AMT6', 'PAY_AMT1', 'PAY_AMT2', 'PAY_AMT3', 'PAY_AMT4', 'PAY_AMT5', 'PAY_AMT6', 'label']


In [16]:
# Prepare data for ML Model
from pyspark.ml.feature import OneHotEncoder, StringIndexer, VectorAssembler
inputColums = ['LIMIT_BAL', 'SEX', 'EDUCATION', 'MARRIAGE', 'AGE', 'PAY_0', 'PAY_2', 'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6', 'BILL_AMT1', 'BILL_AMT2', 'BILL_AMT3', 'BILL_AMT4', 'BILL_AMT5', 'BILL_AMT6', 'PAY_AMT1', 'PAY_AMT2', 'PAY_AMT3', 'PAY_AMT4', 'PAY_AMT5', 'PAY_AMT6']
stages = []

#outputlabel = ['default payment next month']

#outputColum = StringIndexer(inputCol=outputlabel, outputCol='label')
#stages+= [outputlabel]

#outputAssembler = VectorAssembler(inputCols=outputlabel, outputCol="label")
#stages += [outputAssembler]

assembler = VectorAssembler(inputCols=inputColums, outputCol="features")
stages += [assembler]

#scaler = StandardScaler(inputCol="vectorized_features", outputCol="features")
#stages += [scaler]

## Pipeline
The pipeline is use to chain multiple Transformers and Estimators together to specify our machine learning workflow. A Pipeline’s stages are specified as an ordered array.

In [11]:
cols = df.columns

In [12]:
cols

['ID',
 'LIMIT_BAL',
 'SEX',
 'EDUCATION',
 'MARRIAGE',
 'AGE',
 'PAY_0',
 'PAY_2',
 'PAY_3',
 'PAY_4',
 'PAY_5',
 'PAY_6',
 'BILL_AMT1',
 'BILL_AMT2',
 'BILL_AMT3',
 'BILL_AMT4',
 'BILL_AMT5',
 'BILL_AMT6',
 'PAY_AMT1',
 'PAY_AMT2',
 'PAY_AMT3',
 'PAY_AMT4',
 'PAY_AMT5',
 'PAY_AMT6',
 'default payment next month']

In [17]:
from pyspark.ml import Pipeline
cols = df.columns
pipeline = Pipeline(stages = stages)
pipelineModel = pipeline.fit(df)
df = pipelineModel.transform(df)
selectedCols = ['features'] + cols
df = df.select(selectedCols)
df.printSchema()

root
 |-- features: vector (nullable = true)
 |-- LIMIT_BAL: integer (nullable = true)
 |-- SEX: integer (nullable = true)
 |-- EDUCATION: integer (nullable = true)
 |-- MARRIAGE: integer (nullable = true)
 |-- AGE: integer (nullable = true)
 |-- PAY_0: integer (nullable = true)
 |-- PAY_2: integer (nullable = true)
 |-- PAY_3: integer (nullable = true)
 |-- PAY_4: integer (nullable = true)
 |-- PAY_5: integer (nullable = true)
 |-- PAY_6: integer (nullable = true)
 |-- BILL_AMT1: integer (nullable = true)
 |-- BILL_AMT2: integer (nullable = true)
 |-- BILL_AMT3: integer (nullable = true)
 |-- BILL_AMT4: integer (nullable = true)
 |-- BILL_AMT5: integer (nullable = true)
 |-- BILL_AMT6: integer (nullable = true)
 |-- PAY_AMT1: integer (nullable = true)
 |-- PAY_AMT2: integer (nullable = true)
 |-- PAY_AMT3: integer (nullable = true)
 |-- PAY_AMT4: integer (nullable = true)
 |-- PAY_AMT5: integer (nullable = true)
 |-- PAY_AMT6: integer (nullable = true)
 |-- label: integer (nullable = 

In [26]:
#Randomly split data into train and test sets, and set seed for reproducibility
train, test = df.randomSplit([0.7, 0.3], seed=2018)
print("Training Dataset Count:" + str(train.count()))
print("Test Dataset Count: " + str(test.count()))

Training Dataset Count:21035
Test Dataset Count: 8965


## Model Definition

In [27]:
from pyspark.ml.classification import RandomForestClassifier

rf = RandomForestClassifier(featuresCol = 'features', labelCol = 'label')
rfModel = rf.fit(train)
predictions = rfModel.transform(test)
predictions.select('LIMIT_BAL', 'label', 'rawPrediction', 'prediction', 'probability').show(10)

+---------+-----+--------------------+----------+--------------------+
|LIMIT_BAL|label|       rawPrediction|prediction|         probability|
+---------+-----+--------------------+----------+--------------------+
|    10000|    0|[15.1486313339977...|       0.0|[0.75743156669988...|
|    20000|    0|[15.1486313339977...|       0.0|[0.75743156669988...|
|    20000|    0|[16.2456036501765...|       0.0|[0.81228018250882...|
|    20000|    1|[15.1486313339977...|       0.0|[0.75743156669988...|
|    20000|    0|[15.1486313339977...|       0.0|[0.75743156669988...|
|    20000|    1|[16.2456036501765...|       0.0|[0.81228018250882...|
|    20000|    1|[15.1486313339977...|       0.0|[0.75743156669988...|
|    30000|    1|[15.3599117976001...|       0.0|[0.76799558988000...|
|    30000|    0|[15.3599117976001...|       0.0|[0.76799558988000...|
|    30000|    0|[15.3599117976001...|       0.0|[0.76799558988000...|
+---------+-----+--------------------+----------+--------------------+
only s

In [28]:
class_names = [1, 0]
y_true = predictions.select("label")
y_true = y_true.toPandas()

y_pred = predictions.select("prediction")
y_pred = y_pred.toPandas()

cnf_matrix = confusion_matrix(y_true, y_pred,labels=class_names)
print(cnf_matrix)

[[ 671 1239]
 [ 348 6707]]


In [29]:
# Check accuracy
evaluator = BinaryClassificationEvaluator()
print("Test Area Under ROC: " + str(evaluator.evaluate(predictions, {evaluator.metricName: "areaUnderROC"})))
evaluator.getMetricName()

Test Area Under ROC: 0.7627071513649297


'areaUnderROC'

In [30]:
accuracy = predictions.filter(predictions.label == predictions.prediction).count() / float(predictions.count())
print("Accuracy : ",accuracy)

Accuracy :  0.8229782487451199


## Model Finetuning 

Here we finetuned the model using the same parameters used as Scikit-learn's GridSearchCV method except 'log_loss' evaluation criterion which unfortunately, Spark's machine learning RandomForestClassifer method does not support.

In [34]:
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from datetime import datetime, timedelta

start_time = datetime.now()
print("%-20s %s" % ("Start Time", start_time))

#Create ParamGrid for Cross Validation
paramGrid = (ParamGridBuilder() 
            .addGrid(rf.maxDepth, [2, 4, 8, 15])
            .addGrid(rf.featureSubsetStrategy, ['auto', 'sqrt', 'log2'])
            .addGrid(rf.impurity, ['gini', 'entropy'])
            .build())
cv = CrossValidator(estimator=rf, estimatorParamMaps=paramGrid, evaluator=evaluator, numFolds=5)

cvModel = cv.fit(train)

end_time = datetime.now()
print("%-20s %s" % ("End Time", end_time))
print(str(timedelta(seconds=(end_time-start_time).seconds)))

Start Time           2023-01-15 15:36:28.931552
Start Time           2023-01-15 15:36:28.931552
End Time             2023-01-15 15:44:23.989357
0:07:55


In [36]:
## Evaluate best model on dataset
## Evaluate Best Model
predictions = cvModel.transform(test)
print('Best Model Test Area Under ROC', evaluator.evaluate(predictions))

Best Model Test Area Under ROC 0.7672137765722574


In [37]:
accuracy = predictions.filter(predictions.label == predictions.prediction).count() / float(predictions.count())
print("Accuracy : ",accuracy)

Accuracy :  0.8214166201896264


In [38]:
cvModel.bestModel

RandomForestClassificationModel: uid=RandomForestClassifier_a132e80b74f9, numTrees=20, numClasses=2, numFeatures=23

In [39]:
#Best model parameters
best_model=cvModel.bestModel
best_model.explainParams().split("\n")

['bootstrap: Whether bootstrap samples are used when building trees. (default: True)',
 'cacheNodeIds: If false, the algorithm will pass trees to executors to match instances with nodes. If true, the algorithm will cache node IDs for each instance. Caching can speed up training of deeper trees. Users can set how often should the cache be checkpointed or disable it by setting checkpointInterval. (default: False)',
 'checkpointInterval: set checkpoint interval (>= 1) or disable checkpoint (-1). E.g. 10 means that the cache will get checkpointed every 10 iterations. Note: this setting will be ignored if the checkpoint directory is not set in the SparkContext. (default: 10)',
 "featureSubsetStrategy: The number of features to consider for splits at each tree node. Supported options: 'auto' (choose automatically for task: If numTrees == 1, set to 'all'. If numTrees > 1 (forest), set to 'sqrt' for classification and to 'onethird' for regression), 'all' (use all features), 'onethird' (use 1/3

In [48]:
#print best model parameters
print('Best Model - (Feature Subset Strategy i.e. criterion):', best_model._java_obj.getFeatureSubsetStrategy())
print('Best Model - Maximum Depth', best_model._java_obj.getMaxDepth())
print('Best Model - Impurity (Max Features):', best_model._java_obj.getImpurity())


Best Model - (Feature Subset Strategy i.e. criterion): auto
Best Model - Maximum Depth 8
Best Model - Impurity (Max Features): entropy
