In [1]:
from pyspark import SparkContext
sc=SparkContext(master='local')

from pyspark.sql import SparkSession
spark=SparkSession.builder.appName('SQL Example').config('spark.some.config.option','some-value').getOrCreate()

## Load the data


In [2]:
raw_data=spark.read.csv('cuse_binary.csv',inferSchema=True,header=True)
raw_data.show(5)

+---+---------+---------+---+
|age|education|wantsMore|  y|
+---+---------+---------+---+
|<25|      low|      yes|  0|
|<25|      low|      yes|  0|
|<25|      low|      yes|  0|
|<25|      low|      yes|  0|
|<25|      low|      yes|  0|
+---+---------+---------+---+
only showing top 5 rows



## String Indexer,One Hot Encoder,Vector Assembler

In [4]:
from pyspark.ml.feature import StringIndexer,OneHotEncoder,VectorAssembler
from pyspark.ml import Pipeline

In [6]:
categorical_columns=raw_data.columns[0:3]
categorical_columns

['age', 'education', 'wantsMore']

### string indexer stages

In [7]:
stringindexer_stages=[StringIndexer(inputCol=c,outputCol='index_'+c) for c in categorical_columns]
stringindexer_stages += [StringIndexer(inputCol='y',outputCol='label')]

### one hot encoder stages

In [8]:
onehotencoder_stages=[OneHotEncoder(inputCol='index_'+c, outputCol='ohe_'+c) for c in categorical_columns]

### vector Assembler stages

In [9]:
feature_columns=['ohe_'+c for c in categorical_columns]
vectorassembler_stages=[VectorAssembler(inputCols=feature_columns,outputCol='features')]

### pipeline stages

In [10]:
stages=stringindexer_stages+onehotencoder_stages+vectorassembler_stages
pipeline=Pipeline(stages=stages)

### Fit the data

In [12]:
pipeline_model=pipeline.fit(raw_data)

### Transform the data

In [13]:
columns=feature_columns+['features','label']
cuse_df=pipeline_model.transform(raw_data).select(columns)
cuse_df.show()

+-------------+-------------+-------------+-------------------+-----+
|      ohe_age|ohe_education|ohe_wantsMore|           features|label|
+-------------+-------------+-------------+-------------------+-----+
|(3,[2],[1.0])|    (1,[],[])|(1,[0],[1.0])|(5,[2,4],[1.0,1.0])|  0.0|
|(3,[2],[1.0])|    (1,[],[])|(1,[0],[1.0])|(5,[2,4],[1.0,1.0])|  0.0|
|(3,[2],[1.0])|    (1,[],[])|(1,[0],[1.0])|(5,[2,4],[1.0,1.0])|  0.0|
|(3,[2],[1.0])|    (1,[],[])|(1,[0],[1.0])|(5,[2,4],[1.0,1.0])|  0.0|
|(3,[2],[1.0])|    (1,[],[])|(1,[0],[1.0])|(5,[2,4],[1.0,1.0])|  0.0|
|(3,[2],[1.0])|    (1,[],[])|(1,[0],[1.0])|(5,[2,4],[1.0,1.0])|  0.0|
|(3,[2],[1.0])|    (1,[],[])|(1,[0],[1.0])|(5,[2,4],[1.0,1.0])|  0.0|
|(3,[2],[1.0])|    (1,[],[])|(1,[0],[1.0])|(5,[2,4],[1.0,1.0])|  0.0|
|(3,[2],[1.0])|    (1,[],[])|(1,[0],[1.0])|(5,[2,4],[1.0,1.0])|  0.0|
|(3,[2],[1.0])|    (1,[],[])|(1,[0],[1.0])|(5,[2,4],[1.0,1.0])|  0.0|
|(3,[2],[1.0])|    (1,[],[])|(1,[0],[1.0])|(5,[2,4],[1.0,1.0])|  0.0|
|(3,[2],[1.0])|    (

## Split the data

In [14]:
train_df,test_df=cuse_df.randomSplit([0.8,0.2],seed=123)

In [15]:
train_df.count()

1290

In [16]:
test_df.count()

317

## Cross Validation 

### estimator

In [17]:
from pyspark.ml.classification import GBTClassifier
gbt=GBTClassifier(featuresCol='features',labelCol='label')

### Parameter Grid

In [18]:
from pyspark.ml.tuning import ParamGridBuilder
param_grid=ParamGridBuilder().addGrid(gbt.maxDepth,[2,3,4]).addGrid(gbt.minInfoGain,[0.0,0.1,0.2,0.3]) \
                            .addGrid(gbt.stepSize,[0.05,0.1,0.2,0.4]).build()

### Evaluator

In [19]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator
evaluator=BinaryClassificationEvaluator(rawPredictionCol='prediction')

### Cross validation model

In [20]:
from pyspark.ml.tuning import CrossValidator
cv=CrossValidator(estimator=gbt,estimatorParamMaps=param_grid,evaluator=evaluator)

### Fit the model

In [21]:
cv_model=cv.fit(cuse_df)

### Predicting the training data

In [24]:
training_pred=cv_model.transform(train_df)
training_pred.show(5)

+---------+-------------+-------------+---------+-----+--------------------+--------------------+----------+
|  ohe_age|ohe_education|ohe_wantsMore| features|label|       rawPrediction|         probability|prediction|
+---------+-------------+-------------+---------+-----+--------------------+--------------------+----------+
|(3,[],[])|    (1,[],[])|    (1,[],[])|(5,[],[])|  0.0|[-0.0599765765084...|[0.47004761793628...|       1.0|
|(3,[],[])|    (1,[],[])|    (1,[],[])|(5,[],[])|  0.0|[-0.0599765765084...|[0.47004761793628...|       1.0|
|(3,[],[])|    (1,[],[])|    (1,[],[])|(5,[],[])|  0.0|[-0.0599765765084...|[0.47004761793628...|       1.0|
|(3,[],[])|    (1,[],[])|    (1,[],[])|(5,[],[])|  0.0|[-0.0599765765084...|[0.47004761793628...|       1.0|
|(3,[],[])|    (1,[],[])|    (1,[],[])|(5,[],[])|  0.0|[-0.0599765765084...|[0.47004761793628...|       1.0|
+---------+-------------+-------------+---------+-----+--------------------+--------------------+----------+
only showing top 5 

### Predicting the test data

In [25]:
testing_pred=cv_model.transform(test_df)
testing_pred.show(5)

+---------+-------------+-------------+---------+-----+--------------------+--------------------+----------+
|  ohe_age|ohe_education|ohe_wantsMore| features|label|       rawPrediction|         probability|prediction|
+---------+-------------+-------------+---------+-----+--------------------+--------------------+----------+
|(3,[],[])|    (1,[],[])|    (1,[],[])|(5,[],[])|  0.0|[-0.0599765765084...|[0.47004761793628...|       1.0|
|(3,[],[])|    (1,[],[])|    (1,[],[])|(5,[],[])|  0.0|[-0.0599765765084...|[0.47004761793628...|       1.0|
|(3,[],[])|    (1,[],[])|    (1,[],[])|(5,[],[])|  0.0|[-0.0599765765084...|[0.47004761793628...|       1.0|
|(3,[],[])|    (1,[],[])|    (1,[],[])|(5,[],[])|  0.0|[-0.0599765765084...|[0.47004761793628...|       1.0|
|(3,[],[])|    (1,[],[])|    (1,[],[])|(5,[],[])|  0.0|[-0.0599765765084...|[0.47004761793628...|       1.0|
+---------+-------------+-------------+---------+-----+--------------------+--------------------+----------+
only showing top 5 

### Prediction performance

In [26]:
print('training accuracy (areaUnderROC): ',evaluator.setMetricName('areaUnderROC').evaluate(training_pred))
print('testing accuracy (areaUnderROC): ',evaluator.setMetricName('areaUnderROC').evaluate(testing_pred))

training accuracy (areaUnderROC):  0.634005193589372
testing accuracy (areaUnderROC):  0.671067106710671


### Confusion Matrix 

#### training data

In [27]:
train_data=training_pred.select('label','prediction')
train_data.rdd.zipWithIndex().countByKey()

defaultdict(int,
            {Row(label=0.0, prediction=1.0): 155,
             Row(label=1.0, prediction=1.0): 180,
             Row(label=0.0, prediction=0.0): 729,
             Row(label=1.0, prediction=0.0): 226})

#### Testing data

In [28]:
test_data=testing_pred.select('label','prediction')
test_data.rdd.zipWithIndex().countByKey()

defaultdict(int,
            {Row(label=0.0, prediction=1.0): 48,
             Row(label=1.0, prediction=1.0): 57,
             Row(label=0.0, prediction=0.0): 168,
             Row(label=1.0, prediction=0.0): 44})

### Best model

In [29]:
print('Max depth: ',cv_model.bestModel._java_obj.getMaxDepth())
print('minInfo Gain: ',cv_model.bestModel._java_obj.getMinInfoGain())
print('StepSize: ',cv_model.bestModel._java_obj.getStepSize())

Max depth:  3
minInfo Gain:  0.0
StepSize:  0.05
