In [1]:
#entry point
from pyspark import SparkConf,SparkContext
from pyspark.sql import SparkSession
sc=SparkContext(master='local')

spark=SparkSession.builder.appName('Sql example').config('spark.some.config.option','some-value').getOrCreate()

## Import Data

In [2]:
raw_data=spark.read.csv('cuse_binary.csv',inferSchema=True,header=True)
raw_data.show(5)

+---+---------+---------+---+
|age|education|wantsMore|  y|
+---+---------+---------+---+
|<25|      low|      yes|  0|
|<25|      low|      yes|  0|
|<25|      low|      yes|  0|
|<25|      low|      yes|  0|
|<25|      low|      yes|  0|
+---+---------+---------+---+
only showing top 5 rows



In [14]:
from pyspark.sql.functions import *
raw_data.select(approx_count_distinct(raw_data.age)).show()

+--------------------------+
|approx_count_distinct(age)|
+--------------------------+
|                         4|
+--------------------------+



In [16]:
raw_data.groupBy('education','y').sum().show()

+---------+---+------+
|education|  y|sum(y)|
+---------+---+------+
|     high|  1|   306|
|     high|  0|     0|
|      low|  1|   201|
|      low|  0|     0|
+---------+---+------+



## StringIndexer, oneHotEncoder, VectorAssembler 

In [19]:
from pyspark.ml.feature import StringIndexer,OneHotEncoder,VectorAssembler
from pyspark.ml import Pipeline

In [21]:
categorical_columns=raw_data.columns[0:3]
categorical_columns

['age', 'education', 'wantsMore']

### String Indexer stages

In [22]:
stringindexer_stages=[StringIndexer(inputCol=c,outputCol='index_'+c) for c in categorical_columns]

#encode the label column and add it to the string indexer
stringindexer_stages +=[StringIndexer(inputCol='y',outputCol='label')]

### One Hot Encoder stages

In [23]:
onehotencoder=[OneHotEncoder(inputCol='index_'+c, outputCol='ohe_'+c) for c in categorical_columns]


### Vector Assembler stages

In [26]:
feature_columns=['ohe_'+c for c in categorical_columns]
vectorassembler=[VectorAssembler(inputCols=feature_columns,outputCol='features')]

### Pipeline 

In [30]:
stages=stringindexer_stages+onehotencoder+vectorassembler
pipeline=Pipeline(stages=stages)


### Fit the pipeline model

In [31]:
pipeline_model=pipeline.fit(raw_data)

### Transform the data

In [32]:
columns=feature_columns+['features','label']
cuse_df=pipeline_model.transform(raw_data).select(columns)
cuse_df.show(5)

+-------------+-------------+-------------+-------------------+-----+
|      ohe_age|ohe_education|ohe_wantsMore|           features|label|
+-------------+-------------+-------------+-------------------+-----+
|(3,[2],[1.0])|    (1,[],[])|(1,[0],[1.0])|(5,[2,4],[1.0,1.0])|  0.0|
|(3,[2],[1.0])|    (1,[],[])|(1,[0],[1.0])|(5,[2,4],[1.0,1.0])|  0.0|
|(3,[2],[1.0])|    (1,[],[])|(1,[0],[1.0])|(5,[2,4],[1.0,1.0])|  0.0|
|(3,[2],[1.0])|    (1,[],[])|(1,[0],[1.0])|(5,[2,4],[1.0,1.0])|  0.0|
|(3,[2],[1.0])|    (1,[],[])|(1,[0],[1.0])|(5,[2,4],[1.0,1.0])|  0.0|
+-------------+-------------+-------------+-------------------+-----+
only showing top 5 rows



### Split the data into training and test data

In [33]:
train_df,test_df=cuse_df.randomSplit([0.8,0.2],seed=123)

### Build cross validation model

#### Estimator

In [36]:
from pyspark.ml.classification import DecisionTreeClassifier
dt=DecisionTreeClassifier(featuresCol='features',labelCol='label')

#### Parameter Grid

In [37]:
from pyspark.ml.tuning import ParamGridBuilder
param_grid=ParamGridBuilder().addGrid(dt.maxDepth,[2,3,4,5]).build()

#### Evaluator

In [39]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator
evaluator=BinaryClassificationEvaluator(rawPredictionCol='rawPrediction',metricName='areaUnderROC')

#### Cross validator

In [40]:
from pyspark.ml.tuning import CrossValidator
cv=CrossValidator(estimator=dt,estimatorParamMaps=param_grid,evaluator=evaluator,numFolds=4)

#### Fit crossvalidation mode

In [41]:
cv_model=cv.fit(cuse_df)

#### Prediction

In [44]:
show_columns=['features','label','prediction','rawPrediction','probability']

#### Prediction on training data

In [45]:
training_prediction=cv_model.transform(train_df)
training_prediction.select(show_columns).show(5,False)

+---------+-----+----------+-------------+----------------------------------------+
|features |label|prediction|rawPrediction|probability                             |
+---------+-----+----------+-------------+----------------------------------------+
|(5,[],[])|0.0  |1.0       |[203.0,237.0]|[0.46136363636363636,0.5386363636363637]|
|(5,[],[])|0.0  |1.0       |[203.0,237.0]|[0.46136363636363636,0.5386363636363637]|
|(5,[],[])|0.0  |1.0       |[203.0,237.0]|[0.46136363636363636,0.5386363636363637]|
|(5,[],[])|0.0  |1.0       |[203.0,237.0]|[0.46136363636363636,0.5386363636363637]|
|(5,[],[])|0.0  |1.0       |[203.0,237.0]|[0.46136363636363636,0.5386363636363637]|
+---------+-----+----------+-------------+----------------------------------------+
only showing top 5 rows



#### Prediction on test data

In [46]:
pred_test_cv=cv_model.transform(test_df)
pred_test_cv.select(show_columns).show(5,False)

+---------+-----+----------+-------------+----------------------------------------+
|features |label|prediction|rawPrediction|probability                             |
+---------+-----+----------+-------------+----------------------------------------+
|(5,[],[])|0.0  |1.0       |[203.0,237.0]|[0.46136363636363636,0.5386363636363637]|
|(5,[],[])|0.0  |1.0       |[203.0,237.0]|[0.46136363636363636,0.5386363636363637]|
|(5,[],[])|0.0  |1.0       |[203.0,237.0]|[0.46136363636363636,0.5386363636363637]|
|(5,[],[])|0.0  |1.0       |[203.0,237.0]|[0.46136363636363636,0.5386363636363637]|
|(5,[],[])|0.0  |1.0       |[203.0,237.0]|[0.46136363636363636,0.5386363636363637]|
+---------+-----+----------+-------------+----------------------------------------+
only showing top 5 rows



#### Confusion Matrix

In [47]:
label_and_pred=cv_model.transform(cuse_df).select('label','prediction')
label_and_pred.rdd.zipWithIndex().countByKey()

defaultdict(int,
            {Row(label=0.0, prediction=0.0): 897,
             Row(label=0.0, prediction=1.0): 203,
             Row(label=1.0, prediction=0.0): 270,
             Row(label=1.0, prediction=1.0): 237})

#### Best model parameter

In [48]:
print(cv_model.bestModel._java_obj.getMaxDepth())

3
