In [15]:
from pyspark.sql import SparkSession
from pyspark.sql.types import *
# Reuse the already-running Spark session if there is one, otherwise start one.
ss = (
    SparkSession
    .builder
    .getOrCreate()
)
# Keep a handle on the session's SparkContext as well.
sc = ss.sparkContext

## Create dataframe

In [16]:
# Create a DataFrame from ../Data/penbased.dat using an explicit schema.
from pyspark.sql.types import DoubleType, StructField, StructType

# Schema: 16 nullable double columns named pix1 .. pix16, followed by a
# nullable double "label" column. "label" must stay last because the feature
# columns are later selected with dfpen.columns[:-1].
penschema = StructType(
    [StructField("pix{}".format(i), DoubleType(), True) for i in range(1, 17)]
    + [StructField("label", DoubleType(), True)]
)

# The schema is supplied explicitly, so Spark does no schema inference.
# The former samplingRatio=0.3 argument only tunes schema inference; it never
# sampled the rows, so it is dropped to avoid implying the data is subsampled.
dfpen = ss.read.csv("../Data/penbased.dat", schema=penschema)

## Split dataframe into training and test sets

In [17]:
# Randomly split the rows with weights 0.8 / 0.2 using a fixed seed (1), then
# cache both parts: the first is the training set, the second the validation set.
pendtsets = dfpen.randomSplit(weights=[0.8, 0.2], seed=1)
pendttrain, pendtvalid = (subset.cache() for subset in pendtsets)

## Define transformer and estimator and add to a pipeline.

In [18]:
# Transformer: VectorAssembler that combines every column except the last one
# (the "label" column of the schema) into a single "features" column.
from pyspark.ml.feature import VectorAssembler
va = VectorAssembler(
    outputCol="features",
    inputCols=dfpen.columns[:-1],  # pix1 .. pix16; the trailing label is excluded
)

In [19]:
# Estimator: a DecisionTreeClassifier constructed with no arguments, i.e. left
# entirely at its defaults. Fitting it produces a decision-tree model, which is
# itself a transformer. Its maxDepth is tuned later through the parameter grid.
# NOTE(review): presumably relies on the default feature/label column names
# matching "features" (assembler output) and "label" (schema) - confirm.
from pyspark.ml.classification import DecisionTreeClassifier
dt = DecisionTreeClassifier()

In [20]:
# Build (but do not yet fit) a two-stage pipeline: assembler, then classifier.
from pyspark.ml import Pipeline
pipeline = Pipeline(
    stages=[
        va,  # stage 1: assemble the pixel columns into "features"
        dt,  # stage 2: decision-tree classifier
    ]
)

## Fit the pipeline to the training dataset with cross-validation and create a model

In [21]:
from pyspark.ml.tuning import CrossValidator
from pyspark.ml.tuning import ParamGridBuilder
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Model-selection metric: F1 from the multiclass evaluator.
evaluator = MulticlassClassificationEvaluator().setMetricName('f1')

# Candidate tree depths to try for the decision-tree stage of the pipeline.
paramGrid = ParamGridBuilder().addGrid(dt.maxDepth, [5, 10, 15, 20, 25, 30]).build()

# 5-fold cross-validation of the whole pipeline over the depth grid.
# The seed pins the fold assignment so that re-running the notebook selects
# the same model; the value 1 matches the seed used for the train/validation
# split.
cv = CrossValidator(estimator=pipeline,
                    evaluator=evaluator,
                    numFolds=5,
                    estimatorParamMaps=paramGrid,
                    seed=1)

In [22]:
# Run the cross-validated search over the maxDepth grid on the cached
# training split; the result holds the best fitted pipeline as `bestModel`.
cvmodel = cv.fit(pendttrain)

## Apply the model to the validation data set

In [23]:
# Generate predictions for the held-out validation split using the best
# model selected by cross-validation.
dtpredicts = cvmodel.bestModel.transform(pendtvalid)

## Evaluate the model

In [25]:
# F1 score of the validation-set predictions (the evaluator's metric is 'f1').
evaluator.evaluate(dtpredicts)

0.9483109728201898

In [26]:
# Stop the Spark session obtained at the top of the notebook.
ss.stop()