In [12]:
from pyspark.sql import SparkSession
from pyspark.sql.types import *
ss = SparkSession.builder.getOrCreate()
sc = ss.sparkContext

## Create dataframe

In [13]:
#Load the data and create an RDD (16 pixels and label)
pen_raw = sc.textFile("../Data/penbased.dat", 4)\
            .map(lambda x:  x.split(", "))\
            .map(lambda row: [float(x) for x in row])
pen_raw.take(1)

                                                                                

[[47.0,
  100.0,
  27.0,
  81.0,
  57.0,
  37.0,
  26.0,
  0.0,
  0.0,
  23.0,
  56.0,
  53.0,
  100.0,
  90.0,
  40.0,
  98.0,
  8.0]]

[[47.0,
  100.0,
  27.0,
  81.0,
  57.0,
  37.0,
  26.0,
  0.0,
  0.0,
  23.0,
  56.0,
  53.0,
  100.0,
  90.0,
  40.0,
  98.0,
  8.0]]

In [14]:
#Create a DataFrame
from pyspark.sql.types import *

penschema = StructType([
    StructField("pix1",DoubleType(),True),
    StructField("pix2",DoubleType(),True),
    StructField("pix3",DoubleType(),True),
    StructField("pix4",DoubleType(),True),
    StructField("pix5",DoubleType(),True),
    StructField("pix6",DoubleType(),True),
    StructField("pix7",DoubleType(),True),
    StructField("pix8",DoubleType(),True),
    StructField("pix9",DoubleType(),True),
    StructField("pix10",DoubleType(),True),
    StructField("pix11",DoubleType(),True),
    StructField("pix12",DoubleType(),True),
    StructField("pix13",DoubleType(),True),
    StructField("pix14",DoubleType(),True),
    StructField("pix15",DoubleType(),True),
    StructField("pix16",DoubleType(),True),
    StructField("label",DoubleType(),True)
])

dfpen = ss.createDataFrame(pen_raw, penschema)

In [15]:
dfpen.show()

+-----+-----+----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+
| pix1| pix2|pix3| pix4| pix5| pix6| pix7| pix8| pix9|pix10|pix11|pix12|pix13|pix14|pix15|pix16|label|
+-----+-----+----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+
| 47.0|100.0|27.0| 81.0| 57.0| 37.0| 26.0|  0.0|  0.0| 23.0| 56.0| 53.0|100.0| 90.0| 40.0| 98.0|  8.0|
|  0.0| 89.0|27.0|100.0| 42.0| 75.0| 29.0| 45.0| 15.0| 15.0| 37.0|  0.0| 69.0|  2.0|100.0|  6.0|  2.0|
|  0.0| 57.0|31.0| 68.0| 72.0| 90.0|100.0|100.0| 76.0| 75.0| 50.0| 51.0| 28.0| 25.0| 16.0|  0.0|  1.0|
|  0.0|100.0| 7.0| 92.0|  5.0| 68.0| 19.0| 45.0| 86.0| 34.0|100.0| 45.0| 74.0| 23.0| 67.0|  0.0|  4.0|
|  0.0| 67.0|49.0| 83.0|100.0|100.0| 81.0| 80.0| 60.0| 60.0| 40.0| 40.0| 33.0| 20.0| 47.0|  0.0|  1.0|
|100.0|100.0|88.0| 99.0| 49.0| 74.0| 17.0| 47.0|  0.0| 16.0| 37.0|  0.0| 73.0| 16.0| 20.0| 20.0|  6.0|
|  0.0|100.0| 3.0| 72.0| 26.0| 35.0| 85.0| 35.0|100.0| 71.0| 73.0| 97.0| 

## Create dataframe with a feature vector and label

In [16]:
# Merging the data with Vector Assembler.
from pyspark.ml.feature import VectorAssembler
va = VectorAssembler(outputCol="features", inputCols=dfpen.columns[0:-1]) #except the last col.
penlpoints = va.transform(dfpen).select("features", "label")

## Split dataframe into training and test sets

In [17]:
# Create Training and Test data.
pendtsets = penlpoints.randomSplit([0.8, 0.2], 1)
pendttrain = pendtsets[0].cache()
pendtvalid = pendtsets[1].cache()

## Create a RandomForestClassifer and build a model using training Dataset

In [18]:
# Train the model.
from pyspark.ml.classification import RandomForestClassifier
rf = RandomForestClassifier(maxDepth=20)
rfmodel = rf.fit(pendttrain)
print(rfmodel.toDebugString)

22/02/24 05:32:14 WARN DAGScheduler: Broadcasting large task binary with size 1429.8 KiB
22/02/24 05:32:14 WARN DAGScheduler: Broadcasting large task binary with size 1429.8 KiB
22/02/24 05:32:14 WARN DAGScheduler: Broadcasting large task binary with size 1904.4 KiB
22/02/24 05:32:14 WARN DAGScheduler: Broadcasting large task binary with size 1904.4 KiB
22/02/24 05:32:15 WARN DAGScheduler: Broadcasting large task binary with size 2.3 MiB
22/02/24 05:32:15 WARN DAGScheduler: Broadcasting large task binary with size 2.3 MiB
22/02/24 05:32:15 WARN DAGScheduler: Broadcasting large task binary with size 2.6 MiB
22/02/24 05:32:15 WARN DAGScheduler: Broadcasting large task binary with size 2.6 MiB
22/02/24 05:32:15 WARN DAGScheduler: Broadcasting large task binary with size 2.8 MiB
22/02/24 05:32:15 WARN DAGScheduler: Broadcasting large task binary with size 2.8 MiB
22/02/24 05:32:15 WARN DAGScheduler: Broadcasting large task binary with size 3.0 MiB
22/02/24 05:32:15 WARN DAGScheduler: Broad

RandomForestClassificationModel: uid=RandomForestClassifier_ef27ae2a5a6d, numTrees=20, numClasses=10, numFeatures=16
  Tree 0 (weight 1.0):
    If (feature 5 <= 61.5)
     If (feature 8 <= 59.5)
      If (feature 11 <= 24.5)
       If (feature 14 <= 71.5)
        If (feature 9 <= 20.5)
         If (feature 0 <= 20.5)
          Predict: 7.0
         Else (feature 0 > 20.5)
          If (feature 1 <= 78.5)
           Predict: 0.0
          Else (feature 1 > 78.5)
           If (feature 14 <= 54.5)
            Predict: 6.0
           Else (feature 14 > 54.5)
            If (feature 15 <= 32.5)
             Predict: 6.0
            Else (feature 15 > 32.5)
             Predict: 0.0
        Else (feature 9 > 20.5)
         Predict: 8.0
       Else (feature 14 > 71.5)
        If (feature 0 <= 54.5)
         If (feature 3 <= 93.5)
          If (feature 4 <= 13.5)
           If (feature 13 <= 25.5)
            Predict: 4.0
           Else (feature 13 > 25.5)
            Predict: 0.0
          

RandomForestClassificationModel: uid=RandomForestClassifier_ef27ae2a5a6d, numTrees=20, numClasses=10, numFeatures=16
  Tree 0 (weight 1.0):
    If (feature 5 <= 61.5)
     If (feature 8 <= 59.5)
      If (feature 11 <= 24.5)
       If (feature 14 <= 71.5)
        If (feature 9 <= 20.5)
         If (feature 0 <= 20.5)
          Predict: 7.0
         Else (feature 0 > 20.5)
          If (feature 1 <= 78.5)
           Predict: 0.0
          Else (feature 1 > 78.5)
           If (feature 14 <= 54.5)
            Predict: 6.0
           Else (feature 14 > 54.5)
            If (feature 15 <= 32.5)
             Predict: 6.0
            Else (feature 15 > 32.5)
             Predict: 0.0
        Else (feature 9 > 20.5)
         Predict: 8.0
       Else (feature 14 > 71.5)
        If (feature 0 <= 54.5)
         If (feature 3 <= 93.5)
          If (feature 4 <= 13.5)
           If (feature 13 <= 25.5)
            Predict: 4.0
           Else (feature 13 > 25.5)
            Predict: 0.0
          

## Evaluate the model

In [19]:
rfpredicts = rfmodel.transform(pendtvalid)

In [20]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
# expects two input columns: prediction and label.

metric_name = "f1"
metrics = MulticlassClassificationEvaluator()\
                .setLabelCol("label")\
                .setPredictionCol("prediction")
metrics.setMetricName(metric_name) 

metrics.evaluate(rfpredicts)

22/02/24 05:32:18 WARN DAGScheduler: Broadcasting large task binary with size 2.1 MiB


0.9860611323318544

22/02/24 05:32:18 WARN DAGScheduler: Broadcasting large task binary with size 2.1 MiB


0.9860611323318544

In [21]:
from pyspark.mllib.evaluation import MulticlassMetrics

#prediction and label
prediction_label = rfpredicts.select("prediction", "label").rdd

metrics = MulticlassMetrics(prediction_label)

confusionMetrics = metrics.confusionMatrix()

print("Confusion Metrics = \n%s" % confusionMetrics)

Confusion Metrics = 
DenseMatrix([[227.,   0.,   0.,   0.,   1.,   0.,   0.,   0.,   0.,   0.],
             [  0., 205.,   7.,   0.,   0.,   0.,   1.,   1.,   0.,   0.],
             [  0.,   0., 206.,   0.,   0.,   0.,   0.,   1.,   0.,   0.],
             [  0.,   1.,   0., 188.,   0.,   0.,   0.,   1.,   0.,   1.],
             [  0.,   0.,   0.,   0., 219.,   0.,   0.,   0.,   0.,   1.],
             [  0.,   0.,   0.,   1.,   0., 191.,   0.,   0.,   1.,   2.],
             [  0.,   0.,   1.,   0.,   1.,   1., 195.,   0.,   0.,   0.],
             [  0.,   0.,   1.,   0.,   0.,   0.,   0., 193.,   2.,   0.],
             [  1.,   0.,   0.,   0.,   0.,   0.,   0.,   0., 173.,   0.],
             [  0.,   0.,   0.,   0.,   0.,   0.,   0.,   2.,   0., 183.]])
Confusion Metrics = 
DenseMatrix([[227.,   0.,   0.,   0.,   1.,   0.,   0.,   0.,   0.,   0.],
             [  0., 205.,   7.,   0.,   0.,   0.,   1.,   1.,   0.,   0.],
             [  0.,   0., 206.,   0.,   0.,   0.,   0.,  

22/02/24 05:32:18 WARN DAGScheduler: Broadcasting large task binary with size 2.1 MiB
22/02/24 05:32:18 WARN DAGScheduler: Broadcasting large task binary with size 2.1 MiB
22/02/24 05:32:18 WARN DAGScheduler: Broadcasting large task binary with size 2.1 MiB
22/02/24 05:32:18 WARN DAGScheduler: Broadcasting large task binary with size 2.1 MiB


In [22]:
ss.stop()