In [3]:
from pyspark.sql import SparkSession
# spark.driver.memory cannot be changed once the driver JVM is running, so it
# is passed to the builder before getOrCreate() rather than through
# spark.conf.set() afterwards. It only takes effect when this call is the one
# that starts the JVM; an already-running session keeps its existing heap size.
spark = (
    SparkSession.builder
    .appName("Ch04")
    .config("spark.driver.memory", "4g")
    .getOrCreate()
)
sc = spark.sparkContext

In [4]:
from pyspark.sql.types import StructType, StructField, IntegerType, DoubleType

In [5]:
# Column names, in file order: ten numeric measurements, four wilderness-area
# indicators, forty soil-type indicators, and finally the label.
colNames = [
    "Elevation",
    "Aspect",
    "Slope",
    "Horizontal_Distance_To_Hydrology",
    "Vertical_Distance_To_Hydrology",
    "Horizontal_Distance_To_Roadways",
    "Hillshade_9am",
    "Hillshade_Noon",
    "Hillshade_3pm",
    "Horizontal_Distance_To_Fire_Points",
]
colNames.extend("Wilderness_Area_" + str(k) for k in range(4))
colNames.extend("Soil_Type_" + str(k) for k in range(40))
colNames.append("Cover_Type")

In [6]:
# One nullable field per column: the label (Cover_Type) is read as a double,
# every other column as an integer.
schema = StructType([
    StructField(
        column,
        DoubleType() if column == "Cover_Type" else IntegerType(),
        True,
    )
    for column in colNames
])

In [7]:
# Load the header-less CSV using the explicit schema built above.
data = spark.read.schema(schema).csv("covtype.data", header=False)

In [8]:
# Keep roughly 1% of the rows (sampling without replacement). The seed makes
# the sample repeatable across runs.
# NOTE: this rebinds `data`, so re-running the cell samples the already
# sampled frame again; re-run the read cell first if that is not intended.
data = data.sample(fraction=0.01, seed=42)
data.count()

5617

In [9]:
# Show the column names and types of the sampled DataFrame.
data.printSchema()

root
 |-- Elevation: integer (nullable = true)
 |-- Aspect: integer (nullable = true)
 |-- Slope: integer (nullable = true)
 |-- Horizontal_Distance_To_Hydrology: integer (nullable = true)
 |-- Vertical_Distance_To_Hydrology: integer (nullable = true)
 |-- Horizontal_Distance_To_Roadways: integer (nullable = true)
 |-- Hillshade_9am: integer (nullable = true)
 |-- Hillshade_Noon: integer (nullable = true)
 |-- Hillshade_3pm: integer (nullable = true)
 |-- Horizontal_Distance_To_Fire_Points: integer (nullable = true)
 |-- Wilderness_Area_0: integer (nullable = true)
 |-- Wilderness_Area_1: integer (nullable = true)
 |-- Wilderness_Area_2: integer (nullable = true)
 |-- Wilderness_Area_3: integer (nullable = true)
 |-- Soil_Type_0: integer (nullable = true)
 |-- Soil_Type_1: integer (nullable = true)
 |-- Soil_Type_2: integer (nullable = true)
 |-- Soil_Type_3: integer (nullable = true)
 |-- Soil_Type_4: integer (nullable = true)
 |-- Soil_Type_5: integer (nullable = true)
 |-- Soil_Type

In [10]:
# Pull one row to the driver to inspect the parsed values.
data.take(1)

[Row(Elevation=3107, Aspect=356, Slope=12, Horizontal_Distance_To_Hydrology=283, Vertical_Distance_To_Hydrology=89, Horizontal_Distance_To_Roadways=4855, Hillshade_9am=201, Hillshade_Noon=219, Hillshade_3pm=155, Horizontal_Distance_To_Fire_Points=2069, Wilderness_Area_0=1, Wilderness_Area_1=0, Wilderness_Area_2=0, Wilderness_Area_3=0, Soil_Type_0=0, Soil_Type_1=0, Soil_Type_2=0, Soil_Type_3=0, Soil_Type_4=0, Soil_Type_5=0, Soil_Type_6=0, Soil_Type_7=0, Soil_Type_8=0, Soil_Type_9=0, Soil_Type_10=0, Soil_Type_11=0, Soil_Type_12=0, Soil_Type_13=0, Soil_Type_14=0, Soil_Type_15=0, Soil_Type_16=0, Soil_Type_17=0, Soil_Type_18=0, Soil_Type_19=0, Soil_Type_20=0, Soil_Type_21=0, Soil_Type_22=0, Soil_Type_23=0, Soil_Type_24=0, Soil_Type_25=0, Soil_Type_26=0, Soil_Type_27=0, Soil_Type_28=1, Soil_Type_29=0, Soil_Type_30=0, Soil_Type_31=0, Soil_Type_32=0, Soil_Type_33=0, Soil_Type_34=0, Soil_Type_35=0, Soil_Type_36=0, Soil_Type_37=0, Soil_Type_38=0, Soil_Type_39=0, Cover_Type=1.0)]

In [11]:
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

In [12]:
# 90/10 train/test split. The seed keeps the split (and therefore every metric
# reported below) repeatable between runs.
(trainData, testData) = data.randomSplit([0.9, 0.1], seed=42)

In [13]:
# Feature columns are all columns except the label.
inputCols = [column for column in trainData.columns if column != 'Cover_Type']

In [14]:
# Pack the feature columns into a single vector column and preview it.
assembler = VectorAssembler(outputCol="featureVector", inputCols=inputCols)
assembledTrainData = assembler.transform(trainData)
(
    assembledTrainData
    .select('featureVector')
    .show(truncate=False)
)

+--------------------------------------------------------------------------------------------------------+
|featureVector                                                                                           |
+--------------------------------------------------------------------------------------------------------+
|(54,[0,1,2,3,4,5,6,7,8,9,13,14],[2061.0,67.0,20.0,228.0,104.0,277.0,236.0,195.0,82.0,450.0,1.0,1.0])    |
|(54,[0,1,2,3,4,5,6,7,8,9,13,30],[2073.0,119.0,11.0,30.0,-1.0,1276.0,239.0,230.0,120.0,582.0,1.0,1.0])   |
|(54,[0,1,2,3,4,5,6,7,8,9,13,23],[2085.0,77.0,32.0,150.0,35.0,210.0,240.0,164.0,29.0,474.0,1.0,1.0])     |
|(54,[0,1,2,3,4,5,6,7,8,9,13,23],[2133.0,292.0,26.0,150.0,70.0,642.0,139.0,225.0,221.0,853.0,1.0,1.0])   |
|(54,[0,1,2,5,6,7,8,9,13,30],[2136.0,203.0,20.0,1326.0,206.0,253.0,175.0,1351.0,1.0,1.0])                |
|(54,[0,1,2,3,4,5,6,7,8,9,13,16],[2140.0,105.0,32.0,60.0,29.0,993.0,254.0,182.0,29.0,342.0,1.0,1.0])     |
|(54,[0,1,2,3,4,5,6,7,8,9,13,16],[215

In [15]:
%%time
from pyspark.ml.classification import DecisionTreeClassifier

# Fit a decision tree with default hyperparameters on the assembled training data.
classifier = DecisionTreeClassifier(
    labelCol="Cover_Type",
    featuresCol="featureVector",
    predictionCol="prediction",
)
model = classifier.fit(assembledTrainData)

# Print the learned tree structure, then the per-feature importances.
for report in (model.toDebugString, model.featureImportances):
    print(report)

DecisionTreeClassificationModel (uid=DecisionTreeClassifier_4765952a6c4a374ac222) of depth 5 with 61 nodes
  If (feature 0 <= 3057.5)
   If (feature 0 <= 2567.5)
    If (feature 10 <= 0.5)
     If (feature 23 <= 0.5)
      If (feature 24 <= 0.5)
       Predict: 3.0
      Else (feature 24 > 0.5)
       Predict: 2.0
     Else (feature 23 > 0.5)
      If (feature 4 <= 108.5)
       Predict: 6.0
      Else (feature 4 > 108.5)
       Predict: 3.0
    Else (feature 10 > 0.5)
     If (feature 9 <= 5365.5)
      Predict: 2.0
     Else (feature 9 > 5365.5)
      If (feature 1 <= 88.5)
       Predict: 5.0
      Else (feature 1 > 88.5)
       Predict: 2.0
   Else (feature 0 > 2567.5)
    If (feature 0 <= 2947.5)
     If (feature 12 <= 0.5)
      If (feature 5 <= 448.0)
       Predict: 2.0
      Else (feature 5 > 448.0)
       Predict: 2.0
     Else (feature 12 > 0.5)
      If (feature 0 <= 2742.5)
       Predict: 2.0
      Else (feature 0 > 2742.5)
       Predict: 2.0
    Else (feature 0 > 2947.5

In [52]:
from pyspark.ml.classification import LinearSVC

from pyspark.ml.classification import LogisticRegression


# Disabled LinearSVC experiment. Spark's LinearSVC is a binary classifier,
# while Cover_Type takes more than two values, so it is not used here.
# svm = LinearSVC(maxIter=5, regParam=0.01)
# model = svm.fit(assembledTrainData)

# Multinomial logistic regression on the same assembled features.
# This rebinds `model`, replacing the decision tree fitted earlier.
logi = LogisticRegression(
    family="multinomial",
    labelCol="Cover_Type",
    featuresCol="featureVector",
    predictionCol="prediction",
)
model = logi.fit(assembledTrainData)

In [53]:
%%time
# Score the training rows with the current `model` and show label,
# prediction and class probabilities side by side.
predictions = model.transform(assembledTrainData)
(
    predictions
    .select(["Cover_Type", "prediction", "probability"])
    .show(truncate=False)
)

+----------+----------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|Cover_Type|prediction|probability                                                                                                                                                             |
+----------+----------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|3.0       |3.0       |[2.76290108001562E-7,1.3086626661125166E-6,3.5776642178765396E-5,0.8177268862982532,0.04050058971453184,8.010180038988374E-5,0.14165438897268393,6.716191883479357E-7]  |
|4.0       |4.0       |[2.077645783782002E-7,1.898473048548013E-8,0.0010355683728960094,0.34682323647133984,0.5706282248479218,0.0010972423656961945,0.0804154854501259,1.5742711540045418E-8] |
|6.0       |3.0       |[4.781132851

In [54]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
# Evaluator comparing the Cover_Type label with the prediction column;
# the metric is chosen per call in the cells below.
evaluator = MulticlassClassificationEvaluator(
    predictionCol="prediction",
    labelCol="Cover_Type",
)

In [55]:
# Accuracy of `predictions` (which were computed on the training rows).
evaluator.setMetricName("accuracy").evaluate(predictions)

0.7218712988551125

In [56]:
# F1 score of the same `predictions`.
evaluator.setMetricName("f1").evaluate(predictions)

0.7116944546332543

In [24]:
#### Confusion matrix: not provided by the pyspark.ml evaluators (pyspark.mllib's MulticlassMetrics.confusionMatrix() offers one)

In [25]:
from pyspark.ml import Pipeline

# Select the feature columns by dropping the label by name, rather than
# slicing off the last column: this does not depend on Cover_Type being last
# and matches how inputCols is derived elsewhere in this notebook.
inputCols = trainData.drop('Cover_Type').columns
assembler = VectorAssembler(inputCols=inputCols, outputCol="featureVector")
classifier = DecisionTreeClassifier(labelCol="Cover_Type", featuresCol="featureVector", predictionCol="prediction")
# Chain assembling and classification so both are fitted and applied together.
pipeline = Pipeline(stages=[assembler, classifier])

In [26]:
from pyspark.ml.tuning import ParamGridBuilder

# Hyperparameter grid for the decision tree: 2 x 2 x 2 = 8 combinations.
paramGrid = (
    ParamGridBuilder()
    .addGrid(classifier.impurity, ["gini", "entropy"])
    .addGrid(classifier.maxBins, [40, 300])
    .addGrid(classifier.minInfoGain, [0.0, 0.05])
    .build()
)

# Left disabled: numTrees is a random-forest parameter, not a decision-tree one.
#     .addGrid(classifier.numTrees, [1, 20])\

In [27]:
%%time
# Accuracy evaluator used for model selection; sanity-check it on the
# existing `predictions` frame.
multiclassEval = MulticlassClassificationEvaluator(
    metricName="accuracy",
    labelCol="Cover_Type",
    predictionCol="prediction",
)
multiclassEval.evaluate(predictions)

Wall time: 1.54 s


In [28]:
from pyspark.ml.tuning import TrainValidationSplit

# Try every parameter combination, training on 90% of trainData and scoring
# on the remaining 10%. The seed fixes that internal split so the validation
# metrics are repeatable, matching the seeded TrainValidationSplit used for
# the random forest later in this notebook.
validator = TrainValidationSplit(
    seed=42,
    estimator=pipeline,
    estimatorParamMaps=paramGrid,
    evaluator=multiclassEval,
    trainRatio=0.9)

validatorModel = validator.fit(trainData)

In [29]:
# Keep the fitted pipeline that the validator selected as best.
bestModel = validatorModel.bestModel

In [60]:
# List the fitted stages of the best pipeline.
# NOTE(review): the saved output for this cell (execution count 60) shows a
# VectorIndexer and a RandomForest model, which are not stages of the pipeline
# built in this section; it looks like it was captured after the later
# random-forest cells ran. Re-run the notebook in order to refresh it.
bestModel.stages

[VectorAssembler_463391926e94708693ad,
 VectorIndexer_4badbdce6fdd84798ae0,
 RandomForestClassificationModel (uid=RandomForestClassifier_4a15a54a4df658497bc8) with 1 trees]

In [30]:
# Parameters of the last pipeline stage, i.e. the fitted classifier.
bestModel.stages[-1].extractParamMap()

{Param(parent='DecisionTreeClassifier_44298458fca7d1522d02', name='cacheNodeIds', doc='If false, the algorithm will pass trees to executors to match instances with nodes. If true, the algorithm will cache node IDs for each instance. Caching can speed up training of deeper trees.'): False,
 Param(parent='DecisionTreeClassifier_44298458fca7d1522d02', name='checkpointInterval', doc='set checkpoint interval (>= 1) or disable checkpoint (-1). E.g. 10 means that the cache will get checkpointed every 10 iterations. Note: this setting will be ignored if the checkpoint directory is not set in the SparkContext'): 10,
 Param(parent='DecisionTreeClassifier_44298458fca7d1522d02', name='featuresCol', doc='features column name'): 'featureVector',
 Param(parent='DecisionTreeClassifier_44298458fca7d1522d02', name='impurity', doc='Criterion used for information gain calculation (case-insensitive). Supported options: entropy, gini'): 'gini',
 Param(parent='DecisionTreeClassifier_44298458fca7d1522d02', na

In [31]:
# Validation metric (accuracy) for each parameter combination that was tried.
# Despite the name, this list holds only the metrics, not the parameters;
# presumably the entries follow the order of paramGrid (verify against the
# TrainValidationSplitModel documentation).
paramsAndMetrics = validatorModel.validationMetrics
paramsAndMetrics

[0.7480769230769231,
 0.7115384615384616,
 0.7269230769230769,
 0.7115384615384616,
 0.7288461538461538,
 0.7192307692307692,
 0.7269230769230769,
 0.7192307692307692]

In [32]:
# Accuracy of the best pipeline on the held-out test split.
multiclassEval.evaluate(bestModel.transform(testData))

0.720508166969147

#### Undoing the one-hot encoding

In [33]:
# Names of the four one-hot wilderness-area columns.
wildernessCols = ["Wilderness_Area_" + str(k) for k in range(4)]

In [34]:
# Collect the wilderness indicator columns into one vector column.
wildernessAssembler = VectorAssembler(outputCol="wilderness", inputCols=wildernessCols)

In [35]:
from pyspark.sql.functions import udf
from pyspark.sql.types import ArrayType, DoubleType, StructType

def _hot_index(vector):
    """Return the position of the single non-zero entry of a one-hot vector.

    Parameters: vector -- an ML vector exposing toArray().
    Returns: the index of the non-zero entry, as a float.
    Raises: ValueError if the vector does not have exactly one non-zero entry.
    """
    hot = vector.toArray().nonzero()[0]
    if hot.size != 1:
        raise ValueError(
            "expected exactly one non-zero entry, found %d" % hot.size)
    # Index the element explicitly: calling float() on a 1-element ndarray is
    # deprecated since NumPy 1.25.
    return float(hot[0])


# UDF that turns a one-hot vector column into a double-valued category index.
unhotudf = udf(_hot_index, DoubleType())

In [36]:
# Assemble the wilderness indicators, drop the original one-hot columns and
# replace the vector with the index of its hot entry.
wildernessVectors = wildernessAssembler.transform(data)
withWilderness = (
    wildernessVectors
    .drop(*wildernessCols)
    .withColumn("wilderness", unhotudf(wildernessVectors['wilderness']))
)
withWilderness.take(1)

[Row(Elevation=3107, Aspect=356, Slope=12, Horizontal_Distance_To_Hydrology=283, Vertical_Distance_To_Hydrology=89, Horizontal_Distance_To_Roadways=4855, Hillshade_9am=201, Hillshade_Noon=219, Hillshade_3pm=155, Horizontal_Distance_To_Fire_Points=2069, Soil_Type_0=0, Soil_Type_1=0, Soil_Type_2=0, Soil_Type_3=0, Soil_Type_4=0, Soil_Type_5=0, Soil_Type_6=0, Soil_Type_7=0, Soil_Type_8=0, Soil_Type_9=0, Soil_Type_10=0, Soil_Type_11=0, Soil_Type_12=0, Soil_Type_13=0, Soil_Type_14=0, Soil_Type_15=0, Soil_Type_16=0, Soil_Type_17=0, Soil_Type_18=0, Soil_Type_19=0, Soil_Type_20=0, Soil_Type_21=0, Soil_Type_22=0, Soil_Type_23=0, Soil_Type_24=0, Soil_Type_25=0, Soil_Type_26=0, Soil_Type_27=0, Soil_Type_28=1, Soil_Type_29=0, Soil_Type_30=0, Soil_Type_31=0, Soil_Type_32=0, Soil_Type_33=0, Soil_Type_34=0, Soil_Type_35=0, Soil_Type_36=0, Soil_Type_37=0, Soil_Type_38=0, Soil_Type_39=0, Cover_Type=1.0, wilderness=0.0)]

In [37]:
# Names of the forty one-hot soil-type columns.
soilCols = ["Soil_Type_" + str(k) for k in range(40)]


In [38]:
%%time
# Collect the soil indicator columns into one vector column.
soilAssembler = VectorAssembler(
        inputCols=soilCols,
        outputCol="soil")

# Give the assembled frame its own name instead of overwriting
# `withWilderness`. Rebinding the input made this cell non-idempotent: on a
# second run the assembler would be applied to a frame that already has a
# "soil" column and fail.
withSoilVector = soilAssembler.transform(withWilderness)
# Drop the one-hot soil columns and replace the vector with the hot index.
unencData = withSoilVector\
    .drop(*soilCols)\
    .withColumn("soil", unhotudf(withSoilVector['soil']))
unencData.take(1)

Wall time: 2.16 s


#### Decision Tree Classifier with unencoded data

In [39]:
# 90/10 train/test split of the un-encoded data; seeded so the split and the
# metrics derived from it are repeatable.
(unencTrainData, unencTestData) = unencData.randomSplit([0.9, 0.1], seed=42)

In [40]:
from pyspark.ml.feature import VectorIndexer

# Feature columns are all columns except the label.
inputCols = [column for column in unencTrainData.columns if column != 'Cover_Type']

# Stage 1: pack the features into a vector.
assembler = VectorAssembler(outputCol="featureVector", inputCols=inputCols)

# Stage 2: index the vector; maxCategories=40 is the cut-off VectorIndexer
# uses to decide which features to treat as categorical.
indexer = VectorIndexer(
    inputCol="featureVector",
    outputCol="indexedVector",
    maxCategories=40,
)

# Stage 3: decision tree trained on the indexed vector.
classifier = DecisionTreeClassifier(
    labelCol="Cover_Type",
    featuresCol="indexedVector",
    predictionCol="prediction",
    seed=42,
)

pipeline = Pipeline(stages=[assembler, indexer, classifier])

#### Random Forest Classifier

In [41]:
from pyspark.ml.classification import RandomForestClassifier
# Replace the decision tree with a random forest. It reads the
# "indexedVector" column produced by the VectorIndexer stage; the assembler
# and indexer stages are reused unchanged.
classifier = RandomForestClassifier(
    seed=42,
    maxBins=40,
    labelCol="Cover_Type",
    featuresCol="indexedVector",
    predictionCol="prediction")
pipeline = Pipeline(stages=[assembler, indexer, classifier])

In [42]:
# Hyperparameter grid for the forest: 2 x 2 = 4 combinations.
paramGrid = (
    ParamGridBuilder()
    .addGrid(classifier.minInfoGain, [0.0, 0.05])
    .addGrid(classifier.numTrees, [1, 10])
    .build()
)

In [43]:
# Accuracy evaluator used to rank the parameter combinations.
multiclassEval = MulticlassClassificationEvaluator(
    metricName="accuracy",
    predictionCol="prediction",
    labelCol="Cover_Type",
)

In [44]:
# Seeded train/validation split (90/10) over the random-forest pipeline.
validator = TrainValidationSplit(
    estimator=pipeline,
    estimatorParamMaps=paramGrid,
    evaluator=multiclassEval,
    trainRatio=0.9,
    seed=42,
)

In [45]:
%%time
# Fit every parameter combination and keep the best pipeline.
validatorModel = validator.fit(unencTrainData)
bestModel = validatorModel.bestModel

# The last stage of the pipeline is the fitted forest; print its parameters.
forestModel = bestModel.stages[-1]
forestParams = forestModel.extractParamMap()
print(forestParams)

{Param(parent='RandomForestClassifier_4a15a54a4df658497bc8', name='cacheNodeIds', doc='If false, the algorithm will pass trees to executors to match instances with nodes. If true, the algorithm will cache node IDs for each instance. Caching can speed up training of deeper trees.'): False, Param(parent='RandomForestClassifier_4a15a54a4df658497bc8', name='checkpointInterval', doc='set checkpoint interval (>= 1) or disable checkpoint (-1). E.g. 10 means that the cache will get checkpointed every 10 iterations. Note: this setting will be ignored if the checkpoint directory is not set in the SparkContext'): 10, Param(parent='RandomForestClassifier_4a15a54a4df658497bc8', name='featureSubsetStrategy', doc='The number of features to consider for splits at each tree node. Supported options: auto, all, onethird, sqrt, log2, (0.0-1.0], [1-n].'): 'auto', Param(parent='RandomForestClassifier_4a15a54a4df658497bc8', name='featuresCol', doc='features column name'): 'indexedVector', Param(parent='Rando

In [46]:
# Number of trees in the selected forest (read as an attribute, not called).
forestModel.getNumTrees

1

In [47]:
# Pair each input column with its importance and list them, most important first.
sorted(
    zip(inputCols, forestModel.featureImportances),
    key=lambda pair: pair[1],
    reverse=True,
)

[('Elevation', 0.7490484478495304),
 ('soil', 0.14747148426805073),
 ('Horizontal_Distance_To_Roadways', 0.028555255238566488),
 ('Horizontal_Distance_To_Hydrology', 0.027207391866001514),
 ('Hillshade_Noon', 0.018159766360003802),
 ('Hillshade_9am', 0.012692986886932833),
 ('wilderness', 0.008479764416715177),
 ('Horizontal_Distance_To_Fire_Points', 0.004904580279106281),
 ('Slope', 0.0034803228350927216),
 ('Aspect', 0.0),
 ('Vertical_Distance_To_Hydrology', 0.0),
 ('Hillshade_3pm', 0.0)]

In [48]:
# Accuracy of the best pipeline on the held-out un-encoded test split.
testPredictions = bestModel.transform(unencTestData)
testAccuracy = multiclassEval.evaluate(testPredictions)
testAccuracy

0.7046728971962617

In [None]:
# Apply the best pipeline to the test rows with the label column removed and
# display the resulting predictions.
bestModel.transform(unencTestData.drop("Cover_Type")).show()