In [0]:
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!rm -rf spark-2.3.2-bin-hadoop2.7*
!wget -q http://apache.osuosl.org/spark/spark-2.3.2/spark-2.3.2-bin-hadoop2.7.tgz
!tar xf spark-2.3.2-bin-hadoop2.7.tgz
!pip install -q pyspark

import os
# Tell PySpark where the JDK and the unpacked Spark distribution live.
os.environ.update({
    "JAVA_HOME": "/usr/lib/jvm/java-8-openjdk-amd64",
    "SPARK_HOME": "/content/spark-2.3.2-bin-hadoop2.7",
})

In [0]:
!rm -rf covtype*
!wget -q https://archive.ics.uci.edu/ml/machine-learning-databases/covtype/covtype.data.gz
!gzip -d covtype.data.gz

In [1]:
# Sanity check: the dataset file must exist before Spark tries to read it.
!ls covtype.data

covtype.data


In [0]:
from pyspark.sql import SparkSession
# Reuse the active session if one exists; otherwise start one named "Ch04".
session_builder = SparkSession.builder.appName("Ch04")
spark = session_builder.getOrCreate()
# Keep a handle on the underlying SparkContext as well.
sc = spark.sparkContext

In [0]:
from pyspark.sql.types import StructType, StructField, IntegerType, DoubleType

In [0]:
# Column names for the headerless CSV, in file order: ten base columns,
# four wilderness-area columns, forty soil-type columns, then the label.
colNames = [
    "Elevation",
    "Aspect",
    "Slope",
    "Horizontal_Distance_To_Hydrology",
    "Vertical_Distance_To_Hydrology",
    "Horizontal_Distance_To_Roadways",
    "Hillshade_9am",
    "Hillshade_Noon",
    "Hillshade_3pm",
    "Horizontal_Distance_To_Fire_Points",
]
colNames.extend("Wilderness_Area_{}".format(idx) for idx in range(4))
colNames.extend("Soil_Type_{}".format(idx) for idx in range(40))
colNames.append("Cover_Type")

In [0]:
# Explicit schema: every column is a nullable integer except "Cover_Type",
# which is declared as a double.
schema = StructType([
    StructField(
        name,
        DoubleType() if name == "Cover_Type" else IntegerType(),
        True,
    )
    for name in colNames
])

In [0]:
# The file has no header row, so names and types come from `schema`.
csv_reader = spark.read.schema(schema)
data = csv_reader.csv("covtype.data", header=False)

In [0]:
from pyspark.ml.feature import VectorAssembler

In [0]:
# 90/10 train/test split. The fixed seed makes the split repeatable across
# Restart & Run All, so downstream metrics can be compared between runs.
(trainData, testData) = data.randomSplit([0.9, 0.1], seed=42)

In [0]:
# Every column except the label goes into the feature vector.
inputCols = [col for col in trainData.columns if col != 'Cover_Type']

In [10]:
# Pack the individual feature columns into a single vector column.
assembler = (
    VectorAssembler()
    .setInputCols(inputCols)
    .setOutputCol("featureVector")
)
assembledTrainData = assembler.transform(trainData)
# Preview the assembled vectors without truncating the column.
feature_preview = assembledTrainData.select("featureVector")
feature_preview.show(truncate=False)

+----------------------------------------------------------------------------------------------------+
|featureVector                                                                                       |
+----------------------------------------------------------------------------------------------------+
|(54,[0,1,2,3,4,5,6,7,8,9,13,15],[1859.0,18.0,12.0,67.0,11.0,90.0,211.0,215.0,139.0,792.0,1.0,1.0])  |
|(54,[0,1,2,3,4,5,6,7,8,9,13,15],[1860.0,18.0,13.0,95.0,15.0,90.0,210.0,213.0,138.0,780.0,1.0,1.0])  |
|(54,[0,1,2,3,4,5,6,7,8,9,13,15],[1861.0,35.0,14.0,60.0,11.0,85.0,218.0,209.0,124.0,832.0,1.0,1.0])  |
|(54,[0,1,2,3,4,5,6,7,8,9,13,15],[1863.0,37.0,17.0,120.0,18.0,90.0,217.0,202.0,115.0,769.0,1.0,1.0]) |
|(54,[0,1,2,3,4,5,6,7,8,9,13,15],[1867.0,20.0,15.0,108.0,19.0,120.0,208.0,206.0,132.0,808.0,1.0,1.0])|
|(54,[0,1,2,3,4,5,6,7,8,9,13,15],[1868.0,27.0,16.0,67.0,17.0,95.0,212.0,204.0,125.0,859.0,1.0,1.0])  |
|(54,[0,1,2,3,4,5,6,7,8,9,13,18],[1871.0,22.0,22.0,60.0,12.0,85.0,200.0,1

In [0]:
from pyspark.ml.classification import LogisticRegression
# Configure the logistic-regression classifier, then fit it on the assembled
# training rows.
lr_params = dict(
    featuresCol="featureVector",
    labelCol="Cover_Type",
    predictionCol="prediction",
)
classifier = LogisticRegression(**lr_params)
model = classifier.fit(assembledTrainData)

In [17]:
# Score the assembled training rows and display label, predicted class and
# per-class probabilities side by side.
# NOTE(review): this scores the training set, so any metric computed from
# `predictions` is a training metric; `testData` is not used in the visible
# cells — confirm whether held-out evaluation was intended.
predictions = model.transform(assembledTrainData)
shown_columns = ["Cover_Type", "prediction", "probability"]
predictions.select(shown_columns).show(truncate=False)

+----------+----------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|Cover_Type|prediction|probability                                                                                                                                                               |
+----------+----------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|3.0       |3.0       |[1.819358700341259E-8,4.713163381774143E-8,0.0032439642302469584,0.818949260310703,0.08837367252511137,4.237199823437059E-4,0.08900929700192,2.062445410373261E-8]        |
|3.0       |3.0       |[1.7089546766417245E-8,4.4374444901765384E-8,0.0032329951603223424,0.8382781379703986,0.07608409443920665,3.932641124816507E-4,0.08201142805745425,1.8796144789544508E-8] |
|3.0       |3.0       |[1

In [0]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
# Evaluator that compares the label column against the predicted class.
evaluator_columns = {"labelCol": "Cover_Type", "predictionCol": "prediction"}
evaluator = MulticlassClassificationEvaluator(**evaluator_columns)

In [19]:
# Switch the shared evaluator to accuracy, then score the predictions.
evaluator.setMetricName("accuracy")
evaluator.evaluate(predictions)

0.7137373922821834

In [20]:
# Switch the shared evaluator to F1, then score the same predictions.
evaluator.setMetricName("f1")
evaluator.evaluate(predictions)

0.7007313885972423