In [1]:
import findspark
findspark.init('/home/ubuntu/spark-2.4.4-bin-hadoop2.7/')

In [2]:
from pyspark.sql import SparkSession
spark  = SparkSession.builder.appName('dogfood').getOrCreate()

In [3]:
df = spark.read.csv('data/dog_food.csv', inferSchema=True, header=True)
df.printSchema()

root
 |-- A: integer (nullable = true)
 |-- B: integer (nullable = true)
 |-- C: double (nullable = true)
 |-- D: integer (nullable = true)
 |-- Spoiled: double (nullable = true)



In [4]:
df.summary().show()

+-------+------------------+------------------+------------------+------------------+-------------------+
|summary|                 A|                 B|                 C|                 D|            Spoiled|
+-------+------------------+------------------+------------------+------------------+-------------------+
|  count|               490|               490|               490|               490|                490|
|   mean|  5.53469387755102| 5.504081632653061| 9.126530612244897| 5.579591836734694| 0.2857142857142857|
| stddev|2.9515204234399057|2.8537966089662063|2.0555451971054275|2.8548369309982857|0.45221563164613465|
|    min|                 1|                 1|               5.0|                 1|                0.0|
|    25%|                 3|                 3|               8.0|                 3|                0.0|
|    50%|                 5|                 6|               9.0|                 6|                0.0|
|    75%|                 8|                 8

In [5]:
df.show()

+---+---+----+---+-------+
|  A|  B|   C|  D|Spoiled|
+---+---+----+---+-------+
|  4|  2|12.0|  3|    1.0|
|  5|  6|12.0|  7|    1.0|
|  6|  2|13.0|  6|    1.0|
|  4|  2|12.0|  1|    1.0|
|  4|  2|12.0|  3|    1.0|
| 10|  3|13.0|  9|    1.0|
|  8|  5|14.0|  5|    1.0|
|  5|  8|12.0|  8|    1.0|
|  6|  5|12.0|  9|    1.0|
|  3|  3|12.0|  1|    1.0|
|  9|  8|11.0|  3|    1.0|
|  1| 10|12.0|  3|    1.0|
|  1|  5|13.0| 10|    1.0|
|  2| 10|12.0|  6|    1.0|
|  1| 10|11.0|  4|    1.0|
|  5|  3|12.0|  2|    1.0|
|  4|  9|11.0|  8|    1.0|
|  5|  1|11.0|  1|    1.0|
|  4|  9|12.0| 10|    1.0|
|  5|  8|10.0|  9|    1.0|
+---+---+----+---+-------+
only showing top 20 rows



In [6]:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import RandomForestClassifier

In [8]:
assembler = VectorAssembler(inputCols=['A','B','C','D'], outputCol='features')
df = assembler.transform(df)

In [11]:
train_data, test_data = df.randomSplit([0.7, 0.3])

In [12]:
clf = RandomForestClassifier(featuresCol='features', labelCol='Spoiled')
model = clf.fit(train_data)

In [13]:
model.featureImportances

SparseVector(4, {0: 0.0212, 1: 0.0161, 2: 0.9512, 3: 0.0116})

## Feature 2 caused the spoilage

In [14]:
results = model.transform(test_data)

In [17]:
from pyspark.ml.evaluation import (BinaryClassificationEvaluator, MulticlassClassificationEvaluator)

In [19]:
bin_eval = BinaryClassificationEvaluator(rawPredictionCol='prediction', labelCol='Spoiled')
bin_eval.evaluate(results)

0.9673678532901834

In [21]:
acc_eval = MulticlassClassificationEvaluator(predictionCol='prediction', labelCol='Spoiled', metricName='accuracy')
acc_eval.evaluate(results)

0.9784172661870504

In [22]:
precision_eval = MulticlassClassificationEvaluator(predictionCol='prediction', labelCol='Spoiled', 
                                                   metricName='weightedPrecision')
precision_eval.evaluate(results)

0.978350067198988

In [23]:
recall_eval = MulticlassClassificationEvaluator(predictionCol='prediction', labelCol='Spoiled', 
                                                   metricName='weightedRecall')
recall_eval.evaluate(results)

0.9784172661870504

In [24]:
f1_eval = MulticlassClassificationEvaluator(predictionCol='prediction', labelCol='Spoiled', 
                                                   metricName='f1')
f1_eval.evaluate(results)

0.9783174073111381