## Predict if batch is going to spoil based on the percentage of each chemicals (A, B, C, D)

In [38]:
import findspark

findspark.init('/home/guipleite/spark-3.0.2-bin-hadoop3.2')

from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler
from pyspark.ml import Pipeline
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator

spark = SparkSession.builder.appName('Tree_exe').getOrCreate()

df = spark.read.csv('dog_food.csv', inferSchema=True, header=True)

df.show(5)

+---+---+----+---+-------+
|  A|  B|   C|  D|Spoiled|
+---+---+----+---+-------+
|  4|  2|12.0|  3|    1.0|
|  5|  6|12.0|  7|    1.0|
|  6|  2|13.0|  6|    1.0|
|  4|  2|12.0|  1|    1.0|
|  4|  2|12.0|  3|    1.0|
+---+---+----+---+-------+
only showing top 5 rows



In [39]:
df.printSchema()

root
 |-- A: integer (nullable = true)
 |-- B: integer (nullable = true)
 |-- C: double (nullable = true)
 |-- D: integer (nullable = true)
 |-- Spoiled: double (nullable = true)



In [40]:
clean_data = df.na.drop() # Removing rows with missing data

assembler = VectorAssembler(inputCols=['A',
                                       'B',
                                       'C',
                                       'D'],
                            outputCol='features'
                            ) 
output = assembler.transform(clean_data)

In [41]:
rfc =  RandomForestClassifier(labelCol='Spoiled', featuresCol='features')

final_data = output.select('features','Spoiled')

train_data, test_data = final_data.randomSplit([0.8,0.2])

In [42]:
rfc_model = rfc.fit(train_data)

In [43]:
results = rfc_model.transform(test_data)

eval = BinaryClassificationEvaluator(rawPredictionCol='prediction', labelCol='Spoiled')

In [44]:
eval.evaluate(results)

0.9926470588235294

In [45]:
rfc_model.featureImportances

SparseVector(4, {0: 0.0155, 1: 0.0192, 2: 0.9409, 3: 0.0244})

The cell above shows that the feature C (with idx 2) has a much larger importance then the other chemicals meaning it contributes for the spoilage of the food a lot more then the others