In [0]:
# You have been hired by a dog food company. We want to find out why some batches of their dog food are spoiling much quicker than intended.
# Their preservative mix is made of 5 chemicals (4 preservatives plus a filler), in amounts that can vary a lot. Which chemical has the strongest effect?
# The company first mixes up a batch of preservative that contains 4 different preservative chemicals (A, B, C, D), and the batch is then completed with a 'filler' chemical.
# The company believes that one of the preservatives (A, B, C, D) is causing the problem, but we need to figure out which one.
# We need to find which parameter has the most predictive power for spoilage.

In [0]:
from pyspark.sql import SparkSession
# Start (or reuse) the Spark session used for this analysis.
spark = (
    SparkSession.builder
    .appName('pres_chem')
    .getOrCreate()
)

# Load the dog-food batch measurements: the first CSV row is the header and
# column types are inferred from the values.
# NOTE(review): the DBFS path embeds a personal e-mail address; consider moving
# the file to a neutral shared location before publishing this notebook.
data = spark.read.csv(
    'dbfs:/FileStore/shared_uploads/gkantirisrafael@gmail.com/dog_food.csv',
    header=True,
    inferSchema=True,
)

In [0]:
# Preview the first two rows to sanity-check the parsed header and inferred values.
data.show(2)

+---+---+----+---+-------+
|  A|  B|   C|  D|Spoiled|
+---+---+----+---+-------+
|  4|  2|12.0|  3|    1.0|
|  5|  6|12.0|  7|    1.0|
+---+---+----+---+-------+
only showing top 2 rows



In [0]:
from pyspark.ml.feature import VectorAssembler
# List the column names so the feature columns can be picked for the assembler.
data.columns

Out[10]: ['A', 'B', 'C', 'D', 'Spoiled']

In [0]:
# Pack the four preservative columns into a single vector column named 'features'.
# The order of this list fixes the feature indices: 0=A, 1=B, 2=C, 3=D.
feature_cols = ['A', 'B', 'C', 'D']
assembler = VectorAssembler(
    inputCols=feature_cols,
    outputCol='features',
)

In [0]:
# Append the assembled 'features' vector column to the original DataFrame.
output = assembler.transform(data)

In [0]:
from pyspark.ml.classification import RandomForestClassifier
# Random forests are stochastic (bootstrap sampling and random feature subsets),
# so pin the seed to keep the reported feature importances reproducible on rerun.
# 'Spoiled' is the label column; 'features' is the assembled vector column.
rfc = RandomForestClassifier(labelCol='Spoiled', featuresCol='features', seed=42)

In [0]:
# Confirm the column types and that the 'features' vector column was added.
output.printSchema()

root
 |-- A: integer (nullable = true)
 |-- B: integer (nullable = true)
 |-- C: double (nullable = true)
 |-- D: integer (nullable = true)
 |-- Spoiled: double (nullable = true)
 |-- features: vector (nullable = true)



In [0]:
# Keep only what the classifier consumes: the 'Spoiled' label column and the
# assembled 'features' vector column.
model_columns = ['Spoiled', 'features']
final_data = output.select(*model_columns)

In [0]:
# Inspect the label/feature pairs that will be fed to the classifier (first 20 rows).
final_data.show()

+-------+-------------------+
|Spoiled|           features|
+-------+-------------------+
|    1.0| [4.0,2.0,12.0,3.0]|
|    1.0| [5.0,6.0,12.0,7.0]|
|    1.0| [6.0,2.0,13.0,6.0]|
|    1.0| [4.0,2.0,12.0,1.0]|
|    1.0| [4.0,2.0,12.0,3.0]|
|    1.0|[10.0,3.0,13.0,9.0]|
|    1.0| [8.0,5.0,14.0,5.0]|
|    1.0| [5.0,8.0,12.0,8.0]|
|    1.0| [6.0,5.0,12.0,9.0]|
|    1.0| [3.0,3.0,12.0,1.0]|
|    1.0| [9.0,8.0,11.0,3.0]|
|    1.0|[1.0,10.0,12.0,3.0]|
|    1.0|[1.0,5.0,13.0,10.0]|
|    1.0|[2.0,10.0,12.0,6.0]|
|    1.0|[1.0,10.0,11.0,4.0]|
|    1.0| [5.0,3.0,12.0,2.0]|
|    1.0| [4.0,9.0,11.0,8.0]|
|    1.0| [5.0,1.0,11.0,1.0]|
|    1.0|[4.0,9.0,12.0,10.0]|
|    1.0| [5.0,8.0,10.0,9.0]|
+-------+-------------------+
only showing top 20 rows



In [0]:
# Fit the forest on the full dataset. No hold-out split is made here, because the
# fitted model is only used to read feature importances, not to score predictions.
rfc_model = rfc.fit(final_data)

In [0]:
# Peek at a single row to see how a label and its feature vector are stored.
final_data.head(1)

Out[20]: [Row(Spoiled=1.0, features=DenseVector([4.0, 2.0, 12.0, 3.0]))]

In [0]:
# Per-feature importance from the fitted forest. Vector indices follow the
# assembler's inputCols order: 0=A, 1=B, 2=C, 3=D.
rfc_model.featureImportances

Out[18]: SparseVector(4, {0: 0.0189, 1: 0.0141, 2: 0.9384, 3: 0.0287})

In [0]:
# Feature index 2 (column C) is by far the most important feature (importance 0.9384), so chemical C is the most likely cause of the dog food spoiling.