In [1]:
import findspark

In [2]:
import os

# Prefer an explicitly configured SPARK_HOME so the notebook can run on other
# machines; fall back to the original local install path when it is not set.
findspark.init(os.environ.get('SPARK_HOME',
                              '/home/dimitris13/spark-3.0.0-bin-hadoop3.2'))

In [3]:
import pyspark 

In [4]:
from pyspark.sql import SparkSession

In [5]:
# Create the Spark session for this notebook, or reuse one that already exists.
spark = (
    SparkSession.builder
    .appName('foodcomp')
    .getOrCreate()
)

In [6]:
# Read the dog-food CSV, taking column names from the header row and letting
# Spark infer each column's type.
data = spark.read.csv(
    '/home/dimitris13/Downloads/dog_food.csv',
    header=True,
    inferSchema=True,
)

In [16]:
# Preview the first 20 rows of the loaded data.
data.show(n=20)

+---+---+----+---+-------+
|  A|  B|   C|  D|Spoiled|
+---+---+----+---+-------+
|  4|  2|12.0|  3|    1.0|
|  5|  6|12.0|  7|    1.0|
|  6|  2|13.0|  6|    1.0|
|  4|  2|12.0|  1|    1.0|
|  4|  2|12.0|  3|    1.0|
| 10|  3|13.0|  9|    1.0|
|  8|  5|14.0|  5|    1.0|
|  5|  8|12.0|  8|    1.0|
|  6|  5|12.0|  9|    1.0|
|  3|  3|12.0|  1|    1.0|
|  9|  8|11.0|  3|    1.0|
|  1| 10|12.0|  3|    1.0|
|  1|  5|13.0| 10|    1.0|
|  2| 10|12.0|  6|    1.0|
|  1| 10|11.0|  4|    1.0|
|  5|  3|12.0|  2|    1.0|
|  4|  9|11.0|  8|    1.0|
|  5|  1|11.0|  1|    1.0|
|  4|  9|12.0| 10|    1.0|
|  5|  8|10.0|  9|    1.0|
+---+---+----+---+-------+
only showing top 20 rows



In [19]:
# List the column names of the loaded DataFrame.
data.columns

['A', 'B', 'C', 'D', 'Spoiled']

In [17]:
from pyspark.ml.feature import VectorAssembler

In [20]:
# Pack the four ingredient columns into a single vector column named 'features'.
assembler = VectorAssembler(
    outputCol='features',
    inputCols=['A', 'B', 'C', 'D'],
)

In [22]:
# Add the assembled feature vector to `data`.
# The assembler's output column is dropped first so that re-running this cell
# does not fail on an already-existing output column; DataFrame.drop is a
# no-op when the column is absent, so the first run is unaffected.
data = assembler.transform(data.drop(assembler.getOutputCol()))

In [24]:
# Keep only the assembled feature vector and the label column.
final_data = data.select(['features', 'Spoiled'])

In [25]:
# 70/30 train/test split. A fixed seed makes the split, and therefore the
# accuracy computed from it, repeatable across kernel restarts.
train, test = final_data.randomSplit([0.7, 0.3], seed=42)

In [26]:
from pyspark.ml.classification import LogisticRegression

In [27]:
# Logistic regression on the full feature vector, predicting 'Spoiled'.
logreg = LogisticRegression(
    featuresCol='features',
    labelCol='Spoiled',
)

In [28]:
# Fit the logistic-regression estimator on the training split.
logreg_model = logreg.fit(train)

In [29]:
# Apply the fitted model to the held-out test split.
logreg_preds = logreg_model.transform(test)

In [30]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [66]:
# Evaluator that scores predictions against the 'Spoiled' label by accuracy.
my_eval = MulticlassClassificationEvaluator(
    metricName='accuracy',
    labelCol='Spoiled',
)

In [67]:
# Accuracy of the full-feature (A, B, C, D) model on the test split.
my_eval.evaluate(logreg_preds)

0.9937888198757764

We notice that the Logistic Regression model has really good accuracy in predicting whether the dog food spoils or not.

In [39]:
from pyspark.sql.functions import corr

In [52]:
# Correlation between ingredient 'C' and the 'Spoiled' label over the whole
# dataset.
(
    data
    .select(corr('C', 'Spoiled'))
    .show()
)

+-----------------+
| corr(C, Spoiled)|
+-----------------+
|0.858620384785075|
+-----------------+



By noticing the high, positive correlation between the column 'C' and the labels, we suspect that this is the predictor affecting the outcome more than the other predictors. Hence, we apply another Logistic Regression classifier, with only predictors 'A', 'B' and 'D'. By evaluating this new model, we remark that the accuracy drops significantly.

In [53]:
# Second assembler that leaves out ingredient 'C': only A, B and D are packed
# into the 'features_ABD' vector column.
assembler_ABD = VectorAssembler(
    outputCol='features_ABD',
    inputCols=['A', 'B', 'D'],
)

In [54]:
# Add the 'features_ABD' vector column (A, B, D only) under a new name, leaving
# `data` itself unchanged.
data_ABD = assembler_ABD.transform(data)

In [56]:
# Keep only the reduced feature vector and the label column.
final_ABD = data_ABD.select(['features_ABD', 'Spoiled'])

In [57]:
# 70/30 train/test split for the reduced-feature data. A fixed seed makes the
# split, and therefore the accuracy computed from it, repeatable across
# kernel restarts.
train_ABD, test_ABD = final_ABD.randomSplit([0.7, 0.3], seed=42)

In [58]:
# Logistic regression restricted to the A/B/D feature vector.
logregABD = LogisticRegression(
    featuresCol='features_ABD',
    labelCol='Spoiled',
)

In [59]:
# Fit the reduced-feature estimator on its training split.
logregABD_model = logregABD.fit(train_ABD)

In [60]:
# Apply the reduced-feature model to its held-out test split.
logregABD_preds = logregABD_model.transform(test_ABD)

In [68]:
# Accuracy of the model trained without 'C', scored with the same accuracy
# evaluator used for the full-feature model.
my_eval.evaluate(logregABD_preds)

0.7320261437908496

Now we use the .featureImportances attribute to verify that predictor 'C' is the ingredient causing the dog food to spoil. Note that this attribute is available on tree-based methods.

In [62]:
from pyspark.ml.classification import RandomForestClassifier

In [63]:
# Random forest over the full A/B/C/D feature vector, predicting 'Spoiled'.
rfc = RandomForestClassifier(
    labelCol='Spoiled',
    featuresCol='features',
)

In [64]:
# Fit on the complete dataset (final_data) rather than the train split; this
# model is only inspected for feature importances below, not evaluated.
rfc_model = rfc.fit(final_data)

In [65]:
# Per-feature importances, indexed in the assembler's input order:
# 0 = 'A', 1 = 'B', 2 = 'C', 3 = 'D'.
rfc_model.featureImportances

SparseVector(4, {0: 0.0223, 1: 0.0295, 2: 0.9233, 3: 0.0248})