In [1]:
import os
import findspark
# findspark.init is called before any pyspark import on purpose: it is handed
# the SPARK_HOME location so the pyspark imports below can be resolved.
# NOTE(review): os.getenv returns None when SPARK_HOME is unset -- confirm that
# findspark's auto-detection is acceptable in that case.
findspark.init(os.getenv('SPARK_HOME'))
from pyspark.sql import SparkSession
from pyspark.ml import Pipeline
from pyspark.ml.classification import RandomForestClassifier, DecisionTreeClassifier, GBTClassifier
# Aliased to `mce`; used further down to build the F1 evaluator.
from pyspark.ml.evaluation import MulticlassClassificationEvaluator as mce
from pyspark.ml.feature import VectorAssembler, StringIndexer

In [2]:
# Reuse the active Spark session if one exists, otherwise start a new one
# registered under this notebook's application name.
spark = (
    SparkSession.builder
    .appName('decision_tree_exercice')
    .getOrCreate()
)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


22/08/25 19:37:10 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [9]:
# Load the CSV with its first row as column names and let Spark infer the
# column types, then preview the first rows.
data = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv("dog_food.csv")
)
data.show()

+---+---+----+---+-------+
|  A|  B|   C|  D|Spoiled|
+---+---+----+---+-------+
|  4|  2|12.0|  3|    1.0|
|  5|  6|12.0|  7|    1.0|
|  6|  2|13.0|  6|    1.0|
|  4|  2|12.0|  1|    1.0|
|  4|  2|12.0|  3|    1.0|
| 10|  3|13.0|  9|    1.0|
|  8|  5|14.0|  5|    1.0|
|  5|  8|12.0|  8|    1.0|
|  6|  5|12.0|  9|    1.0|
|  3|  3|12.0|  1|    1.0|
|  9|  8|11.0|  3|    1.0|
|  1| 10|12.0|  3|    1.0|
|  1|  5|13.0| 10|    1.0|
|  2| 10|12.0|  6|    1.0|
|  1| 10|11.0|  4|    1.0|
|  5|  3|12.0|  2|    1.0|
|  4|  9|11.0|  8|    1.0|
|  5|  1|11.0|  1|    1.0|
|  4|  9|12.0| 10|    1.0|
|  5|  8|10.0|  9|    1.0|
+---+---+----+---+-------+
only showing top 20 rows



In [10]:
# Summary statistics (count, mean, stddev, min, max) for every column.
(
    data
    .describe()
    .show()
)

+-------+------------------+------------------+------------------+------------------+-------------------+
|summary|                 A|                 B|                 C|                 D|            Spoiled|
+-------+------------------+------------------+------------------+------------------+-------------------+
|  count|               490|               490|               490|               490|                490|
|   mean|  5.53469387755102| 5.504081632653061| 9.126530612244897| 5.579591836734694| 0.2857142857142857|
| stddev|2.9515204234399057|2.8537966089662063|2.0555451971054275|2.8548369309982857|0.45221563164613465|
|    min|                 1|                 1|               5.0|                 1|                0.0|
|    max|                10|                10|              14.0|                10|                1.0|
+-------+------------------+------------------+------------------+------------------+-------------------+



In [11]:
# List the column names so the feature columns can be picked for the assembler.
data.columns

['A', 'B', 'C', 'D', 'Spoiled']

In [12]:
# Pack the four input columns A-D into the single vector column that
# Spark ML estimators read their features from.
assembly = VectorAssembler(inputCols=['A', 'B', 'C', 'D'], outputCol="features")
# This cell rebinds `data`, so on a second run `data` already carries a
# "features" column and VectorAssembler refuses to overwrite it. Dropping the
# column first (a no-op when it is absent) makes the cell safe to re-run.
data = assembly.transform(data.drop("features"))
data.show()

+---+---+----+---+-------+-------------------+
|  A|  B|   C|  D|Spoiled|           features|
+---+---+----+---+-------+-------------------+
|  4|  2|12.0|  3|    1.0| [4.0,2.0,12.0,3.0]|
|  5|  6|12.0|  7|    1.0| [5.0,6.0,12.0,7.0]|
|  6|  2|13.0|  6|    1.0| [6.0,2.0,13.0,6.0]|
|  4|  2|12.0|  1|    1.0| [4.0,2.0,12.0,1.0]|
|  4|  2|12.0|  3|    1.0| [4.0,2.0,12.0,3.0]|
| 10|  3|13.0|  9|    1.0|[10.0,3.0,13.0,9.0]|
|  8|  5|14.0|  5|    1.0| [8.0,5.0,14.0,5.0]|
|  5|  8|12.0|  8|    1.0| [5.0,8.0,12.0,8.0]|
|  6|  5|12.0|  9|    1.0| [6.0,5.0,12.0,9.0]|
|  3|  3|12.0|  1|    1.0| [3.0,3.0,12.0,1.0]|
|  9|  8|11.0|  3|    1.0| [9.0,8.0,11.0,3.0]|
|  1| 10|12.0|  3|    1.0|[1.0,10.0,12.0,3.0]|
|  1|  5|13.0| 10|    1.0|[1.0,5.0,13.0,10.0]|
|  2| 10|12.0|  6|    1.0|[2.0,10.0,12.0,6.0]|
|  1| 10|11.0|  4|    1.0|[1.0,10.0,11.0,4.0]|
|  5|  3|12.0|  2|    1.0| [5.0,3.0,12.0,2.0]|
|  4|  9|11.0|  8|    1.0| [4.0,9.0,11.0,8.0]|
|  5|  1|11.0|  1|    1.0| [5.0,1.0,11.0,1.0]|
|  4|  9|12.0

In [None]:
# 80/20 train/test split. The seed is fixed so the same rows land in each
# partition on every run; without it the split (and every metric computed
# from it below) changes each time the notebook is executed.
train, test = data.randomSplit([0.8, 0.2], seed=42)

In [16]:
# Random forest predicting the "Spoiled" label from the assembled feature
# vector. The seed pins the forest's internal row/feature sampling so the
# fitted model and its feature importances are reproducible across runs.
rfc = RandomForestClassifier(labelCol="Spoiled", featuresCol="features", seed=42)
rfc_model = rfc.fit(train)

In [18]:
# Score the held-out rows with the fitted forest; `result` is what the F1
# evaluator is run on further down.
result = rfc_model.transform(test)
# Preview the scored rows.
result.show()

+---+---+----+---+-------+-------------------+--------------------+--------------------+----------+
|  A|  B|   C|  D|Spoiled|           features|       rawPrediction|         probability|prediction|
+---+---+----+---+-------+-------------------+--------------------+--------------------+----------+
|  1|  1|12.0|  2|    1.0| [1.0,1.0,12.0,2.0]|[0.79316036458216...|[0.03965801822910...|       1.0|
|  1|  3| 8.0|  3|    0.0|  [1.0,3.0,8.0,3.0]|[19.6943232411074...|[0.98471616205537...|       0.0|
|  1|  8| 6.0|  6|    0.0|  [1.0,8.0,6.0,6.0]|[19.7509377267621...|[0.98754688633810...|       0.0|
|  1|  9|11.0| 10|    1.0|[1.0,9.0,11.0,10.0]|[1.01561609045007...|[0.05078080452250...|       1.0|
|  2|  1| 7.0|  9|    0.0|  [2.0,1.0,7.0,9.0]|[19.6450315365888...|[0.98225157682944...|       0.0|
|  2|  1| 8.0|  9|    0.0|  [2.0,1.0,8.0,9.0]|[19.6450315365888...|[0.98225157682944...|       0.0|
|  2|  1| 9.0|  1|    0.0|  [2.0,1.0,9.0,1.0]|[19.4154734319472...|[0.97077367159736...|       0.0|


In [28]:
# Evaluator that scores predictions against the "Spoiled" label using F1.
f1_eval = mce(
    labelCol='Spoiled',
    metricName="f1",
)

In [31]:
# Report the F1 score of the forest on the held-out predictions.
print(f"f1_score = {f1_eval.evaluate(result)}")

f1_score = 0.9775240384615383


In [42]:
# Per-feature importance from the fitted forest; vector positions follow the
# order of the assembler's inputCols (A, B, C, D).
rfc_model.featureImportances

SparseVector(4, {0: 0.0244, 1: 0.0231, 2: 0.9322, 3: 0.0203})

In [43]:
# Conclusion drawn from the feature importances: index 2 (column "C") carries
# almost all of the weight. Plain string literal -- the original f-string had no
# placeholders -- and the column letter is capitalised to match both the
# DataFrame column name and the output recorded for this cell.
"O preservativo quimico 2(C) é o que mais influencia no resultado"

'O preservativo quimico 2(C) é o que mais influencia no resultado'