# Regression with Random Forests

In [1]:
# findspark has to run *before* pyspark is imported: init() locates the Spark
# installation and adds it to sys.path, which is what makes `import pyspark`
# resolvable on machines where pyspark is not pip-installed.
import findspark

findspark.init()

import pyspark
from pyspark.sql import SparkSession

# Reuse an existing session when the kernel already has one, otherwise start it.
spark = SparkSession.builder.appName("regressionwithrandomforests").getOrCreate()

24/04/03 14:44:10 WARN Utils: Your hostname, pop-os resolves to a loopback address: 127.0.1.1; using 192.168.0.108 instead (on interface wlo1)
24/04/03 14:44:10 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/04/03 14:44:11 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [2]:
from pyspark.ml.feature import RFormula, Normalizer
from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml.evaluation import RegressionEvaluator

In [3]:
# Read the semicolon-delimited cars file; the first row is the header and
# column types are inferred from the data.
cars = spark.read.csv(
    "../0_data/Carros.csv",
    sep=";",
    header=True,
    inferSchema=True,
)

# Row count first, then a short preview of the raw columns.
print(cars.count())
cars.show(n=5)

32
+-------+---------+-----------+---------------+----+-----+---------+-----------+-------+-----------+---+
|Consumo|Cilindros|Cilindradas|RelEixoTraseiro|Peso|Tempo|TipoMotor|Transmissao|Marchas|Carburadors| HP|
+-------+---------+-----------+---------------+----+-----+---------+-----------+-------+-----------+---+
|     21|        6|        160|             39| 262| 1646|        0|          1|      4|          4|110|
|     21|        6|        160|             39|2875| 1702|        0|          1|      4|          4|110|
|    228|        4|        108|            385| 232| 1861|        1|          1|      4|          1| 93|
|    214|        6|        258|            308|3215| 1944|        1|          0|      3|          1|110|
|    187|        8|        360|            315| 344| 1702|        0|          0|      3|          2|175|
+-------+---------+-----------+---------------+----+-----+---------+-----------+-------+-----------+---+
only showing top 5 rows



In [4]:
# HP is the label; Consumo, Cilindros and Cilindradas are assembled into a
# single feature vector. Output column names are set explicitly because the
# later cells refer to them.
r_formula = RFormula(
    formula="HP ~ Consumo + Cilindros + Cilindradas",
    labelCol="dependant",
    featuresCol="independant",
)

# Fit the formula on the full dataset and append the two new columns to it.
r_formula_model = r_formula.fit(cars)
cars_rf = r_formula_model.transform(cars)
cars_rf.show(truncate=False)

+-------+---------+-----------+---------------+----+-----+---------+-----------+-------+-----------+---+------------------+---------+
|Consumo|Cilindros|Cilindradas|RelEixoTraseiro|Peso|Tempo|TipoMotor|Transmissao|Marchas|Carburadors|HP |independant       |dependant|
+-------+---------+-----------+---------------+----+-----+---------+-----------+-------+-----------+---+------------------+---------+
|21     |6        |160        |39             |262 |1646 |0        |1          |4      |4          |110|[21.0,6.0,160.0]  |110.0    |
|21     |6        |160        |39             |2875|1702 |0        |1          |4      |4          |110|[21.0,6.0,160.0]  |110.0    |
|228    |4        |108        |385            |232 |1861 |1        |1          |4      |1          |93 |[228.0,4.0,108.0] |93.0     |
|214    |6        |258        |308            |3215|1944 |1        |0          |3      |1          |110|[214.0,6.0,258.0] |110.0    |
|187    |8        |360        |315            |344 |1702 |0   

In [5]:
# Row-wise L1 normalization (p=1.0): each feature vector is rescaled so its
# absolute values sum to 1, e.g. [21, 6, 160] -> [0.112, 0.032, 0.856].
norm = Normalizer(
    p=1.0,
    inputCol="independant",
    outputCol="independant_norm",
)
cars_norm = norm.transform(cars_rf)

In [7]:
# Show the label next to the raw and normalized feature vectors for a quick
# visual check of the normalization step.
(
    cars_norm
    .select("dependant", "independant", "independant_norm")
    .show(n=10, truncate=False)
)

+---------+------------------+--------------------------------------------------------------+
|dependant|independant       |independant_norm                                              |
+---------+------------------+--------------------------------------------------------------+
|110.0    |[21.0,6.0,160.0]  |[0.11229946524064172,0.03208556149732621,0.8556149732620321]  |
|110.0    |[21.0,6.0,160.0]  |[0.11229946524064172,0.03208556149732621,0.8556149732620321]  |
|93.0     |[228.0,4.0,108.0] |[0.6705882352941176,0.011764705882352941,0.3176470588235294]  |
|110.0    |[214.0,6.0,258.0] |[0.4476987447698745,0.012552301255230125,0.5397489539748954]  |
|175.0    |[187.0,8.0,360.0] |[0.33693693693693694,0.014414414414414415,0.6486486486486487] |
|105.0    |[181.0,6.0,225.0] |[0.4393203883495146,0.014563106796116505,0.5461165048543689]  |
|245.0    |[143.0,8.0,360.0] |[0.27984344422700586,0.015655577299412915,0.7045009784735812] |
|62.0     |[244.0,4.0,1467.0]|[0.1422740524781341,0.00233236

In [13]:
# 70/30 train/test split. The seed makes the split repeatable across runs;
# without it every execution draws a different partition and the RMSE reported
# below cannot be reproduced, even though the forest itself is seeded.
cars_train, cars_test = cars_norm.randomSplit([0.7, 0.3], seed=42)

# randomSplit weights are only approximate, so print the actual sizes.
print(cars_train.count(), cars_test.count())

25 7


In [14]:
# The estimator gets its own name: `cars_rf` already holds the
# RFormula-transformed DataFrame that the Normalizer cell reads, and rebinding
# it to an estimator would break that cell if it were re-run afterwards.
rf_regressor = RandomForestRegressor(
    featuresCol="independant_norm",
    labelCol="dependant",
    maxDepth=10,
    numTrees=500,
    seed=42,  # fixed seed so the trained forest is repeatable
)

# Train on the training partition only; the test partition is kept for scoring.
cars_rf_model = rf_regressor.fit(cars_train)

24/04/03 14:49:53 WARN DecisionTreeMetadata: DecisionTree reducing maxBins from 32 to 25 (= number of training instances)
24/04/03 14:49:54 WARN DAGScheduler: Broadcasting large task binary with size 1240.4 KiB
24/04/03 14:49:54 WARN DAGScheduler: Broadcasting large task binary with size 1127.3 KiB


In [15]:
# Score the held-out rows; transform() appends a "prediction" column.
predict = cars_rf_model.transform(cars_test)

# Actual vs. predicted values, side by side.
(
    predict
    .select("dependant", "prediction")
    .show(n=10, truncate=False)
)



+---------+----------+
|dependant|prediction|
+---------+----------+
|150.0    |173.19    |
|264.0    |177.7     |
|175.0    |150.664   |
|175.0    |132.876   |
|110.0    |115.156   |
|97.0     |106.628   |
|65.0     |108.602   |
+---------+----------+



In [18]:
# Root-mean-squared error between the true label and the model's prediction
# on the test partition.
rmse_evaluator = RegressionEvaluator(
    metricName="rmse",
    labelCol="dependant",
    predictionCol="prediction",
)
rmse = rmse_evaluator.evaluate(predict)
print(f"RMSE: {rmse:.2f}")

RMSE: 42.04
