# Generalized Linear Regression

In [1]:
import findspark

# findspark.init() locates the Spark installation and adds it to sys.path,
# so it must run BEFORE the first pyspark import; calling it afterwards
# (as the original cell did) cannot help the import succeed.
findspark.init()

import pyspark
from pyspark.sql import SparkSession

# Reuse an already-running session if there is one, otherwise start a new
# one registered under the application name "generalized".
spark = SparkSession.builder.appName("generalized").getOrCreate()

24/04/03 14:13:59 WARN Utils: Your hostname, pop-os resolves to a loopback address: 127.0.1.1; using 192.168.0.108 instead (on interface wlo1)
24/04/03 14:13:59 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/04/03 14:13:59 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [2]:
from pyspark.ml.feature import RFormula
from pyspark.ml.regression import GeneralizedLinearRegression
from pyspark.ml.evaluation import RegressionEvaluator

In [3]:
# Load the semicolon-delimited cars dataset: the first row is treated as the
# header and column types are inferred from the values.
# NOTE(review): the preview shows values such as Consumo = 21 next to 228 and
# Peso = 262 next to 2875, which looks like decimal separators were lost in
# the source file — confirm against the original data before trusting a model.
cars = (
    spark.read
    .options(header=True, inferSchema=True, sep=";")
    .csv("../0_data/Carros.csv")
)

# Quick sanity check: total row count followed by a five-row preview.
print(cars.count())
cars.show(5)

32
+-------+---------+-----------+---------------+----+-----+---------+-----------+-------+-----------+---+
|Consumo|Cilindros|Cilindradas|RelEixoTraseiro|Peso|Tempo|TipoMotor|Transmissao|Marchas|Carburadors| HP|
+-------+---------+-----------+---------------+----+-----+---------+-----------+-------+-----------+---+
|     21|        6|        160|             39| 262| 1646|        0|          1|      4|          4|110|
|     21|        6|        160|             39|2875| 1702|        0|          1|      4|          4|110|
|    228|        4|        108|            385| 232| 1861|        1|          1|      4|          1| 93|
|    214|        6|        258|            308|3215| 1944|        1|          0|      3|          1|110|
|    187|        8|        360|            315| 344| 1702|        0|          0|      3|          2|175|
+-------+---------+-----------+---------------+----+-----+---------+-----------+-------+-----------+---+
only showing top 5 rows



In [4]:
# Describe the regression as an R-style formula: HP is the response and
# Consumo, Cilindros and Cilindradas are the predictors. The transformer
# appends an "independant" feature-vector column and a "dependant" label
# column to the input frame.
r_formula = RFormula(
    formula="HP ~ Consumo + Cilindros + Cilindradas",
    featuresCol="independant",
    labelCol="dependant",
)

# Fit and transform as two explicit steps so the fitted formula model stays
# available for inspection.
formula_model = r_formula.fit(cars)
rf = formula_model.transform(cars)

rf.show(truncate=False)

+-------+---------+-----------+---------------+----+-----+---------+-----------+-------+-----------+---+------------------+---------+
|Consumo|Cilindros|Cilindradas|RelEixoTraseiro|Peso|Tempo|TipoMotor|Transmissao|Marchas|Carburadors|HP |independant       |dependant|
+-------+---------+-----------+---------------+----+-----+---------+-----------+-------+-----------+---+------------------+---------+
|21     |6        |160        |39             |262 |1646 |0        |1          |4      |4          |110|[21.0,6.0,160.0]  |110.0    |
|21     |6        |160        |39             |2875|1702 |0        |1          |4      |4          |110|[21.0,6.0,160.0]  |110.0    |
|228    |4        |108        |385            |232 |1861 |1        |1          |4      |1          |93 |[228.0,4.0,108.0] |93.0     |
|214    |6        |258        |308            |3215|1944 |1        |0          |3      |1          |110|[214.0,6.0,258.0] |110.0    |
|187    |8        |360        |315            |344 |1702 |0   

In [5]:
# Fix the seed so the train/test split — and therefore the fitted model,
# the predictions and the RMSE below — is the same on every
# Restart & Run All. Without a seed each run draws a different split.
SPLIT_SEED = 42

# The weights are target proportions, not exact sizes: on a 32-row frame the
# resulting counts only approximate 80/20.
cars_train, cars_test = rf.randomSplit([0.8, 0.2], seed=SPLIT_SEED)
print(cars_train.count(), cars_test.count())

27 5


In [6]:
# Hyperparameters for the generalized linear model, gathered in one place.
# A gaussian family with an identity link models the label directly as a
# linear combination of the features.
glr_params = dict(
    family="gaussian",
    link="identity",
    featuresCol="independant",
    labelCol="dependant",
    maxIter=100,
    regParam=0.08,
)
gen_lin = GeneralizedLinearRegression(**glr_params)

# Fit on the training partition only; the test partition is held out for
# evaluation.
model = gen_lin.fit(cars_train)

In [7]:
# Score the held-out rows; transform() adds a "prediction" column.
pred = model.transform(cars_test)

# Show the actual label next to the model's prediction for each test row.
comparison_cols = ["dependant", "prediction"]
pred.select(*comparison_cols).show(truncate=False)

+---------+------------------+
|dependant|prediction        |
+---------+------------------+
|335.0    |193.04227094886426|
|180.0    |183.42702058329397|
|264.0    |192.90766218164774|
|97.0     |77.51610204474034 |
|52.0     |79.30319716287289 |
+---------+------------------+



In [8]:
# Root-mean-squared error between the true label and the prediction on the
# held-out rows.
validate = RegressionEvaluator(
    labelCol="dependant",
    predictionCol="prediction",
    metricName="rmse",
)
rmse = validate.evaluate(pred)
print(f"RMSE: {rmse:.2f}")

RMSE: 72.59
