<a href="https://colab.research.google.com/github/cruz-marco/pyspark_course/blob/main/pyspark_MachineLearning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
#@title
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q https://dlcdn.apache.org/spark/spark-3.2.3/spark-3.2.3-bin-hadoop3.2.tgz
!tar xf spark-3.2.3-bin-hadoop3.2.tgz

import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64" 
os.environ["SPARK_HOME"] = '/content/spark-3.2.3-bin-hadoop3.2'

!pip install -q findspark

import findspark
findspark.init()
findspark.find()

from pyspark.sql import SparkSession
spark = SparkSession.builder.master("local[*]").getOrCreate()
sc = spark.sparkContext

# Machine Learning com Spark

- PySpark tem bibliotecas voltadas para o treinamento e avaliação de modelos de aprendizagem de máquina;

- As variáveis independentes devem ficar, todas juntas, em um único vetor;


> Importação das bibliotecas a serem usadas:

In [2]:
from pyspark.ml.regression import LinearRegression, RandomForestRegressor
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.feature import VectorAssembler
from pyspark.sql import functions as f

> Criação do DataFrame `carros`; a ideia deste mini-projeto é prever a potência (coluna `HP`) do carro.

In [3]:
# Load the cars dataset: semicolon-delimited CSV with a header row; column
# types are inferred by Spark.
# NOTE(review): the path is under /content/drive, but no Drive mount call is
# visible in this notebook - presumably Drive must be mounted first; confirm.
# NOTE(review): values in the output below (e.g. Peso 262 vs 2875) look like
# decimals that lost their separator in the file - verify the source CSV.
carros = (
    spark.read
    .format('csv')
    .options(sep=';', header=True, inferSchema=True)
    .load('/content/drive/MyDrive/Datasets/pyspark_course/Carros.csv')
)

carros.show()

+-------+---------+-----------+---------------+----+-----+---------+-----------+-------+-----------+---+
|Consumo|Cilindros|Cilindradas|RelEixoTraseiro|Peso|Tempo|TipoMotor|Transmissao|Marchas|Carburadors| HP|
+-------+---------+-----------+---------------+----+-----+---------+-----------+-------+-----------+---+
|     21|        6|        160|             39| 262| 1646|        0|          1|      4|          4|110|
|     21|        6|        160|             39|2875| 1702|        0|          1|      4|          4|110|
|    228|        4|        108|            385| 232| 1861|        1|          1|      4|          1| 93|
|    214|        6|        258|            308|3215| 1944|        1|          0|      3|          1|110|
|    187|        8|        360|            315| 344| 1702|        0|          0|      3|          2|175|
|    181|        6|        225|            276| 346| 2022|        1|          0|      3|          1|105|
|    143|        8|        360|            321| 357| 15

In [4]:
# Assemble every predictor column into the single vector column Spark ML expects.
# Features are selected by name rather than by position: the label ('HP') and
# the assembler's own output column are excluded explicitly, so re-running this
# cell after `carros` has gained 'vected_feats' cannot leak the label into the
# feature vector.
vect_feats = VectorAssembler(
    inputCols=[c for c in carros.columns if c not in ('HP', 'vected_feats')],
    outputCol='vected_feats',
)

In [5]:
# Append the assembled feature vector to the DataFrame.
# Because this cell rebinds `carros`, a second run would otherwise fail on the
# already-existing output column; dropping it first (a no-op when the column is
# absent) keeps the cell safe to re-run.
carros = vect_feats.transform(carros.drop(vect_feats.getOutputCol()))
carros.show()

+-------+---------+-----------+---------------+----+-----+---------+-----------+-------+-----------+---+--------------------+
|Consumo|Cilindros|Cilindradas|RelEixoTraseiro|Peso|Tempo|TipoMotor|Transmissao|Marchas|Carburadors| HP|        vected_feats|
+-------+---------+-----------+---------------+----+-----+---------+-----------+-------+-----------+---+--------------------+
|     21|        6|        160|             39| 262| 1646|        0|          1|      4|          4|110|[21.0,6.0,160.0,3...|
|     21|        6|        160|             39|2875| 1702|        0|          1|      4|          4|110|[21.0,6.0,160.0,3...|
|    228|        4|        108|            385| 232| 1861|        1|          1|      4|          1| 93|[228.0,4.0,108.0,...|
|    214|        6|        258|            308|3215| 1944|        1|          0|      3|          1|110|[214.0,6.0,258.0,...|
|    187|        8|        360|            315| 344| 1702|        0|          0|      3|          2|175|[187.0,8.0,360

In [6]:
# 70/30 train/test split. The seed is fixed so the split - and therefore every
# metric reported further down - is repeatable for the same input data instead
# of changing on each run.
carrosTreino, carrosTeste = carros.randomSplit([0.7, 0.3], seed=42)

# Row counts of each split, as a quick sanity check on the proportions.
display(
    carrosTreino.count(),
    carrosTeste.count()
)

22

10

In [8]:
# Linear regression: features come from the assembled 'vected_feats' vector,
# the target is the 'HP' column. Fitted on the training split only.
reglin = LinearRegression(
    labelCol='HP',
    featuresCol='vected_feats',
)
model_RL = reglin.fit(carrosTreino)

In [9]:
# Score the held-out split with the fitted linear model; the result carries an
# extra 'prediction' column next to the original ones.
carrosPred = model_RL.transform(carrosTeste)
# Preview with the defaults spelled out: first 20 rows, long cells truncated.
carrosPred.show(n=20, truncate=True)

+-------+---------+-----------+---------------+----+-----+---------+-----------+-------+-----------+---+--------------------+------------------+
|Consumo|Cilindros|Cilindradas|RelEixoTraseiro|Peso|Tempo|TipoMotor|Transmissao|Marchas|Carburadors| HP|        vected_feats|        prediction|
+-------+---------+-----------+---------------+----+-----+---------+-----------+-------+-----------+---+--------------------+------------------+
|    104|        8|        460|              3|5424| 1782|        0|          0|      3|          4|215|[104.0,8.0,460.0,...| 173.5649974544997|
|    133|        8|        350|            373| 384| 1541|        0|          0|      3|          4|245|[133.0,8.0,350.0,...|240.80817059157295|
|    152|        8|        304|            315|3435|  173|        0|          0|      3|          2|150|[152.0,8.0,304.0,...|151.90544420558822|
|    152|        8|       2758|            307| 378|   18|        0|          0|      3|          3|180|[152.0,8.0,2758.0...|177.6

In [13]:
# Shared evaluator: root-mean-squared error between 'prediction' and the 'HP'
# label. It is reused later for the random-forest predictions.
evaluate = RegressionEvaluator(
    metricName='rmse',
    labelCol='HP',
    predictionCol='prediction',
)

# RMSE of the linear regression on the test split.
rmse_RL = evaluate.evaluate(dataset=carrosPred)
rmse_RL

56.514656844206385

In [14]:
# Random-forest regressor on the same features/label as the linear model.
# Forest training is randomized, so the seed is pinned to keep the fitted model
# and its RMSE repeatable from run to run.
rfreg = RandomForestRegressor(featuresCol='vected_feats', labelCol='HP', seed=42)
model_RF = rfreg.fit(carrosTreino)

In [16]:
# Score the held-out split with the fitted random forest; adds a 'prediction'
# column alongside the original ones.
carrosPredRF = model_RF.transform(carrosTeste)
# Preview with the defaults spelled out: first 20 rows, long cells truncated.
carrosPredRF.show(n=20, truncate=True)

+-------+---------+-----------+---------------+----+-----+---------+-----------+-------+-----------+---+--------------------+------------------+
|Consumo|Cilindros|Cilindradas|RelEixoTraseiro|Peso|Tempo|TipoMotor|Transmissao|Marchas|Carburadors| HP|        vected_feats|        prediction|
+-------+---------+-----------+---------------+----+-----+---------+-----------+-------+-----------+---+--------------------+------------------+
|    104|        8|        460|              3|5424| 1782|        0|          0|      3|          4|215|[104.0,8.0,460.0,...|            198.25|
|    133|        8|        350|            373| 384| 1541|        0|          0|      3|          4|245|[133.0,8.0,350.0,...|205.91666666666669|
|    152|        8|        304|            315|3435|  173|        0|          0|      3|          2|150|[152.0,8.0,304.0,...|             184.5|
|    152|        8|       2758|            307| 378|   18|        0|          0|      3|          3|180|[152.0,8.0,2758.0...|200.1

In [17]:
# RMSE of the random forest on the test split, computed with the same
# evaluator instance used for the linear regression so the numbers compare.
rmse_RF = evaluate.evaluate(dataset=carrosPredRF)
rmse_RF

40.36708314924993