In [2]:
from pyspark.ml.regression import LinearRegression, RandomForestRegressor
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.feature import VectorAssembler
from pyspark.sql import SparkSession


In [3]:
spark = SparkSession.builder.master("local").\
    appName("CNPJ").\
        config("spark.executer.memory","1gb").\
            getOrCreate()

In [4]:
carros_temp = spark.read.csv("D:\downloads\Carros.csv",inferSchema=True, header=True, sep=";")

In [5]:
carros_temp.show(5)

+-------+---------+-----------+---------------+----+-----+---------+-----------+-------+-----------+---+
|Consumo|Cilindros|Cilindradas|RelEixoTraseiro|Peso|Tempo|TipoMotor|Transmissao|Marchas|Carburadors| HP|
+-------+---------+-----------+---------------+----+-----+---------+-----------+-------+-----------+---+
|     21|        6|        160|             39| 262| 1646|        0|          1|      4|          4|110|
|     21|        6|        160|             39|2875| 1702|        0|          1|      4|          4|110|
|    228|        4|        108|            385| 232| 1861|        1|          1|      4|          1| 93|
|    214|        6|        258|            308|3215| 1944|        1|          0|      3|          1|110|
|    187|        8|        360|            315| 344| 1702|        0|          0|      3|          2|175|
+-------+---------+-----------+---------------+----+-----+---------+-----------+-------+-----------+---+
only showing top 5 rows



In [7]:
carros = carros_temp.select("Consumo","Cilindros","Cilindradas","HP")

In [8]:
carros.show(5)

+-------+---------+-----------+---+
|Consumo|Cilindros|Cilindradas| HP|
+-------+---------+-----------+---+
|     21|        6|        160|110|
|     21|        6|        160|110|
|    228|        4|        108| 93|
|    214|        6|        258|110|
|    187|        8|        360|175|
+-------+---------+-----------+---+
only showing top 5 rows



In [9]:
#vetor de caracteristicas
veccaracteristicas = VectorAssembler(inputCols=[("Consumo"),("Cilindros"),("Cilindradas")],outputCol="caracteristicas")

In [10]:
carros = veccaracteristicas.transform(carros)

In [11]:
carros.show(5)

+-------+---------+-----------+---+-----------------+
|Consumo|Cilindros|Cilindradas| HP|  caracteristicas|
+-------+---------+-----------+---+-----------------+
|     21|        6|        160|110| [21.0,6.0,160.0]|
|     21|        6|        160|110| [21.0,6.0,160.0]|
|    228|        4|        108| 93|[228.0,4.0,108.0]|
|    214|        6|        258|110|[214.0,6.0,258.0]|
|    187|        8|        360|175|[187.0,8.0,360.0]|
+-------+---------+-----------+---+-----------------+
only showing top 5 rows



In [12]:
carrosTreino, carrosTeste = carros.randomSplit([0.7,0.3])

In [13]:
carrosTreino.count()

19

In [14]:
carrosTeste.count()

13

### Criando um modelo de Regressão

In [15]:
reglin = LinearRegression(featuresCol="caracteristicas",labelCol="HP")

In [16]:
modelo = reglin.fit(carrosTreino)

In [17]:
previsao = modelo.transform(carrosTeste)

In [18]:
previsao.show()

+-------+---------+-----------+---+------------------+------------------+
|Consumo|Cilindros|Cilindradas| HP|   caracteristicas|        prediction|
+-------+---------+-----------+---+------------------+------------------+
|     21|        6|        160|110|  [21.0,6.0,160.0]|182.18810406624036|
|     26|        4|       1203| 91| [26.0,4.0,1203.0]|111.76123706928254|
|    104|        8|        472|205| [104.0,8.0,472.0]|221.92930590408602|
|    133|        8|        350|245| [133.0,8.0,350.0]|218.40456244195238|
|    143|        8|        360|245| [143.0,8.0,360.0]|216.58581298550556|
|    152|        8|        304|150| [152.0,8.0,304.0]|215.70209044491227|
|    155|        8|        318|150| [155.0,8.0,318.0]| 215.0290091207121|
|    164|        8|       2758|180|[164.0,8.0,2758.0]|185.22425092409662|
|    192|        6|       1676|123|[192.0,6.0,1676.0]| 135.5030360543686|
|    215|        4|       1201| 97|[215.0,4.0,1201.0]|  79.5999804395124|
|    273|        4|         79| 66|  [

In [20]:
#avaliando a performance do modelo de regressão linear
avaliar = RegressionEvaluator(predictionCol="prediction", labelCol="HP", metricName="rmse")


In [22]:
rmse = avaliar.evaluate(previsao)

In [23]:
rmse

36.23121558487021

### Criando um modelo com Random Forest

In [24]:
rfreg = RandomForestRegressor(featuresCol="caracteristicas",labelCol="HP")

In [25]:
modelo2 = rfreg.fit(carrosTreino)

In [26]:
previsao2 = modelo2.transform(carrosTeste)

In [27]:
previsao2.show()

+-------+---------+-----------+---+------------------+------------------+
|Consumo|Cilindros|Cilindradas| HP|   caracteristicas|        prediction|
+-------+---------+-----------+---+------------------+------------------+
|     21|        6|        160|110|  [21.0,6.0,160.0]|124.25791666666666|
|     26|        4|       1203| 91| [26.0,4.0,1203.0]|148.68535714285713|
|    104|        8|        472|205| [104.0,8.0,472.0]|218.52190323565318|
|    133|        8|        350|245| [133.0,8.0,350.0]|233.54392704517704|
|    143|        8|        360|245| [143.0,8.0,360.0]| 223.1094032356532|
|    152|        8|        304|150| [152.0,8.0,304.0]| 238.7105937118437|
|    155|        8|        318|150| [155.0,8.0,318.0]| 238.7105937118437|
|    164|        8|       2758|180|[164.0,8.0,2758.0]| 212.9826175213675|
|    192|        6|       1676|123|[192.0,6.0,1676.0]|132.92410714285717|
|    215|        4|       1201| 97|[215.0,4.0,1201.0]|           92.8575|
|    273|        4|         79| 66|  [

In [28]:
rmse2 = avaliar.evaluate(previsao2)

In [29]:
rmse2

42.80772739029237