In [1]:
from pyspark.ml.regression import LinearRegression
from pyspark.ml.feature import VectorAssembler
from pyspark.sql import SparkSession

In [2]:
# Build (or reuse) a local SparkSession for this notebook.
# The property name is "spark.executor.memory"; the previous spelling
# "spark.executer.memory" is not a Spark property, so the setting had no effect.
spark = (
    SparkSession.builder
    .master("local")
    .appName("CNPJ")
    .config("spark.executor.memory", "1gb")
    .getOrCreate()
)

In [3]:
# Load the cars CSV (semicolon-separated, header row, schema inferred by Spark).
# A raw string keeps the Windows backslashes literal: "\d" and "\C" are invalid
# escape sequences in a normal Python string and raise a warning on newer Pythons.
# NOTE(review): the absolute local path ties the notebook to one machine — consider a configurable data directory.
# NOTE(review): values printed by carros.show() (e.g. Consumo 21 next to 228) look like they may have lost a decimal separator — verify the CSV.
carros_temp = spark.read.csv(r"D:\downloads\Carros.csv", inferSchema=True, header=True, sep=";")

In [4]:
# Narrow the raw frame to the four columns used by the regression cells:
# Consumo, Cilindros and Cilindradas feed the feature vector, HP is the label.
carros = carros_temp.select(
    "Consumo",
    "Cilindros",
    "Cilindradas",
    "HP",
)

In [5]:
# Preview the selected columns (first 20 rows, long cell values truncated — the show() defaults).
carros.show(n=20, truncate=True)

+-------+---------+-----------+---+
|Consumo|Cilindros|Cilindradas| HP|
+-------+---------+-----------+---+
|     21|        6|        160|110|
|     21|        6|        160|110|
|    228|        4|        108| 93|
|    214|        6|        258|110|
|    187|        8|        360|175|
|    181|        6|        225|105|
|    143|        8|        360|245|
|    244|        4|       1467| 62|
|    228|        4|       1408| 95|
|    192|        6|       1676|123|
|    178|        6|       1676|123|
|    164|        8|       2758|180|
|    173|        8|       2758|180|
|    152|        8|       2758|180|
|    104|        8|        472|205|
|    104|        8|        460|215|
|    147|        8|        440|230|
|    324|        4|        787| 66|
|    304|        4|        757| 52|
|    339|        4|        711| 65|
+-------+---------+-----------+---+
only showing top 20 rows



In [6]:
# Assembler that packs the three predictor columns into a single vector
# column named "caracteristicas".
vecCaracteristias = VectorAssembler(
    inputCols=["Consumo", "Cilindros", "Cilindradas"],
    outputCol="caracteristicas",
)

In [7]:
# Apply the assembler: returns the frame with the "caracteristicas" vector column appended.
vecCarrosTreino = vecCaracteristias.transform(dataset=carros)

In [8]:
# Inspect the assembled feature vectors next to the source columns (show() defaults made explicit).
vecCarrosTreino.show(n=20, truncate=True)

+-------+---------+-----------+---+------------------+
|Consumo|Cilindros|Cilindradas| HP|   caracteristicas|
+-------+---------+-----------+---+------------------+
|     21|        6|        160|110|  [21.0,6.0,160.0]|
|     21|        6|        160|110|  [21.0,6.0,160.0]|
|    228|        4|        108| 93| [228.0,4.0,108.0]|
|    214|        6|        258|110| [214.0,6.0,258.0]|
|    187|        8|        360|175| [187.0,8.0,360.0]|
|    181|        6|        225|105| [181.0,6.0,225.0]|
|    143|        8|        360|245| [143.0,8.0,360.0]|
|    244|        4|       1467| 62|[244.0,4.0,1467.0]|
|    228|        4|       1408| 95|[228.0,4.0,1408.0]|
|    192|        6|       1676|123|[192.0,6.0,1676.0]|
|    178|        6|       1676|123|[178.0,6.0,1676.0]|
|    164|        8|       2758|180|[164.0,8.0,2758.0]|
|    173|        8|       2758|180|[173.0,8.0,2758.0]|
|    152|        8|       2758|180|[152.0,8.0,2758.0]|
|    104|        8|        472|205| [104.0,8.0,472.0]|
|    104| 

In [9]:
# Linear regression estimator: predicts HP from the assembled "caracteristicas" vector.
reglin = LinearRegression(
    featuresCol="caracteristicas",
    labelCol="HP",
)

In [10]:
# Fit the regression directly on the pre-assembled frame.
# NOTE(review): `modelo` is not referenced again in the visible cells — the pipeline below refits from scratch.
modelo = reglin.fit(dataset=vecCarrosTreino)

## Criando um Pipeline

In [11]:
from pyspark.ml import Pipeline

In [12]:
# Chain feature assembly and regression so both run as one estimator.
pipeline = Pipeline(
    stages=[
        vecCaracteristias,
        reglin,
    ]
)

In [13]:
# Fit every pipeline stage on the un-assembled frame; the assembler stage builds the vector column itself.
pipelineModel = pipeline.fit(dataset=carros)

In [14]:
# Score the same frame the pipeline was fitted on, so the "prediction" column is in-sample.
previsao = pipelineModel.transform(dataset=carros)

In [15]:
# Display the predictions alongside the actual HP values (show() defaults made explicit).
previsao.show(n=20, truncate=True)

+-------+---------+-----------+---+------------------+------------------+
|Consumo|Cilindros|Cilindradas| HP|   caracteristicas|        prediction|
+-------+---------+-----------+---+------------------+------------------+
|     21|        6|        160|110|  [21.0,6.0,160.0]|162.32154816816646|
|     21|        6|        160|110|  [21.0,6.0,160.0]|162.32154816816646|
|    228|        4|        108| 93| [228.0,4.0,108.0]| 82.51715587712931|
|    214|        6|        258|110| [214.0,6.0,258.0]|141.86680518718754|
|    187|        8|        360|175| [187.0,8.0,360.0]|202.93528239714834|
|    181|        6|        225|105| [181.0,6.0,225.0]| 145.4980634611832|
|    143|        8|        360|245| [143.0,8.0,360.0]|   207.41448530972|
|    244|        4|       1467| 62|[244.0,4.0,1467.0]| 69.69282676584851|
|    228|        4|       1408| 95|[228.0,4.0,1408.0]| 71.80767356085781|
|    192|        6|       1676|123|[192.0,6.0,1676.0]|132.42483285541724|
|    178|        6|       1676|123|[17