# Installing required packages

In [14]:
from IPython.display import clear_output

!pip install --upgrade pip
!pip install findspark
!pip install pyspark

clear_output(wait=False)

# Importing global objects

In [17]:
import findspark, pyspark
from pyspark.sql import SparkSession
from pyspark import SparkFiles

# Global Settings
Needed only in environments other than Databricks (for example, a local Jupyter or Colab runtime).

In [18]:
# Let findspark locate the Spark installation before a session is requested.
findspark.init()

# Reuse the active SparkSession if there is one; otherwise create it.
spark = (
    SparkSession
    .builder
    .getOrCreate()
)

# Reading data source

In [27]:
# Remote location of the semicolon-delimited cars dataset.
url = "https://raw.githubusercontent.com/edsonlourenco/public_datasets/main/Carros.csv"

# Register the remote file with Spark, then look up the path of the fetched copy.
spark.sparkContext.addFile(url)
csv_cars = SparkFiles.get("Carros.csv")

# Load the CSV: the first row holds the column names and column types are inferred.
df_cars = (
    spark.read
    .options(header=True, inferSchema=True, sep=";")
    .csv(csv_cars)
)

# Checking **data**

In [37]:
# Preview the loaded rows sorted ascending by "Consumo", without truncating cell values.
(
    df_cars
    .orderBy("Consumo")
    .show(truncate=False)
)

+-------+---------+-----------+---------------+----+-----+---------+-----------+-------+-----------+---+
|Consumo|Cilindros|Cilindradas|RelEixoTraseiro|Peso|Tempo|TipoMotor|Transmissao|Marchas|Carburadors|HP |
+-------+---------+-----------+---------------+----+-----+---------+-----------+-------+-----------+---+
|15     |8        |301        |354            |357 |146  |0        |1          |5      |8          |335|
|21     |6        |160        |39             |2875|1702 |0        |1          |4      |4          |110|
|21     |6        |160        |39             |262 |1646 |0        |1          |4      |4          |110|
|26     |4        |1203       |443            |214 |167  |0        |1          |5      |2          |91 |
|104    |8        |472        |293            |525 |1798 |0        |0          |3      |4          |205|
|104    |8        |460        |3              |5424|1782 |0        |0          |3      |4          |215|
|133    |8        |350        |373            |384 |154

## Transforming with VectorAssembler

### Importing **VectorAssembler** class

In [32]:
from pyspark.ml.feature import VectorAssembler

### Applying the transformation and creating the `features` column

In [38]:
# Pack ten of the dataset's columns into a single vector column named "features".
# "HP" is the only column not included in the inputs.
vectas = VectorAssembler(
    inputCols=[
        "Consumo", "Cilindros", "Cilindradas", "RelEixoTraseiro", "Peso",
        "Tempo", "TipoMotor", "Transmissao", "Marchas", "Carburadors",
    ],
    outputCol="features",
)

# Apply the assembler; the result carries the extra "features" column.
df_cars_vet = vectas.transform(df_cars)

# Preview only the assembled vectors, ordered by "Consumo" (long values are truncated).
(
    df_cars_vet
    .orderBy("Consumo")
    .select("features")
    .show(truncate=True)
)

+--------------------+
|            features|
+--------------------+
|[15.0,8.0,301.0,3...|
|[21.0,6.0,160.0,3...|
|[21.0,6.0,160.0,3...|
|[26.0,4.0,1203.0,...|
|[104.0,8.0,472.0,...|
|[104.0,8.0,460.0,...|
|[133.0,8.0,350.0,...|
|[143.0,8.0,360.0,...|
|[147.0,8.0,440.0,...|
|[152.0,8.0,2758.0...|
|[152.0,8.0,304.0,...|
|[155.0,8.0,318.0,...|
|[158.0,8.0,351.0,...|
|[164.0,8.0,2758.0...|
|[173.0,8.0,2758.0...|
|[178.0,6.0,1676.0...|
|[181.0,6.0,225.0,...|
|[187.0,8.0,360.0,...|
|[192.0,6.0,1676.0...|
|[192.0,8.0,400.0,...|
+--------------------+
only showing top 20 rows

