# Installing required packages

In [None]:
from IPython.display import clear_output

!pip install --upgrade pip
!pip install findspark
!pip install pyspark

clear_output(wait=False)

# Importing global objects

In [None]:
import findspark, pyspark
from pyspark.sql import SparkSession
from pyspark import SparkFiles

# Global Settings
Required only when running outside Databricks.

In [None]:
# Initialise findspark first, then obtain the session used by every
# following cell (an already-running session is reused, otherwise one is created).
findspark.init()
spark = (
    SparkSession
    .builder
    .getOrCreate()
)

# Reading data source

In [None]:
# Register the remote CSV with the Spark context, then resolve the path
# under which Spark made the file available.
url = 'https://raw.githubusercontent.com/edsonlourenco/public_datasets/main/Carros.csv'
spark.sparkContext.addFile(url)
csv_cars = SparkFiles.get("Carros.csv")

# The file has a header row and uses ';' as field separator; column types
# are inferred from the data rather than declared.
df_cars = (
    spark.read
    .options(header=True, inferSchema=True, sep=';')
    .csv(csv_cars)
)

# Checking **data**

In [None]:
# Preview the loaded rows ordered by 'Consumo', without truncating cell values.
(
    df_cars
    .sort('Consumo')
    .show(truncate=False)
)

+-------+---------+-----------+---------------+----+-----+---------+-----------+-------+-----------+---+
|Consumo|Cilindros|Cilindradas|RelEixoTraseiro|Peso|Tempo|TipoMotor|Transmissao|Marchas|Carburadors|HP |
+-------+---------+-----------+---------------+----+-----+---------+-----------+-------+-----------+---+
|15     |8        |301        |354            |357 |146  |0        |1          |5      |8          |335|
|21     |6        |160        |39             |2875|1702 |0        |1          |4      |4          |110|
|21     |6        |160        |39             |262 |1646 |0        |1          |4      |4          |110|
|26     |4        |1203       |443            |214 |167  |0        |1          |5      |2          |91 |
|104    |8        |472        |293            |525 |1798 |0        |0          |3      |4          |205|
|104    |8        |460        |3              |5424|1782 |0        |0          |3      |4          |215|
|133    |8        |350        |373            |384 |154

## Transform VectorAssembler

### Importing **VectorAssembler** class

In [None]:
from pyspark.ml.feature import VectorAssembler

### Applying the transformation and creating the `features` column

In [None]:
# Pack the numeric columns below into a single vector column named
# "features". Of the columns shown in the preview, only 'HP' is left out.
vectas = VectorAssembler(
    inputCols=[
        "Consumo", "Cilindros", "Cilindradas", "RelEixoTraseiro", "Peso",
        "Tempo", "TipoMotor", "Transmissao", "Marchas", "Carburadors",
    ],
    outputCol="features",
)

# transform() returns a new DataFrame with the extra "features" column.
df_cars_vet = vectas.transform(df_cars)

# Inspect only the assembled vectors, ordered by 'Consumo'.
(
    df_cars_vet
    .sort('Consumo')
    .select('features')
    .show(truncate=False)
)

+-----------------------------------------------------+
|features                                             |
+-----------------------------------------------------+
|[15.0,8.0,301.0,354.0,357.0,146.0,0.0,1.0,5.0,8.0]   |
|[21.0,6.0,160.0,39.0,2875.0,1702.0,0.0,1.0,4.0,4.0]  |
|[21.0,6.0,160.0,39.0,262.0,1646.0,0.0,1.0,4.0,4.0]   |
|[26.0,4.0,1203.0,443.0,214.0,167.0,0.0,1.0,5.0,2.0]  |
|[104.0,8.0,472.0,293.0,525.0,1798.0,0.0,0.0,3.0,4.0] |
|[104.0,8.0,460.0,3.0,5424.0,1782.0,0.0,0.0,3.0,4.0]  |
|[133.0,8.0,350.0,373.0,384.0,1541.0,0.0,0.0,3.0,4.0] |
|[143.0,8.0,360.0,321.0,357.0,1584.0,0.0,0.0,3.0,4.0] |
|[147.0,8.0,440.0,323.0,5345.0,1742.0,0.0,0.0,3.0,4.0]|
|[152.0,8.0,2758.0,307.0,378.0,18.0,0.0,0.0,3.0,3.0]  |
|[152.0,8.0,304.0,315.0,3435.0,173.0,0.0,0.0,3.0,2.0] |
|[155.0,8.0,318.0,276.0,352.0,1687.0,0.0,0.0,3.0,2.0] |
|[158.0,8.0,351.0,422.0,317.0,145.0,0.0,1.0,5.0,4.0]  |
|[164.0,8.0,2758.0,307.0,407.0,174.0,0.0,0.0,3.0,3.0] |
|[173.0,8.0,2758.0,307.0,373.0,176.0,0.0,0.0,3.0

### Importing **PCA** class

In [None]:
from pyspark.ml.feature import PCA

In [None]:
# Configure a PCA estimator that reduces the "features" vector to its
# first 3 principal components, written to "features_pca".
# NOTE(review): the features are passed in unscaled; columns with large
# magnitudes may dominate the components — confirm this is intended.
pca = PCA(
    k=3,
    inputCol="features",
    outputCol="features_pca",
)

# Fit the estimator on the assembled DataFrame to obtain the PCA model.
model = pca.fit(df_cars_vet)

### Applying the PCA transformation

In [None]:
# Apply the fitted PCA model; the output keeps the input columns and adds
# the "features_pca" column configured on the estimator.
result = model.transform(dataset=df_cars_vet)

### Checking **data**

In [None]:
# Display only the projected vectors, without truncating their values.
(
    result
    .select('features_pca')
    .show(truncate=False)
)

+-----------------------------------------------------------+
|features_pca                                               |
+-----------------------------------------------------------+
|[618.7707206779614,-937.712394997354,1231.9633529945509]   |
|[3112.9887675342206,-161.05746385491523,1191.861991305438] |
|[640.4959007710695,-1120.7188865110418,1320.0756315189049] |
|[3466.0956877556678,-149.69421418298342,1401.2041780368531]|
|[661.4577445758732,-812.4592128844115,1395.2949328316356]  |
|[769.234367178774,-1120.4160559477316,1518.7436632279525]  |
|[644.8369503533214,-727.9539376169615,1313.681521097935]   |
|[9.101880661709801,1061.295403667789,1045.1710500215693]   |
|[67.13360966508397,878.479368204501,1143.9379120496164]    |
|[31.390504477140617,1095.3694498285743,1306.0124861906327] |
|[32.89165922208959,1091.1521230845228,1310.0881577350906]  |
|[-118.37273751675397,1832.771927405815,2088.6955393326043] |
|[-150.18148405358022,1820.8730926512776,2091.1033550766124]|
|[-184.0