# Polynomial Expansion

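Vector assembly is the process of combining multiple input columns into a single vector column.
It is commonly used in feature engineering tasks in machine learning pipelines.
In this notebook the assembled vector is the input to `PolynomialExpansion`, which expands a feature vector into all polynomial combinations of its components up to a given degree (for degree 2, `[x, y]` becomes `[x, x², y, x·y, y²]`).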

In [1]:
# findspark.init() locates the local Spark installation and adds its Python
# libraries to sys.path, so it has to run BEFORE anything from pyspark is
# imported; importing pyspark first only works where it is already importable.
import findspark
findspark.init()

import pyspark
from pyspark.sql import SparkSession

# getOrCreate() returns the already-running session if there is one, so
# re-executing this cell does not start a second Spark application.
spark = SparkSession.builder.appName("polynomialexpansion").getOrCreate()

24/04/02 16:40:44 WARN Utils: Your hostname, pop-os resolves to a loopback address: 127.0.1.1; using 192.168.0.108 instead (on interface wlo1)
24/04/02 16:40:44 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/04/02 16:40:45 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [5]:
from pyspark.ml.feature import PolynomialExpansion, VectorAssembler

In [4]:
# Read the semicolon-delimited cars CSV: the first row supplies the column
# names and Spark infers each column's type from the data.
cars = (
    spark.read
    .options(header=True, inferSchema=True, sep=";")
    .csv("../0_data/Carros.csv")
)
# Preview the first five rows.
cars.show(5)

+-------+---------+-----------+---------------+----+-----+---------+-----------+-------+-----------+---+
|Consumo|Cilindros|Cilindradas|RelEixoTraseiro|Peso|Tempo|TipoMotor|Transmissao|Marchas|Carburadors| HP|
+-------+---------+-----------+---------------+----+-----+---------+-----------+-------+-----------+---+
|     21|        6|        160|             39| 262| 1646|        0|          1|      4|          4|110|
|     21|        6|        160|             39|2875| 1702|        0|          1|      4|          4|110|
|    228|        4|        108|            385| 232| 1861|        1|          1|      4|          1| 93|
|    214|        6|        258|            308|3215| 1944|        1|          0|      3|          1|110|
|    187|        8|        360|            315| 344| 1702|        0|          0|      3|          2|175|
+-------+---------+-----------+---------------+----+-----+---------+-----------+-------+-----------+---+
only showing top 5 rows



In [8]:
# Columns to be packed, in this order, into the single vector column "vec".
feature_cols = ["Consumo", "Cilindros", "Cilindradas"]

vecasembler = VectorAssembler(inputCols=feature_cols, outputCol="vec")
# transform() returns a new DataFrame with "vec" appended; `cars` is untouched.
cars_vect = vecasembler.transform(cars)
cars_vect.show(5)

+-------+---------+-----------+---------------+----+-----+---------+-----------+-------+-----------+---+-----------------+
|Consumo|Cilindros|Cilindradas|RelEixoTraseiro|Peso|Tempo|TipoMotor|Transmissao|Marchas|Carburadors| HP|              vec|
+-------+---------+-----------+---------------+----+-----+---------+-----------+-------+-----------+---+-----------------+
|     21|        6|        160|             39| 262| 1646|        0|          1|      4|          4|110| [21.0,6.0,160.0]|
|     21|        6|        160|             39|2875| 1702|        0|          1|      4|          4|110| [21.0,6.0,160.0]|
|    228|        4|        108|            385| 232| 1861|        1|          1|      4|          1| 93|[228.0,4.0,108.0]|
|    214|        6|        258|            308|3215| 1944|        1|          0|      3|          1|110|[214.0,6.0,258.0]|
|    187|        8|        360|            315| 344| 1702|        0|          0|      3|          2|175|[187.0,8.0,360.0]|
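+-------+---------+-----------+---------------+----+-----+---------+-----------+-------+-----------+---+-----------------+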

In [11]:
# Degree-2 polynomial expansion of "vec": the result column "vec_poly" holds
# the original components plus their squares and pairwise products.
pe = PolynomialExpansion(inputCol="vec", outputCol="vec_poly", degree=2)
cars_poly = pe.transform(cars_vect)

# Show only the source columns next to the assembled and expanded vectors,
# without truncating the long vector cells.
preview_cols = ["Consumo", "Cilindros", "Cilindradas", "vec", "vec_poly"]
cars_poly.select(*preview_cols).show(n=5, truncate=False)

+-------+---------+-----------+-----------------+-------------------------------------------------------------+
|Consumo|Cilindros|Cilindradas|vec              |vec_poly                                                     |
+-------+---------+-----------+-----------------+-------------------------------------------------------------+
|21     |6        |160        |[21.0,6.0,160.0] |[21.0,441.0,6.0,126.0,36.0,160.0,3360.0,960.0,25600.0]       |
|21     |6        |160        |[21.0,6.0,160.0] |[21.0,441.0,6.0,126.0,36.0,160.0,3360.0,960.0,25600.0]       |
|228    |4        |108        |[228.0,4.0,108.0]|[228.0,51984.0,4.0,912.0,16.0,108.0,24624.0,432.0,11664.0]   |
|214    |6        |258        |[214.0,6.0,258.0]|[214.0,45796.0,6.0,1284.0,36.0,258.0,55212.0,1548.0,66564.0] |
|187    |8        |360        |[187.0,8.0,360.0]|[187.0,34969.0,8.0,1496.0,64.0,360.0,67320.0,2880.0,129600.0]|
+-------+---------+-----------+-----------------+-------------------------------------------------------