# Imputer

Imputation is the process of replacing missing values in a column with a substitute value, such as the column's mean, median, or mode. 
It is commonly used in data cleaning and feature engineering tasks in machine learning pipelines.

In [1]:
# findspark.init() must run BEFORE pyspark is imported: it is what makes the
# Spark installation importable. Importing pyspark first either fails outright
# (pyspark not on sys.path yet) or makes the init() call a no-op.
import findspark
findspark.init()

import pyspark
from pyspark.sql import SparkSession

# Reuse the active session if one exists, otherwise start a new one named "imputer".
spark = SparkSession.builder.appName("imputer").getOrCreate()

24/04/01 11:56:52 WARN Utils: Your hostname, pop-os resolves to a loopback address: 127.0.1.1; using 192.168.0.108 instead (on interface wlo1)
24/04/01 11:56:52 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/04/01 11:56:53 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [2]:
from pyspark.ml.feature import Imputer

In [3]:
# Load the cars dataset: semicolon-separated, first row holds the column names,
# and column types are inferred from the data.
cars = spark.read.csv(
    path="../0_data/CarrosNAN.csv",
    sep=";",
    header=True,
    inferSchema=True,
)

In [4]:
# Preview the loaded rows (these are show()'s default settings, spelled out).
cars.show(n=20, truncate=True)

+-------+---------+-----------+---------------+----+-----+---------+-----------+-------+-----------+---+
|Consumo|Cilindros|Cilindradas|RelEixoTraseiro|Peso|Tempo|TipoMotor|Transmissao|Marchas|Carburadors| HP|
+-------+---------+-----------+---------------+----+-----+---------+-----------+-------+-----------+---+
|     21|        6|        160|             39| 262| 1646|        0|          1|      4|          4|110|
|     21|        6|       NULL|             39|2875| NULL|        0|          1|      4|          4|110|
|    228|        0|        108|            385| 232| 1861|        1|          1|      4|          1| 93|
|    214|        0|       NULL|            308|3215| 1944|        1|          0|      3|          1|110|
|    187|        0|        360|            315|NULL| 1702|        0|          0|      3|          2|175|
|    181|        6|        225|            276| 346| NULL|        1|          0|      3|          1|105|
|    143|        8|        360|            321| 357| 15

In [5]:
# Configure an Imputer that fills the missing entries of "Cilindradas" and "Peso",
# writing the results to new "*_imputed" columns. No strategy is given, so the
# Imputer's default applies (mean, per the PySpark documentation).
imput = (
    Imputer()
    .setInputCols(["Cilindradas", "Peso"])
    .setOutputCols(["Cilindradas_imputed", "Peso_imputed"])
)

# fit() computes the per-column replacement values from the data.
model = imput.fit(cars)

# transform() appends the imputed columns; `cars` is rebound to the widened frame.
cars = model.transform(cars)

In [6]:
# Display only the two columns produced by the Imputer.
(
    cars
    .select("Cilindradas_imputed", "Peso_imputed")
    .show()
)

+-------------------+------------+
|Cilindradas_imputed|Peso_imputed|
+-------------------+------------+
|                160|         262|
|                848|        2875|
|                108|         232|
|                848|        3215|
|                360|        1318|
|                225|         346|
|                360|         357|
|               1467|         319|
|               1408|         315|
|               1676|         344|
|               1676|        1318|
|               2758|         407|
|               2758|         373|
|               2758|         378|
|                472|         525|
|                848|        5424|
|                440|        5345|
|                787|          22|
|                757|        1615|
|                711|        1835|
+-------------------+------------+
only showing top 20 rows



In [9]:
# Treat 0 in "Cilindros" as the missing-value marker and replace it with the
# column median, writing the result to "new_Cilindros".
imput = Imputer(
    inputCols=["Cilindros"],
    outputCols=["new_Cilindros"],
    strategy="median",
    missingValue=0,
)

# Fit, apply, and show the original column next to its imputed counterpart.
(
    imput.fit(cars)
    .transform(cars)
    .select("Cilindros", "new_Cilindros")
    .show()
)

+---------+-------------+
|Cilindros|new_Cilindros|
+---------+-------------+
|        6|            6|
|        6|            6|
|        0|            6|
|        0|            6|
|        0|            6|
|        6|            6|
|        8|            8|
|        4|            4|
|        4|            4|
|        0|            6|
|        6|            6|
|        8|            8|
|        8|            8|
|        8|            8|
|        8|            8|
|        8|            8|
|        8|            8|
|        4|            4|
|        4|            4|
|        4|            4|
+---------+-------------+
only showing top 20 rows

