# MinMaxScaler

In [1]:
# findspark.init() has to run before anything from pyspark is imported:
# it locates the Spark installation and adds its Python libraries to
# sys.path, which is what makes the pyspark imports below resolvable.
import findspark
findspark.init()

import pyspark
from pyspark.sql import SparkSession

# Reuse an already-running session if there is one, otherwise start a new
# one registered under the application name "minmaxscaler".
spark = SparkSession.builder.appName("minmaxscaler").getOrCreate()

24/04/02 20:37:56 WARN Utils: Your hostname, pop-os resolves to a loopback address: 127.0.1.1; using 192.168.0.108 instead (on interface wlo1)
24/04/02 20:37:56 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/04/02 20:37:56 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
24/04/02 20:37:57 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [2]:
from pyspark.ml.feature import MinMaxScaler, VectorAssembler

In [3]:
# Read the semicolon-delimited cars dataset: the first line supplies the
# column names and Spark infers each column's type from the data.
# NOTE(review): in the preview, values such as Consumo 21 vs 228 and
# Peso 262 vs 2875 look like decimals whose separator was lost somewhere
# upstream -- confirm against the raw file before trusting the scaling.
cars = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .option("sep", ";")
    .csv("../0_data/Carros.csv")
)

# Quick look at the first five rows.
cars.show(n=5)

+-------+---------+-----------+---------------+----+-----+---------+-----------+-------+-----------+---+
|Consumo|Cilindros|Cilindradas|RelEixoTraseiro|Peso|Tempo|TipoMotor|Transmissao|Marchas|Carburadors| HP|
+-------+---------+-----------+---------------+----+-----+---------+-----------+-------+-----------+---+
|     21|        6|        160|             39| 262| 1646|        0|          1|      4|          4|110|
|     21|        6|        160|             39|2875| 1702|        0|          1|      4|          4|110|
|    228|        4|        108|            385| 232| 1861|        1|          1|      4|          1| 93|
|    214|        6|        258|            308|3215| 1944|        1|          0|      3|          1|110|
|    187|        8|        360|            315| 344| 1702|        0|          0|      3|          2|175|
+-------+---------+-----------+---------------+----+-----+---------+-----------+-------+-----------+---+
only showing top 5 rows



In [4]:
# Combine the three numeric feature columns into a single vector column
# named "vec"; the original columns are kept alongside it.
feature_cols = ["Consumo", "Cilindros", "Cilindradas"]
vecasembler = VectorAssembler(outputCol="vec", inputCols=feature_cols)
cars_vect = vecasembler.transform(cars)

# Show the source columns next to the assembled vector for the first 10 rows.
cars_vect.select(*feature_cols, "vec").show(n=10)

+-------+---------+-----------+------------------+
|Consumo|Cilindros|Cilindradas|               vec|
+-------+---------+-----------+------------------+
|     21|        6|        160|  [21.0,6.0,160.0]|
|     21|        6|        160|  [21.0,6.0,160.0]|
|    228|        4|        108| [228.0,4.0,108.0]|
|    214|        6|        258| [214.0,6.0,258.0]|
|    187|        8|        360| [187.0,8.0,360.0]|
|    181|        6|        225| [181.0,6.0,225.0]|
|    143|        8|        360| [143.0,8.0,360.0]|
|    244|        4|       1467|[244.0,4.0,1467.0]|
|    228|        4|       1408|[228.0,4.0,1408.0]|
|    192|        6|       1676|[192.0,6.0,1676.0]|
+-------+---------+-----------+------------------+
only showing top 10 rows



In [7]:
# Configure a scaler that reads the "vec" column and writes the rescaled
# vector to "minmaxscale", with 0 and 1 as the target lower/upper bounds.
min_max_scaler = MinMaxScaler(min=0, max=1, inputCol="vec", outputCol="minmaxscale")

# fit() produces a fitted model from cars_vect; transform() then appends
# the "minmaxscale" column to the same DataFrame.
model = min_max_scaler.fit(cars_vect)
cars_std = model.transform(cars_vect)

# Display raw features, the assembled vector and its scaled counterpart
# without truncating the long vector strings.
preview_cols = ["Consumo", "Cilindros", "Cilindradas", "vec", "minmaxscale"]
cars_std.select(*preview_cols).show(n=10, truncate=False)

+-------+---------+-----------+------------------+-----------------------------------------------+
|Consumo|Cilindros|Cilindradas|vec               |minmaxscale                                    |
+-------+---------+-----------+------------------+-----------------------------------------------+
|21     |6        |160        |[21.0,6.0,160.0]  |[0.018518518518518517,0.5,0.030235162374020158]|
|21     |6        |160        |[21.0,6.0,160.0]  |[0.018518518518518517,0.5,0.030235162374020158]|
|228    |4        |108        |[228.0,4.0,108.0] |[0.6574074074074073,0.0,0.010824934677118328]  |
|214    |6        |258        |[214.0,6.0,258.0] |[0.6141975308641975,0.5,0.06681597611048899]   |
|187    |8        |360        |[187.0,8.0,360.0] |[0.5308641975308641,1.0,0.10488988428518103]   |
|181    |6        |225        |[181.0,6.0,225.0] |[0.5123456790123456,0.5,0.05449794699514744]   |
|143    |8        |360        |[143.0,8.0,360.0] |[0.3950617283950617,1.0,0.10488988428518103]   |
|244    |4