# Univariate Feature Selector

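The Univariate Feature Selector is a feature selection technique in PySpark that selects the most relevant features based on their individual relationship with the target variable. It evaluates each feature independently of the others, scores it with a statistical test chosen from the declared feature and label types (chi-squared for categorical features with a categorical label, ANOVA F-test for continuous features with a categorical label, F-value regression test for continuous features with a continuous label), and keeps the features that show the strongest correlation or association with the target variable.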

In [1]:
# findspark.init() has to run BEFORE the first pyspark import: it locates the
# Spark installation and adds its Python libraries to sys.path. Importing
# pyspark first only works when pyspark already happens to be importable.
import findspark

findspark.init()

import pyspark
from pyspark.sql import SparkSession

# Reuse the active session if there is one; otherwise start a new one
# registered under this notebook's application name.
spark = SparkSession.builder.appName("univariatefeatureselector").getOrCreate()

In [None]:
from pyspark.ml.feature import RFormula, UnivariateFeatureSelector

In [None]:
# Load the cars dataset: semicolon-delimited, first row holds the column
# names, and column types are inferred from the data.
# NOTE(review): the preview shows magnitudes that differ by ~10x between
# neighbouring rows (e.g. Peso 262 vs 2875), which may mean decimal
# separators were lost in the source file -- confirm before modelling.
cars = (
    spark.read
    .options(header=True, inferSchema=True, sep=";")
    .csv("../0_data/Carros.csv")
)
cars.show(5)

+-------+---------+-----------+---------------+----+-----+---------+-----------+-------+-----------+---+
|Consumo|Cilindros|Cilindradas|RelEixoTraseiro|Peso|Tempo|TipoMotor|Transmissao|Marchas|Carburadors| HP|
+-------+---------+-----------+---------------+----+-----+---------+-----------+-------+-----------+---+
|     21|        6|        160|             39| 262| 1646|        0|          1|      4|          4|110|
|     21|        6|        160|             39|2875| 1702|        0|          1|      4|          4|110|
|    228|        4|        108|            385| 232| 1861|        1|          1|      4|          1| 93|
|    214|        6|        258|            308|3215| 1944|        1|          0|      3|          1|110|
|    187|        8|        360|            315| 344| 1702|        0|          0|      3|          2|175|
+-------+---------+-----------+---------------+----+-----+---------+-----------+-------+-----------+---+
only showing top 5 rows



In [None]:
# "HP ~ ." makes HP the label and every other column a predictor. RFormula
# assembles the predictors into one vector column ("independant") and writes
# the label to "dependant".
r_formula = RFormula(
    formula="HP ~ .",
    featuresCol="independant",
    labelCol="dependant",
)
rf_model = r_formula.fit(cars)
cars_rf = rf_model.transform(cars)

# Preview only the two columns RFormula added.
cars_rf.select("independant", "dependant").show(10, truncate=False)

+-----------------------------------------------------+---------+
|independant                                          |dependant|
+-----------------------------------------------------+---------+
|[21.0,6.0,160.0,39.0,262.0,1646.0,0.0,1.0,4.0,4.0]   |110.0    |
|[21.0,6.0,160.0,39.0,2875.0,1702.0,0.0,1.0,4.0,4.0]  |110.0    |
|[228.0,4.0,108.0,385.0,232.0,1861.0,1.0,1.0,4.0,1.0] |93.0     |
|[214.0,6.0,258.0,308.0,3215.0,1944.0,1.0,0.0,3.0,1.0]|110.0    |
|[187.0,8.0,360.0,315.0,344.0,1702.0,0.0,0.0,3.0,2.0] |175.0    |
|[181.0,6.0,225.0,276.0,346.0,2022.0,1.0,0.0,3.0,1.0] |105.0    |
|[143.0,8.0,360.0,321.0,357.0,1584.0,0.0,0.0,3.0,4.0] |245.0    |
|[244.0,4.0,1467.0,369.0,319.0,20.0,1.0,0.0,4.0,2.0]  |62.0     |
|[228.0,4.0,1408.0,392.0,315.0,229.0,1.0,0.0,4.0,2.0] |95.0     |
|[192.0,6.0,1676.0,392.0,344.0,183.0,1.0,0.0,4.0,4.0] |123.0    |
+-----------------------------------------------------+---------+
only showing top 10 rows



In [None]:
# Keep the 5 best-scoring features. Both the features and the label are
# declared continuous, so Spark scores each feature with the F-value
# regression test. The setters return the selector itself, so chaining them
# onto the constructor configures the same instance.
selector = (
    UnivariateFeatureSelector(
        featuresCol="independant",
        outputCol="selectedFeatures",
        labelCol="dependant",
        selectionMode="numTopFeatures",
    )
    .setFeatureType("continuous")
    .setLabelType("continuous")
    .setSelectionThreshold(5)
)
selector_model = selector.fit(cars_rf)
cars_univ = selector_model.transform(cars_rf)

# Show the reduced feature vectors.
cars_univ.select("selectedFeatures").show(10, truncate=False)

+-----------------------+
|selectedFeatures       |
+-----------------------+
|[21.0,6.0,0.0,1.0,4.0] |
|[21.0,6.0,0.0,1.0,4.0] |
|[228.0,4.0,1.0,1.0,1.0]|
|[214.0,6.0,1.0,0.0,1.0]|
|[187.0,8.0,0.0,0.0,2.0]|
|[181.0,6.0,1.0,0.0,1.0]|
|[143.0,8.0,0.0,0.0,4.0]|
|[244.0,4.0,1.0,0.0,2.0]|
|[228.0,4.0,1.0,0.0,2.0]|
|[192.0,6.0,1.0,0.0,4.0]|
+-----------------------+
only showing top 10 rows

