# PySpark Binarizer: example implementation


# Installing required packages
Only needed when running outside Databricks.

In [2]:
from IPython.display import clear_output

!pip install --upgrade pip
!pip install findspark
!pip install pyspark

clear_output(wait=False)

# Importing objects

In [16]:
import findspark, pyspark
from pyspark.sql import SparkSession
from pyspark import SparkFiles

# Global Settings
Only needed when running outside Databricks.

In [17]:
# Initialise findspark first so the Spark installation is set up for this kernel
# before a session is requested.
findspark.init()

# Reuse the active SparkSession when there is one; otherwise create it with
# the default configuration.
spark = (
    SparkSession
    .builder
    .getOrCreate()
)

# Reading data source

In [18]:
# Iris dataset, published as a raw CSV file on GitHub.
url = 'https://raw.githubusercontent.com/edsonlourenco/public_datasets/main/iris.csv'

# Register the remote file with Spark, then resolve the path of the downloaded copy.
spark.sparkContext.addFile(url)
csv_iris = SparkFiles.get("iris.csv")

# NOTE(review): `SparkFiles.get` presumably returns a path local to the driver;
# on a multi-node cluster or a non-local default filesystem this read may need
# a `file://` prefix -- confirm for the target environment.
# The first line holds the column names, and column types are inferred from the data.
df_iris = (
    spark.read
    .options(header=True, inferSchema=True, sep=',')
    .csv(csv_iris)
)

### Checking the **data**

In [19]:
# Preview the first 20 rows (the default for `show`) without truncating cell contents.
df_iris.show(n=20, truncate=False)

+-----------+----------+-----------+----------+-----------+
|sepallength|sepalwidth|petallength|petalwidth|class      |
+-----------+----------+-----------+----------+-----------+
|5.1        |3.5       |1.4        |0.2       |Iris-setosa|
|4.9        |3.0       |1.4        |0.2       |Iris-setosa|
|4.7        |3.2       |1.3        |0.2       |Iris-setosa|
|4.6        |3.1       |1.5        |0.2       |Iris-setosa|
|5.0        |3.6       |1.4        |0.2       |Iris-setosa|
|5.4        |3.9       |1.7        |0.4       |Iris-setosa|
|4.6        |3.4       |1.4        |0.3       |Iris-setosa|
|5.0        |3.4       |1.5        |0.2       |Iris-setosa|
|4.4        |2.9       |1.4        |0.2       |Iris-setosa|
|4.9        |3.1       |1.5        |0.1       |Iris-setosa|
|5.4        |3.7       |1.5        |0.2       |Iris-setosa|
|4.8        |3.4       |1.6        |0.2       |Iris-setosa|
|4.8        |3.0       |1.4        |0.1       |Iris-setosa|
|4.3        |3.0       |1.1        |0.1 

## Transform Binarizer

### Importing **Binarizer** class

In [20]:
from pyspark.ml.feature import Binarizer

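### Applying binarization

`Binarizer` maps each `sepallength` value to `1.0` when it is strictly greater than the threshold and to `0.0` otherwise (so a value equal to the threshold becomes `0.0`).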






In [21]:
# Configure the transformer: read `sepallength`, compare it against the
# threshold of 5, and write the 0.0/1.0 result to a new `sepallength_bin` column.
binarizer = Binarizer(
    inputCol="sepallength",
    outputCol="sepallength_bin",
    threshold=5,
)

In [22]:
# Apply the binarizer; the result is a new DataFrame with the original columns
# plus `sepallength_bin`, so `df_iris` can still be used afterwards.
df_iris_bin = binarizer.transform(df_iris)

# Preview the first 20 rows (the default for `show`) without truncating cell contents.
df_iris_bin.show(n=20, truncate=False)

+-----------+----------+-----------+----------+-----------+---------------+
|sepallength|sepalwidth|petallength|petalwidth|class      |sepallength_bin|
+-----------+----------+-----------+----------+-----------+---------------+
|5.1        |3.5       |1.4        |0.2       |Iris-setosa|1.0            |
|4.9        |3.0       |1.4        |0.2       |Iris-setosa|0.0            |
|4.7        |3.2       |1.3        |0.2       |Iris-setosa|0.0            |
|4.6        |3.1       |1.5        |0.2       |Iris-setosa|0.0            |
|5.0        |3.6       |1.4        |0.2       |Iris-setosa|0.0            |
|5.4        |3.9       |1.7        |0.4       |Iris-setosa|1.0            |
|4.6        |3.4       |1.4        |0.3       |Iris-setosa|0.0            |
|5.0        |3.4       |1.5        |0.2       |Iris-setosa|0.0            |
|4.4        |2.9       |1.4        |0.2       |Iris-setosa|0.0            |
|4.9        |3.1       |1.5        |0.1       |Iris-setosa|0.0            |
|5.4        