# Implementation for Index To String example


# Installing required packages
Only needed in environments other than Databricks.

In [2]:
from IPython.display import clear_output

!pip install --upgrade pip
!pip install findspark
!pip install pyspark

clear_output(wait=False)

# Importing objects

In [3]:
import findspark, pyspark
from pyspark.sql import SparkSession
from pyspark import SparkFiles

# Global Settings
Only needed in environments other than Databricks.

In [4]:
# Initialise findspark before the session is built; per the section note this
# step is only required outside Databricks.
findspark.init()

# Notebook-wide Spark session used by every cell below.
spark = (
    SparkSession.builder
    .getOrCreate()
)

# Reading data source

In [5]:
# Remote location of the churn dataset.
url = 'https://raw.githubusercontent.com/edsonlourenco/public_datasets/main/Churn.csv'

# Register the remote file with the Spark context, then look up its path by file name.
spark.sparkContext.addFile(url)
csv_churn = SparkFiles.get("Churn.csv")

# The file is semicolon-separated with a header row; column types are inferred.
df_churn = spark.read.csv(
    csv_churn,
    header=True,
    inferSchema=True,
    sep=';',
)

### Checking **data**

## Transform StringIndexer

### Importing **StringIndexer** class

In [6]:
from pyspark.ml.feature import StringIndexer

### Indexing strings to numbers






In [7]:
# Preview the loaded data without truncating cell values (20 rows is the default).
df_churn.show(n=20, truncate=False)

+-----------+---------+------+---+------+--------+-------------+---------+--------------+---------------+------+
|CreditScore|Geography|Gender|Age|Tenure|Balance |NumOfProducts|HasCrCard|IsActiveMember|EstimatedSalary|Exited|
+-----------+---------+------+---+------+--------+-------------+---------+--------------+---------------+------+
|619        |France   |Female|42 |2     |0       |1            |1        |1             |10134888       |1     |
|608        |Spain    |Female|41 |1     |8380786 |1            |0        |1             |11254258       |0     |
|502        |France   |Female|42 |8     |1596608 |3            |1        |0             |11393157       |1     |
|699        |France   |Female|39 |1     |0       |2            |0        |0             |9382663        |0     |
|850        |Spain    |Female|43 |2     |12551082|1            |1        |1             |790841         |0     |
|645        |Spain    |Male  |44 |8     |11375578|2            |1        |0             |1497567

In [8]:
# Configure the indexer: read strings from "Geography", write numeric codes to "indice".
indice = StringIndexer(
    inputCol="Geography",
    outputCol="indice",
)

# Fit on the churn data, then apply the fitted model to the same frame.
modelo = indice.fit(df_churn)
dadoscomindice = modelo.transform(df_churn)

# Show each original value next to the index it was assigned.
dadoscomindice.select("Geography", "indice").show(n=10)

+---------+------+
|Geography|indice|
+---------+------+
|   France|   0.0|
|    Spain|   2.0|
|   France|   0.0|
|   France|   0.0|
|    Spain|   2.0|
|    Spain|   2.0|
|   France|   0.0|
|  Germany|   1.0|
|   France|   0.0|
|   France|   0.0|
+---------+------+
only showing top 10 rows



## Transform Index to String

### Importing the **IndexToString** class

In [9]:
from pyspark.ml.feature import IndexToString

### Converting indices back to strings

In [11]:
# Configure the reverse mapping: read codes from "indice", write strings to "categoriaoriginal".
retorno = IndexToString(
    inputCol="indice",
    outputCol="categoriaoriginal",
)

# Apply it to the indexed frame produced by the StringIndexer step.
convertido = retorno.transform(dadoscomindice)

# Show the original value, its index, and the recovered string side by side.
convertido.select("Geography", "indice", "categoriaoriginal").show(n=5)

+---------+------+-----------------+
|Geography|indice|categoriaoriginal|
+---------+------+-----------------+
|   France|   0.0|           France|
|    Spain|   2.0|            Spain|
|   France|   0.0|           France|
|   France|   0.0|           France|
|    Spain|   2.0|            Spain|
+---------+------+-----------------+
only showing top 5 rows

