# Strings and Text Functions

## String to Index

In [1]:
import findspark

# findspark.init() has to run BEFORE pyspark is imported: it locates the Spark
# installation and adds its Python packages to sys.path. Calling it after the
# pyspark import (as before) only works when pyspark is already importable.
findspark.init()

import pyspark
from pyspark.sql import SparkSession

# Reuse an existing session if one is already running, otherwise start one.
spark = SparkSession.builder.appName("stringandtextfunctions").getOrCreate()

24/03/29 16:34:39 WARN Utils: Your hostname, pop-os resolves to a loopback address: 127.0.1.1; using 192.168.0.111 instead (on interface wlo1)
24/03/29 16:34:39 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/03/29 16:34:39 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [2]:
from pyspark.ml.feature import StringIndexer

In [3]:
# Load the semicolon-separated churn dataset; the first line holds the column
# names and column types are inferred from the data.
churn = (
    spark.read
    .options(header=True, inferSchema=True, sep=";")
    .csv("../0_data/Churn.csv")
)
churn.show(n=5)

+-----------+---------+------+---+------+--------+-------------+---------+--------------+---------------+------+
|CreditScore|Geography|Gender|Age|Tenure| Balance|NumOfProducts|HasCrCard|IsActiveMember|EstimatedSalary|Exited|
+-----------+---------+------+---+------+--------+-------------+---------+--------------+---------------+------+
|        619|   France|Female| 42|     2|       0|            1|        1|             1|       10134888|     1|
|        608|    Spain|Female| 41|     1| 8380786|            1|        0|             1|       11254258|     0|
|        502|   France|Female| 42|     8| 1596608|            3|        1|             0|       11393157|     1|
|        699|   France|Female| 39|     1|       0|            2|        0|             0|        9382663|     0|
|        850|    Spain|Female| 43|     2|12551082|            1|        1|             1|         790841|     0|
+-----------+---------+------+---+------+--------+-------------+---------+--------------+---------------+------+

In [4]:
# Map each distinct Geography string to a numeric label in a new "index" column.
# With the default ordering, the most frequent value receives 0.0.
index = StringIndexer().setInputCol("Geography").setOutputCol("index")
model = index.fit(dataset=churn)            # learn the string -> index mapping
indexed = model.transform(dataset=churn)    # append the "index" column
indexed.select(["Geography", "index"]).show(n=15)

+---------+-----+
|Geography|index|
+---------+-----+
|   France|  0.0|
|    Spain|  2.0|
|   France|  0.0|
|   France|  0.0|
|    Spain|  2.0|
|    Spain|  2.0|
|   France|  0.0|
|  Germany|  1.0|
|   France|  0.0|
|   France|  0.0|
|   France|  0.0|
|    Spain|  2.0|
|   France|  0.0|
|   France|  0.0|
|    Spain|  2.0|
+---------+-----+
only showing top 15 rows



## Index to String

In [5]:
from pyspark.ml.feature import IndexToString

In [6]:
# Reverse the indexing: turn the numeric "index" column back into its label.
# No labels are passed explicitly, so the transformer relies on the label
# metadata that StringIndexer attached to the "index" column.
text = (
    IndexToString()
    .setInputCol("index")
    .setOutputCol("original_category")
)
converted = text.transform(dataset=indexed)
converted.select(["Geography", "index", "original_category"]).show(n=15)

+---------+-----+-----------------+
|Geography|index|original_category|
+---------+-----+-----------------+
|   France|  0.0|           France|
|    Spain|  2.0|            Spain|
|   France|  0.0|           France|
|   France|  0.0|           France|
|    Spain|  2.0|            Spain|
|    Spain|  2.0|            Spain|
|   France|  0.0|           France|
|  Germany|  1.0|          Germany|
|   France|  0.0|           France|
|   France|  0.0|           France|
|   France|  0.0|           France|
|    Spain|  2.0|            Spain|
|   France|  0.0|           France|
|   France|  0.0|           France|
|    Spain|  2.0|            Spain|
+---------+-----+-----------------+
only showing top 15 rows



## One-hot encoding

In [8]:
from pyspark.ml.feature import OneHotEncoder

In [9]:
# Index both categorical columns with a single multi-column StringIndexer
# instead of two copy-pasted estimators and two fit/transform passes.
# Each input column is indexed independently:
#   Geography -> indexer_c1
#   Gender    -> indexer_c2
index = StringIndexer(
    inputCols=["Geography", "Gender"],
    outputCols=["indexer_c1", "indexer_c2"],
)
index_onehot = index.fit(churn).transform(churn)

In [11]:
# Peek at the two numeric label columns produced by the indexing step.
index_onehot.select(["indexer_c1", "indexer_c2"]).show(n=5)

+----------+----------+
|indexer_c1|indexer_c2|
+----------+----------+
|       0.0|       1.0|
|       2.0|       1.0|
|       0.0|       1.0|
|       0.0|       1.0|
|       2.0|       1.0|
+----------+----------+
only showing top 5 rows



In [12]:
# Fit a one-hot encoder over both index columns; `onehot` ends up bound to the
# fitted model. With the encoder's default settings the last category of each
# column is dropped, so it is encoded as an all-zero vector.
onehot = OneHotEncoder(
    inputCols=["indexer_c1", "indexer_c2"],
    outputCols=["onehot_c1", "onehot_c2"],
).fit(index_onehot)
onehot_out = onehot.transform(index_onehot)

In [14]:
# The index columns are still present after encoding.
onehot_out.select(["indexer_c1", "indexer_c2"]).show(n=5)

+----------+----------+
|indexer_c1|indexer_c2|
+----------+----------+
|       0.0|       1.0|
|       2.0|       1.0|
|       0.0|       1.0|
|       0.0|       1.0|
|       2.0|       1.0|
+----------+----------+
only showing top 5 rows



In [15]:
# Encoded columns are sparse vectors printed as (size, [indices], [values]);
# an empty index list means every position is zero.
onehot_out.select(["onehot_c1", "onehot_c2"]).show(n=5)

+-------------+---------+
|    onehot_c1|onehot_c2|
+-------------+---------+
|(2,[0],[1.0])|(1,[],[])|
|    (2,[],[])|(1,[],[])|
|(2,[0],[1.0])|(1,[],[])|
|(2,[0],[1.0])|(1,[],[])|
|    (2,[],[])|(1,[],[])|
+-------------+---------+
only showing top 5 rows

