In [8]:
import os

from pyspark.sql import SparkSession
# NOTE: the wildcard import below pulls in Spark functions whose names collide with
# Python builtins (e.g. sum, min, max, round); the builtins are shadowed from here on.
from pyspark.sql.functions import *

In [9]:
# Create (or reuse) the notebook-wide SparkSession on a local master,
# with explicit driver and executor memory settings.
# NOTE(review): with a local[*] master the executor memory setting presumably
# has no effect, since work runs inside the driver process — confirm.
spark = (
    SparkSession.builder
    .appName("df_fundamentals")
    .config("spark.driver.memory", "2g")
    .config("spark.executor.memory", "4g")
    .master("local[*]")
    .getOrCreate()
)

In [10]:
# Keep a handle on the session's SparkContext and set Spark's log level to ERROR
# so the cell outputs below are not cluttered with lower-severity log lines.
sc = spark.sparkContext
sc.setLogLevel("ERROR")

In [13]:
# Load the sample CSV located in the current working directory.
# The file is comma-separated with a header row; column types are inferred.
# Needs `os`, which is imported with the other imports at the top of the notebook
# so that this cell also works after Restart Kernel -> Run All.
df = (
    spark.read
    .option("sep", ",")
    .option("header", "true")
    .option("inferSchema", "true")
    .csv(f"{os.getcwd()}/simple_dirty_data.csv")
)

In [14]:
# Preview the raw data; long cell values are truncated in this default view.
df.show()

+------+---------+---+--------+-----------+-----------+-----------+--------------------+
|sirano|     isim|yas|cinsiyet|     meslek|      sehir|aylik_gelir|            mal_mulk|
+------+---------+---+--------+-----------+-----------+-----------+--------------------+
|     1|    Cemal| 35|       E|       Isci|     Ankara|     3500.0|               araba|
|     2|   ceyda | 42|       K|      Memur|    Kayseri|     4200.0|            araba|ev|
|     3|    Timur| 30|    null|   Müzüsyen|Istanbul   |     9000.0|     araba|ev|yazlık|
|     4|   Burcu | 29|       K|Pazarlamacı|     Ankara|     4200.0|               araba|
|     5|  Yasemin| 23|       K|Pazarlamaci|      Bursa|     4800.0|               araba|
|     6|      Ali| 33|       E|      Memur|     Ankara|     4250.0|                  ev|
|     7|    Dilek| 29|       K|Pazarlamaci|   Istanbul|     7300.0|        araba|yazlık|
|     8|    Murat| 31|       E|   Müzüsyen|   Istanbul|    12000.0|araba|ev|dükkan|y...|
|     9|    Ahmet| 33

In [17]:
# Glue occupation and city together as "<meslek>-<sehir>".
# The raw values are used as-is, so stray whitespace in `sehir` carries over
# into the combined column.
df_concat = df.withColumn(
    "meslek_sehir",
    concat(col("meslek"), lit("-"), col("sehir")),
)

df_concat.show(n=10, truncate=False)

+------+--------+---+--------+-----------+-----------+-----------+----------------------+----------------------+
|sirano|isim    |yas|cinsiyet|meslek     |sehir      |aylik_gelir|mal_mulk              |meslek_sehir          |
+------+--------+---+--------+-----------+-----------+-----------+----------------------+----------------------+
|1     |Cemal   |35 |E       |Isci       |Ankara     |3500.0     |araba                 |Isci-Ankara           |
|2     |ceyda   |42 |K       |Memur      |Kayseri    |4200.0     |araba|ev              |Memur-Kayseri         |
|3     |Timur   |30 |null    |Müzüsyen   |Istanbul   |9000.0     |araba|ev|yazlık       |Müzüsyen-Istanbul     |
|4     |Burcu   |29 |K       |Pazarlamacı|    Ankara |4200.0     |araba                 |Pazarlamacı-    Ankara|
|5     |Yasemin |23 |K       |Pazarlamaci|Bursa      |4800.0     |araba                 |Pazarlamaci-Bursa     |
|6     | Ali    |33 |E       |Memur      |Ankara     |4250.0     |ev                    |Memur-A

In [18]:
# Render the monthly income as text with thousands separators and
# two decimal places (e.g. 3500.0 -> "3,500.00").
df_num_format = df.withColumn(
    "aylik_gelir_format",
    format_number(col("aylik_gelir"), 2),
)

In [19]:
# Show the formatted income next to the original numeric column.
df_num_format.show()

+------+---------+---+--------+-----------+-----------+-----------+--------------------+------------------+
|sirano|     isim|yas|cinsiyet|     meslek|      sehir|aylik_gelir|            mal_mulk|aylik_gelir_format|
+------+---------+---+--------+-----------+-----------+-----------+--------------------+------------------+
|     1|    Cemal| 35|       E|       Isci|     Ankara|     3500.0|               araba|          3,500.00|
|     2|   ceyda | 42|       K|      Memur|    Kayseri|     4200.0|            araba|ev|          4,200.00|
|     3|    Timur| 30|    null|   Müzüsyen|Istanbul   |     9000.0|     araba|ev|yazlık|          9,000.00|
|     4|   Burcu | 29|       K|Pazarlamacı|     Ankara|     4200.0|               araba|          4,200.00|
|     5|  Yasemin| 23|       K|Pazarlamaci|      Bursa|     4800.0|               araba|          4,800.00|
|     6|      Ali| 33|       E|      Memur|     Ankara|     4250.0|                  ev|          4,250.00|
|     7|    Dilek| 29|      

In [26]:
# Basic string helpers: lower-case the occupation, capitalise the name,
# and measure the length of the (untrimmed) city value.
# NOTE(review): "isim_lower" is produced by initcap(), not lower() — the column
# name looks like a copy/paste leftover; rename it if nothing else depends on it.
df_lower = (
    df
    .withColumn("meslek_lower", lower(col("meslek")))
    .withColumn("isim_lower", initcap(col("isim")))
    .withColumn("sehir_length", length(col("sehir")))
)

In [27]:
# Inspect the first five rows without truncating cell contents.
df_lower.show(n=5, truncate=False)

+------+-------+---+--------+-----------+-----------+-----------+---------------+------------+----------+------------+
|sirano|isim   |yas|cinsiyet|meslek     |sehir      |aylik_gelir|mal_mulk       |meslek_lower|isim_lower|sehir_length|
+------+-------+---+--------+-----------+-----------+-----------+---------------+------------+----------+------------+
|1     |Cemal  |35 |E       |Isci       |Ankara     |3500.0     |araba          |isci        |Cemal     |6           |
|2     |ceyda  |42 |K       |Memur      |Kayseri    |4200.0     |araba|ev       |memur       |Ceyda     |7           |
|3     |Timur  |30 |null    |Müzüsyen   |Istanbul   |9000.0     |araba|ev|yazlık|müzüsyen    |Timur     |11          |
|4     |Burcu  |29 |K       |Pazarlamacı|    Ankara |4200.0     |araba          |pazarlamacı |Burcu     |10          |
|5     |Yasemin|23 |K       |Pazarlamaci|Bursa      |4800.0     |araba          |pazarlamaci |Yasemin   |5           |
+------+-------+---+--------+-----------+-------

In [28]:
# Compare the three whitespace-stripping variants on the city column:
# leading only (ltrim), trailing only (rtrim), and both sides (trim).
df_trim = (
    df
    .withColumn("sehir_ltrim", ltrim(col("sehir")))
    .withColumn("sehir_rtrim", rtrim(col("sehir")))
    .withColumn("sehir_trim", trim(col("sehir")))
)

In [29]:
# Untruncated view so the remaining leading/trailing spaces stay visible.
df_trim.show(n=10, truncate=False)

+------+--------+---+--------+-----------+-----------+-----------+----------------------+-----------+-----------+----------+
|sirano|isim    |yas|cinsiyet|meslek     |sehir      |aylik_gelir|mal_mulk              |sehir_ltrim|sehir_rtrim|sehir_trim|
+------+--------+---+--------+-----------+-----------+-----------+----------------------+-----------+-----------+----------+
|1     |Cemal   |35 |E       |Isci       |Ankara     |3500.0     |araba                 |Ankara     |Ankara     |Ankara    |
|2     |ceyda   |42 |K       |Memur      |Kayseri    |4200.0     |araba|ev              |Kayseri    |Kayseri    |Kayseri   |
|3     |Timur   |30 |null    |Müzüsyen   |Istanbul   |9000.0     |araba|ev|yazlık       |Istanbul   |Istanbul   |Istanbul  |
|4     |Burcu   |29 |K       |Pazarlamacı|    Ankara |4200.0     |araba                 |Ankara     |    Ankara |Ankara    |
|5     |Yasemin |23 |K       |Pazarlamaci|Bursa      |4800.0     |araba                 |Bursa      |Bursa      |Bursa     |


In [34]:
# Three derived columns:
#   sehir_ist      - every "Ist" match in the city is replaced by "İST"
#   mal_mulk_split - the pipe-delimited asset string as an array
#                    (the pipe is escaped because the delimiter is a regex)
#   mal_mulk_first - first element of that array
df_replace = (
    df
    .withColumn("sehir_ist", regexp_replace(col("sehir"), "Ist", "İST"))
    .withColumn("mal_mulk_split", split(col("mal_mulk"), "\\|"))
    .withColumn("mal_mulk_first", col("mal_mulk_split")[0])
)

In [35]:
# Untruncated view of the replaced / split columns.
df_replace.show(n=10, truncate=False)

+------+--------+---+--------+-----------+-----------+-----------+----------------------+-----------+---------------------------+--------------+
|sirano|isim    |yas|cinsiyet|meslek     |sehir      |aylik_gelir|mal_mulk              |sehir_ist  |mal_mulk_split             |mal_mulk_first|
+------+--------+---+--------+-----------+-----------+-----------+----------------------+-----------+---------------------------+--------------+
|1     |Cemal   |35 |E       |Isci       |Ankara     |3500.0     |araba                 |Ankara     |[araba]                    |araba         |
|2     |ceyda   |42 |K       |Memur      |Kayseri    |4200.0     |araba|ev              |Kayseri    |[araba, ev]                |araba         |
|3     |Timur   |30 |null    |Müzüsyen   |Istanbul   |9000.0     |araba|ev|yazlık       |İSTanbul   |[araba, ev, yazlık]        |araba         |
|4     |Burcu   |29 |K       |Pazarlamacı|    Ankara |4200.0     |araba                 |    Ankara |[araba]                    |a

In [36]:
# Print the schema to confirm the column types, in particular that
# mal_mulk_split is an array of strings.
df_replace.printSchema()

root
 |-- sirano: integer (nullable = true)
 |-- isim: string (nullable = true)
 |-- yas: integer (nullable = true)
 |-- cinsiyet: string (nullable = true)
 |-- meslek: string (nullable = true)
 |-- sehir: string (nullable = true)
 |-- aylik_gelir: double (nullable = true)
 |-- mal_mulk: string (nullable = true)
 |-- sehir_ist: string (nullable = true)
 |-- mal_mulk_split: array (nullable = true)
 |    |-- element: string (containsNull = false)
 |-- mal_mulk_first: string (nullable = true)

