In [19]:
import findspark
findspark.init()
from pyspark.sql import SparkSession
# Build the notebook's SparkSession (or pick up one that already exists):
# local master "local[4]", application name "StringOps".
# NOTE(review): with a local[...] master the executor-memory setting may have
# no practical effect — confirm against the Spark configuration docs.
spark = (
    SparkSession.builder
    .master("local[4]")
    .appName("StringOps")
    .config("spark.executor.memory", "4g")
    .config("spark.driver.memory", "2g")
    .getOrCreate()
)

In [20]:
# Load the raw CSV: the first row supplies the column names, column types are
# inferred from the data, and fields are comma-separated.
df = (
    spark.read
    .option("header", "True")
    .option("inferSchema", "True")
    .option("sep", ",")
    .csv("sources/retail_db/simple_dirty_data.csv")
)

# Quick look at the raw rows before any clean-up.
df.show()


+------+---------+---+--------+-----------+-----------+-----------+--------------------+
|sirano|     isim|yas|cinsiyet|     meslek|      sehir|aylik_gelir|            mal_mulk|
+------+---------+---+--------+-----------+-----------+-----------+--------------------+
|     1|    Cemal| 35|       E|       Isci|     Ankara|     3500.0|               araba|
|     2|   ceyda | 42|       K|      Memur|    Kayseri|     4200.0|            araba|ev|
|     3|    Timur| 30|    null|   Müzüsyen|Istanbul   |     9000.0|     araba|ev|yazlık|
|     4|   Burcu | 29|       K|Pazarlamacı|     Ankara|     4200.0|               araba|
|     5|  Yasemin| 23|       K|Pazarlamaci|      Bursa|     4800.0|               araba|
|     6|      Ali| 33|       E|      Memur|     Ankara|     4250.0|                  ev|
|     7|    Dilek| 29|       K|Pazarlamaci|   Istanbul|     7300.0|        araba|yazlık|
|     8|    Murat| 31|       E|   Müzüsyen|   Istanbul|    12000.0|araba|ev|dükkan|y...|
|     9|    Ahmet| 33

In [21]:
from pyspark.sql import functions as F

In [22]:
# Column expressions are all built against the raw frame `df`, then applied in
# one chain so the original frame stays untouched.

# isim: capitalise each word, then strip surrounding whitespace.
isim_clean = F.trim(F.initcap(df.isim))

# cinsiyet: keep the value as-is, substituting "Bilinmiyor" where it is null.
cinsiyet_filled = F.when(df.cinsiyet.isNull(), "Bilinmiyor").otherwise(df.cinsiyet)

# sehir: upper-case and strip whitespace; nulls become "Bilinmiyor"
# (the placeholder itself is not upper-cased).
sehir_clean = F.when(df.sehir.isNull(), "Bilinmiyor").otherwise(
    F.trim(F.upper(df.sehir))
)

df2 = (
    df.withColumn("isim", isim_clean)
    .withColumn("cinsiyet", cinsiyet_filled)
    .withColumn("sehir", sehir_clean)
)

# Show up to 15 rows of the cleaned frame.
df2.show(15)

+------+--------+---+----------+-----------+----------+-----------+--------------------+
|sirano|    isim|yas|  cinsiyet|     meslek|     sehir|aylik_gelir|            mal_mulk|
+------+--------+---+----------+-----------+----------+-----------+--------------------+
|     1|   Cemal| 35|         E|       Isci|    ANKARA|     3500.0|               araba|
|     2|   Ceyda| 42|         K|      Memur|   KAYSERI|     4200.0|            araba|ev|
|     3|   Timur| 30|Bilinmiyor|   Müzüsyen|  ISTANBUL|     9000.0|     araba|ev|yazlık|
|     4|   Burcu| 29|         K|Pazarlamacı|    ANKARA|     4200.0|               araba|
|     5| Yasemin| 23|         K|Pazarlamaci|     BURSA|     4800.0|               araba|
|     6|     Ali| 33|         E|      Memur|    ANKARA|     4250.0|                  ev|
|     7|   Dilek| 29|         K|Pazarlamaci|  ISTANBUL|     7300.0|        araba|yazlık|
|     8|   Murat| 31|         E|   Müzüsyen|  ISTANBUL|    12000.0|araba|ev|dükkan|y...|
|     9|   Ahmet| 33|

## Writing to Disk ##

In [24]:
# Persist the cleaned frame as comma-separated CSV with a header row.
# coalesce(1) reduces the frame to a single partition before writing, and
# mode("overwrite") replaces whatever is already at the target path.
(
    df2.coalesce(1)
    .write
    .mode("overwrite")
    .option("sep", ",")
    .option("header", True)
    .csv("sources/retail_db/simple_dirty_data")
)

In [25]:
## Read Back ##

In [27]:
# Re-load the cleaned data from the path the write step targeted, using the
# same reader options as the initial load (header row, inferred types, comma).
# NOTE: this rebinds `df`, which previously held the raw frame.
df = (
    spark.read
    .option("header", "True")
    .option("inferSchema", "True")
    .option("sep", ",")
    .csv("sources/retail_db/simple_dirty_data")
)

# Confirm that the round trip preserved the cleaned values.
df.show()

+------+--------+---+----------+-----------+----------+-----------+--------------------+
|sirano|    isim|yas|  cinsiyet|     meslek|     sehir|aylik_gelir|            mal_mulk|
+------+--------+---+----------+-----------+----------+-----------+--------------------+
|     1|   Cemal| 35|         E|       Isci|    ANKARA|     3500.0|               araba|
|     2|   Ceyda| 42|         K|      Memur|   KAYSERI|     4200.0|            araba|ev|
|     3|   Timur| 30|Bilinmiyor|   Müzüsyen|  ISTANBUL|     9000.0|     araba|ev|yazlık|
|     4|   Burcu| 29|         K|Pazarlamacı|    ANKARA|     4200.0|               araba|
|     5| Yasemin| 23|         K|Pazarlamaci|     BURSA|     4800.0|               araba|
|     6|     Ali| 33|         E|      Memur|    ANKARA|     4250.0|                  ev|
|     7|   Dilek| 29|         K|Pazarlamaci|  ISTANBUL|     7300.0|        araba|yazlık|
|     8|   Murat| 31|         E|   Müzüsyen|  ISTANBUL|    12000.0|araba|ev|dükkan|y...|
|     9|   Ahmet| 33|