In [7]:
from pyspark.sql import SparkSession
import os

In [8]:
# Create (or reuse) the Spark session for this notebook, running locally
# with the master set to "local[4]" and explicit executor/driver memory.
spark = (
    SparkSession.builder
    .master("local[4]")
    .appName("Write-CSV-TO-Disk")
    .config("spark.executor.memory", "4g")
    .config("spark.driver.memory", "2g")
    .getOrCreate()
)

In [9]:
# Keep a handle on the session's SparkContext and restrict its logging
# to ERROR-level messages.
sc = spark.sparkContext
sc.setLogLevel(logLevel="ERROR")

In [10]:
# Absolute path of the working directory; the CSV paths below are built from it.
current_dir = os.path.abspath(os.curdir)

In [12]:


# Load the raw CSV from the working directory: the first line is treated as
# the header, column types are inferred, and fields are comma-separated.
df = (
    spark.read
    .options(header="True", inferSchema="True", sep=",")
    .csv(f"{current_dir}/simple_dirty_data.csv")
)



In [13]:
# Preview the first 15 rows of the raw data.
df.show(n=15)

+------+---------+---+--------+-----------+-----------+-----------+--------------------+
|sirano|     isim|yas|cinsiyet|     meslek|      sehir|aylik_gelir|            mal_mulk|
+------+---------+---+--------+-----------+-----------+-----------+--------------------+
|     1|    Cemal| 35|       E|       Isci|     Ankara|     3500.0|               araba|
|     2|   ceyda | 42|       K|      Memur|    Kayseri|     4200.0|            araba|ev|
|     3|    Timur| 30|    null|   Müzüsyen|Istanbul   |     9000.0|     araba|ev|yazlık|
|     4|   Burcu | 29|       K|Pazarlamacı|     Ankara|     4200.0|               araba|
|     5|  Yasemin| 23|       K|Pazarlamaci|      Bursa|     4800.0|               araba|
|     6|      Ali| 33|       E|      Memur|     Ankara|     4250.0|                  ev|
|     7|    Dilek| 29|       K|Pazarlamaci|   Istanbul|     7300.0|        araba|yazlık|
|     8|    Murat| 31|       E|   Müzüsyen|   Istanbul|    12000.0|araba|ev|dükkan|y...|
|     9|    Ahmet| 33

In [14]:
from pyspark.sql import functions as F

# Normalise the text columns of the raw frame:
#   isim     -> capitalise each word, then strip surrounding whitespace
#   cinsiyet -> substitute "U" where the value is null
#   sehir    -> upper-case and strip; "BİLİNMİYOR" where the value is null
df2 = (
    df
    .withColumn("isim", F.trim(F.initcap(F.col("isim"))))
    .withColumn(
        "cinsiyet",
        F.when(F.col("cinsiyet").isNull(), "U").otherwise(F.col("cinsiyet")),
    )
    .withColumn(
        "sehir",
        F.when(F.col("sehir").isNull(), "BİLİNMİYOR")
        .otherwise(F.trim(F.upper(F.col("sehir")))),
    )
)

df2.show(15)

+------+--------+---+--------+-----------+----------+-----------+--------------------+
|sirano|    isim|yas|cinsiyet|     meslek|     sehir|aylik_gelir|            mal_mulk|
+------+--------+---+--------+-----------+----------+-----------+--------------------+
|     1|   Cemal| 35|       E|       Isci|    ANKARA|     3500.0|               araba|
|     2|   Ceyda| 42|       K|      Memur|   KAYSERI|     4200.0|            araba|ev|
|     3|   Timur| 30|       U|   Müzüsyen|  ISTANBUL|     9000.0|     araba|ev|yazlık|
|     4|   Burcu| 29|       K|Pazarlamacı|    ANKARA|     4200.0|               araba|
|     5| Yasemin| 23|       K|Pazarlamaci|     BURSA|     4800.0|               araba|
|     6|     Ali| 33|       E|      Memur|    ANKARA|     4250.0|                  ev|
|     7|   Dilek| 29|       K|Pazarlamaci|  ISTANBUL|     7300.0|        araba|yazlık|
|     8|   Murat| 31|       E|   Müzüsyen|  ISTANBUL|    12000.0|araba|ev|dükkan|y...|
|     9|   Ahmet| 33|       E|     Doktor| 

In [17]:
# Reduce the cleaned frame to a single partition and write it as
# comma-separated CSV with a header row, overwriting any existing output
# at the target directory.
(
    df2.coalesce(1)
    .write
    .mode("overwrite")
    .options(sep=",", header="True")
    .csv(f"{current_dir}/clean_data")
)

In [18]:
# Read the cleaned output directory back in (header row, inferred types,
# comma separator) and preview it to check the round trip.
df3 = (
    spark.read
    .options(header="True", inferSchema="True", sep=",")
    .csv(f"{current_dir}/clean_data")
)

df3.show(n=15)


+------+--------+---+--------+-----------+----------+-----------+--------------------+
|sirano|    isim|yas|cinsiyet|     meslek|     sehir|aylik_gelir|            mal_mulk|
+------+--------+---+--------+-----------+----------+-----------+--------------------+
|     1|   Cemal| 35|       E|       Isci|    ANKARA|     3500.0|               araba|
|     2|   Ceyda| 42|       K|      Memur|   KAYSERI|     4200.0|            araba|ev|
|     3|   Timur| 30|       U|   Müzüsyen|  ISTANBUL|     9000.0|     araba|ev|yazlık|
|     4|   Burcu| 29|       K|Pazarlamacı|    ANKARA|     4200.0|               araba|
|     5| Yasemin| 23|       K|Pazarlamaci|     BURSA|     4800.0|               araba|
|     6|     Ali| 33|       E|      Memur|    ANKARA|     4250.0|                  ev|
|     7|   Dilek| 29|       K|Pazarlamaci|  ISTANBUL|     7300.0|        araba|yazlık|
|     8|   Murat| 31|       E|   Müzüsyen|  ISTANBUL|    12000.0|araba|ev|dükkan|y...|
|     9|   Ahmet| 33|       E|     Doktor| 