In [1]:
import findspark
findspark.init()
from pyspark.sql import SparkSession
# Create (or reuse) the SparkSession for this notebook: local master with
# 4 threads, app name "StringOps", and explicit executor/driver memory settings.
spark = (
    SparkSession.builder
    .master("local[4]")
    .appName("StringOps")
    .config("spark.executor.memory", "4g")
    .config("spark.driver.memory", "2g")
    .getOrCreate()
)

In [4]:
# Read the sample CSV: the first line is the header, column types are
# inferred from the data, and fields are comma-separated.
df = (
    spark.read
    .options(header="True", inferSchema="True", sep=",")
    .csv("sources/retail_db/simple_dirty_data.csv")
)

# Print the first rows as a text table to inspect the raw (uncleaned) values.
df.show()


+------+---------+---+--------+-----------+-----------+-----------+--------------------+
|sirano|     isim|yas|cinsiyet|     meslek|      sehir|aylik_gelir|            mal_mulk|
+------+---------+---+--------+-----------+-----------+-----------+--------------------+
|     1|    Cemal| 35|       E|       Isci|     Ankara|     3500.0|               araba|
|     2|   ceyda | 42|       K|      Memur|    Kayseri|     4200.0|            araba|ev|
|     3|    Timur| 30|    null|   Müzüsyen|Istanbul   |     9000.0|     araba|ev|yazlık|
|     4|   Burcu | 29|       K|Pazarlamacı|     Ankara|     4200.0|               araba|
|     5|  Yasemin| 23|       K|Pazarlamaci|      Bursa|     4800.0|               araba|
|     6|      Ali| 33|       E|      Memur|     Ankara|     4250.0|                  ev|
|     7|    Dilek| 29|       K|Pazarlamaci|   Istanbul|     7300.0|        araba|yazlık|
|     8|    Murat| 31|       E|   Müzüsyen|   Istanbul|    12000.0|araba|ev|dükkan|y...|
|     9|    Ahmet| 33

In [5]:
from pyspark.sql.functions import *

## 1 Concat ##

In [6]:
"""
    select'i kaldırırsan tümünü seçip yanına ekler.
"""
# Join occupation and city into one string column separated by " - ".
# Without the select(), withColumn would append meslek_sehir to all original columns.
# concat() yields null when any input is null (visible for the row whose sehir is null).
(
    df.select("meslek", "sehir")
    .withColumn(
        "meslek_sehir",
        concat(col("meslek"), lit(" - "), col("sehir")),
    )
    .show(truncate=False)
)

+-----------+-----------+------------------------+
|meslek     |sehir      |meslek_sehir            |
+-----------+-----------+------------------------+
|Isci       |Ankara     |Isci - Ankara           |
|Memur      |Kayseri    |Memur - Kayseri         |
|Müzüsyen   |Istanbul   |Müzüsyen - Istanbul     |
|Pazarlamacı|    Ankara |Pazarlamacı -     Ankara|
|Pazarlamaci|Bursa      |Pazarlamaci - Bursa     |
|Memur      |Ankara     |Memur - Ankara          |
|Pazarlamaci|Istanbul   |Pazarlamaci - Istanbul  |
|Müzüsyen   |Istanbul   |Müzüsyen - Istanbul     |
|Doktor     |Ankara     |Doktor - Ankara         |
|Berber     | Istanbul  |Berber -  Istanbul      |
|Tuhafiyeci |null       |null                    |
|Tornacı    | Ankara    |Tornacı -  Ankara       |
|memur      |Çorum      |memur - Çorum           |
|Doktor     |İzmir      |Doktor - İzmir          |
|Müzisyen   | Ankara    |Müzisyen -  Ankara      |
+-----------+-----------+------------------------+



## Number Format ##


In [8]:
# Render monthly income with thousands separators and exactly 2 decimal places.
(
    df.select("aylik_gelir")
    .withColumn(
        "aylik_gelir_format",
        format_number(col("aylik_gelir"), 2),
    )
    .show()
)

+-----------+------------------+
|aylik_gelir|aylik_gelir_format|
+-----------+------------------+
|     3500.0|          3,500.00|
|     4200.0|          4,200.00|
|     9000.0|          9,000.00|
|     4200.0|          4,200.00|
|     4800.0|          4,800.00|
|     4250.0|          4,250.00|
|     7300.0|          7,300.00|
|    12000.0|         12,000.00|
|   180000.0|        180,000.00|
|    12000.0|         12,000.00|
|        4.8|              4.80|
|     4200.0|          4,200.00|
|     3750.0|          3,750.00|
|    14250.0|         14,250.00|
|     8700.0|          8,700.00|
+-----------+------------------+



In [9]:
# lower / initcap / length on string columns.
# length() counts surrounding whitespace as well, so untrimmed values such as
# "Istanbul   " report 11 characters in the output.
(
    df.select("meslek", "isim", "sehir")
    .withColumn("meslek_lower", lower(col("meslek")))
    .withColumn("isim_initcap", initcap(col("isim")))
    .withColumn("sehir_length", length(col("sehir")))
    .show()
)

+-----------+---------+-----------+------------+------------+------------+
|     meslek|     isim|      sehir|meslek_lower|isim_initcap|sehir_length|
+-----------+---------+-----------+------------+------------+------------+
|       Isci|    Cemal|     Ankara|        isci|       Cemal|           6|
|      Memur|   ceyda |    Kayseri|       memur|      Ceyda |           7|
|   Müzüsyen|    Timur|Istanbul   |    müzüsyen|       Timur|          11|
|Pazarlamacı|   Burcu |     Ankara| pazarlamacı|      Burcu |          10|
|Pazarlamaci|  Yasemin|      Bursa| pazarlamaci|     Yasemin|           5|
|      Memur|      Ali|     Ankara|       memur|         Ali|           6|
|Pazarlamaci|    Dilek|   Istanbul| pazarlamaci|       Dilek|           8|
|   Müzüsyen|    Murat|   Istanbul|    müzüsyen|       Murat|           8|
|     Doktor|    Ahmet|     Ankara|      doktor|       Ahmet|           6|
|     Berber| Muhittin|   Istanbul|      berber|    Muhittin|           9|
| Tuhafiyeci| Hicaziye|  

In [21]:
# Note: drop the select() to carry every original column along.
# - sehir_ist:      replace the substring "Ist" with "İST"
# - mal_mulk_split: split the pipe-delimited assets into an array; the pattern
#                   is a regex, so the pipe has to be escaped
# - mal_mulk_ilk:   first element of that array
(
    df.select("sehir", "mal_mulk")
    .withColumn("sehir_ist", regexp_replace(col("sehir"), "Ist", "İST"))
    .withColumn("mal_mulk_split", split(col("mal_mulk"), "\\|"))
    .withColumn("mal_mulk_ilk", col("mal_mulk_split")[0])
    .show(truncate=False)
)

+-----------+----------------------+-----------+---------------------------+------------+
|sehir      |mal_mulk              |sehir_ist  |mal_mulk_split             |mal_mulk_ilk|
+-----------+----------------------+-----------+---------------------------+------------+
|Ankara     |araba                 |Ankara     |[araba]                    |araba       |
|Kayseri    |araba|ev              |Kayseri    |[araba, ev]                |araba       |
|Istanbul   |araba|ev|yazlık       |İSTanbul   |[araba, ev, yazlık]        |araba       |
|    Ankara |araba                 |    Ankara |[araba]                    |araba       |
|Bursa      |araba                 |Bursa      |[araba]                    |araba       |
|Ankara     |ev                    |Ankara     |[ev]                       |ev          |
|Istanbul   |araba|yazlık          |İSTanbul   |[araba, yazlık]            |araba       |
|Istanbul   |araba|ev|dükkan|yazlık|İSTanbul   |[araba, ev, dükkan, yazlık]|araba       |
|Ankara   

In [17]:
# Same three derived columns as the preview cell, but applied to the full
# DataFrame (no select) and kept in df2 for later use.
df2 = (
    df
    .withColumn("sehir_ist", regexp_replace(col("sehir"), "Ist", "İST"))
    .withColumn("mal_mulk_split", split(col("mal_mulk"), "\\|"))
    .withColumn("mal_mulk_ilk", col("mal_mulk_split")[0])
)

In [18]:
# Print df2: the original columns followed by sehir_ist, mal_mulk_split and mal_mulk_ilk.
df2.show()

+------+---------+---+--------+-----------+-----------+-----------+--------------------+-----------+--------------------+------------+
|sirano|     isim|yas|cinsiyet|     meslek|      sehir|aylik_gelir|            mal_mulk|  sehir_ist|      mal_mulk_split|mal_mulk_ilk|
+------+---------+---+--------+-----------+-----------+-----------+--------------------+-----------+--------------------+------------+
|     1|    Cemal| 35|       E|       Isci|     Ankara|     3500.0|               araba|     Ankara|             [araba]|       araba|
|     2|   ceyda | 42|       K|      Memur|    Kayseri|     4200.0|            araba|ev|    Kayseri|         [araba, ev]|       araba|
|     3|    Timur| 30|    null|   Müzüsyen|Istanbul   |     9000.0|     araba|ev|yazlık|İSTanbul   | [araba, ev, yazlık]|       araba|
|     4|   Burcu | 29|       K|Pazarlamacı|     Ankara|     4200.0|               araba|     Ankara|             [araba]|       araba|
|     5|  Yasemin| 23|       K|Pazarlamaci|      Bursa|