<a href="https://colab.research.google.com/github/egnsuresh/Spark_Practice/blob/master/difference_between_concat_ws_vs_concat_functions_in_pyspark.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>



---



---



---



# 1. PySpark installation, Spark session creation, and importing common functions

---

---





In [1]:
!pip install pyspark

Collecting pyspark
  Downloading pyspark-3.5.0.tar.gz (316.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m316.9/316.9 MB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.0-py2.py3-none-any.whl size=317425345 sha256=78f82f2344498ddea20a7d3d14ae4250748676fa77db9aec7d4877cad45ba198
  Stored in directory: /root/.cache/pip/wheels/41/4e/10/c2cf2467f71c678cfc8a6b9ac9241e5e44a01940da8fbb17fc
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.0


In [2]:
import pyspark
# Import SparkSession explicitly rather than reaching it through
# `pyspark.sql.<...>` attribute access, which relies on the `sql` submodule
# having been loaded as a side effect of `import pyspark`.
from pyspark.sql import SparkSession

# Reuse the already-active session if there is one, otherwise create a new one.
spark = SparkSession.builder.getOrCreate()

In [3]:
from pyspark.sql import functions as f

# 2. Sample data creation & Data profiling

---


---




In [4]:
# Four sample customer transactions. Row 2 has no first_name and row 3 has no
# last_name (None), which gives the dataset NULL values in the name columns.
cust_df = spark.createDataFrame(
    data=[
        (1, "John", "Doe", "Deposit", "2022-03-08", "10:00:00"),
        (2, None, "Doe", "Withdrawal", "2022-03-09", "11:00:00"),
        (3, "Jane", None, "Deposit", "2022-03-10", "12:00:00"),
        (4, "Jane", "Smith", "Withdrawal", "2022-03-11", "13:00:00"),
    ],
    # Only column names are supplied; column types are inferred from the data.
    schema=["cif", "first_name", "last_name", "tran_type", "date", "time"],
)

# Quick profile: the rows without truncation, the inferred schema, and the row count.
cust_df.show(n=4, truncate=False)
cust_df.printSchema()
cust_df.count()

+---+----------+---------+----------+----------+--------+
|cif|first_name|last_name|tran_type |date      |time    |
+---+----------+---------+----------+----------+--------+
|1  |John      |Doe      |Deposit   |2022-03-08|10:00:00|
|2  |NULL      |Doe      |Withdrawal|2022-03-09|11:00:00|
|3  |Jane      |NULL     |Deposit   |2022-03-10|12:00:00|
|4  |Jane      |Smith    |Withdrawal|2022-03-11|13:00:00|
+---+----------+---------+----------+----------+--------+

root
 |-- cif: long (nullable = true)
 |-- first_name: string (nullable = true)
 |-- last_name: string (nullable = true)
 |-- tran_type: string (nullable = true)
 |-- date: string (nullable = true)
 |-- time: string (nullable = true)



4

# 3. concat_ws vs concat (only differences)

---



---



  **What if the input columns passed to `concat` contain null values?**

In [5]:

# Build a full name with concat, inserting a literal space between the two
# name columns. As the output shows, a NULL in either input column makes the
# whole concatenated value NULL.
cust_df.withColumn(
    "full_name_with_concat",
    f.concat(f.col("first_name"), f.lit(" "), f.col("last_name")),
).show(n=4, truncate=False)


+---+----------+---------+----------+----------+--------+---------------------+
|cif|first_name|last_name|tran_type |date      |time    |full_name_with_concat|
+---+----------+---------+----------+----------+--------+---------------------+
|1  |John      |Doe      |Deposit   |2022-03-08|10:00:00|John Doe             |
|2  |NULL      |Doe      |Withdrawal|2022-03-09|11:00:00|NULL                 |
|3  |Jane      |NULL     |Deposit   |2022-03-10|12:00:00|NULL                 |
|4  |Jane      |Smith    |Withdrawal|2022-03-11|13:00:00|Jane Smith           |
+---+----------+---------+----------+----------+--------+---------------------+



In [6]:
# Syntax: concat_ws(<separator, e.g. " " or "," or "|">, col1, col2, ..., colN)
# As the output shows, NULL inputs are skipped instead of turning the whole
# result into NULL, and no separator is emitted for the skipped value.
cust_df.withColumn(
    "full_name_with_concat_ws",
    f.concat_ws(" ", f.col("first_name"), f.col("last_name")),
).show(n=4, truncate=False)


+---+----------+---------+----------+----------+--------+------------------------+
|cif|first_name|last_name|tran_type |date      |time    |full_name_with_concat_ws|
+---+----------+---------+----------+----------+--------+------------------------+
|1  |John      |Doe      |Deposit   |2022-03-08|10:00:00|John Doe                |
|2  |NULL      |Doe      |Withdrawal|2022-03-09|11:00:00|Doe                     |
|3  |Jane      |NULL     |Deposit   |2022-03-10|12:00:00|Jane                    |
|4  |Jane      |Smith    |Withdrawal|2022-03-11|13:00:00|Jane Smith              |
+---+----------+---------+----------+----------+--------+------------------------+



In [7]:
# Syntax: concat_ws(<separator, e.g. " " or "," or "|">, col1, col2, ..., colN)
# Repeat the concat_ws example with two other separators: "," first, then "|".
for separator in (",", "|"):
    cust_df.withColumn(
        "full_name_with_concat_ws",
        f.concat_ws(separator, f.col("first_name"), f.col("last_name")),
    ).show(n=4, truncate=False)


+---+----------+---------+----------+----------+--------+------------------------+
|cif|first_name|last_name|tran_type |date      |time    |full_name_with_concat_ws|
+---+----------+---------+----------+----------+--------+------------------------+
|1  |John      |Doe      |Deposit   |2022-03-08|10:00:00|John,Doe                |
|2  |NULL      |Doe      |Withdrawal|2022-03-09|11:00:00|Doe                     |
|3  |Jane      |NULL     |Deposit   |2022-03-10|12:00:00|Jane                    |
|4  |Jane      |Smith    |Withdrawal|2022-03-11|13:00:00|Jane,Smith              |
+---+----------+---------+----------+----------+--------+------------------------+

+---+----------+---------+----------+----------+--------+------------------------+
|cif|first_name|last_name|tran_type |date      |time    |full_name_with_concat_ws|
+---+----------+---------+----------+----------+--------+------------------------+
|1  |John      |Doe      |Deposit   |2022-03-08|10:00:00|John|Doe                |
|2 

The End

---



---



---

