#### Explode: 
   - Explode function is used to **convert** collection columns **(List, Array & Map) to rows**.
   - When an **array** is passed to explode function, it creates a **new row for each element in array**.
   - When a **map** is passed, it creates **two new columns one for key and one for value** and each element in map split into the rows. If the **array or map is NULL**, that **row is eliminated**.


In [0]:
from pyspark.sql.functions import col, explode
import pyspark.sql.functions as f

In [0]:
help(explode)

Help on function explode in module pyspark.sql.functions:

explode(col: 'ColumnOrName') -> pyspark.sql.column.Column
    Returns a new row for each element in the given array or map.
    Uses the default column name `col` for elements in the array and
    `key` and `value` for elements in the map unless specified otherwise.
    
    .. versionadded:: 1.4.0
    
    .. versionchanged:: 3.4.0
        Support Spark Connect.
    
    Parameters
    ----------
    col : :class:`~pyspark.sql.Column` or str
        target column to work on.
    
    Returns
    -------
    :class:`~pyspark.sql.Column`
        one row per array item or map key value.
    
    See Also
    --------
    :meth:`pyspark.functions.posexplode`
    :meth:`pyspark.functions.explode_outer`
    :meth:`pyspark.functions.posexplode_outer`
    
    Examples
    --------
    >>> from pyspark.sql import Row
    >>> eDF = spark.createDataFrame([Row(a=1, intlist=[1,2,3], mapfield={"a": "b"})])
    >>> eDF.select(explode(eDF.in

##### 1) Create dataframe with array column
- Explode(Array Type)

In [0]:
data = [(1, "Suresh", [".net", "Python", "Spark", "Azure"]),
        (2, "Ramya", ["java", "PySpark", "AWS"]),
        (3, "Apurba", ["C", "SAP", "Mainframes"]),
        (4, "Pranitha", ["COBOL", "DEVOPS"]),
        (5, "Sowmya", ["ABAP"])]

schema = ["id", "Name", "skills"]

df = spark.createDataFrame(data=data, schema=schema)
df.show(truncate=False)
df.printSchema()
print("Number of Rows:", df.count())

+---+--------+----------------------------+
|id |Name    |skills                      |
+---+--------+----------------------------+
|1  |Suresh  |[.net, Python, Spark, Azure]|
|2  |Ramya   |[java, PySpark, AWS]        |
|3  |Apurba  |[C, SAP, Mainframes]        |
|4  |Pranitha|[COBOL, DEVOPS]             |
|5  |Sowmya  |[ABAP]                      |
+---+--------+----------------------------+

root
 |-- id: long (nullable = true)
 |-- Name: string (nullable = true)
 |-- skills: array (nullable = true)
 |    |-- element: string (containsNull = true)

Number of Rows: 5


In [0]:
df = df.withColumn("New-Skills", explode(col('skills')))
display(df)
df.printSchema()

id,Name,skills,New-Skills
1,Suresh,"List(.net, Python, Spark, Azure)",.net
1,Suresh,"List(.net, Python, Spark, Azure)",Python
1,Suresh,"List(.net, Python, Spark, Azure)",Spark
1,Suresh,"List(.net, Python, Spark, Azure)",Azure
2,Ramya,"List(java, PySpark, AWS)",java
2,Ramya,"List(java, PySpark, AWS)",PySpark
2,Ramya,"List(java, PySpark, AWS)",AWS
3,Apurba,"List(C, SAP, Mainframes)",C
3,Apurba,"List(C, SAP, Mainframes)",SAP
3,Apurba,"List(C, SAP, Mainframes)",Mainframes


root
 |-- id: long (nullable = true)
 |-- Name: string (nullable = true)
 |-- skills: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- New-Skills: string (nullable = true)



In [0]:
data1 = [(1, "Suresh", [".net", "Python", "Spark", "Azure"]),
         (2, "Ramya", ["java", "PySpark", "AWS"]),
         (3, "Rakesh", ["ADF", "SQL", None, "GCC"]),
         (4, "Apurba", ["C", "SAP", "Mainframes"]),
         (5, "Pranitha", ["COBOL", "DEVOPS"]),
         (6, "Sowmya", ["ABAP"]),
         (7, "Anand", None),
         (8, "Sourabh", [])]

schema1 = ["id", "Name", "skills"]

df1 = spark.createDataFrame(data=data1, schema=schema1)
df1.show(truncate=False)
df1.printSchema()
print("Number of Rows:", df1.count())

+---+--------+----------------------------+
|id |Name    |skills                      |
+---+--------+----------------------------+
|1  |Suresh  |[.net, Python, Spark, Azure]|
|2  |Ramya   |[java, PySpark, AWS]        |
|3  |Rakesh  |[ADF, SQL, null, GCC]       |
|4  |Apurba  |[C, SAP, Mainframes]        |
|5  |Pranitha|[COBOL, DEVOPS]             |
|6  |Sowmya  |[ABAP]                      |
|7  |Anand   |null                        |
|8  |Sourabh |[]                          |
+---+--------+----------------------------+

root
 |-- id: long (nullable = true)
 |-- Name: string (nullable = true)
 |-- skills: array (nullable = true)
 |    |-- element: string (containsNull = true)

Number of Rows: 8


In [0]:
df1 = df1.withColumn("New-Skills", explode(col('skills')))
display(df1)
df1.printSchema()

id,Name,skills,New-Skills
1,Suresh,"List(.net, Python, Spark, Azure)",.net
1,Suresh,"List(.net, Python, Spark, Azure)",Python
1,Suresh,"List(.net, Python, Spark, Azure)",Spark
1,Suresh,"List(.net, Python, Spark, Azure)",Azure
2,Ramya,"List(java, PySpark, AWS)",java
2,Ramya,"List(java, PySpark, AWS)",PySpark
2,Ramya,"List(java, PySpark, AWS)",AWS
3,Rakesh,"List(ADF, SQL, null, GCC)",ADF
3,Rakesh,"List(ADF, SQL, null, GCC)",SQL
3,Rakesh,"List(ADF, SQL, null, GCC)",


root
 |-- id: long (nullable = true)
 |-- Name: string (nullable = true)
 |-- skills: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- New-Skills: string (nullable = true)



In [0]:
df1.withColumn("New-Skills", explode(col('skills'))).select("id", "Name", "New-Skills").show()

+---+------+----------+
| id|  Name|New-Skills|
+---+------+----------+
|  1|Suresh|      .net|
|  1|Suresh|    Python|
|  1|Suresh|     Spark|
|  1|Suresh|     Azure|
|  1|Suresh|      .net|
|  1|Suresh|    Python|
|  1|Suresh|     Spark|
|  1|Suresh|     Azure|
|  1|Suresh|      .net|
|  1|Suresh|    Python|
|  1|Suresh|     Spark|
|  1|Suresh|     Azure|
|  1|Suresh|      .net|
|  1|Suresh|    Python|
|  1|Suresh|     Spark|
|  1|Suresh|     Azure|
|  2| Ramya|      java|
|  2| Ramya|   PySpark|
|  2| Ramya|       AWS|
|  2| Ramya|      java|
+---+------+----------+
only showing top 20 rows



In [0]:
# using select method
df1.select("id", "Name", explode(col("skills")).alias("New-Skills")).show(truncate=False)

+---+------+----------+
|id |Name  |New-Skills|
+---+------+----------+
|1  |Suresh|.net      |
|1  |Suresh|Python    |
|1  |Suresh|Spark     |
|1  |Suresh|Azure     |
|1  |Suresh|.net      |
|1  |Suresh|Python    |
|1  |Suresh|Spark     |
|1  |Suresh|Azure     |
|1  |Suresh|.net      |
|1  |Suresh|Python    |
|1  |Suresh|Spark     |
|1  |Suresh|Azure     |
|1  |Suresh|.net      |
|1  |Suresh|Python    |
|1  |Suresh|Spark     |
|1  |Suresh|Azure     |
|2  |Ramya |java      |
|2  |Ramya |PySpark   |
|2  |Ramya |AWS       |
|2  |Ramya |java      |
+---+------+----------+
only showing top 20 rows



##### 2) Create dataframe with map column
- Explode(MapType)

In [0]:
data2 = [('Sam', {'Car':'Baleno', 'Bike':'Honda', 'Office':'EcoSpace', 'Technology':'Azure'}),
         ('Krishna', {'Car':'Santro', 'Bike':'RoyalEnfield', 'Office':'Bharathi', 'Technology':'AWS'}),
         ('Arijit', {'Car':'Etios', 'Bike':'BMW', 'Office':'EcoWorld'}),
         ('Swamy', {'Car':'Swift', 'Bike':'TVS'}),
         ('Senthil', None),
         ("Anand", {})]

schema2 = ['EmpName', 'EmpDetails']

df2 = spark.createDataFrame(data=data2, schema=schema2)
display(df2)
df2.show(truncate=False)
df2.printSchema()

EmpName,EmpDetails
Sam,"Map(Office -> EcoSpace, Technology -> Azure, Car -> Baleno, Bike -> Honda)"
Krishna,"Map(Office -> Bharathi, Technology -> AWS, Car -> Santro, Bike -> RoyalEnfield)"
Arijit,"Map(Office -> EcoWorld, Car -> Etios, Bike -> BMW)"
Swamy,"Map(Car -> Swift, Bike -> TVS)"
Senthil,
Anand,Map()


+-------+----------------------------------------------------------------------------+
|EmpName|EmpDetails                                                                  |
+-------+----------------------------------------------------------------------------+
|Sam    |{Office -> EcoSpace, Technology -> Azure, Car -> Baleno, Bike -> Honda}     |
|Krishna|{Office -> Bharathi, Technology -> AWS, Car -> Santro, Bike -> RoyalEnfield}|
|Arijit |{Office -> EcoWorld, Car -> Etios, Bike -> BMW}                             |
|Swamy  |{Car -> Swift, Bike -> TVS}                                                 |
|Senthil|null                                                                        |
|Anand  |{}                                                                          |
+-------+----------------------------------------------------------------------------+

root
 |-- EmpName: string (nullable = true)
 |-- EmpDetails: map (nullable = true)
 |    |-- key: string
 |    |-- value: string 

In [0]:
df2 = df2.select(df2.EmpName, df2.EmpDetails, explode(df2.EmpDetails))
display(df2)
df2.printSchema()

EmpName,EmpDetails,key,value
Sam,"Map(Office -> EcoSpace, Technology -> Azure, Car -> Baleno, Bike -> Honda)",Office,EcoSpace
Sam,"Map(Office -> EcoSpace, Technology -> Azure, Car -> Baleno, Bike -> Honda)",Technology,Azure
Sam,"Map(Office -> EcoSpace, Technology -> Azure, Car -> Baleno, Bike -> Honda)",Car,Baleno
Sam,"Map(Office -> EcoSpace, Technology -> Azure, Car -> Baleno, Bike -> Honda)",Bike,Honda
Krishna,"Map(Office -> Bharathi, Technology -> AWS, Car -> Santro, Bike -> RoyalEnfield)",Office,Bharathi
Krishna,"Map(Office -> Bharathi, Technology -> AWS, Car -> Santro, Bike -> RoyalEnfield)",Technology,AWS
Krishna,"Map(Office -> Bharathi, Technology -> AWS, Car -> Santro, Bike -> RoyalEnfield)",Car,Santro
Krishna,"Map(Office -> Bharathi, Technology -> AWS, Car -> Santro, Bike -> RoyalEnfield)",Bike,RoyalEnfield
Arijit,"Map(Office -> EcoWorld, Car -> Etios, Bike -> BMW)",Office,EcoWorld
Arijit,"Map(Office -> EcoWorld, Car -> Etios, Bike -> BMW)",Car,Etios


root
 |-- EmpName: string (nullable = true)
 |-- EmpDetails: map (nullable = true)
 |    |-- key: string
 |    |-- value: string (valueContainsNull = true)
 |-- key: string (nullable = false)
 |-- value: string (nullable = true)

