#### **posexplode**
- `posexplode` is a PySpark SQL function that explodes an **array or map** column into **rows**, producing one row per element.
- For an **array** column it creates a **row for each element** and adds two columns: **pos**, which holds the **position of the element in the array** (starting at 0), and **col**, which holds the **actual element value**.
- When the input column is a **map**, `posexplode` creates three columns: **pos**, which holds the **position of the map entry**, plus the **key** and **value** columns.
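- If the array or map is **NULL** or **empty**, that row is **ignored / eliminated** from the result (use `posexplode_outer` to keep such rows). Null elements *inside* an array are kept and get their own row.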

- Returns a **new row** for **each element** in the given **array or map**.
- When an **array or map** is passed, a **positional column** (`pos`) is created alongside each element.

In [0]:
from pyspark.sql.functions import col, explode, posexplode, split
import pyspark.sql.functions as f

**Array Type**

In [0]:
# Sample employees with an array column of skills. The rows deliberately cover
# null elements inside an array (ids 3, 4, 6), a null array (id 7) and an
# empty array (id 8).
schema = ["id", "Name", "skills"]
data = [
    (1, "Suresh", [".net", "Python", "Spark", "Azure"]),
    (2, "Ramya", ["java", "PySpark", "AWS"]),
    (3, "Rakesh", ["ADF", "SQL", None, "GCC"]),
    (4, "Apurba", ["C", "SAP", None]),
    (5, "Pranitha", ["COBOL", "DEVOPS"]),
    (6, "Sowmya", ["ABAP", None]),
    (7, "Anand", None),
    (8, "Sourabh", []),
]

df = spark.createDataFrame(data=data, schema=schema)

# Inspect the frame: plain-text table, notebook display, schema, row count.
df.show(truncate=False)
display(df)
df.printSchema()
print("Number of Rows:", df.count())

+---+--------+----------------------------+
|id |Name    |skills                      |
+---+--------+----------------------------+
|1  |Suresh  |[.net, Python, Spark, Azure]|
|2  |Ramya   |[java, PySpark, AWS]        |
|3  |Rakesh  |[ADF, SQL, null, GCC]       |
|4  |Apurba  |[C, SAP, null]              |
|5  |Pranitha|[COBOL, DEVOPS]             |
|6  |Sowmya  |[ABAP, null]                |
|7  |Anand   |null                        |
|8  |Sourabh |[]                          |
+---+--------+----------------------------+



id,Name,skills
1,Suresh,"List(.net, Python, Spark, Azure)"
2,Ramya,"List(java, PySpark, AWS)"
3,Rakesh,"List(ADF, SQL, null, GCC)"
4,Apurba,"List(C, SAP, null)"
5,Pranitha,"List(COBOL, DEVOPS)"
6,Sowmya,"List(ABAP, null)"
7,Anand,
8,Sourabh,List()


root
 |-- id: long (nullable = true)
 |-- Name: string (nullable = true)
 |-- skills: array (nullable = true)
 |    |-- element: string (containsNull = true)

Number of Rows: 8


In [0]:
# Explode the skills array: one output row per element, with the element's
# index in `pos` and its value in `col`.
display(df.select("id", "Name", posexplode("skills")))

id,Name,pos,col
1,Suresh,0,.net
1,Suresh,1,Python
1,Suresh,2,Spark
1,Suresh,3,Azure
2,Ramya,0,java
2,Ramya,1,PySpark
2,Ramya,2,AWS
3,Rakesh,0,ADF
3,Rakesh,1,SQL
3,Rakesh,2,


**Map Type**

In [0]:
# Sample customers with a map column of appliance -> brand. Ram's TV entry has
# an empty-string value and Rajesh has no map at all (None).
schema1 = ['name', 'brand']
data1 = [
    ('Raja', {'TV': 'LG', 'Refrigerator': 'Samsung', 'Oven': 'Philips', 'AC': 'Voltas'}),
    ('Raghav', {'AC': 'Samsung', 'Washing machine': 'LG'}),
    ('Ram', {'Grinder': 'Preeti', 'TV': ""}),
    ('Ramesh', {'Refrigerator': 'LG', 'TV': 'Croma'}),
    ('Rajesh', None),
]

df1 = spark.createDataFrame(data=data1, schema=schema1)

# Inspect the frame: notebook display, plain-text table, schema, row count.
display(df1)
df1.show(truncate=False)
df1.printSchema()
print("Number of Rows:", df1.count())

name,brand
Raja,"Map(Refrigerator -> Samsung, AC -> Voltas, TV -> LG, Oven -> Philips)"
Raghav,"Map(AC -> Samsung, Washing machine -> LG)"
Ram,"Map(TV -> , Grinder -> Preeti)"
Ramesh,"Map(Refrigerator -> LG, TV -> Croma)"
Rajesh,


+------+------------------------------------------------------------------+
|name  |brand                                                             |
+------+------------------------------------------------------------------+
|Raja  |{Refrigerator -> Samsung, AC -> Voltas, TV -> LG, Oven -> Philips}|
|Raghav|{AC -> Samsung, Washing machine -> LG}                            |
|Ram   |{TV -> , Grinder -> Preeti}                                       |
|Ramesh|{Refrigerator -> LG, TV -> Croma}                                 |
|Rajesh|null                                                              |
+------+------------------------------------------------------------------+

root
 |-- name: string (nullable = true)
 |-- brand: map (nullable = true)
 |    |-- key: string
 |    |-- value: string (valueContainsNull = true)

Number of Rows: 5


In [0]:
# Explode the brand map: one output row per entry, with the entry's index in
# `pos` and its contents in the `key` and `value` columns.
display(df1.select("name", "brand", posexplode("brand")))

name,brand,pos,key,value
Raja,"Map(Refrigerator -> Samsung, AC -> Voltas, TV -> LG, Oven -> Philips)",0,Refrigerator,Samsung
Raja,"Map(Refrigerator -> Samsung, AC -> Voltas, TV -> LG, Oven -> Philips)",1,AC,Voltas
Raja,"Map(Refrigerator -> Samsung, AC -> Voltas, TV -> LG, Oven -> Philips)",2,TV,LG
Raja,"Map(Refrigerator -> Samsung, AC -> Voltas, TV -> LG, Oven -> Philips)",3,Oven,Philips
Raghav,"Map(AC -> Samsung, Washing machine -> LG)",0,AC,Samsung
Raghav,"Map(AC -> Samsung, Washing machine -> LG)",1,Washing machine,LG
Ram,"Map(TV -> , Grinder -> Preeti)",0,TV,
Ram,"Map(TV -> , Grinder -> Preeti)",1,Grinder,Preeti
Ramesh,"Map(Refrigerator -> LG, TV -> Croma)",0,Refrigerator,LG
Ramesh,"Map(Refrigerator -> LG, TV -> Croma)",1,TV,Croma


**.txt**

In [0]:
# Load the pipe-delimited text file; the first line is the header and the
# column types are inferred from the data.
df2 = (
    spark.read
    .options(sep="|", header=True, inferSchema=True)
    .csv("/FileStore/tables/posexplode.txt")
)
display(df2)

S.No,EmpName,Dept,Technology,Age,Year
1,Sundar,Azure,"ADB, ADF, PySpark, SQL",30,2022
2,Sheetal,Azure,"None, ADF, PySpark, SQL",35,2023
3,Amar,Azure,"ADB, None, PySpark",40,2021
4,Rakesh,Azure,"ADB, ADF",33,2023
5,Royal,Azure,"ADB, None",35,2022
6,Swapnil,AWS,,38,2002


In [0]:
# Turn the comma-separated Technology string into an array column.
# split() treats its second argument as a regular expression, so r"\s*,\s*"
# also consumes the blanks around each comma: "ADB, ADF" -> ["ADB", "ADF"]
# instead of ["ADB", " ADF"] (a plain ',' leaves a leading space on every
# element after the first). Only the columns needed for the posexplode demo
# are kept.
df2 = (
    df2.withColumn("New_Technology", split(col("Technology"), r"\s*,\s*"))
       .select("EmpName", "Dept", "Technology", "New_Technology")
)
display(df2)

EmpName,Dept,Technology,New_Technology
Sundar,Azure,"ADB, ADF, PySpark, SQL","List(ADB, ADF, PySpark, SQL)"
Sheetal,Azure,"None, ADF, PySpark, SQL","List(None, ADF, PySpark, SQL)"
Amar,Azure,"ADB, None, PySpark","List(ADB, None, PySpark)"
Rakesh,Azure,"ADB, ADF","List(ADB, ADF)"
Royal,Azure,"ADB, None","List(ADB, None)"
Swapnil,AWS,,


In [0]:
# Explode the New_Technology array: every element becomes its own row, with
# its index in `pos` and its value in `col`; all existing columns are kept.
# The intermediate column is optional: posexplode(split("Technology", ','))
# applied directly in the select works too.
df3 = df2.select("*", f.posexplode(f.col("New_Technology")))
display(df3)

EmpName,Dept,Technology,New_Technology,pos,col
Sundar,Azure,"ADB, ADF, PySpark, SQL","List(ADB, ADF, PySpark, SQL)",0,ADB
Sundar,Azure,"ADB, ADF, PySpark, SQL","List(ADB, ADF, PySpark, SQL)",1,ADF
Sundar,Azure,"ADB, ADF, PySpark, SQL","List(ADB, ADF, PySpark, SQL)",2,PySpark
Sundar,Azure,"ADB, ADF, PySpark, SQL","List(ADB, ADF, PySpark, SQL)",3,SQL
Sheetal,Azure,"None, ADF, PySpark, SQL","List(None, ADF, PySpark, SQL)",0,
Sheetal,Azure,"None, ADF, PySpark, SQL","List(None, ADF, PySpark, SQL)",1,ADF
Sheetal,Azure,"None, ADF, PySpark, SQL","List(None, ADF, PySpark, SQL)",2,PySpark
Sheetal,Azure,"None, ADF, PySpark, SQL","List(None, ADF, PySpark, SQL)",3,SQL
Amar,Azure,"ADB, None, PySpark","List(ADB, None, PySpark)",0,ADB
Amar,Azure,"ADB, None, PySpark","List(ADB, None, PySpark)",1,


In [0]:
# Tidy the exploded frame: drop the source columns that are no longer needed
# and give the generated `pos` / `col` columns descriptive names.
df3 = (
    df3.drop("Technology", "New_Technology")
       .withColumnRenamed("pos", "Index")
       .withColumnRenamed("col", "CoreTechnology")
)
display(df3)

EmpName,Dept,Index,CoreTechnology
Sundar,Azure,0,ADB
Sundar,Azure,1,ADF
Sundar,Azure,2,PySpark
Sundar,Azure,3,SQL
Sheetal,Azure,0,
Sheetal,Azure,1,ADF
Sheetal,Azure,2,PySpark
Sheetal,Azure,3,SQL
Amar,Azure,0,ADB
Amar,Azure,1,
