**Q) How to Explode nested array into rows?**

In [0]:
from pyspark.sql.functions import col, explode, posexplode, flatten

data = [("Revanth", [["ADF", "Spark", "ADB"], ["ETL", "Devops", None], ["SQL", None]]),
        ("Reshma", [["SSMS", None, "Salesforce"], ["SAP", "ERP", None]]),
        ("Raashi", [["Python" "VB", None], ["C++", "GitHub", "Git"]]),
        ("Krishna", [["SHELL", "DRG"], ["JAVA", None]]),
        ("Sudarshan", None),
        ("Kamal", [])
       ]

columns = ["EmpName", "Technology"]

df = spark.createDataFrame(data=data, schema=columns)
display(df)
df.printSchema()

EmpName,Technology
Revanth,"List(List(ADF, Spark, ADB), List(ETL, Devops, null), List(SQL, null))"
Reshma,"List(List(SSMS, null, Salesforce), List(SAP, ERP, null))"
Raashi,"List(List(PythonVB, null), List(C++, GitHub, Git))"
Krishna,"List(List(SHELL, DRG), List(JAVA, null))"
Sudarshan,
Kamal,List()


root
 |-- EmpName: string (nullable = true)
 |-- Technology: array (nullable = true)
 |    |-- element: array (containsNull = true)
 |    |    |-- element: string (containsNull = true)



**Method 01**: Flatten | Explode

     df1 = df.select(col("EmpName"), posexplode(col("Technology")).alias('pos', 'CoreTechnology'))\
             .withColumnRenamed("pos", "Index")
                        (or)
     df1 = df.select(col("EmpName"), posexplode(col("Technology")))\
             .withColumnRenamed("col", "CoreTechnology")\
             .withColumnRenamed("pos", "Index")                    

In [0]:
df1 = df.select(col("EmpName"), posexplode(col("Technology")).alias('pos', 'CoreTechnology'))\
        .withColumnRenamed("pos", "Index")
display(df1)

EmpName,Index,CoreTechnology
Revanth,0,"List(ADF, Spark, ADB)"
Revanth,1,"List(ETL, Devops, null)"
Revanth,2,"List(SQL, null)"
Reshma,0,"List(SSMS, null, Salesforce)"
Reshma,1,"List(SAP, ERP, null)"
Raashi,0,"List(PythonVB, null)"
Raashi,1,"List(C++, GitHub, Git)"
Krishna,0,"List(SHELL, DRG)"
Krishna,1,"List(JAVA, null)"


- but we would end up with this dataframe, which we have **1 row per array**. That's not exactly what we want... we are trying to obtain **1 row per element for each one of the names**.

In [0]:
df2 = df.select(col("EmpName"), flatten(col("Technology")).alias('CoreTechnology'))
display(df2)

EmpName,CoreTechnology
Revanth,"List(ADF, Spark, ADB, ETL, Devops, null, SQL, null)"
Reshma,"List(SSMS, null, Salesforce, SAP, ERP, null)"
Raashi,"List(PythonVB, null, C++, GitHub, Git)"
Krishna,"List(SHELL, DRG, JAVA, null)"
Sudarshan,
Kamal,List()


- It will generate a dataframe like this one bellow. Now we have **all the elements, in a singles array per row**.

     df3 = df2.select(col("EmpName"), posexplode(col("CoreTechnology")))\
              .withColumnRenamed("col", "FlattenTechnology")\
              .withColumnRenamed("pos", "Index")
                             (or)
     df3 = df2.select(col("EmpName"), posexplode(col("CoreTechnology")).alias('pos', 'FlattenTechnology'))\
              .withColumnRenamed("pos", "Index")
                             (or)
     df3 = df2.withColumn("CoreTechnology", explode(col("CoreTechnology")).alias('FlattenTechnology'))

In [0]:
df3 = df2.select(col("EmpName"), posexplode(col("CoreTechnology")))\
         .withColumnRenamed("col", "FlattenTechnology")\
         .withColumnRenamed("pos", "Index")
display(df3)

EmpName,Index,FlattenTechnology
Revanth,0,ADF
Revanth,1,Spark
Revanth,2,ADB
Revanth,3,ETL
Revanth,4,Devops
Revanth,5,
Revanth,6,SQL
Revanth,7,
Reshma,0,SSMS
Reshma,1,


**Method 02**: Explode | Explode

#### **a) SELECT**

In [0]:
dff = df.select("*", explode(col("Technology"))).withColumnRenamed("col", "New_Tech")
display(dff)

EmpName,Technology,New_Tech
Revanth,"List(List(ADF, Spark, ADB), List(ETL, Devops, null), List(SQL, null))","List(ADF, Spark, ADB)"
Revanth,"List(List(ADF, Spark, ADB), List(ETL, Devops, null), List(SQL, null))","List(ETL, Devops, null)"
Revanth,"List(List(ADF, Spark, ADB), List(ETL, Devops, null), List(SQL, null))","List(SQL, null)"
Reshma,"List(List(SSMS, null, Salesforce), List(SAP, ERP, null))","List(SSMS, null, Salesforce)"
Reshma,"List(List(SSMS, null, Salesforce), List(SAP, ERP, null))","List(SAP, ERP, null)"
Raashi,"List(List(PythonVB, null), List(C++, GitHub, Git))","List(PythonVB, null)"
Raashi,"List(List(PythonVB, null), List(C++, GitHub, Git))","List(C++, GitHub, Git)"
Krishna,"List(List(SHELL, DRG), List(JAVA, null))","List(SHELL, DRG)"
Krishna,"List(List(SHELL, DRG), List(JAVA, null))","List(JAVA, null)"


In [0]:
dff = dff.select("*", explode(col("New_Tech"))).drop("Technology", "New_Tech")\
         .withColumnRenamed("col", "Final_Tech")
display(dff)

EmpName,Final_Tech
Revanth,ADF
Revanth,Spark
Revanth,ADB
Revanth,ETL
Revanth,Devops
Revanth,
Revanth,SQL
Revanth,
Reshma,SSMS
Reshma,


#### **b) withColumn**

In [0]:
# First, explode the outer array
df_outer_exploded = df.withColumn("inner_array", explode(col("Technology")))
display(df_outer_exploded)

EmpName,Technology,inner_array
Revanth,"List(List(ADF, Spark, ADB), List(ETL, Devops, null), List(SQL, null))","List(ADF, Spark, ADB)"
Revanth,"List(List(ADF, Spark, ADB), List(ETL, Devops, null), List(SQL, null))","List(ETL, Devops, null)"
Revanth,"List(List(ADF, Spark, ADB), List(ETL, Devops, null), List(SQL, null))","List(SQL, null)"
Reshma,"List(List(SSMS, null, Salesforce), List(SAP, ERP, null))","List(SSMS, null, Salesforce)"
Reshma,"List(List(SSMS, null, Salesforce), List(SAP, ERP, null))","List(SAP, ERP, null)"
Raashi,"List(List(PythonVB, null), List(C++, GitHub, Git))","List(PythonVB, null)"
Raashi,"List(List(PythonVB, null), List(C++, GitHub, Git))","List(C++, GitHub, Git)"
Krishna,"List(List(SHELL, DRG), List(JAVA, null))","List(SHELL, DRG)"
Krishna,"List(List(SHELL, DRG), List(JAVA, null))","List(JAVA, null)"


In [0]:
# Then, explode the inner array
df_inner_exploded = df_outer_exploded.withColumn("FlattenTechnology", explode(col("inner_array")))
display(df_inner_exploded)

EmpName,Technology,inner_array,FlattenTechnology
Revanth,"List(List(ADF, Spark, ADB), List(ETL, Devops, null), List(SQL, null))","List(ADF, Spark, ADB)",ADF
Revanth,"List(List(ADF, Spark, ADB), List(ETL, Devops, null), List(SQL, null))","List(ADF, Spark, ADB)",Spark
Revanth,"List(List(ADF, Spark, ADB), List(ETL, Devops, null), List(SQL, null))","List(ADF, Spark, ADB)",ADB
Revanth,"List(List(ADF, Spark, ADB), List(ETL, Devops, null), List(SQL, null))","List(ETL, Devops, null)",ETL
Revanth,"List(List(ADF, Spark, ADB), List(ETL, Devops, null), List(SQL, null))","List(ETL, Devops, null)",Devops
Revanth,"List(List(ADF, Spark, ADB), List(ETL, Devops, null), List(SQL, null))","List(ETL, Devops, null)",
Revanth,"List(List(ADF, Spark, ADB), List(ETL, Devops, null), List(SQL, null))","List(SQL, null)",SQL
Revanth,"List(List(ADF, Spark, ADB), List(ETL, Devops, null), List(SQL, null))","List(SQL, null)",
Reshma,"List(List(SSMS, null, Salesforce), List(SAP, ERP, null))","List(SSMS, null, Salesforce)",SSMS
Reshma,"List(List(SSMS, null, Salesforce), List(SAP, ERP, null))","List(SSMS, null, Salesforce)",


In [0]:
# Drop the intermediate column if not needed
df_final = df_inner_exploded.drop("inner_array")
display(df_final)

EmpName,Technology,FlattenTechnology
Revanth,"List(List(ADF, Spark, ADB), List(ETL, Devops, null), List(SQL, null))",ADF
Revanth,"List(List(ADF, Spark, ADB), List(ETL, Devops, null), List(SQL, null))",Spark
Revanth,"List(List(ADF, Spark, ADB), List(ETL, Devops, null), List(SQL, null))",ADB
Revanth,"List(List(ADF, Spark, ADB), List(ETL, Devops, null), List(SQL, null))",ETL
Revanth,"List(List(ADF, Spark, ADB), List(ETL, Devops, null), List(SQL, null))",Devops
Revanth,"List(List(ADF, Spark, ADB), List(ETL, Devops, null), List(SQL, null))",
Revanth,"List(List(ADF, Spark, ADB), List(ETL, Devops, null), List(SQL, null))",SQL
Revanth,"List(List(ADF, Spark, ADB), List(ETL, Devops, null), List(SQL, null))",
Reshma,"List(List(SSMS, null, Salesforce), List(SAP, ERP, null))",SSMS
Reshma,"List(List(SSMS, null, Salesforce), List(SAP, ERP, null))",
