**PROBLEM STATEMENT**
- Split Array Elements into Separate Columns

In [0]:
schema = ["Name", "SkillSet"]
    
data = (["ABC", ['.Net', 'Git', 'C#']],
        ["XYZ", ['Wordpress', 'PHP']],
        ["IJK", ['Python', 'MongoDB', 'Git']],
        ["DEF", ['SSIS', 'SSAS', 'Power BI', 'SQL Server', 'Data Warehouse']],
        ["PQR", ['Azure']])

df = spark.createDataFrame(data, schema)
df.show(truncate=False)
df.printSchema()

+----+--------------------------------------------------+
|Name|SkillSet                                          |
+----+--------------------------------------------------+
|ABC |[.Net, Git, C#]                                   |
|XYZ |[Wordpress, PHP]                                  |
|IJK |[Python, MongoDB, Git]                            |
|DEF |[SSIS, SSAS, Power BI, SQL Server, Data Warehouse]|
|PQR |[Azure]                                           |
+----+--------------------------------------------------+

root
 |-- Name: string (nullable = true)
 |-- SkillSet: array (nullable = true)
 |    |-- element: string (containsNull = true)



In [0]:
from pyspark.sql.functions import split, col, size

**Method 01**

In [0]:
d21 = df.withColumn("Skill1", df['SkillSet'][0]) \
        .withColumn("Skill2", df['SkillSet'][1]) \
        .withColumn("Skill3", df['SkillSet'][2]) \
        .withColumn("Skill4", df['SkillSet'][3]) \
        .withColumn("Skill5", df['SkillSet'][4])
d21.show(truncate=False)

+----+--------------------------------------------------+---------+-------+--------+----------+--------------+
|Name|SkillSet                                          |Skill1   |Skill2 |Skill3  |Skill4    |Skill5        |
+----+--------------------------------------------------+---------+-------+--------+----------+--------------+
|ABC |[.Net, Git, C#]                                   |.Net     |Git    |C#      |null      |null          |
|XYZ |[Wordpress, PHP]                                  |Wordpress|PHP    |null    |null      |null          |
|IJK |[Python, MongoDB, Git]                            |Python   |MongoDB|Git     |null      |null          |
|DEF |[SSIS, SSAS, Power BI, SQL Server, Data Warehouse]|SSIS     |SSAS   |Power BI|SQL Server|Data Warehouse|
|PQR |[Azure]                                           |Azure    |null   |null    |null      |null          |
+----+--------------------------------------------------+---------+-------+--------+----------+--------------+



**Method 02**

In [0]:
df.select('Name', 'SkillSet', df.SkillSet[0].alias('Skill1'),\
                              df.SkillSet[1].alias('Skill2'),\
                              df.SkillSet[2].alias('Skill3'),\
                              df.SkillSet[3].alias('Skill4'),\
                              df.SkillSet[4].alias('Skill5')).show(truncate=False)

+----+--------------------------------------------------+---------+-------+--------+----------+--------------+
|Name|SkillSet                                          |Skill1   |Skill2 |Skill3  |Skill4    |Skill5        |
+----+--------------------------------------------------+---------+-------+--------+----------+--------------+
|ABC |[.Net, Git, C#]                                   |.Net     |Git    |C#      |null      |null          |
|XYZ |[Wordpress, PHP]                                  |Wordpress|PHP    |null    |null      |null          |
|IJK |[Python, MongoDB, Git]                            |Python   |MongoDB|Git     |null      |null          |
|DEF |[SSIS, SSAS, Power BI, SQL Server, Data Warehouse]|SSIS     |SSAS   |Power BI|SQL Server|Data Warehouse|
|PQR |[Azure]                                           |Azure    |null   |null    |null      |null          |
+----+--------------------------------------------------+---------+-------+--------+----------+--------------+



**Method 03**

In [0]:
for i in range(5):
  df = df.withColumn(f"skill_{i}", df.SkillSet[i])
df.show()

+----+--------------------+---------+-------+--------+----------+--------------+
|Name|            SkillSet|  skill_0|skill_1| skill_2|   skill_3|       skill_4|
+----+--------------------+---------+-------+--------+----------+--------------+
| ABC|     [.Net, Git, C#]|     .Net|    Git|      C#|      null|          null|
| XYZ|    [Wordpress, PHP]|Wordpress|    PHP|    null|      null|          null|
| IJK|[Python, MongoDB,...|   Python|MongoDB|     Git|      null|          null|
| DEF|[SSIS, SSAS, Powe...|     SSIS|   SSAS|Power BI|SQL Server|Data Warehouse|
| PQR|             [Azure]|    Azure|   null|    null|      null|          null|
+----+--------------------+---------+-------+--------+----------+--------------+



**Method 04**

**Determine the size of each array**

In [0]:
dfsize = df.select("Name", "SkillSet", size("SkillSet").alias("NoOfArrayElements"))
dfsize.show(truncate=False)

+----+--------------------------------------------------+-----------------+
|Name|SkillSet                                          |NoOfArrayElements|
+----+--------------------------------------------------+-----------------+
|ABC |[.Net, Git, C#]                                   |3                |
|XYZ |[Wordpress, PHP]                                  |2                |
|IJK |[Python, MongoDB, Git]                            |3                |
|DEF |[SSIS, SSAS, Power BI, SQL Server, Data Warehouse]|5                |
|PQR |[Azure]                                           |1                |
+----+--------------------------------------------------+-----------------+



**Get the Maximum Size of All Arrays**

In [0]:
max_value = dfsize.agg({"NoOfArrayElements": "max"}).collect()[0][0]
print(max_value)

5


**UDF to Convert Array Elements into Columns**

In [0]:
def arraySplitIntoCols(df, maxElements):
  for i in range(maxElements):
    df = df.withColumn(f"new_col_{i}", df.SkillSet[i])
  return df

**Call UDF**

In [0]:
dfout = arraySplitIntoCols(df, max_value)
display(df)

Name,SkillSet,skill_0,skill_1,skill_2,skill_3,skill_4
ABC,"List(.Net, Git, C#)",.Net,Git,C#,,
XYZ,"List(Wordpress, PHP)",Wordpress,PHP,,,
IJK,"List(Python, MongoDB, Git)",Python,MongoDB,Git,,
DEF,"List(SSIS, SSAS, Power BI, SQL Server, Data Warehouse)",SSIS,SSAS,Power BI,SQL Server,Data Warehouse
PQR,List(Azure),Azure,,,,
