**How to combine string columns into an ArrayType column and flatten array elements into individual columns?**

     index
     getItem
     element_at
     Array Size

In [0]:
import pyspark.sql.functions as F
from pyspark.sql.functions import col, lit, array, array_contains, size, element_at
from pyspark.sql.types import IntegerType, StringType, ArrayType, StructType, StructField

In [0]:
# Schema of the demo DataFrame: a comma-delimited name string, three array
# columns (two of strings, one of integers), two plain string columns and
# two integer columns. Every field is declared nullable.
schema = (
    StructType()
    .add("FullName", StringType(), True)
    .add("LearntLanguages", ArrayType(StringType()), True)
    .add("ToLearnLanguages", ArrayType(StringType()), True)
    .add("Rating", ArrayType(IntegerType()), True)
    .add("PresentState", StringType(), True)
    .add("PreviousState", StringType(), True)
    .add("Age", IntegerType(), True)
    .add("Experience", IntegerType(), True)
)

# One tuple per row, in schema order:
# (FullName, LearntLanguages, ToLearnLanguages, Rating, PresentState, PreviousState, Age, Experience)
data = [
    ("Amar,,Singh", ["Java", "Scala", "C++"], ["Spark", "Java", "Azure Databricks"], [8, 9, 5, 7], "Bangalore", "Chennai", 25, 7),
    ("Ramesh,Rathode,", ["Python", "PySpark", "C"], ["spark sql", "ADF"], [11, 3, 6, 8], "Hyderabad", "Kochin", 35, 8),
    ("Asha,,Rani", ["Devops", "VB", "Git"], ["ApacheSpark", "Python"], [5, 6, 8, 10], "Amaravathi", "Noida", 30, 10),
    ("Rakesh,Kothur,", ["SQL", "Azure", "AWS"], ["PySpark", "Oracle", "Confluence"], [12, 6, 8, 15], "Noida", "Mumbai", 33, 5),
    ("Krishna,,Joshi", ["GCC", "Visual Studio"], ["SQL", "Databricks", "SQL Editor"], [2, 6, 5, 8], "Delhi", "Kolkata", 28, 6),
    ("Hari,,Rani", ["Devops", "VB", "Git"], ["ApacheSpark", "Python"], [5, 6, 8, 10], "Amaravathi", "Noida", 30, 10),
    ("Rakesh,kumar,", ["SQL", "Azure", "AWS"], ["PySpark", "Oracle", "Schema"], [12, 6, 8, 15], "luknow", "Mumbai", 33, 5),
    ("karan,,Joshi", ["AWS", "Visual Studio"], ["SQL", "Git", "SQL Editor"], [2, 6, 5, 8], "Delhi", "Noida", 28, 6),
]

# Build the DataFrame with the explicit schema, then show its structure and rows.
df2 = spark.createDataFrame(data, schema)
df2.printSchema()
display(df2)

root
 |-- FullName: string (nullable = true)
 |-- LearntLanguages: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- ToLearnLanguages: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- Rating: array (nullable = true)
 |    |-- element: integer (containsNull = true)
 |-- PresentState: string (nullable = true)
 |-- PreviousState: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Experience: integer (nullable = true)



FullName,LearntLanguages,ToLearnLanguages,Rating,PresentState,PreviousState,Age,Experience
"Amar,,Singh","List(Java, Scala, C++)","List(Spark, Java, Azure Databricks)","List(8, 9, 5, 7)",Bangalore,Chennai,25,7
"Ramesh,Rathode,","List(Python, PySpark, C)","List(spark sql, ADF)","List(11, 3, 6, 8)",Hyderabad,Kochin,35,8
"Asha,,Rani","List(Devops, VB, Git)","List(ApacheSpark, Python)","List(5, 6, 8, 10)",Amaravathi,Noida,30,10
"Rakesh,Kothur,","List(SQL, Azure, AWS)","List(PySpark, Oracle, Confluence)","List(12, 6, 8, 15)",Noida,Mumbai,33,5
"Krishna,,Joshi","List(GCC, Visual Studio)","List(SQL, Databricks, SQL Editor)","List(2, 6, 5, 8)",Delhi,Kolkata,28,6
"Hari,,Rani","List(Devops, VB, Git)","List(ApacheSpark, Python)","List(5, 6, 8, 10)",Amaravathi,Noida,30,10
"Rakesh,kumar,","List(SQL, Azure, AWS)","List(PySpark, Oracle, Schema)","List(12, 6, 8, 15)",luknow,Mumbai,33,5
"karan,,Joshi","List(AWS, Visual Studio)","List(SQL, Git, SQL Editor)","List(2, 6, 5, 8)",Delhi,Noida,28,6


The example below **combines** the values of **PresentState and PreviousState** into a **new array column `States`** and adds a **`Size`** column holding the number of elements in that array.

In [0]:
# Pack the two state columns into a single array column "States",
# then store the length of that array in "Size".
(
    df2
    .select(
        col("FullName"),
        array(col("PresentState"), col("PreviousState")).alias("States"),
    )
    .withColumn("Size", F.size(col("States")))
    .display()
)

FullName,States,Size
"Amar,,Singh","List(Bangalore, Chennai)",2
"Ramesh,Rathode,","List(Hyderabad, Kochin)",2
"Asha,,Rani","List(Amaravathi, Noida)",2
"Rakesh,Kothur,","List(Noida, Mumbai)",2
"Krishna,,Joshi","List(Delhi, Kolkata)",2
"Hari,,Rani","List(Amaravathi, Noida)",2
"Rakesh,kumar,","List(luknow, Mumbai)",2
"karan,,Joshi","List(Delhi, Noida)",2


#### **Index**

In [0]:
# Bracket indexing on an array column is zero-based: Rating[0] is the first
# element. Each element is projected into its own column next to the source array.
(
    df2
    .select(
        "FullName",
        "Rating",
        col("Rating")[0].alias("Rating1"),
        col("Rating")[1].alias("Rating2"),
        col("Rating")[2].alias("Rating3"),
        col("Rating")[3].alias("Rating4"),
    )
    .display()
)

FullName,Rating,Rating1,Rating2,Rating3,Rating4
"Amar,,Singh","List(8, 9, 5, 7)",8,9,5,7
"Ramesh,Rathode,","List(11, 3, 6, 8)",11,3,6,8
"Asha,,Rani","List(5, 6, 8, 10)",5,6,8,10
"Rakesh,Kothur,","List(12, 6, 8, 15)",12,6,8,15
"Krishna,,Joshi","List(2, 6, 5, 8)",2,6,5,8
"Hari,,Rani","List(5, 6, 8, 10)",5,6,8,10
"Rakesh,kumar,","List(12, 6, 8, 15)",12,6,8,15
"karan,,Joshi","List(2, 6, 5, 8)",2,6,5,8


#### **getItem**

In [0]:
# getItem(i) reads the element at zero-based position i of the array column.
# Rows whose ToLearnLanguages array has only two entries show an empty
# get_TLL_03 in the recorded output.
to_learn_items = (
    df2
    .withColumn("get_TLL_01", df2["ToLearnLanguages"].getItem(0))
    .withColumn("get_TLL_02", df2["ToLearnLanguages"].getItem(1))
    .withColumn("get_TLL_03", df2["ToLearnLanguages"].getItem(2))
    .select("ToLearnLanguages", "get_TLL_01", "get_TLL_02", "get_TLL_03")
)
display(to_learn_items)

ToLearnLanguages,get_TLL_01,get_TLL_02,get_TLL_03
"List(Spark, Java, Azure Databricks)",Spark,Java,Azure Databricks
"List(spark sql, ADF)",spark sql,ADF,
"List(ApacheSpark, Python)",ApacheSpark,Python,
"List(PySpark, Oracle, Confluence)",PySpark,Oracle,Confluence
"List(SQL, Databricks, SQL Editor)",SQL,Databricks,SQL Editor
"List(ApacheSpark, Python)",ApacheSpark,Python,
"List(PySpark, Oracle, Schema)",PySpark,Oracle,Schema
"List(SQL, Git, SQL Editor)",SQL,Git,SQL Editor


#### **element_at**

- The **position is 1-based**, not zero-based; a **negative position** counts from the end of the array (`-1` is the last element).

In [0]:
# element_at uses 1-based positions; -1 addresses the last element.
# For two-element arrays position 3 is out of range and comes back empty in
# the recorded output.
# NOTE(review): with spark.sql.ansi.enabled=true an out-of-range position is
# expected to raise instead of returning null - confirm for your cluster.
display(
    df2.select(
        "ToLearnLanguages",
        element_at(col("ToLearnLanguages"), 1).alias("elm_TLL_01"),
        element_at(col("ToLearnLanguages"), 2).alias("elm_TLL_02"),
        element_at(col("ToLearnLanguages"), 3).alias("elm_TLL_03"),
        element_at(col("ToLearnLanguages"), -1).alias("elm_TLL_04"),
    )
)

ToLearnLanguages,elm_TLL_01,elm_TLL_02,elm_TLL_03,elm_TLL_04
"List(Spark, Java, Azure Databricks)",Spark,Java,Azure Databricks,Azure Databricks
"List(spark sql, ADF)",spark sql,ADF,,ADF
"List(ApacheSpark, Python)",ApacheSpark,Python,,Python
"List(PySpark, Oracle, Confluence)",PySpark,Oracle,Confluence,Confluence
"List(SQL, Databricks, SQL Editor)",SQL,Databricks,SQL Editor,SQL Editor
"List(ApacheSpark, Python)",ApacheSpark,Python,,Python
"List(PySpark, Oracle, Schema)",PySpark,Oracle,Schema,Schema
"List(SQL, Git, SQL Editor)",SQL,Git,SQL Editor,SQL Editor


In [0]:
# Same lookup as element_at(..., 2), but the position is supplied as a
# literal Column built with lit() rather than a plain Python int.
display(
    df2.select(
        "ToLearnLanguages",
        element_at(col("ToLearnLanguages"), lit(2)).alias("lit"),
    )
)

ToLearnLanguages,lit
"List(Spark, Java, Azure Databricks)",Java
"List(spark sql, ADF)",ADF
"List(ApacheSpark, Python)",Python
"List(PySpark, Oracle, Confluence)",Oracle
"List(SQL, Databricks, SQL Editor)",Databricks
"List(ApacheSpark, Python)",Python
"List(PySpark, Oracle, Schema)",Oracle
"List(SQL, Git, SQL Editor)",Git


#### **Array Size**
- returns the total **number of elements** in the **array column**.
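- If the input array column is **null**, `size` returns **-1** under the default Spark settings (it returns **null** only when `spark.sql.legacy.sizeOfNull` is `false` or `spark.sql.ansi.enabled` is `true`), whereas `array_size` returns **null**.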

     # Databricks Runtime 12.2 LTS (includes Apache Spark 3.3.2, Scala 2.12): use `size`
     from pyspark.sql.functions import size

     # Databricks Runtime 15.4 LTS (includes Apache Spark 3.5.0, Scala 2.12): `array_size` is also available
     from pyspark.sql.functions import array_size

In [0]:
# size() returns the element count of an array column as an integer.
# array() expects all of its inputs to share one data type, so when a size is
# placed next to string columns (Arr_Size_01, Arr_Size_02) it is cast to string
# explicitly instead of relying on implicit type coercion. Arr_Size_03 contains
# only integer sizes and needs no cast.
display(
    df2
    .withColumn(
        "Arr_Size_01",
        array(
            col("PresentState"),
            col("PreviousState"),
            size("ToLearnLanguages").cast("string"),
        ),
    )
    .withColumn(
        "Arr_Size_02",
        array(
            col("PresentState"),
            size("LearntLanguages").cast("string"),
            size("ToLearnLanguages").cast("string"),
        ),
    )
    .withColumn(
        "Arr_Size_03",
        array(size("Rating"), size("LearntLanguages"), size("ToLearnLanguages")),
    )
    .select(
        "LearntLanguages",
        "ToLearnLanguages",
        "PresentState",
        "PreviousState",
        "Arr_Size_01",
        "Arr_Size_02",
        "Arr_Size_03",
    )
)

LearntLanguages,ToLearnLanguages,PresentState,PreviousState,Arr_Size_01,Arr_Size_02,Arr_Size_03
"List(Java, Scala, C++)","List(Spark, Java, Azure Databricks)",Bangalore,Chennai,"List(Bangalore, Chennai, 3)","List(Bangalore, 3, 3)","List(4, 3, 3)"
"List(Python, PySpark, C)","List(spark sql, ADF)",Hyderabad,Kochin,"List(Hyderabad, Kochin, 2)","List(Hyderabad, 3, 2)","List(4, 3, 2)"
"List(Devops, VB, Git)","List(ApacheSpark, Python)",Amaravathi,Noida,"List(Amaravathi, Noida, 2)","List(Amaravathi, 3, 2)","List(4, 3, 2)"
"List(SQL, Azure, AWS)","List(PySpark, Oracle, Confluence)",Noida,Mumbai,"List(Noida, Mumbai, 3)","List(Noida, 3, 3)","List(4, 3, 3)"
"List(GCC, Visual Studio)","List(SQL, Databricks, SQL Editor)",Delhi,Kolkata,"List(Delhi, Kolkata, 3)","List(Delhi, 2, 3)","List(4, 2, 3)"
"List(Devops, VB, Git)","List(ApacheSpark, Python)",Amaravathi,Noida,"List(Amaravathi, Noida, 2)","List(Amaravathi, 3, 2)","List(4, 3, 2)"
"List(SQL, Azure, AWS)","List(PySpark, Oracle, Schema)",luknow,Mumbai,"List(luknow, Mumbai, 3)","List(luknow, 3, 3)","List(4, 3, 3)"
"List(AWS, Visual Studio)","List(SQL, Git, SQL Editor)",Delhi,Noida,"List(Delhi, Noida, 3)","List(Delhi, 2, 3)","List(4, 2, 3)"
