#### **Array**

- The `array()` function creates a **new column** of **array type** by **combining one or more columns**.
- All input columns must have the **same data type**.
- **All elements** of an ArrayType column must have the **same data type**.
- When we work with **JSON** data, it is very common to get **ArrayType columns**.  
- You can think of a **PySpark array** column in a similar way to a **Python list**.

**Why Change a Column from String to Array?**

- In PySpark, the **explode** function is used to **transform each element** of an **array** in a DataFrame column into a **separate row**. However, this function requires the column to be an **array** (or a map). If your data is in **string** format, you’ll need to **convert it to an array** before using explode.

     How to transform Integer Column to Array Type?
     How to transform String Columns to Array Type?
     How to get size of an Array?

#### **Syntax**

     array(columns)
     columns: It represents the list of columns to be grouped together.

In [0]:
import pyspark.sql.functions as F
from pyspark.sql.functions import col, array, array_contains, size, element_at
from pyspark.sql.types import IntegerType, StringType, ArrayType, StructType, StructField

**EX 01**

In [0]:
# Sample rows: (name, list of technology strings, list of integers).
# The two lists in a row always have the same length.
data = [
    ("Naresh", ["databricks", "Azure", "SQL"], [5, 7, 8]),
    ("Kalmesh", ["ADF", "AWS", "Spark", "SQL"], [6, 2, 3, 12]),
    ("Rohit", ["GCC", "PySpark", "Devops"], [3, 6, 8]),
    ("Kumar", ["GitHub", "PowerBi", "Tableau", "SQL"], [2, 5, 6, 10]),
    ("Rohini", ["SQL Editor", "Python", "Oracle", "Azure", "AWS"], [4, 7, 3, 8, 9]),
]

# Only column names are supplied here, so Spark infers each column's type
# from the Python values in `data`.
columns = ["Name", "Technology", "Experience"]

df = spark.createDataFrame(data, columns)
display(df)

Name,Technology,Experience
Naresh,"List(databricks, Azure, SQL)","List(5, 7, 8)"
Kalmesh,"List(ADF, AWS, Spark, SQL)","List(6, 2, 3, 12)"
Rohit,"List(GCC, PySpark, Devops)","List(3, 6, 8)"
Kumar,"List(GitHub, PowerBi, Tableau, SQL)","List(2, 5, 6, 10)"
Rohini,"List(SQL Editor, Python, Oracle, Azure, AWS)","List(4, 7, 3, 8, 9)"


In [0]:
# Explicit schema: one string column followed by two array columns
# (array of strings, array of integers). "Name" is declared nullable
# explicitly; the other two fields rely on the default nullability.
schema = (
    StructType()
    .add("Name", StringType(), True)
    .add("Technology", ArrayType(StringType()))
    .add("Experience", ArrayType(IntegerType()))
)

In [0]:
# Build a DataFrame from `data`, this time using the explicit `schema`
# rather than letting Spark infer the column types.
df1 = spark.createDataFrame(data, schema)
display(df1)

Name,Technology,Experience
Naresh,"List(databricks, Azure, SQL)","List(5, 7, 8)"
Kalmesh,"List(ADF, AWS, Spark, SQL)","List(6, 2, 3, 12)"
Rohit,"List(GCC, PySpark, Devops)","List(3, 6, 8)"
Kumar,"List(GitHub, PowerBi, Tableau, SQL)","List(2, 5, 6, 10)"
Rohini,"List(SQL Editor, Python, Oracle, Azure, AWS)","List(4, 7, 3, 8, 9)"


#### **1) How to transform Integer Column to Array Type?**

In [0]:
# Small (name, city, age) DataFrame; `age` is the integer column that the
# next step wraps into an array.
df_ex1 = spark.createDataFrame(
    [
        ("Aman", "Chennai", 22),
        ("Sundar", "Bangalore", 25),
        ("Sheela", "Hyderabad", 30),
        ("Shobha", "Noida", 35),
        ("Behra", "Mumbai", 36),
    ],
    schema=["name", "City", "age"],
)
display(df_ex1)

name,City,age
Aman,Chennai,22
Sundar,Bangalore,25
Sheela,Hyderabad,30
Shobha,Noida,35
Behra,Mumbai,36


     # 12.2 LTS (includes Apache Spark 3.3.2, Scala 2.12)
     from pyspark.sql.functions import size

     # 15.4 LTS (includes Apache Spark 3.5.0, Scala 2.12)
     from pyspark.sql.functions import array_size

     from pyspark.sql.functions import size
     df.withColumn("Size", size("Array"))
                    (or)
     import pyspark.sql.functions as F
     df.withColumn("Size", F.size("Array"))

In [0]:
# Keep `age`, build a two-element array column by passing `age` twice to
# array(), then add the number of elements in that array as "Size".
df_ex = (
    df_ex1
    .select("age", F.array("age", "age").alias("Array"))
    .withColumn("Size", size("Array"))
)
display(df_ex)

age,Array,Size
22,"List(22, 22)",2
25,"List(25, 25)",2
30,"List(30, 30)",2
35,"List(35, 35)",2
36,"List(36, 36)",2


#### **2) How to transform String Columns to Array Type?**

In [0]:
# Rows of (name, pLocation, sLocation, aLocation), all strings.
# Two rows have no aLocation (None), which becomes a null element once the
# three location columns are combined into an array.
data = [
    ("Adarsh", "Chennai", "Cochin", "Hyderabad"),
    ("Akash", "Coimbatore", "Mumbai", "Chennai"),
    ("Senthil", "Salem", "Bangalore", None),
    ("Kalyan", "Delhi", "Bangalore", "Noida"),
    ("Sohile", "Mumbai", "Pune", "Cochin"),
    ("Gouthami", "Chennai", "Mumbai", None),
    ("Hemanth", "Delhi", "Noida", "Kolkata"),
]

columns = ["Name", "pLocation", "sLocation", "aLocation"]

# NOTE: this rebinds `data`, `columns` and `df` from the earlier example.
df = spark.createDataFrame(data, columns)
df.printSchema()
display(df)

root
 |-- Name: string (nullable = true)
 |-- pLocation: string (nullable = true)
 |-- sLocation: string (nullable = true)
 |-- aLocation: string (nullable = true)



Name,pLocation,sLocation,aLocation
Adarsh,Chennai,Cochin,Hyderabad
Akash,Coimbatore,Mumbai,Chennai
Senthil,Salem,Bangalore,
Kalyan,Delhi,Bangalore,Noida
Sohile,Mumbai,Pune,Cochin
Gouthami,Chennai,Mumbai,
Hemanth,Delhi,Noida,Kolkata


#### **a) How to create an ArrayType column from existing columns (using SELECT)**

     # Method 1:
     df.select("name", array("pLocation", "sLocation", "aLocation").alias("Pref_Loc")).show(truncate=False)
 
     # Method 2:
     pref_Loc = ["pLocation", "sLocation", "aLocation"]
     df.select("name", array(*pref_Loc).alias("Pref_Loc")).show(truncate=False)
 
     # Method 3:
     pref_Loc = ["pLocation", "sLocation", "aLocation"]
     df.select("name", array([col(prefLoc) for prefLoc in pref_Loc]).alias("Pref_Loc")).show(truncate=False)

In [0]:
# Method 1: list the source column names directly in the array() call.
# "name" refers to the "Name" column; it is resolved case-insensitively here
# (Spark's default setting) and the output column takes the spelling used below.
df_S_M1 = (
    df.select(
        "name",
        F.array("pLocation", "sLocation", "aLocation").alias("Pref_Loc"),
    )
    # size() counts null elements too, so rows without an aLocation still give 3.
    .withColumn("Size", size("Pref_Loc"))
)
df_S_M1.printSchema()
display(df_S_M1)

root
 |-- name: string (nullable = true)
 |-- Pref_Loc: array (nullable = false)
 |    |-- element: string (containsNull = true)
 |-- Size: integer (nullable = false)



name,Pref_Loc,Size
Adarsh,"List(Chennai, Cochin, Hyderabad)",3
Akash,"List(Coimbatore, Mumbai, Chennai)",3
Senthil,"List(Salem, Bangalore, null)",3
Kalyan,"List(Delhi, Bangalore, Noida)",3
Sohile,"List(Mumbai, Pune, Cochin)",3
Gouthami,"List(Chennai, Mumbai, null)",3
Hemanth,"List(Delhi, Noida, Kolkata)",3


In [0]:
# Method 2: keep the column names in a Python list and unpack it into array().
pref_Loc = ["pLocation", "sLocation", "aLocation"]

df_S_M2 = (
    df.select("name", F.array(*pref_Loc).alias("Pref_Loc"))
    # Element count of the new array column (null elements are counted as well).
    .withColumn("Size", size("Pref_Loc"))
)
df_S_M2.printSchema()
display(df_S_M2)

root
 |-- name: string (nullable = true)
 |-- Pref_Loc: array (nullable = false)
 |    |-- element: string (containsNull = true)
 |-- Size: integer (nullable = false)



name,Pref_Loc,Size
Adarsh,"List(Chennai, Cochin, Hyderabad)",3
Akash,"List(Coimbatore, Mumbai, Chennai)",3
Senthil,"List(Salem, Bangalore, null)",3
Kalyan,"List(Delhi, Bangalore, Noida)",3
Sohile,"List(Mumbai, Pune, Cochin)",3
Gouthami,"List(Chennai, Mumbai, null)",3
Hemanth,"List(Delhi, Noida, Kolkata)",3


In [0]:
# Method 3: turn each name into a Column object and hand array() a single
# list of Columns (no unpacking).
pref_Loc = ["pLocation", "sLocation", "aLocation"]

df_S_M3 = (
    df.select(
        "name",
        F.array([col(loc_name) for loc_name in pref_Loc]).alias("Pref_Loc"),
    )
    .withColumn("Size", size("Pref_Loc"))
)
df_S_M3.printSchema()
display(df_S_M3)

root
 |-- name: string (nullable = true)
 |-- Pref_Loc: array (nullable = false)
 |    |-- element: string (containsNull = true)
 |-- Size: integer (nullable = false)



name,Pref_Loc,Size
Adarsh,"List(Chennai, Cochin, Hyderabad)",3
Akash,"List(Coimbatore, Mumbai, Chennai)",3
Senthil,"List(Salem, Bangalore, null)",3
Kalyan,"List(Delhi, Bangalore, Noida)",3
Sohile,"List(Mumbai, Pune, Cochin)",3
Gouthami,"List(Chennai, Mumbai, null)",3
Hemanth,"List(Delhi, Noida, Kolkata)",3


#### **b) How to create an ArrayType column from existing columns (using withColumn)**  

     # Method 1:
     df.withColumn("preferences", array("pLocation", "sLocation", "aLocation")).select("name", "preferences").show(truncate=False)
     
     # Method 2:
     pref_Loc = ["pLocation", "sLocation", "aLocation"]
     df.withColumn("preferences", array(*pref_Loc)).select("name", "preferences").show(truncate=False)

     # Method 3:
     pref_Loc = ["pLocation", "sLocation", "aLocation"]
     df.withColumn("preferences", array([col(prefLoc) for prefLoc in pref_Loc])).select("name", "preferences").show(truncate=False)

In [0]:
# Method 1 (withColumn): add the array column to the full DataFrame first,
# then narrow down to the columns of interest with select().
df_wC_M1 = (
    df.withColumn("preferences", F.array("pLocation", "sLocation", "aLocation"))
    # size() counts null elements too, so rows without an aLocation still give 3.
    .withColumn("Size", size("preferences"))
    .select("name", "preferences", "Size")
)
df_wC_M1.printSchema()
display(df_wC_M1)

root
 |-- name: string (nullable = true)
 |-- preferences: array (nullable = false)
 |    |-- element: string (containsNull = true)
 |-- Size: integer (nullable = false)



name,preferences,Size
Adarsh,"List(Chennai, Cochin, Hyderabad)",3
Akash,"List(Coimbatore, Mumbai, Chennai)",3
Senthil,"List(Salem, Bangalore, null)",3
Kalyan,"List(Delhi, Bangalore, Noida)",3
Sohile,"List(Mumbai, Pune, Cochin)",3
Gouthami,"List(Chennai, Mumbai, null)",3
Hemanth,"List(Delhi, Noida, Kolkata)",3


In [0]:
# Method 2 (withColumn): unpack a list of column names into array().
pref_Loc = ["pLocation", "sLocation", "aLocation"]

df_wC_M2 = (
    df.withColumn("preferences", F.array(*pref_Loc))
    .withColumn("Size", size("preferences"))
    # Keep only the name, the new array column and its element count.
    .select("name", "preferences", "Size")
)
df_wC_M2.printSchema()
display(df_wC_M2)

root
 |-- name: string (nullable = true)
 |-- preferences: array (nullable = false)
 |    |-- element: string (containsNull = true)
 |-- Size: integer (nullable = false)



name,preferences,Size
Adarsh,"List(Chennai, Cochin, Hyderabad)",3
Akash,"List(Coimbatore, Mumbai, Chennai)",3
Senthil,"List(Salem, Bangalore, null)",3
Kalyan,"List(Delhi, Bangalore, Noida)",3
Sohile,"List(Mumbai, Pune, Cochin)",3
Gouthami,"List(Chennai, Mumbai, null)",3
Hemanth,"List(Delhi, Noida, Kolkata)",3


In [0]:
# Method 3 (withColumn): build Column objects with col() and pass them to
# array() as one list.
pref_Loc = ["pLocation", "sLocation", "aLocation"]

df_wC_M3 = (
    df.withColumn("preferences", F.array([col(loc_name) for loc_name in pref_Loc]))
    .withColumn("Size", size("preferences"))
    .select("name", "preferences", "Size")
)
df_wC_M3.printSchema()
display(df_wC_M3)

root
 |-- name: string (nullable = true)
 |-- preferences: array (nullable = false)
 |    |-- element: string (containsNull = true)
 |-- Size: integer (nullable = false)



name,preferences,Size
Adarsh,"List(Chennai, Cochin, Hyderabad)",3
Akash,"List(Coimbatore, Mumbai, Chennai)",3
Senthil,"List(Salem, Bangalore, null)",3
Kalyan,"List(Delhi, Bangalore, Noida)",3
Sohile,"List(Mumbai, Pune, Cochin)",3
Gouthami,"List(Chennai, Mumbai, null)",3
Hemanth,"List(Delhi, Noida, Kolkata)",3
