#### **Array Functions**

- array_max
- array_min
- array_sort
- array_position
- array_size

In [0]:
from pyspark.sql.types import IntegerType, StringType, ArrayType, StructType, StructField
from pyspark.sql.functions import explode, split, array, array_contains, array_max, array_min, array_sort, array_position, array_size, element_at

In [0]:
# Sample rows: (FullName, LearntLanguages, ToLearnLanguages, Rating).
# The Rating column includes an array with a single None, an all-None
# array and an empty array.
data = [
    ("Akash", ["Java", "Scala", "C++"], ["Spark", "Java", "Azure Databricks"], [8, 9, 8, 5, 7]),
    ("Ramprasad", ["Python", "PySpark", "C", "Python"], ["spark sql", "ADF"], [11, 8, 3, 6, 8]),
    ("Rohit", ["Scala", "Devops", "VB", "Git"], ["ApacheSpark", "Python"], [5, 6, 8, 10, 8]),
    ("Raju", ["SQL", "Azure", "Scala", "AWS"], ["PySpark", "Oracle", "Confluence"], [12, 6, 5, 8, 8]),
    ("Kamalakar", ["GCC", "Visual Studio", "Python"], ["SQL", "Databricks", "SQL"], [1, 2, 6, 5, 8]),
    ("Swetha", ["Devops", "VB", "Git", "Scala"], ["ApacheSpark", "Python"], [5, 6, 8, 10, 3]),
    ("Mallik", ["SQL", "Azure", "AWS"], ["PySpark", "Oracle", "Schema"], [12, None, 8, 15, 9]),
    ("Deepak", ["AWS", "Python", "Scala"], ["Git", "SQL Editor", "SQL"], [2, 6, 5, 8, 4]),
    ("Kiran", ["Scala", "Python", "Spark"], ["ADF", "AWS", "SQL"], [None, None, None, None, None]),
    ("Sumanth", ["Python", "Scala", "ADF"], ["KSql", "Databricks", "SQL"], []),
]

# Explicit schema: one nullable string column followed by three nullable
# array columns (two of strings, one of integers).
schema = (
    StructType()
    .add("FullName", StringType(), True)
    .add("LearntLanguages", ArrayType(StringType()), True)
    .add("ToLearnLanguages", ArrayType(StringType()), True)
    .add("Rating", ArrayType(IntegerType()), True)
)

df = spark.createDataFrame(data, schema)
df.printSchema()
display(df)

root
 |-- FullName: string (nullable = true)
 |-- LearntLanguages: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- ToLearnLanguages: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- Rating: array (nullable = true)
 |    |-- element: integer (containsNull = true)



FullName,LearntLanguages,ToLearnLanguages,Rating
Akash,"List(Java, Scala, C++)","List(Spark, Java, Azure Databricks)","List(8, 9, 8, 5, 7)"
Ramprasad,"List(Python, PySpark, C, Python)","List(spark sql, ADF)","List(11, 8, 3, 6, 8)"
Rohit,"List(Scala, Devops, VB, Git)","List(ApacheSpark, Python)","List(5, 6, 8, 10, 8)"
Raju,"List(SQL, Azure, Scala, AWS)","List(PySpark, Oracle, Confluence)","List(12, 6, 5, 8, 8)"
Kamalakar,"List(GCC, Visual Studio, Python)","List(SQL, Databricks, SQL)","List(1, 2, 6, 5, 8)"
Swetha,"List(Devops, VB, Git, Scala)","List(ApacheSpark, Python)","List(5, 6, 8, 10, 3)"
Mallik,"List(SQL, Azure, AWS)","List(PySpark, Oracle, Schema)","List(12, null, 8, 15, 9)"
Deepak,"List(AWS, Python, Scala)","List(Git, SQL Editor, SQL)","List(2, 6, 5, 8, 4)"
Kiran,"List(Scala, Python, Spark)","List(ADF, AWS, SQL)","List(null, null, null, null, null)"
Sumanth,"List(Python, Scala, ADF)","List(KSql, Databricks, SQL)",List()


#### **1) array_max**

- returns the **maximum value** of the array.
- **NULL** elements are **skipped**.
- If array is **empty**, or contains **only NULL** elements, NULL is returned.
- array_max works with arrays of **any element type that supports ordering** (numeric, string, date, ...). For **numeric elements** it returns the **largest number**.
- For an **array of strings** it returns the **lexicographically greatest** string, e.g. the maximum of `["Spark", "Java", "Azure Databricks"]` is `"Spark"`.

#### **Syntax**

     array_max(array)
     array: Any ARRAY with elements for which order is supported

In [0]:
# Pair each Rating array with its largest element.
arr_max = df.select(
    "Rating",
    array_max("Rating").alias("max_Rating"),
)
display(arr_max)

Rating,max_Rating
"List(8, 9, 8, 5, 7)",9.0
"List(11, 8, 3, 6, 8)",11.0
"List(5, 6, 8, 10, 8)",10.0
"List(12, 6, 5, 8, 8)",12.0
"List(1, 2, 6, 5, 8)",8.0
"List(5, 6, 8, 10, 3)",10.0
"List(12, null, 8, 15, 9)",15.0
"List(2, 6, 5, 8, 4)",8.0
"List(null, null, null, null, null)",
List(),


#### **2) array_min**

- returns the **minimum value** of the array.

In [0]:
# Pair each Rating array with its smallest element.
arr_min = df.select(
    "Rating",
    array_min("Rating").alias("min_Rating"),
)
display(arr_min)

Rating,min_Rating
"List(8, 9, 8, 5, 7)",5.0
"List(11, 8, 3, 6, 8)",3.0
"List(5, 6, 8, 10, 8)",5.0
"List(12, 6, 5, 8, 8)",5.0
"List(1, 2, 6, 5, 8)",1.0
"List(5, 6, 8, 10, 3)",3.0
"List(12, null, 8, 15, 9)",8.0
"List(2, 6, 5, 8, 4)",2.0
"List(null, null, null, null, null)",
List(),


#### **3) array_sort()**

- arranges the input array in **ascending order**.
- **NaN** and **Null** values in an array are ordered as follows.
  - For **double/float type**, NaN is considered greater than any **non-NaN element**.
  - **Null** elements are positioned at the **end** of the **resulting array**.

In [0]:
# Show every array column side by side with its ascending-sorted copy.
df_arr_sort = df.select(
    "LearntLanguages",
    array_sort("LearntLanguages").alias("arr_sort_Learnt"),
    "ToLearnLanguages",
    array_sort("ToLearnLanguages").alias("arr_sort_ToLearn"),
    "Rating",
    array_sort("Rating").alias("arr_sort_Rating"),
)
display(df_arr_sort)

LearntLanguages,arr_sort_Learnt,ToLearnLanguages,arr_sort_ToLearn,Rating,arr_sort_Rating
"List(Java, Scala, C++)","List(C++, Java, Scala)","List(Spark, Java, Azure Databricks)","List(Azure Databricks, Java, Spark)","List(8, 9, 8, 5, 7)","List(5, 7, 8, 8, 9)"
"List(Python, PySpark, C, Python)","List(C, PySpark, Python, Python)","List(spark sql, ADF)","List(ADF, spark sql)","List(11, 8, 3, 6, 8)","List(3, 6, 8, 8, 11)"
"List(Scala, Devops, VB, Git)","List(Devops, Git, Scala, VB)","List(ApacheSpark, Python)","List(ApacheSpark, Python)","List(5, 6, 8, 10, 8)","List(5, 6, 8, 8, 10)"
"List(SQL, Azure, Scala, AWS)","List(AWS, Azure, SQL, Scala)","List(PySpark, Oracle, Confluence)","List(Confluence, Oracle, PySpark)","List(12, 6, 5, 8, 8)","List(5, 6, 8, 8, 12)"
"List(GCC, Visual Studio, Python)","List(GCC, Python, Visual Studio)","List(SQL, Databricks, SQL)","List(Databricks, SQL, SQL)","List(1, 2, 6, 5, 8)","List(1, 2, 5, 6, 8)"
"List(Devops, VB, Git, Scala)","List(Devops, Git, Scala, VB)","List(ApacheSpark, Python)","List(ApacheSpark, Python)","List(5, 6, 8, 10, 3)","List(3, 5, 6, 8, 10)"
"List(SQL, Azure, AWS)","List(AWS, Azure, SQL)","List(PySpark, Oracle, Schema)","List(Oracle, PySpark, Schema)","List(12, 6, 8, 15, 9)","List(6, 8, 9, 12, 15)"
"List(AWS, Python, Scala)","List(AWS, Python, Scala)","List(Git, SQL Editor, SQL)","List(Git, SQL, SQL Editor)","List(2, 6, 5, 8, 4)","List(2, 4, 5, 6, 8)"


#### **4) array_size()**

- `array_size()` returns the total **number of elements** in the **array column**.
- If your input array column is **null**, it returns **null**.

In [0]:
# array_size is not available in 12.2 LTS (includes Apache Spark 3.3.2, Scala 2.12); use size instead.
from pyspark.sql.functions import size

# array_size is available in 15.4 LTS (includes Apache Spark 3.5.0, Scala 2.12).
from pyspark.sql.functions import array_size

In [0]:
# Show every array column side by side with its element count.
df_arr_size = df.select(
    "LearntLanguages",
    array_size("LearntLanguages").alias("Arr_Size_Learnt"),
    "ToLearnLanguages",
    array_size("ToLearnLanguages").alias("Arr_Size_ToLearn"),
    "Rating",
    array_size("Rating").alias("Arr_Size_Rating"),
)
display(df_arr_size)

LearntLanguages,Arr_Size_Learnt,ToLearnLanguages,Arr_Size_ToLearn,Rating,Arr_Size_Rating
"List(Java, Scala, C++)",3,"List(Spark, Java, Azure Databricks)",3,"List(8, 9, 8, 5, 7)",5
"List(Python, PySpark, C, Python)",4,"List(spark sql, ADF)",2,"List(11, 8, 3, 6, 8)",5
"List(Scala, Devops, VB, Git)",4,"List(ApacheSpark, Python)",2,"List(5, 6, 8, 10, 8)",5
"List(SQL, Azure, Scala, AWS)",4,"List(PySpark, Oracle, Confluence)",3,"List(12, 6, 5, 8, 8)",5
"List(GCC, Visual Studio, Python)",3,"List(SQL, Databricks, SQL)",3,"List(1, 2, 6, 5, 8)",5
"List(Devops, VB, Git, Scala)",4,"List(ApacheSpark, Python)",2,"List(5, 6, 8, 10, 3)",5
"List(SQL, Azure, AWS)",3,"List(PySpark, Oracle, Schema)",3,"List(12, null, 8, 15, 9)",5
"List(AWS, Python, Scala)",3,"List(Git, SQL Editor, SQL)",3,"List(2, 6, 5, 8, 4)",5
"List(Scala, Python, Spark)",3,"List(ADF, AWS, SQL)",3,"List(null, null, null, null, null)",5
"List(Python, Scala, ADF)",3,"List(KSql, Databricks, SQL)",3,List(),0


In [0]:
# Keep only the LearntLanguages arrays that hold more than 3 elements.
df_arr_Lernt_filt = (
    df.select("LearntLanguages")
      .where(array_size("LearntLanguages") > 3)
)
display(df_arr_Lernt_filt)

LearntLanguages
"List(Python, PySpark, C, Python)"
"List(Scala, Devops, VB, Git)"
"List(SQL, Azure, Scala, AWS)"
"List(Devops, VB, Git, Scala)"


In [0]:
# Keep only the ToLearnLanguages arrays that hold more than 2 elements.
df_arr_ToLernt_filt = (
    df.select("ToLearnLanguages")
      .where(array_size("ToLearnLanguages") > 2)
)
display(df_arr_ToLernt_filt)

ToLearnLanguages
"List(Spark, Java, Azure Databricks)"
"List(PySpark, Oracle, Confluence)"
"List(SQL, Databricks, SQL)"
"List(PySpark, Oracle, Schema)"
"List(Git, SQL Editor, SQL)"
"List(ADF, AWS, SQL)"
"List(KSql, Databricks, SQL)"


In [0]:
# Keep only the Rating arrays that hold more than 2 elements
# (null elements still count towards the size).
df_arr_Rat_filt = (
    df.select("Rating")
      .where(array_size("Rating") > 2)
)
display(df_arr_Rat_filt)

Rating
"List(8, 9, 8, 5, 7)"
"List(11, 8, 3, 6, 8)"
"List(5, 6, 8, 10, 8)"
"List(12, 6, 5, 8, 8)"
"List(1, 2, 6, 5, 8)"
"List(5, 6, 8, 10, 3)"
"List(12, null, 8, 15, 9)"
"List(2, 6, 5, 8, 4)"
"List(null, null, null, null, null)"


In [0]:
# A row survives only when all three size conditions hold at once.
df_arr_size_filt = (
    df.select("LearntLanguages", "ToLearnLanguages", "Rating")
      .where(
          (array_size("LearntLanguages") > 3)
          & (array_size("ToLearnLanguages") > 2)
          & (array_size("Rating") > 2)
      )
)
display(df_arr_size_filt)

LearntLanguages,ToLearnLanguages,Rating
"List(SQL, Azure, Scala, AWS)","List(PySpark, Oracle, Confluence)","List(12, 6, 5, 8, 8)"


#### **5) array_position()**

- To find the **position** of the **first occurrence** of the value in the given **array**.
- It returns **null** if **either of the arguments is null**.
- Note that the position is **not zero-based**; it is a **1-based index**.
- Returns **0** if the value could **not be found** in the array.

#### **Syntax**

     array_position(column, value)

In [0]:
# Look up the position of a fixed value in each array column:
# "Python" in LearntLanguages, "SQL" and "Git" in ToLearnLanguages, 8 in Rating.
df_arr_pos = df.select(
    "LearntLanguages",
    array_position("LearntLanguages", "Python").alias("Learnt_position"),
    "ToLearnLanguages",
    array_position("ToLearnLanguages", "SQL").alias("ToLearn_position"),
    array_position("ToLearnLanguages", "Git").alias("ToLearn_NULL_position"),
    "Rating",
    array_position("Rating", 8).alias("Rating_position"),
)
display(df_arr_pos)

LearntLanguages,Learnt_position,ToLearnLanguages,ToLearn_position,ToLearn_NULL_position,Rating,Rating_position
"List(Java, Scala, C++)",0,"List(Spark, Java, Azure Databricks)",0,0,"List(8, 9, 8, 5, 7)",1
"List(Python, PySpark, C, Python)",1,"List(spark sql, ADF)",0,0,"List(11, 8, 3, 6, 8)",2
"List(Scala, Devops, VB, Git)",0,"List(ApacheSpark, Python)",0,0,"List(5, 6, 8, 10, 8)",3
"List(SQL, Azure, Scala, AWS)",0,"List(PySpark, Oracle, Confluence)",0,0,"List(12, 6, 5, 8, 8)",4
"List(GCC, Visual Studio, Python)",3,"List(SQL, Databricks, SQL)",1,0,"List(1, 2, 6, 5, 8)",5
"List(Devops, VB, Git, Scala)",0,"List(ApacheSpark, Python)",0,0,"List(5, 6, 8, 10, 3)",3
"List(SQL, Azure, AWS)",0,"List(PySpark, Oracle, Schema)",0,0,"List(12, 6, 8, 15, 9)",3
"List(AWS, Python, Scala)",2,"List(Git, SQL Editor, SQL)",3,1,"List(2, 6, 5, 8, 4)",4


In [0]:
# Contrast the two lookups on LearntLanguages:
#   Learnt_position       -> the element stored at index 2 (element_at)
#   Learnt_position_Elemt -> the position of the value "Scala" (array_position)
df_arr_ele_pos = df.select(
    "LearntLanguages",
    element_at("LearntLanguages", 2).alias("Learnt_position"),
    array_position("LearntLanguages", "Scala").alias("Learnt_position_Elemt"),
)
display(df_arr_ele_pos)

LearntLanguages,Learnt_position,Learnt_position_Elemt
"List(Java, Scala, C++)",Scala,2
"List(Python, PySpark, C, Python)",PySpark,0
"List(Scala, Devops, VB, Git)",Devops,1
"List(SQL, Azure, Scala, AWS)",Azure,3
"List(GCC, Visual Studio, Python)",Visual Studio,0
"List(Devops, VB, Git, Scala)",VB,4
"List(SQL, Azure, AWS)",Azure,0
"List(AWS, Python, Scala)",Python,3


In [0]:
# Combine several array functions in one projection: max / min / sorted
# copy of ToLearnLanguages, plus the position of the value 8 in Rating.
trns_data = df.select(
    "ToLearnLanguages",
    "Rating",
    array_max("ToLearnLanguages").alias("max"),
    array_min("ToLearnLanguages").alias("min"),
    array_sort("ToLearnLanguages").alias("sort"),
    array_position("Rating", 8).alias("location"),
)

display(trns_data)

ToLearnLanguages,Rating,max,min,sort,location
"List(Spark, Java, Azure Databricks)","List(8, 9, 5, 7)",Spark,Azure Databricks,"List(Azure Databricks, Java, Spark)",1
"List(spark sql, ADF)","List(11, 3, 6, 8)",spark sql,ADF,"List(ADF, spark sql)",4
"List(ApacheSpark, Python)","List(5, 6, 8, 10)",Python,ApacheSpark,"List(ApacheSpark, Python)",3
"List(PySpark, Oracle, Confluence)","List(12, 6, 8, 15)",PySpark,Confluence,"List(Confluence, Oracle, PySpark)",3
"List(SQL, Databricks, SQL Editor)","List(2, 6, 5, 8)",SQL Editor,Databricks,"List(Databricks, SQL, SQL Editor)",4
"List(ApacheSpark, Python)","List(5, 6, 8, 10)",Python,ApacheSpark,"List(ApacheSpark, Python)",3
"List(PySpark, Oracle, Schema)","List(12, 6, 8, 15)",Schema,Oracle,"List(Oracle, PySpark, Schema)",3
"List(SQL, Git, SQL Editor)","List(2, 6, 5, 8)",SQL Editor,Git,"List(Git, SQL, SQL Editor)",4
