#### **Array Functions**

1) Explode
2) Split
3) Array
4) array_contains
5) array_distinct
6) array_remove

In [0]:
from pyspark.sql.types import IntegerType, StringType, ArrayType, StructType, StructField
from pyspark.sql.functions import explode, split, array, array_contains, array_distinct, array_remove

In [0]:
# Sample rows, one tuple per person, in the same field order as `schema` below.
# FullName packs the name parts into a single comma-separated string; some
# parts are empty (e.g. "Amar,,Singh").
data = [
    ("Amar,,Singh", ["Java", "Scala", "C++"], ["Spark", "Java", "Azure Databricks"], [8, 9, 5, 7], "Bangalore", "Chennai", 25, 7),
    ("Ramesh,Rathode,", ["Python", "PySpark", "C"], ["spark sql", "ADF"], [11, 3, 6, 8], "Hyderabad", "Kochin", 35, 8),
    ("Asha,,Rani", ["Devops", "VB", "Git"], ["ApacheSpark", "Python"], [5, 6, 8, 10], "Amaravathi", "Noida", 30, 10),
    ("Rakesh,Kothur,", ["SQL", "Azure", "AWS"], ["PySpark", "Oracle", "Confluence"], [12, 6, 8, 15], "Noida", "Mumbai", 33, 5),
    ("Krishna,,Joshi", ["GCC", "Visual Studio"], ["SQL", "Databricks", "SQL Editor"], [2, 6, 5, 8], "Delhi", "Kolkata", 28, 6),
    ("Hari,,Rani", ["Devops", "VB", "Git"], ["ApacheSpark", "Python"], [5, 6, 8, 10], "Amaravathi", "Noida", 30, 10),
    ("Rakesh,kumar,", ["SQL", "Azure", "AWS"], ["PySpark", "Oracle", "Schema"], [12, 6, 8, 15], "luknow", "Mumbai", 33, 5),
    ("karan,,Joshi", ["AWS", "Visual Studio"], ["SQL", "Git", "SQL Editor"], [2, 6, 5, 8], "Delhi", "Noida", 28, 6),
]

# Explicit schema: three array columns plus scalar string/integer columns.
# Every field is declared nullable.
schema = StructType([
    StructField(field_name, field_type, True)
    for field_name, field_type in [
        ("FullName", StringType()),
        ("LearntLanguages", ArrayType(StringType())),
        ("ToLearnLanguages", ArrayType(StringType())),
        ("Rating", ArrayType(IntegerType())),
        ("PresentState", StringType()),
        ("PreviousState", StringType()),
        ("Age", IntegerType()),
        ("Experience", IntegerType()),
    ]
])

# Build the base DataFrame used by the examples, then show its schema and rows.
df = spark.createDataFrame(data, schema)
df.printSchema()
display(df)

root
 |-- FullName: string (nullable = true)
 |-- LearntLanguages: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- ToLearnLanguages: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- Rating: array (nullable = true)
 |    |-- element: integer (containsNull = true)
 |-- PresentState: string (nullable = true)
 |-- PreviousState: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Experience: integer (nullable = true)



FullName,LearntLanguages,ToLearnLanguages,Rating,PresentState,PreviousState,Age,Experience
"Amar,,Singh","List(Java, Scala, C++)","List(Spark, Java, Azure Databricks)","List(8, 9, 5, 7)",Bangalore,Chennai,25,7
"Ramesh,Rathode,","List(Python, PySpark, C)","List(spark sql, ADF)","List(11, 3, 6, 8)",Hyderabad,Kochin,35,8
"Asha,,Rani","List(Devops, VB, Git)","List(ApacheSpark, Python)","List(5, 6, 8, 10)",Amaravathi,Noida,30,10
"Rakesh,Kothur,","List(SQL, Azure, AWS)","List(PySpark, Oracle, Confluence)","List(12, 6, 8, 15)",Noida,Mumbai,33,5
"Krishna,,Joshi","List(GCC, Visual Studio)","List(SQL, Databricks, SQL Editor)","List(2, 6, 5, 8)",Delhi,Kolkata,28,6
"Hari,,Rani","List(Devops, VB, Git)","List(ApacheSpark, Python)","List(5, 6, 8, 10)",Amaravathi,Noida,30,10
"Rakesh,kumar,","List(SQL, Azure, AWS)","List(PySpark, Oracle, Schema)","List(12, 6, 8, 15)",luknow,Mumbai,33,5
"karan,,Joshi","List(AWS, Visual Studio)","List(SQL, Git, SQL Editor)","List(2, 6, 5, 8)",Delhi,Noida,28,6


#### **1) Explode**

- To create a **new row for each element** in the given **array column**.

In [0]:
# Emit one row per element of LearntLanguages, keeping FullName alongside it.
# The exploded column is left unaliased, so it appears as "col" in the result.
df_expl = df.select(df["FullName"], explode(df["LearntLanguages"]))
display(df_expl)

FullName,col
"Amar,,Singh",Java
"Amar,,Singh",Scala
"Amar,,Singh",C++
"Ramesh,Rathode,",Python
"Ramesh,Rathode,",PySpark
"Ramesh,Rathode,",C
"Asha,,Rani",Devops
"Asha,,Rani",VB
"Asha,,Rani",Git
"Rakesh,Kothur,",SQL


In [0]:
# Same idea on the integer array: one row per individual rating value.
# The exploded column is left unaliased, so it appears as "col" in the result.
df_expl1 = df.select(df["FullName"], explode(df["Rating"]))
display(df_expl1)

FullName,col
"Amar,,Singh",8
"Amar,,Singh",9
"Amar,,Singh",5
"Amar,,Singh",7
"Ramesh,Rathode,",11
"Ramesh,Rathode,",3
"Ramesh,Rathode,",6
"Ramesh,Rathode,",8
"Asha,,Rani",5
"Asha,,Rani",6


#### **2) Split**

- Converts a **string column to an array**.

- Returns an **array type** after splitting the **string column by delimiter**.
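- The split function takes two arguments: the **column** to split and the **delimiter**, which is interpreted as a regular-expression pattern. An optional third argument, `limit`, controls how many times the pattern is applied.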

      Convert array to string: F.concat_ws()
      Convert string to array: F.split()

In [0]:
# Keep only FullName, then add its comma-split form as an array column.
# Empty name parts (e.g. the middle of "Amar,,Singh") become empty strings.
df_spl = df.select("FullName").withColumn("FullNameSplit", split("FullName", ","))
display(df_spl)

FullName,FullNameSplit
"Amar,,Singh","List(Amar, , Singh)"
"Ramesh,Rathode,","List(Ramesh, Rathode, )"
"Asha,,Rani","List(Asha, , Rani)"
"Rakesh,Kothur,","List(Rakesh, Kothur, )"
"Krishna,,Joshi","List(Krishna, , Joshi)"
"Hari,,Rani","List(Hari, , Rani)"
"Rakesh,kumar,","List(Rakesh, kumar, )"
"karan,,Joshi","List(karan, , Joshi)"


#### **3) Array**

- To create a **new array column** by **merging** the data from **multiple columns**.
- All input columns must have the **same data type**.

In [0]:
# Build two array columns with array() and show each beside its inputs:
#   Comb_Lang - array of the two language arrays (i.e. an array of arrays)
#   Age_Exp   - array of the two integer columns Age and Experience
df_arry = df.select(
    "LearntLanguages",
    "ToLearnLanguages",
    array("LearntLanguages", "ToLearnLanguages").alias("Comb_Lang"),
    "Age",
    "Experience",
    array("Age", "Experience").alias("Age_Exp"),
)
display(df_arry)

LearntLanguages,ToLearnLanguages,Comb_Lang,Age,Experience,Age_Exp
"List(Java, Scala, C++)","List(Spark, Java, Azure Databricks)","List(List(Java, Scala, C++), List(Spark, Java, Azure Databricks))",25,7,"List(25, 7)"
"List(Python, PySpark, C)","List(spark sql, ADF)","List(List(Python, PySpark, C), List(spark sql, ADF))",35,8,"List(35, 8)"
"List(Devops, VB, Git)","List(ApacheSpark, Python)","List(List(Devops, VB, Git), List(ApacheSpark, Python))",30,10,"List(30, 10)"
"List(SQL, Azure, AWS)","List(PySpark, Oracle, Confluence)","List(List(SQL, Azure, AWS), List(PySpark, Oracle, Confluence))",33,5,"List(33, 5)"
"List(GCC, Visual Studio)","List(SQL, Databricks, SQL Editor)","List(List(GCC, Visual Studio), List(SQL, Databricks, SQL Editor))",28,6,"List(28, 6)"
"List(Devops, VB, Git)","List(ApacheSpark, Python)","List(List(Devops, VB, Git), List(ApacheSpark, Python))",30,10,"List(30, 10)"
"List(SQL, Azure, AWS)","List(PySpark, Oracle, Schema)","List(List(SQL, Azure, AWS), List(PySpark, Oracle, Schema))",33,5,"List(33, 5)"
"List(AWS, Visual Studio)","List(SQL, Git, SQL Editor)","List(List(AWS, Visual Studio), List(SQL, Git, SQL Editor))",28,6,"List(28, 6)"


#### **4) array_contains**

- Used to check whether an **array column contains a value**.
  - **True**: If the value is **present**.
  - **False**: If the value is **not present**.
  - **null**: If the array column is **null/None**.

#### **Syntax**

     array_contains(array_column, value)
     
     array_column (str, Column): the name of, or a reference to, a column of ArrayType
     value: the value to look for in the array column; it should match the array's element type (e.g. a string or an integer)

In [0]:
# For each row, flag whether an array column holds a given value.
# array_contains yields True/False, or null when the array itself is null.
#
# Flag names describe exactly what is tested:
#   Knows_SQL     - LearntLanguages contains "SQL"
#                   (previously mislabelled "Knows_Python")
#   Knows_PySpark - ToLearnLanguages contains "PySpark"
#   Has_Rating_8  - Rating contains 8
#                   (previously "rating", which differed from the "Rating"
#                   column only by case and is ambiguous under Spark's default
#                   case-insensitive column resolution)
# Re-run this cell to refresh any stored output that still shows the old headers.
df_arr_con = df.select(
    "FullName",
    "LearntLanguages", array_contains(df.LearntLanguages, "SQL").alias("Knows_SQL"),
    "ToLearnLanguages", array_contains(df.ToLearnLanguages, "PySpark").alias("Knows_PySpark"),
    "Rating", array_contains(df.Rating, 8).alias("Has_Rating_8"),
)
display(df_arr_con)

FullName,LearntLanguages,Knows_Python,ToLearnLanguages,Knows_PySpark,Rating,rating
"Amar,,Singh","List(Java, Scala, C++)",False,"List(Spark, Java, Azure Databricks)",False,"List(8, 9, 5, 7)",True
"Ramesh,Rathode,","List(Python, PySpark, C)",False,"List(spark sql, ADF)",False,"List(11, 3, 6, 8)",True
"Asha,,Rani","List(Devops, VB, Git)",False,"List(ApacheSpark, Python)",False,"List(5, 6, 8, 10)",True
"Rakesh,Kothur,","List(SQL, Azure, AWS)",True,"List(PySpark, Oracle, Confluence)",True,"List(12, 6, 8, 15)",True
"Krishna,,Joshi","List(GCC, Visual Studio)",False,"List(SQL, Databricks, SQL Editor)",False,"List(2, 6, 5, 8)",True
"Hari,,Rani","List(Devops, VB, Git)",False,"List(ApacheSpark, Python)",False,"List(5, 6, 8, 10)",True
"Rakesh,kumar,","List(SQL, Azure, AWS)",True,"List(PySpark, Oracle, Schema)",True,"List(12, 6, 8, 15)",True
"karan,,Joshi","List(AWS, Visual Studio)",False,"List(SQL, Git, SQL Editor)",False,"List(2, 6, 5, 8)",True


#### **5) array_distinct**

- To **remove duplicate** values from **array column**.

#### **Syntax**

      array_distinct(column)

In [0]:
# Sample rows for array_distinct: every array below repeats at least one element.
# Field order: FullName, LearntLanguages, ToLearnLanguages, Rating.
data_dup = [
    ("Amar", ["Java", "Scala", "C++", "C++"], ["Spark", "Java", "Azure", "Java"], [8, 9, 5, 7, 5, 8]),
    ("Ramesh", ["Python", "PySpark", "C", "Python"], ["spark sql", "ADF", "ADF"], [11, 3, 6, 8, 3]),
    ("Asha", ["Devops", "VB", "Git", "VB"], ["ApacheSpark", "Python", "Python"], [5, 6, 8, 10, 5, 6]),
    ("Rakesh", ["SQL", "Azure", "AWS", "SQL"], ["PySpark", "Oracle", "Confluence", "PySpark"], [12, 6, 8, 15, 6]),
    ("Krishna", ["GCC", "Visual Studio", "GCC"], ["SQL", "Databricks", "SQL Editor", "Databricks"], [2, 6, 5, 8, 8, 6]),
    ("Hari", ["Devops", "VB", "Git", "VB"], ["ApacheSpark", "Python", "Python"], [5, 6, 8, 10, 5, 6]),
    ("Rakesh", ["SQL", "Azure", "AWS", "Azure"], ["PySpark", "Oracle", "Schema", "PySpark"], [12, 6, 8, 15, 12, 8]),
    ("karan", ["AWS", "Visual Studio", "SQL", "AWS"], ["SQL", "Git", "SQL Editor", "Git"], [2, 6, 5, 8, 6, 8, 5]),
]

# One string column and three array columns; every field is declared nullable.
schema_dup = StructType([
    StructField(field_name, field_type, True)
    for field_name, field_type in [
        ("FullName", StringType()),
        ("LearntLanguages", ArrayType(StringType())),
        ("ToLearnLanguages", ArrayType(StringType())),
        ("Rating", ArrayType(IntegerType())),
    ]
])

df_dup = spark.createDataFrame(data_dup, schema_dup)
display(df_dup)

FullName,LearntLanguages,ToLearnLanguages,Rating
Amar,"List(Java, Scala, C++, C++)","List(Spark, Java, Azure, Java)","List(8, 9, 5, 7, 5, 8)"
Ramesh,"List(Python, PySpark, C, Python)","List(spark sql, ADF, ADF)","List(11, 3, 6, 8, 3)"
Asha,"List(Devops, VB, Git, VB)","List(ApacheSpark, Python, Python)","List(5, 6, 8, 10, 5, 6)"
Rakesh,"List(SQL, Azure, AWS, SQL)","List(PySpark, Oracle, Confluence, PySpark)","List(12, 6, 8, 15, 6)"
Krishna,"List(GCC, Visual Studio, GCC)","List(SQL, Databricks, SQL Editor, Databricks)","List(2, 6, 5, 8, 8, 6)"
Hari,"List(Devops, VB, Git, VB)","List(ApacheSpark, Python, Python)","List(5, 6, 8, 10, 5, 6)"
Rakesh,"List(SQL, Azure, AWS, Azure)","List(PySpark, Oracle, Schema, PySpark)","List(12, 6, 8, 15, 12, 8)"
karan,"List(AWS, Visual Studio, SQL, AWS)","List(SQL, Git, SQL Editor, Git)","List(2, 6, 5, 8, 6, 8, 5)"


In [0]:
# Place each de-duplicated array right after its source column so the effect
# of array_distinct can be compared side by side.
df_dist = df_dup.select(
    "LearntLanguages",
    array_distinct("LearntLanguages").alias("dup_Learnt"),
    "ToLearnLanguages",
    array_distinct("ToLearnLanguages").alias("dup_ToLearn"),
    "Rating",
    array_distinct("Rating").alias("dup_Rating"),
)
display(df_dist)

LearntLanguages,dup_Learnt,ToLearnLanguages,dup_ToLearn,Rating,dup_Rating
"List(Java, Scala, C++, C++)","List(Java, Scala, C++)","List(Spark, Java, Azure, Java)","List(Spark, Java, Azure)","List(8, 9, 5, 7, 5, 8)","List(8, 9, 5, 7)"
"List(Python, PySpark, C, Python)","List(Python, PySpark, C)","List(spark sql, ADF, ADF)","List(spark sql, ADF)","List(11, 3, 6, 8, 3)","List(11, 3, 6, 8)"
"List(Devops, VB, Git, VB)","List(Devops, VB, Git)","List(ApacheSpark, Python, Python)","List(ApacheSpark, Python)","List(5, 6, 8, 10, 5, 6)","List(5, 6, 8, 10)"
"List(SQL, Azure, AWS, SQL)","List(SQL, Azure, AWS)","List(PySpark, Oracle, Confluence, PySpark)","List(PySpark, Oracle, Confluence)","List(12, 6, 8, 15, 6)","List(12, 6, 8, 15)"
"List(GCC, Visual Studio, GCC)","List(GCC, Visual Studio)","List(SQL, Databricks, SQL Editor, Databricks)","List(SQL, Databricks, SQL Editor)","List(2, 6, 5, 8, 8, 6)","List(2, 6, 5, 8)"
"List(Devops, VB, Git, VB)","List(Devops, VB, Git)","List(ApacheSpark, Python, Python)","List(ApacheSpark, Python)","List(5, 6, 8, 10, 5, 6)","List(5, 6, 8, 10)"
"List(SQL, Azure, AWS, Azure)","List(SQL, Azure, AWS)","List(PySpark, Oracle, Schema, PySpark)","List(PySpark, Oracle, Schema)","List(12, 6, 8, 15, 12, 8)","List(12, 6, 8, 15)"
"List(AWS, Visual Studio, SQL, AWS)","List(AWS, Visual Studio, SQL)","List(SQL, Git, SQL Editor, Git)","List(SQL, Git, SQL Editor)","List(2, 6, 5, 8, 6, 8, 5)","List(2, 6, 5, 8)"


#### **6) array_remove**

- To **remove** a particular element from the **array column**.
- It **removes all occurrences** of that element.

#### **Syntax**

     array_remove(column, value)

- 1st parameter(column) takes a **column name** containing **array**.
- 2nd parameter(value) takes the **value** to **remove** from the array.

In [0]:
# Sample rows for array_remove; several arrays contain repeated elements.
# Field order: FullName, LearntLanguages, ToLearnLanguages, Rating.
data_rm = [
    ("Amar", ["Java", "Scala", "C++", "C++"], ["Spark", "Java", "Azure", "Python"], [8, 9, 5, 7, 5, 8, 8]),
    ("Ramesh", ["Python", "PySpark", "C", "Python"], ["spark sql", "ADF", "ADF"], [11, 3, 6, 8, 3, 8]),
    ("Asha", ["Devops", "VB", "Git", "VB"], ["ApacheSpark", "Python", "Python"], [5, 6, 8, 10, 5, 6, 8]),
    ("Rakesh", ["SQL", "Azure", "AWS", "SQL"], ["PySpark", "Oracle", "Python", "PySpark"], [12, 6, 8, 15, 6, 8]),
    ("Krishna", ["GCC", "Visual Studio", "GCC"], ["SQL", "Databricks", "Python", "Databricks"], [2, 6, 5, 8, 8, 6, 8]),
    ("Hari", ["Devops", "VB", "Git", "VB"], ["ApacheSpark", "Python", "Python"], [5, 6, 8, 10, 5, 6, 8]),
    ("Rakesh", ["SQL", "Azure", "AWS", "Azure"], ["PySpark", "Oracle", "Schema", "Python"], [12, 6, 8, 15, 12, 8, 8]),
    ("karan", ["AWS", "Visual Studio", "SQL", "AWS"], ["SQL", "Git", "SQL Editor", "Python"], [2, 6, 5, 8, 6, 8, 5, 8]),
]

# One string column and three array columns; every field is declared nullable.
schema_rm = StructType([
    StructField(field_name, field_type, True)
    for field_name, field_type in [
        ("FullName", StringType()),
        ("LearntLanguages", ArrayType(StringType())),
        ("ToLearnLanguages", ArrayType(StringType())),
        ("Rating", ArrayType(IntegerType())),
    ]
])

df_rm = spark.createDataFrame(data_rm, schema_rm)
display(df_rm)

FullName,LearntLanguages,ToLearnLanguages,Rating
Amar,"List(Java, Scala, C++, C++)","List(Spark, Java, Azure, Python)","List(8, 9, 5, 7, 5, 8, 8)"
Ramesh,"List(Python, PySpark, C, Python)","List(spark sql, ADF, ADF)","List(11, 3, 6, 8, 3, 8)"
Asha,"List(Devops, VB, Git, VB)","List(ApacheSpark, Python, Python)","List(5, 6, 8, 10, 5, 6, 8)"
Rakesh,"List(SQL, Azure, AWS, SQL)","List(PySpark, Oracle, Python, PySpark)","List(12, 6, 8, 15, 6, 8)"
Krishna,"List(GCC, Visual Studio, GCC)","List(SQL, Databricks, Python, Databricks)","List(2, 6, 5, 8, 8, 6, 8)"
Hari,"List(Devops, VB, Git, VB)","List(ApacheSpark, Python, Python)","List(5, 6, 8, 10, 5, 6, 8)"
Rakesh,"List(SQL, Azure, AWS, Azure)","List(PySpark, Oracle, Schema, Python)","List(12, 6, 8, 15, 12, 8, 8)"
karan,"List(AWS, Visual Studio, SQL, AWS)","List(SQL, Git, SQL Editor, Python)","List(2, 6, 5, 8, 6, 8, 5, 8)"


In [0]:
# array_remove drops every occurrence of the given value from each array:
#   "C++" from LearntLanguages, "Python" from ToLearnLanguages, 8 from Rating.
# Arrays that do not contain the value come back unchanged.
df_rem = df_rm.select(
    "LearntLanguages",
    array_remove("LearntLanguages", "C++").alias("rm_Learnt"),
    "ToLearnLanguages",
    array_remove("ToLearnLanguages", "Python").alias("rm_ToLearn"),
    "Rating",
    array_remove("Rating", 8).alias("rm_Rating"),
)
display(df_rem)

LearntLanguages,rm_Learnt,ToLearnLanguages,rm_ToLearn,Rating,rm_Rating
"List(Java, Scala, C++, C++)","List(Java, Scala)","List(Spark, Java, Azure, Python)","List(Spark, Java, Azure)","List(8, 9, 5, 7, 5, 8, 8)","List(9, 5, 7, 5)"
"List(Python, PySpark, C, Python)","List(Python, PySpark, C, Python)","List(spark sql, ADF, ADF)","List(spark sql, ADF, ADF)","List(11, 3, 6, 8, 3, 8)","List(11, 3, 6, 3)"
"List(Devops, VB, Git, VB)","List(Devops, VB, Git, VB)","List(ApacheSpark, Python, Python)",List(ApacheSpark),"List(5, 6, 8, 10, 5, 6, 8)","List(5, 6, 10, 5, 6)"
"List(SQL, Azure, AWS, SQL)","List(SQL, Azure, AWS, SQL)","List(PySpark, Oracle, Python, PySpark)","List(PySpark, Oracle, PySpark)","List(12, 6, 8, 15, 6, 8)","List(12, 6, 15, 6)"
"List(GCC, Visual Studio, GCC)","List(GCC, Visual Studio, GCC)","List(SQL, Databricks, Python, Databricks)","List(SQL, Databricks, Databricks)","List(2, 6, 5, 8, 8, 6, 8)","List(2, 6, 5, 6)"
"List(Devops, VB, Git, VB)","List(Devops, VB, Git, VB)","List(ApacheSpark, Python, Python)",List(ApacheSpark),"List(5, 6, 8, 10, 5, 6, 8)","List(5, 6, 10, 5, 6)"
"List(SQL, Azure, AWS, Azure)","List(SQL, Azure, AWS, Azure)","List(PySpark, Oracle, Schema, Python)","List(PySpark, Oracle, Schema)","List(12, 6, 8, 15, 12, 8, 8)","List(12, 6, 15, 12)"
"List(AWS, Visual Studio, SQL, AWS)","List(AWS, Visual Studio, SQL, AWS)","List(SQL, Git, SQL Editor, Python)","List(SQL, Git, SQL Editor)","List(2, 6, 5, 8, 6, 8, 5, 8)","List(2, 6, 5, 6, 5)"
