**collect_list()**: 
- Returns all values from input column as **list with duplicates**.
- It is an aggregate function which returns an **array** containing all values within the group (**null** values are skipped).
- It is used to create an **array type column** on a DataFrame by merging rows, typically after a **group by** or over **window partitions**.
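- **order of elements** inside the array follows the order in which the rows are processed; it is **maintained** only as long as the row order itself is deterministic (it is not guaranteed after a shuffle, e.g. a group by).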

**collect_set()**:
- Returns all values from input column as **list without duplicates**.
- **Eliminates the duplicates**, so the result contains only **unique values**.
- **order of elements** inside array is **not maintained**.

In [0]:
from pyspark.sql.functions import collect_list, collect_set, array_distinct, struct
from pyspark.sql.types import StructType, StructField, IntegerType, StringType
import pyspark.sql.functions as f

In [0]:
# Sample rows in the form (id, Name, Languages, Age).
# Several Languages/Age entries are None, which makes it easy to see how
# the aggregate functions used on this DataFrame treat null values.
data = [
    (1, "James", "Java", 25),
    (2, "James", "Python", 28),
    (3, "James", "Python", 35),
    (4, "Anna", "PHP", 29),
    (5, "Anna", "Javascript", None),
    (6, "Maria", "Java", 54),
    (7, "Maria", "C++", 47),
    (8, "James", "Scala", 35),
    (9, "Anna", "PHP", None),
    (10, "Anna", "HTML", 21),
    (11, "Anna", None, 65),
    (12, "Rakesh", None, None),
]

# Explicit schema: id and Name are declared non-nullable,
# while Languages and Age are allowed to hold nulls.
schema = StructType(
    [
        StructField("id", IntegerType(), nullable=False),
        StructField("Name", StringType(), nullable=False),
        StructField("Languages", StringType(), nullable=True),
        StructField("Age", IntegerType(), nullable=True),
    ]
)

# Build the DataFrame, render it, and print the resulting schema.
df = spark.createDataFrame(data, schema=schema)
display(df)
df.printSchema()

id,Name,Languages,Age
1,James,Java,25.0
2,James,Python,28.0
3,James,Python,35.0
4,Anna,PHP,29.0
5,Anna,Javascript,
6,Maria,Java,54.0
7,Maria,C++,47.0
8,James,Scala,35.0
9,Anna,PHP,
10,Anna,HTML,21.0


root
 |-- id: integer (nullable = false)
 |-- Name: string (nullable = false)
 |-- Languages: string (nullable = true)
 |-- Age: integer (nullable = true)



#### **Single Column: collect_list()**

In [0]:
# Global aggregation (no groupBy): gather every non-null "languages" value
# of the DataFrame into a single array, duplicates included.
df1 = df.select(
    collect_list("languages").alias("Languages_List")
)
display(df1)

Languages_List
"List(Java, Python, Python, PHP, Javascript, Java, C++, Scala, PHP, HTML)"


In [0]:
# Same global aggregation on the integer "Age" column; df1 is rebound here.
# Null ages are skipped, repeated ages are kept.
df1 = df.select(
    collect_list("Age").alias("Age_List")
)
display(df1)

Age_List
"List(25, 28, 35, 29, 54, 47, 35, 21, 65)"


#### **Single Column: collect_set()**

In [0]:
# Global aggregation: collect the distinct non-null "languages" values.
# The element order of the resulting array is not tied to the row order.
df2 = df.select(
    collect_set("languages").alias("Languages_Set")
)
display(df2)

Languages_Set
"List(Scala, PHP, Javascript, Java, C++, Python, HTML)"


In [0]:
# Distinct non-null "Age" values across the whole DataFrame; df2 is rebound here.
df2 = df.select(
    collect_set("Age").alias("Age_Set")
)
display(df2)

Age_Set
"List(35, 21, 54, 28, 25, 29, 47, 65)"


#### **array_distinct --> collect_list**

In [0]:
# Alternative way to de-duplicate: collect everything with collect_list first,
# then strip repeated elements from the resulting array with array_distinct.
df3 = df.select(
    array_distinct(
        collect_list("languages")
    ).alias("Languages_List")
)
display(df3)

Languages_List
"List(Java, Python, PHP, Javascript, C++, Scala, HTML)"


#### **Single Column: groupBy with collect_list() & collect_set()**

In [0]:
# One output row per name: all non-null languages of that person, duplicates kept.
# A group whose languages are all null produces an empty array.
df4 = (
    df.groupBy("name")
    .agg(collect_list("languages").alias("List_Languages"))
)
display(df4)
df4.printSchema()

name,List_Languages
James,"List(Java, Python, Python, Scala)"
Anna,"List(PHP, Javascript, PHP, HTML)"
Maria,"List(Java, C++)"
Rakesh,List()


root
 |-- name: string (nullable = false)
 |-- List_Languages: array (nullable = false)
 |    |-- element: string (containsNull = false)



In [0]:
# One output row per name: the distinct non-null languages of that person.
# A group whose languages are all null produces an empty array.
df5 = (
    df.groupBy("name")
    .agg(collect_set("languages").alias("Set_Languages"))
)
display(df5)
df5.printSchema()

name,Set_Languages
James,"List(Scala, Java, Python)"
Anna,"List(PHP, Javascript, HTML)"
Maria,"List(Java, C++)"
Rakesh,List()


root
 |-- name: string (nullable = false)
 |-- Set_Languages: array (nullable = false)
 |    |-- element: string (containsNull = false)



#### **Multiple Columns: groupBy --> struct --> collect_list()**

In [0]:
# Print each distinct name once; these are the groups produced by groupBy("name").
(
    df.select("name")
    .distinct()
    .show()
)

+------+
|  name|
+------+
| James|
|  Anna|
| Maria|
|Rakesh|
+------+



In [0]:
# Pack (languages, Age) of every row into a struct, then collect those structs
# into one array per name. The struct itself is never null, so rows with a null
# language or age are kept and the nulls show up inside the struct.
# NOTE(review): the alias reads "Name_Languages" although the struct holds
# languages and Age -- confirm whether a rename is wanted.
df6 = (
    df.groupBy("name")
    .agg(
        collect_list(struct("languages", "Age")).alias("Name_Languages")
    )
)
display(df6)
df6.printSchema()

name,Name_Languages
James,"List(List(Java, 25), List(Python, 28), List(Python, 35), List(Scala, 35))"
Anna,"List(List(PHP, 29), List(Javascript, null), List(PHP, null), List(HTML, 21), List(null, 65))"
Maria,"List(List(Java, 54), List(C++, 47))"
Rakesh,"List(List(null, null))"


root
 |-- name: string (nullable = false)
 |-- Name_Languages: array (nullable = false)
 |    |-- element: struct (containsNull = false)
 |    |    |-- languages: string (nullable = true)
 |    |    |-- Age: integer (nullable = true)

