<a href="https://colab.research.google.com/github/egnsuresh/Spark_Practice/blob/master/use_cases_of_collect_list_and_collect_set_functions_in_pyspark.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>



---



---



---



# 1. PySpark installation, Spark session creation, and importing common functions

---

---





In [2]:
!pip install pyspark

Collecting pyspark
  Downloading pyspark-3.5.0.tar.gz (316.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m316.9/316.9 MB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.0-py2.py3-none-any.whl size=317425345 sha256=fc60abba118b83f6a02472f7bbdf8f896c440abe6eb788dd2e7f28c2c02919ce
  Stored in directory: /root/.cache/pip/wheels/41/4e/10/c2cf2467f71c678cfc8a6b9ac9241e5e44a01940da8fbb17fc
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.0


In [3]:
import pyspark
# Reuse the SparkSession that is already active in this kernel, or start a new
# one with default settings if none exists yet.
session_builder = pyspark.sql.SparkSession.builder
spark = session_builder.getOrCreate()

In [4]:
from pyspark.sql import functions as f

# 2. Sample data creation & Data profiling

---


---




In [5]:
# prompt: create sample 10 different records of dataframe with cust_name,cif,tran_type, branch_name,outlet_name,terminal_name,tran_amount and timestamp

# Sample transactions, one tuple per transaction, in the column order passed to
# createDataFrame below. There are 11 rows (not 10): 'Maccha Maama' repeats John
# Doe's cif, tran_type and outlet, which produces the duplicate "Deposit" for
# 'ABC Bank outlet1'.
data = [
    ('John Doe', '1234567890', 'Deposit', 'New York', 'ABC Bank outlet1', '123', 100.0, '2022-01-01'),
    ('Jane Doe', '0987654321', 'Withdrawal', 'Los Angeles', 'XYZ Bank outlet1', '456', 50.0, '2022-01-02'),
    ('Michael Jones', '2345678901', 'Transfer', 'Chicago', 'PQR Bank outlet1', '789', 25.0, '2022-01-03'),
    ('Maccha Maama', '1234567890', 'Deposit', 'New York', 'ABC Bank outlet1', '123', 200.0, '2022-01-01'),
    ('Sarah Smith', '3456789012', 'Bill payment', 'Boston', 'STU Bank outlet1', '012', 15.0, '2022-01-04'),
    ('William Johnson', '4567890123', 'ATM withdrawal', 'Philadelphia', 'UVW Bank outlet1', '345', 10.0, '2022-01-05'),
    ('Mary Brown', '5678901234', 'Online purchase', 'San Francisco', 'XYZ Bank outlet1', '678', 5.0, '2022-01-06'),
    ('David Miller', '6789012345', 'ACH transfer', 'Dallas', 'ABC Bank outlet1', '901', 2.5, '2022-01-07'),
    ('Robert Anderson', '7890123456', 'Check deposit', 'Houston', 'PQR Bank outlet1', '234', 1.0, '2022-01-08'),
    ('Jessica Garcia', '8901234567', 'Money order', 'Atlanta', 'STU Bank outlet1', '567', 0.5, '2022-01-09'),
    ('Matthew Rodriguez', '9012345678', 'Payroll', 'Seattle', 'UVW Bank outlet1', '890', 0.25, '2022-01-10'),
]

# Column types are inferred from the Python values; tran_date is passed as a
# plain string here.
cust_df = spark.createDataFrame(
    data,
    ['cust_name', 'cif', 'tran_type', 'branch_name',
     'outlet_name', 'terminal_name', 'tran_amount', 'tran_date'],
)

# Data profiling: all rows untruncated, the inferred schema, the row count, and
# the two columns that the collect_list / collect_set examples work on.
cust_df.show(20, False)
cust_df.printSchema()
# count() only returns a number; it is not the last expression of the cell, so
# it has to be printed to show up in the output.
print(cust_df.count())
cust_df.select("tran_type", "outlet_name").show(11, False)

+-----------------+----------+---------------+-------------+----------------+-------------+-----------+----------+
|cust_name        |cif       |tran_type      |branch_name  |outlet_name     |terminal_name|tran_amount|tran_date |
+-----------------+----------+---------------+-------------+----------------+-------------+-----------+----------+
|John Doe         |1234567890|Deposit        |New York     |ABC Bank outlet1|123          |100.0      |2022-01-01|
|Jane Doe         |0987654321|Withdrawal     |Los Angeles  |XYZ Bank outlet1|456          |50.0       |2022-01-02|
|Michael Jones    |2345678901|Transfer       |Chicago      |PQR Bank outlet1|789          |25.0       |2022-01-03|
|Maccha Maama     |1234567890|Deposit        |New York     |ABC Bank outlet1|123          |200.0      |2022-01-01|
|Sarah Smith      |3456789012|Bill payment   |Boston       |STU Bank outlet1|012          |15.0       |2022-01-04|
|William Johnson  |4567890123|ATM withdrawal |Philadelphia |UVW Bank outlet1|345

# 3. Use cases of collect_list and collect_set

---



---



<table>
<tr>
  <td>
  <pre>
+---------------+----------------+
|tran_type      |outlet_name     |
+---------------+----------------+
|Deposit        |ABC Bank outlet1|
|Withdrawal     |XYZ Bank outlet1|
|Transfer       |PQR Bank outlet1|
|Deposit        |ABC Bank outlet1|
|Bill payment   |STU Bank outlet1|
|ATM withdrawal |UVW Bank outlet1|
|Online purchase|XYZ Bank outlet1|
|ACH transfer   |ABC Bank outlet1|
|Check deposit  |PQR Bank outlet1|
|Money order    |STU Bank outlet1|
|Payroll        |UVW Bank outlet1|
+---------------+----------------+
</pre></td><td></td><td>
<pre>
+----------------+----------------------------+
|outlet_name     |outlet_tran_list            |
+----------------+----------------------------+
|ABC Bank outlet1|Deposit|Deposit|ACH transfer|
|PQR Bank outlet1|Transfer|Check deposit      |
|STU Bank outlet1|Bill payment|Money order    |
|UVW Bank outlet1|ATM withdrawal|Payroll      |
|XYZ Bank outlet1|Withdrawal|Online purchase  |
+----------------+----------------------------+
</pre>
</td></tr></table>

  **Question: for each outlet, give the list of its transaction types as a *single value*, separated by pipes (|).**


# The point

In [12]:
# Window lets the aggregate be computed per outlet while keeping every input row.
from pyspark.sql.window import Window

# One partition per outlet_name. No ordering is specified, so the order of the
# elements inside each collected list is not guaranteed.
per_outlet = Window.partitionBy("outlet_name")

# collect_list keeps duplicates, collect_set removes them; concat_ws then joins
# the collected array into a single "|"-separated string.
tran_list_col = f.concat_ws("|", f.collect_list("tran_type").over(per_outlet))
tran_set_col = f.concat_ws("|", f.collect_set("tran_type").over(per_outlet))

outlet_df = (
    cust_df
    .withColumn("outlet_tran_list", tran_list_col)
    .withColumn("outlet_tran_set", tran_set_col)
)

# Every row of an outlet carries the same two strings, so dropDuplicates leaves
# exactly one row per outlet.
(
    outlet_df
    .select("outlet_name", "outlet_tran_list", "outlet_tran_set")
    .dropDuplicates()
    .show(10, False)
)



+----------------+----------------------------+--------------------------+
|outlet_name     |outlet_tran_list            |outlet_tran_set           |
+----------------+----------------------------+--------------------------+
|ABC Bank outlet1|Deposit|Deposit|ACH transfer|ACH transfer|Deposit      |
|PQR Bank outlet1|Transfer|Check deposit      |Transfer|Check deposit    |
|STU Bank outlet1|Bill payment|Money order    |Bill payment|Money order  |
|UVW Bank outlet1|ATM withdrawal|Payroll      |ATM withdrawal|Payroll    |
|XYZ Bank outlet1|Withdrawal|Online purchase  |Withdrawal|Online purchase|
+----------------+----------------------------+--------------------------+



# The explanation

<table style="background:red; width:85%; border:1px solid black;">
<tr style="color:red">
  <td>1. What do you have (input)?</td>
  <td>cust_df</td>
</tr>
<tr>
  <td colspan="3"><code>cust_df.<..></code></td>
</tr>
<tr>
  <td>2. What do you want?</td>
  <td>new column as outlet_tran_list</td>
  <td>then use withColumn</td>
</tr>
<tr>
<td colspan="3">cust_df<code>.withColumn("outlet_tran_list",<..>)</code></td>
</tr>
<tr>
  <td>3.What for?</td>
  <td>to store the tran_type values as a list (the code above then joins them into one string with "|" using concat_ws)</td>
  <td>then use collect_list</td>
</tr>
<tr>
  <td>4.from which column</td>
  <td>tran_type</td>
  <td></td>
</tr>
<tr>
<td colspan="3">cust_df.withColumn("outlet_tran_list",<code> f.collect_list("tran_type").<..>)</code></td>
</tr>
<tr>
  <td>5. How do you want it?</td>
  <td>for each outlet_name unique value</td>
  <td>then use over(Window.partitionBy("outlet_name")).<..></td>
</tr>
<tr><td colspan="3">cust_df.withColumn("outlet_tran_list", f.collect_list("tran_type")<code>.over(Window.partitionBy("outlet_name"))</code>)</td></tr>
<tr>
  <td>6. Where do you want to store the result?</td>
  <td>outlet_df</td>
</tr>
<tr>
  <td colspan="3" style="background:red;"><code>outlet_df=</code>cust_df.withColumn("outlet_tran_list", f.collect_list("tran_type").over(Window.partitionBy("outlet_name")))
  </td>
</tr>
</table>

The End

---



---



---

