<a href="https://colab.research.google.com/github/drmartins2/EDIT_DE/blob/main/spark/examples/05-aggregations.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

<a href="https://colab.research.google.com/github/lucprosa/dataeng-basic-course/blob/main/spark/examples/05-aggregations.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Aggregations
- Group By
- Windows Functions

# Setting up PySpark

In [None]:
%pip install pyspark



In [1]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.master('local').appName('Spark Course').config('spark.ui.port', '4050').getOrCreate()

# Aggregations

https://spark.apache.org/docs/latest/sql-ref-functions-builtin.html#aggregate-functions

https://spark.apache.org/docs/latest/sql-ref-syntax-qry-select-aggregate.html

In [19]:
sql_query = """CREATE OR REPLACE TEMPORARY VIEW basic_pays AS SELECT * FROM VALUES
('Diane Murphy','Accounting',8435),
('Mary Patterson','Accounting',9998),
('Jeff Firrelli','Accounting',8992),
('William Patterson','Accounting',8870),
('Gerard Bondur','Accounting',11472),
('Anthony Bow','Accounting',6627),
('Leslie Jennings','IT',8113),
('Leslie Thompson','IT',5186),
('Julie Firrelli','Sales',9181),
('Steve Patterson','Sales',9441),
('Foon Yue Tseng','Sales',6660),
('George Vanauf','Sales',10563),
('Loui Bondur','SCM',10449),
('Gerard Hernandez','SCM',6949),
('Pamela Castillo','SCM',11303),
('Larry Bott','SCM',11798),
('Barry Jones','SCM',10586)
AS basic_pays(employee_name, department, salary)"""

# creating temp view
spark.sql(sql_query)

df = spark.table("basic_pays")
df.show()


+-----------------+----------+------+
|    employee_name|department|salary|
+-----------------+----------+------+
|     Diane Murphy|Accounting|  8435|
|   Mary Patterson|Accounting|  9998|
|    Jeff Firrelli|Accounting|  8992|
|William Patterson|Accounting|  8870|
|    Gerard Bondur|Accounting| 11472|
|      Anthony Bow|Accounting|  6627|
|  Leslie Jennings|        IT|  8113|
|  Leslie Thompson|        IT|  5186|
|   Julie Firrelli|     Sales|  9181|
|  Steve Patterson|     Sales|  9441|
|   Foon Yue Tseng|     Sales|  6660|
|    George Vanauf|     Sales| 10563|
|      Loui Bondur|       SCM| 10449|
| Gerard Hernandez|       SCM|  6949|
|  Pamela Castillo|       SCM| 11303|
|       Larry Bott|       SCM| 11798|
|      Barry Jones|       SCM| 10586|
+-----------------+----------+------+



In [3]:
perc_query = """SELECT
    department,
    percentile_cont(0.25) WITHIN GROUP (ORDER BY salary) AS pc1,
    percentile_cont(0.25) WITHIN GROUP (ORDER BY salary) FILTER (WHERE employee_name LIKE '%Bo%') AS pc2,
    percentile_cont(0.25) WITHIN GROUP (ORDER BY salary DESC) AS pc3,
    percentile_cont(0.25) WITHIN GROUP (ORDER BY salary DESC) FILTER (WHERE employee_name LIKE '%Bo%') AS pc4,
    percentile_disc(0.25) WITHIN GROUP (ORDER BY salary) AS pd1,
    percentile_disc(0.25) WITHIN GROUP (ORDER BY salary) FILTER (WHERE employee_name LIKE '%Bo%') AS pd2,
    percentile_disc(0.25) WITHIN GROUP (ORDER BY salary DESC) AS pd3,
    percentile_disc(0.25) WITHIN GROUP (ORDER BY salary DESC) FILTER (WHERE employee_name LIKE '%Bo%') AS pd4
FROM basic_pays
GROUP BY department
ORDER BY department;"""

spark.sql(perc_query).show()

+----------+-------+--------+-------+--------+-------+-------+-------+-------+
|department|    pc1|     pc2|    pc3|     pc4|    pd1|    pd2|    pd3|    pd4|
+----------+-------+--------+-------+--------+-------+-------+-------+-------+
|Accounting|8543.75| 7838.25| 9746.5|10260.75| 8435.0| 6627.0| 9998.0|11472.0|
|        IT|5917.75|    NULL|7381.25|    NULL| 5186.0|   NULL| 8113.0|   NULL|
|       SCM|10449.0|10786.25|11303.0|11460.75|10449.0|10449.0|11303.0|11798.0|
|     Sales|8550.75|    NULL| 9721.5|    NULL| 6660.0|   NULL|10563.0|   NULL|
+----------+-------+--------+-------+--------+-------+-------+-------+-------+



In [9]:
from pyspark.sql.functions import *
(df
 .groupBy("department")
 .agg(sum("salary").alias("sum_salary"),
      round(avg("salary"),2).alias("avg_salary"),
      min("salary").alias("min_salary"),
      array_agg("employee_name").alias("employees"),
      count(lit("")).alias("count_employees"))
 .filter(col("count_employees") > 2)
 .show(10, False))

+----------+----------+----------+----------+--------------------------------------------------------------------------------------------+---------------+
|department|sum_salary|avg_salary|min_salary|employees                                                                                   |count_employees|
+----------+----------+----------+----------+--------------------------------------------------------------------------------------------+---------------+
|Sales     |35845     |8961.25   |6660      |[Julie Firrelli, Steve Patterson, Foon Yue Tseng, George Vanauf]                            |4              |
|Accounting|54394     |9065.67   |6627      |[Diane Murphy, Mary Patterson, Jeff Firrelli, William Patterson, Gerard Bondur, Anthony Bow]|6              |
|SCM       |51085     |10217.0   |6949      |[Loui Bondur, Gerard Hernandez, Pamela Castillo, Larry Bott, Barry Jones]                   |5              |
+----------+----------+----------+----------+-------------------------

# Question

In [40]:
# Q1
# Aggregate data by surname
# Calculate highest salary by surname
# Include the respective employee that has the highest salary
# Include department information about this employee
# Count how many employees has that surname
# Put in an array all the first_names of the respective surname ordered

from pyspark.sql.functions import *
from pyspark.sql.window import Window # Importing the Window class

split_col = split(df['employee_name'], ' ')

df1 = df.withColumn('surname', split_col.getItem(1))
#df1.show()

df2 = df1.withColumn('first_name', split_col.getItem(0))
#df2.show()


# employee first name with the highest salary
df3 = df2.withColumn("row", row_number().over(Window.partitionBy("surname").orderBy(col("salary").desc()))) # Now Window is defined
df3.show()

q1 =(df2
    .groupBy("surname")
    .agg(count(lit("")).alias("count_employees")
        ,max("salary").alias("highest_salary")
        ,array_agg("first_name").alias("employee_with_highest_salary"))
    .withColumn("department_with_highest_salary", array_max(col("employee_with_highest_salary")))
    )
q1.show()

# schema expected:
# surname | count_employees | highest_salary | employee_with_highest_salary | department_with_highest_salary | array_with_all_the_first_names |

+-----------------+----------+------+---------+----------+---+
|    employee_name|department|salary|  surname|first_name|row|
+-----------------+----------+------+---------+----------+---+
|    Gerard Bondur|Accounting| 11472|   Bondur|    Gerard|  1|
|      Loui Bondur|       SCM| 10449|   Bondur|      Loui|  2|
|       Larry Bott|       SCM| 11798|     Bott|     Larry|  1|
|      Anthony Bow|Accounting|  6627|      Bow|   Anthony|  1|
|  Pamela Castillo|       SCM| 11303| Castillo|    Pamela|  1|
|   Julie Firrelli|     Sales|  9181| Firrelli|     Julie|  1|
|    Jeff Firrelli|Accounting|  8992| Firrelli|      Jeff|  2|
| Gerard Hernandez|       SCM|  6949|Hernandez|    Gerard|  1|
|  Leslie Jennings|        IT|  8113| Jennings|    Leslie|  1|
|      Barry Jones|       SCM| 10586|    Jones|     Barry|  1|
|     Diane Murphy|Accounting|  8435|   Murphy|     Diane|  1|
|   Mary Patterson|Accounting|  9998|Patterson|      Mary|  1|
|  Steve Patterson|     Sales|  9441|Patterson|     Ste

In [41]:
# Solução Aula

from pyspark.sql import functions as F
df2 = df
df2 = (
    df2
      .withColumn("names_array", split(col("employee_name"), " "))
      .withColumn("first_name", col("names_array")[0])
      .withColumn("surname", col("names_array")[1])
      .groupBy("surname")
        .agg(
          F.max("salary").alias("max_salary_by_surname"),
          F.count(lit("")).alias("count_employees"),
          F.array_agg("first_name").alias("list_first_names")
          )
)
df2.show(100, False)

+---------+---------------------+---------------+----------------------+
|surname  |max_salary_by_surname|count_employees|list_first_names      |
+---------+---------------------+---------------+----------------------+
|Bow      |6627                 |1              |[Anthony]             |
|Jones    |10586                |1              |[Barry]               |
|Bondur   |11472                |2              |[Gerard, Loui]        |
|Murphy   |8435                 |1              |[Diane]               |
|Castillo |11303                |1              |[Pamela]              |
|Firrelli |9181                 |2              |[Jeff, Julie]         |
|Vanauf   |10563                |1              |[George]              |
|Yue      |6660                 |1              |[Foon]                |
|Patterson|9998                 |3              |[Mary, William, Steve]|
|Thompson |5186                 |1              |[Leslie]              |
|Jennings |8113                 |1              |[L