# Lesson Five: Introducing Aggregate and GroupBy in PySpark
### by Samuel Ko

In [17]:
# Topics covered in this lesson:
# - agg()
# - groupBy()
# - sorted()
# - collect()

In [18]:
from pyspark.sql import SparkSession

In [19]:
# Start (or reuse) the SparkSession named 'GroupAndAggregate'; every later
# cell works through this `spark` variable.
spark = (
    SparkSession.builder
    .appName('GroupAndAggregate')
    .getOrCreate()
)

In [20]:
# Leave the session as the cell's last expression so the notebook displays it.
spark

In [21]:
# Load test5.csv: header=True takes the column names from the first line,
# and inferSchema=True lets Spark detect each column's type.
df_pyspark = spark.read.csv(
    'test5.csv',
    header=True,
    inferSchema=True,
)

In [22]:
# Print each column's name, type and nullability to check the loaded schema.
df_pyspark.printSchema()

root
 |-- Name: string (nullable = true)
 |-- Experience: integer (nullable = true)
 |-- Major: string (nullable = true)
 |-- Salary: integer (nullable = true)



In [23]:
# Print the DataFrame's rows as a text table to inspect the loaded data.
df_pyspark.show()

+----------------+----------+------------------+------+
|            Name|Experience|             Major|Salary|
+----------------+----------+------------------+------+
|     Edward Wong|        10|  Machine Learning| 43000|
|         Tony Fu|         8|  Machine Learning| 35000|
|        Megan Ho|         4|      Data Science| 23000|
|     Pauline Yau|         3|    Virual Reality| 20000|
|     Candy Leung|         7|    Microprocessor| 30000|
|  Summer Jeffson|         5|Wearable Computing| 26000|
|Combell Kawaskey|         4|      Data Science| 24000|
|     Nashu Kwong|         8|      Data Science| 36000|
|     Suki Takayi|         6|Wearable Computing| 27000|
|       Sudhanshu|        12|  Machine Learning| 55000|
|     Wellon Mesk|         9|    Virual Reality| 39500|
|       Polly Mak|         4|      Data Science| 23500|
|     Kevin Leung|         7|      Data Science| 30000|
|   Patrick Tsang|         7|  Machine Learning| 32000|
|     Thomas Tsui|         5|      Data Science|

In [24]:
# Aggregate over the entire DataFrame (no grouping): one agg() call per
# column, each printing a one-row table holding that column's mean.
df_pyspark.agg(
    {'Experience': 'mean'}
).show()
df_pyspark.agg(
    {'Salary': 'mean'}
).show()

+-----------------+
|  avg(Experience)|
+-----------------+
|6.809523809523809|
+-----------------+

+-----------------+
|      avg(Salary)|
+-----------------+
|34904.76190476191|
+-----------------+



In [25]:
# Group the rows by Major and average every numeric column within each group
# (here: years of Experience and Salary).
(
    df_pyspark
    .groupBy('Major')
    .mean()
    .show()
)

+------------------+-----------------+------------------+
|             Major|  avg(Experience)|       avg(Salary)|
+------------------+-----------------+------------------+
|  Machine Learning|              9.8|           48000.0|
|    Virual Reality|5.666666666666667|29833.333333333332|
|    Microprocessor|              7.0|           30000.0|
|      Data Science|             5.75|           30937.5|
|Wearable Computing|              6.0|           31500.0|
+------------------+-----------------+------------------+



In [26]:
# Group the rows by Experience and average every numeric column per group.
# The grouping key is itself numeric, so the result also carries an
# avg(Experience) column that simply repeats the key.
(
    df_pyspark
    .groupBy('Experience')
    .mean()
    .show()
)

+----------+---------------+------------------+
|Experience|avg(Experience)|       avg(Salary)|
+----------+---------------+------------------+
|        12|           12.0|           65000.0|
|         6|            6.0|           27000.0|
|         3|            3.0|           23500.0|
|         5|            5.0|28333.333333333332|
|         9|            9.0|           39500.0|
|         4|            4.0|           23625.0|
|         8|            8.0|           35500.0|
|         7|            7.0|30666.666666666668|
|        10|           10.0|           49000.0|
+----------+---------------+------------------+



In [27]:
# Count how many rows share each (Major, Experience) combination.
(
    df_pyspark
    .groupBy(['Major', 'Experience'])
    .count()
    .show()
)

+------------------+----------+-----+
|             Major|Experience|count|
+------------------+----------+-----+
|Wearable Computing|         6|    1|
|    Microprocessor|         7|    1|
|Wearable Computing|        10|    1|
|      Data Science|        10|    1|
|  Machine Learning|         7|    1|
|      Data Science|         4|    4|
|  Machine Learning|        12|    2|
|    Virual Reality|         3|    1|
|      Data Science|         8|    1|
|      Data Science|         5|    1|
|  Machine Learning|        10|    1|
|    Virual Reality|         9|    1|
|Wearable Computing|         5|    1|
|      Data Science|         7|    1|
|    Virual Reality|         5|    1|
|Wearable Computing|         3|    1|
|  Machine Learning|         8|    1|
+------------------+----------+-----+



In [28]:
# Average the numeric columns within each (Major, Experience) group.
(
    df_pyspark
    .groupBy(['Major', 'Experience'])
    .avg()
    .show()
)

+------------------+----------+---------------+-----------+
|             Major|Experience|avg(Experience)|avg(Salary)|
+------------------+----------+---------------+-----------+
|Wearable Computing|         6|            6.0|    27000.0|
|    Microprocessor|         7|            7.0|    30000.0|
|Wearable Computing|        10|           10.0|    46000.0|
|      Data Science|        10|           10.0|    58000.0|
|  Machine Learning|         7|            7.0|    32000.0|
|      Data Science|         4|            4.0|    23625.0|
|  Machine Learning|        12|           12.0|    65000.0|
|    Virual Reality|         3|            3.0|    20000.0|
|      Data Science|         8|            8.0|    36000.0|
|      Data Science|         5|            5.0|    29000.0|
|  Machine Learning|        10|           10.0|    43000.0|
|    Virual Reality|         9|            9.0|    39500.0|
|Wearable Computing|         5|            5.0|    26000.0|
|      Data Science|         7|         

In [29]:
# Average Salary for each Experience level. collect() returns the result as
# a Python list of Row objects, and sorted() puts that list in ascending
# order (by Experience here, as the output shows).
sorted(
    df_pyspark
    .groupBy('Experience')
    .agg({'Salary': 'mean'})
    .collect()
)

[Row(Experience=3, avg(Salary)=23500.0),
 Row(Experience=4, avg(Salary)=23625.0),
 Row(Experience=5, avg(Salary)=28333.333333333332),
 Row(Experience=6, avg(Salary)=27000.0),
 Row(Experience=7, avg(Salary)=30666.666666666668),
 Row(Experience=8, avg(Salary)=35500.0),
 Row(Experience=9, avg(Salary)=39500.0),
 Row(Experience=10, avg(Salary)=49000.0),
 Row(Experience=12, avg(Salary)=65000.0)]

In [30]:
# Count how many rows have each distinct Salary, then sort the collected
# Row objects in ascending order.
sorted(
    df_pyspark
    .groupBy(df_pyspark.Salary)
    .count()
    .collect()
)

[Row(Salary=20000, count=1),
 Row(Salary=23000, count=1),
 Row(Salary=23500, count=1),
 Row(Salary=24000, count=2),
 Row(Salary=26000, count=1),
 Row(Salary=27000, count=2),
 Row(Salary=29000, count=1),
 Row(Salary=30000, count=3),
 Row(Salary=32000, count=1),
 Row(Salary=35000, count=1),
 Row(Salary=36000, count=1),
 Row(Salary=39500, count=1),
 Row(Salary=43000, count=1),
 Row(Salary=46000, count=1),
 Row(Salary=55000, count=1),
 Row(Salary=58000, count=1),
 Row(Salary=75000, count=1)]