# Chapter 7 - Aggregations

In [2]:
from pyspark.sql import SparkSession

# Obtain (or reuse) the session for this chapter, running against a local master.
spark = (
    SparkSession.builder
    .master("local")
    .appName("chapter7")
    .getOrCreate()
)

In [4]:
# Load every retail CSV with a header row and inferred column types, then
# reduce the result to 5 partitions.
df = (
    spark.read.format("csv")
    .option("header", "true")
    .option("inferSchema", "true")
    .load("../data/retail-data/all/*.csv")
    .coalesce(5)
)
# Mark the frame for caching; the call's return value is echoed as the cell result.
df.cache()

DataFrame[InvoiceNo: string, StockCode: string, Description: string, Quantity: int, InvoiceDate: string, UnitPrice: double, CustomerID: int, Country: string]

In [6]:
# Print the inferred schema; note that InvoiceNo and InvoiceDate come back as strings.
df.printSchema()

root
 |-- InvoiceNo: string (nullable = true)
 |-- StockCode: string (nullable = true)
 |-- Description: string (nullable = true)
 |-- Quantity: integer (nullable = true)
 |-- InvoiceDate: string (nullable = true)
 |-- UnitPrice: double (nullable = true)
 |-- CustomerID: integer (nullable = true)
 |-- Country: string (nullable = true)



In [7]:
# Peek at the first five rows of the raw data.
df.show(n=5)

+---------+---------+--------------------+--------+--------------+---------+----------+--------------+
|InvoiceNo|StockCode|         Description|Quantity|   InvoiceDate|UnitPrice|CustomerID|       Country|
+---------+---------+--------------------+--------+--------------+---------+----------+--------------+
|   536365|   85123A|WHITE HANGING HEA...|       6|12/1/2010 8:26|     2.55|     17850|United Kingdom|
|   536365|    71053| WHITE METAL LANTERN|       6|12/1/2010 8:26|     3.39|     17850|United Kingdom|
|   536365|   84406B|CREAM CUPID HEART...|       8|12/1/2010 8:26|     2.75|     17850|United Kingdom|
|   536365|   84029G|KNITTED UNION FLA...|       6|12/1/2010 8:26|     3.39|     17850|United Kingdom|
|   536365|   84029E|RED WOOLLY HOTTIE...|       6|12/1/2010 8:26|     3.39|     17850|United Kingdom|
+---------+---------+--------------------+--------+--------------+---------+----------+--------------+
only showing top 5 rows



## Aggregation functions


In [11]:
# DataFrame.count() is an action: it runs immediately and returns the number of rows.
df.count()

541909

In [10]:
from pyspark.sql.functions import count

# The count() *function* is a column expression, so select(...) here is only a
# transformation; nothing is computed until show() is called.
(
    df
    .select(count("StockCode"))
    .show()
)

+----------------+
|count(StockCode)|
+----------------+
|          541909|
+----------------+



In [12]:
from pyspark.sql.functions import countDistinct

# Exact number of distinct stock codes.
(
    df
    .select(countDistinct("StockCode"))
    .show()
)

+-------------------------+
|count(DISTINCT StockCode)|
+-------------------------+
|                     4070|
+-------------------------+



In [14]:
# get an approximation (less accurate but faster)
from pyspark.sql.functions import approx_count_distinct

# Approximate distinct count; the second argument (0.1) is the maximum relative
# standard deviation allowed. The recorded result (3364) is noticeably below the
# exact distinct count shown earlier (4070).
(
    df
    .select(approx_count_distinct("StockCode", 0.1))
    .show()
)

+--------------------------------+
|approx_count_distinct(StockCode)|
+--------------------------------+
|                            3364|
+--------------------------------+



In [15]:
# DataFrame.first() is an action: it runs immediately and returns the first Row.
df.first()

Row(InvoiceNo='536365', StockCode='85123A', Description='WHITE HANGING HEART T-LIGHT HOLDER', Quantity=6, InvoiceDate='12/1/2010 8:26', UnitPrice=2.55, CustomerID=17850, Country='United Kingdom')

In [16]:
from pyspark.sql.functions import first, last

# first()/last() as aggregate column expressions (unlike the df.first() action).
(
    df
    .select(
        first("StockCode"),
        last("StockCode"),
    )
    .show()
)

+-----------------------+----------------------+
|first(StockCode, false)|last(StockCode, false)|
+-----------------------+----------------------+
|                 85123A|                 22138|
+-----------------------+----------------------+



In [17]:
from pyspark.sql.functions import min, max

# Smallest and largest Quantity. `min`/`max` here are the pyspark.sql.functions
# versions imported in this cell, which shadow the Python builtins.
(
    df
    .select(
        min("Quantity"),
        max("Quantity"),
    )
    .show()
)

+-------------+-------------+
|min(Quantity)|max(Quantity)|
+-------------+-------------+
|       -80995|        80995|
+-------------+-------------+



In [19]:
from pyspark.sql.functions import sum

# Total Quantity over all rows. `sum` is the pyspark.sql.functions version
# imported in this cell, which shadows the Python builtin from here on.
(
    df
    .select(sum("Quantity"))
    .show()
)

+-------------+
|sum(Quantity)|
+-------------+
|      5176450|
+-------------+



In [20]:
from pyspark.sql.functions import sumDistinct

# Sum over the DISTINCT Quantity values only (each value contributes once).
(
    df
    .select(sumDistinct("Quantity"))
    .show()
)

+----------------------+
|sum(DISTINCT Quantity)|
+----------------------+
|                 29310|
+----------------------+



In [22]:
from pyspark.sql.functions import avg

# Mean Quantity per row.
(
    df
    .select(avg("Quantity"))
    .show()
)

+----------------+
|   avg(Quantity)|
+----------------+
|9.55224954743324|
+----------------+



In [23]:
from pyspark.sql.functions import var_pop, var_samp, stddev_pop, stddev_samp

# Population and sample variants of variance and standard deviation of Quantity,
# computed side by side for comparison.
(
    df
    .select(
        var_pop("Quantity"),
        var_samp("Quantity"),
        stddev_pop("Quantity"),
        stddev_samp("Quantity"),
    )
    .show()
)

+-----------------+------------------+--------------------+---------------------+
|var_pop(Quantity)|var_samp(Quantity)|stddev_pop(Quantity)|stddev_samp(Quantity)|
+-----------------+------------------+--------------------+---------------------+
|47559.30364660879| 47559.39140929848|  218.08095663447733|   218.08115785023355|
+-----------------+------------------+--------------------+---------------------+



In [25]:
from pyspark.sql.functions import skewness, kurtosis

# Shape statistics of the Quantity distribution.
(
    df
    .select(
        skewness("Quantity"),
        kurtosis("Quantity"),
    )
    .show()
)

+------------------+------------------+
|skewness(Quantity)|kurtosis(Quantity)|
+------------------+------------------+
|-0.264075576105298|119768.05495534067|
+------------------+------------------+



In [26]:
from pyspark.sql.functions import covar_pop, covar_samp, corr

# Correlation and sample/population covariance between InvoiceNo and Quantity.
# NOTE(review): the schema reports InvoiceNo as a string, and some values carry a
# "C" prefix (e.g. C539947); presumably those cannot be treated as numbers and are
# left out of these statistics. Confirm before relying on the figures.
(
    df
    .select(
        corr("InvoiceNo", "Quantity"),
        covar_samp("InvoiceNo", "Quantity"),
        covar_pop("InvoiceNo", "Quantity"),
    )
    .show()
)

+-------------------------+-------------------------------+------------------------------+
|corr(InvoiceNo, Quantity)|covar_samp(InvoiceNo, Quantity)|covar_pop(InvoiceNo, Quantity)|
+-------------------------+-------------------------------+------------------------------+
|     4.912186085636837E-4|             1052.7280543912716|            1052.7260778751674|
+-------------------------+-------------------------------+------------------------------+



In [27]:
from pyspark.sql.functions import collect_set, collect_list

# Gather Country values into one array of unique values (collect_set) and one
# array that keeps duplicates (collect_list).
(
    df
    .agg(
        collect_set("Country"),
        collect_list("Country"),
    )
    .show()
)

+--------------------+---------------------+
|collect_set(Country)|collect_list(Country)|
+--------------------+---------------------+
|[Portugal, Italy,...| [United Kingdom, ...|
+--------------------+---------------------+



In [28]:
# Grouping alone computes nothing: it just yields a GroupedData handle that the
# aggregation is then specified on.
df.groupBy("InvoiceNo", "CustomerId")

<pyspark.sql.group.GroupedData at 0x110e60588>

In [29]:
# Number of rows for each (InvoiceNo, CustomerId) pair.
(
    df
    .groupBy("InvoiceNo", "CustomerId")
    .count()
    .show()
)

+---------+----------+-----+
|InvoiceNo|CustomerId|count|
+---------+----------+-----+
|   536846|     14573|   76|
|   537026|     12395|   12|
|   537883|     14437|    5|
|   538068|     17978|   12|
|   538279|     14952|    7|
|   538800|     16458|   10|
|   538942|     17346|   12|
|  C539947|     13854|    1|
|   540096|     13253|   16|
|   540530|     14755|   27|
|   541225|     14099|   19|
|   541978|     13551|    4|
|   542093|     17677|   16|
|   543188|     12567|   63|
|   543590|     17377|   19|
|  C543757|     13115|    1|
|  C544318|     12989|    1|
|   544578|     12365|    1|
|   545165|     16339|   20|
|   545289|     14732|   30|
+---------+----------+-----+
only showing top 20 rows



In [31]:
from pyspark.sql.functions import count, expr

# The same per-invoice count expressed twice: once through the count() function
# (aliased "quan") and once as a SQL expression string.
(
    df
    .groupBy("InvoiceNo")
    .agg(
        count("Quantity").alias("quan"),
        expr("count(Quantity)"),
    )
    .show()
)

+---------+----+---------------+
|InvoiceNo|quan|count(Quantity)|
+---------+----+---------------+
|   536596|   6|              6|
|   536938|  14|             14|
|   537252|   1|              1|
|   537691|  20|             20|
|   538041|   1|              1|
|   538184|  26|             26|
|   538517|  53|             53|
|   538879|  19|             19|
|   539275|   6|              6|
|   539630|  12|             12|
|   540499|  24|             24|
|   540540|  22|             22|
|  C540850|   1|              1|
|   540976|  48|             48|
|   541432|   4|              4|
|   541518| 101|            101|
|   541783|  35|             35|
|   542026|   9|              9|
|   542375|   6|              6|
|  C542604|   8|              8|
+---------+----+---------------+
only showing top 20 rows



## Window functions

In [36]:
from pyspark.sql.functions import col, to_date

# Derive a proper `date` column from the InvoiceDate strings (e.g. "12/1/2010 8:26").
# The day is visibly not zero-padded in the data and the month is presumably not
# either, so the pattern uses single-letter "M" and "d". A two-letter "MM" requires
# exactly two digits under Spark 3's datetime parser and would fail or produce nulls
# for single-digit months, whereas "M" accepts both 1- and 2-digit months.
dfWithDate = df.withColumn("date", to_date(col("InvoiceDate"), "M/d/yyyy H:mm"))
dfWithDate.select("date").show(1)

+----------+
|      date|
+----------+
|2010-12-01|
+----------+
only showing top 1 row



In [38]:
from pyspark.sql.window import Window
from pyspark.sql.functions import desc

# Window per (customer, day), ordered by Quantity descending, with a frame running
# from the start of the partition up to and including the current row.
windowSpec = (
    Window.partitionBy("CustomerId", "date")
    .orderBy(desc("Quantity"))
    .rowsBetween(Window.unboundedPreceding, Window.currentRow)
)
windowSpec

<pyspark.sql.window.WindowSpec at 0x110e992b0>

In [39]:
from pyspark.sql.functions import max

# Running maximum of Quantity evaluated over windowSpec (a column expression,
# not yet attached to any DataFrame).
maxPurchaseQuantity = (
    max(col("Quantity"))
    .over(windowSpec)
)

In [40]:
# Echo the Column to inspect the window expression it was built from.
maxPurchaseQuantity

Column<b'max(Quantity) OVER (PARTITION BY CustomerId, date ORDER BY Quantity DESC NULLS LAST ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW)'>

In [50]:
from pyspark.sql.functions import dense_rank, rank

# Two rankings over the same window. In the recorded output, rank() leaves gaps
# after ties (3, 3, 3, 6) while dense_rank() does not (3, 3, 3, 4).
purchaseRank = rank().over(windowSpec)
purchaseDenseRank = dense_rank().over(windowSpec)

In [52]:
# Show each purchase alongside its rank, dense rank and running max quantity
# within the customer/day window.
# NOTE(review): the "quality..." aliases look like a typo for "quantity..." since
# the window is ordered by Quantity; kept as-is to match the recorded output.
(
    dfWithDate
    .select(
        col("CustomerId"),
        col("date"),
        col("Quantity"),
        purchaseRank.alias("qualityRank"),
        purchaseDenseRank.alias("qualityDenseRank"),
        maxPurchaseQuantity.alias("maxPurchaseQuantity"),
    )
    .show()
)

+----------+----------+--------+-----------+----------------+-------------------+
|CustomerId|      date|Quantity|qualityRank|qualityDenseRank|maxPurchaseQuantity|
+----------+----------+--------+-----------+----------------+-------------------+
|     12477|2011-04-14|     100|          1|               1|                100|
|     12477|2011-04-14|      72|          2|               2|                100|
|     12477|2011-04-14|      36|          3|               3|                100|
|     12477|2011-04-14|      36|          3|               3|                100|
|     12477|2011-04-14|      36|          3|               3|                100|
|     12477|2011-04-14|      24|          6|               4|                100|
|     12477|2011-04-14|      24|          6|               4|                100|
|     12477|2011-04-14|      24|          6|               4|                100|
|     12477|2011-04-14|      20|          9|               5|                100|
|     12477|2011

## Rollups

In [54]:
# In rollup output a null in a grouping column denotes a subtotal (the all-null
# row is the grand total), so source rows whose own date/Country is null would be
# indistinguishable from those subtotals and must be removed first.
# DataFrame.drop() with no arguments drops nothing; na.drop is what removes rows
# containing nulls. It is restricted here to the two grouping columns so rows
# with nulls in unrelated columns still contribute to the sums.
dfNotNull = dfWithDate.na.drop(subset=["date", "Country"])

# `sum` is the pyspark.sql.functions aggregate imported earlier in the notebook,
# not the Python builtin.
rolledUpDf = (
    dfNotNull
    .rollup("Date", "Country")
    .agg(sum("Quantity"))
    .orderBy("Date")
)
rolledUpDf.show()

+----------+--------------+-------------+
|      Date|       Country|sum(Quantity)|
+----------+--------------+-------------+
|      null|          null|      5176450|
|2010-12-01|          null|        26814|
|2010-12-01|     Australia|          107|
|2010-12-01|        France|          449|
|2010-12-01|United Kingdom|        23949|
|2010-12-01|          EIRE|          243|
|2010-12-01|        Norway|         1852|
|2010-12-01|   Netherlands|           97|
|2010-12-01|       Germany|          117|
|2010-12-02|       Germany|          146|
|2010-12-02|          EIRE|            4|
|2010-12-02|          null|        21023|
|2010-12-02|United Kingdom|        20873|
|2010-12-03|        Poland|          140|
|2010-12-03|        France|          239|
|2010-12-03|       Germany|          170|
|2010-12-03|      Portugal|           65|
|2010-12-03|         Spain|          400|
|2010-12-03|   Switzerland|          110|
|2010-12-03|       Belgium|          528|
+----------+--------------+-------

In [58]:
# Plain groupBy for comparison with the rollup: one row per (Date, Country) and
# no subtotal rows.
# NOTE(review): sum() without arguments totals every numeric column, which is why
# sum(UnitPrice) and a meaningless sum(CustomerID) also appear in the output.
(
    dfNotNull
    .groupBy("Date", "Country")
    .sum()
    .show()
)

+----------+--------------+-------------+------------------+---------------+
|      Date|       Country|sum(Quantity)|    sum(UnitPrice)|sum(CustomerID)|
+----------+--------------+-------------+------------------+---------------+
|2011-02-14|          EIRE|         1502|            339.69|        1075856|
|2011-02-23|        France|          362|             49.33|         261177|
|2011-04-04|         Japan|         -624|              2.55|          12755|
|2011-06-05|       Germany|          529|177.08999999999997|         716690|
|2011-07-24|     Australia|          214| 57.56999999999999|         248620|
|2011-10-18|          EIRE|          625|193.95999999999998|         743096|
|2010-12-03|        France|          239| 89.88000000000002|         291714|
|2011-06-09|        Sweden|          188|38.629999999999995|         137313|
|2011-06-12|       Germany|          345|             37.21|         162708|
|2011-06-22|        Norway|            1|             700.0|          12432|