# Aggregation Queries

Queries using aggregation functions together with the `GROUP BY` and `HAVING` clauses, and using `LEFT JOIN` in combination with `GROUP BY`.

In [1]:
import json
from urllib.parse import quote

import pymysql

# Register pymysql under the MySQLdb name so that code asking for the
# MySQLdb driver picks up pymysql instead.
pymysql.install_as_MySQLdb()

# Credentials are kept out of the notebook: cred.json must provide the
# "user", "password" and "host" keys.
with open('cred.json') as f:
    creds = json.load(f)

# Percent-encode the user name and password before embedding them in the URL.
# Characters such as "@", ":" or "/" are URL delimiters, so a raw password
# containing them would be parsed as part of the host or path.
# NOTE(review): assumes cred.json stores the credentials unencoded -- confirm.
connection_string = "mysql://{user}:{password}@{host}".format(
    user=quote(str(creds["user"]), safe=""),
    password=quote(str(creds["password"]), safe=""),
    host=creds["host"],
)

In [2]:
# Load the SQL magic extension used by the %sql / %%sql cells below.
%load_ext sql
# Turn on the extension's autocommit option.
%config SqlMagic.autocommit=True
# Connect using the connection string built in the previous cell.
%sql $connection_string

'Connected: dimitri@None'

### Increasing complexity of queries

In [3]:
%%sql

-- select shared_sales as the default database for the queries that follow

USE shared_sales

 * mysql://dimitri:***@db.data-science-ust.net
0 rows affected.


[]

## Aggregation functions MAX, MIN, AVG, SUM, COUNT

In [4]:
%%sql

-- no GROUP BY: the aggregation functions collapse the whole products
-- table into a single row

SELECT
    MAX(RetailPrice) AS max_price,
    AVG(RetailPrice) AS avg_price
FROM products

 * mysql://dimitri:***@db.data-science-ust.net
1 rows affected.


max_price,avg_price
1800.0,196.0335


In [5]:
%%sql

-- what not to do: ProductNumber is not aggregated while MAX and AVG are.
-- The result still has a single row, so the ProductNumber shown does not
-- identify the product that has the maximum price.

SELECT ProductNumber, MAX(RetailPrice) as max_price, AVG(RetailPrice) as avg_price 
FROM products

 * mysql://dimitri:***@db.data-science-ust.net
1 rows affected.


ProductNumber,max_price,avg_price
1,1800.0,196.0335


In [6]:
%%sql

-- aggregation with condition: average retail price of the products whose
-- category is described as Bikes; the subquery finds the matching CategoryID.
-- The string literal is single-quoted: double quotes denote identifiers in
-- standard SQL and in MySQL when the ANSI_QUOTES mode is enabled.

SELECT AVG(RetailPrice) AS avg_price
FROM products
WHERE CategoryID IN (
    SELECT CategoryID
    FROM categories
    WHERE CategoryDescription = 'Bikes')

 * mysql://dimitri:***@db.data-science-ust.net
1 rows affected.


avg_price
1321.25


In [7]:
%%sql

-- what not to do: SELECT * returns non-aggregated columns, but GROUP BY
-- keeps a single row per CategoryID, so every column other than CategoryID
-- shows the values of just one product from that category

SELECT *
FROM products
GROUP BY CategoryID


 * mysql://dimitri:***@db.data-science-ust.net
6 rows affected.


ProductNumber,ProductName,ProductDescription,RetailPrice,QuantityOnHand,CategoryID
3,Dog Ear Cyclecomputer,,75.0,20,1
1,Trek 9000 Mountain Bike,,1200.0,6,2
23,Ultra-Pro Rain Jacket,,85.0,30,3
4,Victoria Pro All Weather Tires,,54.95,20,4
39,Road Warrior Hitch Pack,,175.0,6,5
27,X-Pro All Weather Tires,,24.0,20,6


In [8]:
%%sql

-- one row per category: the average retail price of its products

SELECT
    p.CategoryID,
    AVG(p.RetailPrice) AS avg_price
FROM products AS p
GROUP BY p.CategoryID


 * mysql://dimitri:***@db.data-science-ust.net
6 rows affected.


CategoryID,avg_price
1,66.191667
2,1321.25
3,51.25
4,79.765556
5,177.5
6,29.0


In [9]:
%%sql

-- using in a subquery: the per-category averages (q) are joined with
-- categories to attach each category's description

SELECT *
FROM (
    SELECT CategoryID, AVG(RetailPrice) AS avg_price
    FROM products
    GROUP BY CategoryID
) AS q
NATURAL JOIN categories


 * mysql://dimitri:***@db.data-science-ust.net
6 rows affected.


CategoryID,avg_price,CategoryDescription
1,66.191667,Accessories
2,1321.25,Bikes
3,51.25,Clothing
4,79.765556,Components
5,177.5,Car racks
6,29.0,Tires


In [10]:
%%sql

-- keep only the categories whose average price exceeds 100;
-- HAVING filters the groups after the aggregation has been computed

SELECT
    CategoryID,
    AVG(RetailPrice) AS avg_price
FROM products
GROUP BY CategoryID
HAVING AVG(RetailPrice) > 100


 * mysql://dimitri:***@db.data-science-ust.net
2 rows affected.


CategoryID,avg_price
2,1321.25
5,177.5


In [11]:
%%sql

-- the aggregation runs in the subquery, so the outer query can filter
-- its result with an ordinary WHERE clause

SELECT q.CategoryID, q.avg_price
FROM (
    SELECT CategoryID, AVG(RetailPrice) AS avg_price
    FROM products
    GROUP BY CategoryID
) AS q
WHERE q.avg_price > 100

 * mysql://dimitri:***@db.data-science-ust.net
2 rows affected.


CategoryID,avg_price
2,1321.25
5,177.5


In [12]:
%%sql

-- row-level filter: WHERE tests each individual product, no aggregation

SELECT *
FROM products
WHERE RetailPrice > 100

 * mysql://dimitri:***@db.data-science-ust.net
13 rows affected.


ProductNumber,ProductName,ProductDescription,RetailPrice,QuantityOnHand,CategoryID
1,Trek 9000 Mountain Bike,,1200.0,6,2
2,Eagle FS-3 Mountain Bike,,1800.0,8,2
6,Viscount Mountain Bike,,635.0,5,2
11,GT RTS-2 Mountain Bike,,1650.0,5,2
14,Eagle SA-120 Clipless Pedals,,139.95,20,4
18,Viscount CardioSport Sport Watch,,179.0,12,1
25,King Cobra Helmet,,139.0,30,1
26,Glide-O-Matic Cycling Helmet,,125.0,24,1
36,Cosmic Elite Road Warrior Wheels,,165.0,22,4
37,AeroFlo ATB Wheels,,189.0,40,4


In [13]:
%%sql

-- orders that have no rows in order_details.
-- NOT EXISTS is used rather than NOT IN: if the subquery of a NOT IN ever
-- yields a NULL, the predicate is never true and no orders are returned.

SELECT *
FROM orders AS o
WHERE NOT EXISTS (
    SELECT 1
    FROM order_details AS d
    WHERE d.OrderNumber = o.OrderNumber)

 * mysql://dimitri:***@db.data-science-ust.net
11 rows affected.


OrderNumber,OrderDate,ShipDate,CustomerID,EmployeeID
198,2017-10-08,2017-10-10,1002,703
216,2017-10-12,2017-10-12,1016,707
305,2017-11-01,2017-11-05,1013,708
361,2017-11-12,2017-11-13,1016,706
484,2017-12-09,2017-12-10,1021,707
523,2017-12-15,2017-12-17,1003,704
629,2018-01-08,2018-01-12,1014,704
632,2018-01-08,2018-01-12,1001,706
689,2018-01-15,2018-01-16,1015,705
753,2018-01-28,2018-01-30,1013,701


In [14]:
%%sql

-- LEFT JOIN keeps every order: an order without matching order_details
-- rows still appears once, with NULLs in the order_details columns

SELECT *
FROM orders
NATURAL LEFT JOIN order_details

 * mysql://dimitri:***@db.data-science-ust.net
3984 rows affected.


OrderNumber,OrderDate,ShipDate,CustomerID,EmployeeID,ProductNumber,QuotedPrice,QuantityOrdered
1,2017-09-02,2017-09-05,1018,707,1.0,1200.0,2.0
1,2017-09-02,2017-09-05,1018,707,6.0,635.0,3.0
1,2017-09-02,2017-09-05,1018,707,11.0,1650.0,4.0
1,2017-09-02,2017-09-05,1018,707,16.0,28.0,1.0
1,2017-09-02,2017-09-05,1018,707,21.0,55.0,3.0
1,2017-09-02,2017-09-05,1018,707,26.0,121.25,5.0
1,2017-09-02,2017-09-05,1018,707,40.0,174.6,6.0
2,2017-09-02,2017-09-04,1001,703,27.0,24.0,4.0
2,2017-09-02,2017-09-04,1001,703,40.0,180.0,4.0
3,2017-09-02,2017-09-05,1002,707,1.0,1164.0,5.0


In [15]:
%%sql

-- bad because mixes aggregated and random: GROUP BY keeps one row per
-- OrderNumber, so the order_details columns (ProductNumber, QuotedPrice,
-- QuantityOrdered) show the values of only one line item of each order

SELECT * FROM orders  NATURAL LEFT JOIN order_details
GROUP BY OrderNumber

 * mysql://dimitri:***@db.data-science-ust.net
944 rows affected.


OrderNumber,OrderDate,ShipDate,CustomerID,EmployeeID,ProductNumber,QuotedPrice,QuantityOrdered
1,2017-09-02,2017-09-05,1018,707,1.0,1200.0,2.0
2,2017-09-02,2017-09-04,1001,703,27.0,24.0,4.0
3,2017-09-02,2017-09-05,1002,707,1.0,1164.0,5.0
4,2017-09-02,2017-09-04,1009,703,1.0,1200.0,4.0
5,2017-09-02,2017-09-02,1024,708,1.0,1200.0,4.0
6,2017-09-02,2017-09-06,1014,702,2.0,1746.0,5.0
7,2017-09-02,2017-09-05,1001,708,14.0,139.95,3.0
8,2017-09-02,2017-09-02,1003,703,16.0,28.0,2.0
9,2017-09-02,2017-09-05,1007,708,24.0,69.0,1.0
10,2017-09-02,2017-09-05,1012,701,1.0,1200.0,2.0


In [16]:
%%sql

-- orders without any line items: COUNT(ProductNumber) skips the NULLs that
-- the LEFT JOIN yields for an order with no order_details rows, so such an
-- order gets nitems = 0.
-- Selecting orders.* next to the aggregate is safe here provided OrderNumber
-- is the primary key of orders (assumed -- TODO confirm), because every
-- orders column is then determined by the grouping attribute.

SELECT orders.*, COUNT(ProductNumber) as nitems FROM orders  NATURAL LEFT JOIN order_details
GROUP BY OrderNumber
HAVING nitems = 0

 * mysql://dimitri:***@db.data-science-ust.net
11 rows affected.


OrderNumber,OrderDate,ShipDate,CustomerID,EmployeeID,nitems
198,2017-10-08,2017-10-10,1002,703,0
216,2017-10-12,2017-10-12,1016,707,0
305,2017-11-01,2017-11-05,1013,708,0
361,2017-11-12,2017-11-13,1016,706,0
484,2017-12-09,2017-12-10,1021,707,0
523,2017-12-15,2017-12-17,1003,704,0
629,2018-01-08,2018-01-12,1014,704,0
632,2018-01-08,2018-01-12,1001,706,0
689,2018-01-15,2018-01-16,1015,705,0
753,2018-01-28,2018-01-30,1013,701,0


In [17]:
%%sql

-- the rows as they look before any grouping: one row per line item, with
-- the raw ProductNumber placed in the nitems column

SELECT
    orders.*,
    ProductNumber AS nitems
FROM orders
NATURAL LEFT JOIN order_details

 * mysql://dimitri:***@db.data-science-ust.net
3984 rows affected.


OrderNumber,OrderDate,ShipDate,CustomerID,EmployeeID,nitems
1,2017-09-02,2017-09-05,1018,707,1.0
1,2017-09-02,2017-09-05,1018,707,6.0
1,2017-09-02,2017-09-05,1018,707,11.0
1,2017-09-02,2017-09-05,1018,707,16.0
1,2017-09-02,2017-09-05,1018,707,21.0
1,2017-09-02,2017-09-05,1018,707,26.0
1,2017-09-02,2017-09-05,1018,707,40.0
2,2017-09-02,2017-09-04,1001,703,27.0
2,2017-09-02,2017-09-04,1001,703,40.0
3,2017-09-02,2017-09-05,1002,707,1.0


## Example problems

In [18]:
%%sql

-- Customers and the number of orders that they made.
-- The LEFT JOIN keeps customers without orders; COUNT(OrderNumber) is 0
-- for them because it does not count NULLs.

SELECT
    CustomerID,
    CustFirstName,
    CustLastName,
    COUNT(OrderNumber) AS norders
FROM customers
NATURAL LEFT JOIN orders
GROUP BY CustomerID
ORDER BY norders ASC

 * mysql://dimitri:***@db.data-science-ust.net
28 rows affected.


CustomerID,CustFirstName,CustLastName,norders
1028,Jeffrey,Tirekicker,0
1022,Caleb,Viescas,20
1019,Zachary,Ehrlich,23
1023,Julia,Schnebly,25
1018,David,Smith,26
1006,John,Viescas,27
1015,Darren,Gehring,29
1008,Neil,Patterson,31
1010,Angel,Kennedy,32
1027,Luke,Patterson,33


In [19]:
%%sql

-- orders with their total amounts.
-- SUM over an order with no line items is NULL; IFNULL turns it into 0.

SELECT
    OrderNumber,
    OrderDate,
    IFNULL(SUM(QuotedPrice * QuantityOrdered), 0) AS total_amount
FROM orders
NATURAL LEFT JOIN order_details
GROUP BY OrderNumber
ORDER BY total_amount ASC

 * mysql://dimitri:***@db.data-science-ust.net
944 rows affected.


OrderNumber,OrderDate,total_amount
816,2018-02-09,0.0
216,2017-10-12,0.0
632,2018-01-08,0.0
305,2017-11-01,0.0
361,2017-11-12,0.0
689,2018-01-15,0.0
753,2018-01-28,0.0
523,2017-12-15,0.0
484,2017-12-09,0.0
629,2018-01-08,0.0


In [20]:
%%sql

-- customers and the total $$$ spent.
-- Subquery q: one row per order with its total amount.
-- Outer query: the order totals are summed per customer. The outer SUM is
-- wrapped in IFNULL because a customer without orders has no row in q, and
-- SUM would then yield NULL instead of 0.

SELECT CustomerID, CustLastName, IFNULL(SUM(total_amount), 0) AS money_spent
FROM customers NATURAL LEFT JOIN (
    SELECT CustomerID, IFNULL(SUM(QuotedPrice * QuantityOrdered), 0) AS total_amount
    FROM orders NATURAL LEFT JOIN order_details
    GROUP BY OrderNumber) AS q
GROUP BY CustomerID
ORDER BY money_spent DESC


 * mysql://dimitri:***@db.data-science-ust.net
28 rows affected.


CustomerID,CustLastName,money_spent
1012,Keyser,262174.17
1017,Seidel,258544.43
1005,McCrae,248250.81
1006,Viescas,238622.16
1025,Patterson,229550.43
1013,Patterson,225582.11
1004,Brown,222916.6
1002,Thompson,220811.99
1014,Abolrous,217587.07
1020,Bonnicksen,211354.3


In [21]:
%%sql

-- customers and the total $$$ spent, computed with a single aggregation:
-- every customer is left-joined with the line items of their orders and
-- the line totals are summed per customer (0 when there are none)

SELECT
    CustomerID,
    CustLastName,
    IFNULL(SUM(QuotedPrice * QuantityOrdered), 0) AS money_spent
FROM customers
NATURAL LEFT JOIN (
    SELECT *
    FROM orders
    NATURAL JOIN order_details
) AS q
GROUP BY CustomerID
ORDER BY money_spent DESC

 * mysql://dimitri:***@db.data-science-ust.net
28 rows affected.


CustomerID,CustLastName,money_spent
1012,Keyser,262174.17
1017,Seidel,258544.43
1005,McCrae,248250.81
1006,Viescas,238622.16
1025,Patterson,229550.43
1013,Patterson,225582.11
1004,Brown,222916.6
1002,Thompson,220811.99
1014,Abolrous,217587.07
1020,Bonnicksen,211354.3


# Summary of principles 
1. Without a `GROUP BY`, aggregation functions collapse the table into a single row.
2. With `GROUP BY`, the grouping attributes become the new primary key of the result.  
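3. Do not mix aggregated and non-aggregated values in the result, with or without a `GROUP BY`. The only non-aggregated values that are safe to select are the grouping attributes and attributes that are fully determined by them.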
4. `HAVING` plays the same role as the `WHERE` clause in a nesting outer query so it can use the output of the aggregation functions.
5. `LEFT JOIN` is often followed by a `GROUP BY` on the primary key attributes of the left table. In this scenario, the matching rows of the right table are aggregated for each row of the left table.
