In [1]:
import sqlite3
import pandas as pd

In [2]:
# Show every column when a DataFrame is rendered, so wide query results are not elided.
pd.set_option("display.max_columns", None)
# display.max_rows is left at its default, so long results are still abbreviated.

In [3]:
# Path of the SQLite database file used by this notebook.
DB_PATH = 'parch-and-posey.db'

# One shared connection, reused by the query cells below.
conn = sqlite3.connect(DB_PATH)

In [4]:
# List every table recorded in the database's schema catalog (sqlite_master).
cursor = conn.cursor()
try:
    # 'table' is single-quoted: in SQL, double quotes denote identifiers, and
    # SQLite only treats "table" as a string through a legacy fallback that
    # can be disabled at compile time.
    cursor.execute('''
select * from sqlite_master where type = 'table';
''')
    # cursor.description has one entry per result column; item 0 is its name.
    columns = [col[0] for col in cursor.description]
    data = cursor.fetchall()
finally:
    # Release the cursor even if the query or the fetch raises.
    cursor.close()

In [5]:
# Render the fetched catalog rows as a table, labelled with the cursor's column names.
pd.DataFrame(data=data, columns=columns)

Unnamed: 0,type,name,tbl_name,rootpage,sql
0,table,web_events,web_events,2,"CREATE TABLE web_events (\tid integer,\taccoun..."
1,table,sales_reps,sales_reps,7,"CREATE TABLE sales_reps (\tid integer,\tname b..."
2,table,region,region,222,"CREATE TABLE region (\tid integer,\tname bpchar)"
3,table,orders,orders,223,"CREATE TABLE orders (\tid integer,\taccount_id..."
4,table,accounts,accounts,583,"CREATE TABLE accounts (\tid integer,\tname bpc..."


In [9]:
# First look at the orders table: every column of every row.
orders_sql = '''
SELECT *
FROM orders;
'''
pd.read_sql_query(orders_sql, conn)

Unnamed: 0,id,account_id,occurred_at,standard_qty,gloss_qty,poster_qty,total,standard_amt_usd,gloss_amt_usd,poster_amt_usd,total_amt_usd
0,1,1001,2015-10-06 17:31:14,123,22,24,169,613.77,164.78,194.88,973.43
1,2,1001,2015-11-05 03:34:33,190,41,57,288,948.10,307.09,462.84,1718.03
2,3,1001,2015-12-04 04:21:55,85,47,0,132,424.15,352.03,0.00,776.18
3,4,1001,2016-01-02 01:18:24,144,32,0,176,718.56,239.68,0.00,958.24
4,5,1001,2016-02-01 19:27:27,108,29,28,165,538.92,217.21,227.36,983.49
...,...,...,...,...,...,...,...,...,...,...,...
6907,6908,4501,2016-06-29 04:03:39,11,199,59,269,54.89,1490.51,479.08,2024.48
6908,6909,4501,2016-07-29 19:58:32,5,91,96,192,24.95,681.59,779.52,1486.06
6909,6910,4501,2016-08-27 00:58:11,16,94,82,192,79.84,704.06,665.84,1449.74
6910,6911,4501,2016-11-22 06:52:22,63,67,81,211,314.37,501.83,657.72,1473.92


# GROUP BY

GROUP BY can be used to aggregate data within subsets of the data. For example, you can group by account, by region, or by sales representative.


Any column in the SELECT statement that is not inside an aggregate function (such as AVG or MIN) must appear in the GROUP BY clause.


The GROUP BY always goes between WHERE and ORDER BY.


ORDER BY works like SORT in spreadsheet software.



In [15]:
# Average quantity of each paper type per account, one row per account_id,
# sorted by account_id.
avg_qty_sql = '''
SELECT account_id, AVG(standard_qty), AVG(gloss_qty), AVG(poster_qty)
FROM orders
GROUP BY account_id
ORDER BY account_id;
'''
pd.read_sql_query(avg_qty_sql, conn)

Unnamed: 0,account_id,AVG(standard_qty),AVG(gloss_qty),AVG(poster_qty)
0,1001,282.000000,279.678571,114.178571
1,1011,527.000000,14.000000,0.000000
2,1021,315.200000,48.300000,17.500000
3,1031,1148.000000,0.000000,215.000000
4,1041,167.200000,154.000000,129.200000
...,...,...,...,...
345,4461,194.490909,229.927273,155.345455
346,4471,208.500000,25.500000,23.500000
347,4481,90.000000,56.600000,129.400000
348,4491,221.068966,231.103448,127.344828


# GROUP BY - Expert Tip

Before we dive deeper into aggregations using GROUP BY statements, it is worth noting that SQL evaluates the aggregations before the LIMIT clause. If you don’t group by any columns, you’ll get a 1-row result—no problem there. If you group by a column with enough unique values that it exceeds the LIMIT number, the aggregates will be calculated, and then some rows will simply be omitted from the results.

This is actually a nice way to do things because you know you’re going to get the correct aggregates. If SQL cut the table down to 100 rows and then performed the aggregations, your results would be substantially different. The above query’s results exceed 100 rows, so it’s a perfect example. In the next concept, use the SQL environment to try removing the LIMIT and running it again to see what changes.

In [21]:
# Per-account averages with the account name pulled in through the join.
# MIN(name) wraps the name in an aggregate so it can be selected without being
# listed in GROUP BY. Rows are sorted with the highest AVG(total) first.
avg_qty_named_sql = '''
SELECT account_id, AVG(standard_qty), AVG(gloss_qty), AVG(poster_qty), AVG(total), MIN(name)
FROM orders o
JOIN accounts a
ON o.account_id = a.id
GROUP BY account_id
ORDER BY AVG(total) DESC;
'''
pd.read_sql_query(avg_qty_named_sql, conn)

Unnamed: 0,account_id,AVG(standard_qty),AVG(gloss_qty),AVG(poster_qty),AVG(total),MIN(name)
0,4251,335.692308,30.307692,2184.461538,2550.461538,Pacific Life
1,2441,1878.285714,285.714286,167.428571,2331.428571,Kohl's
2,1341,1891.777778,235.222222,150.444444,2277.444444,State Farm Insurance Cos.
3,4101,404.000000,16.125000,1430.625000,1850.750000,Fidelity National Financial
4,1031,1148.000000,0.000000,215.000000,1363.000000,Berkshire Hathaway
...,...,...,...,...,...,...
345,1671,149.000000,9.000000,6.000000,164.000000,Delta Air Lines
346,4321,123.000000,0.000000,33.000000,156.000000,Level 3 Communications
347,3271,94.500000,21.500000,22.250000,138.250000,J.C. Penney
348,1751,113.000000,6.500000,5.166667,124.666667,Massachusetts Mutual Life Insurance


In [22]:
# Same grouped query, capped with LIMIT 10. The aggregates are computed over all
# orders first; LIMIT only trims the sorted result to its first ten rows.
top10_avg_total_sql = '''
SELECT account_id, AVG(standard_qty), AVG(gloss_qty), AVG(poster_qty), AVG(total), MIN(name)
FROM orders o
JOIN accounts a
ON o.account_id = a.id
GROUP BY account_id
ORDER BY AVG(total) DESC
LIMIT 10;
'''
pd.read_sql_query(top10_avg_total_sql, conn)

Unnamed: 0,account_id,AVG(standard_qty),AVG(gloss_qty),AVG(poster_qty),AVG(total),MIN(name)
0,4251,335.692308,30.307692,2184.461538,2550.461538,Pacific Life
1,2441,1878.285714,285.714286,167.428571,2331.428571,Kohl's
2,1341,1891.777778,235.222222,150.444444,2277.444444,State Farm Insurance Cos.
3,4101,404.0,16.125,1430.625,1850.75,Fidelity National Financial
4,1031,1148.0,0.0,215.0,1363.0,Berkshire Hathaway
5,1111,178.0,262.25,841.5,1281.75,AmerisourceBergen
6,3451,756.6,25.2,423.4,1205.2,Edison International
7,3021,297.0,32.0,853.0,1182.0,CBS
8,2581,298.0,391.333333,370.0,1059.333333,CenturyLink
9,2451,164.0,313.5,526.0,1003.5,Starbucks
