In [1]:
import sqlite3
import pandas as pd

In [2]:
pd.options.display.max_columns = None
#pd.options.display.max_rows = None

In [3]:
conn = sqlite3.connect('parch-and-posey.db')

In [4]:
cursor = conn.cursor()
cursor.execute('''
select * from sqlite_master where type = "table";
''')
columns = [col[0] for col in cursor.description]
data = cursor.fetchall()
cursor.close()

In [5]:
pd.DataFrame(data, columns=columns)

Unnamed: 0,type,name,tbl_name,rootpage,sql
0,table,web_events,web_events,2,"CREATE TABLE web_events (\tid integer,\taccoun..."
1,table,sales_reps,sales_reps,92,"CREATE TABLE sales_reps (\tid integer,\tname b..."
2,table,region,region,93,"CREATE TABLE region (\tid integer,\tname bpchar)"
3,table,orders,orders,94,"CREATE TABLE orders (\tid integer,\taccount_id..."
4,table,accounts,accounts,221,"CREATE TABLE accounts (\tid integer,\tname bpc..."


# Performance tuning

One way to make a query run faster is to reduce the number of calculations that need to be performed. Some of the high-level things that will affect the number of calculations a given query will make include:

Table size
Joins
Aggregations
Query runtime is also dependent on some things that you can’t really control related to the database itself:

Other users running queries concurrently on the database
Database software and optimization (e.g., Postgres is optimized differently than Redshift)

QUIZ QUESTION

Select all of the following statements that are true about tuning performance with LIMIT:

- If you have time series data, limiting to a small time window can make your queries run more quickly.

- Testing your queries on a subset of data, finalizing your query, then removing the subset limitation is a sound strategy.

- When working with subqueries, limiting the amount of data you’re working with in the place where it will be executed first will have the maximum impact on query run time.

Expert Tip

If you’d like to understand this a little better, you can do some extra research on cartesian products. It’s also worth noting that the FULL JOIN and COUNT above actually runs pretty fast—it’s the COUNT(DISTINCT) that takes forever.

In [14]:
# sqlite version

pd.read_sql_query(sql = '''
SELECT strftime('%Y-%m-%d', o.occurred_at) AS date,
COUNT(DISTINCT a.sales_rep_id) AS active_sales_reps,
COUNT(DISTINCT o.id) AS orders,
COUNT(DISTINCT we.id) AS web_visits
FROM accounts a
JOIN orders o
ON o.account_id = a.id
JOIN web_events we
ON strftime('%Y-%m-%d', we.occurred_at) = strftime('%Y-%m-%d', o.occurred_at)
GROUP BY 1
ORDER BY 1 DESC
;
''', con=conn)

Unnamed: 0,date,active_sales_reps,orders,web_visits
0,2017-01-01,13,24,31
1,2016-12-31,15,26,27
2,2016-12-30,7,11,18
3,2016-12-29,7,11,19
4,2016-12-28,12,22,31
...,...,...,...,...
1054,2013-12-09,2,3,5
1055,2013-12-08,5,8,10
1056,2013-12-06,4,7,9
1057,2013-12-05,1,2,2


In [16]:
# sqlite version - table is big without aggregate - 79083 rows × 4 columns

pd.read_sql_query(sql = '''
SELECT o.occurred_at AS date,
a.sales_rep_id AS active_sales_reps,
o.id AS orders,
we.id AS web_visits
FROM accounts a
JOIN orders o
ON o.account_id = a.id
JOIN web_events we
ON strftime('%Y-%m-%d', we.occurred_at) = strftime('%Y-%m-%d', o.occurred_at)
ORDER BY 1 DESC
;
''', con=conn)

Unnamed: 0,date,active_sales_reps,orders,web_visits
0,2017-01-01 23:50:16,321940,3546,398
1,2017-01-01 23:50:16,321940,3546,412
2,2017-01-01 23:50:16,321940,3546,655
3,2017-01-01 23:50:16,321940,3546,1121
4,2017-01-01 23:50:16,321940,3546,1399
...,...,...,...,...
79078,2013-12-04 04:45:54,321820,2415,8825
79079,2013-12-04 04:22:44,321820,5786,2471
79080,2013-12-04 04:22:44,321820,5786,4193
79081,2013-12-04 04:22:44,321820,5786,6994


In [27]:
# sqlite version - aggregate the tables separately

pd.read_sql_query(sql = '''
SELECT COALESCE(orders.date, web_events.date),
COUNT(DISTINCT orders.active_sales_reps) AS active_sales_reps,
COUNT(DISTINCT orders.orders) AS orders,
COUNT(DISTINCT web_events.web_visits) AS web_visits
FROM (SELECT strftime('%Y-%m-%d', o.occurred_at) AS date,
COUNT(a.sales_rep_id) AS active_sales_reps,
COUNT(o.id) AS orders
FROM accounts a
JOIN orders o
ON o.account_id = a.id
GROUP BY 1) orders
JOIN
(SELECT strftime('%Y-%m-%d', we.occurred_at) AS date,
COUNT(we.id) AS web_visits
FROM web_events we
GROUP BY 1) web_events
ON orders.date = web_events.date
GROUP BY 1
ORDER BY 1 DESC

;
''', con=conn)

Unnamed: 0,"COALESCE(orders.date, web_events.date)",active_sales_reps,orders,web_visits
0,2017-01-01,1,1,1
1,2016-12-31,1,1,1
2,2016-12-30,1,1,1
3,2016-12-29,1,1,1
4,2016-12-28,1,1,1
...,...,...,...,...
1054,2013-12-09,1,1,1
1055,2013-12-08,1,1,1
1056,2013-12-06,1,1,1
1057,2013-12-05,1,1,1
