In [1]:
import sqlite3
import pandas as pd

In [2]:
# Display configuration: never truncate the columns of a wide DataFrame.
# Row truncation is left at the pandas default (display.max_rows is not changed).
pd.set_option("display.max_columns", None)

In [3]:
# Open the SQLite database file that the queries in this notebook read from.
DB_PATH = 'parch-and-posey.db'
conn = sqlite3.connect(DB_PATH)

In [4]:
# List every table recorded in the database's schema catalogue (sqlite_master).
# Leaves `columns` (result column names) and `data` (result rows) defined for
# later cells.
cursor = conn.cursor()
try:
    # 'table' is single-quoted on purpose: that is the SQL string-literal syntax.
    # Double quotes denote identifiers; SQLite only treats them as strings via a
    # legacy fallback that can be disabled when the library is built.
    cursor.execute("""
        SELECT * FROM sqlite_master WHERE type = 'table';
    """)
    # Each entry of cursor.description describes one result column; item 0 is its name.
    columns = [col[0] for col in cursor.description]
    data = cursor.fetchall()
finally:
    # Release the cursor even if the query or the fetch raises.
    cursor.close()

In [5]:
# Show the schema rows as a table, labelled with the query's column names.
pd.DataFrame(data=data, columns=columns)

Unnamed: 0,type,name,tbl_name,rootpage,sql
0,table,web_events,web_events,2,"CREATE TABLE web_events (\tid integer,\taccoun..."
1,table,sales_reps,sales_reps,92,"CREATE TABLE sales_reps (\tid integer,\tname b..."
2,table,region,region,93,"CREATE TABLE region (\tid integer,\tname bpchar)"
3,table,orders,orders,94,"CREATE TABLE orders (\tid integer,\taccount_id..."
4,table,accounts,accounts,221,"CREATE TABLE accounts (\tid integer,\tname bpc..."


# Percentiles with Partitions

You can combine partitions with percentiles to compute the percentile of a row within a specific subset of all rows. Imagine you're an analyst at Parch & Posey and you want to identify the largest orders (in terms of quantity) that a specific customer has made, so that you can encourage them to place more orders of a similar size. In that case you only want to compute the NTILE within that customer's account_id.

Task 1:

Use the NTILE functionality to divide the orders for each account into 4 levels in terms of the amount of standard_qty in each order. Your resulting table should have the account_id, the occurred_at time for each order, the amount of standard_qty paper purchased in that order, and one of four levels in a standard_quartile column.

In [14]:
# Task 1: quartile of each order's standard_qty within its own account.
# The outer ORDER BY pins the row order; without it SQL leaves the order of the
# result undefined, so the 50 rows previewed here could change between runs.
# NOTE(review): the task text names this column standard_quartile; the existing
# alias is kept so the column header of previously rendered output stays valid.
pd.read_sql_query(sql='''
SELECT account_id,
       occurred_at,
       standard_qty,
       NTILE(4) OVER (PARTITION BY account_id ORDER BY standard_qty) AS quartile_standard_qty
FROM orders
ORDER BY account_id, standard_qty;
''', con=conn)[:50]

Unnamed: 0,account_id,occurred_at,standard_qty,quartile_standard_qty
0,1001,2015-12-04 04:21:55,85,1
1,1001,2016-05-31 21:22:48,91,1
2,1001,2016-06-30 12:32:05,94,1
3,1001,2016-05-01 15:55:51,95,1
4,1001,2016-10-26 20:31:30,97,1
5,1001,2016-04-01 11:20:18,101,1
6,1001,2016-07-30 03:26:30,101,1
7,1001,2016-03-02 15:29:32,103,2
8,1001,2016-09-26 23:28:25,104,2
9,1001,2016-02-01 19:27:27,108,2


Solution:

SELECT
       account_id,
       occurred_at,
       standard_qty,
       NTILE(4) OVER (PARTITION BY account_id ORDER BY standard_qty) AS standard_quartile
  FROM orders 
 ORDER BY account_id DESC

Task 2:

Use the NTILE functionality to divide the orders for each account into two levels in terms of the amount of gloss_qty in each order. Your resulting table should have the account_id, the occurred_at time for each order, the amount of gloss_qty paper purchased in that order, and one of two levels in a gloss_half column.

In [20]:
# Task 2: lower/upper half of each order's gloss_qty within its own account.
# The outer ORDER BY pins the row order; without it SQL leaves the order of the
# result undefined, so the 50 rows previewed here could change between runs.
# NOTE(review): the task text names this column gloss_half; the existing alias
# is kept so the column header of previously rendered output stays valid.
pd.read_sql_query(sql='''
SELECT account_id,
       occurred_at,
       gloss_qty,
       NTILE(2) OVER (PARTITION BY account_id ORDER BY gloss_qty) AS half_gloss_qty
FROM orders
ORDER BY account_id, gloss_qty;
''', con=conn)[:50]

Unnamed: 0,account_id,occurred_at,gloss_qty,half_gloss_qty
0,1001,2016-09-26 23:28:25,10,1
1,1001,2016-05-31 21:22:48,16,1
2,1001,2015-10-06 17:31:14,22,1
3,1001,2016-03-02 15:29:32,24,1
4,1001,2016-02-01 19:27:27,29,1
5,1001,2016-01-02 01:18:24,32,1
6,1001,2016-04-01 11:20:18,33,1
7,1001,2016-08-28 07:13:39,33,1
8,1001,2016-07-30 03:26:30,36,1
9,1001,2016-11-25 23:21:32,39,1


Solution:

SELECT
       account_id,
       occurred_at,
       gloss_qty,
       NTILE(2) OVER (PARTITION BY account_id ORDER BY gloss_qty) AS gloss_half
  FROM orders 
 ORDER BY account_id DESC

Task 3:

Use the NTILE functionality to divide the orders for each account into 100 levels in terms of the total_amt_usd of each order. Your resulting table should have the account_id, the occurred_at time for each order, the total_amt_usd of that order, and one of 100 levels in a total_percentile column.

In [22]:
# Task 3: NTILE(100) of each order's total_amt_usd within its own account.
# total_amt_usd is added as a secondary sort key: ordering by account_id alone
# leaves the order of rows inside one account undefined.
# NOTE(review): the task text names this column total_percentile; the existing
# alias is kept so the column header of previously rendered output stays valid.
pd.read_sql_query(sql='''
SELECT account_id,
       occurred_at,
       total_amt_usd,
       NTILE(100) OVER (PARTITION BY account_id ORDER BY total_amt_usd) AS percentile_total_amt_usd
FROM orders
ORDER BY account_id DESC, total_amt_usd;
''', con=conn)[:50]

Unnamed: 0,account_id,occurred_at,total_amt_usd,percentile_total_amt_usd
0,4501,2016-11-22 06:57:04,86.78,1
1,4501,2016-05-30 04:18:34,157.24,2
2,4501,2016-12-21 13:43:26,628.74,3
3,4501,2016-06-29 03:57:11,875.54,4
4,4501,2016-07-29 20:06:39,974.17,5
5,4501,2016-10-24 08:50:37,1122.55,6
6,4501,2016-08-27 00:48:17,1175.47,7
7,4501,2016-09-25 01:44:03,1324.34,8
8,4501,2016-08-27 00:58:11,1449.74,9
9,4501,2016-11-22 06:52:22,1473.92,10


Solution:

SELECT
       account_id,
       occurred_at,
       total_amt_usd,
       NTILE(100) OVER (PARTITION BY account_id ORDER BY total_amt_usd) AS total_percentile
  FROM orders 
 ORDER BY account_id DESC

In [24]:
# Task 1 variant across all accounts: sum standard_qty per account, then place
# each account in one of 100 tiles by that sum. There is no PARTITION BY, so
# the tiles are computed over all accounts together.
# The outer ORDER BY makes the displayed row order explicit instead of relying
# on whatever order the engine happens to return.

pd.read_sql_query(sql='''
WITH account_totals AS (
    SELECT account_id,
           SUM(standard_qty) AS sum_standard_qty
    FROM orders
    GROUP BY account_id
)
SELECT account_id,
       sum_standard_qty,
       NTILE(100) OVER (ORDER BY sum_standard_qty) AS percentile_sum_standard_qty
FROM account_totals
ORDER BY sum_standard_qty;
''', con=conn)

Unnamed: 0,account_id,sum_standard_qty,percentile_sum_standard_qty
0,1901,0,1
1,3371,79,1
2,1961,102,1
3,3401,116,1
4,3741,117,2
...,...,...,...
345,1281,21943,99
346,1181,22099,99
347,3491,22936,100
348,2631,23321,100
