In [4]:
from google.datalab import Context
import google.datalab.bigquery as bq
import google.datalab.storage as storage
import pandas as pd
try:
  from StringIO import StringIO
except ImportError:
  from io import BytesIO as StringIO

## Build Some Intermediary Tables

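- `instacart.users`: pull some additional information for each user that we'll need later (train flag, number of prior orders, days between first and last prior order)
- `instacart.user_products`: pull some additional information on when each user purchased each product (order/reorder counts, first and last order number and day)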

In [45]:
%%bq query -n users_q
-- One row per user that has prior orders: a train/test flag, the number of
-- prior orders, and the summed gaps between those orders.
WITH train_users AS (
  -- Users that have an order in the "train" evaluation set.
  SELECT DISTINCT user_id, 1 AS is_train
  FROM instacart.orders
  WHERE eval_set = "train"
)
SELECT
  prior_orders.user_id,
  COALESCE(train_users.is_train, 0) AS is_train,  -- 0 when the user has no train order
  COUNT(*) AS num_orders,
  -- SUM skips NULL gaps, so an order without a prior gap contributes nothing.
  SUM(prior_orders.days_since_prior_order) AS days_bw_first_last_order
FROM instacart.orders AS prior_orders
LEFT JOIN train_users ON prior_orders.user_id = train_users.user_id
WHERE prior_orders.eval_set = "prior"
GROUP BY 1, 2

In [46]:
%%bq execute --query users_q --table instacart.users --mode overwrite

user_id,is_train,num_orders,days_bw_first_last_order
201321,1,3,0.0
125717,0,3,0.0
80567,1,3,0.0
36904,1,3,0.0
164320,1,3,0.0
174627,1,3,0.0
181478,0,3,0.0
179078,1,3,0.0
137150,0,3,0.0
109010,1,3,0.0


In [47]:
%%bq query -n user_products_q
-- One row per (user, product): how often the product was ordered/reordered and
-- the order number / elapsed day of its first and last prior purchase.
WITH prior_orders AS (
  SELECT
    order_id,
    user_id,
    order_number,
    -- Running total of the gaps between orders; a NULL gap counts as 0 days.
    SUM(COALESCE(days_since_prior_order, 0)) OVER (
      PARTITION BY user_id
      ORDER BY order_number ASC
      ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW
    ) AS days_since_first_order
  FROM instacart.orders
  WHERE eval_set = "prior"
)
SELECT
  po.user_id,
  op.product_id,
  COUNT(*) AS num_orders,
  SUM(op.reordered) AS num_reorders,
  MIN(po.order_number) AS first_order_number,
  MIN(po.days_since_first_order) AS first_order_day,
  MAX(po.order_number) AS last_order_number,
  MAX(po.days_since_first_order) AS last_order_day
FROM instacart.order_products__prior AS op
INNER JOIN prior_orders AS po ON po.order_id = op.order_id
GROUP BY 1, 2

In [48]:
%%bq execute --query user_products_q --table instacart.user_products --mode overwrite

user_id,product_id,num_orders,num_reorders,first_order_number,first_order_day,last_order_number,last_order_day
66536,21616,11,10,13,75.0,73,357.0
133815,16262,14,13,45,195.0,74,353.0
35933,11481,36,35,9,36.0,84,352.0
86294,4605,12,11,8,17.0,46,166.0
136558,13032,25,24,6,7.0,97,133.0
146147,260,18,17,23,45.0,66,141.0
22614,49235,47,46,1,0.0,47,349.0
195399,13176,25,24,1,0.0,54,352.0
111451,36338,11,10,8,54.0,47,353.0
69800,13740,16,15,2,4.0,54,316.0


## User-Product Features

User-product level data which is the base for the reorder model (includes both test and train)

Contains the "basket" of all products that each user has ever purchased; we are trying to predict which of them will be reordered (though this is admittedly only ~2/3 of the problem, since ~1/3 of purchases in the train set are of products never purchased before)

In [49]:
%%bq query -n user_products_features_q
-- One row per (user, previously purchased product) for the user's train/test
-- order, carrying the user-product ratios plus the label and recency features.
WITH up_features AS (
  SELECT
    up.user_id,
    up.product_id,
    up.num_orders / users.num_orders AS perc_all_orders,
    -- NULLIF guards the divisions below against a zero denominator.
    up.num_reorders / NULLIF(users.num_orders - up.first_order_number, 0) AS perc_reorder,
    up.num_orders / NULLIF(users.days_bw_first_last_order, 0) AS orders_per_day,
    up.num_reorders / NULLIF(users.days_bw_first_last_order - up.first_order_day, 0) AS reorders_per_day,
    up.first_order_number,
    up.first_order_day,
    up.last_order_number,
    up.last_order_day,
    users.days_bw_first_last_order
  FROM instacart.user_products AS up
  INNER JOIN instacart.users AS users ON up.user_id = users.user_id
)

SELECT
  feat.*,
  target.order_id,
  target.eval_set,
  target.order_hour_of_day,
  target.order_dow,
  -- Label: NULL for test orders; otherwise 0 when the product has no row in the
  -- train order, else order_id capped at 1.
  IF(target.eval_set = "test", NULL, LEAST(COALESCE(op_train.order_id, 0), 1)) AS is_ordered,
  feat.days_bw_first_last_order - feat.last_order_day + target.days_since_prior_order AS days_since_last_order,
  target.order_number - feat.last_order_number + 1 AS orders_since_last_order
FROM up_features AS feat
INNER JOIN instacart.orders AS target
  ON target.user_id = feat.user_id
  AND target.eval_set IN ('train','test')
LEFT JOIN instacart.order_products__train AS op_train
  ON target.order_id = op_train.order_id
  AND feat.product_id = op_train.product_id

In [50]:
%%bq execute --query user_products_features_q --table instacart.user_products_features --mode overwrite

user_id,product_id,perc_all_orders,perc_reorder,orders_per_day,reorders_per_day,first_order_number,first_order_day,last_order_number,last_order_day,days_bw_first_last_order,order_id,eval_set,order_hour_of_day,order_dow,is_ordered,days_since_last_order,orders_since_last_order
74660,2452,0.2,0.0,,,1,0.0,1,0.0,0.0,2011288,train,15,5,0.0,30.0,6
174627,30407,1.0,1.0,,,1,0.0,3,0.0,0.0,3083298,train,9,3,0.0,30.0,2
137150,43154,0.666666666667,0.5,,,1,0.0,2,0.0,0.0,1183995,test,7,4,,1.0,3
131603,3390,0.333333333333,0.0,,,1,0.0,1,0.0,0.0,1773727,test,14,1,,4.0,4
74660,43122,0.6,0.5,,,1,0.0,3,0.0,0.0,2011288,train,15,5,0.0,30.0,4
121915,19488,1.0,1.0,,,1,0.0,3,0.0,0.0,1686562,train,13,6,0.0,27.0,2
9515,39794,1.0,1.0,,,1,0.0,3,0.0,0.0,318189,test,12,1,,13.0,2
65616,1360,0.5,0.333333333333,2.0,1.0,1,0.0,2,0.0,1.0,2502378,test,15,1,,14.0,4
196888,37766,1.0,1.0,3.0,2.0,1,0.0,3,1.0,1.0,2905964,train,8,6,0.0,30.0,2
347,27325,1.0,1.0,3.0,2.0,1,0.0,3,1.0,1.0,2770483,test,14,3,,28.0,2


## User Features

Some additional user-level features that we'll join in later

In [51]:
%%bq query -n user_features_q
-- One row per user: order count, average gap between orders, and how many
-- distinct products / aisles / departments the user has bought from.
SELECT
  u.user_id,
  -- instacart.users is read as one row per user, so ANY_VALUE just carries the
  -- user-level value through the GROUP BY.
  ANY_VALUE(u.num_orders) AS num_orders,
  ANY_VALUE(u.days_bw_first_last_order) / ANY_VALUE(u.num_orders) AS avg_days_bw_orders,
  COUNT(DISTINCT up.product_id) AS num_products,
  COUNT(DISTINCT p.aisle_id) AS num_aisles,
  COUNT(DISTINCT p.department_id) AS num_departments
FROM instacart.users AS u
INNER JOIN instacart.user_products AS up ON u.user_id = up.user_id
INNER JOIN instacart.products AS p ON up.product_id = p.product_id
GROUP BY u.user_id

In [52]:
%%bq execute --query user_features_q --table instacart.user_features --mode overwrite

user_id,num_orders,avg_days_bw_orders,num_products,num_aisles,num_departments
62180,3,0.0,2,1,1
164320,3,0.0,2,2,1
109010,3,0.0,1,1,1
99295,3,0.0,1,1,1
15495,3,0.0,1,1,1
181478,3,0.0,3,2,1
49581,3,2.0,1,1,1
172259,5,2.0,7,2,1
202329,3,2.0,4,3,1
113387,3,2.0,3,2,1


## Product Features

Some additional product-level features that we'll join in later

In [57]:
%%bq query -n product_features_q
-- One row per product: share of users and orders that include it, reorder
-- rates, and the average order number / day at which users first buy it.
WITH up_features AS (
  -- Per (user, product) counts next to the user-level denominators.
  SELECT
    up.user_id,
    up.product_id,
    up.num_orders,
    users.num_orders AS num_orders_user,
    up.num_reorders,
    users.num_orders - up.first_order_number AS num_reorders_user,
    users.days_bw_first_last_order AS order_days,
    users.days_bw_first_last_order - up.first_order_day AS reorder_days,
    up.first_order_number,
    up.first_order_day
  FROM instacart.user_products AS up
  INNER JOIN instacart.users AS users ON up.user_id = users.user_id
),
user_totals AS (
  -- Single-row table holding the number of distinct users overall.
  SELECT COUNT(DISTINCT user_id) AS num_users_total
  FROM instacart.user_products
)

SELECT
  upf.product_id,
  products.aisle_id,
  products.department_id,
  COUNT(DISTINCT upf.user_id) / ANY_VALUE(user_totals.num_users_total) AS perc_users,
  SUM(upf.num_orders) / SUM(upf.num_orders_user) AS perc_all_orders,
  SUM(upf.num_reorders) / NULLIF(SUM(upf.num_reorders_user), 0) AS perc_reorders,
  SUM(upf.num_orders) / NULLIF(SUM(upf.order_days), 0) AS orders_per_day,
  SUM(upf.num_reorders) / NULLIF(SUM(upf.reorder_days), 0) AS reorders_per_day,
  AVG(upf.first_order_number) AS avg_first_order_number,
  AVG(upf.first_order_day) AS avg_first_order_day
FROM up_features AS upf
INNER JOIN instacart.products AS products ON upf.product_id = products.product_id
CROSS JOIN user_totals
GROUP BY 1, 2, 3

In [58]:
%%bq execute --query product_features_q --table instacart.product_features --mode overwrite

product_id,aisle_id,department_id,perc_users,perc_all_orders,perc_reorders,orders_per_day,reorders_per_day,avg_first_order_number,avg_first_order_day
22853,1,20,0.000305515278189,0.0979653353429,0.09396914446,0.0103784129012,0.0091367789445,9.74603174603,82.4285714286
19101,1,20,0.0013626951297,0.0690438034188,0.0668176670442,0.0083604197998,0.0081005011327,14.0782918149,116.387900356
38694,1,20,0.00178459718053,0.13159646287,0.16868670098,0.0169280070452,0.0218386604475,12.2119565217,95.7472826087
10654,1,20,0.000387955908811,0.0764840182648,0.0584415584416,0.00778752833149,0.00606809753905,10.35,103.85
26870,1,20,0.000737116226741,0.0843142622502,0.0727951469902,0.00987559317686,0.00816925010473,9.93421052632,79.5526315789
8121,1,20,0.00070317008472,0.0611981962637,0.038503850385,0.00803631852019,0.00499696612771,7.04137931034,51.3586206897
43221,1,20,0.00332672191805,0.116397910516,0.128388554217,0.0137864247429,0.014635193133,10.1865889213,80.8994169096
16178,1,20,0.000635277800678,0.0664581704457,0.044992743106,0.00822182814767,0.00557303370787,8.25190839695,66.9083969466
47979,1,20,0.000223074647566,0.106893106893,0.104810996564,0.0114646951677,0.0111517367459,9.10869565217,83.9782608696
6778,1,20,0.000940793078867,0.0503076366269,0.0268370607029,0.00641528591868,0.00342075256556,12.3505154639,96.793814433


## Time-of-Day Features

For each product/aisle/department, compute how much more likely it is to be purchased at a given hour of day and day of week, relative to that time slot's share of all purchases

In [55]:
%%bq query -n tod_features_q
-- For every (hour of day, day of week, product) seen in prior orders, compare
-- the slot's share of the product's / aisle's / department's purchases with the
-- slot's share of all purchases.
WITH slot_counts AS (
  -- Number of purchased items per time slot and product.
  SELECT
    orders.order_hour_of_day,
    orders.order_dow,
    op.product_id,
    products.aisle_id,
    products.department_id,
    COUNT(*) AS n_product
  FROM instacart.order_products__prior AS op
  INNER JOIN instacart.orders AS orders ON orders.order_id = op.order_id
  INNER JOIN instacart.products AS products ON op.product_id = products.product_id
  GROUP BY 1, 2, 3, 4, 5
),
slot_totals AS (
  -- Attach the totals used as numerators and denominators below.
  SELECT
    order_hour_of_day,
    order_dow,
    product_id,
    n_product,
    SUM(n_product) OVER (PARTITION BY product_id) AS n_product_total,
    SUM(n_product) OVER (PARTITION BY order_hour_of_day, order_dow, aisle_id) AS n_aisle,
    SUM(n_product) OVER (PARTITION BY aisle_id) AS n_aisle_total,
    SUM(n_product) OVER (PARTITION BY order_hour_of_day, order_dow, department_id) AS n_department,
    SUM(n_product) OVER (PARTITION BY department_id) AS n_department_total,
    SUM(n_product) OVER (PARTITION BY order_hour_of_day, order_dow) AS n_hod_dow,
    SUM(n_product) OVER () AS n_total
  FROM slot_counts
)
SELECT
  order_hour_of_day,
  order_dow,
  product_id,
  n_product / n_product_total / (n_hod_dow / n_total) AS product_tod_factor,
  n_aisle / n_aisle_total / (n_hod_dow / n_total) AS aisle_tod_factor,
  n_department / n_department_total / (n_hod_dow / n_total) AS department_tod_factor
FROM slot_totals

In [56]:
%%bq execute --query tod_features_q --table instacart.tod_features --mode overwrite

order_hour_of_day,order_dow,product_id,product_tod_factor,aisle_tod_factor,department_tod_factor
4,4,31391,3.70944877306,1.39329873558,1.07660179665
4,4,42617,1.3283339383,0.797263755548,1.00012144143
4,4,39040,0.599722708701,0.904993350147,0.948444336493
4,4,14278,37.1828079395,1.39329873558,1.07660179665
4,4,17429,1.82367995344,0.895686628409,0.900240407055
4,4,2750,3.55195890855,1.0466515624,1.06000410609
4,4,3681,161.552889668,1.08096280693,1.06000410609
4,4,43885,80.7764448341,0.812646795801,0.870005868157
4,4,28535,0.658750534361,0.64656233732,1.07660179665
4,4,35166,2.82571399299,0.757082863138,1.00012144143


## Model Universe

Assemble model universe with all features

In [1]:
%%bq query -n model_universe_q
-- Assemble the model universe: one row per candidate (user, product) for the
-- user's train/test order, with user-product (upf_), user (uf_), product (pf_)
-- and time-of-day (tf_) features side by side.
SELECT
  upf.user_id,
  upf.product_id,
  pf.aisle_id,
  pf.department_id,
  upf.order_id,
  upf.eval_set,
  upf.order_hour_of_day,
  upf.order_dow,
  upf.is_ordered,
  upf.perc_all_orders AS `upf_perc_all_orders`,
  upf.perc_reorder AS `upf_perc_reorder`,
  upf.orders_per_day AS `upf_orders_per_day`,
  upf.reorders_per_day AS `upf_reorders_per_day`,
  upf.first_order_number AS `upf_first_order_number`,
  upf.first_order_day AS `upf_first_order_day`,
  upf.last_order_number AS `upf_last_order_number`,
  upf.last_order_day AS `upf_last_order_day`,
  upf.days_since_last_order AS `upf_days_since_last_order`,
  upf.orders_since_last_order AS `upf_orders_since_last_order`,
  uf.num_orders AS `uf_num_orders`,
  uf.avg_days_bw_orders AS `uf_avg_days_bw_orders`,
  uf.num_products AS `uf_num_products`,
  uf.num_aisles AS `uf_num_aisles`,
  uf.num_departments AS `uf_num_departments`,
  pf.perc_users AS `pf_perc_users`,
  pf.perc_all_orders AS `pf_perc_all_orders`,
  pf.perc_reorders AS `pf_perc_reorders`,
  pf.orders_per_day AS `pf_orders_per_day`,
  pf.reorders_per_day AS `pf_reorders_per_day`,
  pf.avg_first_order_number AS `pf_avg_first_order_number`,
  pf.avg_first_order_day AS `pf_avg_first_order_day`,
  tf.product_tod_factor AS `tf_product_tod_factor`,
  tf.aisle_tod_factor AS `tf_aisle_tod_factor`,
  tf.department_tod_factor AS `tf_department_tod_factor`
FROM instacart.user_products_features AS upf
INNER JOIN instacart.user_features AS uf ON upf.user_id = uf.user_id
INNER JOIN instacart.product_features AS pf ON upf.product_id = pf.product_id
-- LEFT (not INNER) JOIN: tod_features only has a row for an (hour, day-of-week,
-- product) combination that occurs in prior orders. An inner join would silently
-- drop every candidate whose product was never bought in the target order's time
-- slot, for train and test alike. Such rows are kept with NULL tf_* factors.
LEFT JOIN instacart.tod_features AS tf
  ON upf.product_id = tf.product_id
  AND upf.order_hour_of_day = tf.order_hour_of_day
  AND upf.order_dow = tf.order_dow

In [2]:
%%bq execute --query model_universe_q --table instacart.model_universe --mode overwrite

user_id,product_id,aisle_id,department_id,order_id,eval_set,order_hour_of_day,order_dow,is_ordered,upf_perc_all_orders,upf_perc_reorder,upf_orders_per_day,upf_reorders_per_day,upf_first_order_number,upf_first_order_day,upf_last_order_number,upf_last_order_day,upf_days_since_last_order,upf_orders_since_last_order,uf_num_orders,uf_avg_days_bw_orders,uf_num_products,uf_num_aisles,uf_num_departments,pf_perc_users,pf_perc_all_orders,pf_perc_reorders,pf_orders_per_day,pf_reorders_per_day,pf_avg_first_order_number,pf_avg_first_order_day,tf_product_tod_factor,tf_aisle_tod_factor,tf_department_tod_factor
105127,32170,1,20,1901965,test,15,6,,0.166666666667,0.153846153846,0.0110701107011,0.00826446280992,5,29.0,16,243.0,51.0,4,18,15.0555555556,57,24,14,0.0007177184313,0.129846708747,0.160902255639,0.0163306966063,0.0200665760233,12.0,94.2027027027,1.78417893801,0.983471339869,1.01310571055
168310,32170,1,20,1481482,test,10,6,,0.0625,0.0,0.00970873786408,0.0,5,48.0,5,48.0,61.0,13,16,6.4375,91,38,15,0.0007177184313,0.129846708747,0.160902255639,0.0163306966063,0.0200665760233,12.0,94.2027027027,1.39582425291,0.950110602581,1.00783586089
48276,32170,1,20,2930937,train,7,5,0.0,0.0769230769231,0.0454545454545,0.00625,0.00361010830325,4,43.0,10,121.0,203.0,18,26,12.3076923077,149,55,15,0.0007177184313,0.129846708747,0.160902255639,0.0163306966063,0.0200665760233,12.0,94.2027027027,0.436951010601,0.843284833271,0.919589371376
66227,32170,1,20,1635418,train,12,2,1.0,0.265306122449,0.25,0.0517928286853,0.0478087649402,1,0.0,47,229.0,25.0,4,49,5.12244897959,128,49,17,0.0007177184313,0.129846708747,0.160902255639,0.0163306966063,0.0200665760233,12.0,94.2027027027,1.19654043771,1.0362146276,1.00186273713
60334,32170,1,20,2009178,train,19,3,1.0,1.0,1.0,0.190476190476,0.142857142857,1,0.0,4,21.0,30.0,2,4,5.25,21,10,7,0.0007177184313,0.129846708747,0.160902255639,0.0163306966063,0.0200665760233,12.0,94.2027027027,1.02792765126,1.05639545075,1.0278797383
117887,32170,1,20,1150564,train,18,1,0.0,0.2,0.0,0.0103092783505,0.0,4,67.0,4,67.0,52.0,3,5,19.4,37,23,10,0.0007177184313,0.129846708747,0.160902255639,0.0163306966063,0.0200665760233,12.0,94.2027027027,0.651114216809,1.07237229702,1.08395058502
135736,32170,1,20,2937549,train,18,6,0.0,0.25,,0.0153846153846,,4,65.0,4,65.0,30.0,2,4,16.25,35,16,8,0.0007177184313,0.129846708747,0.160902255639,0.0163306966063,0.0200665760233,12.0,94.2027027027,1.23172158648,0.974527269222,0.987221016087
34756,32170,1,20,996942,train,18,0,0.0,0.0227272727273,0.0,0.00278551532033,0.0,36,317.0,36,317.0,47.0,10,44,8.15909090909,133,52,15,0.0007177184313,0.129846708747,0.160902255639,0.0163306966063,0.0200665760233,12.0,94.2027027027,0.791644577885,1.08405383638,1.07094759336
180074,32608,1,20,3326163,test,13,5,,0.0298507462687,0.025641025641,0.00684931506849,0.00757575757576,28,160.0,58,258.0,36.0,11,67,4.35820895522,204,65,15,0.000101838426063,0.103238866397,0.123966942149,0.0108119567522,0.0137741046832,12.0,120.904761905,1.89012504101,1.07348317832,0.990327574712
71352,32170,1,20,1341192,train,12,2,0.0,0.1,0.0,0.0049504950495,0.0,7,147.0,7,147.0,64.0,5,10,20.2,29,17,9,0.0007177184313,0.129846708747,0.160902255639,0.0163306966063,0.0200665760233,12.0,94.2027027027,1.19654043771,1.0362146276,1.00186273713


In [None]:
# Export the assembled model universe table to Cloud Storage as CSV.
# The destination is built from the default Datalab context's project id.
project_id = Context.default().project_id
bucket_path = ''.join(('gs://', project_id))
# NOTE(review): the `*` in the object name is presumably there so BigQuery can
# split the export across several files -- confirm against the export docs.
bucket_object = '/'.join((bucket_path, 'model_universe_*.csv'))

table = bq.Table('instacart.model_universe')
table.extract(destination=bucket_object)