In [51]:
from google.datalab import Context
import google.datalab.bigquery as bq
import google.datalab.storage as storage
import pandas as pd
from StringIO import StringIO

## Build Some Intermediary Tables

Pull some additional information for each user that we'll need later
Pull some additional information on when each user purchases each product

In [27]:
%%bq query -n users_q
SELECT orders.user_id, COALESCE(user_flags.is_train,0) AS is_train,
  COUNT(*) AS num_orders, SUM(days_since_prior_order) AS days_bw_first_last_order
FROM instacart.orders AS orders
LEFT JOIN (
  SELECT user_id, 1 AS `is_train` FROM instacart.orders WHERE eval_set = "train" GROUP BY 1
) AS user_flags ON orders.user_id = user_flags.user_id
WHERE eval_set = "prior"
GROUP BY 1,2

In [28]:
%%bq execute -q users_q -t instacart.users  -m overwrite

user_id,is_train,num_orders,days_bw_first_last_order
36904,1,3,0.0
62180,1,3,0.0
109010,1,3,0.0
174627,1,3,0.0
58934,0,3,0.0
99295,0,3,0.0
80567,1,3,0.0
201321,1,3,0.0
125717,0,3,0.0
9515,0,3,0.0


In [29]:
%%bq query -n user_products_q
SELECT orders.user_id, op.product_id, 
  COUNT(*) AS num_orders, SUM(op.reordered) AS num_reorders,
  MIN(orders.order_number) AS first_order_number, MIN(days_since_first_order) AS first_order_day,
  MAX(orders.order_number) AS last_order_number, MAX(days_since_first_order) AS last_order_day
FROM instacart.order_products__prior AS op
INNER JOIN (
  SELECT *, 
  SUM(COALESCE(days_since_prior_order,0)) OVER (PARTITION BY user_id ORDER BY order_number ASC ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS `days_since_first_order`
  FROM instacart.orders WHERE eval_set = "prior"
) AS orders ON orders.order_id = op.order_id
GROUP BY 1,2

In [30]:
%%bq execute -q user_products_q -t instacart.user_products  -m overwrite

user_id,product_id,num_orders,num_reorders,first_order_number,first_order_day,last_order_number,last_order_day
182558,25138,32,31,22,73.0,99,337.0
82232,35470,85,84,1,0.0,97,277.0
135425,18140,28,27,15,103.0,69,327.0
194587,26209,46,45,1,0.0,85,350.0
183515,47449,37,36,1,0.0,58,168.0
171492,5212,14,13,1,0.0,41,346.0
15980,23296,14,13,9,57.0,46,247.0
22373,19871,42,41,1,0.0,60,357.0
12709,36956,20,19,1,0.0,40,307.0
48364,32331,11,10,1,0.0,43,214.0


## User-Product Features

User-product level data which is base for the reorder model (includes both test and train)

Contains "basket" of all products that user has ever purchased; trying to predict which will be reordered (though this is admittedly only ~2/3 of the problem since ~1/3 of purchases in the train set are to products never purchased before)

In [31]:
%%bq query -n user_products_features_q
WITH up_features AS (
  SELECT up.user_id, up.product_id,
    up.num_orders / users.num_orders AS `perc_all_orders`,
    up.num_reorders / NULLIF(users.num_orders - up.first_order_number,0) AS `perc_reorder`,
    up.num_orders / NULLIF(users.days_bw_first_last_order,0) AS `orders_per_day`,
    up.num_reorders / NULLIF(users.days_bw_first_last_order - up.first_order_day,0) AS `reorders_per_day`,
    up.first_order_number, up.first_order_day, up.last_order_number, up.last_order_day, users.days_bw_first_last_order
  FROM instacart.user_products AS up
  INNER JOIN instacart.users AS users ON up.user_id = users.user_id
)

SELECT up.*, 
  orders.order_id, orders.eval_set, orders.order_hour_of_day, orders.order_dow,
  CASE WHEN orders.eval_set = "test" THEN NULL ELSE LEAST(COALESCE(op_train.order_id,0),1) END AS `is_ordered`,
  up.days_bw_first_last_order - up.last_order_day + orders.days_since_prior_order AS `days_since_last_order`,
  orders.order_number - up.last_order_number + 1 AS `orders_since_last_order`
FROM up_features AS up
INNER JOIN instacart.orders AS orders ON orders.user_id = up.user_id AND orders.eval_set IN ('train','test')
LEFT JOIN instacart.order_products__train AS op_train ON orders.order_id = op_train.order_id AND up.product_id = op_train.product_id

In [32]:
%%bq execute -q user_products_features_q -t instacart.user_products_features  -m overwrite

user_id,product_id,perc_all_orders,perc_reorder,orders_per_day,reorders_per_day,first_order_number,first_order_day,last_order_number,last_order_day,days_bw_first_last_order,order_id,eval_set,order_hour_of_day,order_dow,is_ordered,days_since_last_order,orders_since_last_order
179078,4957,0.333333333333,0.0,,,1,0.0,1,0.0,0.0,2930952,train,10,6,0.0,1.0,4
74660,2452,0.2,0.0,,,1,0.0,1,0.0,0.0,2011288,train,15,5,0.0,30.0,6
9515,40706,1.0,1.0,,,1,0.0,3,0.0,0.0,318189,test,12,1,,13.0,2
121915,19488,1.0,1.0,,,1,0.0,3,0.0,0.0,1686562,train,13,6,0.0,27.0,2
133075,41540,0.333333333333,0.0,,,1,0.0,1,0.0,0.0,765224,train,9,4,0.0,30.0,4
125717,5479,1.0,1.0,,,1,0.0,3,0.0,0.0,3409434,test,15,0,,30.0,2
151890,49544,0.666666666667,0.5,2.0,1.0,1,0.0,3,1.0,1.0,532469,train,15,3,0.0,1.0,2
176503,1376,1.0,1.0,2.0,1.5,1,0.0,4,2.0,2.0,3274120,test,12,5,,4.0,2
40502,8637,1.0,1.0,3.0,2.0,1,0.0,3,1.0,1.0,2980949,train,13,2,0.0,30.0,2
65616,21463,0.75,0.666666666667,3.0,2.0,1,0.0,3,0.0,1.0,2502378,test,15,1,,14.0,3


## User Features

Some additional user-level features that we'll join in later

In [33]:
%%bq query -n user_features_q
SELECT users.user_id,
  ANY_VALUE(users.num_orders) AS num_orders, 
  ANY_VALUE(users.days_bw_first_last_order) / ANY_VALUE(users.num_orders) AS avg_days_bw_orders,
  COUNT(DISTINCT up.product_id) AS num_products,
  COUNT(DISTINCT products.aisle_id) AS num_aisles,
  COUNT(DISTINCT products.department_id) AS num_departments
FROM instacart.users AS users
INNER JOIN instacart.user_products AS up ON users.user_id = up.user_id
INNER JOIN instacart.products AS products ON up.product_id = products.product_id
GROUP BY 1

In [34]:
%%bq execute -q user_features_q -t instacart.user_features  -m overwrite

user_id,num_orders,avg_days_bw_orders,num_products,num_aisles,num_departments
99295,3,0.0,1,1,1
109010,3,0.0,1,1,1
181478,3,0.0,3,2,1
164320,3,0.0,2,2,1
62180,3,0.0,2,1,1
15495,3,0.0,1,1,1
202329,3,2.0,4,3,1
50492,4,2.0,2,2,1
49581,3,2.0,1,1,1
172259,5,2.0,7,2,1


## Product Features

Some additional product-level features that we'll join in later

In [35]:
%%bq query -n product_features_q
WITH up_features AS (
  SELECT up.user_id, up.product_id,
    up.num_orders, users.num_orders AS `num_orders_user`,
    up.num_reorders, users.num_orders - up.first_order_number AS `num_reorders_user`,
    users.days_bw_first_last_order AS `order_days`, users.days_bw_first_last_order - up.first_order_day AS `reorder_days`,
    up.first_order_number, up.first_order_day
  FROM instacart.user_products AS up
  INNER JOIN instacart.users AS users ON up.user_id = users.user_id
)

SELECT up.product_id, products.aisle_id, products.department_id,
  COUNT(DISTINCT user_id) / ANY_VALUE(num_users_total) AS `perc_users`,
  SUM(num_orders) / SUM(num_orders_user) AS `perc_all_orders`,
  SUM(num_reorders) / NULLIF(SUM(num_reorders_user),0) AS `perc_reorders`,
  SUM(num_orders) / NULLIF(SUM(order_days),0) AS `orders_per_day`,
  SUM(num_reorders) / NULLIF(SUM(reorder_days),0) AS `reorders_per_day`,
  AVG(first_order_number) AS `avg_first_order_number`,
  AVG(first_order_day) AS `avg_first_order_day`
FROM up_features AS up
INNER JOIN instacart.products AS products ON up.product_id = products.product_id
INNER JOIN (
  SELECT COUNT(DISTINCT user_id) AS num_users_total
  FROM instacart.user_products
) AS x ON 1=1
GROUP BY 1,2,3

In [36]:
%%bq execute -q product_features_q -t instacart.product_features  -m overwrite

product_id,aisle_id,department_id,perc_users,perc_all_orders,perc_reorders,orders_per_day,reorders_per_day,avg_first_order_number,avg_first_order_day
17574,1,20,0.000315214175909,0.077811550152,0.0641547861507,0.00918220946915,0.00736842105263,10.2,82.9230769231
43221,1,20,0.00332672191805,0.116397910516,0.128388554217,0.0137864247429,0.014635193133,10.1865889213,80.8994169096
16178,1,20,0.000635277800678,0.0664581704457,0.044992743106,0.00822182814767,0.00557303370787,8.25190839695,66.9083969466
47979,1,20,0.000223074647566,0.106893106893,0.104810996564,0.0114646951677,0.0111517367459,9.10869565217,83.9782608696
37258,1,20,0.00035885921565,0.0591715976331,0.0328767123288,0.00683781935724,0.00342661336379,10.3243243243,75.4189189189
21384,1,20,0.00105233040265,0.123502126015,0.125074096028,0.014794748906,0.0140694805628,8.29493087558,60.8156682028
48474,1,20,0.000257020789587,0.0770533446232,0.0524861878453,0.00868983957219,0.00606157281863,8.62264150943,79.3018867925
33399,1,20,0.00121236221503,0.0706787963611,0.0648689900789,0.00854455009983,0.00768906042697,12.856,103.752
22281,1,20,0.0109500555262,0.0683343104684,0.0600092893637,0.00836089746144,0.00723244974045,12.8914968999,103.58768822
3991,1,20,0.000547987721195,0.0787401574803,0.0625498007968,0.0097911227154,0.00758857363817,8.13274336283,60.9469026549


## Time-of-Day Features

For each product/aisle/department, compute how much more likely to purchase at that time of day

In [37]:
%%bq query -n tod_features_q
WITH temp AS (
  SELECT orders.order_hour_of_day, orders.order_dow, op.product_id, products.aisle_id, products.department_id, COUNT(*) AS `n_product`
  FROM instacart.order_products__prior AS op
  INNER JOIN instacart.orders AS orders ON orders.order_id = op.order_id
  INNER JOIN instacart.products AS products ON op.product_id = products.product_id
  GROUP BY 1,2,3,4,5
)
SELECT order_hour_of_day, order_dow, product_id,
  n_product / n_product_total / (n_hod_dow / n_total) AS `product_tod_factor`,
  n_aisle / n_aisle_total / (n_hod_dow / n_total) AS `aisle_tod_factor`,
  n_department / n_department_total / (n_hod_dow / n_total) AS `department_tod_factor`
FROM (
  SELECT order_hour_of_day, order_dow, product_id, aisle_id, department_id, n_product,
  SUM(n_product) OVER (PARTITION BY product_id) AS `n_product_total`,
  SUM(n_product) OVER (PARTITION BY order_hour_of_day, order_dow, aisle_id) AS `n_aisle`,
  SUM(n_product) OVER (PARTITION BY aisle_id) AS `n_aisle_total`,
  SUM(n_product) OVER (PARTITION BY order_hour_of_day, order_dow, department_id) AS `n_department`,
  SUM(n_product) OVER (PARTITION BY department_id) AS `n_department_total`,
  SUM(n_product) OVER (PARTITION BY order_hour_of_day, order_dow) AS `n_hod_dow`,
  SUM(n_product) OVER () AS `n_total`
  FROM temp  
) AS x

In [38]:
%%bq execute -q tod_features_q -t instacart.tod_features  -m overwrite

order_hour_of_day,order_dow,product_id,product_tod_factor,aisle_tod_factor,department_tod_factor
4,4,25746,13.4241656171,0.873460433845,1.07660179665
4,4,45002,1.9444008302,1.39329873558,1.07660179665
4,4,8224,3.74203977666,1.06827165371,1.00012144143
4,4,11408,1.57851543139,0.757082863138,1.00012144143
4,4,26497,0.911396517922,0.927954475076,0.948444336493
4,4,12858,2.57702629284,0.76991304516,0.919954343583
4,4,12206,1.19837160772,1.02973905232,1.05418085413
4,4,12121,5.19405077647,0.76991304516,0.919954343583
4,4,14819,6.75076916481,0.895686628409,0.900240407055
4,4,20396,16.3241595832,1.75007986447,1.07660179665


## Model Universe

Assemble model universe with all features

In [39]:
%%bq query -n reorder_model_universe_q
SELECT
upf.user_id,
upf.product_id,
pf.aisle_id,
pf.department_id,
upf.order_id,
upf.eval_set,
upf.order_hour_of_day,
upf.order_dow,
upf.is_ordered,
upf.perc_all_orders AS `upf_perc_all_orders`,
upf.perc_reorder AS `upf_perc_reorder`,
upf.orders_per_day AS `upf_orders_per_day`,
upf.reorders_per_day AS `upf_reorders_per_day`,
upf.first_order_number AS `upf_first_order_number`,
upf.first_order_day AS `upf_first_order_day`,
upf.last_order_number AS `upf_last_order_number`,
upf.last_order_day AS `upf_last_order_day`,
upf.days_since_last_order AS `upf_days_since_last_order`,
upf.orders_since_last_order AS `upf_orders_since_last_order`,
uf.num_orders AS `uf_num_orders`,
uf.avg_days_bw_orders AS `uf_avg_days_bw_orders`,
uf.num_products AS `uf_num_products`,
uf.num_aisles AS `uf_num_aisles`,
uf.num_departments AS `uf_num_departments`,
pf.perc_users AS `pf_perc_users`,
pf.perc_all_orders AS `pf_perc_all_orders`,
pf.perc_reorders AS `pf_perc_reorders`,
pf.orders_per_day AS `pf_orders_per_day`,
pf.reorders_per_day AS `pf_reorders_per_day`,
pf.avg_first_order_number AS `pf_avg_first_order_number`,
pf.avg_first_order_day AS `pf_avg_first_order_day`,
tf.product_tod_factor AS `tf_product_tod_factor`,
tf.aisle_tod_factor AS `tf_aisle_tod_factor`,
tf.department_tod_factor AS `tf_department_tod_factor`
FROM instacart.user_products_features AS upf
INNER JOIN instacart.user_features AS uf ON upf.user_id = uf.user_id
INNER JOIN instacart.product_features AS pf ON upf.product_id = pf.product_id
INNER JOIN instacart.tod_features AS tf ON upf.product_id = tf.product_id AND upf.order_hour_of_day = tf.order_hour_of_day AND upf.order_dow = tf.order_dow

In [40]:
%%bq execute -q reorder_model_universe_q -t instacart.reorder_model_universe  -m overwrite

user_id,product_id,aisle_id,department_id,order_id,eval_set,order_hour_of_day,order_dow,is_ordered,upf_perc_all_orders,upf_perc_reorder,upf_orders_per_day,upf_reorders_per_day,upf_first_order_number,upf_first_order_day,upf_last_order_number,upf_last_order_day,upf_days_since_last_order,upf_orders_since_last_order,uf_num_orders,uf_avg_days_bw_orders,uf_num_products,uf_num_aisles,uf_num_departments,pf_perc_users,pf_perc_all_orders,pf_perc_reorders,pf_orders_per_day,pf_reorders_per_day,pf_avg_first_order_number,pf_avg_first_order_day,tf_product_tod_factor,tf_aisle_tod_factor,tf_department_tod_factor
73866,25595,1,20,1524478,test,10,6,,0.078431372549,0.0769230769231,0.0112044817927,0.0110294117647,12,85.0,17,119.0,246.0,36,51,7.0,64,27,13,9.69889772027e-05,0.0697674418605,0.0509554140127,0.00849858356941,0.00635677393723,10.1,85.95,2.48146533851,0.950110602581,1.00783586089
83068,25595,1,20,1567604,train,15,2,0.0,0.010101010101,0.0,0.0047619047619,0.0,30,132.0,30,132.0,79.0,71,99,2.12121212121,198,69,17,9.69889772027e-05,0.0697674418605,0.0509554140127,0.00849858356941,0.00635677393723,10.1,85.95,2.69690583073,1.05689552826,1.00399646353
5720,17574,1,20,2237993,train,18,2,0.0,0.25,0.0,0.0434782608696,0.0,2,7.0,2,7.0,31.0,4,4,5.75,50,30,13,0.000315214175909,0.077811550152,0.0641547861507,0.00918220946915,0.00736842105263,10.2,82.9230769231,2.23426278568,1.04767566505,1.04464769471
69473,17574,1,20,1264415,train,13,3,0.0,0.166666666667,0.0,0.0102040816327,0.0,1,0.0,1,0.0,128.0,7,6,16.3333333333,67,31,12,0.000315214175909,0.077811550152,0.0641547861507,0.00918220946915,0.00736842105263,10.2,82.9230769231,0.837011813265,1.00839861654,0.985717006163
57320,17574,1,20,827852,train,18,5,0.0,0.0833333333333,0.0,0.00606060606061,0.0,6,59.0,6,59.0,136.0,8,12,13.75,36,26,12,0.000315214175909,0.077811550152,0.0641547861507,0.00918220946915,0.00736842105263,10.2,82.9230769231,2.32746352638,1.00854164151,1.0202104287
135165,17574,1,20,2464677,test,8,6,,0.0227272727273,,0.00291545189504,,44,343.0,44,343.0,6.0,2,44,7.79545454545,67,32,15,0.000315214175909,0.077811550152,0.0641547861507,0.00918220946915,0.00736842105263,10.2,82.9230769231,1.05862878795,0.823260203019,0.95101359362
28858,17574,1,20,2011094,test,15,4,,0.0217391304348,0.0,0.0045871559633,0.0,24,115.0,24,115.0,110.0,24,46,4.73913043478,51,30,13,0.000315214175909,0.077811550152,0.0641547861507,0.00918220946915,0.00736842105263,10.2,82.9230769231,0.846431299646,1.02878447809,0.981969133999
185399,45757,1,20,258619,train,10,3,0.0,0.2,0.0,0.00952380952381,0.0,1,0.0,1,0.0,121.0,6,5,21.0,40,17,9,0.00266234742422,0.0527738598966,0.0315608609152,0.00690758603714,0.00398083723052,10.8524590164,77.1074681239,0.576109111438,1.13498492897,0.953049400171
10907,45757,1,20,2816210,train,19,3,0.0,0.25,0.0,0.0135135135135,0.0,1,0.0,1,0.0,104.0,5,4,18.5,26,19,11,0.00266234742422,0.0527738598966,0.0315608609152,0.00690758603714,0.00398083723052,10.8524590164,77.1074681239,1.09889815725,1.05639545075,1.0278797383
34890,45757,1,20,1962337,test,17,5,,0.333333333333,0.0,0.0166666666667,0.0,2,30.0,2,30.0,60.0,3,3,20.0,34,20,10,0.00266234742422,0.0527738598966,0.0315608609152,0.00690758603714,0.00398083723052,10.8524590164,77.1074681239,1.4679313629,1.09127125654,1.02788818696


Need to shard when exporting from BQ; max export size out of BQ is 1GB

In [55]:
project_id = Context.default().project_id
bucket_path = 'gs://' + project_id
bucket_object = bucket_path + '/reorder_model/universe_*.csv'
bucket_object2 = bucket_path + '/reorder_model/universe.csv'
table = bq.Table('instacart.reorder_model_universe')
table.extract(destination = bucket_object, csv_header=False)

Job kaggle-instacart-172517/job_lhQgVrDvJvZWnE21ImacEc5_tM8 completed

Concatenate the shards together

In [56]:
%%bash -s "$bucket_object" "$bucket_object2"
gsutil compose $1 $2

Composing gs://kaggle-instacart-172517/reorder_model/universe.csv from 5 component object(s).


Import as Pandas DF

In [None]:
fields = [str(x.name) for x in list(table.schema)]
%storage read --object $bucket_object2 --variable model_universe_raw
model_universe = pd.read_csv(StringIO(model_universe_raw), header=None, names=fields)
print model_universe.shape