Install some packages before we get started

In [2]:
#%%bash
## Datalab setup, kept commented out. Each echo appends (>>) one command to startup.sh,
## so re-running this cell would append the same lines again.
#echo "apt-get update" >> /content/datalab/.config/startup.sh
#echo "pip install lightgbm" >> /content/datalab/.config/startup.sh
## NOTE(review): the install below has no -y flag; confirm startup.sh runs non-interactively.
#echo "apt-get install libgomp1" >> /content/datalab/.config/startup.sh
#cat /content/datalab/.config/startup.sh

In [3]:
from google.datalab import Context
import google.datalab.bigquery as bq
import google.datalab.storage as storage
import numpy as np
import pandas as pd
import json
import lightgbm as lgb
from sklearn.metrics import f1_score
from StringIO import StringIO

## Build Some Intermediary Tables

- Pull some additional information for each user that we'll need later.
- Pull some additional information on when each user purchased each product.

In [4]:
%%bq query -n users_q
-- One row per user: train flag, number of prior orders, and the sum of
-- days_since_prior_order over those prior orders.
WITH train_users AS (
  -- Users that have at least one order in the "train" eval set.
  SELECT DISTINCT user_id
  FROM instacart.orders
  WHERE eval_set = "train"
)
SELECT
  orders.user_id,
  IF(train_users.user_id IS NULL, 0, 1) AS is_train,
  COUNT(*) AS num_orders,
  SUM(orders.days_since_prior_order) AS days_bw_first_last_order
FROM instacart.orders AS orders
LEFT JOIN train_users ON orders.user_id = train_users.user_id
WHERE orders.eval_set = "prior"
GROUP BY 1, 2

In [5]:
%%bq execute -q users_q -t instacart.users -m overwrite

user_id,is_train,num_orders,days_bw_first_last_order
15495,1,3,0.0
80567,1,3,0.0
99295,0,3,0.0
36904,1,3,0.0
164320,1,3,0.0
58934,0,3,0.0
131603,0,3,0.0
174627,1,3,0.0
62180,1,3,0.0
179078,1,3,0.0


In [6]:
%%bq query -n user_products_q
-- One row per (user, product) pair found in prior orders.
--   num_orders / num_reorders : number of prior order lines for the pair / sum of their reordered flags
--   first_* / last_*          : order number and day offset of the first and last purchase
--   avg_cart_order            : mean add_to_cart_order of the product for this user
SELECT orders.user_id, op.product_id, 
  COUNT(*) AS num_orders, SUM(op.reordered) AS num_reorders,
  MIN(orders.order_number) AS first_order_number, MIN(days_since_first_order) AS first_order_day,
  MAX(orders.order_number) AS last_order_number, MAX(days_since_first_order) AS last_order_day,
  AVG(op.add_to_cart_order) AS avg_cart_order
FROM instacart.order_products__prior AS op
INNER JOIN (
  -- Running total of days_since_prior_order per user, in order_number order: the day offset of
  -- each prior order from the user's first one. A NULL gap is counted as 0 days.
  SELECT *, 
  SUM(COALESCE(days_since_prior_order,0)) OVER (PARTITION BY user_id ORDER BY order_number ASC ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS `days_since_first_order`
  FROM instacart.orders WHERE eval_set = "prior"
) AS orders ON orders.order_id = op.order_id
GROUP BY 1,2

In [7]:
%%bq execute -q user_products_q -t instacart.user_products -m overwrite

user_id,product_id,num_orders,num_reorders,first_order_number,first_order_day,last_order_number,last_order_day,avg_cart_order
124325,30827,18,17,7,51.0,44,356.0,2.05555555556
170848,35108,20,19,4,12.0,88,360.0,7.3
204322,7175,31,30,3,15.0,72,337.0,4.35483870968
17918,9339,15,14,5,30.0,76,329.0,11.1333333333
56296,21903,32,31,1,0.0,47,319.0,4.5625
29334,4605,16,15,1,0.0,50,349.0,7.625
56274,43005,13,12,5,27.0,44,343.0,3.69230769231
135222,33731,13,12,30,145.0,78,342.0,6.15384615385
101437,9076,11,10,9,49.0,47,303.0,5.81818181818
44098,46990,13,12,3,22.0,44,336.0,10.2307692308


## User-Product Features

User-product level data that forms the base for the reorder model (includes both train and test rows)

Contains the "basket" of all products that each user has ever purchased; the model tries to predict which of them will be reordered (admittedly this covers only ~2/3 of the problem, since ~1/3 of purchases in the train set are of products the user never purchased before)

In [8]:
%%bq query -n user_products_features_q
-- Base table for the reorder model: every (user, previously purchased product) pair is joined to
-- that user's train or test order and, for train orders, labelled with whether it was bought.
WITH up_features AS (
  -- Ratios of the pair's counts to the user's totals. NULLIF turns a zero denominator into NULL
  -- instead of a division error (e.g. the product was first bought in the user's last prior order).
  SELECT up.user_id, up.product_id,
    up.num_orders / users.num_orders AS `perc_all_orders`,
    up.num_reorders / NULLIF(users.num_orders - up.first_order_number,0) AS `perc_reorder`,
    up.num_orders / NULLIF(users.days_bw_first_last_order,0) AS `orders_per_day`,
    up.num_reorders / NULLIF(users.days_bw_first_last_order - up.first_order_day,0) AS `reorders_per_day`,
    up.first_order_number, up.first_order_day, up.last_order_number, up.last_order_day, up.avg_cart_order, 
    users.days_bw_first_last_order
  FROM instacart.user_products AS up
  INNER JOIN instacart.users AS users ON up.user_id = users.user_id
)

SELECT up.*, 
  orders.order_id, orders.eval_set, orders.order_hour_of_day, orders.order_dow,
  -- Label: NULL for test orders; otherwise 1 when a matching order_products__train row exists,
  -- else 0 (LEAST(order_id, 1) yields 1 assuming order ids are positive).
  CASE WHEN orders.eval_set = "test" THEN NULL ELSE LEAST(COALESCE(op_train.order_id,0),1) END AS `is_ordered`,
  -- Days from the pair's last purchase to the last prior order, plus the gap to the target order.
  up.days_bw_first_last_order - up.last_order_day + orders.days_since_prior_order AS `days_since_last_order`,
  -- NOTE(review): because of the + 1, a product bought in the order right before the target
  -- order gets the value 2 rather than 1; confirm this offset is intended.
  orders.order_number - up.last_order_number + 1 AS `orders_since_last_order`
FROM up_features AS up
INNER JOIN instacart.orders AS orders ON orders.user_id = up.user_id AND orders.eval_set IN ('train','test')
LEFT JOIN instacart.order_products__train AS op_train ON orders.order_id = op_train.order_id AND up.product_id = op_train.product_id

In [9]:
%%bq execute -q user_products_features_q -t instacart.user_products_features -m overwrite

user_id,product_id,perc_all_orders,perc_reorder,orders_per_day,reorders_per_day,first_order_number,first_order_day,last_order_number,last_order_day,avg_cart_order,days_bw_first_last_order,order_id,eval_set,order_hour_of_day,order_dow,is_ordered,days_since_last_order,orders_since_last_order
91771,28427,0.161290322581,0.311111111111,0.0421348314607,0.0773480662983,48,175.0,93,356.0,5.53333333333,356.0,1811426,test,18,5,,4.0,2
124027,18828,0.196428571429,0.454545454545,0.0421455938697,0.106382978723,34,167.0,56,261.0,4.36363636364,261.0,1269013,test,8,3,,7.0,2
49934,16223,0.144578313253,0.733333333333,0.0368098159509,0.129411764706,68,241.0,81,316.0,3.75,326.0,435443,test,14,2,,17.0,4
28472,29447,0.220588235294,0.5,0.0857142857143,0.197183098592,40,104.0,68,175.0,1.8,175.0,1694317,test,12,1,,11.0,2
59253,1194,0.244444444444,0.666666666667,0.0436507936508,0.15873015873,30,189.0,45,252.0,4.81818181818,252.0,2772513,test,8,5,,3.0,2
68981,47209,0.327868852459,0.826086956522,0.0571428571429,0.11801242236,38,189.0,61,350.0,13.15,350.0,2936849,test,9,1,,15.0,2
174683,12576,0.131313131313,0.521739130435,0.0383480825959,0.115384615385,76,235.0,99,339.0,10.1538461538,339.0,2896837,test,10,2,,1.0,2
192254,19660,0.245614035088,0.590909090909,0.0388888888889,0.0849673202614,35,207.0,57,360.0,3.57142857143,360.0,2779757,test,19,3,,4.0,2
203332,16996,0.119565217391,0.172413793103,0.0304709141274,0.0523560209424,34,170.0,74,284.0,3.63636363636,361.0,3188993,test,11,0,,79.0,20
144784,38739,0.407407407407,0.913043478261,0.0782918149466,0.189189189189,31,170.0,54,281.0,9.68181818182,281.0,2840170,test,13,0,,5.0,2


## User Features

Some additional user-level features that we'll join in later

In [10]:
%%bq query -n user_features_q
-- One row per user, aggregated over all of the user's prior order lines.
SELECT orders.user_id,
  ANY_VALUE(users.num_orders) AS num_orders,
  -- Reordered lines divided by the lines that belong to any order after the user's first one.
  -- NULLIF guards the denominator the same way the other feature queries do, so a user whose
  -- lines all sit in order number 1 yields NULL instead of a division-by-zero error.
  SUM(op.reordered) / NULLIF(SUM(CASE WHEN orders.order_number > 1 THEN 1 ELSE 0 END),0) AS perc_reorder,
  ANY_VALUE(users.days_bw_first_last_order) / ANY_VALUE(users.num_orders) AS avg_days_bw_orders,
  -- Breadth of the user's purchases at product, aisle and department level.
  COUNT(DISTINCT op.product_id) AS num_products,
  COUNT(DISTINCT products.aisle_id) AS num_aisles,
  COUNT(DISTINCT products.department_id) AS num_departments
FROM instacart.orders AS orders
-- The inner join to order_products__prior restricts the rows to prior orders.
INNER JOIN instacart.order_products__prior AS op ON op.order_id = orders.order_id
INNER JOIN instacart.users AS users ON orders.user_id = users.user_id
INNER JOIN instacart.products AS products ON op.product_id = products.product_id
GROUP BY 1

In [11]:
%%bq execute -q user_features_q -t instacart.user_features -m overwrite

user_id,num_orders,perc_reorder,avg_days_bw_orders,num_products,num_aisles,num_departments
109010,3,1.0,0.0,1,1,1
99295,3,1.0,0.0,1,1,1
164320,3,1.0,0.0,2,2,1
15495,3,1.0,0.0,1,1,1
62180,3,0.666666666667,0.0,2,1,1
181478,3,1.0,0.0,3,2,1
202329,3,0.75,2.0,4,3,1
97779,14,1.0,2.0,1,1,1
50492,4,0.75,2.0,2,2,1
172259,5,0.8,2.0,7,2,1


## Product Features

Some additional product-level features that we'll join in later

In [12]:
%%bq query -n product_features_q
-- Product-level aggregates over every user that bought the product in prior orders.
WITH up_features AS (
  -- Per (user, product): the pair's counts next to the user-level totals they are measured against.
  SELECT
    up.user_id,
    up.product_id,
    up.num_orders,
    users.num_orders AS `num_orders_user`,
    up.num_reorders,
    users.num_orders - up.first_order_number AS `num_reorders_user`,
    users.days_bw_first_last_order AS `order_days`,
    users.days_bw_first_last_order - up.first_order_day AS `reorder_days`,
    up.first_order_number,
    up.first_order_day
  FROM instacart.user_products AS up
  INNER JOIN instacart.users AS users ON up.user_id = users.user_id
),
user_totals AS (
  -- Single-row table: number of distinct users present in user_products.
  SELECT COUNT(DISTINCT user_id) AS num_users_total
  FROM instacart.user_products
)

SELECT
  up.product_id,
  products.aisle_id,
  products.department_id,
  COUNT(DISTINCT up.user_id) / ANY_VALUE(user_totals.num_users_total) AS `perc_users`,
  SUM(up.num_orders) / SUM(up.num_orders_user) AS `perc_all_orders`,
  SUM(up.num_reorders) / NULLIF(SUM(up.num_reorders_user), 0) AS `perc_reorders`,
  SUM(up.num_orders) / NULLIF(SUM(up.order_days), 0) AS `orders_per_day`,
  SUM(up.num_reorders) / NULLIF(SUM(up.reorder_days), 0) AS `reorders_per_day`,
  AVG(up.first_order_number) AS `avg_first_order_number`,
  AVG(up.first_order_day) AS `avg_first_order_day`
FROM up_features AS up
INNER JOIN instacart.products AS products ON up.product_id = products.product_id
-- Attach the single total-users row to every input row.
CROSS JOIN user_totals
GROUP BY 1, 2, 3

In [13]:
%%bq execute -q product_features_q -t instacart.product_features -m overwrite

product_id,aisle_id,department_id,perc_users,perc_all_orders,perc_reorders,orders_per_day,reorders_per_day,avg_first_order_number,avg_first_order_day
13029,1,20,0.0012657061525,0.143698679043,0.161328588375,0.0165226493709,0.0181733146255,8.54406130268,71.9808429119
5653,1,20,0.00157607087954,0.131247514252,0.161721789883,0.0152605860682,0.0186803000084,10.5569230769,90.0738461538
26714,1,20,0.00307455057733,0.143773797772,0.17024112444,0.0159892725234,0.018334616539,9.35173501577,80.2712933754
10654,1,20,0.000387955908811,0.0764840182648,0.0584415584416,0.00778752833149,0.00606809753905,10.35,103.85
26870,1,20,0.000737116226741,0.0843142622502,0.0727951469902,0.00987559317686,0.00816925010473,9.93421052632,79.5526315789
8121,1,20,0.00070317008472,0.0611981962637,0.038503850385,0.00803631852019,0.00499696612771,7.04137931034,51.3586206897
25965,1,20,0.00321518459427,0.127115920291,0.148540653231,0.015843871433,0.0182357206842,10.7933634992,84.467571644
1600,1,20,0.000378257011091,0.0887342322749,0.0981308411215,0.0110115513333,0.0118823085628,13.0128205128,101.564102564
19268,1,20,0.00122691056161,0.0721782890007,0.0622033474894,0.00916627104408,0.00789799219716,11.6679841897,91.8537549407
6778,1,20,0.000940793078867,0.0503076366269,0.0268370607029,0.00641528591868,0.00342075256556,12.3505154639,96.793814433


## Time-of-Day Features

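For each aisle/department (aisles roll up into departments), compute how much more likely a purchase is at a given hour of day and day of week, relative to that time slot's share of all purchases (a factor above 1 means the aisle/department is over-represented in that slot)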

In [14]:
%%bq query -n tod_features_q
-- Time-of-day factors per (hour of day, day of week, aisle).
WITH temp AS (
  -- Number of prior order lines per (hour, day of week, aisle, department).
  SELECT orders.order_hour_of_day, orders.order_dow, products.aisle_id, products.department_id, COUNT(*) AS `n_product`
  FROM instacart.order_products__prior AS op
  INNER JOIN instacart.orders AS orders ON orders.order_id = op.order_id
  INNER JOIN instacart.products AS products ON op.product_id = products.product_id
  GROUP BY 1,2,3,4
)
-- factor = (share of the aisle's / department's lines that fall in this time slot)
--          / (share of all lines that fall in this time slot)
SELECT order_hour_of_day, order_dow, aisle_id,
  n_aisle / n_aisle_total / (n_hod_dow / n_total) AS `aisle_tod_factor`,
  n_department / n_department_total / (n_hod_dow / n_total) AS `department_tod_factor`
FROM (
  -- Window sums attach every needed total to each row of temp without collapsing rows, so the
  -- output has one row per temp row.
  -- NOTE(review): uniqueness per (hour, dow, aisle) relies on each aisle having one department.
  SELECT order_hour_of_day, order_dow, aisle_id, department_id,
  SUM(n_product) OVER (PARTITION BY order_hour_of_day, order_dow, aisle_id) AS `n_aisle`,
  SUM(n_product) OVER (PARTITION BY aisle_id) AS `n_aisle_total`,
  SUM(n_product) OVER (PARTITION BY order_hour_of_day, order_dow, department_id) AS `n_department`,
  SUM(n_product) OVER (PARTITION BY department_id) AS `n_department_total`,
  SUM(n_product) OVER (PARTITION BY order_hour_of_day, order_dow) AS `n_hod_dow`,
  SUM(n_product) OVER () AS `n_total`
  FROM temp  
) AS x

In [15]:
%%bq execute -q tod_features_q -t instacart.tod_features -m overwrite

order_hour_of_day,order_dow,aisle_id,aisle_tod_factor,department_tod_factor
0,0,41,1.13133403424,0.981797884424
0,0,108,0.985975863352,0.973611227146
0,0,47,0.925793104668,0.983352194461
0,1,56,0.887944733997,0.77740556466
0,1,112,0.911520664653,0.88298492536
0,1,96,1.05277665231,1.07156030786
0,2,56,1.01353100184,0.992661765603
0,2,104,1.09912413205,1.07120623463
0,3,17,1.15422695185,1.14344748087
0,3,129,1.17484020237,1.07012363996


## Model Universe

Assemble model universe with all features

In [16]:
%%bq query -n reorder_model_universe_q
-- Final modelling table: one row per user_products_features row, widened with the user-, product-
-- and time-of-day-level features. Column prefixes name the source table of each feature.
SELECT
-- Keys, order context and label.
upf.user_id,
upf.product_id,
pf.aisle_id,
pf.department_id,
upf.order_id,
upf.eval_set,
upf.order_hour_of_day,
upf.order_dow,
upf.is_ordered,
-- upf_*: user-product features.
upf.perc_all_orders AS `upf_perc_all_orders`,
upf.perc_reorder AS `upf_perc_reorder`,
upf.orders_per_day AS `upf_orders_per_day`,
upf.reorders_per_day AS `upf_reorders_per_day`,
upf.first_order_number AS `upf_first_order_number`,
upf.first_order_day AS `upf_first_order_day`,
upf.last_order_number AS `upf_last_order_number`,
upf.last_order_day AS `upf_last_order_day`,
upf.days_since_last_order AS `upf_days_since_last_order`,
upf.orders_since_last_order AS `upf_orders_since_last_order`,
upf.avg_cart_order AS `upf_avg_cart_order`,
-- uf_*: user features.
uf.num_orders AS `uf_num_orders`,
uf.perc_reorder AS `uf_perc_reorder`,
uf.avg_days_bw_orders AS `uf_avg_days_bw_orders`,
uf.num_products AS `uf_num_products`,
uf.num_aisles AS `uf_num_aisles`,
uf.num_departments AS `uf_num_departments`,
-- pf_*: product features.
pf.perc_users AS `pf_perc_users`,
pf.perc_all_orders AS `pf_perc_all_orders`,
pf.perc_reorders AS `pf_perc_reorders`,
pf.orders_per_day AS `pf_orders_per_day`,
pf.reorders_per_day AS `pf_reorders_per_day`,
pf.avg_first_order_number AS `pf_avg_first_order_number`,
pf.avg_first_order_day AS `pf_avg_first_order_day`,
-- tf_*: time-of-day factors; NULL when the LEFT JOIN below finds no matching slot.
tf.aisle_tod_factor AS `tf_aisle_tod_factor`,
tf.department_tod_factor AS `tf_department_tod_factor`
FROM instacart.user_products_features AS upf
INNER JOIN instacart.user_features AS uf ON upf.user_id = uf.user_id
INNER JOIN instacart.product_features AS pf ON upf.product_id = pf.product_id
LEFT JOIN instacart.tod_features AS tf ON pf.aisle_id = tf.aisle_id AND upf.order_hour_of_day = tf.order_hour_of_day AND upf.order_dow = tf.order_dow

In [17]:
%%bq execute -q reorder_model_universe_q -t instacart.reorder_model_universe -m overwrite

user_id,product_id,aisle_id,department_id,order_id,eval_set,order_hour_of_day,order_dow,is_ordered,upf_perc_all_orders,upf_perc_reorder,upf_orders_per_day,upf_reorders_per_day,upf_first_order_number,upf_first_order_day,upf_last_order_number,upf_last_order_day,upf_days_since_last_order,upf_orders_since_last_order,upf_avg_cart_order,uf_num_orders,uf_perc_reorder,uf_avg_days_bw_orders,uf_num_products,uf_num_aisles,uf_num_departments,pf_perc_users,pf_perc_all_orders,pf_perc_reorders,pf_orders_per_day,pf_reorders_per_day,pf_avg_first_order_number,pf_avg_first_order_day,tf_aisle_tod_factor,tf_department_tod_factor
45426,48900,3,19,1751814,test,16,2,,0.0151515151515,0.0,0.00280898876404,0.0,47,241.0,47,241.0,124.0,21,4.0,66,0.571614583333,5.39393939394,336,78,19,0.00388440853697,0.0618992695117,0.0743630938719,0.00798439488866,0.0101264240284,18.3458177278,146.676654182,0.977539463197,1.02942079515
96767,24654,43,3,800409,test,8,0,,0.0227272727273,,0.002849002849,,44,351.0,44,351.0,8.0,2,24.0,44,0.546644844517,7.97727272727,293,68,14,0.00899572763555,0.0696813739663,0.052775195658,0.00831784044472,0.00621343867936,9.98921832884,81.5908355795,1.06276628955,1.02631329148
63793,19548,88,13,1061343,test,7,4,,0.020202020202,0.0833333333333,0.00604229607251,0.0232558139535,87,288.0,92,305.0,29.0,9,16.0,99,0.701067615658,3.34343434343,351,78,19,0.0022743915154,0.0728775878838,0.0726089133701,0.00983526632101,0.00979316167159,13.6588486141,101.134328358,1.13748589322,0.980557812181
191999,44275,93,3,133460,test,13,0,,0.02,0.0,0.00333333333333,0.0,42,276.0,42,276.0,30.0,10,7.0,50,0.743002544529,6.0,111,45,16,0.0033703669578,0.146473399506,0.181392931393,0.0177078240773,0.0208861138263,10.7870503597,82.364028777,0.879227748369,1.01138394065
19063,1525,75,17,1031543,test,8,5,,0.0188679245283,0.0,0.00285714285714,0.0,52,346.0,52,346.0,9.0,3,9.0,53,0.597063621533,6.60377358491,259,70,18,0.00161001702157,0.0624606472736,0.0403047431801,0.00691867763984,0.00436844067977,11.6626506024,102.855421687,1.13329947898,1.10579953701
159765,6326,45,19,1448336,test,13,6,,0.0105263157895,0.0,0.00280112044818,0.0,74,291.0,74,291.0,74.0,23,5.0,95,0.889887640449,3.75789473684,57,18,9,0.000436450397412,0.0711705202312,0.105418719212,0.0093188268685,0.0130870841487,19.4777777778,144.044444444,0.879238737713,0.892953279911
122851,27344,96,20,1756510,test,13,0,,0.0128205128205,0.0,0.00390625,0.0,59,182.0,59,182.0,78.0,21,6.0,78,0.611208406305,3.28205128205,230,58,16,0.0683820783768,0.119701215975,0.127432540643,0.0138627840754,0.0144386355352,9.33799021346,77.7201616907,1.10351228795,1.05392501603
93519,49175,59,15,3113656,test,14,1,,0.020202020202,0.03125,0.0117647058824,0.0434782608696,67,147.0,71,150.0,21.0,30,14.5,99,0.838383838384,1.71717171717,313,56,17,0.0268998928272,0.066764078472,0.0597038544139,0.00843486483497,0.00746661087677,13.2915089237,103.915990626,1.03230854279,1.01759129571
290,38843,61,19,3116687,test,11,6,,0.0196078431373,0.0,0.00280112044818,0.0,45,315.0,45,315.0,49.0,8,19.0,51,0.664964901085,7.0,548,95,19,4.84944886014e-05,0.0369230769231,0.0168067226891,0.00502933780386,0.00159362549801,20.6,113.1,0.84947450101,0.871637846917
84411,30391,83,4,3257554,test,9,0,,0.0196078431373,0.0,0.00291545189504,0.0,42,261.0,42,261.0,94.0,11,5.0,51,0.667832167832,6.72549019608,198,61,16,0.118040434705,0.129067682186,0.148752544406,0.0157947546373,0.0181225502257,10.1088698081,82.0387001356,1.16286121217,1.09356689415


The export has to be sharded: BigQuery can export at most 1 GB of table data to a single file, so we export to a wildcard URI and get several files

In [18]:
# Locations inside the default project's bucket (the bucket is named after the project id).
project_id = Context.default().project_id
bucket_path = ''.join(['gs://', project_id])
# Wildcard URI so that the export can be split across several shard files.
bucket_object = ''.join([bucket_path, '/reorder_model/universe_*.csv'])
# Destination of the single concatenated file.
bucket_object2 = ''.join([bucket_path, '/reorder_model/universe.csv'])

# Export without header rows so the shards can be concatenated as they are.
table = bq.Table('instacart.reorder_model_universe')
table.extract(destination=bucket_object, csv_header=False)

Job kaggle-instacart-172517/job_mPsv0AaTCj2GNP_v9LGFgoJ3Xlc completed

Concatenate the shards together

In [19]:
%%bash -s "$bucket_object" "$bucket_object2"
# Positional arguments: 1 = wildcard URI matching the exported shards, 2 = destination object.
# Both are quoted so the shell neither globs nor word-splits them; gsutil expands the
# wildcard itself.
# NOTE: gsutil compose accepts at most 32 source objects per invocation.
gsutil compose "$1" "$2"

Composing gs://kaggle-instacart-172517/reorder_model/universe.csv from 12 component object(s).


Import as Pandas DF

In [20]:
fields = [str(x.name) for x in list(table.schema)]
%storage read --object $bucket_object2 --variable model_universe_raw
model_universe = pd.read_csv(StringIO(model_universe_raw), header=None, names=fields)
print model_universe.shape

(13307953, 35)


## LightGBM Model

Create a LightGBM model for predicting reorders

A little more feature engineering

In [21]:
# "Overdue" features: the pair's (re)order rate per day multiplied by the days elapsed since
# the product was last ordered.
days_since_last = model_universe['upf_days_since_last_order']
model_universe['order_overdue'] = model_universe['upf_orders_per_day'] * days_since_last
model_universe['reorder_overdue'] = model_universe['upf_reorders_per_day'] * days_since_last

Set up our train and test sets

In [22]:
# Rows of "train" orders carry a label; rows of "test" orders are only scored later.
is_train_row = model_universe.eval_set == "train"
is_test_row = model_universe.eval_set == "test"
df = model_universe[is_train_row]
df_test = model_universe[is_test_row]

# Split by user rather than by row, so all rows of one user land on the same side:
# a seeded 80% sample of users is used for fitting, the remaining users for validation.
df_users = df[['user_id']].drop_duplicates()
df_users_train = df_users.sample(frac=0.8, random_state=200)
df_users_eval = df_users.drop(df_users_train.index)
df_train = pd.merge(df, df_users_train, on=['user_id'], how='inner')
df_eval = pd.merge(df, df_users_eval, on=['user_id'], how='inner')

y_val = 'is_ordered'
x_excludes = [y_val, 'user_id', 'order_id', 'eval_set']
# NOTE(review): `categoricals` is never passed to lgb.Dataset below, so these id columns are not
# declared as categorical features; confirm whether that is intended.
categoricals = ['product_id', 'aisle_id', 'department_id']


def _features_and_target(frame):
    """Return (features, target) for `frame`: all columns except `x_excludes`, and the `y_val` column."""
    return frame.drop(x_excludes, axis=1), frame[y_val]


x_train, y_train = _features_and_target(df_train)
x_eval, y_eval = _features_and_target(df_eval)
x_test, y_test = _features_and_target(df_test)

# The validation set references the training set.
lgb_train = lgb.Dataset(x_train, y_train)
lgb_eval = lgb.Dataset(x_eval, y_eval, reference=lgb_train)

Train our model

In [23]:
# Gradient-boosted trees with a binary objective, evaluated with log loss on the validation set.
params = dict(
    task='train',
    boosting_type='gbdt',
    objective='binary',
    metric=['binary_logloss'],
    num_leaves=96,
    learning_rate=0.05,
    feature_fraction=0.9,
    bagging_fraction=0.9,
    bagging_freq=5,
    verbose=0
)

max_rounds = 500  # upper bound on boosting rounds
patience = 10     # stop once the validation metric has not improved for this many rounds

gbm = lgb.train(
    params,
    lgb_train,
    num_boost_round=max_rounds,
    valid_sets=lgb_eval,
    early_stopping_rounds=patience
)

[1]	valid_0's binary_logloss: 0.658385
Train until valid scores didn't improve in 10 rounds.
[2]	valid_0's binary_logloss: 0.626891
[3]	valid_0's binary_logloss: 0.598296
[4]	valid_0's binary_logloss: 0.57225
[5]	valid_0's binary_logloss: 0.548443
[6]	valid_0's binary_logloss: 0.526663
[7]	valid_0's binary_logloss: 0.506615
[8]	valid_0's binary_logloss: 0.488172
[9]	valid_0's binary_logloss: 0.47122
[10]	valid_0's binary_logloss: 0.455522
[11]	valid_0's binary_logloss: 0.441
[12]	valid_0's binary_logloss: 0.427562
[13]	valid_0's binary_logloss: 0.415112
[14]	valid_0's binary_logloss: 0.403569
[15]	valid_0's binary_logloss: 0.392851
[16]	valid_0's binary_logloss: 0.382899
[17]	valid_0's binary_logloss: 0.373667
[18]	valid_0's binary_logloss: 0.36505
[19]	valid_0's binary_logloss: 0.357025
[20]	valid_0's binary_logloss: 0.349556
[21]	valid_0's binary_logloss: 0.342595
[22]	valid_0's binary_logloss: 0.3361
[23]	valid_0's binary_logloss: 0.330042
[24]	valid_0's binary_logloss: 0.324389
[25

Plot importance ratings

In [24]:
# Table of feature names and their importance as reported by the booster, largest first.
importance = {'var': gbm.feature_name(), 'importance': gbm.feature_importance()}
importance_df = pd.DataFrame(data=importance).sort_values('importance', ascending=False)
print(importance_df)

    importance                          var
17        2519              uf_perc_reorder
13        2214    upf_days_since_last_order
5         1392          upf_perc_all_orders
22        1366                pf_perc_users
14        1304  upf_orders_since_last_order
12        1277           upf_last_order_day
19        1213              uf_num_products
18        1211        uf_avg_days_bw_orders
31        1072                order_overdue
24        1061             pf_perc_reorders
32        1048              reorder_overdue
20         947                uf_num_aisles
26         863          pf_reorders_per_day
15         843           upf_avg_cart_order
6          760             upf_perc_reorder
3          738            order_hour_of_day
1          678                     aisle_id
10         656          upf_first_order_day
23         645           pf_perc_all_orders
2          624                department_id
28         621       pf_avg_first_order_day
7          621           upf_ord

## Calculate F1 Score on Eval Set

Use model to generate predictions, outer join with actual orders, and calculate F1 score

Pull down actual orders

In [25]:
%%bq query -n actuals
-- Every (order, product) pair that was actually bought in a train order.
SELECT
  op_train.order_id,
  op_train.product_id
FROM instacart.order_products__train AS op_train

In [26]:
# Run the query into a DataFrame, then keep only the orders that belong to the validation split.
eval_order_ids = df_eval[['order_id']].drop_duplicates()
actuals_all_df = actuals.execute(output_options=bq.QueryOutput.dataframe()).result()
actuals_df = pd.merge(actuals_all_df, eval_order_ids, on=['order_id'], how="inner")

Generate predictions for eval set

In [27]:
# Score every validation row with the best iteration and keep the (order, product) pairs whose
# score exceeds the cut-off.
pred_threshold = 0.18
y_eval_pred = gbm.predict(x_eval, num_iteration=gbm.best_iteration)
is_predicted = y_eval_pred > pred_threshold
pred_df = df_eval.loc[is_predicted, ['order_id', 'product_id']]

Join together

In [28]:
# Flag each side, then outer-join actual and predicted (order, product) pairs. A pair present on
# only one side gets 0 for the other flag, i.e. it counts as a false positive or false negative.
# `assign` returns new frames, so neither `actuals_df` nor `pred_df` (a slice of `df_eval`) is
# modified in place; this avoids pandas' SettingWithCopyWarning and keeps the cell re-runnable.
actual_pairs = actuals_df.assign(actual=1)
predicted_pairs = pred_df.assign(predicted=1)
all_df = actual_pairs.merge(predicted_pairs, on=['order_id', 'product_id'], how="outer").fillna(0)
# A single F1 computed over all pairs pooled together (not averaged per order).
print(f1_score(all_df['actual'], all_df['predicted']))

0.348949101973


## Create Predictions for Test

Use model to generate predictions for our test set

In [29]:
# Score the test rows and keep the (order, product) pairs above the threshold used on the eval set.
y_test_pred = gbm.predict(x_test, num_iteration=gbm.best_iteration)
df_test2 = df_test.loc[y_test_pred > pred_threshold, ['order_id', 'product_id']]

# Collapse each order's predicted products into one space-separated string.
df_test3 = (
    df_test2.groupby('order_id')['product_id']
    .apply(lambda ids: ' '.join(str(i) for i in ids))
    .reset_index()
)
df_test3.columns = ['order_id', 'products']

# Left-join onto every test order so that orders without any predicted product keep a row.
df_test4 = df_test[['order_id']].drop_duplicates().merge(df_test3, on=['order_id'], how="left")
# Those orders come out of the merge as NaN, which would be written as an empty CSV field.
# Write the explicit 'None' label instead, which is how the Kaggle Instacart submission
# format marks an order predicted to contain no reordered products.
df_test4['products'] = df_test4['products'].fillna('None')
print(df_test4.shape) # should be 75000

(75000, 2)


In [30]:
# Preview the first five rows of the per-order prediction table.
df_test4.head(5)

Unnamed: 0,order_id,products
0,1751814,3952 3397 16154 32689 8277 27086 47766 21616 2...
1,800409,5258 26209 47209 21137 13176 24852 27695 22935...
2,1061343,32486 16363 31433 16908 36758 34943 44825 4690...
3,133460,27845 4920 24184 14355 22935 36702 21903 28985...
4,1031543,3957 47280 32655 39276 27992 4472


In [31]:
buf = StringIO()
df_test4.to_csv(buf, index = None)
buf = buf.getvalue()
bucket_object3 = bucket_path + '/reorder_model/test_predictions.csv'
%storage write --variable buf --object $bucket_object3