Install some packages before we get started

In [2]:
#%%bash
#echo "apt-get update" >> /content/datalab/.config/startup.sh
#echo "pip install lightgbm" >> /content/datalab/.config/startup.sh
#echo "apt-get install libgomp1" >> /content/datalab/.config/startup.sh
#cat /content/datalab/.config/startup.sh

In [1]:
from google.datalab import Context
import google.datalab.bigquery as bq
import google.datalab.storage as storage
import numpy as np
import pandas as pd
import json
import lightgbm as lgb
from sklearn.metrics import f1_score
from StringIO import StringIO

## Build Some Intermediary Tables

Pull some additional information for each user that we'll need later
Pull some additional information on when each user purchases each product

In [5]:
%%bq query -n users_q
-- One row per user, computed from that user's "prior" orders only.
--   is_train                  1 when the user also has an order with eval_set = "train", otherwise 0
--   num_orders                number of prior orders
--   days_bw_first_last_order  sum of days_since_prior_order over the prior orders
SELECT orders.user_id, COALESCE(user_flags.is_train,0) AS is_train,
  COUNT(*) AS num_orders, SUM(days_since_prior_order) AS days_bw_first_last_order
FROM instacart.orders AS orders
-- The subquery yields one row per user that has at least one "train" order;
-- users with no match get NULL from the LEFT JOIN, which the COALESCE above maps to 0.
LEFT JOIN (
  SELECT user_id, 1 AS `is_train` FROM instacart.orders WHERE eval_set = "train" GROUP BY 1
) AS user_flags ON orders.user_id = user_flags.user_id
WHERE eval_set = "prior"
GROUP BY 1,2

In [6]:
%%bq execute -q users_q -t instacart.users  -m overwrite

user_id,is_train,num_orders,days_bw_first_last_order
9515,0,3,0.0
99295,0,3,0.0
164320,1,3,0.0
15495,1,3,0.0
80567,1,3,0.0
179078,1,3,0.0
137150,0,3,0.0
121915,1,3,0.0
36904,1,3,0.0
133075,1,3,0.0


In [7]:
%%bq query -n user_products_q
-- One row per (user, product) pair seen in the user's "prior" orders.
--   num_orders                        number of prior order lines containing the product
--   num_reorders                      sum of the reordered flag over those lines
--   first/last_order_number           lowest / highest order_number in which the product appears
--   first/last_order_day              lowest / highest days_since_first_order at which it appears
SELECT orders.user_id, op.product_id, 
  COUNT(*) AS num_orders, SUM(op.reordered) AS num_reorders,
  MIN(orders.order_number) AS first_order_number, MIN(days_since_first_order) AS first_order_day,
  MAX(orders.order_number) AS last_order_number, MAX(days_since_first_order) AS last_order_day
FROM instacart.order_products__prior AS op
-- The subquery adds days_since_first_order, a per-user running total of
-- days_since_prior_order in order_number order. A NULL gap is counted as 0 days.
INNER JOIN (
  SELECT *, 
  SUM(COALESCE(days_since_prior_order,0)) OVER (PARTITION BY user_id ORDER BY order_number ASC ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS `days_since_first_order`
  FROM instacart.orders WHERE eval_set = "prior"
) AS orders ON orders.order_id = op.order_id
GROUP BY 1,2

In [8]:
%%bq execute -q user_products_q -t instacart.user_products  -m overwrite

user_id,product_id,num_orders,num_reorders,first_order_number,first_order_day,last_order_number,last_order_day
125780,43504,15,14,28,213.0,58,342.0
113653,1671,23,22,6,13.0,66,292.0
94556,21137,22,21,2,14.0,42,358.0
65180,44632,31,30,33,98.0,63,216.0
16469,13176,28,27,2,6.0,52,357.0
27764,5785,27,26,1,0.0,53,350.0
201415,14182,12,11,22,167.0,41,349.0
120538,27744,64,63,2,6.0,96,358.0
91987,6948,38,37,1,0.0,71,356.0
91133,20247,36,35,2,3.0,73,323.0


## User-Product Features

User-product level data that forms the base table for the reorder model (includes both train and test orders)

Contains the "basket" of all products that each user has ever purchased; the model tries to predict which of them will be reordered (admittedly this is only ~2/3 of the problem, since ~1/3 of purchases in the train set are of products the user had never purchased before)

In [9]:
%%bq query -n user_products_features_q
-- Base table for the reorder model. One row per (user, product) pair from
-- instacart.user_products, attached to each of that user's "train" or "test" orders.
WITH up_features AS (
  -- Per (user, product) ratios. NULLIF(..., 0) turns a zero denominator into NULL
  -- so the division yields NULL instead of failing.
  --   perc_all_orders   product order count / user's prior order count
  --   perc_reorder      product reorder count / prior orders placed after the first purchase
  --   orders_per_day    product order count / days spanned by the user's prior orders
  --   reorders_per_day  product reorder count / days elapsed since the first purchase
  SELECT up.user_id, up.product_id,
    up.num_orders / users.num_orders AS `perc_all_orders`,
    up.num_reorders / NULLIF(users.num_orders - up.first_order_number,0) AS `perc_reorder`,
    up.num_orders / NULLIF(users.days_bw_first_last_order,0) AS `orders_per_day`,
    up.num_reorders / NULLIF(users.days_bw_first_last_order - up.first_order_day,0) AS `reorders_per_day`,
    up.first_order_number, up.first_order_day, up.last_order_number, up.last_order_day, users.days_bw_first_last_order
  FROM instacart.user_products AS up
  INNER JOIN instacart.users AS users ON up.user_id = users.user_id
)

-- is_ordered is the label. It is NULL for "test" orders; otherwise 1 when the product
-- has a matching row in order_products__train for this order and 0 when it does not
-- (the LEAST/COALESCE pair assumes order_id values are >= 1 -- TODO confirm).
-- days_since_last_order adds the gap before this train/test order to the days between
-- the product's last prior purchase and the end of the user's prior history.
-- NOTE(review) orders_since_last_order includes a "+ 1", so it is one larger than the
-- plain difference in order numbers -- confirm this offset is intended.
SELECT up.*, 
  orders.order_id, orders.eval_set, orders.order_hour_of_day, orders.order_dow,
  CASE WHEN orders.eval_set = "test" THEN NULL ELSE LEAST(COALESCE(op_train.order_id,0),1) END AS `is_ordered`,
  up.days_bw_first_last_order - up.last_order_day + orders.days_since_prior_order AS `days_since_last_order`,
  orders.order_number - up.last_order_number + 1 AS `orders_since_last_order`
FROM up_features AS up
INNER JOIN instacart.orders AS orders ON orders.user_id = up.user_id AND orders.eval_set IN ('train','test')
LEFT JOIN instacart.order_products__train AS op_train ON orders.order_id = op_train.order_id AND up.product_id = op_train.product_id

In [10]:
%%bq execute -q user_products_features_q -t instacart.user_products_features  -m overwrite

user_id,product_id,perc_all_orders,perc_reorder,orders_per_day,reorders_per_day,first_order_number,first_order_day,last_order_number,last_order_day,days_bw_first_last_order,order_id,eval_set,order_hour_of_day,order_dow,is_ordered,days_since_last_order,orders_since_last_order
94259,18159,0.119565217391,0.114942528736,0.030985915493,0.0299401197605,5,21.0,62,252.0,355.0,2440622,test,12,6,,106.0,32
67798,14467,0.171717171717,0.177777777778,0.0508982035928,0.0542372881356,9,39.0,64,213.0,334.0,2024870,test,14,1,,123.0,37
75943,34448,0.121212121212,0.112244897959,0.0408163265306,0.0374149659864,1,0.0,45,118.0,294.0,2000764,test,8,2,,178.0,56
59881,19938,0.141414141414,0.135416666667,0.0590717299578,0.0565217391304,3,7.0,68,162.0,237.0,947621,test,10,4,,77.0,33
165953,44948,0.130952380952,0.123456790123,0.03125,0.0291545189504,3,9.0,45,217.0,352.0,302695,test,0,3,,136.0,41
126537,44661,0.151515151515,0.166666666667,0.0604838709677,0.0679611650485,15,42.0,61,160.0,248.0,1828410,test,9,6,,92.0,40
140819,32864,0.156626506024,0.153846153846,0.0422077922078,0.0428571428571,5,28.0,49,221.0,308.0,2090419,test,6,0,,90.0,36
186704,18827,0.111111111111,0.102040816327,0.0355987055016,0.0323624595469,1,0.0,53,160.0,309.0,2531767,test,9,2,,153.0,48
190542,20114,0.111111111111,0.102040816327,0.0331325301205,0.0301204819277,1,0.0,66,216.0,332.0,1345825,test,7,1,,121.0,35
187842,47766,0.20202020202,0.19387755102,0.0684931506849,0.0650684931507,1,0.0,43,134.0,292.0,2422565,test,19,0,,159.0,58


## User Features

Some additional user-level features that we'll join in later

In [11]:
%%bq query -n user_features_q
-- One row per user with basket-breadth features. The joins fan out to one row per
-- (user, product), so the user-level columns are collapsed back with ANY_VALUE.
--   num_products / num_aisles / num_departments  distinct counts over everything the user bought
-- NOTE(review) avg_days_bw_orders divides the total days by the number of orders, not by the
-- number of gaps between orders (num_orders - 1) -- confirm this is the intended definition.
SELECT users.user_id,
  ANY_VALUE(users.num_orders) AS num_orders, 
  ANY_VALUE(users.days_bw_first_last_order) / ANY_VALUE(users.num_orders) AS avg_days_bw_orders,
  COUNT(DISTINCT up.product_id) AS num_products,
  COUNT(DISTINCT products.aisle_id) AS num_aisles,
  COUNT(DISTINCT products.department_id) AS num_departments
FROM instacart.users AS users
INNER JOIN instacart.user_products AS up ON users.user_id = up.user_id
INNER JOIN instacart.products AS products ON up.product_id = products.product_id
GROUP BY 1

In [12]:
%%bq execute -q user_features_q -t instacart.user_features  -m overwrite

user_id,num_orders,avg_days_bw_orders,num_products,num_aisles,num_departments
99295,3,0.0,1,1,1
181478,3,0.0,3,2,1
15495,3,0.0,1,1,1
109010,3,0.0,1,1,1
164320,3,0.0,2,2,1
62180,3,0.0,2,1,1
113387,3,2.0,3,2,1
172259,5,2.0,7,2,1
97779,14,2.0,1,1,1
202329,3,2.0,4,3,1


## Product Features

Some additional product-level features that we'll join in later

In [13]:
%%bq query -n product_features_q
-- One row per product, aggregating the per (user, product) counts over every user
-- who has purchased that product.
WITH up_features AS (
  -- Per (user, product) numerators and denominators, kept separate so the outer
  -- query can sum them before dividing.
  --   num_reorders_user  prior orders the user placed after first buying the product
  --   reorder_days       days between the first purchase and the end of the user's prior history
  SELECT up.user_id, up.product_id,
    up.num_orders, users.num_orders AS `num_orders_user`,
    up.num_reorders, users.num_orders - up.first_order_number AS `num_reorders_user`,
    users.days_bw_first_last_order AS `order_days`, users.days_bw_first_last_order - up.first_order_day AS `reorder_days`,
    up.first_order_number, up.first_order_day
  FROM instacart.user_products AS up
  INNER JOIN instacart.users AS users ON up.user_id = users.user_id
)

-- perc_users is the share of all users (distinct user_id in user_products) who bought
-- the product. The remaining ratios are sum-of-numerators over sum-of-denominators
-- across the product's buyers, with NULLIF guarding against a zero denominator.
SELECT up.product_id, products.aisle_id, products.department_id,
  COUNT(DISTINCT user_id) / ANY_VALUE(num_users_total) AS `perc_users`,
  SUM(num_orders) / SUM(num_orders_user) AS `perc_all_orders`,
  SUM(num_reorders) / NULLIF(SUM(num_reorders_user),0) AS `perc_reorders`,
  SUM(num_orders) / NULLIF(SUM(order_days),0) AS `orders_per_day`,
  SUM(num_reorders) / NULLIF(SUM(reorder_days),0) AS `reorders_per_day`,
  AVG(first_order_number) AS `avg_first_order_number`,
  AVG(first_order_day) AS `avg_first_order_day`
FROM up_features AS up
INNER JOIN instacart.products AS products ON up.product_id = products.product_id
-- Single-row subquery joined on a always-true condition, which attaches the
-- global user count to every row.
INNER JOIN (
  SELECT COUNT(DISTINCT user_id) AS num_users_total
  FROM instacart.user_products
) AS x ON 1=1
GROUP BY 1,2,3

In [14]:
%%bq execute -q product_features_q -t instacart.product_features  -m overwrite

product_id,aisle_id,department_id,perc_users,perc_all_orders,perc_reorders,orders_per_day,reorders_per_day,avg_first_order_number,avg_first_order_day
13603,1,20,0.00180884442483,0.110287106685,0.122523844461,0.0131101707723,0.0140401025684,10.6890080429,85.3243967828
32170,1,20,0.0007177184313,0.129846708747,0.160902255639,0.0163306966063,0.0200665760233,12.0,94.2027027027
49445,1,20,0.000615880005237,0.0685944855414,0.0635838150289,0.00837197849551,0.00725525299161,13.8818897638,108.299212598
27216,1,20,0.00201252127696,0.068531877559,0.0438489646772,0.00788894873867,0.00503135864153,8.89156626506,76.7975903614
37167,1,20,0.000843804101664,0.0771420438372,0.0428445229682,0.0080688382064,0.00451351728631,7.17816091954,69.5114942529
17302,1,20,0.000969889772027,0.0797872340426,0.0736961451247,0.00808447046718,0.00718113019941,11.86,113.58
554,1,20,0.00170215654991,0.161957618567,0.189634146341,0.0189118331517,0.0216603983842,8.5698005698,70.7122507123
31315,1,20,0.00131905008996,0.131355321907,0.156402737048,0.0157512953368,0.0185431998609,10.4816176471,85.9779411765
39518,1,20,0.000620729454098,0.0577791152573,0.0498442367601,0.0070432868672,0.005936369539,15.9296875,128.7421875
30763,1,20,0.000940793078867,0.0599653379549,0.0377264829983,0.00755887621794,0.00469759248385,8.97422680412,69.1597938144


## Time-of-Day Features

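For each aisle/department (aisles roll up into departments), compute how much more (or less) likely a purchase from it is in a given hour-of-day / day-of-week slot, relative to purchases overall: the aisle's (or department's) share of its own purchases that fall in the slot, divided by the share of all purchases that fall in the slot. A factor above 1 means the aisle/department is over-represented in that slot.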

In [15]:
%%bq query -n tod_features_q
-- Time-of-day lift factors per aisle and per department.
-- temp counts purchased order lines per (hour of day, day of week, aisle, department),
-- using the lines in order_products__prior.
WITH temp AS (
  SELECT orders.order_hour_of_day, orders.order_dow, products.aisle_id, products.department_id, COUNT(*) AS `n_product`
  FROM instacart.order_products__prior AS op
  INNER JOIN instacart.orders AS orders ON orders.order_id = op.order_id
  INNER JOIN instacart.products AS products ON op.product_id = products.product_id
  GROUP BY 1,2,3,4
)
-- Each factor is (share of the aisle's or department's purchases that fall in this
-- hour/day slot) divided by (share of all purchases that fall in this slot).
-- The output has one row per row of temp; presumably that is one row per
-- (hour, day, aisle) because each aisle belongs to a single department -- TODO confirm.
SELECT order_hour_of_day, order_dow, aisle_id,
  n_aisle / n_aisle_total / (n_hod_dow / n_total) AS `aisle_tod_factor`,
  n_department / n_department_total / (n_hod_dow / n_total) AS `department_tod_factor`
FROM (
  -- Window sums give every count needed for the ratios without collapsing rows.
  SELECT order_hour_of_day, order_dow, aisle_id, department_id,
  SUM(n_product) OVER (PARTITION BY order_hour_of_day, order_dow, aisle_id) AS `n_aisle`,
  SUM(n_product) OVER (PARTITION BY aisle_id) AS `n_aisle_total`,
  SUM(n_product) OVER (PARTITION BY order_hour_of_day, order_dow, department_id) AS `n_department`,
  SUM(n_product) OVER (PARTITION BY department_id) AS `n_department_total`,
  SUM(n_product) OVER (PARTITION BY order_hour_of_day, order_dow) AS `n_hod_dow`,
  SUM(n_product) OVER () AS `n_total`
  FROM temp  
) AS x

In [16]:
%%bq execute -q tod_features_q -t instacart.tod_features  -m overwrite

order_hour_of_day,order_dow,aisle_id,aisle_tod_factor,department_tod_factor
0,0,114,0.83094723151,0.870826738174
0,0,126,0.802501815989,0.983352194461
0,1,86,0.952903778856,0.982223553792
0,1,5,0.89474601281,0.971960693812
0,1,90,0.659123376788,0.954438571069
0,2,113,1.11710059668,0.99193337373
0,2,59,0.930576361861,1.06288044056
0,2,66,1.20395721813,1.13961578321
0,3,67,0.960753952694,0.946055431391
0,4,112,0.841346354697,0.81730146075


## Model Universe

Assemble model universe with all features

In [17]:
%%bq query -n reorder_model_universe_q
-- Final modelling table. Every row of user_products_features is widened with the
-- user, product and time-of-day feature tables. Column prefixes record the source
-- table (upf_ user-product, uf_ user, pf_ product, tf_ time of day).
SELECT
-- Keys, order context and the label (is_ordered is NULL for test rows)
upf.user_id,
upf.product_id,
pf.aisle_id,
pf.department_id,
upf.order_id,
upf.eval_set,
upf.order_hour_of_day,
upf.order_dow,
upf.is_ordered,
-- User-product features
upf.perc_all_orders AS `upf_perc_all_orders`,
upf.perc_reorder AS `upf_perc_reorder`,
upf.orders_per_day AS `upf_orders_per_day`,
upf.reorders_per_day AS `upf_reorders_per_day`,
upf.first_order_number AS `upf_first_order_number`,
upf.first_order_day AS `upf_first_order_day`,
upf.last_order_number AS `upf_last_order_number`,
upf.last_order_day AS `upf_last_order_day`,
upf.days_since_last_order AS `upf_days_since_last_order`,
upf.orders_since_last_order AS `upf_orders_since_last_order`,
-- User features
uf.num_orders AS `uf_num_orders`,
uf.avg_days_bw_orders AS `uf_avg_days_bw_orders`,
uf.num_products AS `uf_num_products`,
uf.num_aisles AS `uf_num_aisles`,
uf.num_departments AS `uf_num_departments`,
-- Product features
pf.perc_users AS `pf_perc_users`,
pf.perc_all_orders AS `pf_perc_all_orders`,
pf.perc_reorders AS `pf_perc_reorders`,
pf.orders_per_day AS `pf_orders_per_day`,
pf.reorders_per_day AS `pf_reorders_per_day`,
pf.avg_first_order_number AS `pf_avg_first_order_number`,
pf.avg_first_order_day AS `pf_avg_first_order_day`,
-- Time-of-day lift for the product's aisle / department at this order's hour and weekday
tf.aisle_tod_factor AS `tf_aisle_tod_factor`,
tf.department_tod_factor AS `tf_department_tod_factor`
FROM instacart.user_products_features AS upf
INNER JOIN instacart.user_features AS uf ON upf.user_id = uf.user_id
INNER JOIN instacart.product_features AS pf ON upf.product_id = pf.product_id
-- LEFT JOIN keeps rows whose (aisle, hour, weekday) slot has no entry in tod_features;
-- their tf_ columns are NULL.
LEFT JOIN instacart.tod_features AS tf ON pf.aisle_id = tf.aisle_id AND upf.order_hour_of_day = tf.order_hour_of_day AND upf.order_dow = tf.order_dow

In [18]:
%%bq execute -q reorder_model_universe_q -t instacart.reorder_model_universe  -m overwrite

user_id,product_id,aisle_id,department_id,order_id,eval_set,order_hour_of_day,order_dow,is_ordered,upf_perc_all_orders,upf_perc_reorder,upf_orders_per_day,upf_reorders_per_day,upf_first_order_number,upf_first_order_day,upf_last_order_number,upf_last_order_day,upf_days_since_last_order,upf_orders_since_last_order,uf_num_orders,uf_avg_days_bw_orders,uf_num_products,uf_num_aisles,uf_num_departments,pf_perc_users,pf_perc_all_orders,pf_perc_reorders,pf_orders_per_day,pf_reorders_per_day,pf_avg_first_order_number,pf_avg_first_order_day,tf_aisle_tod_factor,tf_department_tod_factor
87170,9689,81,15,819541,test,12,2,,0.0120481927711,0.0,0.00274725274725,0.0,48,227.0,48,227.0,138.0,37,83,4.38554216867,196,36,12,0.00560596288232,0.129690754687,0.145664332141,0.0157248099967,0.0174973520956,9.82525951557,79.705017301,1.0093192837,1.02195520688
41698,34386,61,19,3012314,test,18,4,,0.0117647058824,0.0,0.00276243093923,0.0,47,197.0,47,197.0,168.0,40,85,4.25882352941,347,61,14,0.00288057262292,0.102346190165,0.103347704793,0.0114586383006,0.0113012368961,9.90067340067,85.6026936027,1.20658026976,1.02070598931
120166,25758,94,7,655374,test,14,5,,0.025,0.0208333333333,0.00555555555556,0.0049504950495,32,158.0,49,234.0,129.0,33,80,4.5,258,62,15,0.000688621738139,0.0723163841808,0.067887109077,0.00935864065744,0.00876372409039,12.6971830986,97.7605633803,1.12379807508,1.09570368651
59161,248,117,19,2440858,test,7,0,,0.010752688172,0.0,0.00285714285714,0.0,43,162.0,43,162.0,192.0,52,93,3.76344086022,107,30,12,0.0185297440946,0.0583414223183,0.0409250670048,0.00726081258191,0.00502263138613,12.2724417692,96.7673383931,0.898852883868,0.838267281903
115553,27478,126,11,2548573,test,7,0,,0.0102040816327,0.0,0.00294117647059,0.0,49,277.0,49,277.0,66.0,51,98,3.4693877551,225,59,18,0.000538288823475,0.0579483163665,0.0303278688525,0.00655679603048,0.00296997912988,12.018018018,91.1171171171,0.947825581233,0.928820132326
170588,35951,91,16,2981874,test,10,5,,0.0121951219512,0.0,0.00275482093664,0.0,52,254.0,52,254.0,109.0,32,82,4.42682926829,152,47,16,0.0679892730191,0.164055437958,0.191618989387,0.0197014110695,0.0226590900523,8.83944365193,71.4916547789,1.00621562588,1.03854704559
155243,14678,116,1,146880,test,20,5,,0.010989010989,0.0,0.00280898876404,0.0,55,196.0,55,196.0,166.0,38,91,3.91208791209,274,71,16,0.0428400312305,0.0791296830197,0.07423526729,0.0100881793051,0.00940848894416,11.5628254471,89.8896309712,1.09535166465,1.12959672378
122215,10749,83,4,2742672,test,11,6,,0.0108695652174,0.0,0.00275482093664,0.0,49,257.0,49,257.0,108.0,45,92,3.94565217391,155,48,17,0.0946466934033,0.111751772457,0.114372125108,0.0135920612999,0.0138147589567,9.3472357432,75.8795921504,1.10022567096,1.02044850726
103604,4461,19,13,2582506,test,12,0,,0.0125,0.0,0.00353356890459,0.0,46,157.0,46,157.0,127.0,36,80,3.5375,126,39,12,0.0302217652964,0.0479063751716,0.0236382992591,0.00612338831371,0.00300033643478,13.3913671374,103.954428755,1.06638986875,1.03128524474
159183,44904,104,13,743561,test,22,6,,0.0111111111111,0.0,0.00277008310249,0.0,41,162.0,41,162.0,202.0,51,90,4.01111111111,287,79,16,0.00396684916759,0.0498920974206,0.0150412898152,0.00578806502185,0.00168037693161,11.3569682152,93.7750611247,1.09942488271,1.01471700969


The table must be exported from BigQuery as multiple shards (hence the wildcard in the destination path), because BigQuery can export at most 1 GB to a single file

In [3]:
# Cloud Storage locations for the exported model universe. The bucket is named
# after the default Datalab project.
project_id = Context.default().project_id
bucket_path = 'gs://' + project_id
model_prefix = bucket_path + '/reorder_model'
bucket_object = model_prefix + '/universe_*.csv'   # wildcard: one object per export shard
bucket_object2 = model_prefix + '/universe.csv'    # single object the shards are composed into

table = bq.Table('instacart.reorder_model_universe')
# The export itself is left commented out; uncomment to (re-)write the shards.
# The shards are written without a header row.
#table.extract(destination = bucket_object, csv_header=False)

Concatenate the shards together

In [20]:
%%bash -s "$bucket_object" "$bucket_object2"
# $1 is the wildcard URL matching the exported shards, $2 is the destination object.
# Both are quoted so the shell passes them through untouched (no local glob
# expansion, no word splitting); gsutil resolves the gs:// wildcard itself.
gsutil compose "$1" "$2"

Composing gs://kaggle-instacart-172517/reorder_model/universe.csv from 8 component object(s).


Import as Pandas DF

In [4]:
fields = [str(x.name) for x in list(table.schema)]
%storage read --object $bucket_object2 --variable model_universe_raw
model_universe = pd.read_csv(StringIO(model_universe_raw), header=None, names=fields)
print model_universe.shape

(13307953, 33)


## LightGBM Model

Create a LightGBM model for predicting reorders

A little more feature engineering

In [28]:
# "Overdue" features: the user's per-day (re)order rate for the product multiplied
# by the number of days since it was last ordered.
days_since_last = model_universe['upf_days_since_last_order']
model_universe['order_overdue'] = model_universe['upf_orders_per_day'] * days_since_last
model_universe['reorder_overdue'] = model_universe['upf_reorders_per_day'] * days_since_last

Set up our train and test sets

In [31]:
# Rows attached to "train" orders carry labels; rows attached to "test" orders do not.
is_train_row = model_universe.eval_set == "train"
is_test_row = model_universe.eval_set == "test"
df = model_universe.loc[is_train_row]
df_test = model_universe.loc[is_test_row]

# Hold out 20% of the labelled users for evaluation. The split is made on users
# rather than rows, so all rows of a given user land on the same side.
df_users = df[['user_id']].drop_duplicates()
df_users_train = df_users.sample(frac=0.8, random_state=200)
df_users_eval = df_users.drop(df_users_train.index)
df_train = pd.merge(df, df_users_train, how='inner', on=['user_id'])
df_eval = pd.merge(df, df_users_eval, how='inner', on=['user_id'])

y_val = 'is_ordered'
# Identifier / bookkeeping columns that must not be used as model inputs.
x_excludes = [y_val, 'user_id', 'order_id', 'eval_set']
# NOTE(review): `categoricals` is defined here but is not passed to lgb.Dataset
# below -- confirm whether these id columns were meant to be declared categorical.
categoricals = ['product_id', 'aisle_id', 'department_id']


def _features_and_label(frame):
    """Return (feature columns, label column) for one slice of the universe."""
    return frame.drop(x_excludes, axis=1), frame[y_val]


x_train, y_train = _features_and_label(df_train)
x_eval, y_eval = _features_and_label(df_eval)
x_test, y_test = _features_and_label(df_test)

lgb_train = lgb.Dataset(x_train, label=y_train)
lgb_eval = lgb.Dataset(x_eval, label=y_eval, reference=lgb_train)

Train our model

In [78]:
# LightGBM settings: gradient-boosted trees on a binary objective, scored by log loss.
params = dict(
    task='train',
    boosting_type='gbdt',
    objective='binary',
    metric=['binary_logloss'],
    num_leaves=96,
    learning_rate=0.05,
    feature_fraction=0.9,
    bagging_fraction=0.9,
    bagging_freq=5,
    verbose=0
)

# Train for at most 500 rounds, stopping early once the eval-set metric has not
# improved for 10 consecutive rounds.
gbm = lgb.train(
    params,
    lgb_train,
    num_boost_round=500,
    valid_sets=lgb_eval,
    early_stopping_rounds=10
)

[1]	valid_0's binary_logloss: 0.658339
Train until valid scores didn't improve in 10 rounds.
[2]	valid_0's binary_logloss: 0.626819
[3]	valid_0's binary_logloss: 0.598192
[4]	valid_0's binary_logloss: 0.57211
[5]	valid_0's binary_logloss: 0.548278
[6]	valid_0's binary_logloss: 0.526436
[7]	valid_0's binary_logloss: 0.506399
[8]	valid_0's binary_logloss: 0.487947
[9]	valid_0's binary_logloss: 0.470936
[10]	valid_0's binary_logloss: 0.455226
[11]	valid_0's binary_logloss: 0.440707
[12]	valid_0's binary_logloss: 0.427271
[13]	valid_0's binary_logloss: 0.41482
[14]	valid_0's binary_logloss: 0.403273
[15]	valid_0's binary_logloss: 0.39255
[16]	valid_0's binary_logloss: 0.382569
[17]	valid_0's binary_logloss: 0.373293
[18]	valid_0's binary_logloss: 0.364665
[19]	valid_0's binary_logloss: 0.356633
[20]	valid_0's binary_logloss: 0.349149
[21]	valid_0's binary_logloss: 0.342186
[22]	valid_0's binary_logloss: 0.335692
[23]	valid_0's binary_logloss: 0.329638
[24]	valid_0's binary_logloss: 0.32398

Plot importance ratings

In [79]:
# Tabulate the model's per-feature importance, most important first.
feature_names = gbm.feature_name()
importance_values = gbm.feature_importance()
importance = {'var': feature_names, 'importance': importance_values}
importance_df = pd.DataFrame(data=importance).sort_values('importance', ascending=False)
print(importance_df)

    importance                          var
13        2586    upf_days_since_last_order
12        1996           upf_last_order_day
20        1779                pf_perc_users
16        1632        uf_avg_days_bw_orders
17        1605              uf_num_products
14        1582  upf_orders_since_last_order
18        1388                uf_num_aisles
30        1307              reorder_overdue
5         1302          upf_perc_all_orders
22        1280             pf_perc_reorders
29        1278                order_overdue
24        1166          pf_reorders_per_day
3         1043            order_hour_of_day
7         1024           upf_orders_per_day
6         1020             upf_perc_reorder
1          924                     aisle_id
26         909       pf_avg_first_order_day
10         885          upf_first_order_day
21         876           pf_perc_all_orders
28         854     tf_department_tod_factor
27         853          tf_aisle_tod_factor
0          809                  

## Calculate F1 Score on Eval Set

Use model to generate predictions, outer join with actual orders, and calculate F1 score

Pull down actual orders

In [38]:
%%bq query -n actuals
-- Every (order_id, product_id) pair in instacart.order_products__train
SELECT
  order_id,
  product_id
FROM instacart.order_products__train

In [42]:
# Run the `actuals` query and keep only the purchases that belong to orders
# in the evaluation split.
eval_order_ids = df_eval[['order_id']].drop_duplicates()
actuals_df = (
    actuals.execute(output_options=bq.QueryOutput.dataframe())
    .result()
    .merge(eval_order_ids, on=['order_id'], how="inner")
)

Generate predictions for eval set

In [86]:
# Probability cut-off above which a product is predicted to be in the order.
pred_threshold = 0.18
# Score the eval rows with the best iteration found by early stopping.
y_eval_pred = gbm.predict(x_eval, num_iteration=gbm.best_iteration)
# Keep the (order, product) pairs scored above the threshold. The explicit
# .copy() makes pred_df an independent frame: a later cell adds a column to it,
# and assigning to a bare .loc selection can raise SettingWithCopyWarning.
pred_df = df_eval.loc[y_eval_pred > pred_threshold, ['order_id', 'product_id']].copy()

Join together

In [83]:
# Flag each side, then outer-join on (order, product). After filling the gaps
# with 0:
#   actual=1, predicted=1 -> correctly predicted purchase
#   actual=1, predicted=0 -> purchase the model missed
#   actual=0, predicted=1 -> prediction that was not purchased
# Pairs that were neither bought nor predicted never appear in the join.
actuals_df['actual'] = 1
pred_df['predicted'] = 1
join_keys = ['order_id', 'product_id']
all_df = actuals_df.merge(pred_df, on=join_keys, how="outer").fillna(0)
# A single F1 over all joined rows (pooled across orders, not averaged per order).
eval_f1 = f1_score(all_df['actual'], all_df['predicted'])
print(eval_f1)

0.34589262493


## Create Predictions for Test

Use model to generate predictions for our test set

In [94]:
# Score the test rows and keep the (order, product) pairs above the threshold.
y_test_pred = gbm.predict(x_test, num_iteration=gbm.best_iteration)
df_test2 = df_test.loc[y_test_pred > pred_threshold, ['order_id', 'product_id']]

# One space-separated product list per order that has at least one prediction.
predicted_products = (
    df_test2.groupby('order_id')['product_id']
    .apply(lambda ids: ' '.join(str(i) for i in ids))
)

# Build the submission over EVERY test order, not only those with a prediction.
# An order with no product above the threshold would otherwise be missing from
# the file; the competition's submission format expects such orders to be listed
# with the literal string 'None'. Orders stay sorted by order_id, as before.
test_order_ids = np.sort(df_test['order_id'].unique())
df_test3 = pd.DataFrame(
    {
        'order_id': test_order_ids,
        'products': predicted_products.reindex(test_order_ids).fillna('None').values,
    },
    columns=['order_id', 'products'],
)
df_test3.head(5)

In [101]:
buf = StringIO()
df_test3.to_csv(buf, index = None)
bucket_object3 = bucket_path + '/reorder_model/test_predictions.csv'
%storage write --variable buf.getvalue() --object $bucket_object3

'buf.getvalue()'
