In [3]:
import google.datalab.bigquery as bq
import pandas as pd

## User-Product Attributes

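Metrics describing how often a specific user purchases a specific product: the share of the user's orders that contain it, its reorder rate, order and reorder frequency per 30 days, and when the user first bought it.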

In [5]:
%%bq query -n user_products_query
-- One row per (user_id, product_id) pair, computed over "prior" orders only.
-- Query parameters: @user_min and @user_max bound user_id (both inclusive).
SELECT orders.user_id, op.product_id,
  -- fraction of the user's prior orders that contain this product
  COUNT(*) / ANY_VALUE(orders.num_orders) AS `perc_all_orders`,
  -- sum of the reordered flag over (num_orders - order_number of the first purchase);
  -- NULLIF turns a zero denominator into NULL instead of a division error
  SUM(op.reordered) / NULLIF(ANY_VALUE(orders.num_orders) - MIN(orders.order_number),0) AS `perc_reorder`,
  -- purchases per 30 days across the span between the user's first and last prior order
  COUNT(*) / NULLIF(ANY_VALUE(orders.days_bw_first_last_order),0) * 30 AS `orders_per_month`,
  -- reorders per 30 days across the span from the product's first purchase to the user's last prior order
  SUM(op.reordered) / NULLIF(ANY_VALUE(orders.days_bw_first_last_order) - MIN(orders.days_since_first_order),0) * 30 AS `reorders_per_month`,
  MIN(orders.order_number) AS `first_order_number`,
  -- day offset of the product's first purchase; 0 when it was in the user's first order
  MIN(orders.days_since_first_order) AS `first_order_days`
FROM (
  SELECT *,
  -- Running total of days since the user's first order. The running SUM is NULL
  -- when every days_since_prior_order seen so far is NULL (the first order), and
  -- MIN() in the outer query skips NULLs -- which used to make a product bought in
  -- the first order look as if it was first bought on its *second* purchase date.
  -- IFNULL pins that row to day 0 so both columns above use the same definition.
  -- NOTE(review): assumes days_since_prior_order is never negative -- confirm.
  IFNULL(SUM(days_since_prior_order) OVER (PARTITION BY user_id ORDER BY order_number ASC ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW), 0) AS `days_since_first_order`,
  -- total of days_since_prior_order over all of the user's prior orders
  SUM(days_since_prior_order) OVER (PARTITION BY user_id) AS `days_bw_first_last_order`,
  -- number of prior orders the user placed
  COUNT(*) OVER (PARTITION BY user_id) AS `num_orders`
  FROM instacart.orders WHERE eval_set = "prior"
) AS orders
INNER JOIN instacart.order_products__prior AS op ON orders.order_id = op.order_id
WHERE orders.user_id >= @user_min AND orders.user_id <= @user_max
GROUP BY 1,2
ORDER BY orders.user_id ASC, op.product_id ASC

In [None]:
def fetch_user_products(user_min, user_max):
    """Execute ``user_products_query`` for one range of user ids.

    Args:
        user_min: value bound to the query's ``@user_min`` parameter (INT64).
        user_max: value bound to the query's ``@user_max`` parameter (INT64).

    Returns:
        The result of the query job, requested with the dataframe output
        option.
    """
    # Both bounds are sent as INT64 query parameters, in this order.
    bounds = (('user_min', user_min), ('user_max', user_max))
    query_params = [
        {'name': name, 'parameterType': {'type': 'INT64'}, 'parameterValue': {'value': value}}
        for name, value in bounds
    ]

    job = user_products_query.execute(
        output_options=bq.QueryOutput.dataframe(),
        query_params=query_params,
    )
    return job.result()

# Because of its size, the result is pulled in consecutive user_id ranges
# (one query per range) and the pieces are then stacked row-wise.
chunk_bounds = (
    (1, 50000),
    (50001, 100000),
    (100001, 150000),
    (150001, 200000),
    (200001, 250000),
)
up1, up2, up3, up4, up5 = (fetch_user_products(lo, hi) for lo, hi in chunk_bounds)
user_products = pd.concat([up1, up2, up3, up4, up5], axis=0)

In [None]:
## Time-of-Day Factors

For each aisle/department, compute how much more li