In [4]:
import google.datalab.bigquery as bq

## check what proportion of products in train orders are reorders of past products

In [13]:
%%bq query
-- Row counts of train order lines, split by the value of the `reordered` flag.
SELECT
  reordered,
  COUNT(*) AS `n`
FROM instacart.order_products__train
GROUP BY reordered

reordered,n
1,828824
0,555793


## qc the 'reordered' column; all user-product combos flagged as reordered in train should also exist in 'order_products__prior'

In [12]:
%%bq query
-- QC of the `reordered` flag: for each distinct (user, product) pair marked
-- reordered = 1 in the train set, look for the same pair among that user's
-- prior orders. Pairs with no match get has_prior = 0; the final result counts
-- pairs per has_prior value.
WITH train_reorders AS (
  -- distinct (user, product) pairs flagged as reorders in train
  SELECT DISTINCT user_id, product_id
  FROM instacart.orders AS orders
  INNER JOIN instacart.order_products__train AS orders_products
    ON orders.order_id = orders_products.order_id
  WHERE orders_products.reordered = 1
),
prior_orders AS (
  -- distinct (user, product) pairs seen in prior orders, tagged with a marker
  SELECT DISTINCT user_id, product_id, 1 AS `has_prior`
  FROM instacart.orders AS orders
  INNER JOIN instacart.order_products__prior AS orders_products
    ON orders.order_id = orders_products.order_id
)
SELECT
  has_prior,
  COUNT(*) AS `n`
FROM (
  SELECT
    train_reorders.user_id,
    train_reorders.product_id,
    -- the LEFT JOIN leaves has_prior NULL when no prior pair matches
    COALESCE(prior_orders.has_prior, 0) AS `has_prior`
  FROM train_reorders
  LEFT JOIN prior_orders
    ON train_reorders.user_id = prior_orders.user_id
   AND train_reorders.product_id = prior_orders.product_id
) AS flagged_pairs
GROUP BY has_prior

has_prior,n
1,828824


## check what proportion of orders have an item never ordered before

In [11]:
%%bq query
-- Fraction of train orders that contain at least one line with reordered = 0.
SELECT
  AVG(IF(n_new > 0, 1, 0)) AS `perc_has_new`
FROM (
  -- per order: number of lines that are not reorders
  SELECT
    order_id,
    SUM(1 - reordered) AS `n_new`
  FROM instacart.order_products__train
  GROUP BY order_id
) AS per_order

perc_has_new
0.815553811095


## qc user overlap between prior, train, and test sets

In [15]:
%%bq query -n prior_train_test_qc
-- For each user, derive 0/1 flags for having at least one order in each
-- eval_set ('prior', 'train', 'test'), then count users per flag combination.
SELECT
  in_prior,
  in_train,
  in_test,
  COUNT(*) AS `n`
FROM (
  SELECT
    user_id,
    -- MAX over a 0/1 indicator is 1 as soon as any order matches
    MAX(IF(eval_set = 'prior', 1, 0)) AS `in_prior`,
    MAX(IF(eval_set = 'train', 1, 0)) AS `in_train`,
    MAX(IF(eval_set = 'test', 1, 0)) AS `in_test`
  FROM instacart.orders
  GROUP BY user_id
) AS user_flags
GROUP BY in_prior, in_train, in_test

In [16]:
# Run the named query defined in the previous cell and fetch its result
# using the dataframe output option.
results = prior_train_test_qc.execute(output_options=bq.QueryOutput.dataframe()).result()
# Function-call form of print works on both Python 2 and Python 3;
# the bare `print results` statement is a SyntaxError on Python 3.
print(results)

<class 'google.datalab.bigquery._query.Query'>
   in_prior  in_train  in_test       n
0         1         0        1   75000
1         1         1        0  131209


## qc that order numbers are sequential

In [None]:
%%bq query
-- QC: list users whose order numbers are not exactly 1..N with no gaps or
-- repeats. An empty result means every user passed.
--
-- qc1/qc2 are the original checksum (sum of order numbers vs. N*(N+1)/2 where
-- N is the largest order number). That test alone is necessary but not
-- sufficient: offsetting errors cancel out (e.g. 1,1,4,4 sums to 10, the same
-- as 1,2,3,4), so the count-based conditions below close the gap.
-- user_id is returned so that any failing row can be traced to a user.
SELECT
  user_id,
  SUM(order_number) AS `qc1`,
  MAX(order_number)*(MAX(order_number)+1)/2 AS `qc2`,
  COUNT(*) AS `n_orders`,
  COUNT(DISTINCT order_number) AS `n_distinct`,
  MIN(order_number) AS `first_order_number`,
  MAX(order_number) AS `last_order_number`
FROM instacart.orders
GROUP BY user_id
HAVING qc1 != qc2
  OR n_orders != n_distinct             -- repeated order numbers
  OR n_distinct != last_order_number    -- gaps in the sequence
  OR first_order_number != 1            -- sequence does not start at 1