In [1]:
import numpy as np
import pandas as pd

from google.cloud import bigquery

from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.ml.recommendation import ALS
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.ml.evaluation import RegressionEvaluator

## Create BQ table with data for model

Train and test sets delineated by whether user is in train/test. Validate set contains NEW orders by members in the train set. To simplify model, filter to products comprising top 90% of orders.

In [2]:
bq_client = bigquery.Client()
job_config = bigquery.QueryJobConfig()

table_ref = bq_client.dataset('instacart').table('new_prod_model')

param = bigquery.ScalarQueryParameter('perc_top_products', 'FLOAT', 0.9)
job_config.query_parameters = [param]
job_config.destination = table_ref
job_config.write_disposition = 'WRITE_TRUNCATE'

query = """
    WITH top_products AS (
      SELECT product_id FROM (
        SELECT product_id, n_orders,
          SUM(n_orders) OVER (ORDER BY n_orders DESC ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS n_orders_cumulative,
          (SELECT COUNT(*) FROM instacart.order_products__prior) AS n_orders_total
        FROM (
          SELECT product_id, COUNT(*) AS n_orders FROM instacart.order_products__prior GROUP BY 1
        ) AS x
      ) AS x WHERE n_orders_cumulative / n_orders_total <= @perc_top_products
    ), prior_products AS (
      SELECT o.user_id, opp.product_id
      FROM instacart.order_products__prior opp
      INNER JOIN instacart.orders o USING(order_id)
      GROUP BY 1,2
    ), users AS (
      SELECT user_id, eval_set, order_id
      FROM instacart.orders
      WHERE eval_set IN ("train", "test")
      GROUP BY 1,2,3
    ), order_products_train AS (
      SELECT opp.product_id, o.user_id, u.eval_set, u.order_id
      FROM instacart.order_products__prior opp
      INNER JOIN top_products tp USING(product_id)
      INNER JOIN instacart.orders o USING(order_id)
      INNER JOIN users u USING(user_id)
    ), order_products_validate AS (
      SELECT opt.product_id, o.user_id
      FROM instacart.order_products__train opt
      INNER JOIN instacart.orders o USING(order_id)
      LEFT JOIN prior_products pp USING(user_id, product_id)
      WHERE pp.product_id IS NULL
    )
    SELECT op.user_id, op.product_id, op.eval_set, op.order_id, 
      COUNT(DISTINCT op.order_id)/ANY_VALUE(u.n_user_orders) AS order_freq
    FROM order_products_train AS op
    INNER JOIN (
      SELECT user_id, COUNT(DISTINCT order_id) AS n_user_orders
      FROM order_products_train GROUP BY 1
    ) AS u USING(user_id)
    GROUP BY 1,2,3,4
    UNION ALL
    SELECT user_id, product_id, 'validate' AS eval_set, NULL AS order_id, NULL AS order_freq
    FROM order_products_validate
    GROUP BY 1,2
"""

query_job = bq_client.query(query, job_config=job_config)
query_job.result()

<google.cloud.bigquery.table.RowIterator at 0x7fcb27a7bcc0>

## Pull data from BQ into Spark DF

In [3]:
# for deleting temp files when we're done
def cleanup(sess, input_dir):
    input_path = sess._jvm.org.apache.hadoop.fs.Path(input_dir)
    input_path.getFileSystem(sess._jsc.hadoopConfiguration()).delete(input_path, True)

In [4]:
# set up spark session
sess = SparkSession.builder\
    .appName("Model builder")\
    .config("spark.executor.cores", 2)\
    .config("spark.executor.memory", "7g")\
    .config("spark.network.timeout", 2000)\
    .config("spark.shuffle.io.maxRetries", 10)\
    .config("spark.sql.broadcastTimeout", 1000)\
    .getOrCreate()
    
bucket = sess._sc._jsc.hadoopConfiguration().get('fs.gs.system.bucket')
project = sess._sc._jsc.hadoopConfiguration().get('fs.gs.project.id')
input_dir = 'gs://{}/hadoop/tmp/bigquery/pyspark_input'.format(bucket)
output = 'gs://instacart-data/outputs/new_prod_test_pred.csv'

In [5]:
# load data from bq
conf = {
    'mapred.bq.project.id': project,
    'mapred.bq.gcs.bucket': bucket,
    'mapred.bq.temp.gcs.path': input_dir,
    'mapred.bq.input.project.id': project,
    'mapred.bq.input.dataset.id': 'instacart',
    'mapred.bq.input.table.id': 'new_prod_model',
}

cleanup(sess, input_dir)

data_raw = sess._sc.newAPIHadoopRDD(
    'com.google.cloud.hadoop.io.bigquery.JsonTextBigQueryInputFormat',
    'org.apache.hadoop.io.LongWritable',
    'com.google.gson.JsonObject',
    conf=conf)

data_json = data_raw.map(lambda x: x[1])
data_df = sess.read.json(data_json).repartition(sess._sc.defaultParallelism*2)

In [6]:
# cast integers
data_df = data_df\
    .withColumn('order_id', data_df.order_id.cast('integer'))\
    .withColumn('user_id', data_df.user_id.cast('integer'))\
    .withColumn('product_id', data_df.product_id.cast('integer'))

## Hyperparameter tuning

Note: must train model once on both train and test members so that we have user-level weights for predictions.

In [7]:
# split into train/test
train = data_df.filter(data_df.eval_set.isin(['train','test']))
train2 = data_df.filter(data_df.eval_set == 'train')
test = data_df.filter(data_df.eval_set == 'test')
validate = data_df.filter(data_df.eval_set == 'validate')

In [8]:
# recommendation model
als = ALS()\
    .setMaxIter(10)\
    .setCheckpointInterval(5)\
    .setImplicitPrefs(True)\
    .setSeed(0)\
    .setUserCol("user_id")\
    .setItemCol("product_id")\
    .setRatingCol("order_freq")\
    .setNonnegative(True)\
    .setColdStartStrategy("drop")\
    .setNumBlocks(50)

In [9]:
# hyperparameter tuning
train.cache()

param_grid = ParamGridBuilder()\
    .addGrid(als.rank, list([10**k for k in range(2, 3)]))\
    .addGrid(als.regParam, list([10**k for k in range(-1, 0)]))\
    .build()

eva = RegressionEvaluator(labelCol="order_freq")
cv = CrossValidator(estimator=als,
                    estimatorParamMaps=param_grid,
                    numFolds=3,
                    evaluator=eva)

cv_model = cv.fit(train)

best_func = np.argmax if eva.isLargerBetter() else np.argmin
best_idx = best_func(cv_model.avgMetrics)
best_score = cv_model.avgMetrics[best_idx]
best_param = param_grid[best_idx]

print("Best CV score: {}".format(best_score))
print("Best CV param: {}".format(best_param))

Best CV score: 0.9109078799040278
Best CV param: {Param(parent='ALS_47af91b3bc814985358e', name='rank', doc='rank of the factorization'): 100, Param(parent='ALS_47af91b3bc814985358e', name='regParam', doc='regularization parameter (>= 0).'): 0.1}


In [10]:
# generate predictions for validation set
all_pred = cv_model.bestModel.recommendForAllUsers(100)
all_pred = all_pred.select("user_id", explode(all_pred.recommendations).alias("recommendations"))
all_pred = all_pred.select("user_id", "recommendations.product_id", "recommendations.rating").cache()

DataFrame[user_id: int, product_id: int, rating: float]

In [11]:
# remove past combinations
prior_products = train.select('user_id', 'product_id').distinct().cache()
new_pred = all_pred.join(prior_products, on=['user_id', 'product_id'], how="left_anti").cache()

In [12]:
# flag correct predictions
validate_pred = new_pred.join(broadcast(train2.select("user_id").distinct()), on=["user_id"])
validate_pred = validate_pred.join(broadcast(validate), on=['user_id', 'product_id'], how="outer")
validate_pred = validate_pred.withColumn("label", (validate_pred.eval_set == "validate").cast("integer"))
validate_pred = validate_pred.fillna(-1, subset=["rating"])
validate_pred = validate_pred.fillna(0, subset=["label"])

In [13]:
# determine cutoff which maximizes mean F1 score
validate_pred_agg = validate_pred.withColumn('rating_bkt', round(validate_pred.rating,2))
validate_pred_agg = validate_pred_agg.groupBy('rating_bkt')\
                        .agg(sum('label').alias('sum'), count('label').alias('count'))
validate_pred_df = validate_pred_agg.toPandas()

def precision_fn(df, cutoff): 
    x = df.loc[df.rating_bkt >= cutoff, ['sum','count']].apply(np.sum)
    return x[0] / x[1]

def recall_fn(df, cutoff): 
    return np.sum(df['sum'][df.rating_bkt >= cutoff]) / np.sum(df['sum'])

thresholds = np.arange(0, 1, 0.01)
precision = np.array([precision_fn(validate_pred_df, x) for x in thresholds])
recall = np.array([recall_fn(validate_pred_df, x) for x in thresholds])
f1 = (2*precision*recall)/(precision+recall)
optimal_threshold = thresholds[np.nanargmax(f1)]

print("Optimal threshold: {}".format(optimal_threshold))
print("Optimal threshold F1: {}".format(np.nanmax(f1)))

Optimal threshold: 0.31
Optimal threshold F1: 0.025856346736422006


## Generate predictions for test set

In [14]:
# create predictions for test set
test_users = test.select("user_id", "order_id").distinct().cache()

test_pred = new_pred.join(broadcast(test_users), on=["user_id"])
test_pred = test_pred.filter(test_pred.rating >= optimal_threshold)\
                .groupBy('order_id').agg(collect_list('product_id').alias('products'))

collapse = udf(lambda x: ' '.join([str(i) for i in x]))
test_pred = test_pred.withColumn('products', collapse('products'))
test_pred = test_users.join(broadcast(test_pred), on='order_id', how='left')
test_pred = test_pred.select('order_id', 'products')

In [15]:
# export
cleanup(sess, output)
test_pred.repartition(1).write.option('header', 'true').csv(output)

In [16]:
# cleanup
cleanup(sess, input_dir)