In [1]:
import re

from google.cloud import storage
from pyspark.sql import SparkSession
from pyspark.sql.functions import *

## Some setup

In [2]:
def cleanup(sess, input_dir):
    """Recursively delete ``input_dir`` through the Hadoop FileSystem API.

    Args:
        sess: Active Spark session; its JVM gateway (``_jvm``) and Java
            context (``_jsc``) are used to reach the Hadoop classes.
        input_dir: Path or URI of the file/directory to remove.

    Returns:
        None. The boolean result of the underlying ``delete`` call is discarded.
    """
    target = sess._jvm.org.apache.hadoop.fs.Path(input_dir)
    hadoop_conf = sess._jsc.hadoopConfiguration()
    filesystem = target.getFileSystem(hadoop_conf)
    # Second argument True = recursive delete.
    filesystem.delete(target, True)

In [3]:
# Set up the Spark session. getOrCreate() returns the already-active session
# if one exists, otherwise it builds a new one with this application name.
# SparkSession is imported explicitly in the imports cell so this cell does
# not rely on the kernel having pre-injected it.
sess = (
    SparkSession.builder
    .appName("Combine predictions")
    .getOrCreate()
)

## Get paths to model outputs

In [4]:
# Handle to the Cloud Storage bucket that holds the model outputs.
BUCKET_NAME = 'instacart-data'

client = storage.Client()
bucket = client.get_bucket(BUCKET_NAME)

def list_regex(bucket, regexpr):
    """Return the names of the blobs in ``bucket`` whose name matches ``regexpr``.

    Matching uses ``re.match``, so the pattern is anchored at the start of the
    blob name but not at the end.

    Args:
        bucket: Object exposing ``list_blobs()``, which yields items with a
            ``name`` attribute.
        regexpr: Regular expression applied to each blob name.

    Returns:
        List of matching blob names, in the order ``list_blobs()`` produced them.
    """
    matching_names = []
    for blob in bucket.list_blobs():
        if re.match(regexpr, blob.name):
            matching_names.append(blob.name)
    return matching_names

def first_match(bucket, regexpr):
    """Return the first blob name in ``bucket`` that matches ``regexpr``.

    Args:
        bucket: Bucket handle passed through to ``list_regex``.
        regexpr: Regular expression applied to each blob name.

    Returns:
        The first matching blob name.

    Raises:
        IndexError: If no blob matches. The message names the pattern so a
            missing upstream output is easy to diagnose.
    """
    matches = list_regex(bucket, regexpr)
    if not matches:
        raise IndexError("no blob in bucket matches {!r}".format(regexpr))
    return matches[0]


# Each prediction output is addressed by a part-*.csv file under its directory.
# NOTE(review): only the first matching part file is used; if an upstream job
# wrote several part files the rest are ignored -- confirm one part is expected.
new_path = first_match(bucket, "^outputs/new_prod_test_pred.csv/part-.*\\.csv")
reorder_path = first_match(bucket, "^outputs/reorder_test_pred.csv/part-.*\\.csv")

## Load predictions and combine

In [5]:
def load_predictions(blob_path):
    """Read one prediction CSV from the instacart-data bucket.

    Uses the notebook's own session (``sess``) so the cell does not depend on
    a kernel-injected ``spark`` variable.

    Args:
        blob_path: Blob name inside the bucket (no ``gs://`` prefix).

    Returns:
        Spark DataFrame read with a header row and inferred column types.
    """
    return (
        sess.read
        .format("csv")
        .options(header=True, inferSchema=True)
        .load("gs://instacart-data/{}".format(blob_path))
    )


new_pred = load_predictions(new_path)
reorder_pred = load_predictions(reorder_path)

In [6]:
# Rename each frame's `products` column so the two survive the join side by
# side. New variable names are used (instead of rebinding new_pred /
# reorder_pred) so this cell can be re-run without failing on a missing
# `products` column.
new_renamed = new_pred.select("order_id", new_pred["products"].alias("products1"))
reorder_renamed = reorder_pred.select("order_id", reorder_pred["products"].alias("products2"))

# NOTE(review): the inner join keeps only order_ids present in BOTH inputs --
# confirm both prediction files cover the same set of orders.
# NOTE(review): concat() joins the two strings with no separator -- confirm the
# upstream `products` strings already carry whatever delimiter is needed.
all_pred = (
    new_renamed
    .join(reorder_renamed, on="order_id", how="inner")
    # concat() yields null if any input is null, so blank out nulls first.
    .fillna("", subset=["products1", "products2"])
    .select("order_id", concat("products1", "products2").alias('products'))
)

## Export

In [7]:
output = 'gs://instacart-data/outputs/test_pred.csv'

# Remove whatever is already at the destination before writing to it.
cleanup(sess, output)

# repartition(1) puts all rows in a single partition before the CSV write.
(
    all_pred
    .repartition(1)
    .write
    .option('header', 'true')
    .csv(output)
)