# Instacart data preparation for two-tower SBERT

This notebook mirrors `src/data/prepare_instacart_sbert.py`: same functions, compact context format (`[+Nd w{dow}h{hour}]`, `Next: +{gap}d w{dow}h{hour}`), and script defaults. Uses a small subset by default (`max_target_orders=500`) for fast iteration.

## Setup: paths and config

In [18]:
from pathlib import Path
from collections import defaultdict

import pandas as pd
from datasets import Dataset

from src.constants import DEFAULT_DATA_DIR, DEFAULT_PROCESSED_DIR

# Input CSV directory and output root, taken from the project constants.
DATA_DIR = DEFAULT_DATA_DIR
OUTPUT_DIR = DEFAULT_PROCESSED_DIR
CHUNK_SIZE = 500_000  # for reading order_products__prior.csv (matches script)

# Config: script defaults are max_prior_orders=5, max_product_names=20
MAX_TARGET_ORDERS = 500  # limit target orders for fast iteration (script: --max-target-orders)
MAX_PRIOR_ORDERS = 5  # most recent history orders per user used to build the context
MAX_PRODUCT_NAMES = 20  # cap on product names across all history orders in one context
EVAL_FRAC = 0.1  # fraction of target orders held out for eval (at least one order)
EVAL_SERVE_TIME = True  # strip " Next: ..." from eval queries so eval matches production
SAMPLE_FRAC = None  # optional fraction of train pairs to keep; None disables sampling
SEED = 42  # random_state used when SAMPLE_FRAC sampling is active

print("DATA_DIR:", DATA_DIR)
print("OUTPUT_DIR:", OUTPUT_DIR)

DATA_DIR: /Users/chen_bowen/AI & ML/Projects/Instacart_Personalization/data
OUTPUT_DIR: /Users/chen_bowen/AI & ML/Projects/Instacart_Personalization/processed


## 1. `load_product_text_map`

Builds `product_id -> "Product: {name}. Aisle: {aisle}. Department: {department}."`

In [19]:
def load_product_text_map(
    products_path: Path, aisles_path: Path, departments_path: Path
) -> dict[int, str]:
    """
    Return a product_id -> item-tower text lookup.

    The products table is joined to aisles (on aisle_id) and then to departments
    (on department_id). Each product is rendered as
    "Product: {name}. Aisle: {aisle}. Department: {department}.", which is the
    "positive" (item) side of (anchor, positive) pairs.
    """
    catalog = pd.read_csv(products_path)
    aisle_table = pd.read_csv(aisles_path)
    department_table = pd.read_csv(departments_path)

    # Default (inner) merges: a product without a matching aisle or department
    # row does not appear in the result.
    with_aisles = catalog.merge(aisle_table, on="aisle_id")
    joined = with_aisles.merge(department_table, on="department_id")

    name_part = "Product: " + joined["product_name"].astype(str)
    aisle_part = ". Aisle: " + joined["aisle"].astype(str)
    department_part = ". Department: " + joined["department"].astype(str)
    joined["text"] = name_part + aisle_part + department_part + "."

    return dict(zip(joined["product_id"], joined["text"]))

# Build the product_id -> text lookup from the three catalog CSVs under DATA_DIR.
product_text_map = load_product_text_map(
    DATA_DIR / "products.csv",
    DATA_DIR / "aisles.csv",
    DATA_DIR / "departments.csv",
)

In [20]:
# Inspect output: number of products, then the first five entries with each
# text truncated to 80 characters for display.
print("Len:", len(product_text_map))
print("\nSample (first 5 product_ids):")
for pid, text in list(product_text_map.items())[:5]:
    print(f"  {pid}: {text[:80]}...")

Len: 49688

Sample (first 5 product_ids):
  1: Product: Chocolate Sandwich Cookies. Aisle: cookies cakes. Department: snacks....
  2: Product: All-Seasons Salt. Aisle: spices seasonings. Department: pantry....
  3: Product: Robust Golden Unsweetened Oolong Tea. Aisle: tea. Department: beverages...
  4: Product: Smart Ones Classic Favorites Mini Rigatoni With Vodka Cream Sauce. Aisl...
  5: Product: Green Chile Anytime Sauce. Aisle: marinades meat preparation. Departmen...


## 2. `load_orders`

Returns `(target_orders, history_orders)` DataFrames.

In [21]:
def load_orders(orders_path: Path) -> tuple[pd.DataFrame, pd.DataFrame]:
    """
    Read orders.csv and return (target_orders, history_orders).

    Rows with eval_set == "train" are the target ("next") orders to predict;
    rows with eval_set == "prior" are the history used only to build context
    (no leakage). Both frames are independent copies restricted to the same six
    columns.
    """
    kept_columns = [
        "order_id",
        "user_id",
        "order_number",
        "order_dow",
        "order_hour_of_day",
        "days_since_prior_order",
    ]

    all_orders = pd.read_csv(orders_path)

    # When the hour column was parsed as text, normalise it to two characters.
    hour_column = all_orders["order_hour_of_day"]
    if hour_column.dtype == object:
        all_orders["order_hour_of_day"] = hour_column.astype(str).str.zfill(2)

    def _rows_for(eval_set_name: str) -> pd.DataFrame:
        # Select one eval_set, then project onto the kept columns.
        matching = all_orders[all_orders["eval_set"] == eval_set_name]
        return matching[kept_columns].copy()

    target_orders = _rows_for("train")
    history_orders = _rows_for("prior")
    return target_orders, history_orders

# Load the full (unfiltered) target and history order tables from orders.csv.
target_orders_full, history_orders_full = load_orders(DATA_DIR / "orders.csv")

In [22]:
# Inspect the two order tables: shapes, then the first ten rows of each.
# `display` is not imported in this file; it is expected from the notebook runtime.
print("target_orders shape:", target_orders_full.shape)
print("history_orders shape:", history_orders_full.shape)
print("\ntarget_orders head:")
display(target_orders_full.head(10))
print("\nhistory_orders head:")
display(history_orders_full.head(10))

target_orders shape: (131209, 6)
history_orders shape: (3214874, 6)

target_orders head:


Unnamed: 0,order_id,user_id,order_number,order_dow,order_hour_of_day,days_since_prior_order
10,1187899,1,11,4,8,14.0
25,1492625,2,15,1,11,30.0
49,2196797,5,5,0,11,6.0
74,525192,7,21,2,11,6.0
78,880375,8,4,1,14,10.0
82,1094988,9,4,6,10,30.0
88,1822501,10,6,0,19,30.0
115,1827621,13,13,0,21,8.0
129,2316178,14,14,2,19,11.0
200,2180313,17,41,3,10,30.0



history_orders head:


Unnamed: 0,order_id,user_id,order_number,order_dow,order_hour_of_day,days_since_prior_order
0,2539329,1,1,2,8,
1,2398795,1,2,3,7,15.0
2,473747,1,3,3,12,21.0
3,2254736,1,4,4,7,29.0
4,431534,1,5,4,15,28.0
5,3367565,1,6,2,7,19.0
6,550135,1,7,1,9,20.0
7,3108588,1,8,1,14,14.0
8,2295261,1,9,1,16,0.0
9,2550362,1,10,4,8,30.0


In [23]:
# Limit to subset for fast iteration (script: --max-target-orders)
# Keep the first MAX_TARGET_ORDERS target rows in file order (not a random sample).
target_orders = target_orders_full.head(MAX_TARGET_ORDERS)
# Only the history of users that still have a target order is needed.
users_needed = set(target_orders["user_id"].tolist())
history_orders = history_orders_full[history_orders_full["user_id"].isin(users_needed)]
# The order ids whose product lists must be read from order_products__prior.csv.
history_order_ids = set(history_orders["order_id"].tolist())

print("After limiting: target_orders", target_orders.shape[0], ", history_orders", history_orders.shape[0], ", history_order_ids", len(history_order_ids))

After limiting: target_orders 500 , history_orders 7663 , history_order_ids 7663


## 3. `build_order_to_products`

Builds `order_id -> list of product_id` for prior orders (chunked read).

In [24]:
def build_order_to_products(
    order_products_prior_path: Path,
    history_order_ids: set[int],
    chunk_size: int = CHUNK_SIZE,
) -> dict[int, list[int]]:
    """
    Map every requested history order_id to the product_ids it contains.

    order_products__prior.csv is streamed chunk_size rows at a time. Rows whose
    order_id is not in history_order_ids are discarded, and product_ids keep
    the order in which they appear in the file.
    """
    products_by_order: dict[int, list[int]] = {}
    reader = pd.read_csv(order_products_prior_path, chunksize=chunk_size)
    for frame in reader:
        wanted = frame[frame["order_id"].isin(history_order_ids)]
        pairs = zip(wanted["order_id"].tolist(), wanted["product_id"].tolist())
        for oid, pid in pairs:
            products_by_order.setdefault(oid, []).append(pid)
    return products_by_order

# Read product lists only for the history orders of the selected users.
order_to_products = build_order_to_products(
    DATA_DIR / "order_products__prior.csv", history_order_ids
)

In [25]:
# Inspect the mapping: how many orders, basket-size statistics, and a small sample.
print("Num orders with products:", len(order_to_products))
lens = [len(v) for v in order_to_products.values()]
# NOTE: min()/max() raise ValueError if order_to_products is empty.
print("Products per order: min", min(lens), ", max", max(lens), ", mean", sum(lens)/len(lens))
print("\nSample (first 3 order_id -> product_ids):")
for oid, pids in list(order_to_products.items())[:3]:
    print(f"  order_id {oid}: {len(pids)} products, first 5 ids: {pids[:5]}")

Num orders with products: 7663
Products per order: min 1 , max 89 , mean 9.778285266866762

Sample (first 3 order_id -> product_ids):
  order_id 40: 4 products, first 5 ids: [10070, 42450, 33198, 34866]
  order_id 1483: 12 products, first 5 ids: [32818, 27582, 12302, 1831, 19204]
  order_id 2199: 15 products, first 5 ids: [24852, 17616, 47877, 33313, 651]


## 4. `build_user_context_for_target_orders`

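For each target order, build a user-context string from that user's order history only. Uses the compact format (script default): each history order becomes `[+Nd w{dow}h{hour}] name1, name2` (the `+Nd` part is omitted when the gap is unknown), and the context ends with `Next: +{gap}d w{dow}h{hour}`.

Note: the recorded output under the inspection cell below appears to predate the compact format (it shows the older verbose `Previously ordered: [...]` wording and float-style order ids); re-run the cells in order to refresh it.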

In [26]:
def _format_hour(value) -> str:
    """Render an order hour: strings pass through unchanged, numbers become plain integers."""
    return value if isinstance(value, str) else str(int(value))


def _time_tag(days_since_prior, dow, hour) -> str:
    """Build "w{dow}h{hour}", prefixed with "+{days}d " when the gap to the previous order is known."""
    tag = f"w{int(dow)}h{_format_hour(hour)}"
    if pd.isna(days_since_prior):
        return tag
    return f"+{int(days_since_prior)}d {tag}"


def _product_name(product_text: str) -> str:
    """Recover the bare product name from "Product: {name}. Aisle: ... Department: ...". text."""
    _, has_prefix, rest = product_text.partition("Product: ")
    body = rest if has_prefix else product_text
    # Cut at the ". Aisle: " delimiter rather than at the first ".", so names
    # that contain periods ("Dr. Pepper", "2.5% Milk") are kept whole.
    name, has_aisle, _ = body.rpartition(". Aisle: ")
    if not has_aisle:
        # Text not in the expected layout: fall back to the first sentence.
        name = body.split(".")[0]
    return name.strip()


def build_user_context_for_target_orders(
    target_orders: pd.DataFrame,
    history_orders: pd.DataFrame,
    order_to_products: dict[int, list[int]],
    product_text_map: dict[int, str],
    max_prior_orders: int = 5,
    max_product_names: int = 20,
) -> dict[int, str]:
    """
    For each target order, build one user-context string using only that user's order history.

    Context format (compact):
        "[w{dow}h{hour}] name1, name2; [+{days}d w{dow}h{hour}] name3. Next: +{gap}d w{dow}h{hour}"
    The "+{days}d " / "+{gap}d " part is omitted when days_since_prior_order is
    missing, and "(no prior orders)" replaces the history part when no history
    order contributes a product name. Used as the "anchor" (query) side in
    (anchor, positive) pairs. No leakage: only history orders with a smaller
    order_number than the target order are considered.

    Args:
        target_orders: one row per order to build a context for; needs order_id,
            user_id, order_number, order_dow, order_hour_of_day and
            days_since_prior_order.
        history_orders: history rows with the same columns.
        order_to_products: history order_id -> product_ids in that order.
        product_text_map: product_id -> "Product: {name}. Aisle: ..." text;
            product_ids missing from it are skipped.
        max_prior_orders: number of most recent history orders considered.
        max_product_names: cap on product names across the whole context.

    Returns:
        Mapping from target order_id (int) to its context string.
    """
    history_orders = history_orders.sort_values(["user_id", "order_number"])
    # Group once up front instead of re-scanning the full history frame for
    # every target order. Row order inside each group stays sorted by order_number.
    history_by_user = {
        user_id: frame
        for user_id, frame in history_orders.groupby("user_id", sort=False)
    }
    no_history = history_orders.iloc[0:0]
    order_id_to_context: dict[int, str] = {}

    for _, row in target_orders.iterrows():
        order_id = int(row["order_id"])
        user_orders = history_by_user.get(row["user_id"], no_history)
        user_history = user_orders[
            user_orders["order_number"] < row["order_number"]
        ].tail(max_prior_orders)

        segments: list[str] = []
        total_products = 0

        for _, h in user_history.iterrows():
            if total_products >= max_product_names:
                break
            order_products = []
            for pid in order_to_products.get(int(h["order_id"]), []):
                if pid not in product_text_map:
                    continue
                if total_products >= max_product_names:
                    break
                order_products.append(_product_name(product_text_map[pid]))
                total_products += 1

            # Orders that contribute no known product are left out entirely.
            if not order_products:
                continue

            tag = _time_tag(
                h["days_since_prior_order"], h["order_dow"], h["order_hour_of_day"]
            )
            segments.append(f"[{tag}] " + ", ".join(order_products))

        products_str = "; ".join(segments) if segments else "(no prior orders)"
        next_tag = _time_tag(
            row["days_since_prior_order"], row["order_dow"], row["order_hour_of_day"]
        )
        order_id_to_context[order_id] = f"{products_str}. Next: {next_tag}"

    return order_id_to_context

# Build one context string per (subset) target order using the notebook config.
order_id_to_context = build_user_context_for_target_orders(
    target_orders,
    history_orders,
    order_to_products,
    product_text_map,
    max_prior_orders=MAX_PRIOR_ORDERS,
    max_product_names=MAX_PRODUCT_NAMES,
)

In [27]:
# Inspect the contexts: count, then the first two strings.
print("Num train orders with context:", len(order_id_to_context))
print("\nSample contexts (compact format, first 2):")
for oid, ctx in list(order_id_to_context.items())[:2]:
    print(f"\norder_id {oid}:")
    # The conditional covers the whole expression: contexts longer than 300
    # characters are truncated and suffixed with "...", shorter ones print as is.
    print(ctx[:300] + "..." if len(ctx) > 300 else ctx)

Num train orders with context: 500

Sample contexts (first 2):

order_id 1187899.0:
Previously ordered: [ordered on weekday 2 at hour 8.0] Soda, Organic Unsweetened Vanilla Almond Milk, Original Beef Jerky, Aged White Cheddar Popcorn, XL Pick-A-Size Paper Towel Rolls; [ordered 15 days after previous order on weekday 3 at hour 7.0] Soda, Pistachios, Original Beef Jerky, Bag of Organ...

order_id 1492625.0:
Previously ordered: [ordered on weekday 2 at hour 11.0] Chipotle Beef & Pork Realstick, Organic Avocado, Roasted Turkey, Baked Organic Sea Salt Crunchy Pea Snack, Thin Stackers Brown Rice Lightly Salted, Cheddar Bunnies Snack Crackers, Plantain Chips, Organic Just Concord Grape Juice, Uncured Genoa ...


## 5. `build_anchor_positive_pairs`

Build (anchor, positive, order_id) from order_products__train.

In [11]:
def build_anchor_positive_pairs(
    order_products_train_path: Path,
    order_id_to_context: dict[int, str],
    product_text_map: dict[int, str],
) -> tuple[list[str], list[str], list[int]]:
    """
    Build (anchor, positive) training pairs from order_products__train.

    Every row of the CSV whose order_id has a context and whose product_id has
    a product text yields one pair: the order's context string (anchor) and the
    product's text (positive). Rows are kept in file order.

    Args:
        order_products_train_path: CSV with at least order_id and product_id columns.
        order_id_to_context: target order_id -> user-context string.
        product_text_map: product_id -> product text.

    Returns:
        Three parallel lists (anchors, positives, order_ids). order_ids holds
        plain Python ints, one per pair, for the train/eval split.
    """
    train_op = pd.read_csv(order_products_train_path)
    # Filter with vectorized membership tests instead of iterating over every
    # row of the file in Python.
    has_context = train_op["order_id"].isin(list(order_id_to_context))
    has_text = train_op["product_id"].isin(list(product_text_map))
    kept = train_op.loc[has_context & has_text, ["order_id", "product_id"]]

    # Convert to built-in ints so the result matches the declared list[int].
    order_ids = [int(oid) for oid in kept["order_id"].tolist()]
    product_ids = [int(pid) for pid in kept["product_id"].tolist()]

    anchors = [order_id_to_context[oid] for oid in order_ids]
    positives = [product_text_map[pid] for pid in product_ids]
    return anchors, positives, order_ids

# One (anchor, positive) pair per product in each target order that has a context.
anchors, positives, order_ids = build_anchor_positive_pairs(
    DATA_DIR / "order_products__train.csv",
    order_id_to_context,
    product_text_map,
)

In [12]:
# Inspect the pairs: total count and the first pair (anchor cut to 200 characters).
# NOTE: indexing [0] raises IndexError if no pairs were produced.
print("Total pairs (before split):", len(anchors))
print("\nSample pair 0:")
print("ANCHOR:", anchors[0][:200], "...")
print("\nPOSITIVE:", positives[0])

Total pairs (before split): 5163

Sample pair 0:
ANCHOR: Previously ordered: [7 days since prior, weekday 4, hour 12.0] Variety Pack Hard Cider, Instant Oatmeal Maple & Brown Sugar; [4 days since prior, weekday 1, hour 8.0] French Vanilla Coffee Creamer, Ha ...

POSITIVE: Product: Natural Vanilla Ice Cream. Aisle: ice cream ice. Department: frozen.


## 6. Train / eval split and final datasets

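Split by order, so all pairs from one order land in the same split: the eval set is the last `EVAL_FRAC` of the order ids in sorted order (always at least one order). Optionally subsample the train pairs with `SAMPLE_FRAC`.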

In [13]:
# Every order that has a context takes part in the split.
train_order_ids_all = set(order_id_to_context.keys())
order_list = sorted(train_order_ids_all)
# Hold out the last n_eval order ids (in sorted order); never fewer than one.
n_eval = max(1, int(len(order_list) * EVAL_FRAC))
eval_order_ids = set(order_list[-n_eval:])

# Route each pair by its order id so an order never straddles both splits.
train_anchors, train_positives = [], []
eval_anchors, eval_positives = [], []
for a, p, oid in zip(anchors, positives, order_ids):
    if oid in eval_order_ids:
        eval_anchors.append(a)
        eval_positives.append(p)
    else:
        train_anchors.append(a)
        train_positives.append(p)

# Optional subsampling of the train pairs only; eval pairs are left untouched.
if SAMPLE_FRAC is not None and SAMPLE_FRAC < 1.0:
    train_df = pd.DataFrame({"anchor": train_anchors, "positive": train_positives})
    train_df = train_df.sample(frac=SAMPLE_FRAC, random_state=SEED)
    train_anchors = train_df["anchor"].tolist()
    train_positives = train_df["positive"].tolist()

train_dataset = Dataset.from_dict({"anchor": train_anchors, "positive": train_positives})
# eval_dataset is None when no pair fell into the eval split.
eval_dataset = Dataset.from_dict({"anchor": eval_anchors, "positive": eval_positives}) if eval_anchors else None

In [14]:
# Inspect the split sizes and the resulting datasets.
print("Train pairs:", len(train_anchors))
print("Eval pairs:", len(eval_anchors))
print("Eval orders:", len(eval_order_ids))
print("\ntrain_dataset:", train_dataset)
if eval_dataset is not None:
    print("eval_dataset:", eval_dataset)

Train pairs: 4615
Eval pairs: 548
Eval orders: 50

train_dataset: Dataset({
    features: ['anchor', 'positive'],
    num_rows: 4615
})
eval_dataset: Dataset({
    features: ['anchor', 'positive'],
    num_rows: 548
})


## 7. Eval information retrieval artifacts

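`eval_queries`, `eval_corpus`, `eval_relevant_docs` for InformationRetrievalEvaluator. When `EVAL_SERVE_TIME=True`, strip " Next: ..." from eval queries so eval matches production (script: `--no-eval-serve-time` to keep it).

Note: the recorded output below appears to come from an earlier run (float-style qid such as `3105153.0`, verbose query wording, empty `relevant_docs`); re-run the cells in order to refresh it.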

In [15]:
def strip_next_order_from_context(context: str) -> str:
    """Drop everything from the first ' Next:' onward for eval/serve, where the next order time is unknown.

    The kept prefix is whitespace-stripped; a context without ' Next:' is
    returned unchanged.
    """
    head, marker, _rest = context.partition(" Next:")
    return head.strip() if marker else context

# Queries: one per eval order that has a context. With EVAL_SERVE_TIME the
# " Next: ..." clause is removed, because the next order's timing is not known
# at serve time.
eval_queries = {
    str(oid): (
        strip_next_order_from_context(order_id_to_context[oid])
        if EVAL_SERVE_TIME
        else order_id_to_context[oid]
    )
    for oid in eval_order_ids
    if oid in order_id_to_context
}

# Relevant docs: the product ids actually bought in each eval order.
eval_relevant_docs = {str(oid): set() for oid in eval_order_ids}
train_op = pd.read_csv(DATA_DIR / "order_products__train.csv")
# Narrow to the eval orders first so the Python loop touches only their rows
# instead of every row of the train file.
eval_rows = train_op[train_op["order_id"].isin(list(eval_order_ids))]
for oid, pid in zip(eval_rows["order_id"].tolist(), eval_rows["product_id"].tolist()):
    oid_str = str(int(oid))
    if oid_str in eval_relevant_docs:
        eval_relevant_docs[oid_str].add(str(int(pid)))

# Corpus: every product text, keyed by product id as a string.
eval_corpus = {str(pid): text for pid, text in product_text_map.items()}

In [16]:
# Inspect the retrieval artifacts: sizes, then one sample query with its relevant docs.
print("eval_queries:", len(eval_queries))
print("eval_corpus (products):", len(eval_corpus))
print("eval_relevant_docs:", len(eval_relevant_docs))
print("\nSample query (first qid):")
# NOTE: indexing [0] raises IndexError if eval_queries is empty.
qid = list(eval_queries.keys())[0]
print(f"  qid={qid}")
print("  query:", eval_queries[qid][:150], "...")
print("  relevant_docs:", list(eval_relevant_docs.get(qid, []))[:10])

eval_queries: 50
eval_corpus (products): 49688
eval_relevant_docs: 50

Sample query (first qid):
  qid=3105153.0
  query: Previously ordered: [weekday 5, hour 8.0] Peppermint Mocha Liquid Coffee Creamer, Chicken & Apple Smoked Chicken Sausage, Chunky Medium Salsa, Organic ...
  relevant_docs: []


## 8. (Optional) Save to disk

Script writes to a param-based subdir under `OUTPUT_DIR` (e.g. `p5_mp20_ef0.1`) and saves `data_prep_params.json`. Uncomment to write.

In [17]:
# import json
#
# def _params_subdir(max_prior_orders, max_product_names, eval_frac, eval_serve_time, sample_frac, max_target_orders):
#     parts = [f"p{max_prior_orders}", f"mp{max_product_names}", f"ef{eval_frac}"]
#     if not eval_serve_time:
#         parts.append("no_serve")
#     if sample_frac is not None:
#         parts.append(f"sf{sample_frac}")
#     if max_target_orders is not None:
#         parts.append(f"mt{max_target_orders}")
#     return "_".join(parts)
#
# subdir = _params_subdir(MAX_PRIOR_ORDERS, MAX_PRODUCT_NAMES, EVAL_FRAC, EVAL_SERVE_TIME, SAMPLE_FRAC, MAX_TARGET_ORDERS)
# effective_output_dir = OUTPUT_DIR / subdir
# effective_output_dir.mkdir(parents=True, exist_ok=True)
#
# train_dataset.save_to_disk(str(effective_output_dir / "train_dataset"))
# if eval_dataset is not None:
#     eval_dataset.save_to_disk(str(effective_output_dir / "eval_dataset"))
# with open(effective_output_dir / "eval_queries.json", "w") as f:
#     json.dump(eval_queries, f, indent=0)
# with open(effective_output_dir / "eval_corpus.json", "w") as f:
#     json.dump(eval_corpus, f, indent=0)
# with open(effective_output_dir / "eval_relevant_docs.json", "w") as f:
#     json.dump({k: list(v) for k, v in eval_relevant_docs.items()}, f, indent=0)
# data_prep_params = {"data_dir": str(DATA_DIR), "output_dir": str(effective_output_dir), "max_prior_orders": MAX_PRIOR_ORDERS, "max_product_names": MAX_PRODUCT_NAMES, "sample_frac": SAMPLE_FRAC, "eval_frac": EVAL_FRAC, "eval_serve_time": EVAL_SERVE_TIME, "max_target_orders": MAX_TARGET_ORDERS, "seed": SEED, "n_train_pairs": len(train_anchors), "n_eval_pairs": len(eval_anchors), "n_eval_queries": len(eval_queries), "n_corpus": len(eval_corpus)}
# with open(effective_output_dir / "data_prep_params.json", "w") as f:
#     json.dump(data_prep_params, f, indent=2)
# print("Saved to", effective_output_dir)