# Generate Large-Scale Synthetic Datasets for Performance Benchmarking

This notebook generates synthetic choice datasets for performance benchmarking by: (1) subsampling users and items (e.g., all 500 users and 30 items) from a seed supermarket dataset that provides 30-dimensional latent user/item feature vectors, optionally keeping only the first `NUM_LATENTS` dimensions (e.g., 10), (2) selecting the purchase records that match the subsampled users and items (all of them by default, or a random sample of `NUM_RECORDS`, e.g., 10,000 or 30,000), (3) encoding user/item IDs as consecutive integers with label encoders, (4) creating a torch-choice `ChoiceDataset` from the latent features and purchase indices, and (5) optionally exporting a long-format CSV with one-hot encoded choices and merged latent features for compatibility with traditional choice modeling tools.

In [1]:
import os
import subprocess
import sys
from time import ctime, time
from typing import Optional

import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import torch
import torch_choice
from torch_choice.data import ChoiceDataset
from tqdm import tqdm

In [2]:
# Report the runtime environment: Python version, torch version, and the name of
# the first CUDA device when one is available.
if torch.cuda.is_available():
    gpu_line = f"GPU information: {torch.cuda.get_device_name(0)}"
else:
    gpu_line = "No GPU available"

for report_line in (
    f"Python version: {sys.version}",
    f"Torch version: {torch.__version__}",
    gpu_line,
):
    print(report_line)

Python version: 3.9.7 | packaged by conda-forge | (default, Sep 29 2021, 19:24:02) 
[Clang 11.1.0 ]
Torch version: 2.6.0
No GPU available


## Configuration of the Synthetic Dataset Generation.
We will generate synthetic datasets by inflating an existing supermarket shopping dataset (called the `seed` dataset). This seed dataset is available upon request.

In [3]:
# Configure the input and output paths for the synthetic dataset.
INPUT_PATH = "./data/supermarket_simulation_seed"
assert os.path.exists(INPUT_PATH), f"The input path {INPUT_PATH} does not exist."
# Also report the last modified time of the input path, as a provenance record.
print(f"Last modified time of the input path ({INPUT_PATH}): {ctime(os.path.getmtime(INPUT_PATH))}")
# Total on-disk size of the seed data. This shells out to `du`, so it requires a
# POSIX-like environment; `subprocess` is imported in the notebook's import cell.
du_output = subprocess.check_output(["du", "-hs", INPUT_PATH]).decode("utf-8").strip()
print(f"Total file size of the input path ({INPUT_PATH}): {du_output}")
# NOTE: machine-specific absolute path (an external volume); change it to a
# writable location before running this notebook elsewhere.
OUTPUT_PATH = "/Volumes/HS_SSD/torch_choice_benchmark_data"
os.makedirs(OUTPUT_PATH, exist_ok=True)
print(f"Synthetic dataset will be saved to: {OUTPUT_PATH}")

Last modified time of the input path (./data/supermarket_simulation_seed): Mon Feb 24 16:21:19 2025
Total file size of the input path (./data/supermarket_simulation_seed): 239M	./data/supermarket_simulation_seed
Synthetic dataset will be saved to: /Volumes/HS_SSD/torch_choice_benchmark_data


## Load the Seed Dataset

In [4]:
# Load the latent vectors of users and items from the seed dataset.
# The per-user `price_coef` column is dropped right after loading.
user_latents = pd.read_csv(os.path.join(INPUT_PATH, "user_latents.csv")).drop(columns="price_coef")
item_latents = pd.read_csv(os.path.join(INPUT_PATH, "item_latents.csv"))
# Item-by-session prices are currently not loaded:
# item_sess_price = pd.read_csv(os.path.join(INPUT_PATH, "item_sess_price.tsv"), sep="\t", header=None, names=["item_id", "session_id", "price"])

# Give the latent columns "0", "1", ..., "29" descriptive, prefixed names.
# A single non-mutating rename per frame replaces the former loop of in-place
# renames; labels missing from a frame are ignored by `rename`, exactly as before.
user_latents = user_latents.rename(columns={str(i): f"user_latent_{i}" for i in range(30)})
item_latents = item_latents.rename(columns={str(i): f"item_latent_{i}" for i in range(30)})

## Preview the Seed Dataset

In [5]:
# Render both latent tables (users first, then items) using the rich notebook display.
display(user_latents, item_latents)

Unnamed: 0,user_latent_0,user_latent_1,user_latent_2,user_latent_3,user_latent_4,user_latent_5,user_latent_6,user_latent_7,user_latent_8,user_latent_9,...,user_latent_21,user_latent_22,user_latent_23,user_latent_24,user_latent_25,user_latent_26,user_latent_27,user_latent_28,user_latent_29,user_id
0,1.652043,0.896975,-0.320634,-0.725527,0.408554,0.411511,0.260232,0.794857,-1.238917,-2.424044,...,-1.489928,-0.541257,-1.069119,-1.686813,-0.793135,-0.410285,-1.002112,1.769302,1.412756,10
1,-1.424878,-2.330930,-0.484043,1.228705,-1.149234,-0.656144,0.494841,-0.267629,-0.998971,-0.284150,...,-1.568228,-0.382963,0.241070,-0.866551,-0.399027,0.903710,0.464493,-0.947236,-0.819285,11
2,0.791819,-0.233433,1.250640,-0.016727,-0.755141,-0.333004,-1.005224,-0.929233,0.078521,-0.061093,...,0.546483,0.407443,0.900532,-0.420322,-0.665277,-0.658553,0.750101,1.715973,1.043971,12
3,-1.537094,1.109955,0.071588,-0.412510,-2.458540,-0.297457,0.038190,-0.793976,-0.382170,0.908736,...,-1.070785,-0.097273,-0.342732,-0.276352,0.980518,0.095073,-0.339366,1.464080,0.130827,13
4,-0.071070,-0.201505,-0.112519,-0.736719,-0.648391,-0.477821,-0.313232,0.583525,-1.649237,-1.826413,...,0.335774,-0.276055,-0.588549,1.092662,1.165659,-0.252770,-0.264086,1.595825,1.266879,14
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
495,-0.629963,-0.936433,-0.609025,0.894321,-1.458818,1.589152,-0.119949,-0.043622,-0.838796,-0.314547,...,1.256244,-0.543903,1.063730,1.437808,0.594138,-1.140934,0.504068,-1.268392,0.215705,1495
496,-0.654049,0.349450,1.117007,1.476670,0.562758,-1.577097,-1.557933,-1.695043,-1.035915,1.517097,...,0.215489,-0.569829,-1.409151,2.635519,1.012705,-0.224135,1.259907,1.344729,-1.647557,1496
497,0.292110,0.667330,-2.595917,0.668047,1.103777,0.440590,0.118690,-0.583405,0.945426,-0.013941,...,0.810359,-0.906858,0.154597,-0.389211,-0.316069,1.118300,-0.643973,-1.474082,-1.031294,1497
498,-0.316736,-0.164831,0.154419,-0.060956,-0.383504,-0.004693,1.297727,-0.747652,-0.574622,-0.405096,...,1.486630,-0.977136,1.796335,1.412003,0.516029,-0.759676,1.773111,1.008517,-0.648832,1498


Unnamed: 0,item_latent_0,item_latent_1,item_latent_2,item_latent_3,item_latent_4,item_latent_5,item_latent_6,item_latent_7,item_latent_8,item_latent_9,...,item_latent_21,item_latent_22,item_latent_23,item_latent_24,item_latent_25,item_latent_26,item_latent_27,item_latent_28,item_latent_29,item_id
0,-1.180416,0.780843,-0.609615,-0.009930,-0.346506,0.256034,-1.544774,0.894837,-0.260633,-0.108083,...,-0.078928,1.246908,-1.235101,0.919382,-0.386309,-1.631385,1.979308,0.631791,-0.231759,0
1,0.076907,-0.546282,-0.066576,-0.222129,0.767167,0.454014,1.033211,0.681780,-0.436766,1.428369,...,1.879548,0.396369,-0.077428,-0.268600,0.047633,-1.203710,0.728284,-0.449082,-0.634305,1
2,1.194630,0.956158,-0.116020,1.457069,0.475462,0.462169,0.511546,-0.962028,-0.687628,0.376553,...,-1.500292,0.294344,-1.508859,-0.205371,-0.005506,-0.356446,-0.311747,0.353643,2.078901,2
3,-1.047516,0.768583,-1.523498,0.106635,0.837008,0.020559,1.783410,0.153145,-0.002027,0.354408,...,-0.561901,1.048928,-2.547275,-0.059737,-0.216024,0.420048,1.147283,-0.968257,-1.609774,3
4,0.355223,1.116302,1.122433,-1.107433,1.780376,0.707121,0.287654,-0.955454,2.414806,1.184566,...,0.699091,-0.455226,-0.620574,-1.204995,-0.153397,0.265757,-0.636522,-1.009489,1.319974,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
495,-1.030201,-1.918512,1.597233,0.344906,-0.548310,-0.795283,0.502118,-0.843086,-0.221221,0.934821,...,1.873550,0.430558,0.022867,0.600208,0.283746,0.684030,0.944583,0.381152,-0.629110,495
496,0.836681,-0.903445,-0.931781,-0.641818,-0.698298,1.665656,1.200952,0.039846,-0.758340,-0.431396,...,-1.064213,1.223528,1.208416,-0.506720,1.334080,0.845463,0.889762,0.307895,-0.591260,496
497,-0.269530,0.383244,-0.669919,1.053787,-0.408319,0.331533,-0.818428,-0.973172,0.687965,-0.280867,...,1.064936,0.797903,-0.264837,1.365297,0.003599,0.313854,-0.841400,0.863058,-1.192473,497
498,0.765047,-0.008431,-0.340475,0.110560,0.695212,1.938753,0.075895,0.022364,-1.360376,0.955210,...,1.155430,-1.438004,-0.755103,-0.381443,-0.026872,-0.480775,-1.370354,0.072734,-0.082506,498


## Load the Purchase Records $(u^{(n)}, i^{(n)}, s^{(n)})$ from the Seed Dataset

In [6]:
# Read the headerless, tab-separated purchase records, naming the four columns,
# then keep only the (user_id, item_id) pair of every record.
raw_purchase_columns = ["user_id", "item_id", "session_id", "3"]
train_all = pd.read_csv(
    os.path.join(INPUT_PATH, "train_all.tsv"),
    sep="\t",
    header=None,
    names=raw_purchase_columns,
)
train_all = train_all.drop(columns=["3", "session_id"])
display(train_all)

# Basic summary statistics of the purchase records.
mean_records_per_user = train_all["user_id"].value_counts().mean()
mean_records_per_item = train_all["item_id"].value_counts().mean()
print(f"Total number of purchase records: {len(train_all):,}")
print(f"Number of unique users: {train_all['user_id'].nunique():,} each appeared for {mean_records_per_user:.2f} sessions on average.")
print(f"Number of unique items: {train_all['item_id'].nunique():,} each appeared for {mean_records_per_item:.2f} times on average.")

Unnamed: 0,user_id,item_id
0,10,4
1,10,6
2,10,10
3,10,15
4,10,20
...,...,...
1750095,1499,478
1750096,1499,483
1750097,1499,487
1750098,1499,493


Total number of purchase records: 1,750,100
Number of unique users: 500 each appeared for 3500.20 sessions on average.
Number of unique items: 500 each appeared for 3500.20 times on average.


## Subsample Data for Benchmarking

In [7]:
# Universe of ids available for subsampling, taken from the latent tables.
ALL_USERS = user_latents["user_id"].unique()
ALL_ITEMS = item_latents["item_id"].unique()
# Size of the seed data: number of purchase records and of latent dimensions.
TOTAL_NUM_RECORDS = train_all.shape[0]
TOTAL_NUM_LATENTS = 30
# Example subsample sizes, kept for reference only (the actual sizes are passed
# per experiment further down):
# NUM_USERS = 500
# NUM_RECORDS = 100_000
# NUM_LATENTS = 10  (number of latent dimensions to keep)

In [8]:
def _build_long_format_choices(records: pd.DataFrame, num_items: int) -> pd.DataFrame:
    """Expand purchase records into long format: one row per (record, item) pair.

    `records` must hold the encoded integer `item_id` (the chosen item) and
    `user_id` of each purchase. The i-th record becomes session `i`, with one row
    per candidate item and `choice` equal to 1.0 on the chosen item and 0.0 elsewhere.
    """
    num_records = len(records)
    chosen_items = records["item_id"].to_numpy()
    candidate_items = np.tile(np.arange(num_items), num_records)
    return pd.DataFrame({
        "session_id": np.repeat(np.arange(num_records), num_items),
        "item_id": candidate_items,
        # Stored as float so the CSV carries 0.0 / 1.0 indicators.
        "choice": (candidate_items == np.repeat(chosen_items, num_items)).astype(float),
        "user_id": np.repeat(records["user_id"].to_numpy(), num_items),
    })


def simulate_choice_data(
    user_latents: pd.DataFrame,
    item_latents: pd.DataFrame,
    NUM_ITEMS: int,
    NUM_USERS: int,
    NUM_LATENTS: Optional[int]=None,
    NUM_RECORDS: Optional[int]=None,
    seed: int=42,
    experiment_tag: Optional[str]=None,
    export_csv: bool=False,
) -> None:
    """
    Simulate choice data for performance benchmarking.

    Subsamples users and items from the given latent tables, collects the matching
    purchase records, re-encodes ids as consecutive integers, and writes a
    ChoiceDataset (and optionally a long-format CSV) to disk.

    Besides its arguments, this function reads the module-level names `train_all`
    (purchase records with `user_id` and `item_id` columns), `TOTAL_NUM_LATENTS`,
    and `OUTPUT_PATH`.

    Args:
        user_latents: frame with a `user_id` column and `user_latent_0` ...
            `user_latent_{TOTAL_NUM_LATENTS - 1}` columns.
        item_latents: frame with an `item_id` column and `item_latent_0` ...
            `item_latent_{TOTAL_NUM_LATENTS - 1}` columns.
        NUM_ITEMS: number of items to draw (without replacement).
        NUM_USERS: number of users to draw (without replacement).
        NUM_LATENTS: if given, only the first NUM_LATENTS latent dimensions are kept.
        NUM_RECORDS: number of matching purchase records to sample without
            replacement; None keeps every matching record.
        seed: seed for both the user/item draw and the record sampling.
        experiment_tag: tag embedded in the output file names ("default" if None).
        export_csv: whether to also write the long-format CSV.

    Raises:
        ValueError: if NUM_RECORDS exceeds the number of matching purchase records.
    """
    # Set a default experiment tag if none is provided.
    if experiment_tag is None:
        experiment_tag = "default"

    # A local legacy RandomState yields the same draws as seeding the global numpy
    # RNG with `seed`, without clobbering global RNG state for the rest of the notebook.
    rng = np.random.RandomState(seed)

    # Check for completeness of the input data.
    missing_user_latents = [f"user_latent_{i}" for i in range(TOTAL_NUM_LATENTS) if f"user_latent_{i}" not in user_latents.columns]
    missing_item_latents = [f"item_latent_{i}" for i in range(TOTAL_NUM_LATENTS) if f"item_latent_{i}" not in item_latents.columns]
    assert not missing_user_latents, f"user_latents must contain all {TOTAL_NUM_LATENTS} latent dimensions."
    assert not missing_item_latents, f"item_latents must contain all {TOTAL_NUM_LATENTS} latent dimensions."
    assert "item_id" in item_latents.columns, "item_latents must contain the item id column."
    assert "user_id" in user_latents.columns, "user_latents must contain the user id column."

    # Subset the latent dimensions, if requested (`drop` already returns a new frame).
    if NUM_LATENTS is not None:
        user_latents = user_latents.drop(columns=[f"user_latent_{i}" for i in range(NUM_LATENTS, TOTAL_NUM_LATENTS)])
        item_latents = item_latents.drop(columns=[f"item_latent_{i}" for i in range(NUM_LATENTS, TOTAL_NUM_LATENTS)])

    # Randomly choose items and users for the performance benchmark. The candidates
    # come from the frames passed in, so every selected id is guaranteed to have a
    # latent vector. Items are drawn before users to keep the draw order stable.
    selected_items = rng.choice(item_latents["item_id"].unique(), size=NUM_ITEMS, replace=False)
    selected_users = rng.choice(user_latents["user_id"].unique(), size=NUM_USERS, replace=False)

    # Get all records that match the selected items and users.
    suitable_records = train_all[(train_all["item_id"].isin(selected_items)) &
                                 (train_all["user_id"].isin(selected_users))]
    if NUM_RECORDS is None:
        # Use all matching records by default.
        records = suitable_records.copy()
        NUM_RECORDS = len(records)
    else:
        if NUM_RECORDS > len(suitable_records):
            raise ValueError(
                f"Requested NUM_RECORDS={NUM_RECORDS:,}, but only {len(suitable_records):,} "
                "purchase records match the selected users and items."
            )
        # Sample NUM_RECORDS records from the suitable records.
        records = suitable_records.sample(NUM_RECORDS, replace=False, random_state=seed)

    # Retrieve the corresponding latent vectors for the selected users and items.
    relevant_user_latents = user_latents[user_latents["user_id"].isin(selected_users)]
    relevant_item_latents = item_latents[item_latents["item_id"].isin(selected_items)]

    # Encode the item and user ids to consecutive integers.
    item_encoder = LabelEncoder().fit(selected_items)
    user_encoder = LabelEncoder().fit(selected_users)

    # `assign` returns new frames, so neither the caller's frames nor `train_all` are mutated.
    relevant_item_latents = relevant_item_latents.assign(item_id=item_encoder.transform(relevant_item_latents["item_id"].values))
    relevant_user_latents = relevant_user_latents.assign(user_id=user_encoder.transform(relevant_user_latents["user_id"].values))
    records = records.assign(
        item_id=item_encoder.transform(records["item_id"].values),
        user_id=user_encoder.transform(records["user_id"].values),
    )

    # Order the latent rows by encoded id so that row k holds the latent of id k.
    relevant_item_latents = relevant_item_latents.set_index("item_id").loc[np.arange(NUM_ITEMS)]
    relevant_user_latents = relevant_user_latents.set_index("user_id").loc[np.arange(NUM_USERS)]

    # Create the ChoiceDataset.
    dataset = ChoiceDataset(
        item_index=torch.LongTensor(records["item_id"].values),
        user_index=torch.LongTensor(records["user_id"].values),
        # we don't have a session index in this dataset, we just assume each record is in its own session.
        session_index=torch.arange(len(records)).long(),
        user_latents=torch.FloatTensor(relevant_user_latents.values),
        item_latents=torch.FloatTensor(relevant_item_latents.values),
        num_items=NUM_ITEMS,
    )
    # Assign the private size attributes explicitly (workaround kept from the original code).
    dataset._num_items = NUM_ITEMS
    dataset._num_users = NUM_USERS

    # Save the torch-choice dataset.
    torch.save(dataset, os.path.join(OUTPUT_PATH, f"simulated_choice_data_{experiment_tag}_seed_{seed}.pt"))

    if export_csv:
        # Save the dataset in long-format CSV for mlogit compatibility.
        # Note that these CSV files can be pretty big (NUM_RECORDS * NUM_ITEMS rows).
        df_long = _build_long_format_choices(records, NUM_ITEMS)
        # Attach the latent features of the row's user and of the row's candidate item.
        df_long = df_long.merge(relevant_user_latents.reset_index(), on="user_id", how="left")
        df_long = df_long.merge(relevant_item_latents.reset_index(), on="item_id", how="left")
        df_long.to_csv(os.path.join(OUTPUT_PATH, f"simulated_choice_data_{experiment_tag}_seed_{seed}.csv"), index=False)

# Generate Synthetic Datasets for Performance Benchmarking

## Number of Records Experiment (Small)

In [9]:
# Small number-of-records benchmark: 30 randomly chosen items, every user, the
# first 10 latent dimensions, and all matching purchase records; also exported to CSV.
simulate_choice_data(
    user_latents=user_latents,
    item_latents=item_latents,
    NUM_USERS=len(ALL_USERS),
    NUM_ITEMS=30,
    NUM_LATENTS=10,
    NUM_RECORDS=None,
    experiment_tag="num_records_experiment_small",
    seed=42,
    export_csv=True,
)

## Number of Records Experiment (Large)

In [10]:
# The full dataset: every item, every user, all 30 latent dimensions, and all
# purchase records. No CSV export for this one.
simulate_choice_data(
    user_latents=user_latents,
    item_latents=item_latents,
    NUM_USERS=len(ALL_USERS),
    NUM_ITEMS=len(ALL_ITEMS),
    NUM_LATENTS=30,
    NUM_RECORDS=None,
    experiment_tag="full_dataset",
    seed=42,
    export_csv=False,
)

## Number of Covariates/Parameters Experiment (Small)

In [11]:
# Small number-of-parameters benchmark: 50 randomly chosen items, every user, all
# 30 latent dimensions, and a random sample of 10,000 matching records; exported to CSV.
simulate_choice_data(
    user_latents=user_latents,
    item_latents=item_latents,
    NUM_USERS=len(ALL_USERS),
    NUM_ITEMS=50,
    NUM_LATENTS=30,
    NUM_RECORDS=10_000,
    experiment_tag="num_params_experiment_small",
    seed=42,
    export_csv=True,
)

## Number of Covariates/Parameters Experiment (Large)
**Use the full dataset, which has already been generated.**

## Number of Items Experiment (Small with 10,000 records)

In [12]:
# Sweep the number of items with 5 latent dimensions and 10,000 sampled records
# per dataset; each dataset is also exported to CSV.
small_sweep_item_counts = [10, 20, 30, 50, 100, 150, 200]
for num_items_sampled in tqdm(small_sweep_item_counts):
    simulate_choice_data(
        user_latents=user_latents,
        item_latents=item_latents,
        NUM_USERS=len(ALL_USERS),
        NUM_ITEMS=num_items_sampled,
        NUM_LATENTS=5,
        NUM_RECORDS=10_000,
        experiment_tag=f"num_items_experiment_small_{num_items_sampled}_items",
        seed=42,
        export_csv=True,
    )

100%|██████████| 7/7 [00:53<00:00,  7.59s/it]


## Number of Items Experiment (Large with 30,000 records)

In [13]:
# Sweep the number of items up to 500 with all 30 latent dimensions and 30,000
# sampled records per dataset; no CSV export.
large_sweep_item_counts = [10, 20, 30, 50, 100, 150, 200, 250, 300, 350, 400, 450, 500]
for num_items_sampled in tqdm(large_sweep_item_counts):
    simulate_choice_data(
        user_latents=user_latents,
        item_latents=item_latents,
        NUM_USERS=len(ALL_USERS),
        NUM_ITEMS=num_items_sampled,
        NUM_LATENTS=30,
        NUM_RECORDS=30_000,
        experiment_tag=f"num_items_experiment_large_{num_items_sampled}_items",
        seed=42,
        export_csv=False,
    )

100%|██████████| 13/13 [00:00<00:00, 21.99it/s]


In [14]:
# Sanity check: reload the saved full dataset from OUTPUT_PATH, using the same
# file-name pattern that simulate_choice_data writes, instead of repeating the
# absolute path by hand.
# NOTE: weights_only=False unpickles arbitrary Python objects. Only load files
# that were produced by this notebook (or another trusted source) this way.
ds = torch.load(
    os.path.join(OUTPUT_PATH, "simulated_choice_data_full_dataset_seed_42.pt"),
    weights_only=False)

In [15]:
# Number of sessions in the reloaded dataset. Every purchase record was given its
# own session index when the dataset was built, so this should equal the number
# of purchase records in the full dataset.
ds.num_sessions



1750100

In [16]:
# Print a completion marker once the cells above have run.
print("Finished.")

Finished.
