# Data Preparation for Semantic ID Generation

In [1]:
# Amazon review category to process; it is interpolated into the download URLs
# and local file names used below (e.g. swap in "Baby_Products").
CATEGORY = "Video_Games"

# Bounds on the per-user item sequence: sequences shorter than the minimum are
# dropped, longer ones keep only their last MAX_SEQUENCE_LENGTH items (tune as needed).
MIN_SEQUENCE_LENGTH, MAX_SEQUENCE_LENGTH = 3, 100

In [2]:
import sys
from pathlib import Path

# Paths are resolved from the kernel's working directory.
# NOTE(review): this assumes the kernel was started in the notebook's own folder,
# one level below the project root — confirm if the notebook is launched elsewhere.
NOTEBOOK_DIR = Path.cwd()
PROJECT_ROOT = NOTEBOOK_DIR.parent

# Downloads and processed outputs live under <project root>/data.
DATA_DIR = PROJECT_ROOT / "data"
DATA_DIR.mkdir(exist_ok=True)

# Put the project root on the import path so `src.*` modules can be imported.
sys.path.append(str(PROJECT_ROOT))

In [None]:
import gzip
import shutil
import urllib.request

import polars as pl

from src.logger import setup_logger

# Notebook-wide logger named "dataprep". setup_logger is imported from the project's
# src.logger module; its handlers and format are not visible from this notebook.
logger = setup_logger("dataprep")

In [4]:
# Download locations for the selected category: item metadata, raw reviews,
# and the 5-core benchmark training sequences (with per-row history).
AMAZON_2023_BASE_URL = "https://mcauleylab.ucsd.edu/public_datasets/data/amazon_2023"

ITEMS_URL = f"{AMAZON_2023_BASE_URL}/raw/meta_categories/meta_{CATEGORY}.jsonl.gz"
REVIEWS_URL = f"{AMAZON_2023_BASE_URL}/raw/review_categories/{CATEGORY}.jsonl.gz"
SEQUENCES_URL = f"{AMAZON_2023_BASE_URL}/benchmark/5core/timestamp_w_his/{CATEGORY}.train.csv.gz"

## Download and Load Data

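First, we download the gzipped files for the selected category from the Amazon Reviews 2023 dataset into the data directory: the item metadata and reviews (JSONL, unzipped after download) and the 5-core training sequences (CSV). The item metadata is then loaded with Polars' `read_ndjson`, which reads newline-delimited JSON files; the sequences are loaded further below with `read_csv`.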

In [5]:
# Download and unzip the data files
def download_and_unzip(url, output_path):
    """Download a gzipped file from ``url`` and write its decompressed content to ``output_path``.

    The archive is first saved next to the destination as ``<name>.gz`` and
    decompressed into ``<name>.part``; only a fully decompressed file is moved to
    ``output_path``. This matters because callers skip the download whenever
    ``output_path`` already exists, so a half-written file must never be left there.

    Args:
        url: Location of the gzipped file (anything ``urllib.request.urlretrieve`` accepts).
        output_path: Destination of the decompressed file (``str`` or ``pathlib.Path``).

    Raises:
        urllib.error.URLError: If the download fails.
        OSError: If the archive is not valid gzip data or a file cannot be written.
    """
    output_path = Path(output_path)
    # Temporary siblings of the destination; both are removed again before returning.
    gz_path = output_path.with_name(output_path.name + ".gz")
    partial_path = output_path.with_name(output_path.name + ".part")

    try:
        logger.info(f"Downloading {url}...")
        urllib.request.urlretrieve(url, gz_path)
        logger.info(f"Downloaded to {gz_path}")

        logger.info(f"Unzipping {gz_path}...")
        with gzip.open(gz_path, "rb") as f_in, open(partial_path, "wb") as f_out:
            shutil.copyfileobj(f_in, f_out)
        # Same directory, so the move is a rename: readers never see a partial file.
        partial_path.replace(output_path)
        logger.info(f"Unzipped to {output_path}")
    finally:
        # Runs on success and on failure, so no stale archive or partial output is left behind.
        gz_path.unlink(missing_ok=True)
        partial_path.unlink(missing_ok=True)

    logger.info(f"Removed {gz_path}\n")


# Local destinations of the three downloads.
item_output_path = DATA_DIR / f"meta_{CATEGORY}.jsonl"
review_output_path = DATA_DIR / f"review_{CATEGORY}.jsonl"
sequences_output_path = DATA_DIR / f"{CATEGORY}.train.csv.gz"

# Item metadata and reviews are stored decompressed (.jsonl); skip files already present.
for label, url, jsonl_path in (
    ("Item", ITEMS_URL, item_output_path),
    ("Review", REVIEWS_URL, review_output_path),
):
    if jsonl_path.exists():
        logger.info(f"{label} data already exists at {jsonl_path}")
    else:
        download_and_unzip(url, jsonl_path)

# The sequences file keeps its ".csv.gz" name, so it is stored still gzipped rather than
# being decompressed under a misleading extension. The loading cell passes this path
# straight to pl.read_csv.
if sequences_output_path.exists():
    logger.info(f"Sequences data already exists at {sequences_output_path}")
else:
    # Download to a temporary sibling first so an interrupted transfer is not
    # mistaken for a complete file by the exists() check on the next run.
    partial_path = sequences_output_path.with_name(sequences_output_path.name + ".part")
    logger.info(f"Downloading {SEQUENCES_URL}...")
    try:
        urllib.request.urlretrieve(SEQUENCES_URL, partial_path)
        partial_path.replace(sequences_output_path)
    finally:
        partial_path.unlink(missing_ok=True)
    logger.info(f"Downloaded to {sequences_output_path}")

18:08:21 - Item data already exists at /Users/eugeneyan/projects/semantic-id/data/meta_Video_Games.jsonl


18:08:21 - Review data already exists at /Users/eugeneyan/projects/semantic-id/data/review_Video_Games.jsonl


18:08:21 - Sequences data already exists at /Users/eugeneyan/projects/semantic-id/data/Video_Games.train.csv.gz


## Prepare item metadata

In [6]:
# Load the raw item metadata (newline-delimited JSON). No rows are filtered here;
# the title/description filter is applied in a later cell.
# NOTE(review): ignore_errors=True presumably tolerates records that do not match the
# inferred schema instead of raising — confirm against the Polars docs.
item_df = pl.read_ndjson(DATA_DIR / f"meta_{CATEGORY}.jsonl", ignore_errors=True)
logger.info(f"Total items in metadata: {len(item_df):,}")
logger.info(f"Item metadata columns: {item_df.columns}")

# description, features and categories are list columns: flatten each into one string
# (categories joined with " > " to read as a path) and replace nulls with "".
item_df = item_df.with_columns(
    pl.col("description").list.join(" ").fill_null("").alias("description_text"),
    pl.col("features").list.join(" ").fill_null("").alias("features_text"),
    pl.col("categories").list.join(" > ").fill_null("").alias("categories_text"),
)

18:08:23 - Total items in metadata: 137,269


18:08:23 - Item metadata columns: ['main_category', 'title', 'average_rating', 'rating_number', 'features', 'description', 'price', 'images', 'videos', 'store', 'categories', 'details', 'parent_asin', 'bought_together']


18:08:23 - Total items in metadata: 137,269


In [7]:
# Flattened description of the first item, shown as a plain Python string.
item_df.select("description_text").head(1).item()

'The Dash 8-300 Professional Add-On lets you pilot a real commuter special. Fly two versions of the popular Dash 8-300 in a total of 17 different liveries. The Dash 8-300 is one of the most popular short-haul aircraft available and this superbly modelled version from acclaimed aircraft developers PSS is modelled in two versions with a total of 17 different liveries. The package also includes scenery for three European airports, tutorials, tutorial flights and utilities together in one fantastic package.'

In [8]:
# Flattened features of the first item, shown as a plain Python string.
item_df.select("features_text").head(1).item()

"Features Dash 8-300 and 8-Q300 ('Q' rollout livery) Airlines - US Airways, South African Express, Bahamasair, Augsburg Airways, Lufthansa Cityline, British Airways (Union Jack), British European, FlyBe, Intersky, Wideroe, Iberia, Tyrolean, QantasLink, BWIA Airports include - London City, Frankfurt, Milan and Amsterdam Schipol Includes PSS PanelConfig and LoadEdit tools"

In [9]:
# Category path of the first item, shown as a plain Python string.
item_df.select("categories_text").head(1).item()

'Video Games > PC > Games'

In [None]:
# Keep only items with enough text to describe them: a title longer than 20 characters
# and a flattened description (description_text) longer than 100 characters.
original_num_items = len(item_df)
logger.info(f"Initial number of items in metadata: {original_num_items:,}")

has_usable_title = pl.col("title").is_not_null() & (pl.col("title").str.len_chars() > 20)
has_usable_description = pl.col("description_text").is_not_null() & (
    pl.col("description_text").str.len_chars() > 100
)
item_df = item_df.filter(has_usable_title & has_usable_description)

# IDs of the surviving items; used later to drop unknown items from user sequences.
valid_items = set(item_df["parent_asin"].to_list())
num_invalid_items = original_num_items - len(valid_items)
logger.info(f"Items with valid metadata (title + description): {len(valid_items):,}")
logger.info(
    f"Items without valid metadata: {num_invalid_items:,} ({num_invalid_items / original_num_items * 100:.1f}%)"
)

18:08:23 - Initial number of items in metadata: 137,269


18:08:23 - Items with valid metadata (title + description): 66,133


18:08:23 - Items without valid metadata: 71,136 (51.8%)


In [11]:
# Widen string rendering to 500 characters so the descriptions are readable in the table.
pl.Config.set_fmt_str_lengths(500)
item_df.head(10).select("description_text")

description_text
str
"""The Dash 8-300 Professional Add-On lets you pilot a real commuter special. Fly two versions of the popular Dash 8-300 in a total of 17 different liveries. The Dash 8-300 is one of the most popular short-haul aircraft available and this superbly modelled version from acclaimed aircraft developers PSS is modelled in two versions with a total of 17 different liveries. The package also includes scenery for three European airports, tutorials, tutorial flights and utilities together in one fantastic p…"
"""Following the record-breaking launch of NBA 2K16, the NBA 2K franchise continues to stake its claim as the most authentic sports video game with NBA 2K17. As the franchise that “all sports video games should aspire to be” (GamesRadar), NBA 2K17 will take the game to new heights and continue to blur the lines between video game and reality."""
"""The Thrustmaster Motion Plus Elite Fitness Pack for Wii is Ideal for Nintendo Wii Fit & Wii Fit Plus games such as EA Active (EA), U Shape & My Fitness Coach (UbiSoft). Ultimate pack with 9 accessories for the Nintendo Wii Fit Balance Board including: (1) floor mat made from woven foam Size (inches) : 70x20, (2) flexible ankle or wrist weights, (1) stepper for the Wii Balance Board, (1) pedometer to count your steps during each in-game exercise or train without games, (1) armband for Wiimote or …"
"""Now you can watch the wild underwater antics of SpongeBob SquarePants on your Game Boy Advance, with this collection of 4 great episodes. In ""Hall Monitor,"" an overzealous SpongeBob becomes the new Hall Monitor at Mrs. Puff's Boating School, and extends his jurisdiction to the unsuspecting citizens of Bikini Bottom. In ""Jellyfish Jam,"" SpongeBob takes a wild jellyfish home and discovers that they multiply quickly when they take over his house! In ""Jellyfishing,"" SpongeBob and Patrick unwittingly…"
"""A set of 4 bullet button to replace all the 4 buttons (Cross, Square, Triangle, Circle) of your PS3 controller. Made from 9mm luger bullet casings, to perfectly fit the PS3. The bullet buttons are already mounted on a special plastic holder that fit directly and perfectly your PS3 controller. Each cartridge shell have a small layer of clear coat that keep the bullet button ultra shiny even after many years. The clear coat also prevent the metallic odor that can be created during extended use . C…"
"""The Konami Collector's Series: Castlevania & Contra brings back the classic games that first appeared on the NES for a new generation of computer gamers!"""
"""A power struggle begins in a Civilization dependent on an ancient technology, the blastia, and the empire that controls it. The Fates of two friends traveling separate paths intertwine in an epic adventure that threatens the existence of all. Celebrating its 10th anniversary, tales of vesperia is One of the most beloved entries in the 'tales of' series and returns with beautifully remastered full HD graphics, never before seen characters, expanded story, and much more!. Entertainment Software Ra…"
"""Product Description Turbo: Super Stunt Squad is a high-velocity action racing game featuring the super-charged crew from the film Turbo. Each character has their own signature style and tricked-out skills such as jumps, drifts, grinds, flips and other super-cool stunt moves. Expert skills will also prove useful to earn power-ups, customize your character, discover shortcuts, and win the race in the larger-than-life environments of Turbo! From the Manufacturer Turbo: Super Stunt Squad Turbo: Supe…"
"""Product Description Arctis Pro + GameDAC delivers gaming's first certified Hi-Res Audio system, ensuring you hear high-fidelity 96 kHz/ 24 bit, full resolution audio at its purest, with no down-sampling. The gaming audio system includes the legendary ESS Sabre DAC and amp, guaranteeing your audio comes through with unsurpassed quality. Plus, Arctis Pro + GameDAC is designed with luxury materials like polished steel and aluminum alloy, and the exclusive Arctis ski goggle headband and AirWeave per…"
"""From the Manufacturer This Game of The Year Edition features four new maps unique to this edition, four maps previously only available online and new online features, including Observer Mode, Friends list and Arranged Teams. Ten thousand years have passed since the treasonous acts of Chaos toppled the Golden Age of mankind. Fires burn brightly throughout the galaxies, illuminating carnage and slaughter as mankind defends itself from ancient enemies. Hardened forces collide with one goal in mind,…"


In [12]:
# Back to 100-character string rendering after the wide description preview.
pl.Config.set_fmt_str_lengths(100)

# Columns carried forward: the item ID plus every field used to build the item context.
kept_item_columns = [
    "parent_asin",
    "title",
    "description_text",
    "features_text",
    "main_category",
    "categories_text",
    "store",
    "average_rating",
    "rating_number",
    "price",
]
item_df = item_df.select(kept_item_columns)

item_df.head()

parent_asin,title,description_text,features_text,main_category,categories_text,store,average_rating,rating_number,price
str,str,str,str,str,str,str,f64,i64,f64
"""B000FH0MHO""","""Dash 8-300 Professional Add-On""","""The Dash 8-300 Professional Add-On lets you pilot a real commuter special. Fly two versions of the p…","""Features Dash 8-300 and 8-Q300 ('Q' rollout livery) Airlines - US Airways, South African Express, Ba…","""Video Games""","""Video Games > PC > Games""","""Aerosoft""",5.0,1,
"""B00Z9TLVK0""","""NBA 2K17 - Early Tip Off Edition - PlayStation 4""","""Following the record-breaking launch of NBA 2K16, the NBA 2K franchise continues to stake its claim …","""The #1 rated NBA video game simulation series for the last 15 years (Metacritic). The #1 selling NBA…","""Video Games""","""Video Games > PlayStation 4 > Games""","""2K""",4.3,223,58.0
"""B002WH4ZJG""","""Thrustmaster Elite Fitness Pack for Nintendo Wii""","""The Thrustmaster Motion Plus Elite Fitness Pack for Wii is Ideal for Nintendo Wii Fit & Wii Fit Plus…","""Includes (9) Total Accessories Pedometer Wii Fit Balance Board Stepper Floor Mat made from high dens…","""Video Games""","""Video Games > Legacy Systems > Nintendo Systems > Wii > Accessories > Fitness Accessories""","""THRUSTMASTER""",3.0,3,
"""B0001ZNU56""","""Spongebob Squarepants, Vol. 1""","""Now you can watch the wild underwater antics of SpongeBob SquarePants on your Game Boy Advance, with…","""Bubblestand: SpongeBob shows Patrick and Squidward his unique talent for blowing bubbles. Squidward …","""Video Games""","""Video Games > Legacy Systems > Nintendo Systems > Game Boy Systems > Game Boy Advance > Games""","""Majesco""",4.4,32,33.98
"""B009C9E8JY""","""Set of 4 Bullet Buttons Nickel+Brass for Playstation PS3 PS2 controllers""","""A set of 4 bullet button to replace all the 4 buttons (Cross, Square, Triangle, Circle) of your PS3 …","""Case Color: Silver Case Material: Nickel Primer Color: Bronze Primer Material: Brass Brand of ammo s…","""Computers""","""Video Games > PlayStation 4 > Accessories > Controllers""","""NEXiLUX""",4.8,4,


In [13]:
# Replace nulls so every field can be concatenated into the item context below
# (a null in any input of a string concatenation would null the whole result).
text_columns = ["title", "description_text", "features_text", "main_category", "categories_text", "store"]

item_df = item_df.with_columns(
    # Text fields: a missing value becomes the empty string.
    pl.col(text_columns).fill_null(""),
    # average_rating and price are loaded as floats. Cast them to strings explicitly
    # before filling with "", rather than relying on fill_null to coerce the dtype.
    pl.col("average_rating", "price").cast(pl.String).fill_null(""),
    # The rating count stays an integer; a missing count is reported as 0.
    pl.col("rating_number").fill_null(0),
)

item_df.head()

parent_asin,title,description_text,features_text,main_category,categories_text,store,average_rating,rating_number,price
str,str,str,str,str,str,str,str,i64,str
"""B000FH0MHO""","""Dash 8-300 Professional Add-On""","""The Dash 8-300 Professional Add-On lets you pilot a real commuter special. Fly two versions of the p…","""Features Dash 8-300 and 8-Q300 ('Q' rollout livery) Airlines - US Airways, South African Express, Ba…","""Video Games""","""Video Games > PC > Games""","""Aerosoft""","""5.0""",1,""""""
"""B00Z9TLVK0""","""NBA 2K17 - Early Tip Off Edition - PlayStation 4""","""Following the record-breaking launch of NBA 2K16, the NBA 2K franchise continues to stake its claim …","""The #1 rated NBA video game simulation series for the last 15 years (Metacritic). The #1 selling NBA…","""Video Games""","""Video Games > PlayStation 4 > Games""","""2K""","""4.3""",223,"""58.0"""
"""B002WH4ZJG""","""Thrustmaster Elite Fitness Pack for Nintendo Wii""","""The Thrustmaster Motion Plus Elite Fitness Pack for Wii is Ideal for Nintendo Wii Fit & Wii Fit Plus…","""Includes (9) Total Accessories Pedometer Wii Fit Balance Board Stepper Floor Mat made from high dens…","""Video Games""","""Video Games > Legacy Systems > Nintendo Systems > Wii > Accessories > Fitness Accessories""","""THRUSTMASTER""","""3.0""",3,""""""
"""B0001ZNU56""","""Spongebob Squarepants, Vol. 1""","""Now you can watch the wild underwater antics of SpongeBob SquarePants on your Game Boy Advance, with…","""Bubblestand: SpongeBob shows Patrick and Squidward his unique talent for blowing bubbles. Squidward …","""Video Games""","""Video Games > Legacy Systems > Nintendo Systems > Game Boy Systems > Game Boy Advance > Games""","""Majesco""","""4.4""",32,"""33.98"""
"""B009C9E8JY""","""Set of 4 Bullet Buttons Nickel+Brass for Playstation PS3 PS2 controllers""","""A set of 4 bullet button to replace all the 4 buttons (Cross, Square, Triangle, Circle) of your PS3 …","""Case Color: Silver Case Material: Nickel Primer Color: Bronze Primer Material: Brass Brand of ammo s…","""Computers""","""Video Games > PlayStation 4 > Accessories > Controllers""","""NEXiLUX""","""4.8""",4,""""""


In [14]:
# The item context is one text blob per item: each (label, column) pair below contributes
# the literal label followed by that column's value, in this order.
context_layout = [
    ("Product: ", "title"),
    ("\n\nDescription: ", "description_text"),
    ("\n\nFeatures: ", "features_text"),
    ("\n\nCategory: ", "main_category"),
    (", Category tree: ", "categories_text"),
    ("\n\nStore: ", "store"),
    ("\n\nAverage rating: ", "average_rating"),
    (", Rating count: ", "rating_number"),
    ("\n\nPrice: ", "price"),
]

context_pieces = []
for label, column in context_layout:
    context_pieces.append(pl.lit(label))
    context_pieces.append(pl.col(column))

item_df = item_df.with_columns(pl.concat_str(context_pieces).alias("item_context"))

item_df.head()

parent_asin,title,description_text,features_text,main_category,categories_text,store,average_rating,rating_number,price,item_context
str,str,str,str,str,str,str,str,i64,str,str
"""B000FH0MHO""","""Dash 8-300 Professional Add-On""","""The Dash 8-300 Professional Add-On lets you pilot a real commuter special. Fly two versions of the p…","""Features Dash 8-300 and 8-Q300 ('Q' rollout livery) Airlines - US Airways, South African Express, Ba…","""Video Games""","""Video Games > PC > Games""","""Aerosoft""","""5.0""",1,"""""","""Product: Dash 8-300 Professional Add-On Description: The Dash 8-300 Professional Add-On lets you pi…"
"""B00Z9TLVK0""","""NBA 2K17 - Early Tip Off Edition - PlayStation 4""","""Following the record-breaking launch of NBA 2K16, the NBA 2K franchise continues to stake its claim …","""The #1 rated NBA video game simulation series for the last 15 years (Metacritic). The #1 selling NBA…","""Video Games""","""Video Games > PlayStation 4 > Games""","""2K""","""4.3""",223,"""58.0""","""Product: NBA 2K17 - Early Tip Off Edition - PlayStation 4 Description: Following the record-breakin…"
"""B002WH4ZJG""","""Thrustmaster Elite Fitness Pack for Nintendo Wii""","""The Thrustmaster Motion Plus Elite Fitness Pack for Wii is Ideal for Nintendo Wii Fit & Wii Fit Plus…","""Includes (9) Total Accessories Pedometer Wii Fit Balance Board Stepper Floor Mat made from high dens…","""Video Games""","""Video Games > Legacy Systems > Nintendo Systems > Wii > Accessories > Fitness Accessories""","""THRUSTMASTER""","""3.0""",3,"""""","""Product: Thrustmaster Elite Fitness Pack for Nintendo Wii Description: The Thrustmaster Motion Plus…"
"""B0001ZNU56""","""Spongebob Squarepants, Vol. 1""","""Now you can watch the wild underwater antics of SpongeBob SquarePants on your Game Boy Advance, with…","""Bubblestand: SpongeBob shows Patrick and Squidward his unique talent for blowing bubbles. Squidward …","""Video Games""","""Video Games > Legacy Systems > Nintendo Systems > Game Boy Systems > Game Boy Advance > Games""","""Majesco""","""4.4""",32,"""33.98""","""Product: Spongebob Squarepants, Vol. 1 Description: Now you can watch the wild underwater antics of…"
"""B009C9E8JY""","""Set of 4 Bullet Buttons Nickel+Brass for Playstation PS3 PS2 controllers""","""A set of 4 bullet button to replace all the 4 buttons (Cross, Square, Triangle, Circle) of your PS3 …","""Case Color: Silver Case Material: Nickel Primer Color: Bronze Primer Material: Brass Brand of ammo s…","""Computers""","""Video Games > PlayStation 4 > Accessories > Controllers""","""NEXiLUX""","""4.8""",4,"""""","""Product: Set of 4 Bullet Buttons Nickel+Brass for Playstation PS3 PS2 controllers Description: A se…"


In [15]:
# Log the full context of the first item to eyeball the layout.
logger.info(item_df.select("item_context").head(1).item())

18:08:23 - Product: Dash 8-300 Professional Add-On

Description: The Dash 8-300 Professional Add-On lets you pilot a real commuter special. Fly two versions of the popular Dash 8-300 in a total of 17 different liveries. The Dash 8-300 is one of the most popular short-haul aircraft available and this superbly modelled version from acclaimed aircraft developers PSS is modelled in two versions with a total of 17 different liveries. The package also includes scenery for three European airports, tutorials, tutorial flights and utilities together in one fantastic package.

Features: Features Dash 8-300 and 8-Q300 ('Q' rollout livery) Airlines - US Airways, South African Express, Bahamasair, Augsburg Airways, Lufthansa Cityline, British Airways (Union Jack), British European, FlyBe, Intersky, Wideroe, Iberia, Tyrolean, QantasLink, BWIA Airports include - London City, Frankfurt, Milan and Amsterdam Schipol Includes PSS PanelConfig and LoadEdit tools

Category: Video Games, Category tree: Video

## Load sequences

In [16]:
# Load the benchmark training sequences from the data directory. The path is built from
# DATA_DIR, like every other path in this notebook, instead of a hard-coded "../data".
# NOTE(review): depending on how the file was downloaded, its content may be gzipped or
# already-decompressed CSV despite the ".csv.gz" name; pl.read_csv is expected to
# handle both — confirm for the installed Polars version.
df = pl.read_csv(DATA_DIR / f"{CATEGORY}.train.csv.gz")

# Display basic information about the dataset
logger.info(f"Dataset shape: {df.shape}")
logger.info(f"Columns: {df.columns}")

df.head()

18:08:23 - Dataset shape: (736827, 5)


18:08:23 - Columns: ['user_id', 'parent_asin', 'rating', 'timestamp', 'history']


user_id,parent_asin,rating,timestamp,history
str,str,f64,i64,str
"""AEVPPTMG43C6GWSR7I2UGRQN7WFQ""","""B08R5B7YS4""",1.0,1611459666223,
"""AEVPPTMG43C6GWSR7I2UGRQN7WFQ""","""B0863MT183""",4.0,1613701986538,"""B08R5B7YS4"""
"""AEVPPTMG43C6GWSR7I2UGRQN7WFQ""","""B08P8P7686""",5.0,1613702112995,"""B08R5B7YS4 B0863MT183"""
"""AEVPPTMG43C6GWSR7I2UGRQN7WFQ""","""B0B7LV3DN2""",4.0,1617641445475,"""B08R5B7YS4 B0863MT183 B08P8P7686"""
"""AEVPPTMG43C6GWSR7I2UGRQN7WFQ""","""B09WMQ6DXG""",5.0,1620231368468,"""B08R5B7YS4 B0863MT183 B08P8P7686 B0B7LV3DN2"""


In [17]:
# Each user appears on several rows with a growing history; keep, per user,
# only the row with the longest history.

# Number of whitespace-separated item IDs in the history (0 when there is no history).
df = df.with_columns(
    pl.when(pl.col("history").is_null())
    .then(0)
    .otherwise(pl.col("history").str.count_matches(r"\S+"))
    .alias("history_length")
)

logger.info(f"Original dataset num rows: {df.shape[0]:,}")

# Sort so the longest history comes first within each user, then take the first row per
# user. maintain_order=True keeps the groups in the sorted user_id order, so the output
# row order (and the saved parquet) is the same on every run.
df = (
    df.sort(["user_id", "history_length"], descending=[False, True])
    .group_by("user_id", maintain_order=True)
    .first()
    .drop("history_length")
)

logger.info(f"Deduplicated dataset num rows: {df.shape[0]:,}")
logger.info(f"Number of unique users: {df.n_unique('user_id'):,}")

18:08:23 - Original dataset num rows: 736,827


18:08:23 - Deduplicated dataset num rows: 91,562


18:08:23 - Number of unique users: 91,562


In [18]:
# Full sequence per user = the space-separated history split into a list,
# followed by the row's own item (parent_asin) as the last element.
history_items = pl.col("history").str.split(" ")
target_item = pl.col("parent_asin")
df = df.with_columns(history_items.list.concat(target_item).alias("sequence"))

df.head()

user_id,parent_asin,rating,timestamp,history,sequence
str,str,f64,i64,str,list[str]
"""AE222HFZDH6BPTYFOUWGGU63YSIQ""","""B0BW17W9GM""",5.0,1593366227132,"""B082R1RGZF B07SNN8GV5 B01GY35T4S B07QX99XJJ""","[""B082R1RGZF"", ""B07SNN8GV5"", … ""B0BW17W9GM""]"
"""AE2252DKW4XJIZP5QPFMQVJBVRTA""","""B07JH3LSHN""",5.0,1562210954523,"""B0050SX4CI B002ORTCAQ B0090ECASW B004OYV7ZU B00F27JGVA B01N6N3J8D B01KV3BB0S B0C5K2TWD8""","[""B0050SX4CI"", ""B002ORTCAQ"", … ""B07JH3LSHN""]"
"""AE225O22SA7DLBOGOEIFL7FT5VYQ""","""B0053BCML6""",4.0,1370812046000,"""B00005YTYC B00029QOQS B0006B7DXA B001LETH2Q B0009XEC02 B000NNDN1M B00136MBHA B007VTVRFA""","[""B00005YTYC"", ""B00029QOQS"", … ""B0053BCML6""]"
"""AE227CCN4C37WTOB3J2TZPOKLEQQ""","""B001QCWRWK""",1.0,1442558286000,"""B0049U4DXM B002V8KA72 B0000NSZMM B000KQLDP0 B0032C9V7G B001EYUPHO B000WCCURW B00LZVNWIA B00AYZMZ9K B…","[""B0049U4DXM"", ""B002V8KA72"", … ""B001QCWRWK""]"
"""AE22BPPZGGRTSYOHK2J3LCG5HGAQ""","""B0053BCML6""",5.0,1418058672000,"""B00KVP3OY8 B07K3KHFSY B00KVP76G0 B00KVOVBGM""","[""B00KVP3OY8"", ""B07K3KHFSY"", … ""B0053BCML6""]"


## Filter items without metadata

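We remove from each user's sequence every item that lacks valid metadata, i.e. that did not pass the title and description length checks above. This ensures we only work with items that have sufficient information for generating semantic representations. Users whose filtered sequence has fewer than `MIN_SEQUENCE_LENGTH` items are then dropped.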

In [19]:
# Function to filter sequence items based on valid metadata
def filter_sequence_items(sequence_list, valid_items_set):
    """Return the items of ``sequence_list`` that are in ``valid_items_set``, in order.

    Duplicates are kept. Returns ``None`` when ``sequence_list`` is ``None`` or
    when no item survives the filter.
    """
    if sequence_list is None:
        return None

    kept_items = []
    for candidate in sequence_list:
        if candidate in valid_items_set:
            kept_items.append(candidate)

    # An empty result is reported as None rather than [].
    return kept_items or None


# Remove, from every user's sequence, the items that are not in valid_items.
# The sequence becomes null when nothing is left (filter_sequence_items returns None).
def _keep_valid_items(sequence):
    return filter_sequence_items(sequence, valid_items)


df = df.with_columns(
    pl.col("sequence").map_elements(_keep_valid_items, return_dtype=pl.List(pl.String)).alias("sequence")
)

# Keep only users whose filtered sequence still has at least MIN_SEQUENCE_LENGTH items.
# A row whose target item (parent_asin) lacks valid metadata is not dropped outright:
# that item is simply removed from the sequence like any other.
rows_before_filtering = df.shape[0]
logger.info(f"Rows before filtering: {rows_before_filtering:,}")

has_sequence = pl.col("sequence").is_not_null()
is_long_enough = pl.col("sequence").list.len() >= MIN_SEQUENCE_LENGTH
df = df.filter(has_sequence & is_long_enough)

# Log statistics
rows_removed = rows_before_filtering - df.shape[0]
logger.info(f"Rows after filtering: {df.shape[0]:,}")
logger.info(f"Rows removed: {rows_removed:,} ({rows_removed / rows_before_filtering * 100:.1f}%)")

18:08:24 - Rows before filtering: 91,562


18:08:24 - Rows after filtering: 78,643


18:08:24 - Rows removed: 12,919 (14.1%)


## Truncate long sequences

For users whose sequence is longer than `MAX_SEQUENCE_LENGTH`, we keep only the last `MAX_SEQUENCE_LENGTH` items. This caps the sequence length and focuses on each user's most recent interactions.

In [20]:
# Count the sequences that exceed the cap before they are shortened.
sequences_truncated = (df["sequence"].list.len() > MAX_SEQUENCE_LENGTH).sum()
pct_truncated = sequences_truncated / len(df) * 100

# Keep the last MAX_SEQUENCE_LENGTH items of every sequence and record the resulting length.
last_items = pl.col("sequence").list.tail(MAX_SEQUENCE_LENGTH)
df = df.with_columns(
    last_items.alias("sequence"),
    last_items.list.len().alias("sequence_length"),
)

logger.info(f"Sequences truncated: {sequences_truncated:,} ({pct_truncated:.1f}%)")

lengths = df["sequence_length"]
logger.info(
    f"Sequence lengths - Min: {lengths.min()}, Max: {lengths.max()}, Mean: {lengths.mean():.1f}, Median: {lengths.median()}"
)

18:08:24 - Sequences truncated: 28 (0.0%)


18:08:24 - Sequence lengths - Min: 3, Max: 100, Mean: 6.5, Median: 5.0


In [21]:
# Distribution of sequence lengths: count, share of users, and running share,
# for the ten shortest lengths.
length_counts = df.group_by("sequence_length").len().sort("sequence_length")
length_distribution = length_counts.with_columns(
    (pl.col("len") / pl.sum("len")).alias("probability")
).with_columns(pl.col("probability").cum_sum().alias("cumulative_probability"))
length_distribution.head(10)

sequence_length,len,probability,cumulative_probability
u32,u32,f64,f64
3,13154,0.167262,0.167262
4,17458,0.221991,0.389253
5,15689,0.199496,0.588749
6,9288,0.118103,0.706852
7,5876,0.074717,0.78157
8,4018,0.051092,0.832662
9,2824,0.035909,0.868571
10,2036,0.025889,0.89446
11,1508,0.019175,0.913635
12,1180,0.015005,0.92864


In [22]:
# Keep only the user ID, the filtered sequence, and its length.
df = df.select("user_id", "sequence", "sequence_length")
df.head()

user_id,sequence,sequence_length
str,list[str],u32
"""AE222HFZDH6BPTYFOUWGGU63YSIQ""","[""B082R1RGZF"", ""B07SNN8GV5"", … ""B0BW17W9GM""]",5
"""AE2252DKW4XJIZP5QPFMQVJBVRTA""","[""B002ORTCAQ"", ""B0090ECASW"", … ""B07JH3LSHN""]",7
"""AE225O22SA7DLBOGOEIFL7FT5VYQ""","[""B00029QOQS"", ""B0006B7DXA"", … ""B0053BCML6""]",7
"""AE227CCN4C37WTOB3J2TZPOKLEQQ""","[""B0049U4DXM"", ""B002V8KA72"", … ""B001QCWRWK""]",9
"""AE22BPPZGGRTSYOHK2J3LCG5HGAQ""","[""B00KVP3OY8"", ""B07K3KHFSY"", … ""B0053BCML6""]",4


## Save the processed data

Now we'll save the filtered data for use in subsequent steps of the semantic ID generation pipeline.

In [23]:
# Last look at the sequences frame before it is written to disk.
df.head(5)

user_id,sequence,sequence_length
str,list[str],u32
"""AE222HFZDH6BPTYFOUWGGU63YSIQ""","[""B082R1RGZF"", ""B07SNN8GV5"", … ""B0BW17W9GM""]",5
"""AE2252DKW4XJIZP5QPFMQVJBVRTA""","[""B002ORTCAQ"", ""B0090ECASW"", … ""B07JH3LSHN""]",7
"""AE225O22SA7DLBOGOEIFL7FT5VYQ""","[""B00029QOQS"", ""B0006B7DXA"", … ""B0053BCML6""]",7
"""AE227CCN4C37WTOB3J2TZPOKLEQQ""","[""B0049U4DXM"", ""B002V8KA72"", … ""B001QCWRWK""]",9
"""AE22BPPZGGRTSYOHK2J3LCG5HGAQ""","[""B00KVP3OY8"", ""B07K3KHFSY"", … ""B0053BCML6""]",4


In [24]:
# Last look at the item metadata frame before it is written to disk.
item_df.head(5)

parent_asin,title,description_text,features_text,main_category,categories_text,store,average_rating,rating_number,price,item_context
str,str,str,str,str,str,str,str,i64,str,str
"""B000FH0MHO""","""Dash 8-300 Professional Add-On""","""The Dash 8-300 Professional Add-On lets you pilot a real commuter special. Fly two versions of the p…","""Features Dash 8-300 and 8-Q300 ('Q' rollout livery) Airlines - US Airways, South African Express, Ba…","""Video Games""","""Video Games > PC > Games""","""Aerosoft""","""5.0""",1,"""""","""Product: Dash 8-300 Professional Add-On Description: The Dash 8-300 Professional Add-On lets you pi…"
"""B00Z9TLVK0""","""NBA 2K17 - Early Tip Off Edition - PlayStation 4""","""Following the record-breaking launch of NBA 2K16, the NBA 2K franchise continues to stake its claim …","""The #1 rated NBA video game simulation series for the last 15 years (Metacritic). The #1 selling NBA…","""Video Games""","""Video Games > PlayStation 4 > Games""","""2K""","""4.3""",223,"""58.0""","""Product: NBA 2K17 - Early Tip Off Edition - PlayStation 4 Description: Following the record-breakin…"
"""B002WH4ZJG""","""Thrustmaster Elite Fitness Pack for Nintendo Wii""","""The Thrustmaster Motion Plus Elite Fitness Pack for Wii is Ideal for Nintendo Wii Fit & Wii Fit Plus…","""Includes (9) Total Accessories Pedometer Wii Fit Balance Board Stepper Floor Mat made from high dens…","""Video Games""","""Video Games > Legacy Systems > Nintendo Systems > Wii > Accessories > Fitness Accessories""","""THRUSTMASTER""","""3.0""",3,"""""","""Product: Thrustmaster Elite Fitness Pack for Nintendo Wii Description: The Thrustmaster Motion Plus…"
"""B0001ZNU56""","""Spongebob Squarepants, Vol. 1""","""Now you can watch the wild underwater antics of SpongeBob SquarePants on your Game Boy Advance, with…","""Bubblestand: SpongeBob shows Patrick and Squidward his unique talent for blowing bubbles. Squidward …","""Video Games""","""Video Games > Legacy Systems > Nintendo Systems > Game Boy Systems > Game Boy Advance > Games""","""Majesco""","""4.4""",32,"""33.98""","""Product: Spongebob Squarepants, Vol. 1 Description: Now you can watch the wild underwater antics of…"
"""B009C9E8JY""","""Set of 4 Bullet Buttons Nickel+Brass for Playstation PS3 PS2 controllers""","""A set of 4 bullet button to replace all the 4 buttons (Cross, Square, Triangle, Circle) of your PS3 …","""Case Color: Silver Case Material: Nickel Primer Color: Bronze Primer Material: Brass Brand of ammo s…","""Computers""","""Video Games > PlayStation 4 > Accessories > Controllers""","""NEXiLUX""","""4.8""",4,"""""","""Product: Set of 4 Bullet Buttons Nickel+Brass for Playstation PS3 PS2 controllers Description: A se…"


In [25]:
# Both outputs go to <data dir>/output. Create the folder first: write_parquet does not
# create missing parent directories, so a fresh checkout would fail here otherwise.
output_dir = DATA_DIR / "output"
output_dir.mkdir(parents=True, exist_ok=True)

# Save the filtered sequences with full history
output_path = output_dir / f"{CATEGORY}_sequences.parquet"
df.write_parquet(output_path)
logger.info(f"Saved filtered sequences to: {output_path} (rows = {df.shape[0]:,})")

# Save the valid items metadata
metadata_output_path = output_dir / f"{CATEGORY}_items.parquet"
item_df.write_parquet(metadata_output_path)
logger.info(f"Saved valid item metadata to: {metadata_output_path} (rows = {len(item_df):,})")

18:08:24 - Saved filtered sequences to: /Users/eugeneyan/projects/semantic-id/data/output/Video_Games_sequences.parquet (rows = 78,643)


18:08:24 - Saved valid item metadata to: /Users/eugeneyan/projects/semantic-id/data/output/Video_Games_items.parquet (rows = 66,133)
