# BERT4Rec Experiment: Fashion Recommender on H&M Dataset

This notebook implements a **BERT4Rec-style sequential recommender** on top of the existing H&M pipeline, optimized to run locally on a **MacBook M4 Air (16GB)**:

- Start from processed data (transactions + features) under `fashion_recommender_candidate_generation_2`.
- Build per-user interaction sequences with careful filtering and capping for memory efficiency.
- Train a lightweight BERT4Rec model in **PyTorch** using the Apple Silicon **MPS** backend when available.
- Evaluate ranking quality with **MAP@12** on the existing validation/test candidate sets.



In [None]:
# %% [code]
"""Core imports and configuration for BERT4Rec on H&M, optimized for M4 Air (16 GB)."""

import os
from pathlib import Path

import numpy as np
import pandas as pd
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from tqdm.auto import tqdm

# ---------------------------------------------------------------------------
# Paths and basic config
# ---------------------------------------------------------------------------
BASE_PATH = Path('/Users/raghu/Desktop/Quarter_1/CSE_258R/assignment2')
PROC_PATH = BASE_PATH / 'fashion_recommender_candidate_generation_2'
MODELS_PATH = PROC_PATH / 'models'
FEATURES_PATH = BASE_PATH / 'fashion_recommender_features_2'

# These can be tweaked if your local column names differ
USER_COL = 'customer_id'
ITEM_COL = 'article_id'
TIME_COL = 't_dat'            # from *_transactions.parquet
LABEL_COL = 'label'           # used in candidate sets for MAP@12

# Device: prefer Apple Silicon GPU (MPS) when available
if torch.backends.mps.is_available():
    DEVICE = torch.device('mps')
elif torch.cuda.is_available():
    DEVICE = torch.device('cuda')
else:
    DEVICE = torch.device('cpu')

# For evaluation (MAP@12), force CPU to avoid missing MPS ops in Transformer
EVAL_DEVICE = torch.device('cpu')

print(f"Using device for training: {DEVICE}")
print(f"Using device for evaluation: {EVAL_DEVICE}")

# Reproducibility
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(SEED)

# ---------------------------------------------------------------------------
# Sequence building hyperparameters tuned for 16 GB M4 Air
# ---------------------------------------------------------------------------
MAX_SEQ_LEN = 100        # cap history length per user for memory/speed
MIN_USER_INTERACTIONS = 5  # drop very sparse users
MAX_USERS = 300_000        # optional cap on number of users for experiments
BATCH_SIZE = 256

print("Config ready")

Using device: mps
Config ready


In [6]:
# %% [code]
"""Build per-user interaction sequences from transactions for BERT4Rec.

We:
- Use train_transactions.parquet as the source of positive interactions.
- Optionally intersect users/items with training_features.parquet for consistency.
- Filter users with very short histories and cap max sequence length.
This keeps memory down while leveraging the rich H&M dataset.
"""

# Paths to raw interaction data and features
train_tx_path = PROC_PATH / 'train_transactions.parquet'
val_tx_path = PROC_PATH / 'val_transactions.parquet'
train_feat_path = PROC_PATH / 'training_features.parquet'  # used mainly for ID overlap if needed

print(f"Train transactions: {train_tx_path}")
print(f"Val transactions:   {val_tx_path}")
print(f"Training features:  {train_feat_path}")

# Load transactions with only necessary columns to save memory
train_tx = pd.read_parquet(train_tx_path, columns=[USER_COL, ITEM_COL, TIME_COL])
val_tx = pd.read_parquet(val_tx_path, columns=[USER_COL, ITEM_COL, TIME_COL])

# Ensure proper dtypes
for df in (train_tx, val_tx):
    df[USER_COL] = df[USER_COL].astype('category')
    df[ITEM_COL] = df[ITEM_COL].astype('category')
    df[TIME_COL] = pd.to_datetime(df[TIME_COL])

print(f"Train transactions shape: {train_tx.shape}")
print(f"Val transactions shape:   {val_tx.shape}")

# Optionally, intersect with users present in training_features to align with existing pipeline
try:
    train_feats_head = pd.read_parquet(train_feat_path, columns=[USER_COL, ITEM_COL]).head(1_000_000)
    feat_users = set(train_feats_head[USER_COL].unique())
    train_tx = train_tx[train_tx[USER_COL].isin(feat_users)]
    val_tx = val_tx[val_tx[USER_COL].isin(feat_users)]
    print(f"After aligning with training_features (head), train shape: {train_tx.shape}, val shape: {val_tx.shape}")
except Exception as e:
    print(f"Could not align with training_features (optional step), proceeding anyway: {e}")

# Build item and user integer vocabularies
all_items = pd.concat([train_tx[ITEM_COL], val_tx[ITEM_COL]]).unique()
item2idx = {item: idx + 3 for idx, item in enumerate(all_items)}  # reserve 0:[PAD], 1:[MASK], 2:[UNK]
idx2item = {idx: item for item, idx in item2idx.items()}

all_users = train_tx[USER_COL].unique()
if MAX_USERS is not None and len(all_users) > MAX_USERS:
    # Subsample users for feasibility
    all_users = np.random.choice(all_users, size=MAX_USERS, replace=False)

user_set = set(all_users)
train_tx = train_tx[train_tx[USER_COL].isin(user_set)]
val_tx = val_tx[val_tx[USER_COL].isin(user_set)]

user2idx = {user: idx for idx, user in enumerate(all_users)}
idx2user = {idx: user for user, idx in user2idx.items()}

n_users = len(user2idx)
n_items = len(item2idx) + 3  # including special tokens
print(f"Users: {n_users:,}, Items (including specials): {n_items:,}")

# Sort interactions by time and build sequences
train_tx = train_tx.sort_values([USER_COL, TIME_COL])

user_sequences = {}
for user, group in train_tx.groupby(USER_COL, sort=False):
    item_ids = [item2idx.get(i, 2) for i in group[ITEM_COL].tolist()]  # 2 = [UNK]
    if len(item_ids) < MIN_USER_INTERACTIONS:
        continue
    # Keep only the most recent MAX_SEQ_LEN interactions
    if len(item_ids) > MAX_SEQ_LEN:
        item_ids = item_ids[-MAX_SEQ_LEN:]
    user_sequences[user2idx[user]] = item_ids

print(f"Built sequences for {len(user_sequences):,} users with >= {MIN_USER_INTERACTIONS} interactions")


Train transactions: /Users/raghu/Desktop/Quarter_1/CSE_258R/assignment2/fashion_recommender_candidate_generation_2/train_transactions.parquet
Val transactions:   /Users/raghu/Desktop/Quarter_1/CSE_258R/assignment2/fashion_recommender_candidate_generation_2/val_transactions.parquet
Training features:  /Users/raghu/Desktop/Quarter_1/CSE_258R/assignment2/fashion_recommender_candidate_generation_2/training_features.parquet
Train transactions shape: (412156, 3)
Val transactions shape:   (16480, 3)
After aligning with training_features (head), train shape: (101761, 3), val shape: (3043, 3)
Users: 11,766, Items (including specials): 14,571


  for user, group in train_tx.groupby(USER_COL, sort=False):


Built sequences for 6,025 users with >= 5 interactions


In [7]:
# %% [code]
"""Dataset and BERT4Rec model definition.

We implement a minimal BERT4Rec-style masked item prediction model:
- Input: tokenized user sequence with [PAD]=0, [MASK]=1, [UNK]=2.
- Randomly mask a fraction of positions and predict original items.
- For inference, we typically mask the last position to get next-item scores.
"""

MASK_TOKEN_ID = 1
PAD_TOKEN_ID = 0
UNK_TOKEN_ID = 2


class Bert4RecDataset(Dataset):
    """Generates masked sequences for BERT4Rec from pre-built user sequences.

    user_sequences: dict[user_idx -> list[int]]
    """

    def __init__(self, user_sequences, max_seq_len=MAX_SEQ_LEN, mask_prob=0.15):
        self.user_ids = list(user_sequences.keys())
        self.sequences = user_sequences
        self.max_seq_len = max_seq_len
        self.mask_prob = mask_prob

    def __len__(self):
        return len(self.user_ids)

    def __getitem__(self, idx):
        user_id = self.user_ids[idx]
        seq = self.sequences[user_id]

        # Pad/truncate to max_seq_len
        seq = seq[-self.max_seq_len :]
        pad_len = self.max_seq_len - len(seq)
        if pad_len > 0:
            seq = [PAD_TOKEN_ID] * pad_len + seq

        input_ids = np.array(seq, dtype=np.int64)
        labels = np.full_like(input_ids, fill_value=-100)  # ignore index

        # Apply BERT-style masking
        mask = np.random.rand(len(input_ids)) < self.mask_prob
        # don't mask PAD tokens
        mask[input_ids == PAD_TOKEN_ID] = False

        for i in range(len(input_ids)):
            if not mask[i]:
                continue
            original_id = input_ids[i]
            labels[i] = original_id
            prob = np.random.rand()
            if prob < 0.8:
                input_ids[i] = MASK_TOKEN_ID
            elif prob < 0.9:
                # replace with random item id (excluding specials)
                input_ids[i] = np.random.randint(3, n_items)
            else:
                # keep original
                pass

        return {
            'user_id': np.int64(user_id),
            'input_ids': torch.from_numpy(input_ids),
            'labels': torch.from_numpy(labels),
        }


def collate_bert4rec(batch):
    input_ids = torch.stack([b['input_ids'] for b in batch], dim=0)
    labels = torch.stack([b['labels'] for b in batch], dim=0)
    user_ids = torch.tensor([b['user_id'] for b in batch], dtype=torch.long)
    return {
        'user_ids': user_ids,
        'input_ids': input_ids,
        'labels': labels,
    }


class Bert4RecModel(nn.Module):
    def __init__(
        self,
        num_items: int,
        d_model: int = 128,
        n_heads: int = 4,
        n_layers: int = 2,
        max_seq_len: int = MAX_SEQ_LEN,
        dropout: float = 0.1,
    ):
        super().__init__()
        self.num_items = num_items
        self.d_model = d_model
        self.max_seq_len = max_seq_len

        self.item_embedding = nn.Embedding(num_items, d_model, padding_idx=PAD_TOKEN_ID)
        self.pos_embedding = nn.Embedding(max_seq_len, d_model)

        encoder_layer = nn.TransformerEncoderLayer(
            d_model=d_model,
            nhead=n_heads,
            dim_feedforward=4 * d_model,
            dropout=dropout,
            batch_first=True,
        )
        self.encoder = nn.TransformerEncoder(encoder_layer, num_layers=n_layers)

        self.layer_norm = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)

        # Prediction head: project hidden states back to item vocab
        self.output_layer = nn.Linear(d_model, num_items)

    def forward(self, input_ids):
        """input_ids: (batch, seq_len)"""
        batch_size, seq_len = input_ids.shape
        device = input_ids.device

        positions = torch.arange(seq_len, device=device).unsqueeze(0).expand(batch_size, seq_len)

        x = self.item_embedding(input_ids) + self.pos_embedding(positions)
        x = self.layer_norm(x)
        x = self.dropout(x)

        # Generate padding mask: True for PAD positions
        pad_mask = input_ids.eq(PAD_TOKEN_ID)

        x = self.encoder(x, src_key_padding_mask=pad_mask)
        logits = self.output_layer(x)
        return logits


# Instantiate model
model = Bert4RecModel(num_items=n_items, d_model=128, n_heads=4, n_layers=2, max_seq_len=MAX_SEQ_LEN)
model.to(DEVICE)

print(model)


Bert4RecModel(
  (item_embedding): Embedding(14571, 128, padding_idx=0)
  (pos_embedding): Embedding(100, 128)
  (encoder): TransformerEncoder(
    (layers): ModuleList(
      (0-1): 2 x TransformerEncoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=128, out_features=128, bias=True)
        )
        (linear1): Linear(in_features=128, out_features=512, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
        (linear2): Linear(in_features=512, out_features=128, bias=True)
        (norm1): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
        (norm2): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
        (dropout1): Dropout(p=0.1, inplace=False)
        (dropout2): Dropout(p=0.1, inplace=False)
      )
    )
  )
  (layer_norm): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
  (dropout): Dropout(p=0.1, inplace=False)
  (output_layer): Linear(in_features=128, out_features=14571, bias=True

In [None]:
# %% [code]
"""Training loop and MAP@12 evaluation on validation/test candidate sets.

We assume candidate sets in:
- MODELS_PATH / 'val_data.parquet'
- MODELS_PATH / 'test_data.parquet'
with columns [USER_COL, ITEM_COL, LABEL_COL] where LABEL_COL is 1 for the
true purchased item(s) and 0 otherwise.
"""

from collections import defaultdict


def get_dataloaders():
    dataset = Bert4RecDataset(user_sequences, max_seq_len=MAX_SEQ_LEN, mask_prob=0.15)
    loader = DataLoader(
        dataset,
        batch_size=BATCH_SIZE,
        shuffle=True,
        num_workers=0,
        collate_fn=collate_bert4rec,
    )
    return loader


train_loader = get_dataloaders()


def train_one_epoch(model, data_loader, optimizer, scheduler=None, epoch: int = 1, total_epochs: int = 1):
    model.train()
    total_loss = 0.0
    criterion = nn.CrossEntropyLoss(ignore_index=-100)

    progress_bar = tqdm(data_loader, desc=f"Epoch {epoch}/{total_epochs}", leave=False)
    for batch in progress_bar:
        input_ids = batch['input_ids'].to(DEVICE)
        labels = batch['labels'].to(DEVICE)

        optimizer.zero_grad()
        logits = model(input_ids)  # (batch, seq_len, n_items)

        loss = criterion(logits.view(-1, logits.size(-1)), labels.view(-1))
        loss.backward()

        # Gradient clipping for stability on small devices
        nn.utils.clip_grad_norm_(model.parameters(), max_norm=5.0)

        optimizer.step()
        if scheduler is not None:
            scheduler.step()

        total_loss += loss.item() * input_ids.size(0)
        progress_bar.set_postfix({"loss": f"{loss.item():.4f}"})

    return total_loss / len(data_loader.dataset)


def build_user_latest_context(val_df):
    """Build latest sequence context per user from train+val transactions.

    For each user in val/test, we take their sequence from user_sequences
    (based on train data). If needed, you can also append their earliest
    val interactions for more context.
    """
    users_in_val = val_df[USER_COL].unique()
    user_context = {}
    for user in users_in_val:
        if user not in user2idx:
            continue
        uidx = user2idx[user]
        seq = user_sequences.get(uidx, [])
        # Pad/truncate
        seq = seq[-MAX_SEQ_LEN:]
        pad_len = MAX_SEQ_LEN - len(seq)
        if pad_len > 0:
            seq = [PAD_TOKEN_ID] * pad_len + seq
        user_context[uidx] = np.array(seq, dtype=np.int64)
    return user_context


def score_candidates(model, candidate_df, user_context):
    """Score candidate (user, item) pairs using BERT4Rec next-item scores.

    We:
    - For each user, take their context sequence and mask the last position.
    - Run model once per user to get logits over items at last position.
    - Use the resulting scores to rank candidate items.

    This function assumes `model` is already on `EVAL_DEVICE` (CPU).
    """
    model.eval()

    # Build mapping: user_idx -> list of (row_idx, item_idx)
    user_to_items = defaultdict(list)
    for row_idx, row in candidate_df.iterrows():
        user = row[USER_COL]
        item = row[ITEM_COL]
        if user not in user2idx:
            continue
        uidx = user2idx[user]
        item_idx = item2idx.get(item, UNK_TOKEN_ID)
        user_to_items[uidx].append((row_idx, item_idx))

    scores = np.zeros(len(candidate_df), dtype=np.float32)

    with torch.no_grad():
        for uidx, pairs in user_to_items.items():
            context = user_context.get(uidx)
            if context is None:
                continue

            # Mask last non-PAD position
            seq = context.copy()
            last_pos = np.where(seq != PAD_TOKEN_ID)[0]
            if len(last_pos) == 0:
                continue
            last_pos = last_pos[-1]
            original_token = seq[last_pos]
            seq[last_pos] = MASK_TOKEN_ID

            input_ids = torch.from_numpy(seq[None, :]).to(EVAL_DEVICE)
            logits = model(input_ids)  # (1, seq_len, n_items)
            logits_last = logits[0, last_pos]  # (n_items,)

            # Convert to CPU numpy
            logits_last = logits_last.detach().cpu().numpy()

            for row_idx, item_idx in pairs:
                if item_idx >= len(logits_last):
                    scores[row_idx] = -1e9
                else:
                    scores[row_idx] = logits_last[item_idx]

    return scores


def map_at_k(df, user_col, label_col, score_col, k=12):
    """Compute MAP@k given a dataframe with per-(user, item) scores and labels."""
    df = df.sort_values([user_col, score_col], ascending=[True, False])

    ap_sum = 0.0
    n_users = 0

    for user, group in df.groupby(user_col, sort=False):
        labels = group[label_col].values
        scores = group[score_col].values  # noqa: F841

        # Indices of positives within top-k
        topk = min(k, len(labels))
        rel = labels[:topk]
        if rel.sum() == 0:
            continue

        # AP@k
        precisions = []
        hits = 0
        for i in range(topk):
            if rel[i] == 1:
                hits += 1
                precisions.append(hits / (i + 1))
        if precisions:
            ap = np.mean(precisions)
            ap_sum += ap
            n_users += 1

    return ap_sum / max(n_users, 1)


def evaluate_map12(model, split='val'):
    fname = 'val_data.parquet' if split == 'val' else 'test_data.parquet'
    path = MODELS_PATH / fname
    print(f"Loading candidate data from {path}")

    candidate_df = pd.read_parquet(path)

    # Move model to eval device (CPU) for scoring to avoid missing MPS ops
    model_was_training = model.training
    model.to(EVAL_DEVICE)
    model.eval()

    # Build user context from train sequences (could be cached)
    user_context = build_user_latest_context(candidate_df)

    # Score candidates
    scores = score_candidates(model, candidate_df, user_context)
    candidate_df = candidate_df.copy()
    candidate_df['score'] = scores

    map12 = map_at_k(candidate_df, user_col=USER_COL, label_col=LABEL_COL, score_col='score', k=12)
    print(f"{split.upper()} MAP@12: {map12:.6f}")

    # Move model back to training device if needed
    model.to(DEVICE)
    if model_was_training:
        model.train()

    return map12


# Simple training loop over a few epochs
EPOCHS = 5
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3, weight_decay=1e-5)
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=EPOCHS * len(train_loader))

for epoch in range(1, EPOCHS + 1):
    train_loss = train_one_epoch(model, train_loader, optimizer, scheduler, epoch=epoch, total_epochs=EPOCHS)
    print(f"Epoch {epoch}/{EPOCHS} - Train loss: {train_loss:.4f}")

    # Lightweight validation after each epoch
    try:
        evaluate_map12(model, split='val')
    except Exception as e:
        print(f"Validation MAP@12 evaluation failed (check paths/columns): {e}")

# Final test evaluation (optional)
try:
    evaluate_map12(model, split='test')
except Exception as e:
    print(f"Test MAP@12 evaluation failed (check paths/columns): {e}")



Epoch 1/5:   0%|          | 0/24 [00:00<?, ?it/s]

Epoch 1/5 - Train loss: 9.5985
Loading candidate data from /Users/raghu/Desktop/Quarter_1/CSE_258R/assignment2/fashion_recommender_candidate_generation_2/models/val_data.parquet


Epoch 2/5:   0%|          | 0/24 [00:00<?, ?it/s]

Epoch 2/5 - Train loss: 9.3225
Loading candidate data from /Users/raghu/Desktop/Quarter_1/CSE_258R/assignment2/fashion_recommender_candidate_generation_2/models/val_data.parquet


Epoch 3/5:   0%|          | 0/24 [00:00<?, ?it/s]

Epoch 3/5 - Train loss: 9.2321
Loading candidate data from /Users/raghu/Desktop/Quarter_1/CSE_258R/assignment2/fashion_recommender_candidate_generation_2/models/val_data.parquet


Epoch 4/5:   0%|          | 0/24 [00:00<?, ?it/s]

Epoch 4/5 - Train loss: 9.1827
Loading candidate data from /Users/raghu/Desktop/Quarter_1/CSE_258R/assignment2/fashion_recommender_candidate_generation_2/models/val_data.parquet


Epoch 5/5:   0%|          | 0/24 [00:00<?, ?it/s]

Epoch 5/5 - Train loss: 9.1577
Loading candidate data from /Users/raghu/Desktop/Quarter_1/CSE_258R/assignment2/fashion_recommender_candidate_generation_2/models/val_data.parquet
Loading candidate data from /Users/raghu/Desktop/Quarter_1/CSE_258R/assignment2/fashion_recommender_candidate_generation_2/models/test_data.parquet
