# LLM4Rec - Colab Notebook

This notebook runs the full CLLM4Rec pipeline on Colab:
1. Training (content pretrain + iterative mutual training)
2. Finetuning (recommendation head)
3. Evaluation (Recall@20/40, NDCG@100)

**Requirements:**
- GPU runtime (Runtime → Change runtime type → GPU)
- Hugging Face account (for GPT-2 model)
- Weights & Biases account (for experiment tracking)


In [1]:
# ====== Cell 1: Setup ======

import subprocess
import sys
import os
from pathlib import Path

# Turn on Colab's custom widget manager for tqdm progress bars. If the
# google.colab package cannot be imported we are running somewhere else.
try:
    from google.colab import output
    output.enable_custom_widget_manager()
except ImportError:
    IN_COLAB = False
    print("Not running in Colab")
else:
    IN_COLAB = True
    print("✓ Colab widget manager enabled")

# Remote repository and the local checkout path used on Colab
REPO_URL = "https://github.com/fmegp/LLM4Rec.git"
REPO_DIR = "/content/LLM4Rec"

# On Colab: shallow-clone only when the checkout is missing, then always
# switch into it. Outside Colab the working directory is left untouched.
if IN_COLAB:
    if not Path(REPO_DIR).exists():
        print("Cloning repository...")
        subprocess.run(["git", "clone", "--depth", "1", REPO_URL, REPO_DIR], check=True)
    os.chdir(REPO_DIR)

print(f"Working directory: {os.getcwd()}")

# Install dependencies with the kernel's own interpreter, in this order:
# the Colab requirements file, ipywidgets, then the repo itself (editable).
print("Installing dependencies...")
_pip_install = [sys.executable, "-m", "pip", "install", "-q"]
for _pip_target in (
    ["-r", "requirements-colab.txt"],
    ["ipywidgets>=8.1.0"],
    ["-e", "."],
):
    # check=True: abort the cell as soon as any install step fails
    subprocess.run(_pip_install + _pip_target, check=True)

# Put <repo>/src at the front of sys.path (repo root is REPO_DIR on Colab,
# the current directory otherwise); skip if it is already listed.
_repo_root = Path(REPO_DIR if IN_COLAB else ".").resolve()
src_path = str(_repo_root / "src")
if src_path not in sys.path:
    sys.path.insert(0, src_path)

# Smoke-test tqdm: drive a ten-step bar with a 0.1 s pause per step
print("\nTesting tqdm progress bar...")
import time
from tqdm.auto import tqdm
_test_bar = tqdm(range(10), desc="Test", leave=True)
for _ in _test_bar:
    time.sleep(0.1)
print("✓ tqdm working")

# Confirm the llm4rec package is importable and print its runtime report
from llm4rec.runtime import print_runtime_report
print_runtime_report()
print("\n✓ Setup complete!")


✓ Colab widget manager enabled
Cloning repository...
Working directory: /content/LLM4Rec
Installing dependencies...

Testing tqdm progress bar...


Test:   0%|          | 0/10 [00:00<?, ?it/s]

✓ tqdm working
Runtime Report
Python: 3.12.12
Platform: Linux-6.6.105+-x86_64-with-glibc2.35
Torch: 2.9.0+cu126
CUDA available: True
CUDA version: 12.6
GPU: NVIDIA A100-SXM4-40GB
Numpy: 2.0.2

✓ Setup complete!


In [2]:
# ====== Cell 2: Output Directory (Ephemeral) ======

from datetime import datetime, timezone
from pathlib import Path

# Stamp the run with the current UTC time (second resolution) and create a
# directory for it under /content/outputs.
RUN_ID = f"{datetime.now(timezone.utc):%Y%m%d_%H%M%S}"
OUTPUT_DIR = "/content/outputs/run_" + RUN_ID
Path(OUTPUT_DIR).mkdir(parents=True, exist_ok=True)

print(f"OUTPUT_DIR: {OUTPUT_DIR}", f"RUN_ID: {RUN_ID}", sep="\n")
print("\nNote: This is ephemeral storage. Outputs will be lost when the runtime resets.")


OUTPUT_DIR: /content/outputs/run_20251219_165107
RUN_ID: 20251219_165107

Note: This is ephemeral storage. Outputs will be lost when the runtime resets.


In [3]:
# ====== Cell 3: Hugging Face Login ======

from huggingface_hub import notebook_login

# Tell the user where to find a token, then open the interactive login widget
print(
    "Login to Hugging Face to download GPT-2 model.",
    "Get your token from: https://huggingface.co/settings/tokens\n",
    sep="\n",
)

notebook_login()


Login to Hugging Face to download GPT-2 model.
Get your token from: https://huggingface.co/settings/tokens



VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [4]:
# ====== Cell 4: Weights & Biases Login ======

import wandb

# Tell the user where to find an API key, then start the W&B login prompt
print(
    "Login to Weights & Biases for experiment tracking.",
    "Get your API key from: https://wandb.ai/authorize\n",
    sep="\n",
)

wandb.login()


Login to Weights & Biases for experiment tracking.
Get your API key from: https://wandb.ai/authorize



[34m[1mwandb[0m: (1) Create a W&B account
[34m[1mwandb[0m: (2) Use an existing W&B account
[34m[1mwandb[0m: (3) Don't visualize my results
[34m[1mwandb[0m: Enter your choice:[34m[1mwandb[0m: You chose 'Use an existing W&B account'
[34m[1mwandb[0m: Logging into https://api.wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: Find your API key here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter:[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mfmenol[0m ([33mfmenol-csynbiosys[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [5]:
# ====== Cell 5: Dataset Download + W&B Init ======

from pathlib import Path
from llm4rec.io import (
    download_gdrive,
    safe_extract_archive,
    validate_dataset_layout,
    build_dataset_manifest,
    save_json,
)
from llm4rec.logging_wandb import WandbHandle

import shutil

# Dataset config
DATASET_NAME = "beauty"  # Options: beauty, sports, toys
LAMBDA_V = 1.0
DATA_GDRIVE_URL = "https://drive.google.com/file/d/1G4t64tzAlXN0gq_0TJ5Wik8dsERz8pMJ/view?usp=drive_link"

# Download and extract
RAW_DIR = Path("/content/data/raw")
RAW_DIR.mkdir(parents=True, exist_ok=True)
ARCHIVE_PATH = RAW_DIR / "cllm4rec_dataset"
EXTRACT_DIR = RAW_DIR / "extracted"

# Both steps are skipped on re-run when their target already exists, so a
# failed or interrupted attempt must not leave its target behind. Otherwise
# a partial archive or a half-extracted tree would be mistaken for a finished
# one and the cell could never recover without manual cleanup.
if not ARCHIVE_PATH.exists():
    print("Downloading dataset...")
    try:
        download_gdrive(DATA_GDRIVE_URL, ARCHIVE_PATH, quiet=False)
    except BaseException:
        # BaseException (not Exception) so that interrupting the cell
        # (KeyboardInterrupt) is cleaned up as well. The original error is
        # re-raised unchanged.
        ARCHIVE_PATH.unlink(missing_ok=True)
        raise
else:
    print("Dataset already downloaded")

if not EXTRACT_DIR.exists():
    print("Extracting dataset...")
    EXTRACT_DIR.mkdir(parents=True, exist_ok=True)
    try:
        safe_extract_archive(ARCHIVE_PATH, EXTRACT_DIR)
    except BaseException:
        # EXTRACT_DIR was created just above, so without this cleanup the
        # existence check would report "already extracted" on the next run.
        shutil.rmtree(EXTRACT_DIR, ignore_errors=True)
        raise
else:
    print("Dataset already extracted")

# Find dataset directory: every folder under EXTRACT_DIR that holds a
# meta.pkl is a candidate.
candidates = [meta_file.parent for meta_file in EXTRACT_DIR.rglob("meta.pkl")]

# Prefer the first candidate whose folder name equals DATASET_NAME. Failing
# that, accept a sole candidate whatever its name; otherwise give up.
_name_matches = [c for c in candidates if c.name == DATASET_NAME]
if _name_matches:
    match = _name_matches[0]
elif len(candidates) == 1:
    match = candidates[0]
else:
    match = None

if match is None:
    raise RuntimeError(f"Dataset '{DATASET_NAME}' not found. Available: {[c.name for c in candidates]}")

DATASET_DIR = str(match)
print(f"\nDATASET_DIR: {DATASET_DIR}")

# Validate the dataset folder and report the three paths the layout exposes
layout = validate_dataset_layout(DATASET_DIR)
print("\nDataset layout validated:")
print(
    f"  - meta.pkl: {layout.meta_path}",
    f"  - train_matrix.npz: {layout.train_matrix_path}",
    f"  - review.pkl: {layout.review_path}",
    sep="\n",
)

# Start the W&B run and wrap it in a WandbHandle, which is the object the
# later stage functions receive.
print("\nInitializing W&B run...")
_wandb_config = {
    "dataset_name": DATASET_NAME,
    "lambda_V": LAMBDA_V,
    "run_id": RUN_ID,
    "output_dir": OUTPUT_DIR,
}
_wandb_run = wandb.init(
    project="cllm4rec",
    name=f"{DATASET_NAME}_lambda{LAMBDA_V}_{RUN_ID}",
    config=_wandb_config,
)
wandb_handle = WandbHandle(run=_wandb_run, enabled=True)
print(f"W&B run: {_wandb_run.url}")

# Build the dataset manifest and write it into this run's output directory
manifest = build_dataset_manifest(DATASET_DIR, include_optional=True)
_manifest_path = Path(OUTPUT_DIR) / "dataset_manifest.json"
save_json(manifest, _manifest_path)
print("\n✓ Dataset ready!")


Downloading dataset...
Downloading Google Drive file id=1G4t64tzAlXN0gq_0TJ5Wik8dsERz8pMJ to /content/data/raw/cllm4rec_dataset ...


Downloading...
From (original): https://drive.google.com/uc?id=1G4t64tzAlXN0gq_0TJ5Wik8dsERz8pMJ
From (redirected): https://drive.google.com/uc?id=1G4t64tzAlXN0gq_0TJ5Wik8dsERz8pMJ&confirm=t&uuid=e7bb28fc-ab7e-44e8-99a3-778b468b27a4
To: /content/data/raw/cllm4rec_dataset
100%|██████████| 334M/334M [00:53<00:00, 6.24MB/s] 


Extracting dataset...

DATASET_DIR: /content/data/raw/extracted/data/beauty

Dataset layout validated:
  - meta.pkl: /content/data/raw/extracted/data/beauty/meta.pkl
  - train_matrix.npz: /content/data/raw/extracted/data/beauty/train_matrix.npz
  - review.pkl: /content/data/raw/extracted/data/beauty/user_item_texts/review.pkl

Initializing W&B run...


W&B run: https://wandb.ai/fmenol-csynbiosys/cllm4rec/runs/5w0z9jig

✓ Dataset ready!


In [None]:
# ====== Cell 6: Training ======

from pathlib import Path
from llm4rec.stages.training_stage import run_training

# Hugging Face model id and local cache directory (reused by later cells)
HF_MODEL_NAME = "openai-community/gpt2"
HF_CACHE_DIR = "/content/hf_cache"

# The two user-embedding files whose presence marks training as already done
training_out = Path(OUTPUT_DIR) / "training"
content_user, collab_user = (
    training_out / subdir / f"user_embeddings_{LAMBDA_V}.pt"
    for subdir in ("content", "collaborative")
)

# Train unless both artifacts are already on disk
if not (content_user.exists() and collab_user.exists()):
    print("Starting training stage...\n")
    run_training(
        dataset_dir=DATASET_DIR,
        output_dir=OUTPUT_DIR,
        lambda_V=LAMBDA_V,
        hf_model_name=HF_MODEL_NAME,
        hf_cache_dir=HF_CACHE_DIR,
        hf_token=None,  # Uses cached HF login
        mixed_precision="bf16",
        wandb_handle=wandb_handle,
    )
    print("\n✓ Training complete!")
else:
    print(f"Training artifacts already exist at {training_out}")
    print("Skipping training stage.")


Starting training stage...



  self.setter(val)


config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

-----Begin Content GPT Pretraining Loop-----
Content GPT Pretraining - Epoch 1/10
Starting epoch 1, 3531 batches total...
  batch 1/3531 loss=3.5876 avg=3.5876
  batch 50/3531 loss=3.4814 avg=3.8058
  batch 100/3531 loss=3.8337 avg=3.8014
  batch 150/3531 loss=3.3728 avg=3.8160
  batch 200/3531 loss=3.8996 avg=3.8041
  batch 250/3531 loss=3.4805 avg=3.8040
  batch 300/3531 loss=3.6270 avg=3.7861
  batch 350/3531 loss=4.6844 avg=3.8023
  batch 400/3531 loss=4.0420 avg=3.7913
  batch 450/3531 loss=3.4581 avg=3.7864
  batch 500/3531 loss=4.1565 avg=3.7865
  batch 550/3531 loss=4.0093 avg=3.7891
  batch 600/3531 loss=3.7890 avg=3.7860
  batch 650/3531 loss=3.8487 avg=3.7840
  batch 700/3531 loss=3.6102 avg=3.7885
  batch 750/3531 loss=3.5979 avg=3.7907
  batch 800/3531 loss=4.4340 avg=3.7929
  batch 850/3531 loss=3.8138 avg=3.7900
  batch 900/3531 loss=4.1332 avg=3.7896
  batch 950/3531 loss=4.2196 avg=3.7919
  batch 1000/3531 loss=3.9695 avg=3.7927
  batch 1050/3531 loss=3.7335 avg=3.7902

In [None]:
# ====== Cell 7: Finetuning ======

from pathlib import Path
from llm4rec.stages.finetune_stage import run_finetuning

# Finetuning reads from the training output directory and writes beside it
_run_root = Path(OUTPUT_DIR)
training_out = _run_root / "training"
finetune_out = _run_root / "finetuning"

# Presence of this user-embedding file marks finetuning as already done
rec_user = finetune_out / "rec" / f"user_embeddings_{LAMBDA_V}.pt"

if not rec_user.exists():
    print("Starting finetuning stage...\n")
    run_finetuning(
        dataset_dir=DATASET_DIR,
        output_dir=OUTPUT_DIR,
        pretrained_dir=str(training_out),
        lambda_V=LAMBDA_V,
        hf_model_name=HF_MODEL_NAME,
        hf_cache_dir=HF_CACHE_DIR,
        hf_token=None,
        mixed_precision="bf16",
        wandb_handle=wandb_handle,
    )
    print("\n✓ Finetuning complete!")
else:
    print(f"Finetuning artifacts already exist at {finetune_out}")
    print("Skipping finetuning stage.")


In [None]:
# ====== Cell 8: Evaluation ======

from pathlib import Path
from llm4rec.stages.eval_stage import run_eval

# Evaluation reads the recommendation embeddings written under finetuning/rec
finetune_out = Path(OUTPUT_DIR) / "finetuning"
_rec_dir = str(finetune_out / "rec")

print("Starting evaluation...\n")
results = run_eval(
    dataset_dir=DATASET_DIR,
    output_dir=OUTPUT_DIR,
    rec_embeddings_dir=_rec_dir,
    lambda_V=LAMBDA_V,
    hf_model_name=HF_MODEL_NAME,
    hf_cache_dir=HF_CACHE_DIR,
    hf_token=None,
    wandb_handle=wandb_handle,
)

# Results banner: float values are shown with four decimals, anything else as-is
_rule = "=" * 50
print("\n" + _rule)
print("FINAL RESULTS")
print(_rule)
for metric, value in results.items():
    shown = f"{value:.4f}" if isinstance(value, float) else value
    print(f"  {metric}: {shown}")

# Finish W&B run
wandb.finish()
print(f"\n✓ Run complete! Results saved to {OUTPUT_DIR}")
