# Oolong Benchmark for RLM-ADK

Evaluate long-context aggregation reasoning using the [Oolong benchmark](https://arxiv.org/abs/2511.02817).

This notebook runs RLM (Recursive Language Model) with `grok-4-1-fast-reasoning` on the
Oolong datasets (synthetic and real), which test classification, counting, and
comparison across large volumes of text. No frontier model exceeds 50% accuracy
at 128K context -- RLM's recursive decomposition approach aims to beat that ceiling.

In [None]:
# Cell 1: Install dependencies and clone repo
# Quietly (-q) install the packages this notebook needs.
!pip install xai-sdk tiktoken python-dotenv datasets -q
# Clone the repository; if the clone fails (its stderr is discarded), fall back
# to `git pull` inside the existing rlm-adk directory -- presumably the re-run
# case where the checkout is already present.
!git clone https://github.com/fabiopauli/rlm-adk.git 2>/dev/null || (cd rlm-adk && git pull)

import sys
# Put the checkout first on the import path. The entry is relative, so it is
# resolved against the current working directory. Later cells import
# `tests.test_oolong` and `rlm.core`, presumably provided by this repository.
sys.path.insert(0, 'rlm-adk')
print('Setup complete.')

In [None]:
# Cell 2: Load API key from Colab secrets
# Reads the `XAI_API_KEY` secret and exports it as an environment variable;
# later cells read it back through os.environ['XAI_API_KEY'].
from google.colab import userdata
import os

xai_api_key = userdata.get('XAI_API_KEY')
# Fail fast on a missing or blank secret: an empty key would otherwise be
# exported silently and only surface later as an authentication failure.
if not xai_api_key or not xai_api_key.strip():
    raise RuntimeError(
        "XAI_API_KEY is missing or empty. Add it in Colab's Secrets panel, "
        "grant this notebook access, then re-run this cell."
    )

os.environ['XAI_API_KEY'] = xai_api_key
print('API key loaded.')

In [None]:
# Cell 3: Configuration
# Choose a profile: "smoke", "quick", "moderate" (default), "standard_validation", or "custom"
PROFILE = "moderate"

# Built-in run profiles. Keys:
#   subset          -- how many questions to load (None means all of them)
#   max_cost        -- spending cap for the whole run, in dollars
#   max_cost_per_q  -- spending cap for a single question, in dollars
PROFILES = {
    "smoke":                {"subset": 5,    "max_cost": 1.0,   "max_cost_per_q": 0.50},
    "quick":                {"subset": 25,   "max_cost": 5.0,   "max_cost_per_q": 0.50},
    "moderate":             {"subset": 50,   "max_cost": 5.0,   "max_cost_per_q": 0.50},
    "standard_validation":  {"subset": None, "max_cost": 100.0, "max_cost_per_q": 0.50},
}

# Custom overrides (only used if PROFILE == "custom")
CUSTOM_SUBSET = 10
CUSTOM_MAX_COST = 2.0
CUSTOM_MAX_COST_PER_Q = 0.50


def resolve_profile(name):
    """Return a fresh config dict for the profile called *name*.

    Args:
        name: "custom" or one of the keys of ``PROFILES``.

    Returns:
        A new dict with the keys ``subset``, ``max_cost`` and
        ``max_cost_per_q``. It is a copy, so editing it never changes the
        shared ``PROFILES`` table.

    Raises:
        ValueError: if *name* is not a known profile. The message lists the
            valid choices instead of surfacing a bare ``KeyError``.
    """
    if name == "custom":
        return {
            "subset": CUSTOM_SUBSET,
            "max_cost": CUSTOM_MAX_COST,
            "max_cost_per_q": CUSTOM_MAX_COST_PER_Q,
        }
    try:
        return dict(PROFILES[name])
    except KeyError:
        valid = ", ".join(sorted([*PROFILES, "custom"]))
        raise ValueError(
            f"Unknown PROFILE {name!r}; expected one of: {valid}"
        ) from None


cfg = resolve_profile(PROFILE)

# Model configuration
MODEL = "grok-4-1-fast-reasoning"
PROVIDER = "xai"
DATASET = "synth"       # "synth" or "real"
SPLIT = "validation"    # "validation" or "test"
REAL_CONFIG = "toy_dnd"  # "toy_dnd" or "dnd" (for real dataset)

# Only None means "no limit"; an explicit 0 must not be reported as "all".
subset_label = 'all' if cfg['subset'] is None else cfg['subset']

print(f"Profile: {PROFILE}")
print(f"  Subset: {subset_label} questions")
print(f"  Max cost: ${cfg['max_cost']:.2f}")
print(f"  Max cost per question: ${cfg['max_cost_per_q']:.2f}")
print(f"  Model: {MODEL}")
print(f"  Dataset: {DATASET} ({SPLIT})")

In [None]:
# Cell 4: Load data
# Builds the benchmark harness with the limits chosen in Cell 3, then loads the
# selected dataset and reports how many questions and context windows it has.
from tests.test_oolong import OolongBenchmark

bench = OolongBenchmark(
    api_key=os.environ['XAI_API_KEY'],
    model=MODEL,
    provider=PROVIDER,
    verbose=True,
    max_cost_total=cfg['max_cost'],
    max_cost_per_question=cfg['max_cost_per_q'],
)

# Any DATASET value other than 'synth' falls through to the real dataset.
if DATASET != 'synth':
    windows = bench.load_real(config=REAL_CONFIG, split=SPLIT, max_samples=cfg['subset'])
else:
    windows = bench.load_synth(split=SPLIT, max_samples=cfg['subset'])

# `windows` maps each context window to its list of questions.
total_q = sum(map(len, windows.values()))
print(f"\nLoaded {total_q} questions across {len(windows)} context windows")

In [None]:
# Cell 5: Run benchmark
# Dispatches to the runner matching DATASET and prints the resulting summary.
run_args = {'split': SPLIT, 'max_samples': cfg['subset']}

if DATASET != 'synth':
    # Every non-'synth' value is treated as the real dataset, which also
    # needs its config name.
    summary = bench.run_real(config=REAL_CONFIG, **run_args)
else:
    summary = bench.run_synth(**run_args)

bench.print_summary(summary)

In [None]:
# Cell 6: Results summary
# Prints the headline metrics from `summary.to_dict()` and then one score table
# per breakdown (answer type, task, context length) that has any entries.
import json


def _print_score_table(title, scores, sort_keys=True):
    """Print a `label: score` table under *title*; print nothing if empty.

    Args:
        title: Heading text, printed after a blank line and followed by ":".
        scores: Mapping of label -> numeric score.
        sort_keys: When True, rows are printed in sorted key order; when
            False, the mapping's own iteration order is kept.

    Labels go through ``str()`` before formatting because the ``:20s`` format
    spec only accepts strings; a non-string key (for example an integer
    context length) would otherwise raise instead of printing.
    """
    if not scores:
        return
    print(f"\n{title}:")
    rows = sorted(scores.items()) if sort_keys else scores.items()
    for label, value in rows:
        print(f"  {str(label):20s}: {value:.3f}")


print("=" * 60)
print("DETAILED RESULTS")
print("=" * 60)

s = summary.to_dict()
print(f"\nOverall Score: {s['overall_score']:.3f}")
print(f"Questions: {s['num_questions']} (scored: {s['num_scored']}, errors: {s['num_errors']})")
print(f"Total Cost: ${s['total_cost']:.2f}")
print(f"Total Tokens: {s['total_tokens']:,}")

_print_score_table("By Answer Type", s['score_by_answer_type'])
_print_score_table("By Task", s['score_by_task'])
# Context-length buckets keep the order the summary provides them in.
_print_score_table("By Context Length", s['score_by_context_len'], sort_keys=False)

In [None]:
# Cell 7: Save and download results
# Saves the benchmark results under a timestamped file name, then asks Colab
# to download that file.
from datetime import datetime
from google.colab import files

run_stamp = datetime.now().strftime('%Y%m%d_%H%M%S')
filename = f"oolong_results_{DATASET}_{run_stamp}.json"

bench.save_results(filename)
files.download(filename)
print(f"Results saved and downloading: {filename}")

In [None]:
# Cell 8: Debug -- Run a single question with verbose output
# Takes the first question of the first synth context window, runs it through
# a standalone RLM instance, then parses and scores the answer for inspection.
from tests.test_oolong import OolongScorer
from rlm.core import RecursiveLanguageModel

debug_windows = bench.load_synth(split=SPLIT, max_samples=1)
debug_cw_id = [*debug_windows.keys()][0]
debug_q = debug_windows[debug_cw_id][0]

debug_banner = "=" * 60

# Show what is about to be asked before spending anything on the model call.
print(f"Context window: {debug_cw_id}")
print(f"Question: {debug_q['question']}")
print(f"Answer type: {debug_q.get('answer_type', 'N/A')}")
# The gold answer is looked up under 'answer' first, then 'gold_answer'.
print(f"Gold answer: {debug_q.get('answer', debug_q.get('gold_answer', 'N/A'))}")
print(f"Context length: {debug_q.get('context_len', 'N/A')} tokens")
print(f"Context preview: {debug_q.get('context_window_text', '')[:200]}...")
print(f"\n{debug_banner}")
print("Running RLM (force_repl=True)...")
print(f"{debug_banner}\n")

# The per-question cost cap from the active profile bounds this single run.
debug_rlm = RecursiveLanguageModel(
    model=MODEL,
    provider=PROVIDER,
    xai_api_key=os.environ['XAI_API_KEY'],
    max_cost=cfg['max_cost_per_q'],
    enable_cache=True,
    log_level='INFO',
)

debug_answer = debug_rlm.run(
    task=debug_q['question'],
    context=debug_q['context_window_text'],
    verbose=True,
    force_repl=True,
)

print("\n" + debug_banner)
print(f"Raw answer: {debug_answer}")

debug_raw = str(debug_answer)
gold = debug_q.get('answer', debug_q.get('gold_answer', ''))
# Drop the 'ANSWER_TYPE.' prefix so only the bare type name reaches the scorer.
answer_type = str(debug_q.get('answer_type', '')).replace('ANSWER_TYPE.', '')
parsed = OolongScorer.parse_synth_answer(debug_raw, answer_type)
score = OolongScorer.score_synth(gold, debug_raw, answer_type)

print(f"Parsed answer: {parsed}")
print(f"Gold answer: {gold}")
print(f"Score: {score:.3f}")
debug_rlm.print_metrics()

In [None]:
# Cell 9: Standard validation run with checkpoint + Google Drive persistence
# Mount Google Drive for persistent checkpoint storage across Colab reconnects
#
# This cell imports everything it uses itself, so it no longer depends on
# Cells 4 and 7 having been run. Previously `datetime` came only from Cell 7,
# so skipping that cell raised NameError at the final save, after the whole
# run had finished. Cells 1-3 are still required (import path, API key,
# MODEL / PROVIDER / PROFILES).
import os
from datetime import datetime

from google.colab import drive
from tests.test_oolong import OolongBenchmark

drive.mount('/content/drive')

CHECKPOINT_DIR = '/content/drive/MyDrive/oolong_benchmarks'
os.makedirs(CHECKPOINT_DIR, exist_ok=True)
CHECKPOINT_PATH = os.path.join(CHECKPOINT_DIR, 'oolong_validation_checkpoint.json')

print(f"Checkpoint will be saved to: {CHECKPOINT_PATH}")
if os.path.exists(CHECKPOINT_PATH):
    print("Existing checkpoint found -- will resume from it.")

# Take the limits from the "standard_validation" profile rather than repeating
# its numbers here, so the two definitions cannot drift apart.
validation_cfg = PROFILES["standard_validation"]

validation_bench = OolongBenchmark(
    api_key=os.environ['XAI_API_KEY'],
    model=MODEL,
    provider=PROVIDER,
    verbose=True,
    max_cost_per_question=validation_cfg['max_cost_per_q'],
    max_cost_total=validation_cfg['max_cost'],
)

validation_summary = validation_bench.run_synth(
    split='validation',
    max_samples=validation_cfg['subset'],  # None in this profile: all questions
    checkpoint_path=CHECKPOINT_PATH,
)

validation_bench.print_summary(validation_summary)

# Save final results to Drive
final_stamp = datetime.now().strftime('%Y%m%d_%H%M%S')
final_path = os.path.join(CHECKPOINT_DIR, f"oolong_validation_final_{final_stamp}.json")
validation_bench.save_results(final_path)
print(f"Final results saved to: {final_path}")