# Blackjack Rainbow DQN Notebook

![Tests](https://img.shields.io/badge/tests-pytest-blue) ![Coverage](https://img.shields.io/badge/coverage-summary-green)

> End-to-end pipeline for training and evaluating a Rainbow DQN Blackjack agent.

In [None]:
# 🚀 Run All
# Master switch. When True, step cells only register their function and the
# final cell runs the whole pipeline; when False, each step cell also runs its
# step immediately.
RUN_ALL = True
# Step functions collected by `register_step`, in registration order.
PIPELINE_STEPS = []
# Overwritten by `run_training` with the value returned by `train_rainbow`.
TRAINING_RESULTS = {}
# Overwritten by `run_evaluation` with the value returned by `evaluate_agent`.
EVALUATION_RESULTS = {}

def register_step(func):
    """Register ``func`` as a pipeline step and return it unchanged.

    Intended for use as a decorator. Steps are kept in registration order.
    If a step with the same ``__name__`` is already registered (for example
    because the notebook cell defining it was executed again), the existing
    entry is replaced in place, so the pipeline never runs a step twice or
    keeps a stale definition around.
    """
    name = func.__name__
    for index, existing in enumerate(PIPELINE_STEPS):
        if existing.__name__ == name:
            # Keep the original position so the pipeline order stays stable.
            PIPELINE_STEPS[index] = func
            break
    else:
        PIPELINE_STEPS.append(func)
    return func


def run_all_pipeline():
    """Run every registered pipeline step in order and report the wall time.

    Each step's name is printed before the step is called with no arguments.
    After the last step the total elapsed time is printed. An exception raised
    by a step propagates and ends the run.
    """
    import time

    started_at = time.perf_counter()
    for stage in PIPELINE_STEPS:
        print(f"▶️ {stage.__name__}")
        stage()
    elapsed = time.perf_counter() - started_at
    print(f"✅ Pipeline finished in {elapsed:.2f}s")

# With auto-run disabled, tell the reader how to switch it back on.
if not RUN_ALL:
    print("Set `RUN_ALL = True` in this cell to execute the entire notebook automatically.")


## Configuration

The following dataclasses expose the configuration used by the environment, agent, and training pipeline.

In [None]:
from dataclasses import asdict, dataclass
from pathlib import Path
from typing import Tuple

@dataclass
class NotebookEnvConfig:
    num_decks: int = 6
    penetration: float = 0.8
    natural_payout: float = 1.5
    hit_soft_17: bool = False
    min_bet: float = 1.0
    max_bet: float = 8.0
    bankroll: float = 100.0
    bankroll_stop_loss: float = 0.0
    bankroll_target: float = 200.0
    allow_surrender: bool = True
    allow_double: bool = True
    allow_split: bool = True
    max_splits: int = 1
    reward_shaping: bool = True
    shaping_stop_step: int = 60_000
    penetration_reset: bool = True
    bet_actions: int = 8
    reward_clip: float | None = 5.0
    seed: int = 123

    def to_env(self):
        from blackjackai_rl.env import BlackjackEnvConfig
        return BlackjackEnvConfig(**asdict(self))

@dataclass
class NotebookAgentConfig:
    state_dim: int = 36
    num_actions: int = 5
    bet_actions: int = 8
    hidden_sizes: Tuple[int, int] = (512, 512)
    gamma: float = 0.99
    lr: float = 3e-4
    bet_lr: float = 3e-4
    epsilon_start: float = 1.0
    epsilon_final: float = 0.05
    epsilon_decay: int = 1_200_000
    batch_size: int = 512
    buffer_size: int = 800_000
    min_buffer_size: int = 20_000
    grad_clip: float = 5.0
    double_dqn: bool = True
    dueling: bool = True
    prioritized_replay: bool = True
    per_alpha: float = 0.6
    per_beta_start: float = 0.4
    per_beta_end: float = 1.0
    per_beta_steps: int = 1_200_000
    n_step: int = 3
    distributional_c51: bool = True
    atoms: int = 51
    v_min: float = -20.0
    v_max: float = 20.0
    noisy_nets: bool = False
    target_update_interval: int = 15_000
    tau: float | None = None
    device: str = "cpu"

    def to_agent(self):
        from blackjackai_rl.agents import DQNConfig
        return DQNConfig(**asdict(self))

@dataclass
class NotebookTrainingConfig:
    """Run-level settings read by the training, evaluation and seeding steps."""

    # Passed to `train_rainbow` as `vector_envs`.
    vector_envs: int = 32
    # Passed to `train_rainbow` as its third positional argument.
    training_steps: int = 2_000_000
    # Passed to `train_rainbow` as `log_interval`.
    log_interval: int = 2_000
    # Passed to `evaluate_agent` as its third positional argument.
    evaluation_hands: int = 100_000
    # Used by `set_reproducibility` to seed random, NumPy and torch.
    seed: int = 2024
    # Passed to both `train_rainbow` (as `output_dir`) and `evaluate_agent`.
    output_root: Path = Path("runs")


# Default configuration objects shared by every later cell.
env_config = NotebookEnvConfig()
agent_config = NotebookAgentConfig()
training_config = NotebookTrainingConfig()

from pprint import pprint

# Echo all settings. The leading "\n" puts a blank line between sections; it
# must stay an escape sequence, because a raw line break inside a quoted string
# is a syntax error.
print("Environment config:")
pprint(asdict(env_config))
print("\nAgent config:")
pprint(asdict(agent_config))
print("\nTraining config:")
pprint(asdict(training_config))


### Field guide

* **`NotebookEnvConfig`** controls table rules, deck penetration, bankroll bounds, and betting spread.
* **`NotebookAgentConfig`** toggles Rainbow DQN components (dueling heads, PER, C51, etc.) and sets optimizer hyperparameters.
* **`NotebookTrainingConfig`** governs the number of vectorised environments, total training steps, logging cadence, number of evaluation hands, reproducibility seed, and output directory.

In [None]:
@register_step
def setup_environment():
    """Install the project found in the current directory in editable mode.

    Runs ``pip install -e .`` with the interpreter executing the notebook.
    Because ``check=True`` is passed, a non-zero exit status raises
    ``subprocess.CalledProcessError``.
    """
    import subprocess
    import sys

    print("Installing blackjackai in editable mode…")
    pip_command = [sys.executable, "-m", "pip", "install", "-e", "."]
    subprocess.run(pip_command, check=True)


# With auto-run disabled, execute this step as soon as the cell runs.
if not RUN_ALL:
    setup_environment()


In [None]:
@register_step
def set_reproducibility():
    """Seed Python's ``random``, NumPy and, when it is installed, PyTorch.

    Returns:
        The seed that was applied, taken from ``training_config.seed``.
    """
    import random

    import numpy as np

    seed = training_config.seed
    random.seed(seed)
    np.random.seed(seed)
    # Only the import itself is guarded: a missing torch is tolerated, but an
    # error raised while seeding is not silently swallowed.
    try:
        import torch
    except ImportError:
        print("PyTorch is not installed; skipping torch seeding.")
    else:
        torch.manual_seed(seed)
        if torch.cuda.is_available():
            torch.cuda.manual_seed_all(seed)
    print(f"Seeds initialised to {seed}")
    return seed


# With auto-run disabled, execute this step as soon as the cell runs.
if not RUN_ALL:
    set_reproducibility()


In [None]:
@register_step
def environment_sanity_checks():
    """Create one environment and print its first observation and action masks.

    Resets the environment and prints the observation shape, the names of the
    legal actions and the action mask. It then steps with action ``0``, prints
    the mask reported after that step, and steps once more with
    ``env.ACTION_STAND``.
    """
    from blackjackai_rl.env import BlackjackEnv

    env = BlackjackEnv(env_config.to_env())
    first_obs, reset_info = env.reset()
    print("Initial observation shape:", first_obs.shape)
    legal_names = [env.action_names[idx] for idx in reset_info["legal_actions"]]
    print("Initial legal actions:", legal_names)
    print("Action mask:", reset_info["action_mask"])

    # Only the info dict (fourth element) of the step result is used here.
    _, _, _, play_info = env.step(0)
    print("Play phase mask:", play_info["action_mask"])
    env.step(env.ACTION_STAND)


# With auto-run disabled, execute this step as soon as the cell runs.
if not RUN_ALL:
    environment_sanity_checks()


In [None]:
@register_step
def run_training():
    """Train the agent and publish the outcome as the global ``TRAINING_RESULTS``.

    Calls ``train_rainbow`` with the environment and agent configurations plus
    the step count, vector-env count, log interval and output directory from
    ``training_config``, then prints the ``"best_model_path"`` entry of the
    result.
    """
    global TRAINING_RESULTS
    from blackjackai_rl.training import train_rainbow

    print("Starting training…")
    TRAINING_RESULTS = train_rainbow(
        env_config.to_env(),
        agent_config.to_agent(),
        training_config.training_steps,
        vector_envs=training_config.vector_envs,
        log_interval=training_config.log_interval,
        output_dir=training_config.output_root,
    )
    print("Best checkpoint:", TRAINING_RESULTS["best_model_path"])


# With auto-run disabled, execute this step as soon as the cell runs.
if not RUN_ALL:
    run_training()


In [None]:
@register_step
def run_evaluation():
    """Evaluate the agent, publish the results globally and display the plots.

    The checkpoint recorded under ``TRAINING_RESULTS["best_model_path"]`` is
    loaded when present. Without one, the freshly constructed agent is
    evaluated and a warning is printed so the figures are not mistaken for
    those of a trained model. The result of ``evaluate_agent`` is stored in
    the global ``EVALUATION_RESULTS`` and every image path in its ``"plots"``
    mapping is displayed inline.
    """
    from blackjackai_rl.agents import RainbowDQNAgent
    from blackjackai_rl.evaluation import evaluate_agent

    agent = RainbowDQNAgent(agent_config.to_agent())
    best_path = TRAINING_RESULTS.get("best_model_path")
    if best_path:
        agent.load(best_path)
    else:
        print("Warning: no trained checkpoint available; evaluating a freshly initialised agent.")
    results = evaluate_agent(
        agent,
        env_config.to_env(),
        training_config.evaluation_hands,
        training_config.output_root,
        training_history=TRAINING_RESULTS,
    )
    globals()["EVALUATION_RESULTS"] = results
    print("Evaluation artifacts saved to:", results["run_dir"])
    from IPython.display import Image, display

    # Only the file paths are needed; the plot names are not used.
    for path in results["plots"].values():
        display(Image(filename=path))


# With auto-run disabled, execute this step as soon as the cell runs.
if not RUN_ALL:
    run_evaluation()


In [None]:
@register_step
def show_summary():
    """Render the headline evaluation metrics as a Markdown block.

    Reads ``EVALUATION_RESULTS["metrics"]``; when it is missing or empty a
    short notice is printed instead. Otherwise the EV per 100 hands, its
    confidence interval, the win/loss/push/bust rates and the summary file
    path are displayed.
    """
    from IPython.display import Markdown, display

    metrics = EVALUATION_RESULTS.get("metrics", {})
    if not metrics:
        print("No evaluation metrics available yet.")
        return
    # Line breaks are written as "\n" escapes: a raw newline inside a
    # single-line f-string does not parse. The blank lines ("\n\n") separate
    # the heading and the footer from the bullet list in Markdown.
    summary = (
        f"### Evaluation snapshot\n\n"
        f"* EV/100 hands: {metrics['ev_per_100']:.3f}\n"
        f"* 95% CI: {metrics['ev_confidence'][0]:.3f} – {metrics['ev_confidence'][1]:.3f}\n"
        f"* Win rate: {metrics['win_rate']:.2%}\n"
        f"* Loss rate: {metrics['loss_rate']:.2%}\n"
        f"* Push rate: {metrics['push_rate']:.2%}\n"
        f"* Bust rate: {metrics['bust_rate']:.2%}\n\n"
        f"Summary file: `{EVALUATION_RESULTS['summary_path']}`"
    )
    display(Markdown(summary))


# With auto-run disabled, execute this step as soon as the cell runs.
if not RUN_ALL:
    show_summary()


In [None]:
@register_step
def baseline_appendix():
    """Print the EV/100 value of every baseline stored with the evaluation metrics.

    Baselines are read from ``EVALUATION_RESULTS["metrics"]["baselines"]``.
    When that mapping is missing or empty a notice is printed instead. A
    baseline without an ``"ev_per_100"`` entry is reported as ``0.000``.
    """
    metrics = EVALUATION_RESULTS.get("metrics", {})
    baselines = metrics.get("baselines", {})
    if baselines:
        print("Baseline EV/100 comparison:")
        for label, stats in baselines.items():
            ev = stats.get("ev_per_100", 0.0)
            print(f"  {label}: {ev:.3f}")
    else:
        print("Baseline metrics unavailable.")


# With auto-run disabled, execute this step as soon as the cell runs.
if not RUN_ALL:
    baseline_appendix()


In [None]:
@register_step
def quick_smoke_evaluation():
    """Drive the agent through a few environment steps as a quick smoke test.

    The checkpoint recorded under ``TRAINING_RESULTS["best_model_path"]`` is
    loaded when present. Without one, the freshly constructed agent is used
    and a warning is printed. The agent then acts deterministically for 10
    calls to ``env.step``, the environment is reset whenever it reports
    ``done``, and the summed reward is printed.
    """
    from blackjackai_rl.agents import RainbowDQNAgent
    from blackjackai_rl.env import BlackjackEnv
    import numpy as np

    agent = RainbowDQNAgent(agent_config.to_agent())
    best_path = TRAINING_RESULTS.get("best_model_path")
    if best_path:
        agent.load(best_path)
    else:
        print("Warning: no trained checkpoint available; smoke-testing a freshly initialised agent.")
    env = BlackjackEnv(env_config.to_env())
    obs, info = env.reset()
    total_reward = 0.0
    # NOTE(review): this loop counts environment steps, while the message below
    # says "10-hand"; a hand appears to take more than one step. Confirm which
    # is intended.
    for _ in range(10):
        # The agent is called with a batch of one: a leading axis is added to
        # the observation and the info dict is wrapped in a list.
        action = agent.select_actions(obs[np.newaxis, :], [info], deterministic=True)[0]
        obs, reward, done, info = env.step(action)
        total_reward += reward
        if done:
            obs, info = env.reset()
    print("10-hand smoke reward:", total_reward)


# With auto-run disabled, execute this step as soon as the cell runs.
if not RUN_ALL:
    quick_smoke_evaluation()


In [None]:
# Final cell: with auto-run enabled, execute every registered step in order.
if RUN_ALL:
    run_all_pipeline()