# Stratego RL Training on Google Colab

This notebook trains a PPO agent to play Stratego using GPU acceleration.

**Setup:**
1. Runtime → Change runtime type → T4 GPU (or A100 if available with Colab Pro)
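2. Run the setup cells in order, then run **either** Option 1 (fresh training) **or** Option 2 (resume from a checkpoint) — not both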
3. Models will be saved to your Google Drive

**Expected Training Time:**
- 2M timesteps on T4 GPU: ~6-8 hours
- 2M timesteps on A100: ~2-3 hours

In [None]:
# Check GPU availability
# Runs the `nvidia-smi` shell command and shows its output in the cell, so you can
# confirm which GPU (e.g. the T4 or A100 mentioned in Setup) is attached to this runtime.
!nvidia-smi

In [None]:
# Mount Google Drive for checkpoint persistence
from google.colab import drive
drive.mount('/content/drive')

# Create directories in Google Drive
!mkdir -p /content/drive/MyDrive/Stratego_RL/models
!mkdir -p /content/drive/MyDrive/Stratego_RL/logs

In [None]:
# Clone your repository
!git clone https://github.com/charlie-tucker1/Stratego_RL.git
%cd Stratego_RL

In [None]:
# Install dependencies
!pip install -q -r requirements.txt

In [None]:
# Confirm that PyTorch is installed and can reach the GPU
import torch

has_cuda = torch.cuda.is_available()

print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {has_cuda}")

# Device details are only meaningful (and only queried) when CUDA is usable
if has_cuda:
    device_name = torch.cuda.get_device_name(0)
    print(f"GPU: {device_name}")
    print(f"CUDA version: {torch.version.cuda}")

## Option 1: Start Fresh Training

In [None]:
# Update train.py to save to Google Drive
# This ensures checkpoints persist even if runtime disconnects.
# The edit is a literal text substitution on the LOG_DIR / MODEL_DIR assignments,
# so it is verified: if an expected line is missing the cell fails instead of
# reporting success while training would still write to the ephemeral local disk.

import os

TRAIN_SCRIPT = 'train.py'

# Exact line expected in train.py -> replacement pointing at Google Drive
PATH_REWRITES = {
    'LOG_DIR = "./logs"': 'LOG_DIR = "/content/drive/MyDrive/Stratego_RL/logs"',
    'MODEL_DIR = "./models"': 'MODEL_DIR = "/content/drive/MyDrive/Stratego_RL/models"',
}

# Read the training script
with open(TRAIN_SCRIPT, 'r') as f:
    content = f.read()

# Update paths to use Google Drive
unmatched = []
for old_line, new_line in PATH_REWRITES.items():
    if old_line in content:
        content = content.replace(old_line, new_line)
    elif new_line not in content:
        # Neither the original nor the already-patched line is present
        unmatched.append(old_line)

if unmatched:
    # Raise before writing so train.py is never left half-patched
    raise RuntimeError(
        f"Could not redirect paths in {TRAIN_SCRIPT}; expected line(s) not found: {unmatched}"
    )

# Write back
with open(TRAIN_SCRIPT, 'w') as f:
    f.write(content)

print("Updated train.py to save to Google Drive")

In [None]:
# Start training
# Runs train.py as a shell command from the current directory (the cloned repo).
# Where logs and checkpoints land is decided by LOG_DIR / MODEL_DIR inside train.py,
# which the previous cell rewrites to point at Google Drive.
!python train.py

## Option 2: Resume from Existing Checkpoint

In [None]:
# Upload your local checkpoints to Google Drive first, then run this
# This will continue training from your 1.75M step model (or whichever checkpoint
# CHECKPOINT_PATH points to) until the counter reaches TOTAL_TIMESTEPS.

import os
import sys

import numpy as np
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.callbacks import CheckpointCallback
from sb3_contrib import MaskablePPO
from sb3_contrib.common.wrappers import ActionMasker

# Make the cloned repo importable BEFORE any project import below, so this cell
# does not depend on the kernel's current working directory.
REPO_DIR = '/content/Stratego_RL'
if REPO_DIR not in sys.path:
    sys.path.insert(0, REPO_DIR)

from stratego_logic import StrategoEnv
from train import MetricsCallback, mask_fn, make_env

# Configuration
CHECKPOINT_PATH = "/content/drive/MyDrive/Stratego_RL/models/stratego_ppo_1750000_steps.zip"
TOTAL_TIMESTEPS = 2_000_000
RESUME_FROM_STEP = 1_750_000
REMAINING_STEPS = TOTAL_TIMESTEPS - RESUME_FROM_STEP

LOG_DIR = "/content/drive/MyDrive/Stratego_RL/logs"
MODEL_DIR = "/content/drive/MyDrive/Stratego_RL/models"
SAVE_FREQ = 50_000

print(f"Loading model from: {CHECKPOINT_PATH}")
print(f"Resuming from step: {RESUME_FROM_STEP:,}")
print(f"Remaining steps: {REMAINING_STEPS:,}")

# Load the model
model = MaskablePPO.load(
    CHECKPOINT_PATH,
    tensorboard_log=LOG_DIR,
    device="auto"
)

# The checkpoint carries its own step counter; prefer it over the hand-entered
# constant so changing CHECKPOINT_PATH alone cannot over- or under-train.
checkpoint_step = model.num_timesteps
if checkpoint_step != RESUME_FROM_STEP:
    print(
        f"Note: checkpoint is at step {checkpoint_step:,}, not {RESUME_FROM_STEP:,}; "
        "using the checkpoint's value"
    )
    RESUME_FROM_STEP = checkpoint_step
    REMAINING_STEPS = max(TOTAL_TIMESTEPS - RESUME_FROM_STEP, 0)
    print(f"Remaining steps: {REMAINING_STEPS:,}")

# Create environment
env = DummyVecEnv([make_env])
model.set_env(env)

# Setup callbacks
metrics_callback = MetricsCallback(eval_freq=10_000, n_eval_episodes=10)
checkpoint_callback = CheckpointCallback(
    save_freq=SAVE_FREQ,
    save_path=MODEL_DIR,
    name_prefix="stratego_ppo"
)

print("\nContinuing training...")

# Continue training
# With reset_num_timesteps=False the loaded step counter is kept, and
# total_timesteps is the number of additional steps to run.
training_diverged = False
try:
    model.learn(
        total_timesteps=REMAINING_STEPS,
        callback=[metrics_callback, checkpoint_callback],
        progress_bar=True,
        reset_num_timesteps=False  # Keep timestep counter
    )
except KeyboardInterrupt:
    print("\n\nTraining interrupted by user")
except ValueError as e:
    # Only the "Simplex()" distribution-validation failure is treated as a
    # recoverable stop; any other ValueError is re-raised untouched.
    if "Simplex()" not in str(e):
        raise
    training_diverged = True
    print("\n\nTraining stopped due to numerical stability issue")
    print("Your checkpoints are saved!")

# Save final model
final_path = os.path.join(MODEL_DIR, "stratego_ppo_final")
if training_diverged:
    # After a numerical failure the in-memory weights are likely unusable; do not
    # overwrite the "final" model with them. Resume from the latest checkpoint.
    print(f"\nSkipping final save; use the most recent checkpoint in: {MODEL_DIR}")
else:
    model.save(final_path)
    print(f"\nFinal model saved to: {final_path}")

## Monitor Training with TensorBoard

In [None]:
# Load TensorBoard in the notebook
# The --logdir below is the same Google Drive folder the training cells use as LOG_DIR,
# so runs from both Option 1 and Option 2 appear here.
%load_ext tensorboard
%tensorboard --logdir /content/drive/MyDrive/Stratego_RL/logs

## Evaluate Trained Model

In [None]:
# Evaluate a specific checkpoint
# Invokes train.py with `--eval <checkpoint path>` and `--episodes 50`; how those flags
# are interpreted is defined in train.py (not shown in this notebook).
# Edit the path to evaluate a different checkpoint from the Drive models folder.
!python train.py --eval /content/drive/MyDrive/Stratego_RL/models/stratego_ppo_1750000_steps.zip --episodes 50

## Download Checkpoints to Local Machine

In [None]:
# Zip all models for easy download
!zip -r stratego_models.zip /content/drive/MyDrive/Stratego_RL/models/

# Download via Colab files panel or use:
from google.colab import files
files.download('stratego_models.zip')

## Watch Agent Play

In [None]:
# Watch the agent play a game with rendering
import numpy as np
from sb3_contrib import MaskablePPO
from stratego_logic import StrategoEnv

MAX_STEPS = 500       # hard cap so a game that never terminates cannot loop forever
PROGRESS_EVERY = 10   # print the running reward every N steps
# Size of the all-ones fallback mask used when the env's info dict has no
# "action_mask" entry — presumably the size of the action space; TODO confirm.
N_ACTIONS = 3600

# Load model
model = MaskablePPO.load("/content/drive/MyDrive/Stratego_RL/models/stratego_ppo_1750000_steps.zip")

# Create environment with rendering
env = StrategoEnv(render_mode="human")

done = False
total_reward = 0
step = 0

try:
    obs, info = env.reset()

    print("Starting game...\n")

    while step < MAX_STEPS:
        action_mask = info.get("action_mask", np.ones(N_ACTIONS))
        action, _ = model.predict(obs, action_masks=action_mask, deterministic=True)

        obs, reward, done, truncated, info = env.step(action)
        total_reward += reward
        step += 1

        if step % PROGRESS_EVERY == 0:
            print(f"Step {step}: Reward = {total_reward:.2f}")

        # Stop on either a finished game or an env-imposed truncation
        if done or truncated:
            break

    # Tolerate an env without a `game` attribute, or a game without `winner`
    winner = getattr(getattr(env, "game", None), "winner", "Unknown")

    print(f"\nGame ended after {step} steps")
    print(f"Total reward: {total_reward:.2f}")
    print(f"Winner: {winner}")
finally:
    # Always release the rendering resources, even if the game loop raised
    env.close()