# REINFORCE Training

This notebook contains a complete, self-contained REINFORCE (Policy Gradient) training pipeline for the Daladala environment.

**What's Included:**
- Full environment definition (5 discrete actions, 14-dimensional observation vector)
- 12 REINFORCE hyperparameter configurations for systematic tuning
- Training loop with a budget of 300,000 timesteps per configuration (≈858 episodes, computed from the 350-step episode cap)

**Output:** Best model + detailed results JSON saved to Google Drive

## Section 1: Install and Import Dependencies

In [None]:
# Install dependencies with the %pip magic so they land in the environment of
# the running kernel (a plain `!pip` shell call may target a different Python).
%pip install gymnasium torch pandas numpy opencv-python --quiet

In [None]:
import os
import json
import numpy as np
import pandas as pd
import gymnasium as gym
from gymnasium import spaces
import torch
import torch.nn as nn
import torch.optim as optim
from torch.distributions import Categorical
import warnings
warnings.filterwarnings('ignore')

# Pick the compute device once: CUDA when PyTorch can see a GPU, otherwise CPU.
gpu_available = torch.cuda.is_available()
device = torch.device('cuda') if gpu_available else torch.device('cpu')
print(f"✓ Using device: {device}")

# Extra diagnostics are only meaningful when a GPU is present.
if gpu_available:
    gpu_name = torch.cuda.get_device_name(0)
    print(f"  GPU: {gpu_name}")
    print(f"  CUDA Version: {torch.version.cuda}")

✓ Using device: cpu


## Section 2: Mount Google Drive (for saving models and results)

In [None]:
from google.colab import drive

# Mount the user's Google Drive so models and results outlive the Colab session.
drive.mount('/content/drive')

# Ensure both output folders exist (exist_ok makes re-running the cell harmless).
for output_dir in (
    '/content/drive/MyDrive/daladala_results/models/reinforce',
    '/content/drive/MyDrive/daladala_results/results',
):
    os.makedirs(output_dir, exist_ok=True)

print("✓ Google Drive mounted successfully")

Mounted at /content/drive
✓ Google Drive mounted successfully


## Section 3: Define the DaladalaEnv Environment

In [None]:
class DaladalaEnv(gym.Env):
    """Daladala (mini-bus) route environment with 5 discrete actions and a 14-dim observation.

    The bus follows a fixed 29-cell route (15 cells along y=14, then 14 cells
    along x=14) and advances one cell on every step regardless of the action.
    Actions: 0=Move, 1=Pickup, 2=Dropoff, 3=Stop, 4=SpeedUp.

    On every ``reset`` 3 police checkpoints and 4 traffic lights are placed on
    route cells that are not high-demand stops. Each light keeps one fixed
    state (Red=1, Green=0) for the whole episode.
    """
    metadata = {"render_modes": ["human", "rgb_array"], "render_fps": 12}

    def __init__(self, render_mode=None):
        """Create the spaces, the fixed route and the per-episode placeholders.

        Args:
            render_mode: Stored on the instance; ``render`` itself is a no-op.
        """
        super().__init__()
        self.observation_space = spaces.Box(-1, 1, shape=(14,), dtype=np.float32)
        self.action_space = spaces.Discrete(5)  # 0:Move, 1:Pickup, 2:Dropoff, 3:Stop, 4:SpeedUp
        self.render_mode = render_mode

        # Fixed route Ubungo → Posta (right then up)
        self.route = [(x, 14) for x in range(15)] + [(14, y) for y in range(13, -1, -1)]
        self.high_demand_stops = [(4,14), (8,14), (14,8), (14,3)]

        # These will be randomized each reset
        self.police_checkpoints = []
        self.traffic_lights = []
        self.traffic_light_states = {}  # Stores state (Red=1, Green=0) for each light

        # Pools for randomization
        self.available_positions = [pos for pos in self.route if pos not in self.high_demand_stops]

        self.max_steps = 350
        self.physical_max = 50
        self.light_cycle = 0  # Incremented every step; not read by the reward or observation code

    def reset(self, seed=None, options=None):
        """Start a new episode and return ``(observation, info)``.

        Hazard positions and light states are drawn from ``self.np_random``
        (the generator Gymnasium seeds in ``super().reset``), so passing
        ``seed`` makes the episode layout reproducible.

        Args:
            seed: Optional seed forwarded to ``gym.Env.reset``.
            options: Accepted for API compatibility; unused.

        Returns:
            Tuple of the initial observation and an empty info dict.
        """
        super().reset(seed=seed)
        self.step_count = 0
        self.passengers = 0
        self.money = 0.0
        self.pos_idx = 0
        self.speed = 0
        self.fined = False
        self.light_cycle = 0

        # Randomize hazard positions each episode
        available = [pos for pos in self.route if pos not in self.high_demand_stops]
        n_police, n_lights = 3, 4
        if len(available) >= n_police + n_lights:
            sampled = self.np_random.choice(len(available), n_police + n_lights, replace=False)
            self.police_checkpoints = [available[i] for i in sampled[:n_police]]
            self.traffic_lights = [available[i] for i in sampled[n_police:]]

            # One random state per light (Red=1, Green=0), constant for the episode
            self.traffic_light_states = {
                pos: int(self.np_random.integers(0, 2)) for pos in self.traffic_lights
            }
        else:
            # Not enough free cells: clear hazards rather than reuse the previous episode's
            self.police_checkpoints = []
            self.traffic_lights = []
            self.traffic_light_states = {}

        # Initial waiting passengers per stop, derived from the stop's hash (0-10).
        # Integer tuples hash without per-process salting, so the counts are the
        # same on every reset.
        self.passengers_at_stop = {stop: hash(stop) % 11 for stop in self.high_demand_stops}

        return self._get_obs(), {}

    def _hazards_at(self, pos):
        """Return ``(light_is_red, police_here, must_stop)`` as 0/1 flags for ``pos``."""
        light_is_red = self.traffic_light_states.get(pos, 0)
        police_here = 1 if pos in self.police_checkpoints else 0
        must_stop = 1 if (light_is_red or police_here) else 0
        return light_is_red, police_here, must_stop

    def _distance_to_next(self, targets, horizon=5):
        """Cells from the current position to the nearest of ``targets`` ahead.

        Only the next ``horizon`` route cells are inspected; ``horizon`` is
        returned when none of them is in ``targets``.
        """
        last = min(self.pos_idx + horizon + 1, len(self.route))
        for i in range(self.pos_idx + 1, last):
            if self.route[i] in targets:
                return i - self.pos_idx
        return horizon

    def _get_obs(self):
        """Build the 14-element float32 observation for the current position.

        Every component is scaled with ``value / max * 2 - 1``.
        """
        if self.pos_idx >= len(self.route):
            x, y = 14, 0
        else:
            x, y = self.route[self.pos_idx]

        # === CURRENT LOCATION HAZARDS ===
        light_is_red, police_here, must_stop_now = self._hazards_at((x, y))

        # === NEXT LOCATION HAZARDS === (clamped to the last route cell)
        next_idx = min(self.pos_idx + 1, len(self.route) - 1)
        _, _, must_stop_next = self._hazards_at(self.route[next_idx])

        # === PASSENGER STATE ===
        at_stop = 1 if (x, y) in self.high_demand_stops else 0
        passengers_waiting = self.passengers_at_stop.get((x, y), 0) if at_stop else 0

        # === DISTANCE AHEAD (look-ahead over the next 5 cells) ===
        dist_to_light = self._distance_to_next(self.traffic_lights)
        dist_to_police = self._distance_to_next(self.police_checkpoints)

        # === BUILD OBSERVATION VECTOR ===
        obs = np.array([
            x / 14.0 * 2 - 1,                      # [0] position_x
            y / 14.0 * 2 - 1,                      # [1] position_y
            self.passengers / self.physical_max * 2 - 1,  # [2] current_passengers
            self.money / 150000.0 * 2 - 1,        # [3] money_earned
            self.speed / 3.0 * 2 - 1,              # [4] current_speed
            light_is_red * 2 - 1,                  # [5] light_is_red_HERE
            police_here * 2 - 1,                   # [6] police_checkpoint_HERE
            must_stop_now * 2 - 1,                 # [7] must_stop_now_HERE
            at_stop * 2 - 1,                       # [8] at_high_demand_stop
            passengers_waiting / 10.0 * 2 - 1,    # [9] passengers_waiting_at_stop
            must_stop_next * 2 - 1,                # [10] must_stop_next_location
            dist_to_light / 5.0 * 2 - 1,          # [11] distance_to_traffic_light
            dist_to_police / 5.0 * 2 - 1,         # [12] distance_to_police
            self.step_count / self.max_steps * 2 - 1,  # [13] episode_progress
        ], dtype=np.float32)

        return obs

    def step(self, action):
        """Advance one cell, apply ``action`` and return the Gymnasium 5-tuple.

        Movement is automatic: the bus advances one route cell on every call.
        The action is judged against the cell the bus occupied *before* moving,
        which is the cell the previous observation described.

        The destination bonus is paid only when the bus was already on the last
        route cell and the episode did not end through a crash or an
        overloaded-at-police termination.

        Args:
            action: 0=Move, 1=Pickup, 2=Dropoff, 3=Stop, 4=SpeedUp.

        Returns:
            ``(observation, reward, terminated, truncated, info)``.
        """
        self.step_count += 1
        self.light_cycle += 1

        terminated = False
        reached_destination = False  # True only when the route end is reached
        episode_failed = False       # True on crash / overloaded at police
        x, y = self.route[self.pos_idx]

        # === PHASE 1: AUTOMATIC MOVEMENT (always happens) ===
        if self.pos_idx < len(self.route) - 1:
            self.pos_idx += 1
        else:
            reached_destination = True
            terminated = True

        # === PHASE 2: EXECUTE ACTION ===
        reward = 0.0

        # Hazards and stop status of the cell the agent acted on
        _, _, must_stop_here = self._hazards_at((x, y))
        at_stop = 1 if (x, y) in self.high_demand_stops else 0

        if action == 0:  # MOVE
            # Movement already happened automatically; small progress bonus
            reward += 2

            # Ran through a red light or police checkpoint
            if must_stop_here:
                reward -= 40

        elif action == 1:  # PICKUP
            if at_stop and self.passengers < self.physical_max:
                # Board at least 3 (or everyone waiting), capped by free seats
                base_add = max(3, self.passengers_at_stop.get((x, y), 0))
                add = min(base_add, self.physical_max - self.passengers)
                self.passengers += add
                reward += 15

                # Deduct waiting passengers
                if (x, y) in self.passengers_at_stop:
                    self.passengers_at_stop[(x, y)] = max(0, self.passengers_at_stop[(x, y)] - add)
            else:
                # Pickup away from a stop, or with a full bus
                reward -= 5

            # Picking up at a hazard cell
            if must_stop_here:
                reward -= 10

        elif action == 2:  # DROPOFF
            if at_stop and self.passengers > 0:
                drop = min(self.passengers, max(3, self.passengers // 2 + 1))
                self.passengers -= drop
                self.money += drop * 1000
                reward += 12
            else:
                # Dropoff away from a stop, or with an empty bus
                reward -= 8

            # Dropping off at a hazard cell
            if must_stop_here:
                reward -= 10

        elif action == 3:  # STOP (slows down / waits)
            self.speed = max(0, self.speed - 1)

            if must_stop_here:
                reward += 25  # Correct safety action
            else:
                reward -= 3   # Unnecessary stop

        elif action == 4:  # SPEEDUP
            if not must_stop_here and self.passengers <= 40:
                self.speed = min(self.speed + 1, 3)
                reward += 3
            else:
                if must_stop_here:
                    reward -= 15  # Speeding through a hazard
                if self.passengers > 40:
                    reward -= 30  # Overloaded and speeding: crash ends the episode
                    terminated = True
                    episode_failed = True

        # === PHASE 3: SAFETY VIOLATIONS ===
        # Check the cell the bus is on after the automatic movement
        new_x, new_y = self.route[self.pos_idx] if self.pos_idx < len(self.route) else (14, 0)

        if (new_x, new_y) in self.police_checkpoints:
            if self.passengers > 40:
                reward -= 50  # Severe: overloaded at police, episode ends
                self.fined = True
                terminated = True
                episode_failed = True
            elif self.passengers > 33:
                reward -= 20  # Violation: above legal capacity
                self.fined = True

        # === PHASE 4: PROGRESS & COMPLETION ===
        # Base per-step reward (small, to encourage progress)
        reward += 1

        # Completion bonus only for actually arriving, never for crash/bust endings
        if reached_destination and not episode_failed:
            reward += 100
            if self.passengers <= 33 and not self.fined:
                reward += 50  # Bonus for legal completion

        # === PHASE 5: STATE UPDATES ===
        truncated = self.step_count >= self.max_steps

        return self._get_obs(), reward, terminated, truncated, {}

    def render(self):
        """No-op: rendering is disabled for Colab."""
        pass

print("✓ DaladalaEnv class defined successfully")

✓ DaladalaEnv class defined successfully


## Section 4: Define REINFORCE Policy Network and Agent

In [None]:
class PolicyNetwork(nn.Module):
    """Two-hidden-layer MLP that maps a state to a probability for each action."""

    def __init__(self, state_dim, action_dim, hidden_dim):
        """Build the Linear → ReLU → Linear → ReLU → Linear stack.

        Args:
            state_dim: Size of the input state vector.
            action_dim: Number of discrete actions (output size).
            hidden_dim: Width of both hidden layers.
        """
        super().__init__()
        layers = [
            nn.Linear(state_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, action_dim),
        ]
        # Kept under the attribute name `net` so saved state_dict keys stay the same.
        self.net = nn.Sequential(*layers)

    def forward(self, state):
        """Return action probabilities (softmax over the last dimension)."""
        logits = self.net(state)
        return logits.softmax(dim=-1)

    def get_action_and_log_prob(self, state):
        """Sample an action for ``state``.

        Returns:
            Tuple ``(action, log_prob)``: the sampled action as a Python int
            (via ``.item()``) and the tensor holding its log-probability.
        """
        action_dist = Categorical(self.forward(state))
        sampled = action_dist.sample()
        return sampled.item(), action_dist.log_prob(sampled)

class REINFORCEAgent:
    """REINFORCE (Monte-Carlo policy gradient) agent with an Adam optimizer."""

    def __init__(self, state_dim, action_dim, hidden_dim, learning_rate, device='cpu', gamma=0.99):
        """Create the policy network and its optimizer.

        Args:
            state_dim: Size of the observation vector.
            action_dim: Number of discrete actions.
            hidden_dim: Width of the policy's hidden layers.
            learning_rate: Adam learning rate.
            device: Torch device (or device string) the policy lives on.
            gamma: Discount factor used when computing returns.
        """
        self.device = device
        self.gamma = gamma
        # Move the policy to its device before the optimizer captures its parameters.
        self.policy = PolicyNetwork(state_dim, action_dim, hidden_dim).to(self.device)
        self.optimizer = optim.Adam(self.policy.parameters(), lr=learning_rate)

    def _to_tensor(self, obs):
        """Convert one observation into a float32 batch of size 1 on the agent's device."""
        return torch.as_tensor(obs, dtype=torch.float32, device=self.device).unsqueeze(0)

    def _discounted_returns(self, rewards):
        """Return the discounted return G_t for every step of one episode."""
        returns = []
        running = 0.0
        # Accumulate backwards, then reverse once (linear time; no list.insert(0, ...)).
        for reward in reversed(rewards):
            running = reward + self.gamma * running
            returns.append(running)
        returns.reverse()
        return returns

    def train_episode(self, env):
        """Roll out one episode with the sampling policy and apply one gradient step.

        Args:
            env: Environment exposing Gymnasium-style ``reset()`` and ``step()``.

        Returns:
            The undiscounted sum of rewards collected in the episode.
        """
        obs, _ = env.reset()
        log_probs = []
        rewards = []
        done = False

        while not done:
            action, log_prob = self.policy.get_action_and_log_prob(self._to_tensor(obs))
            obs, reward, terminated, truncated, _ = env.step(action)

            log_probs.append(log_prob)
            rewards.append(reward)
            done = terminated or truncated

        returns = torch.tensor(
            self._discounted_returns(rewards), dtype=torch.float32, device=self.device
        )

        # Normalize returns. The sample std of a single value is NaN, which would
        # turn the loss and then every weight into NaN, so one-step episodes keep
        # their raw return.
        if returns.numel() > 1:
            returns = (returns - returns.mean()) / (returns.std() + 1e-8)

        # REINFORCE loss: -sum_t log pi(a_t | s_t) * G_t
        stacked_log_probs = torch.stack(log_probs).reshape(-1)
        policy_loss = -(stacked_log_probs * returns).sum()

        self.optimizer.zero_grad()
        policy_loss.backward()
        self.optimizer.step()

        return sum(rewards)

    def evaluate(self, env, n_episodes=50):
        """Run greedy (argmax) episodes without updating the policy.

        Args:
            env: Environment exposing Gymnasium-style ``reset()`` and ``step()``.
            n_episodes: Number of evaluation episodes.

        Returns:
            Tuple ``(mean, std)`` of the per-episode total rewards.
        """
        rewards = []
        for _ in range(n_episodes):
            obs, _ = env.reset()
            total_reward = 0
            done = False

            while not done:
                with torch.no_grad():
                    probs = self.policy(self._to_tensor(obs))
                    action = probs.argmax(dim=-1).item()
                obs, reward, terminated, truncated, _ = env.step(action)
                total_reward += reward
                done = terminated or truncated

            rewards.append(total_reward)

        return np.mean(rewards), np.std(rewards)

    def save(self, path):
        """Save the policy's state_dict to ``path + '_policy.pth'``."""
        torch.save(self.policy.state_dict(), path + '_policy.pth')

    def load(self, path):
        """Load the policy's state_dict from ``path + '_policy.pth'`` onto the agent's device."""
        # NOTE(review): torch.load unpickles the file; only load checkpoints you trust.
        self.policy.load_state_dict(torch.load(path + '_policy.pth', map_location=self.device))

print("✓ REINFORCE Policy Network and Agent defined successfully")

✓ REINFORCE Policy Network and Agent defined successfully


## Section 5: Define REINFORCE Hyperparameter Configurations

In [None]:
# 12 REINFORCE hyperparameter configurations for systematic tuning:
# 4 learning rates x 3 hidden sizes, listed in the same order as the original grid.
def _reinforce_config(lr_tag, learning_rate, hidden_dim):
    """Build one configuration dict; the name encodes the learning rate and hidden size."""
    return {
        "name": f"LR_{lr_tag}_hid_{hidden_dim}",
        "learning_rate": learning_rate,
        "hidden_dim": hidden_dim,
    }


_learning_rates = [("1e3", 1e-3), ("3e3", 3e-3), ("5e3", 5e-3), ("1e2", 1e-2)]

reinforce_configs = [
    # Hidden sizes 64 and 128 for each learning rate, ascending learning rate
    _reinforce_config(lr_tag, lr_value, hidden)
    for lr_tag, lr_value in _learning_rates
    for hidden in (64, 128)
] + [
    # Hidden size 256, descending learning rate
    _reinforce_config(lr_tag, lr_value, 256)
    for lr_tag, lr_value in reversed(_learning_rates)
]

print(f"✓ {len(reinforce_configs)} REINFORCE configurations defined")

✓ 12 REINFORCE configurations defined


## Section 6: Train REINFORCE with All Configurations

In [None]:
import time

# Fixed seed so every configuration starts from the same random state and a
# re-run of this cell reproduces the same numbers.
SEED = 42

results = {}
best_reward = -float('inf')
best_config = None
best_agent = None

total_configs = len(reinforce_configs)
state_dim = 14
action_dim = 5

# Training parameters
# NOTE: the episode budget is derived from the 350-step episode cap. Episodes
# that terminate earlier use fewer steps, so the real step count (tracked in
# `total_steps` below) can end well under `target_steps`.
target_steps = 300000
steps_per_episode = 350
episodes_per_config = (target_steps + steps_per_episode - 1) // steps_per_episode  # ceil -> 858 episodes

# Wall-clock start for the whole sweep (start_time below is reset per configuration).
overall_start = time.time()

for idx, config in enumerate(reinforce_configs, 1):
    print(f"\n{'='*70}")
    print(f"Training Configuration {idx}/{total_configs}: {config['name']}")
    print(f"{'='*70}")
    print(f"Learning Rate: {config['learning_rate']}, Hidden Dim: {config['hidden_dim']}")
    print(f"Target: {target_steps:,} timesteps (~{episodes_per_config} episodes)")
    print(f"Device: {device}\n")

    # Seed every RNG involved: torch (weights, action sampling), NumPy's global
    # generator, and the environment's own generator via reset(seed=...).
    torch.manual_seed(SEED)
    np.random.seed(SEED)

    # Create environment
    env = DaladalaEnv()
    env.reset(seed=SEED)

    # Initialize REINFORCE agent on the selected device
    agent = REINFORCEAgent(
        state_dim=state_dim,
        action_dim=action_dim,
        hidden_dim=config['hidden_dim'],
        learning_rate=config['learning_rate'],
        device=device
    )

    # Training loop with verbose progress
    start_time = time.time()
    episode_rewards = []
    total_steps = 0

    for episode in range(episodes_per_config):
        ep_reward = agent.train_episode(env)
        episode_rewards.append(ep_reward)
        # Count the steps the episode actually took, not the 350-step cap.
        total_steps += env.step_count

        # Print progress every 50 episodes
        if (episode + 1) % 50 == 0 or episode == 0:
            # A [-50:] slice already falls back to "all episodes so far" when fewer than 50 exist
            recent_avg = np.mean(episode_rewards[-50:])
            elapsed = time.time() - start_time
            eps_per_sec = (episode + 1) / elapsed
            eta_sec = (episodes_per_config - episode - 1) / eps_per_sec if eps_per_sec > 0 else 0

            print(f"  Episode {episode+1:4d}/{episodes_per_config} | "
                  f"Recent Avg: {recent_avg:7.2f} | "
                  f"Last Reward: {ep_reward:7.2f} | "
                  f"Steps: {total_steps:,} | "
                  f"ETA: {int(eta_sec//60):3d}m {int(eta_sec%60):02d}s")

    training_time = time.time() - start_time

    # Evaluate agent on 50 greedy episodes
    print(f"\n  Evaluating on 50 episodes...")
    eval_start = time.time()
    mean_reward, std_reward = agent.evaluate(env, n_episodes=50)
    eval_time = time.time() - eval_start

    results[config['name']] = {
        'config': config,
        'mean_reward': mean_reward,
        'std_reward': std_reward,
        'training_time': training_time,
        'eval_time': eval_time
    }

    print(f"  ✓ Evaluation Complete!")
    print(f"    Mean Reward: {mean_reward:.2f} (±{std_reward:.2f})")
    print(f"    Training Time: {int(training_time//60)}m {int(training_time%60)}s")

    # Track best model
    if mean_reward > best_reward:
        best_reward = mean_reward
        best_config = config['name']
        best_agent = agent
        print(f"    ★ NEW BEST MODEL! ★")

    env.close()

print(f"\n{'='*70}")
print(f"ALL TRAINING COMPLETE!")
print(f"{'='*70}")
print(f"Best Configuration: {best_config}")
print(f"Best Mean Reward: {best_reward:.2f}")
# Measured from the start of the sweep, not from the last configuration's start_time.
print(f"Total Time: {int((time.time() - overall_start)//60)}m")


Training Configuration 1/12: LR_1e3_hid_64
Learning Rate: 0.001, Hidden Dim: 64
Target: 300,000 timesteps (~858 episodes)
Device: cpu

  Episode    1/858 | Recent Avg:  117.00 | Last Reward:  117.00 | Steps: 350 | ETA:   3m 09s
  Episode   50/858 | Recent Avg:   87.62 | Last Reward:   68.00 | Steps: 17,500 | ETA:   0m 26s
  Episode  100/858 | Recent Avg:  106.98 | Last Reward:   74.00 | Steps: 35,000 | ETA:   0m 23s
  Episode  150/858 | Recent Avg:  168.84 | Last Reward:  201.00 | Steps: 52,500 | ETA:   0m 21s
  Episode  200/858 | Recent Avg:  257.44 | Last Reward:  282.00 | Steps: 70,000 | ETA:   0m 20s
  Episode  250/858 | Recent Avg:  289.62 | Last Reward:  286.00 | Steps: 87,500 | ETA:   0m 19s
  Episode  300/858 | Recent Avg:  297.54 | Last Reward:  317.00 | Steps: 105,000 | ETA:   0m 18s
  Episode  350/858 | Recent Avg:  315.90 | Last Reward:  392.00 | Steps: 122,500 | ETA:   0m 18s
  Episode  400/858 | Recent Avg:  338.26 | Last Reward:  369.00 | Steps: 140,000 | ETA:   0m 17s


## Section 7: Save Best Model and Results

In [None]:
# Persist the weights of the best-scoring agent (the agent appends '_policy.pth').
best_model_path = '/content/drive/MyDrive/daladala_results/models/reinforce/best_reinforce'
best_agent.save(best_model_path)
print(f"✓ Best model saved to: {best_model_path}")

# Build a JSON-friendly summary: NumPy scalars are cast to plain floats.
results_json_path = '/content/drive/MyDrive/daladala_results/results/reinforce_results.json'
results_summary = {
    config_name: {
        'mean_reward': float(entry['mean_reward']),
        'std_reward': float(entry['std_reward']),
        'hyperparameters': {
            'learning_rate': entry['config']['learning_rate'],
            'hidden_dim': entry['config']['hidden_dim'],
        },
    }
    for config_name, entry in results.items()
}

with open(results_json_path, 'w') as json_file:
    json.dump(results_summary, json_file, indent=2)
print(f"✓ Results saved to: {results_json_path}")

# One table row per configuration, in training order.
table_rows = []
for name, entry in results.items():
    table_rows.append({
        'Config': name,
        'Mean Reward': f"{entry['mean_reward']:.2f}",
        'Std Reward': f"{entry['std_reward']:.2f}",
        'LR': entry['config']['learning_rate'],
        'Hidden': entry['config']['hidden_dim'],
    })
results_df = pd.DataFrame(table_rows)

# Display results table
separator = "=" * 80
print("\n" + separator)
print("RESULTS SUMMARY - All 12 REINFORCE Configurations")
print(separator)
print(results_df.to_string(index=False))
print(separator)

✓ Best model saved to: /content/drive/MyDrive/daladala_results/models/reinforce/best_reinforce
✓ Results saved to: /content/drive/MyDrive/daladala_results/results/reinforce_results.json

RESULTS SUMMARY - All 12 REINFORCE Configurations
        Config Mean Reward Std Reward    LR  Hidden
 LR_1e3_hid_64      423.12      20.62 0.001      64
LR_1e3_hid_128      402.62      22.73 0.001     128
 LR_3e3_hid_64      376.00      23.69 0.003      64
LR_3e3_hid_128      376.00      23.69 0.003     128
 LR_5e3_hid_64      381.28      21.36 0.005      64
LR_5e3_hid_128       45.00      37.52 0.005     128
 LR_1e2_hid_64      237.04      24.21 0.010      64
LR_1e2_hid_128      236.48      28.20 0.010     128
LR_1e2_hid_256      358.44      23.02 0.010     256
LR_5e3_hid_256      225.84      28.16 0.005     256
LR_3e3_hid_256      354.30      22.18 0.003     256
LR_1e3_hid_256      406.72      17.40 0.001     256


## Section 8: Test Best Model on Sample Episodes

In [None]:
# Run the best model greedily on 5 sample episodes and report each total reward.
print(f"\nTesting best model ({best_config}) on 5 sample episodes:")
print("="*60)

env = DaladalaEnv()
episode_rewards = []

for ep in range(5):
    obs, _ = env.reset()
    total_reward = 0
    done = False
    steps = 0

    while not done:
        # The observation must live on the same device as the policy; without
        # the .to(...) this cell fails when the agent was trained on CUDA.
        obs_tensor = torch.FloatTensor(obs).unsqueeze(0).to(best_agent.device)
        with torch.no_grad():
            probs = best_agent.policy(obs_tensor)
            action = probs.argmax(dim=-1).item()  # greedy action
        obs, reward, terminated, truncated, _ = env.step(action)
        total_reward += reward
        done = terminated or truncated
        steps += 1

    episode_rewards.append(total_reward)
    print(f"Episode {ep+1}: Reward = {total_reward:.2f} (Steps: {steps})")

env.close()

print("="*60)
print(f"Sample Episodes Mean Reward: {np.mean(episode_rewards):.2f}")
print(f"Sample Episodes Std Reward:  {np.std(episode_rewards):.2f}")
print("\n✓ REINFORCE training and evaluation complete!")
print(f"✓ Models and results saved to Google Drive")


Testing best model (LR_1e3_hid_64) on 5 sample episodes:
Episode 1: Reward = 402.00 (Steps: 29)
Episode 2: Reward = 446.00 (Steps: 29)
Episode 3: Reward = 446.00 (Steps: 29)
Episode 4: Reward = 424.00 (Steps: 29)
Episode 5: Reward = 402.00 (Steps: 29)
Sample Episodes Mean Reward: 424.00
Sample Episodes Std Reward:  19.68

✓ REINFORCE training and evaluation complete!
✓ Models and results saved to Google Drive
