# Raahi-v2 Transit RL Training on Google Colab

This notebook trains reinforcement learning agents for transit optimization using synthetic city data.
Run on Google Colab with GPU for faster training.

In [None]:
# Pick the compute device: CUDA when PyTorch can see a GPU, otherwise CPU.
import torch

cuda_ready = torch.cuda.is_available()
print(f"CUDA available: {cuda_ready}")
if cuda_ready:
    # Report the name of the first visible GPU.
    print(f"GPU: {torch.cuda.get_device_name(0)}")
device = torch.device("cuda" if cuda_ready else "cpu")
print(f"Using device: {device}")

In [None]:
# Install required packages
!pip install stable-baselines3[extra]
!pip install tensorboard
!pip install gymnasium
!pip install matplotlib
!pip install numpy
!pip install torch

In [None]:
# Import libraries
import json
import random
import numpy as np
import torch
import torch.nn as nn
import matplotlib.pyplot as plt
from collections import defaultdict, deque
from dataclasses import dataclass
from typing import List, Dict, Tuple
import gymnasium as gym
from stable_baselines3 import PPO
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.callbacks import EvalCallback
import os

In [None]:
# City Builder class for generating training data
@dataclass
class Station:
    """Typed record describing one transit station.

    NOTE(review): nothing in the visible notebook instantiates this class;
    CityBuilder emits plain dicts with similar keys ("id", "x", "y", "type",
    "is_transfer") instead.
    """

    # Station identifier.
    id: str
    # Position; presumably grid coordinates matching CityBuilder's x/y - confirm.
    x: int
    y: int
    # Presumably one of CityBuilder.station_types ("bus", "tram", "metro") - confirm.
    station_type: str
    # Transfer flag; defaults to False.
    is_transfer: bool = False

class CityBuilder:
    """Random generator of synthetic grid cities.

    A city is a plain dict holding the grid size, one zone per grid cell,
    a random set of stations, walking connections between nearby stations
    and one route per transit mode that has at least two stations.
    All randomness comes from the global ``random`` module.
    """

    def __init__(self, grid_size: int = 8):
        """Store the grid edge length and the allowed zone/station types."""
        self.grid_size = grid_size
        self.zone_types = ["residential", "commercial", "industrial"]
        self.station_types = ["bus", "tram", "metro"]

    def generate_random_city(self) -> Dict:
        """Build and return one complete random city description."""
        zones = self._generate_zones()
        stations = self._generate_stations()
        return {
            "grid_size": self.grid_size,
            "zones": zones,
            "stations": stations,
            "connections": self._generate_connections(stations),
            "routes": self._generate_routes(stations),
        }

    def _generate_zones(self) -> List[Dict]:
        """Return one randomly typed zone for every (x, y) grid cell."""
        axis = range(self.grid_size)
        return [
            {"x": col, "y": row, "type": random.choice(self.zone_types)}
            for col in axis
            for row in axis
        ]

    def _generate_stations(self) -> List[Dict]:
        """Return 5 to 15 stations at random cells with random modes.

        Cells are sampled independently, so two stations may share a cell.
        """
        count = random.randint(5, 15)
        last_cell = self.grid_size - 1
        return [
            {
                "id": f"station_{number}",
                "x": random.randint(0, last_cell),
                "y": random.randint(0, last_cell),
                "type": random.choice(self.station_types),
                "is_transfer": False,
            }
            for number in range(count)
        ]

    def _generate_connections(self, stations: List[Dict]) -> List[Dict]:
        """Link every unordered station pair at Manhattan distance <= 3.

        ``walk_time`` is the distance multiplied by 60.
        """
        links = []
        for position, origin in enumerate(stations):
            for target in stations[position + 1:]:
                manhattan = (
                    abs(origin["x"] - target["x"])
                    + abs(origin["y"] - target["y"])
                )
                if manhattan > 3:
                    continue
                links.append({
                    "from": origin["id"],
                    "to": target["id"],
                    "walk_time": manhattan * 60,
                })
        return links

    def _generate_routes(self, stations: List[Dict]) -> List[Dict]:
        """Create one route per mode that has at least two stations.

        Each route keeps at most the first four stations of its mode and is
        given a random six-digit hex color.
        """
        by_mode = defaultdict(list)
        for stop in stations:
            by_mode[stop["type"]].append(stop)

        served = [
            (mode, members)
            for mode, members in by_mode.items()
            if len(members) >= 2
        ]

        routes = []
        for number, (mode, members) in enumerate(served):
            hex_digits = "".join(
                random.choice("0123456789ABCDEF") for _ in range(6)
            )
            routes.append({
                "id": f"route_{number}",
                "mode": mode,
                "stations": [member["id"] for member in members[:4]],
                "color": f"#{hex_digits}",
            })
        return routes

# Status message shown once the definitions in this cell have been evaluated.
print("City Builder ready")

In [None]:
# Transit Environment for RL training with passenger demand
class TransitEnv(gym.Env):
    """Gymnasium environment in which an agent sets transit route frequencies.

    Each episode uses one synthetic city (taken round-robin from
    ``training_cities`` when provided, otherwise freshly generated by
    ``CityBuilder``) and lasts ``max_steps`` steps. Every step advances the
    clock by one hour.

    Action:
        10 floats in [0, 10], one frequency per route slot.

    Observation:
        Flat float32 vector made of four ``grid_size * grid_size`` binary
        planes (residential, commercial, industrial, station present), a
        24-slot one-hot hour of day and a 10-slot thermometer encoding of
        the current demand level.

    Randomness for the start hour and the demand model is drawn from
    ``self.np_random`` so that ``reset(seed=...)`` makes it reproducible.
    Cities generated on the fly by ``CityBuilder`` still use the global
    ``random`` module and are not controlled by that seed.
    """

    def __init__(self, grid_size=8, training_cities=None):
        """Create the spaces and start a first episode.

        Args:
            grid_size: Edge length of the square city grid. Cities passed in
                ``training_cities`` must fit inside this grid.
            training_cities: Optional list of pre-generated city dicts that
                are cycled through on successive resets.
        """
        super().__init__()
        self.grid_size = grid_size
        self.city_builder = CityBuilder(grid_size)
        self.training_cities = training_cities or []
        self.city_index = 0

        # Action space: choose route frequency for each route (0-10)
        self.action_space = gym.spaces.Box(
            low=0, high=10, shape=(10,), dtype=np.float32
        )

        # Observation space: city state + time + demand
        obs_size = grid_size * grid_size * 4 + 24 + 10  # zones + time + demand
        self.observation_space = gym.spaces.Box(
            low=0, high=1, shape=(obs_size,), dtype=np.float32
        )

        # Populate city_data / demand state so the env is usable immediately.
        # Note that this consumes the first entry of training_cities.
        self.reset()

    def _uniform(self, low, high):
        """Draw one float uniformly from [low, high) using the env's seeded RNG."""
        return float(self.np_random.uniform(low, high))

    def reset(self, seed=None, options=None):
        """Start a new episode and return ``(observation, info)``.

        Args:
            seed: Optional seed forwarded to ``gym.Env.reset``; it reseeds
                ``self.np_random``, which drives the start hour and demand.
            options: Unused.
        """
        super().reset(seed=seed)

        # Use pre-generated cities if available, otherwise generate new
        if self.training_cities:
            self.city_data = self.training_cities[self.city_index % len(self.training_cities)]
            self.city_index += 1
        else:
            self.city_data = self.city_builder.generate_random_city()

        self.current_step = 0
        self.max_steps = 100
        # Random start hour in 0..23 (upper bound of integers() is exclusive).
        self.current_hour = int(self.np_random.integers(0, 24))

        # Generate passenger demand patterns
        self._generate_demand_patterns()

        obs = self._get_observation()
        return obs, {}

    def step(self, action):
        """Advance one hour and score the chosen route frequencies.

        Args:
            action: Array-like of 10 route frequencies. Values are clipped to
                the action-space bounds before use, so out-of-range inputs
                cannot drive the wait-time term towards a division by zero.

        Returns:
            ``(observation, reward, terminated, truncated, info)``. The
            episode has no terminal state; reaching ``max_steps`` is a time
            limit and is therefore reported through ``truncated``.
        """
        route_frequencies = np.clip(
            np.asarray(action, dtype=np.float32),
            self.action_space.low,
            self.action_space.high,
        )

        # Update time and passenger demand
        self.current_hour = (self.current_hour + 1) % 24
        current_demand = self._get_current_demand()

        # Calculate reward based on efficiency and passenger satisfaction
        reward = self._calculate_reward(route_frequencies, current_demand)

        self.current_step += 1
        terminated = False
        truncated = self.current_step >= self.max_steps

        obs = self._get_observation()
        return obs, reward, terminated, truncated, {}

    def _generate_demand_patterns(self):
        """Populate ``self.zone_demand`` and ``self.od_pairs`` for the current city."""
        stations = self.city_data["stations"]

        # Per-zone demand profile keyed by (x, y); the keys present depend on
        # the zone type.
        self.zone_demand = {}
        for zone in self.city_data["zones"]:
            cell = (zone["x"], zone["y"])
            zone_type = zone["type"]
            if zone_type == "residential":
                # High morning outbound, evening inbound
                self.zone_demand[cell] = {
                    "morning_out": self._uniform(0.7, 1.0),
                    "evening_in": self._uniform(0.7, 1.0),
                }
            elif zone_type == "commercial":
                # High morning inbound, evening outbound
                self.zone_demand[cell] = {
                    "morning_in": self._uniform(0.6, 0.9),
                    "evening_out": self._uniform(0.6, 0.9),
                }
            else:  # industrial
                # Steady demand throughout day
                self.zone_demand[cell] = {
                    "steady": self._uniform(0.3, 0.6),
                }

        # Ordered origin/destination pairs among the first 8 stations
        # (limit keeps the number of pairs small).
        self.od_pairs = []
        candidates = stations[:8]
        for i, s1 in enumerate(candidates):
            for j, s2 in enumerate(candidates):
                if i == j:
                    continue
                # Distance-based demand (closer = more likely), floored at 0.1
                dist = abs(s1["x"] - s2["x"]) + abs(s1["y"] - s2["y"])
                demand_weight = max(0.1, 1.0 - dist / 10.0)

                self.od_pairs.append({
                    "origin": s1["id"],
                    "destination": s2["id"],
                    "base_demand": self._uniform(0.1, demand_weight),
                    "peak_multiplier": self._uniform(1.5, 3.0),
                })

    def _get_current_demand(self):
        """Return the total passenger demand for ``self.current_hour``, capped at 10.

        Every call draws fresh +/-20% noise per OD pair, so two calls for the
        same hour generally return different values.
        """
        hour = self.current_hour
        is_rush = 7 <= hour <= 9 or 17 <= hour <= 19

        # Time-based demand multipliers
        if is_rush:  # Morning or evening rush
            time_multiplier = 2.0
        elif 6 <= hour <= 22:  # Day time
            time_multiplier = 1.0
        else:  # Night
            time_multiplier = 0.3

        total_demand = 0.0
        for od_pair in self.od_pairs:
            demand = od_pair["base_demand"] * time_multiplier
            if is_rush:
                # Rush hours additionally apply the pair's own peak multiplier
                demand *= od_pair["peak_multiplier"]

            # Add random variation
            demand *= self._uniform(0.8, 1.2)
            total_demand += demand

        return min(total_demand, 10.0)  # Cap at 10

    def _get_observation(self):
        """Encode the city, hour and a fresh demand sample as a float32 vector."""
        obs = np.zeros(self.observation_space.shape, dtype=np.float32)
        plane = self.grid_size * self.grid_size

        # Encode zones: one binary plane per zone type
        zone_plane = {"residential": 0, "commercial": 1, "industrial": 2}
        for zone in self.city_data["zones"]:
            plane_index = zone_plane.get(zone["type"])
            if plane_index is None:
                continue
            idx = zone["x"] * self.grid_size + zone["y"]
            obs[idx + plane_index * plane] = 1.0

        # Encode stations in the fourth plane
        for station in self.city_data["stations"]:
            idx = station["x"] * self.grid_size + station["y"]
            obs[idx + 3 * plane] = 1.0

        # Encode time of day (one-hot)
        time_offset = plane * 4
        obs[time_offset + self.current_hour] = 1.0

        # Encode current demand levels: slot i is set when i < demand.
        # This samples demand again, so it can differ slightly from the value
        # used for the reward in the same step.
        demand_offset = time_offset + 24
        current_demand = self._get_current_demand()
        for i in range(10):
            if i < current_demand:
                obs[demand_offset + i] = 1.0

        return obs

    def _calculate_reward(self, route_frequencies, current_demand):
        """Combine demand match, coverage, rush bonus, cost and wait penalty.

        Args:
            route_frequencies: Array of per-route frequencies.
            current_demand: Scalar demand for the current hour.

        Returns:
            The reward as a Python float.
        """
        # Service efficiency (frequency vs demand matching)
        total_frequency = np.sum(route_frequencies)
        demand_match = 1.0 - abs(total_frequency - current_demand) / 10.0
        demand_reward = demand_match * 2.0

        # Coverage reward
        coverage = len(self.city_data["stations"]) * 0.02

        # Operating cost (quadratic penalty for high frequencies)
        cost = np.sum(route_frequencies ** 2) * 0.03

        # Rush hour bonus for adequate service
        hour = self.current_hour
        if (7 <= hour <= 9 or 17 <= hour <= 19) and total_frequency >= current_demand:
            rush_bonus = 1.0
        else:
            rush_bonus = 0.0

        # Passenger waiting time penalty (inverse of frequency); the 0.1
        # offset keeps the term finite for a frequency of zero.
        wait_penalty = np.sum(1.0 / (route_frequencies + 0.1)) * 0.1

        reward = demand_reward + coverage + rush_bonus - cost - wait_penalty
        return float(reward)

# Status message shown once the environment class in this cell has been defined.
print("Enhanced Transit Environment ready")

In [None]:
# Generate the pool of synthetic cities used for training.
print("Generating training cities...")
builder = CityBuilder(15)  # 15x15 grid
num_training_cities = 50
training_cities = []

for i in range(num_training_cities):
    city = builder.generate_random_city()
    training_cities.append(city)
    # Report progress after every 10th city, so the count shown is the number
    # of cities actually generated so far (10, 20, ...).
    if (i + 1) % 10 == 0:
        print(f"Generated {i + 1} cities")

print(f"Generated {len(training_cities)} training cities")
print(f"Sample city has {len(training_cities[0]['stations'])} stations")
print(f"Grid size: {training_cities[0]['grid_size']}x{training_cities[0]['grid_size']}")

In [None]:
# Inspect the demand model on a single training city.
print("Demand Pattern Analysis:")

# Environment restricted to the first generated city
test_env = TransitEnv(grid_size=15, training_cities=training_cities[:1])
obs, info = test_env.reset()

print(f"OD pairs generated: {len(test_env.od_pairs)}")
print(f"Zone demand patterns: {len(test_env.zone_demand)}")

# Sample the demand once for each hour of the day by overriding the env clock
hours = range(24)
hourly_demand = []
for hour in hours:
    test_env.current_hour = hour
    hourly_demand.append(test_env._get_current_demand())

rush_hours = [7, 8, 9, 17, 18, 19]
print(f"Peak demand: {max(hourly_demand):.2f}")
print(f"Low demand: {min(hourly_demand):.2f}")
print(f"Rush hours (7-9, 17-19): {[hourly_demand[h] for h in rush_hours]}")

# Plot demand curve
import matplotlib.pyplot as plt
fig, ax = plt.subplots(figsize=(10, 4))
ax.plot(hours, hourly_demand, marker='o')
ax.set_title("Passenger Demand by Hour")
ax.set_xlabel("Hour of Day")
ax.set_ylabel("Demand Level")
ax.grid(True, alpha=0.3)
plt.show()

In [None]:
# Build the training environment over the full pool of generated cities.
env = TransitEnv(grid_size=15, training_cities=training_cities)
for summary_line in (
    f"Environment created with {len(training_cities)} training cities",
    f"Action space: {env.action_space}",
    f"Observation space: {env.observation_space}",
):
    print(summary_line)

# Smoke test: reset once, then take a single random step.
obs, info = env.reset()
station_count = len(env.city_data['stations'])
print(f"Initial observation shape: {obs.shape}")
print(f"Current city has {station_count} stations")

action = env.action_space.sample()
step_result = env.step(action)
obs, reward, done, truncated, info = step_result
print(f"Test step reward: {reward}")

In [None]:
# Configure the PPO agent; hyperparameters are collected in one mapping so
# they can be inspected or tweaked in a single place.
ppo_settings = {
    "learning_rate": 3e-4,
    "n_steps": 2048,
    "batch_size": 64,
    "n_epochs": 10,
    "gamma": 0.99,
    "gae_lambda": 0.95,
    "clip_range": 0.2,
    "ent_coef": 0.01,
    "device": device,
    "verbose": 1,
    "tensorboard_log": "./logs/",
}
model = PPO("MlpPolicy", env, **ppo_settings)

print(f"PPO model created on {device}")
print(f"Policy network: {model.policy}")

In [None]:
# Training loop
print("Starting training...")
total_timesteps = 100000
save_freq = 10000  # NOTE(review): not referenced by any visible cell

# No evaluation environment is built here: `eval_env` and `eval_callback` are
# created later in this cell, on the same 15x15 cities the model is trained
# on. An 8x8 TransitEnv would have an observation size that does not match
# the model's observation space.

# Enhanced training with detailed logging
from stable_baselines3.common.callbacks import BaseCallback
import csv
import os
import json

class MetricsCallback(BaseCallback):
    """Record the return and length of every finished training episode.

    The values are read from the ``'episode'`` entry of each env's ``info``
    dict (keys ``'r'`` and ``'l'``); presumably this entry is supplied by a
    Monitor wrapper around the training env - confirm. Steps whose info has
    no such entry are ignored.
    """

    def __init__(self, verbose=0):
        super().__init__(verbose)
        self.episode_rewards = []
        self.episode_lengths = []
        # Kept for interface compatibility; nothing in this callback fills them.
        self.episode_demands = []
        self.episode_frequencies = []

    def _on_step(self) -> bool:
        """Collect stats for every sub-environment whose episode just ended.

        Returns:
            Always ``True`` so that training continues.
        """
        dones = self.locals['dones']
        infos = self.locals['infos']
        # Look at all parallel envs, not only the first one.
        for done, info in zip(dones, infos):
            if not done:
                continue
            episode = info.get('episode')
            if episode is None:
                continue
            # Coerce to builtin types so the lists can be dumped as JSON.
            self.episode_rewards.append(float(episode['r']))
            self.episode_lengths.append(int(episode['l']))
        return True

print("Starting enhanced training...")
total_timesteps = 100000

# Make sure every output directory exists before training writes to it.
for output_dir in ("./models/", "./logs/", "./animations/"):
    os.makedirs(output_dir, exist_ok=True)

# Callbacks: episode metrics collection plus periodic evaluation on the
# training cities, keeping the best model seen so far.
metrics_callback = MetricsCallback()
eval_env = TransitEnv(grid_size=15, training_cities=training_cities)
eval_settings = {
    "best_model_save_path": "./models/",
    "log_path": "./logs/",
    "eval_freq": 5000,
    "deterministic": True,
    "render": False,
}
eval_callback = EvalCallback(eval_env, **eval_settings)

# Run PPO training with both callbacks attached.
training_callbacks = [eval_callback, metrics_callback]
model.learn(
    total_timesteps=total_timesteps,
    callback=training_callbacks,
    progress_bar=True,
)

# Persist the collected per-episode metrics as JSON.
metrics_data = {
    'episode_rewards': metrics_callback.episode_rewards,
    'episode_lengths': metrics_callback.episode_lengths,
}
with open('./logs/training_metrics.json', 'w') as f:
    json.dump(metrics_data, f, indent=2)

episode_count = len(metrics_callback.episode_rewards)
print("Enhanced training completed!")
print(f"Total episodes: {episode_count}")
print("Metrics saved to ./logs/training_metrics.json")

In [None]:
# Persist the trained policy.
model.save("transit_ppo_final")
print("Model saved as 'transit_ppo_final'")

# Roll out one deterministic episode (at most 100 steps) and collect rewards.
obs, info = env.reset()
step_rewards = []

for step in range(100):
    action, _states = model.predict(obs, deterministic=True)
    obs, reward, done, truncated, info = env.step(action)
    step_rewards.append(reward)
    episode_finished = done or truncated
    if episode_finished:
        break

total_reward = sum(step_rewards)
steps_taken = len(step_rewards)
print(f"Test episode reward: {total_reward}")
print(f"Average reward per step: {total_reward / steps_taken}")

In [None]:
# Performance visualization
import matplotlib.pyplot as plt
import pandas as pd


def _episode_return(policy, environment, horizon=100):
    """Run one deterministic episode of at most `horizon` steps; return its total reward."""
    observation, _ = environment.reset()
    collected = 0
    for _ in range(horizon):
        chosen, _ = policy.predict(observation, deterministic=True)
        observation, step_reward, ended, cut_short, _ = environment.step(chosen)
        collected += step_reward
        if ended or cut_short:
            break
    return collected


# Evaluate on multiple episodes
num_eval_episodes = 20
rewards = []

for episode in range(num_eval_episodes):
    episode_reward = _episode_return(model, env)
    rewards.append(episode_reward)
    if episode % 5 == 0:
        print(f"Episode {episode}: {episode_reward:.2f}")

# Plot results: per-episode curve on the left, histogram on the right
fig, (curve_ax, hist_ax) = plt.subplots(1, 2, figsize=(10, 6))

curve_ax.plot(rewards)
curve_ax.set_title("Episode Rewards")
curve_ax.set_xlabel("Episode")
curve_ax.set_ylabel("Total Reward")

hist_ax.hist(rewards, bins=10)
hist_ax.set_title("Reward Distribution")
hist_ax.set_xlabel("Total Reward")
hist_ax.set_ylabel("Frequency")

fig.tight_layout()
plt.show()

mean_reward = np.mean(rewards)
spread = np.std(rewards)
print(f"Average reward: {mean_reward:.2f} ± {spread:.2f}")
print(f"Min reward: {np.min(rewards):.2f}")
print(f"Max reward: {np.max(rewards):.2f}")

In [None]:
# Summarize the trained policy network and one sample action.
print("Model Analysis:")
print(f"Policy network: {model.policy}")
parameter_count = sum(p.numel() for p in model.policy.parameters())
print(f"Total parameters: {parameter_count}")

# Deterministic action for a freshly reset environment
obs, info = env.reset()
action, _states = model.predict(obs, deterministic=True)

print(f"\nSample action (route frequencies): {action}")
print("Action statistics:")
summary_stats = (
    ("Mean", np.mean),
    ("Std", np.std),
    ("Min", np.min),
    ("Max", np.max),
)
for label, reducer in summary_stats:
    print(f"  {label}: {reducer(action):.3f}")

In [None]:
# Download model files (for Colab)
from google.colab import files

try:
    # Zip model files
    !zip -r trained_models.zip transit_ppo_final.zip models/ logs/
    
    # Download
    files.download('trained_models.zip')
    print("Model files downloaded!")
except:
    print("Not running on Colab, skipping download")

## Training Complete!

The RL agent has been trained on synthetic transit data. Key results:

- **Environment**: 15x15 grid cities with multiple transit modes
- **Algorithm**: PPO (Proximal Policy Optimization)
- **Training**: 100K timesteps with GPU acceleration
- **Objective**: Optimize route frequencies for efficiency

The trained model can now make decisions about transit route scheduling to maximize efficiency while minimizing costs.

## Result Analysis

The evaluation results show:
- Average reward: -293.88 ± 65.10
- Min reward: -391.61
- Max reward: -160.87

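Negative returns are expected here because every step subtracts operating-cost and waiting-time penalties from the reward; −293.88 over a 100-step episode is roughly −2.9 per step. The large standard deviation (65.10) indicates inconsistent performance across different cities/scenarios.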

### Potential Improvements

1. **Extended Training:** 100K timesteps may not be enough for a complex RL task. Try 500K+ timesteps.

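2. **Reward Function Tuning:** The current reward calculation might penalize certain actions too heavily. For example, the summed frequency of the 10 routes can reach 100 while demand is capped at 10, so the demand-match term can fall to about −18 per step and the quadratic cost term can reach 30, far outweighing the positive terms.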

3. **Hyperparameter Optimization:** Adjust learning rate, batch size, or network architecture.

4. **Curriculum Learning:** Start with simpler cities and gradually increase complexity.

In [None]:
# Extended Training and Parameter Tuning
print("Starting extended training...")

# Tuned hyperparameters for the second PPO model
improved_settings = {
    "learning_rate": 1e-4,   # Lower learning rate for stability
    "n_steps": 2048,
    "batch_size": 128,       # Larger batch size
    "n_epochs": 15,          # More epochs per update
    "gamma": 0.99,
    "gae_lambda": 0.95,
    "clip_range": 0.2,
    "ent_coef": 0.005,       # Lower entropy for exploitation
    "policy_kwargs": {"net_arch": [256, 256]},  # Wider hidden layers than the default
    "device": device,
    "verbose": 1,
    "tensorboard_log": "./logs/",
}
improved_model = PPO("MlpPolicy", env, **improved_settings)

# Uncomment to run extended training (takes longer)
# total_timesteps = 500000  # 5x more training

# Extended training (optional - takes longer); kept disabled inside a string
'''
improved_model.learn(
    total_timesteps=total_timesteps,
    callback=[eval_callback, metrics_callback],
    progress_bar=True
)

# Save improved model
improved_model.save("transit_ppo_improved")
print("Improved model saved as 'transit_ppo_improved'")

# Quick test of improved model
obs, info = env.reset()
improved_reward = 0
for step in range(100):
    action, _states = improved_model.predict(obs, deterministic=True)
    obs, reward, done, truncated, info = env.step(action)
    improved_reward += reward
    if done or truncated:
        break
print(f"Improved model reward: {improved_reward:.2f}")
'''

# Break one reward down into its components for a single policy action.
print("\nReward Function Component Analysis:")
obs, info = env.reset()
action, _ = model.predict(obs, deterministic=True)
current_demand = env._get_current_demand()
hour = env.current_hour
in_rush_hour = 7 <= hour <= 9 or 17 <= hour <= 19

# Same formulas as TransitEnv._calculate_reward, evaluated term by term
total_frequency = np.sum(action)
demand_match = 1.0 - abs(total_frequency - current_demand) / 10.0
demand_reward = demand_match * 2.0
coverage = len(env.city_data["stations"]) * 0.02
cost = np.sum(action ** 2) * 0.03
if in_rush_hour and total_frequency >= current_demand:
    rush_bonus = 1.0
else:
    rush_bonus = 0.0
wait_penalty = np.sum(1.0 / (action + 0.1)) * 0.1

# Penalties are listed with a negative sign so each row shows its contribution
component_report = (
    ("Current demand", current_demand),
    ("Total frequency", total_frequency),
    ("Demand match reward", demand_reward),
    ("Coverage reward", coverage),
    ("Operating cost penalty", -cost),
    ("Rush hour bonus", rush_bonus),
    ("Wait time penalty", -wait_penalty),
    ("Total reward", demand_reward + coverage + rush_bonus - cost - wait_penalty),
)
for label, value in component_report:
    print(f"{label}: {value:.2f}")

In [None]:
# Visualize reward components across different demand scenarios
print("Visualizing reward components...")

test_hours = [0, 8, 12, 18]  # Night, morning rush, day, evening rush
# Labels are kept in the same order as test_actions.
action_labels = ['Low', 'Medium', 'Maximum', 'Mixed']
test_actions = [
    np.array([1]*10),         # Low frequency across all routes
    np.array([5]*10),         # Medium frequency
    np.array([10]*10),        # Maximum frequency
    np.array([8,8,8,8,8,2,2,2,2,2])  # Mixed frequency
]

reward_components = []

for hour in test_hours:
    # Fresh episode, then force the clock to the hour under study
    env.reset()
    env.current_hour = hour
    current_demand = env._get_current_demand()

    # Pair each action with its label directly. Looking the label up with
    # test_actions.index(action) compares numpy arrays element-wise and
    # raises ValueError for every action after the first.
    for action_type, action in zip(action_labels, test_actions):
        # Calculate reward components (same formulas as the environment)
        total_frequency = np.sum(action)
        demand_match = 1.0 - abs(total_frequency - current_demand) / 10.0
        demand_reward = demand_match * 2.0
        coverage = len(env.city_data["stations"]) * 0.02
        cost = np.sum(action ** 2) * 0.03
        rush_bonus = 1.0 if (7 <= hour <= 9 or 17 <= hour <= 19) and total_frequency >= current_demand else 0.0
        wait_penalty = np.sum(1.0 / (action + 0.1)) * 0.1
        total = demand_reward + coverage + rush_bonus - cost - wait_penalty

        # Penalties are stored negated so the bars point downwards
        reward_components.append({
            'hour': hour,
            'action_type': action_type,
            'demand': current_demand,
            'frequency': total_frequency,
            'demand_match': demand_reward,
            'coverage': coverage,
            'cost': -cost,
            'rush_bonus': rush_bonus,
            'wait_penalty': -wait_penalty,
            'total': total
        })

# Convert to DataFrame for visualization
import pandas as pd
df = pd.DataFrame(reward_components)

# Plot reward breakdown by hour and action type
plt.figure(figsize=(15, 10))

# Time labels
time_labels = {0: "Night", 8: "Morning Rush", 12: "Day", 18: "Evening Rush"}

# One subplot per hour; within it one group of bars per action strategy
for i, hour in enumerate(test_hours):
    plt.subplot(2, 2, i+1)
    hour_data = df[df['hour'] == hour]

    # Five bars per group, centred on the group position
    bar_width = 0.15
    positions = np.arange(len(hour_data))

    plt.bar(positions - bar_width*2, hour_data['demand_match'], bar_width, label='Demand Match')
    plt.bar(positions - bar_width, hour_data['coverage'], bar_width, label='Coverage')
    plt.bar(positions, hour_data['rush_bonus'], bar_width, label='Rush Bonus')
    plt.bar(positions + bar_width, hour_data['cost'], bar_width, label='Cost Penalty')
    plt.bar(positions + bar_width*2, hour_data['wait_penalty'], bar_width, label='Wait Penalty')

    # Overlay the summed reward as a line
    plt.plot(positions, hour_data['total'], 'ko-', label='Total Reward')

    plt.title(f"Time: {time_labels[hour]} (Hour {hour}), Demand: {hour_data['demand'].iloc[0]:.2f}")
    plt.xticks(positions, hour_data['action_type'])
    plt.xlabel("Service Frequency Strategy")
    plt.ylabel("Reward Component Value")
    plt.grid(alpha=0.3)

    # A single legend, placed outside the first subplot
    if i == 0:
        plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')

plt.tight_layout()
plt.show()

print("Reward visualization complete!")
print("You can now adjust the reward function components to improve model performance.")