In [None]:
# ===============================
# AGENT INSTANTIATIONS
# ===============================

import numpy as np
import random
from collections import defaultdict

# For Actor-Critic
import torch
import torch.nn as nn
import torch.optim as optim

# ---- state discretizer for tabular agents ----
def discretize_state(state):
    # state: [capacity, onboard, station_idx, direction, sim_hour]
    cap, onboard, station_idx, direction, sim_hour = state
    cap_bin = int(cap // 100)    # bucket by 100s
    on_bin = int(onboard // 50)  # bucket by 50s
    dir_bin = 1 if direction >= 0 else 0
    hour_seg = int(sim_hour // 4)  # 0..5
    return (cap_bin, on_bin, int(station_idx), dir_bin, hour_seg)

# ---- Monte Carlo Agent ----
class MonteCarloAgent:
    def __init__(self, n_actions=3, eps=0.1):
        self.n_actions = n_actions
        self.eps = eps
        self.Q = defaultdict(float)
        self.returns = defaultdict(list)

    def policy(self, state, greedy=False):
        ds = discretize_state(state)
        if (not greedy) and (random.random() < self.eps):
            return random.randint(0, self.n_actions-1)
        qvals = [self.Q[(ds,a)] for a in range(self.n_actions)]
        return int(np.argmax(qvals))

    def update(self, episode):  # list of (state, action, reward)
        G = 0
        visited = set()
        for s,a,r in reversed(episode):
            G = r + G
            key = (tuple(discretize_state(s)), a)
            if key not in visited:
                visited.add(key)
                self.returns[key].append(G)
                self.Q[key] = np.mean(self.returns[key])

# ---- Q-Learning Agent ----
class QLearningAgent:
    def __init__(self, n_actions=3, alpha=0.1, gamma=0.99, eps=0.1):
        self.n_actions = n_actions
        self.alpha = alpha
        self.gamma = gamma
        self.eps = eps
        self.Q = defaultdict(float)

    def policy(self, state, greedy=False):
        ds = discretize_state(state)
        if (not greedy) and (random.random() < self.eps):
            return random.randint(0, self.n_actions-1)
        qvals = [self.Q[(ds,a)] for a in range(self.n_actions)]
        return int(np.argmax(qvals))

    def update(self, s, a, r, s_next):
        ds = discretize_state(s)
        ds_next = discretize_state(s_next)
        best_next = max([self.Q[(ds_next, a2)] for a2 in range(self.n_actions)])
        self.Q[(ds,a)] += self.alpha * (r + self.gamma * best_next - self.Q[(ds,a)])

# ---- Actor-Critic Agent ----
class ACNetwork(nn.Module):
    def __init__(self, state_dim=5, action_dim=3, hidden=128):
        super().__init__()
        self.fc1 = nn.Linear(state_dim, hidden)
        self.actor = nn.Linear(hidden, action_dim)
        self.critic = nn.Linear(hidden, 1)

    def forward(self, x):
        x = torch.relu(self.fc1(x))
        policy = torch.softmax(self.actor(x), dim=-1)
        value = self.critic(x)
        return policy, value

class ActorCriticAgent:
    def __init__(self, state_dim=5, action_dim=3, lr=1e-3, gamma=0.99):
        self.net = ACNetwork(state_dim, action_dim)
        self.optimizer = optim.Adam(self.net.parameters(), lr=lr)
        self.gamma = gamma

    def policy(self, state):
        st = torch.FloatTensor(state).unsqueeze(0)  # batchify
        probs, val = self.net(st)
        dist = torch.distributions.Categorical(probs)
        a = dist.sample()
        return a.item(), dist.log_prob(a), val

    def learn(self, trajectory):
        returns = []
        G = 0
        for _,_,r,_ in reversed(trajectory):
            G = r + self.gamma * G
            returns.insert(0, G)
        returns = torch.tensor(returns, dtype=torch.float32)
        log_probs = torch.stack([t[1] for t in trajectory])
        values = torch.cat([t[3] for t in trajectory]).squeeze()
        advantages = returns - values.detach()
        actor_loss = -(log_probs * advantages).mean()
        critic_loss = nn.MSELoss()(values, returns)
        loss = actor_loss + critic_loss
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

In [1]:
# ===============================
# AGENT TRAINING
# ===============================

import matplotlib.pyplot as plt
import numpy as np

def evaluate_policy_agent(agent, env_ctor, episodes=10):
    scores = []
    for _ in range(episodes):
        env = env_ctor()
        s = env.reset()
        done = False
        while not done:
            if isinstance(agent, ActorCriticAgent):
                a,_,_ = agent.policy(s)
            else:
                a = agent.policy(s, greedy=True)
            s, r, done, _ = env.step(a)
        norm, raw = env.final_score()
        scores.append(norm)
    return np.mean(scores), np.std(scores)

def train_mc(agent, env_ctor, episodes=300, max_steps_per_ep=200):
    eval_every = max(10, episodes//20)
    eval_scores = []
    for ep in range(episodes):
        env = env_ctor()
        state = env.reset()
        episode = []
        done = False
        steps = 0
        while not done and steps < max_steps_per_ep:
            a = agent.policy(state)
            next_state, reward, done, _ = env.step(a)
            episode.append((state, a, reward))
            state = next_state
            steps += 1
        agent.update(episode)
        if (ep+1) % eval_every == 0 or ep==episodes-1:
            mean_score, _ = evaluate_policy_agent(agent, env_ctor, episodes=6)
            eval_scores.append((ep+1, mean_score))
    return eval_scores

def train_q(agent, env_ctor, episodes=300, max_steps_per_ep=200):
    eval_every = max(10, episodes//20)
    eval_scores = []
    for ep in range(episodes):
        env = env_ctor()
        s = env.reset()
        done = False
        steps = 0
        while not done and steps < max_steps_per_ep:
            a = agent.policy(s)
            s_next, r, done, _ = env.step(a)
            agent.update(s, a, r, s_next)
            s = s_next
            steps += 1
        if (ep+1) % eval_every == 0 or ep==episodes-1:
            mean_score, _ = evaluate_policy_agent(agent, env_ctor, episodes=6)
            eval_scores.append((ep+1, mean_score))
    return eval_scores

def train_ac(agent, env_ctor, episodes=300, max_steps_per_ep=200):
    eval_every = max(10, episodes//20)
    eval_scores = []
    for ep in range(episodes):
        env = env_ctor()
        s = env.reset()
        done = False
        steps = 0
        trajectory = []
        while not done and steps < max_steps_per_ep:
            a, logprob, val = agent.policy(s)
            s_next, r, done, _ = env.step(a)
            trajectory.append((s, logprob, r, val))
            s = s_next
            steps += 1
        if len(trajectory) > 0:
            agent.learn(trajectory)
        if (ep+1) % eval_every == 0 or ep==episodes-1:
            mean_score, _ = evaluate_policy_agent(agent, env_ctor, episodes=6)
            eval_scores.append((ep+1, mean_score))
    return eval_scores


In [3]:
# ===============================
# AGENT TEST RUN (cleaned + fixed)
# ===============================

import matplotlib.pyplot as plt
from functools import partial
import time

# Environment constructor for fresh envs
def env_ctor():
    # Always create new env, difficulty scaling is inside TrainGameEnv
    return TrainGameEnv(seed=None, verbose=False)

# Set seeds for reproducibility
random.seed(42)
np.random.seed(42)
torch.manual_seed(42)

# Instantiate agents
mc_agent = MonteCarloAgent(n_actions=3, eps=0.1)
q_agent = QLearningAgent(n_actions=3, alpha=0.1, gamma=0.99, eps=0.1)
ac_agent = ActorCriticAgent(state_dim=5, action_dim=3, lr=1e-3, gamma=0.99)

# Training parameters
EPISODES = 1000

print("Training Monte Carlo...")
mc_scores = train_mc(mc_agent, env_ctor, episodes=EPISODES)
print("Training Q-Learning...")
q_scores = train_q(q_agent, env_ctor, episodes=EPISODES)
print("Training Actor-Critic...")
ac_scores = train_ac(ac_agent, env_ctor, episodes=EPISODES)

# Convert training logs to X/Y for plotting
def to_xy(score_list):
    xs = [x for x, _ in score_list]
    ys = [y for _, y in score_list]
    return xs, ys

mc_x, mc_y = to_xy(mc_scores)
q_x, q_y = to_xy(q_scores)
ac_x, ac_y = to_xy(ac_scores)

# Plot learning curves
plt.figure(figsize=(10,5))
plt.plot(mc_x, mc_y, label="Monte Carlo")
plt.plot(q_x, q_y, label="Q-Learning")
plt.plot(ac_x, ac_y, label="Actor-Critic")
plt.xlabel("Episodes")
plt.ylabel("Eval average normalized score (1–100)")
plt.title("Learning curves (evaluated periodically)")
plt.legend()
plt.grid(True)
plt.show()

# Final evaluation playthroughs
def rollout_and_print(agent, title, max_steps=30):
    env = env_ctor()
    state = env.reset()
    print("\n" + "="*40)
    print(f"Playthrough — {title}")
    steps = 0

    while (not env.done) and steps < max_steps:
        if isinstance(agent, ActorCriticAgent):
            action, _, _ = agent.policy(state)
        else:
            action = agent.policy(state, greedy=True)

        state, reward, done, info = env.step(action)

        # ASCII track
        line = ["·"] * env.num_stations
        if 0 <= env.station_idx < len(line):
            line[env.station_idx] = "🚂"
        print("".join(line), f"| Step {steps+1} | Action {action} | Reward {reward:.1f} | raw_score {env.raw_score:.1f}")

        steps += 1
        time.sleep(0.04)

    norm, raw = env.final_score()  # no steps arg anymore
    print(f"Result {title}: normalized={norm}/100 raw={raw:.2f} reason={env.done_reason}")

# Show each trained agent
rollout_and_print(mc_agent, "Monte Carlo (trained)")
rollout_and_print(q_agent, "Q-Learning (trained)")
rollout_and_print(ac_agent, "Actor-Critic (trained)")


NameError: name 'random' is not defined