# Import Libraries

In [33]:
import numpy as np
import gymnasium as gym
from Map import MapEnv 
from collections import defaultdict
from tqdm import tqdm
from matplotlib import pyplot as plt
from gymnasium.utils.env_checker import check_env
from tqdm import tqdm

# Training and Visualization

In [None]:
class DriverAgent:
    def __init__(
        self,
        env: gym.Env,
        learning_rate: float,
        initial_epsilon: float,
        epsilon_decay: float,
        final_epsilon: float,
        discount_factor: float,
    ):
        self.env = env
        # Q-table
        self.q_values = defaultdict(lambda: np.zeros(env.action_space.n))

        self.lr = learning_rate
        self.discount_factor = discount_factor
        self.epsilon = initial_epsilon
        self.epsilon_decay = epsilon_decay
        self.final_epsilon = final_epsilon

        self.training_error = []

    def _obs_to_tuple(self, obs):
        d = obs["driver"]
        dest = obs["destination"]
        return (int(d[0]), int(d[1]), int(dest[0]), int(dest[1]))

    def choose_action(self, obs: tuple[int, int, bool]) -> int: 
        obs_tuple = self._obs_to_tuple(obs)
        # Exploration
        if np.random.random() < self.epsilon:
            return self.env.action_space.sample()
        # Exploitation
        else :
            return int(np.argmax(self.q_values[obs_tuple]))
        
    def update(
        self,
        obs: tuple[int, int, bool],
        action: int,
        reward: float,
        terminated: bool,
        next_obs: tuple[int, int, bool],
    ):
        obs_tuple = self._obs_to_tuple(obs)
        next_obs_tuple = self._obs_to_tuple(next_obs)
        
        # Decide next best action
        future_q_value = (not terminated) * np.max(self.q_values[next_obs_tuple])

        # Bellman Equation to update Q-value
        destination = reward + self.discount_factor * future_q_value

        # How wrong was our current estimate?
        temporal_difference = destination - self.q_values[obs_tuple][action]

        # Update q-value in direction of error
        self.q_values[obs_tuple][action] = (
            self.q_values[obs_tuple][action] + self.lr * temporal_difference
        )

        # Track learning progress
        self.training_error.append(temporal_difference)

    def decay_epsilon(self):
        self.epsilon = max(self.final_epsilon, self.epsilon - self.epsilon_decay)

    # def test_agent(self, env, n_episodes):
    #     total_rewards = []
    #     successes = []
    #     episode_lengths = []

    #     # Disable exploration for testing
    #     old_epsilon = self.epsilon
    #     self.epsilon = 0.0

    #     for _ in range(n_episodes):
    #         obs, info = env.reset()
    #         episode_reward = 0
    #         steps = 0
    #         done = False

    #         while not done:
    #             action = self.choose_action(obs)
    #             obs, reward, terminated, truncated, info = env.step(action)
    #             episode_reward += reward
    #             steps += 1
    #             done = terminated or truncated

    #         total_rewards.append(episode_reward)
    #         successes.append(terminated) 
    #         episode_lengths.append(steps)

    #     # Restore original epsilon
    #     self.epsilon = old_epsilon

    #     success_rate = np.mean(successes)
    #     average_reward = np.mean(total_rewards)
    #     average_length = np.mean(episode_lengths)

    #     tqdm.write(f"Test Results over {n_episodes} episodes:")
    #     tqdm.write(f"Success Rate: {success_rate:.1%}")
    #     tqdm.write(f"Average Reward: {average_reward:.3f}")
    #     tqdm.write(f"Average Episode Length: {average_length:.1f}")
    #     tqdm.write(f"Std Episode Length: {np.std(episode_lengths):.1f}")


    def train_agent(agent, env, n_episodes: int, log_interval: int=5000):
        episode_rewards = []
        episode_lengths = []
        epsilon_history = []
        success_rate_history = []

        print(f"\nTraining for {n_episodes} episodes...\n")

        for episode in tqdm(range(n_episodes), desc="Training"):
            obs, info = env.reset()
            done = False
            episode_reward = 0
            steps = 0

            while not done:
                # Choose action using epsilon-greedy policy
                action = agent.choose_action(obs)
                
                # Take action in environment
                next_obs, reward, terminated, truncated, info = env.step(action)

                # Q-learning update (uses max over next state, not next action)
                agent.update(obs, action, reward, terminated, next_obs)

                episode_reward += reward
                steps += 1
                done = terminated or truncated
                obs = next_obs

            agent.decay_epsilon()
            episode_rewards.append(episode_reward)
            episode_lengths.append(steps)
            epsilon_history.append(agent.epsilon)

            if episode >= 99:
                recent_successes = [1 if r > 0 else 0 for r in episode_rewards[-100:]]
                success_rate_history.append(np.mean(recent_successes))

            if (episode + 1) % log_interval == 0:
                recent_rewards = episode_rewards[-1000:]
                tqdm.write(f"\nEpisode {episode + 1}/{n_episodes}")
                tqdm.write(f"  Avg Reward: {np.mean(recent_rewards):.2f}")
                tqdm.write(f"  Success Rate: {np.mean(recent_successes):.1%}")
                tqdm.write(f"  Avg Steps: {np.mean(episode_lengths[-1000:]):.1f}")
                tqdm.write(f"  Epsilon: {agent.epsilon:.4f}")
                tqdm.write(f"  States: {len(agent.q_values)}")

        # Success rate over all episodes
        overall_success_rate = np.mean([1 if r > 0 else 0 for r in episode_rewards])

        print(f"\nTraining complete. States explored: {len(agent.q_values)}")
        print(f"Final epsilon: {agent.epsilon:.4f}")
        print(f"Overall Success Rate: {overall_success_rate:.1%}")

        return {
            "episode_rewards": episode_rewards,
            "episode_lengths": episode_lengths,
            "epsilon_history": epsilon_history,
            "success_rate_history": success_rate_history,
            "overall_success_rate": overall_success_rate,
        }   


    def test_agent(agent, env, n_episodes: int = 100, success_threshold: float = 0):
        total_rewards = []
        successes = []
        episode_lengths = []

        old_epsilon = agent.epsilon
        agent.epsilon = 0.0  # no exploration

        for _ in tqdm(range(n_episodes), desc="Testing"):
            obs, info = env.reset()
            episode_reward = 0
            steps = 0
            done = False

            while not done:
                action = agent.choose_action(obs)
                obs, reward, terminated, truncated, info = env.step(action)
                episode_reward += reward
                steps += 1
                done = terminated or truncated

            total_rewards.append(episode_reward)
            successes.append(1 if episode_reward > success_threshold else 0)
            episode_lengths.append(steps)

        # restore epsilon
        agent.epsilon = old_epsilon

        results = {
            "success_rate": np.mean(successes),
            "avg_reward": np.mean(total_rewards),
            "std_reward": np.std(total_rewards),
            "avg_length": np.mean(episode_lengths),
            "std_length": np.std(episode_lengths),
            "rewards": total_rewards,
            "lengths": episode_lengths,
        }

        # Optional summary print
        print(f"\n{'='*60}")
        print(f"TEST RESULTS ({n_episodes} episodes)")
        print(f"{'='*60}")
        print(f"Success Rate: {results['success_rate']:.1%}")
        print(f"Average Reward: {results['avg_reward']:.3f} ± {results['std_reward']:.3f}")
        print(f"Average Episode Len: {results['avg_length']:.1f} ± {results['std_length']:.1f}")
        print(f"{'='*60}")

        return results

    def visualize_testing_progress(agent, env_class, n_episodes=10):
        render_env = env_class(render_mode="human")
        old_epsilon = agent.epsilon
        agent.epsilon = 0.0  # pure exploitation mode

        for episode in range(n_episodes):
            obs, info = render_env.reset()
            episode_reward = 0
            steps = 0
            done = False

            print(f"\n=== Episode {episode + 1} ===")
            print(f"Driver at: {obs['driver']}, Destination at: {obs['destination']}")

            while not done:
                action = agent.choose_action(obs)
                obs, reward, terminated, truncated, info = render_env.step(action)
                episode_reward += reward
                steps += 1
                done = terminated or truncated

            print(f"Episode finished in {steps} steps")
            print(f"Total reward: {episode_reward:.2f}")
            print(f"Success: {'Yes' if terminated else 'No'}")

        agent.epsilon = old_epsilon
        render_env.close()


In [35]:
def get_moving_avgs(arr, window, convolution_mode):
        return np.convolve(
            np.array(arr).flatten(),
            np.ones(window),
            mode=convolution_mode
        ) / window

In [36]:
def visualize_training(results, agent, window=100, algo_name='Q-learning', save_path=None):

    episode_rewards = results["episode_rewards"]
    episode_lengths = results["episode_lengths"]
    success_rate_history = results["success_rate_history"]
    # overall_success_rate = results.get("overall_success_rate", None)

    fig, axes = plt.subplots(3, 3, figsize=(18, 14))
    fig.suptitle(f'{algo_name} Training Metrics', fontsize=16, fontweight='bold')

    # Helper for moving average
    def get_moving_avgs(data, window=100, mode='valid'):
        return np.convolve(data, np.ones(window)/window, mode=mode)

    # 1. Episode Rewards
    ax = axes[0, 0]
    rewards_ma = get_moving_avgs(episode_rewards, window, 'valid')
    ax.plot(episode_rewards, alpha=0.2, color='blue', label='Raw')
    ax.plot(range(window-1, len(episode_rewards)), rewards_ma,
            color='blue', linewidth=2, label=f'{window}-ep MA')
    ax.set_xlabel('Episode')
    ax.set_ylabel('Total Reward')
    ax.set_title('Episode Rewards')
    ax.legend()
    ax.grid(True, alpha=0.3)

    # 2. Episode Lengths
    ax = axes[0, 1]
    lengths_ma = get_moving_avgs(episode_lengths, window, 'valid')
    ax.plot(episode_lengths, alpha=0.2, color='green', label='Raw')
    ax.plot(range(window-1, len(episode_lengths)), lengths_ma, color='green', linewidth=2, label=f'{window}-ep MA')
    ax.set_xlabel('Episode')
    ax.set_ylabel('Steps')
    ax.set_title('Episode Lengths')
    ax.legend()
    ax.grid(True, alpha=0.3)

    # 3. Success Rate
    ax = axes[0, 2]
    ax.plot(range(window-1, window-1 + len(success_rate_history)),
            np.array(success_rate_history)*100,
            color='orange', linewidth=2.5, label=f'Rolling Success Rate ({window}ep)')
    ax.axhline(y=100, color='gray', linestyle='--', alpha=0.5, linewidth=1, label='Perfect (100%)')
    ax.axhline(y=71, color='red', linestyle=':', alpha=0.7, linewidth=1.5, label='Test Result (71%)')
    ax.set_xlabel('Episode')
    ax.set_ylabel('Success Rate (%)')
    ax.set_title(f'Success Rate During Training ({window}-ep window)')
    ax.set_ylim(0, 105)
    ax.legend()
    ax.grid(True, alpha=0.3)

    if len(success_rate_history) > 0:
        final_success = success_rate_history[-1]
        ax.text(0.98, 0.02, f'Final: {final_success:.1f}%',
                transform=ax.transAxes, ha='right', va='bottom',
                fontsize=10, fontweight='bold',
                bbox=dict(boxstyle='round', facecolor='wheat', alpha=0.8))

    # 4. Epsilon Decay
    ax = axes[1, 0]
    epsilon_history = results.get("epsilon_history", [])

    if len(epsilon_history) > 0:
        ax.plot(epsilon_history, color='orange', linewidth=2)
        ax.set_xlabel('Episode')
        ax.set_ylabel('Epsilon (ε)')
        ax.set_title('Exploration Rate Decay')
        ax.grid(True, alpha=0.3)
        ax.set_ylim(-0.05, 1.05)
    else:
        ax.text(0.5, 0.5, "No epsilon history", ha='center', va='center',
                fontsize=10, color='gray')
        ax.axis('off')

    # 5. TD Errors (Per Episode)
    ax = axes[1, 1]
    td_errors = getattr(agent, "training_error", [])
    if len(td_errors) > 0:
        if len(td_errors) > window:
            td_ma = np.convolve(td_errors, np.ones(window)/window, mode='valid')
            ax.plot(td_errors, alpha=0.2, color='red', label='Raw')
            ax.plot(range(window-1, len(td_errors)), td_ma,
                    color='red', linewidth=2, label=f'{window}-ep MA')
        else:
            ax.plot(td_errors, color='red')
        ax.set_xlabel('Episode')
        ax.set_ylabel('Mean |TD Error|')
        ax.set_title('Learning Progress (TD Errors)')
        ax.legend()
        ax.grid(True, alpha=0.3)
    else:
        ax.text(0.5, 0.5, "No TD error data", ha='center', va='center', fontsize=10, color='gray')
        ax.axis('off')

    # 6. Reward Distribution
    ax = axes[1, 2]
    recent_rewards = episode_rewards[-1000:]
    ax.hist(recent_rewards, bins=50, color='teal', alpha=0.7, edgecolor='black')
    mean_reward = np.mean(recent_rewards)
    ax.axvline(mean_reward, color='red', linestyle='--',
               linewidth=2.5, label=f'Mean: {mean_reward:.1f}')
    ax.axvline(0, color='gray', linestyle=':', linewidth=1.5, alpha=0.7, label='Zero')
    ax.set_xlabel('Total Reward')
    ax.set_ylabel('Frequency')
    ax.set_title('Reward Distribution (Last 1000 eps)')
    ax.legend()
    ax.grid(True, alpha=0.3, axis='y')

    # 7. Cumulative Reward

    ax = axes[2, 1]
    cumulative_rewards = np.cumsum(episode_rewards)
    ax.plot(cumulative_rewards, color='purple', linewidth=2)
    ax.set_xlabel('Episode')
    ax.set_ylabel('Cumulative Reward')
    ax.set_title('Cumulative Reward Over Episodes')
    ax.grid(True, alpha=0.3)

    plt.tight_layout(rect=[0, 0, 1, 0.96])

    if save_path:
        plt.savefig(save_path, dpi=300, bbox_inches='tight')
        print(f"Plot saved to {save_path}")
    else:
        plt.savefig(f'{algo_name.lower()}_training_metrics.png', dpi=300, bbox_inches='tight')
        print(f"Saved: {algo_name.lower()}_training_metrics.png")


    plt.show()

#  Model Training

In [37]:
# if __name__ == "__main__":
#     n_episodes = 20000
#     env = MapEnv(render_mode=None)

#     agent = DriverAgent(
#         env=env,
#         learning_rate=0.1,
#         initial_epsilon=1.0,
#         epsilon_decay= 1e-5,
#         final_epsilon=0.01,
#         discount_factor=0.95,
#     )

#     # TRAINING AGENT
#     for episode in range(n_episodes):
#         obs, info = env.reset()
#         done = False
#         while not done:
#             # 1. Choose action
#             action = agent.choose_action(obs)

#             # 2. Take action
#             next_obs, reward, terminated, truncated, info = env.step(action)

#             # 3. Learn from experience & update Q-values
#             agent.update(obs, action, reward, terminated, next_obs)

#             # 4. Move to next state
#             done = terminated or truncated
#             obs = next_obs
#         # Take less random actions over time
#         agent.decay_epsilon()

#     # Run evaluation
#     agent.test_agent(env, 100)

# Testing and Visualization

In [38]:
# Visualize testing progress
agent.visualize_testing_progess(10)


=== Episode 1 ===
Driver at: [2 4], Destination at: [5 1]
Episode finished in 6 steps
Total reward: 99.87
Success: Yes

=== Episode 2 ===
Driver at: [3 5], Destination at: [1 2]
Episode finished in 5 steps
Total reward: 99.92
Success: Yes

=== Episode 3 ===
Driver at: [4 2], Destination at: [7 3]
Episode finished in 4 steps
Total reward: 99.95
Success: Yes

=== Episode 4 ===
Driver at: [0 7], Destination at: [4 0]
Episode finished in 11 steps
Total reward: 99.59
Success: Yes

=== Episode 5 ===
Driver at: [6 6], Destination at: [6 7]
Episode finished in 1 steps
Total reward: 100.00
Success: Yes

=== Episode 6 ===
Driver at: [4 0], Destination at: [4 1]
Episode finished in 1 steps
Total reward: 100.00
Success: Yes

=== Episode 7 ===
Driver at: [5 7], Destination at: [5 1]
Episode finished in 6 steps
Total reward: 99.85
Success: Yes

=== Episode 8 ===
Driver at: [4 6], Destination at: [1 5]
Episode finished in 4 steps
Total reward: 99.95
Success: Yes

=== Episode 9 ===
Driver at: [1 7], 

In [39]:
n_episodes = 20000
env = MapEnv(render_mode=None)

agent = DriverAgent(
    env=env,
    learning_rate=0.1,
    initial_epsilon=1.0,
    epsilon_decay=1e-5,
    final_epsilon=0.01,
    discount_factor=0.95,
)

print(f"Environment: {env.size}×{env.size} grid, {len(env._obstacles)} obstacles")
print(f"Agent: α=0.1, γ=0.95, ε: 1.0→0.01")

Environment: 8×8 grid, 8 obstacles
Agent: α=0.1, γ=0.95, ε: 1.0→0.01


In [41]:
print("\nTraining...\n")

train_results = train_agent(agent, env, n_episodes)
print(f"Training success rate: {train_results['overall_success_rate']:.1%}")

visualize_training(train_results, agent, n_episodes, window=100, algo_name='Q-Learning', save_path='q_learning_training_metrics.png')

print("\nTesting... \n")
test_results = test_agent(agent, env, 10)

visualize_testing_progress(agent, MapEnv, n_episodes=10)





Training...


Training for 20000 episodes...



Training:   0%|          | 0/20000 [00:00<?, ?it/s]


TypeError: DriverAgent.update() takes 6 positional arguments but 7 were given