# Q-Learning Quick Start

This notebook demonstrates how to:
1. Load and explore environments
2. Apply reward shaping
3. Train a Q-Learning agent
4. Visualize results
5. Evaluate the trained agent
6. Save and reload the learned Q-table

In [None]:
# Imports
# Standard library first; third-party and project-local imports follow the
# sys.path adjustment below because the local packages depend on it.
import sys
from pathlib import Path

# Add project root to path
# NOTE(review): this takes the parent of the current working directory as the
# project root, so it assumes the kernel was started one level below the root
# (e.g. in a notebooks/ folder) — confirm. `project_root` is reused later in
# the notebook to build the model save path.
project_root = Path.cwd().parent
sys.path.append(str(project_root))

import numpy as np
import matplotlib.pyplot as plt
# Project-local modules; presumably resolved through the sys.path entry added above.
from env import load_environment, CustomRewardWrapper
from agents.q_learning.q_table import QTable
from agents.q_learning.utils import epsilon_greedy, decay_epsilon, evaluate_policy

## 1. Explore Environment

In [None]:
# Build the seeded CartPole-v1 environment used by the exploration cells
env = load_environment("CartPole-v1", seed=42)

# Report the observation space, the action space and the action count,
# one line each
print(
    f"Observation space: {env.observation_space}",
    f"Action space: {env.action_space}",
    f"Number of actions: {env.action_space.n}",
    sep="\n",
)

In [None]:
# Smoke-test the environment: reset it, then take up to ten sampled actions
state, info = env.reset()
print(f"Initial state: {state}")

step_rewards = []
for step in range(10):
    # Sample an action from the action space and apply it
    action = env.action_space.sample()
    state, reward, terminated, truncated, info = env.step(action)
    step_rewards.append(reward)
    print(f"Step {step+1}: Action={action}, Reward={reward:.2f}, State={state}")

    # Stop early once the episode has ended for either reason
    episode_over = terminated or truncated
    if episode_over:
        break

# Summed in collection order, starting from integer 0
total_reward = sum(step_rewards)
print(f"\nTotal reward: {total_reward}")

## 2. Apply Reward Shaping

In [None]:
# Two identically seeded CartPole-v1 environments: one left as loaded, one
# wrapped with the "bonus_center" reward type
env_default = load_environment("CartPole-v1", seed=42)
shaped_base_env = load_environment("CartPole-v1", seed=42)
env_modified = CustomRewardWrapper(
    shaped_base_env,
    reward_type="bonus_center",
    env_name="CartPole-v1",
)

# Reset both environments, keeping the initial observation of each
state_default, _ = env_default.reset()
state_modified, _ = env_modified.reset()

# Apply the same action (1) to each and keep only the reward, which is the
# second element of the step result
reward_default = env_default.step(1)[1]
reward_modified = env_modified.step(1)[1]

print(
    f"Default reward: {reward_default}",
    f"Modified reward (bonus_center): {reward_modified}",
    sep="\n",
)

## 3. Train Q-Learning Agent

In [None]:
# Setup: CartPole-v1 wrapped with the "bonus_center" reward type
base_env = load_environment("CartPole-v1", seed=42)

# Read the number of discrete actions from the environment instead of
# hard-coding it, so the Q-table and action selection always agree with it
num_actions = int(base_env.action_space.n)

env = CustomRewardWrapper(base_env, reward_type="bonus_center", env_name="CartPole-v1")

# Initialize Q-Table
# One (low, high) pair and one bin count per observation component
# (presumably used by QTable to discretize the continuous state — confirm).
state_bounds = [(-2.4, 2.4), (-3.0, 3.0), (-0.5, 0.5), (-2.0, 2.0)]
q_table = QTable(num_actions=num_actions, num_bins=[10, 10, 10, 10], state_bounds=state_bounds)

# Hyperparameters
alpha = 0.1  # Learning rate
gamma = 0.99  # Discount factor
epsilon = 1.0  # Initial exploration rate
epsilon_end = 0.01
epsilon_decay = 0.995
num_episodes = 200

# Training history, one entry per episode (used by the plotting cell)
episode_rewards = []
episode_steps = []

print("Training Q-Learning agent...")
for episode in range(num_episodes):
    state, _ = env.reset()
    episode_reward = 0
    steps = 0
    done = False

    while not done:
        # Select action
        action = epsilon_greedy(q_table, state, epsilon, num_actions)

        # Take step; the episode ends on either termination or truncation
        next_state, reward, terminated, truncated, _ = env.step(action)
        done = terminated or truncated

        # Q-Learning update
        current_q = q_table.get_q_value(state, action)
        if terminated:
            # Genuine terminal state: there is no future return to bootstrap from
            target_q = reward
        else:
            # Non-terminal transition. This deliberately includes truncation
            # (time-limit cut-off): the next state is not terminal, so its
            # value must still be bootstrapped.
            max_next_q = q_table.get_max_q_value(next_state)
            target_q = reward + gamma * max_next_q

        new_q = current_q + alpha * (target_q - current_q)
        q_table.update(state, action, new_q)

        state = next_state
        episode_reward += reward
        steps += 1

    # Decay epsilon once per episode
    epsilon = decay_epsilon(epsilon, epsilon_end, epsilon_decay)

    episode_rewards.append(episode_reward)
    episode_steps.append(steps)

    # Progress report every 50 episodes, averaged over the latest 50
    if (episode + 1) % 50 == 0:
        avg_reward = np.mean(episode_rewards[-50:])
        print(f"Episode {episode+1}/{num_episodes} | Avg Reward: {avg_reward:.2f} | Epsilon: {epsilon:.4f}")

print("Training completed!")

## 4. Visualize Results

In [None]:
# Plot learning curve
def _plot_with_moving_average(ax, values, raw_style, avg_color, ylabel, title, window=20):
    """Draw a per-episode series and its trailing moving average on `ax`.

    Args:
        ax: Matplotlib axes to draw on.
        values: Sequence with one number per episode.
        raw_style: Extra keyword arguments for the raw-series line (label, color, ...).
        avg_color: Line color of the moving-average curve.
        ylabel: Y-axis label.
        title: Axes title.
        window: Moving-average window length in episodes; clamped to
            len(values) so short runs still produce a meaningful curve.
    """
    ax.plot(values, alpha=0.3, **raw_style)

    # np.convolve(mode='valid') swaps its operands when the kernel is longer
    # than the data, so never use a window larger than the series itself.
    window = min(window, len(values))
    if window > 0:
        moving_avg = np.convolve(values, np.ones(window) / window, mode='valid')
        # Entry i averages values[i : i + window]; plot it at the LAST episode
        # of its window so the smoothed curve lines up with the raw data
        # instead of being shifted window-1 episodes to the left.
        avg_x = np.arange(window - 1, len(values))
        ax.plot(avg_x, moving_avg, color=avg_color, linewidth=2, label=f'Moving Average ({window})')

    ax.set_xlabel('Episode')
    ax.set_ylabel(ylabel)
    ax.set_title(title)
    ax.legend()
    ax.grid(True, alpha=0.3)


fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 5))

# Rewards (raw line keeps the default color cycle)
_plot_with_moving_average(
    ax1,
    episode_rewards,
    raw_style={'label': 'Episode Reward'},
    avg_color='red',
    ylabel='Reward',
    title='Q-Learning: Episode Rewards',
)

# Steps
_plot_with_moving_average(
    ax2,
    episode_steps,
    raw_style={'color': 'green', 'label': 'Episode Steps'},
    avg_color='darkgreen',
    ylabel='Steps',
    title='Q-Learning: Episode Steps',
)

plt.tight_layout()
plt.show()

print("\nFinal Statistics:")
print(f"  Mean reward (last 50 episodes): {np.mean(episode_rewards[-50:]):.2f}")
print(f"  Max reward: {np.max(episode_rewards):.2f}")
print(f"  Q-Table size: {q_table.get_stats()['num_states']} states")

## 5. Evaluate Trained Agent

In [None]:
# Run evaluate_policy on the trained Q-table: 10 episodes, rendering off, seed 100
avg_reward, eval_rewards = evaluate_policy(
    env,
    q_table,
    num_episodes=10,
    render=False,
    seed=100,
)

# Derive the summary values first, then emit the whole report in one call
reward_std = np.std(eval_rewards)
formatted_rewards = [f'{r:.2f}' for r in eval_rewards]

report_lines = [
    f"\nEvaluation Results:",
    f"  Average reward: {avg_reward:.2f}",
    f"  Std deviation: {reward_std:.2f}",
    f"  Episode rewards: {formatted_rewards}",
]
print("\n".join(report_lines))

## 6. Save Model

In [None]:
# Save Q-Table under <project_root>/results/q_learning/models/, creating the
# directory tree if it does not exist yet
save_path = project_root / "results" / "q_learning" / "models" / "notebook_demo_model.pkl"
save_path.parent.mkdir(parents=True, exist_ok=True)
q_table.save(save_path)

print(f"Model saved to: {save_path}")

# Load and verify
# NOTE(review): the .pkl suffix suggests pickle serialization — only load
# files that this notebook (or another trusted source) produced.
loaded_q_table = QTable(num_actions=2, num_bins=[10, 10, 10, 10], state_bounds=state_bounds)
loaded_q_table.load(save_path)

# Check the round trip instead of reporting success unconditionally:
# the reloaded table must report the same number of states as the one saved.
saved_num_states = q_table.get_stats()['num_states']
loaded_num_states = loaded_q_table.get_stats()['num_states']
assert loaded_num_states == saved_num_states, (
    f"Reloaded Q-Table reports {loaded_num_states} states, expected {saved_num_states}"
)

print("Model loaded successfully!")
print(f"Loaded Q-Table stats: {loaded_q_table.get_stats()}")

## Next Steps

1. Try different environments (MountainCar, LunarLander)
2. Experiment with different reward types
3. Tune hyperparameters (alpha, gamma, epsilon_decay, bins)
4. Compare default vs modified rewards
5. Train for more episodes
6. Use the training script for full experiments: `python agents/q_learning/train.py --config experiments/configs/q_learning_cartpole_default.json`