# CartPole DQN Exploration Notebook

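This notebook is an interactive workspace for exploring the CartPole DQN agent and its training experiments. Sections 3, 4 and 7 read runs from an MLflow tracking server at `http://localhost:5000`, and sections 5 and 6 load a trained checkpoint from `../models/best_model.pth`; the dependent cells skip their work when either resource is unavailable.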

In [None]:
import sys
sys.path.append('../src')

import gymnasium as gym
import numpy as np
import matplotlib.pyplot as plt
import torch
import mlflow
import pandas as pd
from IPython.display import HTML

from dqn_agent import DQNAgent
from mlflow_utils import MLflowTracker

# Set up plotting
# NOTE(review): 'seaborn-v0_8' is a Matplotlib style-sheet name, not a seaborn
# call; it presumably requires Matplotlib >= 3.6 — confirm against the pinned version.
plt.style.use('seaborn-v0_8')
# IPython magic (not plain Python): render figures inline in the cell output.
%matplotlib inline

## 1. Environment Exploration

In [None]:
# Create the CartPole environment and print its basic properties.
env = gym.make('CartPole-v1')

# try/finally so the environment is released even if one of the probes fails.
try:
    print(f"Observation space: {env.observation_space}")
    print(f"Action space: {env.action_space}")
    # Read the step limit from the public registration spec instead of the
    # private `_max_episode_steps` attribute of the outer wrapper.
    print(f"Max episode steps: {env.spec.max_episode_steps}")

    # Sample some observations: each reset draws a fresh initial state.
    print("\nSample observations:")
    for i in range(5):
        obs, _ = env.reset()
        print(f"Reset {i+1}: {obs}")
finally:
    env.close()

## 2. Random Agent Baseline

In [None]:
def test_random_agent(episodes=100):
    """Test random agent performance."""
    env = gym.make('CartPole-v1')
    rewards = []
    
    for episode in range(episodes):
        obs, _ = env.reset()
        total_reward = 0
        
        while True:
            action = env.action_space.sample()  # Random action
            obs, reward, terminated, truncated, _ = env.step(action)
            total_reward += reward
            
            if terminated or truncated:
                break
        
        rewards.append(total_reward)
    
    env.close()
    return rewards

# Run the random baseline and print its summary statistics
random_rewards = test_random_agent(100)
baseline_mean = np.mean(random_rewards)
baseline_std = np.std(random_rewards)
print(f"Random agent average reward: {baseline_mean:.2f} ± {baseline_std:.2f}")
print(f"Random agent max reward: {np.max(random_rewards)}")

# One figure, two panels: per-episode returns (left) and their histogram (right)
fig, (ax_curve, ax_hist) = plt.subplots(1, 2, figsize=(12, 4))

ax_curve.plot(random_rewards)
ax_curve.set_title('Random Agent Episode Rewards')
ax_curve.set_xlabel('Episode')
ax_curve.set_ylabel('Reward')

ax_hist.hist(random_rewards, bins=20, alpha=0.7)
ax_hist.set_title('Random Agent Reward Distribution')
ax_hist.set_xlabel('Reward')
ax_hist.set_ylabel('Frequency')

fig.tight_layout()
plt.show()

## 3. Load and Analyze MLflow Experiments

In [None]:
# Set MLflow tracking URI
mlflow.set_tracking_uri("http://localhost:5000")

# Get experiment
experiment_name = "cartpole-dqn"

# Start from an empty frame so `runs_df` is always defined for the cells below,
# and so a failed lookup cannot leave runs from an earlier execution of this
# cell lying around in the kernel.
runs_df = pd.DataFrame()

try:
    experiment = mlflow.get_experiment_by_name(experiment_name)
    if experiment is not None:
        print(f"Experiment ID: {experiment.experiment_id}")
        print(f"Experiment Name: {experiment.name}")
        print(f"Artifact Location: {experiment.artifact_location}")

        # Get all runs
        runs_df = mlflow.search_runs(experiment_ids=[experiment.experiment_id])
        print(f"\nNumber of runs: {len(runs_df)}")

        if len(runs_df) > 0:
            # Display run summary, restricted to the columns this experiment logged
            print("\nRun Summary:")
            display_cols = ['run_id', 'status', 'start_time', 'metrics.summary_avg_reward',
                            'params.learning_rate', 'params.episodes']
            available_cols = [col for col in display_cols if col in runs_df.columns]
            print(runs_df[available_cols].head())
    else:
        print(f"Experiment '{experiment_name}' not found. Run training first.")

except Exception as e:
    # Deliberately broad: the notebook should remain usable without a tracking
    # server, so any failure is reported and `runs_df` stays empty.
    print(f"Error connecting to MLflow: {e}")
    print("Make sure MLflow server is running: docker-compose up mlflow-server")

## 4. Visualize Training Progress

In [None]:
# Plot experiment comparison
if 'runs_df' in locals() and len(runs_df) > 0:
    # Use the reward threshold registered for the environment this notebook
    # trains on. The previously hard-coded 195 is the CartPole-v0 value.
    # NOTE(review): if the training script defines "solved" differently,
    # align this line with it.
    solved_threshold = gym.spec('CartPole-v1').reward_threshold

    avg_reward_col = 'metrics.summary_avg_reward'
    has_avg_reward = avg_reward_col in runs_df.columns

    fig, (ax_bar, ax_lr, ax_solve) = plt.subplots(1, 3, figsize=(15, 5))

    # Plot 1: Average rewards comparison
    if has_avg_reward:
        ax_bar.bar(range(len(runs_df)), runs_df[avg_reward_col])
        ax_bar.set_title('Average Reward by Run')
        ax_bar.set_xlabel('Run Index')
        ax_bar.set_ylabel('Average Reward')
        ax_bar.axhline(y=solved_threshold, color='r', linestyle='--', label='Solved Threshold')
        ax_bar.legend()

    # Plot 2: Learning rate vs performance
    if 'params.learning_rate' in runs_df.columns and has_avg_reward:
        # MLflow stores params as strings; coerce so a missing or malformed
        # value becomes NaN (and is simply not drawn) instead of raising.
        lr_vals = pd.to_numeric(runs_df['params.learning_rate'], errors='coerce')
        ax_lr.scatter(lr_vals, runs_df[avg_reward_col])
        ax_lr.set_title('Learning Rate vs Average Reward')
        ax_lr.set_xlabel('Learning Rate')
        ax_lr.set_ylabel('Average Reward')
        ax_lr.set_xscale('log')

    # Plot 3: Episodes to solve (only runs that logged the metric)
    if 'metrics.episodes_to_solve' in runs_df.columns:
        solve_episodes = runs_df['metrics.episodes_to_solve'].dropna()
        if len(solve_episodes) > 0:
            ax_solve.hist(solve_episodes, bins=10, alpha=0.7)
            ax_solve.set_title('Episodes to Solve Distribution')
            ax_solve.set_xlabel('Episodes')
            ax_solve.set_ylabel('Frequency')

    fig.tight_layout()
    plt.show()

## 5. Load and Test Trained Model

In [None]:
def load_best_model(model_path='../models/best_model.pth', state_size=4, action_size=2):
    """Build a DQNAgent and load saved weights into it.

    Args:
        model_path: Checkpoint file handed to ``agent.load_model``.
        state_size: Observation dimension passed to ``DQNAgent``.
        action_size: Number of discrete actions passed to ``DQNAgent``.

    Returns:
        The loaded agent, or ``None`` if the checkpoint is missing or loading
        fails (a message is printed in both cases).
    """
    try:
        # Create agent
        agent = DQNAgent(state_size=state_size, action_size=action_size)

        # Load model
        checkpoint = agent.load_model(model_path)
    except FileNotFoundError:
        print(f"Model file not found: {model_path}")
        print("Train a model first: python src/train.py")
        return None
    except Exception as e:
        print(f"Error loading model: {e}")
        return None

    print(f"Model loaded successfully from {model_path}")

    # The epsilon line is informational only. Keep it outside the loading
    # try-block so a checkpoint without a usable 'epsilon' entry (or a
    # load_model that returns nothing) does not throw away a loaded agent.
    try:
        print(f"Training epsilon: {checkpoint['epsilon']:.4f}")
    except (TypeError, KeyError, ValueError):
        pass

    return agent

# Load model
# `trained_agent` is None when load_best_model() could not load a checkpoint;
# the cells below test it before evaluating or analysing the agent.
trained_agent = load_best_model()

In [None]:
def evaluate_trained_agent(agent, episodes=10, render=False):
    """Run the agent greedily on CartPole-v1 and collect episode returns.

    Args:
        agent: Object exposing ``act(obs, training=False)``. If ``None``, a
            message is printed and an empty list is returned.
        episodes: Number of evaluation episodes.
        render: When true, the environment is created with
            ``render_mode="human"``; otherwise no render mode is requested.

    Returns:
        list: Total reward of each episode, in the order played.
    """
    if agent is None:
        print("No agent provided")
        return []

    env = gym.make('CartPole-v1', render_mode="human" if render else None)
    rewards = []

    # try/finally so the environment (and any render window) is always closed.
    try:
        for episode in range(episodes):
            obs, _ = env.reset()
            total_reward = 0
            steps = 0
            done = False

            while not done:
                action = agent.act(obs, training=False)  # No exploration
                obs, reward, terminated, truncated, _ = env.step(action)
                total_reward += reward
                steps += 1
                done = terminated or truncated

            rewards.append(total_reward)
            # Report reward and step count separately; the old message printed
            # the reward but labelled it "steps".
            print(f"Episode {episode+1}: reward {total_reward} in {steps} steps")
    finally:
        env.close()

    return rewards

# Evaluate trained agent
if trained_agent:
    trained_rewards = evaluate_trained_agent(trained_agent, episodes=5)
    print(f"\nTrained agent average reward: {np.mean(trained_rewards):.2f}")
    print(f"Trained agent max reward: {np.max(trained_rewards)}")

    # Use the reward threshold registered for the environment being evaluated.
    # The previously hard-coded 195 is the CartPole-v0 value.
    # NOTE(review): align with the training script if it defines "solved" differently.
    solved_threshold = gym.spec('CartPole-v1').reward_threshold

    # Compare with random agent (first 50 baseline episodes)
    fig, ax = plt.subplots(figsize=(10, 6))
    ax.boxplot([random_rewards[:50], trained_rewards])
    # Label the boxes through the axis rather than boxplot's `labels=` keyword,
    # which newer Matplotlib releases deprecate in favour of `tick_labels=`.
    ax.set_xticklabels(['Random Agent', 'Trained DQN'])
    ax.set_title('Performance Comparison')
    ax.set_ylabel('Episode Reward')
    ax.axhline(y=solved_threshold, color='r', linestyle='--', label='Solved Threshold')
    ax.legend()
    plt.show()

## 6. Action Analysis

In [None]:
def analyze_agent_decisions(agent, num_samples=1000, follow_policy=False):
    """Sample states, record the agent's greedy action and Q-values, and plot them.

    Args:
        agent: Trained agent exposing ``act(obs, training=False)``,
            ``q_network`` and ``device``. If ``None``, a message is printed
            and nothing else happens.
        num_samples: Number of (state, action, Q-values) samples to collect.
        follow_policy: ``False`` (default) keeps the original sampling scheme,
            where every sample is a fresh ``env.reset()`` observation, so only
            initial states are analysed. ``True`` steps the environment with
            the agent's own action and resets only when an episode ends, so
            the samples are the states the agent actually visits.

    Returns:
        ``(states, actions, q_values)`` as NumPy arrays, or ``None`` when
        ``agent`` is ``None``.
    """
    if agent is None:
        print("No agent provided")
        return

    env = gym.make('CartPole-v1')

    states = []
    actions = []
    q_values = []

    # Collect data; try/finally so the environment is closed even if the agent
    # or the network raises part-way through.
    try:
        needs_reset = True
        for _ in range(num_samples):
            if needs_reset:
                obs, _info = env.reset()
                needs_reset = False

            # Greedy action as the agent itself would choose it
            action = agent.act(obs, training=False)

            # Raw Q-values for the same observation
            with torch.no_grad():
                state_tensor = torch.as_tensor(obs, dtype=torch.float32).unsqueeze(0).to(agent.device)
                q_vals = agent.q_network(state_tensor).cpu().numpy()[0]

            states.append(obs)
            actions.append(action)
            q_values.append(q_vals)

            if follow_policy:
                obs, _reward, terminated, truncated, _info = env.step(action)
                needs_reset = terminated or truncated
            else:
                needs_reset = True
    finally:
        env.close()

    states = np.array(states)
    actions = np.array(actions)
    q_values = np.array(q_values)

    _plot_state_distributions_by_action(states, actions)
    _plot_q_value_summary(q_values)

    return states, actions, q_values


def _plot_state_distributions_by_action(states, actions):
    """Draw one histogram per state variable, split by the chosen action."""
    state_names = ['Cart Position', 'Cart Velocity', 'Pole Angle', 'Pole Velocity']

    fig, axes = plt.subplots(2, 2, figsize=(15, 10))

    # axes.flat walks the 2x2 grid row by row, matching the state order above.
    for i, (ax, name) in enumerate(zip(axes.flat, state_names)):
        left_states = states[actions == 0, i]
        right_states = states[actions == 1, i]

        ax.hist(left_states, bins=30, alpha=0.5, label='Left (0)', density=True)
        ax.hist(right_states, bins=30, alpha=0.5, label='Right (1)', density=True)
        ax.set_title(f'{name} Distribution by Action')
        ax.set_xlabel(name)
        ax.set_ylabel('Density')
        ax.legend()

    fig.tight_layout()
    plt.show()


def _plot_q_value_summary(q_values):
    """Scatter Q(left) against Q(right) and histogram their difference."""
    fig, (ax_scatter, ax_diff) = plt.subplots(1, 2, figsize=(12, 4))

    ax_scatter.scatter(q_values[:, 0], q_values[:, 1], alpha=0.5)
    ax_scatter.set_xlabel('Q-value for Left (0)')
    ax_scatter.set_ylabel('Q-value for Right (1)')
    ax_scatter.set_title('Q-value Correlation')
    # Equal-value diagonal drawn across the observed Q range; a fixed [0, 1]
    # segment would miss the data whenever Q-values fall outside that interval.
    q_lo = float(q_values.min())
    q_hi = float(q_values.max())
    ax_scatter.plot([q_lo, q_hi], [q_lo, q_hi], 'r--', alpha=0.5)

    q_diff = q_values[:, 1] - q_values[:, 0]  # Right - Left
    ax_diff.hist(q_diff, bins=50, alpha=0.7)
    ax_diff.set_xlabel('Q-value Difference (Right - Left)')
    ax_diff.set_ylabel('Frequency')
    ax_diff.set_title('Action Preference Distribution')
    ax_diff.axvline(x=0, color='r', linestyle='--', alpha=0.5)

    fig.tight_layout()
    plt.show()

# Analyze agent decisions
# Guarded because analyze_agent_decisions() returns None (not a 3-tuple) when
# it is given no agent, which would make the unpacking below fail.
if trained_agent:
    states, actions, q_values = analyze_agent_decisions(trained_agent)

## 7. Hyperparameter Analysis

In [None]:
# Analyze hyperparameter effects (if multiple runs exist)
if 'runs_df' in locals() and len(runs_df) > 1:

    numeric_params = ['learning_rate', 'batch_size', 'epsilon_decay']
    numeric_metrics = ['summary_avg_reward', 'summary_max_reward', 'episodes_to_solve']

    # Only the columns this experiment actually logged
    param_cols = [f'params.{p}' for p in numeric_params if f'params.{p}' in runs_df.columns]
    metric_cols = [f'metrics.{m}' for m in numeric_metrics if f'metrics.{m}' in runs_df.columns]

    # Convert on a copy: `runs_df` is shared with earlier cells, and mutating it
    # here would make their results depend on whether this cell has been run.
    runs_numeric = runs_df.copy()
    for col_name in param_cols + metric_cols:
        runs_numeric[col_name] = pd.to_numeric(runs_numeric[col_name], errors='coerce')

    # Create correlation matrix
    if param_cols and metric_cols:
        # Imported before the figure is created so a missing seaborn install
        # fails cleanly instead of leaving an empty figure behind.
        import seaborn as sns

        corr_data = runs_numeric[param_cols + metric_cols].corr()

        plt.figure(figsize=(10, 8))
        sns.heatmap(corr_data, annot=True, cmap='coolwarm', center=0)
        plt.title('Hyperparameter-Performance Correlation')
        plt.tight_layout()
        plt.show()

    # Performance over time
    if 'start_time' in runs_numeric.columns and 'metrics.summary_avg_reward' in runs_numeric.columns:
        runs_sorted = runs_numeric.assign(
            start_time=pd.to_datetime(runs_numeric['start_time'])
        ).sort_values('start_time')

        # Use the reward threshold registered for the environment the runs
        # were trained on. The previously hard-coded 195 is the CartPole-v0 value.
        # NOTE(review): align with the training script if it defines "solved" differently.
        solved_threshold = gym.spec('CartPole-v1').reward_threshold

        plt.figure(figsize=(12, 6))
        plt.plot(runs_sorted['start_time'], runs_sorted['metrics.summary_avg_reward'], 'o-')
        plt.title('Performance Over Time')
        plt.xlabel('Experiment Time')
        plt.ylabel('Average Reward')
        plt.xticks(rotation=45)
        plt.axhline(y=solved_threshold, color='r', linestyle='--', label='Solved Threshold')
        plt.legend()
        plt.tight_layout()
        plt.show()

else:
    print("Not enough runs for hyperparameter analysis. Run multiple experiments with different parameters.")

## 8. Next Steps and Experiments

Based on this analysis, here are some experiments you can try:

1. **Hyperparameter Tuning**: Experiment with different learning rates, batch sizes, and network architectures
2. **Algorithm Variations**: Try Double DQN, Dueling DQN, or Rainbow DQN
3. **Different Environments**: Apply the same code to other Gymnasium environments
4. **Curriculum Learning**: Start with easier tasks and gradually increase difficulty
5. **Model Interpretability**: Analyze what features the network learns

### Quick Experiment Commands (run from the repository root):

```bash
# Try different learning rates
python src/train.py --lr 0.0001 --episodes 500
python src/train.py --lr 0.001 --episodes 500
python src/train.py --lr 0.01 --episodes 500

# Try different batch sizes
python src/train.py --batch-size 16 --episodes 500
python src/train.py --batch-size 64 --episodes 500
python src/train.py --batch-size 128 --episodes 500
```