import numpy as np
import torch
import matplotlib.pyplot as plt
from collections import deque
from unityagents import UnityEnvironment

# Import our modules
import sys
import os
sys.path.append('..')

from src.navigation.agents import Agent
from src.navigation.models import QNetwork
from src.navigation.buffers import ReplayBuffer
from src.navigation import config

%matplotlib inline

## 1. Import Dependencies

In [9]:
import numpy as np
import torch
import matplotlib.pyplot as plt
from collections import deque

# Import local unityagents instead of the package
import sys
import os
sys.path.append('..')
from unityagents import UnityEnvironment

# Import our modules
from src.navigation.agents import Agent
from src.navigation.models import QNetwork
from src.navigation.buffers import ReplayBuffer
from src.navigation import config

%matplotlib inline

## 2. Initialize Environment

In [10]:
# Initialize the real Unity environment
env = UnityEnvironment(file_name=config.UNITY_ENV_PATH)

# Get the default brain
brain_name = env.brain_names[0]
brain = env.brains[brain_name]

# Reset the environment
env_info = env.reset(train_mode=True)[brain_name]

# Get environment dimensions
state_size = len(env_info.vector_observations[0])
action_size = brain.vector_action_space_size

print(f'Number of agents: {len(env_info.agents)}')
print(f'State size: {state_size}')
print(f'Action size: {action_size}')

Running in mock mode...
Environment closed
Number of agents: 1
State size: 37
Action size: 4


## 3. Initialize Agent

In [11]:
# Create the agent
agent = Agent(state_size=state_size, action_size=action_size, seed=config.SEED)
print(f"Agent initialized with:")
print(f"  State size: {state_size}")
print(f"  Action size: {action_size}")
print(f"  Learning rate: {config.LEARNING_RATE}")
print(f"  Buffer size: {config.BUFFER_SIZE}")
print(f"  Batch size: {config.BATCH_SIZE}")

Agent initialized with:
  State size: 37
  Action size: 4
  Learning rate: 0.0005
  Buffer size: 100000
  Batch size: 64


## 4. Training Function

In [12]:
def train_agent(n_episodes=config.N_EPISODES, max_t=config.MAX_STEPS, 
                eps_start=config.EPS_START, eps_end=config.EPS_END, eps_decay=config.EPS_DECAY):
    """Train the DQN agent."""
    scores = []
    scores_window = deque(maxlen=100)
    eps = eps_start
    
    for i_episode in range(1, n_episodes+1):
        env_info = env.reset(train_mode=True)[brain_name]
        state = env_info.vector_observations[0]
        score = 0
        
        for t in range(max_t):
            action = agent.act(state, eps)
            env_info = env.step(action)[brain_name]
            next_state = env_info.vector_observations[0]
            reward = env_info.rewards[0]
            done = env_info.local_done[0]
            
            agent.step(state, action, reward, next_state, done)
            state = next_state
            score += reward
            
            if done:
                break
                
        scores_window.append(score)
        scores.append(score)
        eps = max(eps_end, eps_decay*eps)
        
        print('\rEpisode {}\tAverage Score: {:.2f}'.format(i_episode, np.mean(scores_window)), end="")
        
        if i_episode % 100 == 0:
            print('\rEpisode {}\tAverage Score: {:.2f}'.format(i_episode, np.mean(scores_window)))
            
        if np.mean(scores_window) >= config.SOLVE_SCORE:
            print('\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}'.format(
                i_episode-100, np.mean(scores_window)))
            torch.save(agent.qnetwork_local.state_dict(), config.CHECKPOINT_PATH)
            break
            
    return scores

## 5. Train the Agent

In [13]:
# Train the agent
scores = train_agent()

# Save the scores
np.save(config.SCORES_PATH, scores)

Episode 100	Average Score: 0.64
Episode 200	Average Score: 0.87
Episode 300	Average Score: 0.146
Episode 400	Average Score: 0.671
Episode 500	Average Score: -0.04
Episode 600	Average Score: -0.64
Episode 700	Average Score: 1.121
Episode 800	Average Score: 0.018
Episode 900	Average Score: -0.95
Episode 1000	Average Score: -0.86
Episode 1100	Average Score: 0.468
Episode 1200	Average Score: 0.401
Episode 1281	Average Score: -0.10

KeyboardInterrupt: 

## 6. Plot Training Results

In [None]:
# Plot the scores
fig = plt.figure(figsize=(10, 6))
ax = fig.add_subplot(111)
plt.plot(np.arange(len(scores)), scores)
plt.ylabel('Score')
plt.xlabel('Episode #')
plt.title('DQN Training Progress')
plt.grid(True)
plt.show()

## 7. Test the Trained Agent

In [None]:
def test_agent(num_episodes=5):
    """Test the trained agent."""
    # Load the trained weights
    agent.qnetwork_local.load_state_dict(torch.load(config.CHECKPOINT_PATH))
    
    for i_episode in range(1, num_episodes+1):
        env_info = env.reset(train_mode=False)[brain_name]
        state = env_info.vector_observations[0]
        score = 0
        
        while True:
            action = agent.act(state, eps=0.)  # No exploration during testing
            env_info = env.step(action)[brain_name]
            next_state = env_info.vector_observations[0]
            reward = env_info.rewards[0]
            done = env_info.local_done[0]
            score += reward
            state = next_state
            
            if done:
                break
                
        print(f"Episode {i_episode}: Score = {score}")

# Test the trained agent
test_agent()

## 8. Close Environment

In [None]:
# Close the environment
env.close()