# Training Agent with DDPG

## Step 1: Import the libraries

In [None]:
import time
import random
import math
from collections import deque

import numpy as np
from IPython.display import clear_output
import matplotlib.pyplot as plt
import torch

from gym_unity.envs import UnityEnv

In [None]:
from ddpg_agent import DDPGAgent

## Step 2: Create our environment

Initialize the environment in the code cell below.


In [None]:
# Directory holding the Reacher build and the name of the build inside it;
# the two are concatenated to form the path handed to UnityEnv.
ENV_PATH = '../unity_envs/Reacher/'
ENV_NAME = 'Unity Environment'

# Launch the environment through the gym wrapper with graphics disabled and
# the multi-agent interface enabled.
env = UnityEnv(
    ENV_PATH + ENV_NAME,
    worker_id=0,
    multiagent=True,
    no_graphics=True,
)

In [None]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print("Device: ", device)

## Step 3: Viewing our Environment

In [None]:
# Report the first dimension of the observation space and the action space
# object exposed by the environment.
print("The size of state is: ", env.observation_space.shape[0])
print("No. of Actions: ", env.action_space)

## Step 4: Creating our Agent

In [None]:
# --- Problem dimensions ---
STATE_SIZE = env.observation_space.shape[0]
# NOTE(review): ACTION_SIZE and NUM_AGENTS are hard-coded rather than read
# from the environment -- confirm they match the loaded build.
ACTION_SIZE = 4
NUM_AGENTS = 20

# --- Replay / update schedule ---
BUFFER_SIZE = int(1e6)      # replay buffer size
BATCH_SIZE = 512            # Update batch size
UPDATE_EVERY = 4            # how often to update the network

# --- Optimisation ---
GAMMA = 0.99                # discount factor
LR_ACTOR = 1e-4             # Actor learning rate
LR_CRITIC = 1e-3            # Critic learning rate
TAU = 1e-3                  # for soft update of target parameters

# Arguments are passed positionally, in the order the original call used.
agent = DDPGAgent(
    STATE_SIZE,
    ACTION_SIZE,
    BUFFER_SIZE,
    BATCH_SIZE,
    GAMMA,
    LR_ACTOR,
    LR_CRITIC,
    TAU,
    UPDATE_EVERY,
    NUM_AGENTS,
    device,
)

## Step 5: Creating Helper Function

### Plot Training


In [None]:
def plot_result(scores):
    """
    Draw the score-per-episode curve on a new figure and display it.

    Params
    ======
        scores (sequence): one score per episode, in episode order
    """
    # x-axis: zero-based episode index, one point per recorded score
    episode_axis = np.arange(len(scores))

    figure = plt.figure()
    figure.add_subplot(111)  # single axes on the new figure
    plt.plot(episode_axis, scores)
    plt.xlabel('Episode #')
    plt.ylabel('Score')
    plt.show()


## Step 6: Train the Agent with DDPG

In [None]:
def train(n_episodes, max_steps, scores_average_window, benchmark_reward):
    """
    Run the training loop for the module-level `agent` in the module-level `env`.

    After every episode the score curve is re-plotted and the mean over the most
    recent `scores_average_window` episode scores is printed. When that mean
    reaches `benchmark_reward`, the model is saved through `agent.save_model`
    and training stops early.

    Params
    ======
        n_episodes (int): maximum number of training episodes
        max_steps (int): maximum number of environment steps per episode
        scores_average_window (int): number of most recent episode scores kept for the running mean
        benchmark_reward (float): benchmark reward at which environment is solved.

    Returns
    =======
        list: one entry per completed episode, each the mean of that episode's
              accumulated rewards across the NUM_AGENTS slots

    NOTE(review): the running mean is compared with the benchmark on every
    episode, including before the window holds `scores_average_window`
    entries -- confirm that is intended.
    """
    episode_means = []
    recent_means = deque(maxlen=scores_average_window)

    for i_episode in range(1, n_episodes + 1):
        states = np.array(env.reset())
        episode_returns = np.zeros(NUM_AGENTS)
        agent.reset()

        for _step in range(max_steps):
            actions = agent.act(states)
            next_states, rewards, dones, _info = env.step(actions)
            episode_returns += rewards
            agent.step(states, actions, rewards, next_states, dones)
            # The episode ends as soon as any entry of `dones` is truthy.
            if np.any(dones):
                break
            states = np.array(next_states)

        # Record this episode's mean in both the rolling window and the full history.
        mean_return = np.mean(episode_returns)
        recent_means.append(mean_return)
        episode_means.append(mean_return)

        # Refresh the cell output: redraw the curve, then print the running mean.
        clear_output(wait=True)
        plot_result(episode_means)
        window_mean = np.mean(recent_means)
        print('\rEpisode {}\tAverage Score: {:.2f}'.format(i_episode, window_mean))

        if float(window_mean) >= benchmark_reward:
            agent.save_model("reacher_solved.pth", episode_means)
            print("Yah Environment is solved :)")
            break

    return episode_means

In [None]:
# Training budget
NUM_EPISODES = 2000
MAX_STEPS = 1000

# Stopping criterion: running mean over the last SCORES_AVERAGE_WINDOW
# episode scores must reach BENCHMARK_REWARD.
SCORES_AVERAGE_WINDOW = 100
BENCHMARK_REWARD = 29

scores = train(
    n_episodes=NUM_EPISODES,
    max_steps=MAX_STEPS,
    scores_average_window=SCORES_AVERAGE_WINDOW,
    benchmark_reward=BENCHMARK_REWARD,
)
print("Done Training")

In [None]:
# Close the environment now that training has finished.
env.close()