## DDPG - Reacher

### Set up the environment

In [None]:
%config IPCompleter.greedy=True
from unityagents import UnityEnvironment
import numpy as np

# Paths to two Reacher Unity builds. Only `laptop_unity_filename` is passed to
# UnityEnvironment below; swap the argument to run the other build
# (presumably the "NoVis" one is a headless build — confirm).
aws_unity_filename= './Reacher_Linux_NoVis/Reacher.x86_64'
laptop_unity_filename= './Reacher_Linux/Reacher.x86_64'
# Fixed seed handed to the environment; the trailing comment keeps an
# alternative that would vary the seed from run to run.
seed= 0 #datetime.now().second
# Launch the Unity environment from the selected build.
env = UnityEnvironment(seed= seed, file_name= laptop_unity_filename)
# Work with the first brain the environment exposes.
brain_name = env.brain_names[0]
brain = env.brains[brain_name]

# Convenience wrappers around the Unity API that return data for the first agent of the first brain only
def env_reset(env, mode=True):
    """Reset the environment and return the first agent's observation.

    Args:
        env: Environment object exposing ``reset(train_mode=...)`` and
            ``brain_names``.
        mode: Value forwarded as ``train_mode`` to ``env.reset``.

    Returns:
        The first entry of ``vector_observations`` for the first brain.
    """
    reset_result = env.reset(train_mode=mode)
    first_brain = env.brain_names[0]
    observations = reset_result[first_brain].vector_observations
    return observations[0]

def env_step(env, action):
    """Advance the environment by one step and return the first agent's result.

    Args:
        env: Environment object exposing ``step(action)`` and ``brain_names``.
        action: Action forwarded unchanged to ``env.step``.

    Returns:
        Tuple ``(next_observation, reward, done)`` taken from index 0 of
        ``vector_observations``, ``rewards`` and ``local_done`` of the first
        brain's step info.
    """
    step_result = env.step(action)
    info = step_result[env.brain_names[0]]
    observation = info.vector_observations[0]
    reward = info.rewards[0]
    finished = info.local_done[0]
    return observation, reward, finished

### Training

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline
import torch
from torch.autograd import Variable
from collections import deque
from datetime import datetime

from ddpg import DDPG

# Upper bound on training episodes, and the windowed mean score at which the
# training loop below declares the environment solved and stops.
NUM_EPISODES = 300
SOLVED_IN= 30

# Wall-clock start; the elapsed time is printed once training has finished.
start= datetime.now()

# Reset once up front only to read the observation length for the config.
state= env_reset(env, True)
# Hyperparameters
# NOTE(review): how each entry is interpreted is defined by DDPG in ddpg.py,
# which is not visible here — the per-key notes below are assumptions to confirm.
config= {
    "label": "Noise",
    "state_size": len(state),  # length of the first agent's observation vector
    "action_size": brain.vector_action_space_size,
    "seed": seed,  # same seed that was given to the Unity environment
    "actor_lr": 0.001,
    "critic_lr": 0.001,
    "actor_nodes": [32, 32],  # presumably hidden-layer widths of the actor — confirm
    "critic_nodes": [128, 128],  # presumably hidden-layer widths of the critic — confirm
    "batch_size": 256,
    "memory_size": 100000,
    "discount": 0.9,
    "sigma": 0.0, # OUNoise
    "tau": 0.001,  # presumably the target-network soft-update rate — confirm
}


# Build the agent from the configuration above.
agent = DDPG(config= config)

# Number of most-recent episodes averaged for progress reporting and for the
# "solved" test. Defined once so the deque size, the periodic log line and the
# solved message cannot drift apart.
SCORE_WINDOW = 100
scores_window = deque(maxlen=SCORE_WINDOW)

for episode in range(NUM_EPISODES):
    state = env_reset(env, True)
    agent.reset()
    score = 0
    while True:
        action = agent.act(state)    # Agent action. Include noise
        next_state, reward, done = env_step(env, action)   # Environment step
        # Agent step. Includes learning from memory
        agent.step(state, action, reward, next_state, done)

        score += reward       # update the score
        state = next_state    # roll over the state to next time step
        if done:              # exit loop if episode finished
            break
    scores_window.append(score)
    agent.scores.append(score)              # save most recent score
    mean_w_scores = np.mean(scores_window)
    # Overwrite the same console line every episode ...
    print('\rEpisode {}\tAverage Score: {:.2f}  '.format(episode+ 1, mean_w_scores), end="")
    # ... and leave one permanent line behind after each full window.
    if (episode + 1) % SCORE_WINDOW == 0:
        print('\rEpisode {}\tAverage Score: {:.2f}  '.format(episode+ 1, mean_w_scores))
    # Only declare success once the window is full: a mean over fewer than
    # SCORE_WINDOW episodes is not the intended average, and the episode count
    # reported below (episodes elapsed before the window began) would be negative.
    window_full = len(scores_window) == SCORE_WINDOW
    if window_full and mean_w_scores >= SOLVED_IN:
        print('\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}'.format((episode+ 1)-SCORE_WINDOW, mean_w_scores))
        break

# Persist the agent whether training stopped early or ran out of episodes.
agent.save()

print('Elapsed time', datetime.now()- start)
        
def running_mean(x, N):
    """Return the simple moving average of `x` over a sliding window of `N` samples.

    Args:
        x: One-dimensional sequence of numbers.
        N: Window length; must be at least 1.

    Returns:
        NumPy array of length ``len(x) - N + 1`` whose element ``i`` is the
        mean of ``x[i:i + N]``. The array is empty when ``N`` exceeds
        ``len(x)``.

    Raises:
        ValueError: If `N` is smaller than 1. Without this check a negative
            `N` silently yields meaningless values and ``N == 0`` fails with
            an unrelated broadcasting error.
    """
    if N < 1:
        raise ValueError("window size N must be >= 1, got {}".format(N))
    # Prefix sums with a leading 0, so the sum of any window is the difference
    # of two entries that are N positions apart.
    cumsum = np.cumsum(np.insert(x, 0, 0))
    return (cumsum[N:] - cumsum[:-N]) / N

# Smooth the per-episode scores with a 10-episode moving average.
smoothed_scores = running_mean(agent.scores, 10)

# Draw the smoothed curve first (default colour), then the raw scores on top
# of it in translucent grey.
for series, style in (
    (smoothed_scores, {}),
    (agent.scores, {"color": 'grey', "alpha": 0.5}),
):
    plt.plot(np.arange(len(series)), series, **style)

plt.xlabel('Episode')
plt.ylabel('Reward')
plt.show()


### See how the agent behaves

In [None]:
import torch
from ddpg import DDPG

# Reset with train_mode=False and keep the first observation; it also provides
# the state size for the config below.
state= env_reset(env, False)
# Same values as the training cell's config, repeated so this cell can be run
# without running the training cell first.
# NOTE(review): presumably the network sizes must match those used to produce
# the checkpoint loaded below — confirm.
config= {
    "label": "Noise",
    "state_size": len(state),
    "action_size": brain.vector_action_space_size,
    "seed": seed,
    "actor_lr": 0.001,
    "critic_lr": 0.001,
    "actor_nodes": [32, 32],
    "critic_nodes": [128, 128],
    "batch_size": 256,
    "memory_size": 100000,
    "discount": 0.9,
    "sigma": 0.0, # OUNoise
    "tau": 0.001,
}
agent = DDPG(config= config)
# Load saved actor weights from disk. The map_location callable hands each
# storage back unchanged, which per the torch.load documentation keeps the
# tensors on the CPU even if they were saved from a GPU.
agent.actor.load_state_dict(torch.load("last_actor-aws.pth", map_location=lambda storage, loc: storage))
# Put the actor module into evaluation mode.
agent.actor.eval() 

# Play a single episode with the loaded actor, accumulating the reward.
score = 0
done = False
while not done:
    action = agent.act(state, False)
    next_state, reward, done = env_step(env, action)

    score += reward
    state = next_state
    # Refresh the running score in place on the same console line.
    print('\rScore: {:.1f} '.format(score), end="")

print("\rFinal score: {:.1f}".format(score))
