## Changes made to original

1. model layer sizes: 400, 300 --> 64, 32
> result: flatlined below 10
2. model layer sizes: 400, 300 --> 256, 64 --> 256, 128
> result: all got to 30 about same time
3. multiple to single Agents.
4. integrate parallelEnv for multiple agents

## Import Necessary Modules

In [None]:
# render ai gym environment
#!pip install gymnasium[box2d]
import gymnasium as gym

#!pip install progressbar
#import progressbar as pb
import torch
import torch.nn as nn
import torch.nn.functional as F
from collections import deque, defaultdict
import random
import time
import numpy as np
# install package for displaying animation
#!pip install JSAnimation
#from JSAnimation.IPython_display import display_animation
import matplotlib
from matplotlib import animation
import matplotlib.pyplot as plt
%matplotlib inline

# Pick a rendering helper based on the active matplotlib backend.
# With an inline (notebook) backend, `display` is the IPython display module;
# otherwise `display` is rebound to a started pyvirtualdisplay `Display`.
# NOTE(review): the two branches bind `display` to different kinds of object,
# so later code must not assume a single API for it.
is_ipython = 'inline' in plt.get_backend()
if is_ipython:
    from IPython import display
else:  
    #!python -m pip install pyvirtualdisplay
    from pyvirtualdisplay import Display
    display = Display(visible=True, size=(1400, 900))
    display.start()

# Turn on matplotlib interactive mode; the return value is discarded.
_=plt.ion()

#import agents as AG
from parallelEnv import parallelEnv
from agents import Agents, device, BUFFER_SIZE, SEED, num_agents
MANY = bool(num_agents>1)
print("Using device: ",device) 

If *continuous=True* is passed, continuous actions (corresponding to the throttle of the engines) will be used and the action space will be `Box(-1, +1, (2,), dtype=np.float32)`. 

The **first coordinate** of an action determines the **throttle of the main engine**, while the **second** coordinate specifies the **throttle of the lateral boosters**. 

Given an action `np.array([main, lateral])`, the main engine will be turned off completely if **main < 0** and the throttle scales affinely from **50% to 100%** for **0 <= main <= 1** (in particular, the main engine doesn’t work with **less than 50% power**). 

Similarly, if **-0.5 < lateral < 0.5**, the lateral boosters will not fire at all. If **lateral < -0.5**, the **left booster** will fire, and if **lateral > 0.5**, the **right booster** will fire. Again, the throttle scales affinely from **50% to 100%** between **-1 and -0.5** (and **0.5 and 1**, respectively).

*gravity* dictates the gravitational constant; it is bounded to be within **0 and -12**.

If *enable_wind=True* is passed, there will be wind effects applied to the lander. The wind is generated using the function `tanh(sin(2 k (t+C)) + sin(pi k (t+C)))`. **k** is set to **0.01**. **C** is sampled randomly between **-9999 and 9999**.

*wind_power* dictates the maximum magnitude of linear wind applied to the craft. The recommended value for *wind_power* is between **0.0 and 20.0**. 

*turbulence_power* dictates the maximum magnitude of rotational wind applied to the craft. The recommended value for *turbulence_power* is between **0.0 and 2.0**.

## Environment

### Instantiate the Environment and Agent

In [None]:
#print([k for k in gym.envs.registry.keys() if "Continuous" in k])  #.all().keys()  #.make
#gym.envs.registry['LunarLanderContinuous-v2']

# Notebook-level override of the agent count imported from `agents`;
# MANY selects between the single-env and the parallel-env code paths below.
num_agents = 1
MANY = num_agents > 1

In [None]:
# Build either a bank of parallel environments with a multi-agent `Agents`
# instance, or one Gym environment with a single-agent `Agents` instance.
if MANY:
    ### PARALLEL ENVS
    envs = parallelEnv("LunarLanderContinuous-v2", n=num_agents, seed=SEED)
    state_size, action_size = (envs.observation_space.shape[0],
                               envs.action_space.shape[0])
    print(state_size, action_size)

    ### PARALLEL AGENTS
    # This branch passes checkpoint file names to the constructor.
    agents = Agents(
        state_size=state_size,
        action_size=action_size,
        num_agents=num_agents,
        random_seed=SEED,
        actor_filename='data/trained_actor.pth',
        critic_filename='data/trained_critic.pth',
    )
else:
    ### SINGLE ENV
    env = gym.make('LunarLanderContinuous-v2', render_mode="rgb_array")
    state_size, action_size = (env.observation_space.shape[0],
                               env.action_space.shape[0])
    print(state_size, action_size)

    ### SINGLE AGENT
    # No checkpoint file names are passed in this branch.
    agent = Agents(
        state_size=state_size,
        action_size=action_size,
        num_agents=1,
        random_seed=SEED,
    )

    # To resume from a saved high score instead, load the weights manually:
    #agent.actor_local.load_state_dict(torch.load('data/highscore_actor.pth'))#, map_location=torch.device('cpu')))
    #agent.critic_local.load_state_dict(torch.load('data/highscore_critic.pth'))#, map_location=torch.device('cpu')))

### Examine the State and Action Spaces

## Random Agent

## DDPG

### Train the Agent

In [None]:
### Single Agent Training
def sol_train(n_episodes=1000, print_every=100):
    """Train the notebook-level ``agent`` on the notebook-level ``env``.

    Runs up to ``n_episodes`` episodes. Each step asks ``agent.act`` for an
    action, applies it with ``env.step`` and hands the transition to
    ``agent.step``. A per-episode progress line is overwritten in place and a
    summary line is printed every ``print_every`` episodes.

    Training stops early, and the actor/critic weights are written to
    ``data/slvdpnt_*.pth``, once at least 100 scores in the rolling window
    are >= 200.

    Args:
        n_episodes: Maximum number of episodes to run.
        print_every: Size of the rolling score window and the number of
            episodes between summary lines.

    Returns:
        list: The total reward of every episode that was run.
    """
    episode_times = []
    episode_lengths = []
    scores_deque = deque(maxlen=print_every)
    scores = []

    for i_episode in range(1, n_episodes + 1):
        # Seed only the first reset of this call. Passing the same seed on
        # every reset re-seeds the environment RNG each time, so every
        # episode would replay the same initial conditions.
        if i_episode == 1:
            state, _ = env.reset(seed=SEED)
        else:
            state, _ = env.reset()
        agent.reset()
        score = 0
        episteps = 0
        epistart = time.time()
        while True:
            action = agent.act(state)
            next_state, reward, done, trun, _ = env.step(action)
            # Termination and truncation both end the episode for the agent.
            finished = done or trun
            agent.step(state, action, reward, next_state, finished)
            state = next_state
            score += reward
            episteps += 1
            if finished:
                break
        elapsed = time.time() - epistart
        scores_deque.append(score)
        scores.append(score)
        episode_lengths.append(episteps)
        episode_times.append(elapsed)

        # Position of the agent's step counter relative to BUFFER_SIZE.
        cycle_steps = agent.steps % BUFFER_SIZE
        buffer_cycle = agent.steps // BUFFER_SIZE

        print(
            "\rEpisode {} | Score:{:8.2f} | {:4d} Steps in {:5.3f} sec | Buffer cycle:{:3d} mod {:7d}".format(
                i_episode, score, episteps, elapsed, buffer_cycle, cycle_steps
            ),
            end="",
        )

        if i_episode % print_every == 0:
            print(
                "\rEpisodes {}-{} | Mean Score: {:7.2f} | Mean Steps {:4d} | Mean Time: {:5.3f} sec || Buffer cycle: {:3d} mod {:7d}".format(
                    i_episode - print_every,
                    i_episode,
                    np.mean(scores_deque),
                    round(np.mean(episode_lengths)),
                    np.mean(episode_times),
                    buffer_cycle,
                    cycle_steps,
                )
            )

        # Solved: at least 100 scores in the rolling window reached 200
        # (with the default window of 100, that is every one of them).
        if sum(np.array(scores_deque) >= 200) >= 100:
            print('\n\nEnvironment Solved in {:d} episodes!\tAverage Reward: {:.2f}'.format(
                i_episode - 100, np.mean(scores_deque)))
            torch.save(agent.actor_local.state_dict(), 'data/slvdpnt_actor.pth')
            torch.save(agent.critic_local.state_dict(), 'data/slvdpnt_critic.pth')
            break

    return scores #, agent_scores

#scores, agent_scores = ddpg()


In [None]:
def poly_train(n_episodes=1000, print_every=100, num_agents=20):
    """Train the notebook-level ``agents`` on the notebook-level ``envs``.

    Runs up to ``n_episodes`` episodes. Each step asks ``agents.act`` for a
    batch of actions, applies them with ``envs.step`` and hands the batch of
    transitions to ``agents.step``. An episode ends as soon as any entry of
    ``dones`` is true. A per-episode progress line is overwritten in place
    and a summary line is printed every ``print_every`` episodes.

    Training stops early, and the actor/critic weights are written to
    ``data/slvdpnt_*.pth``, once at least 100 scores in the rolling window
    are >= 200.

    Args:
        n_episodes: Maximum number of episodes to run.
        print_every: Size of the rolling score window and the number of
            episodes between summary lines.
        num_agents: Length of the per-agent score vector; it must match the
            number of rewards ``envs.step`` returns.

    Returns:
        tuple: ``(scores, agent_scores)``, where ``scores`` holds the
        per-episode sum of mean step rewards and ``agent_scores`` holds one
        per-agent cumulative reward array per episode.
    """
    episode_times = []
    episode_lengths = []
    scores_deque = deque(maxlen=print_every)
    scores = []
    agent_scores = []
    for i_episode in range(1, n_episodes + 1):
        epistart = time.time()
        states = envs.reset()
        agents.reset()
        episteps = 0
        score = 0
        individual_scores = np.zeros(num_agents)
        while True:
            actions = agents.act(states)
            next_states, rewards, dones, _ = envs.step(actions)
            agents.step(states, actions, rewards, next_states, dones)
            states = next_states
            score += np.mean(rewards)
            individual_scores += rewards
            episteps += 1
            # Stop the whole batch as soon as any one environment is done.
            if np.any(dones):
                break
        elapsed = time.time() - epistart
        scores_deque.append(score)
        scores.append(score)
        agent_scores.append(individual_scores)
        episode_lengths.append(episteps)
        episode_times.append(elapsed)

        # Position of the agents' step counter relative to BUFFER_SIZE.
        cycle_steps = agents.steps % BUFFER_SIZE
        buffer_cycle = agents.steps // BUFFER_SIZE

        print(
            "\rEpisode {} | Score: {:7.2f} | {:4d} Steps in {:5.3f} sec || Buffer cycle: {:3d} mod {:7d}".format(
                i_episode, score, episteps, elapsed, buffer_cycle, cycle_steps
            ),
            end="",
        )

        if i_episode % print_every == 0:
            print(
                "\rEpisodes {}-{} | Mean Score: {:7.2f} | Mean Steps {:4d} | Mean Time: {:5.3f} sec || Buffer cycle: {:3d} mod {:7d}".format(
                    i_episode - print_every,
                    i_episode,
                    np.mean(scores_deque),
                    round(np.mean(episode_lengths)),
                    np.mean(episode_times),
                    buffer_cycle,
                    cycle_steps,
                )
            )

        # Solved: at least 100 scores in the rolling window reached 200
        # (with the default window of 100, that is every one of them).
        if sum(np.array(scores_deque) >= 200) >= 100:
            print('\n\nEnvironment Solved in {:d} episodes!\tAverage Reward: {:.2f}'.format(i_episode - 100, np.mean(scores_deque)))
            torch.save(agents.actor_local.state_dict(), 'data/slvdpnt_actor.pth')
            torch.save(agents.critic_local.state_dict(), 'data/slvdpnt_critic.pth')
            break

    return scores, agent_scores

In [None]:
# Disabled legacy version of the multi-agent training loop, kept for reference.
# The `if False:` guard means none of this ever executes.
# NOTE(review): as written it could not run anyway: `print_every` and
# `n_episodes` are not defined at this scope (they were parameters of the old
# function), and the checkpoint lines use `agent` while the loop uses `agents`.
if False:
    # Former signature: poly_train(n_episodes=1000, print_every=100, num_agents=20)
    scores_deque = deque(maxlen=print_every)
    scores = []
    agent_scores = []
    for i_episode in (range(1, n_episodes+1)):
        states = envs.reset()
        agents.reset()
        score = 0
        individual_scores = np.zeros(num_agents)
        while True:
            actions = agents.act(states)
            next_states, rewards, dones, _ = envs.step(actions)
            agents.step(states, actions, rewards, next_states, dones)
            states = next_states
            score += np.mean(rewards)
            individual_scores += rewards
            # The episode ends for every environment once any one is done.
            if np.any(dones):
                break 
        scores_deque.append(score)
        scores.append(score)
        agent_scores.append(individual_scores)
        # Progress line, overwritten in place each episode.
        print('\rEpisode {}\tReward: {:.2f}\tAverage Reward: {:.2f}'.format(i_episode, score, np.mean(scores_deque)), end="")
        
        # Every 100 episodes, print the same line with a newline so it persists.
        if i_episode % 100 == 0:
            print('\rEpisode {}\tReward: {:.2f}\tAverage Reward: {:.2f}'.format(i_episode, score, np.mean(scores_deque)))
        
        # Solved when at least 100 scores in the rolling window are >= 200.
        if sum(np.array(scores_deque)>=200)>=100:
            print('\n\nEnvironment Solved in {:d} episodes!\tAverage Reward: {:.2f}'.format(i_episode-100,
                                                                                          np.mean(scores_deque)))
            torch.save(agent.actor_local.state_dict(), 'data/slvdpnt_actor.pth')
            torch.save(agent.critic_local.state_dict(), 'data/slvdpnt_critic.pth')
            break
            
    # The old function returned: scores, agent_scores

In [None]:
# Run one training session and append its results to any history left by a
# previous execution of this cell. `NameError` is the only expected failure:
# it means the history lists do not exist yet (first run).
if MANY:
    ## TRAIN MANY AGENTS
    # poly_train is the multi-agent trainer and returns (scores, agent_scores);
    # sol_train returns a single list and cannot be unpacked into two names.
    sc, ag = poly_train(n_episodes=1000, num_agents=num_agents)
    try:
        scores += sc
    except NameError:
        scores = sc
    try:
        agent_scores += ag
    except NameError:
        agent_scores = ag
else:
    ## TRAIN ONE AGENT
    sc = sol_train(n_episodes=1000)
    try:
        scores += sc
    except NameError:
        scores = sc

In [None]:
# Save the current weights of whichever learner this run created:
# `agents` exists only in multi-agent mode, `agent` only in single-agent mode.
_trained = agents if MANY else agent
torch.save(_trained.actor_local.state_dict(), 'data/retraining_actor.pth')
torch.save(_trained.critic_local.state_dict(), 'data/retraining_critic.pth')

In [None]:
### Plot Multiple Agent Scores
fig, ax = plt.subplots(figsize=(16, 8))
episode_axis = np.arange(1, len(scores) + 1)
ax.plot(episode_axis, scores, color='r', label="Mean Reward across {} Agents".format(num_agents))
# Per-agent curves exist only after a multi-agent run. Take the agent count
# from the recorded data rather than hard-coding 20.
if MANY:
    for per_agent in np.asarray(agent_scores).T:
        ax.plot(episode_axis, per_agent, alpha=0.1)
plt.legend()
ax.set_xlabel('Episode #', fontsize=14)
ax.set_ylabel('Reward', fontsize=14)
ax.set_title('Gym LunarLander Environment using DDPG', fontsize=16)
plt.show()

In [None]:
# Transpose the stored 5-field records in `agent.memory.memory` (presumably the
# replay buffer; confirm in agents.py) into one tuple per field, then show the
# first 20 stored actions as the cell output.
states, actions, rewards, next_states, dones = zip(*agent.memory.memory)
actions[:20]

When finished, you can close the environment.

In [None]:
# Close the single-agent Gym environment.
# NOTE(review): `env` is only created in single-agent mode; a multi-agent run
# holds `envs` instead.
env.close()

# Evaluation

In [None]:
#from unityagents import UnityEnvironment
import numpy as np
import random
import torch
from collections import deque
import matplotlib.pyplot as plt
import agents
%matplotlib inline

In [None]:
# Raw string: the backslashes in this Windows path are literal separators, not
# escape sequences (`\e` and `\R` are invalid escapes in a normal string).
# NOTE(review): the `UnityEnvironment` import in the cell above is commented
# out, so it must be restored before this cell can run.
env = UnityEnvironment(file_name=r'.\environment\Reacher_Windows_x86_64\Reacher.exe')

In [None]:
# Select the first brain the environment exposes and keep both its name
# (used below to index reset/step results) and the brain object itself.
brain_name = env.brain_names[0]
brain = env.brains[brain_name]

In [None]:
# Reset the environment in training mode and read the problem dimensions
# from the result.
env_info = env.reset(train_mode=True)[brain_name]
num_agents = len(env_info.agents)              # one entry per agent in the scene
action_size = brain.vector_action_space_size
states = env_info.vector_observations
state_size = states.shape[1]                   # second axis of the observation array

In [None]:
# Build the evaluation agent from saved checkpoints.
# `ddpg_agent` is never imported in this notebook; the cell above imports the
# `agents` module, whose `Agents` class is the one constructed earlier with
# this same signature.
# NOTE(review): confirm `agents.Agents` is the intended replacement.
agent = agents.Agents(state_size=state_size, action_size=action_size, num_agents=num_agents, random_seed=0,
                      actor_filename='checkpoint_actor2.pth', critic_filename='checkpoint_critic2.pth')

## Iteration 1

In [None]:
# Evaluation rollout 1: play one episode without exploration noise and report
# the mean of the per-agent cumulative rewards.
env_info = env.reset(train_mode=False)[brain_name]
states = env_info.vector_observations
scores = np.zeros(num_agents)
while True:
    actions = agent.act(states, add_noise=False)
    env_info = env.step(actions)[brain_name]
    rewards, dones = env_info.rewards, env_info.local_done
    scores += rewards
    # The new observations become the input for the next step.
    states = next_states = env_info.vector_observations
    if np.any(dones):
        break
print(f'Total score (averaged over agents) this episode: {np.mean(scores)}')

## Iteration 2

In [None]:
# Evaluation rollout 2: play one episode without exploration noise and report
# the mean of the per-agent cumulative rewards.
env_info = env.reset(train_mode=False)[brain_name]
states = env_info.vector_observations
scores = np.zeros(num_agents)
while True:
    actions = agent.act(states, add_noise=False)
    env_info = env.step(actions)[brain_name]
    rewards, dones = env_info.rewards, env_info.local_done
    scores += rewards
    # The new observations become the input for the next step.
    states = next_states = env_info.vector_observations
    if np.any(dones):
        break
print(f'Total score (averaged over agents) this episode: {np.mean(scores)}')

## Iteration 3

In [None]:
# Evaluation rollout 3: play one episode without exploration noise and report
# the mean of the per-agent cumulative rewards.
env_info = env.reset(train_mode=False)[brain_name]
states = env_info.vector_observations
scores = np.zeros(num_agents)
while True:
    actions = agent.act(states, add_noise=False)
    env_info = env.step(actions)[brain_name]
    rewards, dones = env_info.rewards, env_info.local_done
    scores += rewards
    # The new observations become the input for the next step.
    states = next_states = env_info.vector_observations
    if np.any(dones):
        break
print(f'Total score (averaged over agents) this episode: {np.mean(scores)}')

## Iteration 4

In [None]:
# Evaluation rollout 4: play one episode and report the mean of the per-agent
# cumulative rewards.
# NOTE(review): unlike iterations 1-3 this run passes add_noise=True; confirm
# that evaluating with exploration noise is intended.
env_info = env.reset(train_mode=False)[brain_name]
states = env_info.vector_observations
scores = np.zeros(num_agents)
while True:
    actions = agent.act(states, add_noise=True)
    env_info = env.step(actions)[brain_name]
    rewards, dones = env_info.rewards, env_info.local_done
    scores += rewards
    # The new observations become the input for the next step.
    states = next_states = env_info.vector_observations
    if np.any(dones):
        break
print(f'Total score (averaged over agents) this episode: {np.mean(scores)}')

In [None]:
env.close()