# Deep Deterministic Policy Gradients (DDPG)
---
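In this notebook, we train a DDPG agent on Gymnasium's MuJoCo `Reacher-v4` environment (the code was originally written for `BipedalWalker`, which remains available as a commented-out alternative).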



## 1. Import the Necessary Packages

In [None]:
!pip install gymnasium[mujoco]
#[box2d]
import gymnasium as gym
import random
import torch
import numpy as np
import pandas as pd
from google.colab import files
from collections import deque
import matplotlib.pyplot as plt
%matplotlib inline
from IPython import display

#from ddpg_agent_reacher_X import Agent

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


#### Models

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F

def hidden_init(layer):
    """Return the symmetric uniform range used to initialise a hidden layer.

    The bound is ``1 / sqrt(fan_in)``, where ``fan_in`` is the number of
    inputs feeding each unit of the layer.

    Params
    ======
        layer (nn.Linear): layer whose ``weight`` has shape
            ``(out_features, in_features)``

    Returns
    =======
        tuple: ``(-lim, lim)``, ready to be unpacked into ``uniform_``
    """
    # nn.Linear stores its weight as (out_features, in_features): the fan-in
    # is dimension 1. Dimension 0 is the number of output units.
    fan_in = layer.weight.data.size()[1]
    lim = 1. / np.sqrt(fan_in)
    return (-lim, lim)

class Actor(nn.Module):
    """Policy network: maps a state to an action bounded by tanh."""

    def __init__(self, state_size, action_size, seed, fc1_units=400, fc2_units=300):
        """Create the three linear layers and initialise their weights.

        Params
        ======
            state_size (int): Dimension of each state
            action_size (int): Dimension of each action
            seed (int): Random seed passed to ``torch.manual_seed``
            fc1_units (int): Width of the first hidden layer
            fc2_units (int): Width of the second hidden layer
        """
        super().__init__()
        # Seed before any layer is created so the default initialisation is reproducible.
        self.seed = torch.manual_seed(seed)
        self.fc1 = nn.Linear(state_size, fc1_units)
        self.fc2 = nn.Linear(fc1_units, fc2_units)
        self.fc3 = nn.Linear(fc2_units, action_size)
        self.reset_parameters()

    def reset_parameters(self):
        """Re-draw the weights: hidden layers from ``hidden_init``, output layer from a small fixed range."""
        for hidden in (self.fc1, self.fc2):
            low, high = hidden_init(hidden)
            hidden.weight.data.uniform_(low, high)
        # Small output weights keep the initial tanh pre-activations near zero.
        self.fc3.weight.data.uniform_(-3e-3, 3e-3)

    def forward(self, state):
        """Return the action for ``state``; every component lies in [-1, 1]."""
        hidden = F.relu(self.fc1(state))
        hidden = F.relu(self.fc2(hidden))
        return torch.tanh(self.fc3(hidden))

class Critic(nn.Module):
    """Value network: maps a (state, action) pair to a single Q-value."""

    def __init__(self, state_size, action_size, seed, fcs1_units=400, fc2_units=300):
        """Create the layers and initialise their weights.

        Params
        ======
            state_size (int): Dimension of each state
            action_size (int): Dimension of each action
            seed (int): Random seed passed to ``torch.manual_seed``
            fcs1_units (int): Width of the state-only first hidden layer
            fc2_units (int): Width of the second hidden layer
        """
        super().__init__()
        # Seed before any layer is created so the default initialisation is reproducible.
        self.seed = torch.manual_seed(seed)
        self.fcs1 = nn.Linear(state_size, fcs1_units)
        # The action is concatenated to the first layer's output, hence the wider input.
        self.fc2 = nn.Linear(fcs1_units + action_size, fc2_units)
        self.fc3 = nn.Linear(fc2_units, 1)
        self.reset_parameters()

    def reset_parameters(self):
        """Re-draw the weights: hidden layers from ``hidden_init``, output layer from a small fixed range."""
        for hidden in (self.fcs1, self.fc2):
            low, high = hidden_init(hidden)
            hidden.weight.data.uniform_(low, high)
        self.fc3.weight.data.uniform_(-3e-3, 3e-3)

    def forward(self, state, action):
        """Return the Q-value of taking ``action`` in ``state``.

        The state passes through the first layer alone; the action joins at
        the second layer via concatenation along dim 1.
        """
        state_features = F.relu(self.fcs1(state))
        joint = torch.cat((state_features, action), dim=1)
        return self.fc3(F.relu(self.fc2(joint)))

#### Agent

In [None]:
import numpy as np
import random
import copy
from collections import namedtuple, deque

import torch
import torch.nn.functional as F
import torch.optim as optim
#.cpu().data.numpy()

# Hyperparameters. The "was ..." figure in each comment is the value the
# original comment listed after "=" (presumably an earlier/reference setting).
BUFFER_SIZE = int(6.4e5)  # replay buffer capacity (was int(1e6))
BATCH_SIZE = 64        # minibatch size drawn per learning step (was 128)
GAMMA = 0.999          # discount factor (was 0.99)
TAU = 3e-3              # interpolation factor for the soft target update (was 1e-3)
LR_ACTOR = 1e-4         # learning rate of the actor optimizer (was 1e-4)
LR_CRITIC = 1e-3        # learning rate of the critic optimizer (was 3e-4)
WEIGHT_DECAY = 1e-6     # L2 weight decay of the critic optimizer (original note: "0.0001 to 0.")

# Run on the first CUDA GPU when one is available, otherwise on the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

class Agent():
    """DDPG agent that interacts with and learns from the environment.

    Holds a local/target pair of actor networks and of critic networks, an
    Ornstein-Uhlenbeck noise process, and a prioritized replay buffer.
    """

    def __init__(self, state_size, action_size, random_seed):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            random_seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        # random.seed() returns None, so seed first and keep the value itself.
        random.seed(random_seed)
        self.seed = random_seed

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size, random_seed).to(device)
        self.actor_target = Actor(state_size, action_size, random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(state_size, action_size, random_seed).to(device)
        self.critic_target = Critic(state_size, action_size, random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=LR_CRITIC,
                                           weight_decay=WEIGHT_DECAY)

        # Noise process
        self.noise = OUNoise(action_size, random_seed)

        # Replay memory
        self.memory = PriorityReplay(action_size, BUFFER_SIZE, BATCH_SIZE, random_seed)

    def step(self, state, action, reward, next_state, done):
        """Store one transition, then run a learning step on a prioritized batch.

        Learning only starts once the buffer holds more than BATCH_SIZE
        transitions. The priorities of the sampled transitions are refreshed
        from the values returned by ``learn``.
        """
        # Save experience / reward
        self.memory.add(state, action, reward, next_state, done)

        # Learn, if enough samples are available in memory
        if len(self.memory) > BATCH_SIZE:
            experiences, batch_ids = self.memory.sample()
            priority = self.learn(experiences, GAMMA)
            # learn() returns one single-element row per sample; flatten it.
            priority = [p[0] for p in priority]
            self.memory.update_priorities(batch_ids, priority)

    def act(self, state, add_noise=False):
        """Return the action for ``state`` under the current local policy.

        Params
        ======
            state (np.ndarray): current observation
            add_noise (bool): when True, perturb the action with scaled
                Ornstein-Uhlenbeck noise for exploration

        Returns
        =======
            np.ndarray: action clipped to [-1, 1]
        """
        state = torch.from_numpy(state).float().to(device)
        ### Using LOCAL actor ###
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()
        if add_noise:
            # Scaled down so that fewer noisy actions saturate at the clip bounds.
            action = action + 0.75 * self.noise.sample()
        return np.clip(action, -1, 1)

    def reset(self):
        """Reset the exploration noise process to its mean."""
        self.noise.reset()

    def learn(self, experiences, gamma):
        """Update policy and value parameters using given batch of experience tuples.
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor

        Returns
        =======
            list: one single-element list per sample holding |Q_expected - Q_targets| + 1e-5,
                used as the sample's new replay priority
        """
        # NOTE(review): `dones` is unpacked but intentionally unused, so the
        # target bootstraps from next_state even on episode-ending transitions
        # (open question in the original: mask with (1 - dones)?).
        states, actions, rewards, next_states, dones = experiences

        # ---------------------------- update critic ---------------------------- #
        # The target is a constant for the critic loss: computing it without
        # autograd avoids back-propagating into (and accumulating gradients on)
        # the target networks, which no optimizer ever reads.
        with torch.no_grad():
            actions_next = self.actor_target(next_states)
            Q_targets_next = self.critic_target(next_states, actions_next)
            # Compute Q targets for current states (y_i)
            Q_targets = rewards + gamma * Q_targets_next
        # Compute critic loss
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize critic loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        # Replay priority = absolute TD error of the critic (taken before this
        # update), offset so that no sample ends up with zero priority.
        critic_priority = torch.abs(Q_expected.detach() - Q_targets) + 0.00001

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()
        # Minimize actor loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ------------------- soft update target networks ----------------------- #
        self.soft_update(self.critic_local, self.critic_target, TAU)
        self.soft_update(self.actor_local, self.actor_target, TAU)

        return critic_priority.tolist()

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau*local_param.data + (1.0-tau)*target_param.data)

class OUNoise:
    """Ornstein-Uhlenbeck process: temporally correlated noise that reverts to ``mu``."""

    def __init__(self, size, seed, mu=0., theta=0.15, sigma=0.2):
        """Initialize parameters and noise process.

        Params
        ======
            size (int): number of independent noise components
            seed (int): seed for Python's ``random`` module, which drives the process
            mu (float): long-run mean of every component
            theta (float): strength of the pull back towards ``mu``
            sigma (float): scale of the random increments
        """
        self.mu = mu * np.ones(size)
        self.theta = theta
        self.sigma = sigma
        # random.seed() returns None, so seed first and keep the value itself.
        random.seed(seed)
        self.seed = seed
        self.reset()

    def reset(self):
        """Reset the internal state (= noise) to mean (mu)."""
        self.state = copy.copy(self.mu)

    def sample(self):
        """Update internal state and return it as a noise sample."""
        x = self.state
        # The increments must be zero-mean: uniform [0, 1) draws would push the
        # process steadily upwards instead of letting it fluctuate around mu.
        increments = np.array([random.gauss(0., 1.) for _ in range(len(x))])
        dx = self.theta * (self.mu - x) + self.sigma * increments
        self.state = x + dx
        return self.state

class PriorityReplay:
    """Fixed-size replay buffer that samples experiences in proportion to their priority.

    Each entry of ``memory`` is a two-element list ``[priority, experience]``.
    """

    def __init__(self, action_size, buffer_size, batch_size, seed):
        """Initialize a PriorityReplay object.
        Params
        ======
            action_size (int): dimension of each action (stored, not otherwise used here)
            buffer_size (int): maximum size of buffer
            batch_size (int): size of each training batch
            seed (int): seed for the buffer's random draws (non-negative int or None)
        """
        self.action_size = action_size
        self.memory = deque(maxlen=buffer_size)  # internal memory (deque)
        self.max_priority = 1.1
        self.batch_size = batch_size
        self.experience = namedtuple("Experience", field_names=["state",
                                                                "action",
                                                                "reward",
                                                                "next_state",
                                                                "done"])
        # random.seed() returns None, so seed first and keep the value itself.
        random.seed(seed)
        self.seed = seed
        # Every draw made by the buffer comes from this generator, so `seed`
        # really makes initial priorities and sampling reproducible.
        self.rng = np.random.default_rng(seed)

    def add(self, state, action, reward, next_state, done):
        """Add a new experience to memory with a random initial priority in [0, max_priority)."""
        init_priority = self.rng.random() * self.max_priority
        e = self.experience(state, action, reward, next_state, done)
        self.memory.append([init_priority, e])

    def update_priorities(self, batch_ids, new_priority):
        """Blend freshly computed priorities into the stored ones.

        Params
        ======
            batch_ids (sequence of int): buffer positions returned by ``sample``
            new_priority (sequence of float): new priority for each position

        The stored value becomes ``0.3 * new + 0.7 * old``. Whenever a new
        overall maximum is seen, it is recorded and a summary is printed.
        """
        if len(new_priority) == 0:
            return
        batch_max = max(new_priority)
        if batch_max > self.max_priority:
            self.max_priority = batch_max
            print("\n>>>>>>>> Max priority =", self.max_priority,"<<<<<<<<")
            print(">>>>>>>> Min priority =", min(new_priority),"<<<<<<<<")
            print(">>>>>>>> Avg priority =", sum(new_priority)/len(new_priority),"<<<<<<<<")
        for idx, new_p in zip(batch_ids, new_priority):
            entry = self.memory[idx]
            entry[0] = 0.3*new_p + 0.7*entry[0]

    def sample(self):
        """Draw a batch of distinct experiences with probability proportional to priority.

        Returns
        =======
            tuple: ``((states, actions, rewards, next_states, dones), batch_ids)``
                where the first element holds float tensors on ``device`` and
                ``batch_ids`` are the sampled buffer positions

        Raises
        ======
            ValueError: if the buffer holds fewer than ``batch_size`` experiences
        """
        n_stored = len(self.memory)
        if n_stored < self.batch_size:
            raise ValueError(
                "Cannot sample {} experiences from a buffer holding {}".format(self.batch_size, n_stored))

        priorities = np.fromiter((m[0] for m in self.memory), dtype=np.float64, count=n_stored)
        probs = priorities / priorities.sum()
        batch_ids = self.rng.choice(n_stored, size=self.batch_size, replace=False, p=probs)
        experiences = [self.memory[idx][1] for idx in batch_ids]

        states = torch.from_numpy(np.vstack([e.state for e in experiences])).float().to(device)
        actions = torch.from_numpy(np.vstack([e.action for e in experiences])).float().to(device)
        rewards = torch.from_numpy(np.vstack([e.reward for e in experiences])).float().to(device)
        next_states = torch.from_numpy(np.vstack([e.next_state for e in experiences])).float().to(device)
        dones = torch.from_numpy(np.vstack([e.done for e in experiences]).astype(np.uint8)).float().to(device)

        return (states, actions, rewards, next_states, dones), batch_ids

    def __len__(self):
        """Return the current size of internal memory."""
        return len(self.memory)

In [None]:
# Agent.__init__ takes no `state_highs` argument (state rescaling is disabled),
# so passing it raised a TypeError.
agent = Agent(state_size=state_size, action_size=action_size, random_seed=seed)

In [None]:
# Smoke test: draw one prioritized batch. Sampling is without replacement, so
# the buffer must already hold at least `batch_size` experiences.
agent.memory.sample()

In [None]:
# Short training run: 100 episodes of at most 1600 steps; the very low
# starting max_score means the first episode already counts as a high score.
scores, high_score = ddpg(n_episodes=100, max_t=1600, max_score=-10000)

In [None]:
# Split the replay buffer into parallel lists for inspection: every buffer
# entry is a [priority, experience] pair.
buffer = agent.memory.memory
priorities = [b[0] for b in buffer]
experiences = [b[1] for b in buffer]
#[p for p in priorities if p>3]

In [None]:
# (reward, priority) of non-terminal experiences whose priority is strictly between 5 and 10.
[(exp.reward, prio) for exp, prio in zip(experiences, priorities) if 5 < prio < 10 and not exp.done]

In [None]:
# (reward, priority) of terminal experiences whose priority is strictly between 5 and 10.
[(exp.reward, prio) for exp, prio in zip(experiences, priorities) if 5 < prio < 10 and exp.done]

In [None]:
# All stored experiences that were flagged as episode-ending.
[e for e in experiences if e.done]

## 2. Instantiate the Environment and Agent

**Reacher Rewards**

The reward consists of two parts:
* *reward_distance*: This reward is a measure of how far the fingertip of the reacher (the unattached end) is from the target, with a more negative value assigned when the reacher's fingertip is further away from the target. It is calculated as the negative vector norm of (position of the fingertip - position of target), or `-norm("fingertip" - "target")`.
* *reward_control*: A negative reward for penalising the reacher if it takes actions that are too large. It is measured as the negative squared Euclidean norm of the action, i.e. as `-sum(action**2)`.

The total reward returned is `reward = reward_distance + reward_control`

In [None]:
from ddpg_agent_reacher_X import Agent

# Build the environment; rgb_array rendering lets later cells capture frames.
# (Swap the id for 'BipedalWalker-v3' to train on the walker instead.)
seed = 0
env = gym.make('Reacher-v4', render_mode="rgb_array")

# Network input/output sizes come from the first entry of each space's shape
# (assumes 1-D observation and action spaces).
observation_shape = env.observation_space.shape
action_shape = env.action_space.shape
state_size = observation_shape[0]
action_size = action_shape[0]

agent = Agent(state_size=state_size, action_size=action_size, random_seed=seed)
print(state_size, action_size)

### Uncomment to run saved agents on CPU
#agent.actor_local.load_state_dict(torch.load('highscore_actorA_bip.pth', map_location=torch.device('cpu')))
#agent.critic_local.load_state_dict(torch.load('highscore_criticA_bip.pth', map_location=torch.device('cpu')))

## 3. Train the Agent with DDPG

Run the code cell below to train the agent from scratch.  Alternatively, you can skip to the next code cell to load the pre-trained weights from file.

In [None]:
def ddpg(n_episodes=2000, max_t=1600, max_score=-10000.):
    """Train the global ``agent`` on the global ``env`` with DDPG.

    Params
    ======
        n_episodes (int): number of episodes to run
        max_t (int): maximum number of steps per episode
        max_score (float): best score so far; an episode must beat it by at
            least 15 to be saved as a new high score

    Returns
    =======
        tuple: ``(scores, max_score)`` with the list of per-episode scores and
            the updated high score

    Side effects: writes 'highscore_*_bip.pth' on every new high score and
    'checkpoint_*_bip.pth' every 100 episodes.
    """
    scores_deque = deque(maxlen=100)
    scores = []
    for i_episode in range(1, n_episodes+1):
        # Seed only the first reset of the run: re-seeding on every reset would
        # restart the environment's RNG and make each episode begin identically.
        state, info = env.reset(seed=seed if i_episode == 1 else None)
        agent.reset()
        score = 0
        for t in range(max_t):
            action = agent.act(state)
            next_state, reward, done, trun, info = env.step(action)
            agent.step(state, action, reward, next_state, done or trun)
            state = next_state
            score += reward
            if done or trun:
                if done:
                    print("\nDone. Step: {}\t Reward: {}".format(t, reward))
                else:
                    print("\nTruncated. Step: {}\t Reward: {}".format(t, reward))
                break
        scores_deque.append(score)
        scores.append(score)
        # The margin of 15 keeps marginal improvements from rewriting the files.
        if score >= max_score + 15:
            torch.save(agent.actor_local.state_dict(), 'highscore_actor_bip.pth')
            torch.save(agent.critic_local.state_dict(), 'highscore_critic_bip.pth')
            print('\rEpisode {}\tNEW HIGH SCORE! {:.2f}'.format(i_episode,score))
            max_score = score
        print('\rEpisode {}\tAverage Score: {:.2f}\tScore: {:.2f}'.format(i_episode, np.mean(scores_deque), score), end="")
        if i_episode % 100 == 0:
            torch.save(agent.actor_local.state_dict(), 'checkpoint_actor_bip.pth')
            torch.save(agent.critic_local.state_dict(), 'checkpoint_critic_bip.pth')
            print('\rEpisode {}\tAverage Score: {:.2f}\tHigh Score: {:.2f}'.format(i_episode, np.mean(scores_deque), max_score))
    return scores, max_score


In [None]:
# Phased training schedule: run ddpg() once per phase and accumulate the scores.
#scores = ddpg(n_episodes=2000, max_t=800)
# ~23 min for N=1000 T=500
scores = []; new_scores = []; high_score = -10000.
n_episodes=[400, 400, 400]#, 300]#, 600,  300, 300, 600]
max_t=[100, 1200]#, 200, 600]#, 1200, 200, 400, 800]
# Quarter every phase length (400 -> 100 episodes).
n_episodes=[ne//4 for ne in n_episodes]

# NOTE(review): zip() stops at the shorter list, so only two of the three
# phases in n_episodes actually run -- confirm this is intended.
for ne, mt in zip(n_episodes, max_t):
    # NOTE(review): the per-phase limit from max_t is overwritten here and mt
    # is only used in the banner below; ddpg() is called without max_t and
    # therefore uses its own default.
    mt=1600
    print('\r### Episodes: {}\tTime Limit: {:.2f} ###'.format(ne,mt))
    new_scores, high_score = ddpg(n_episodes=ne, max_score=high_score)#, max_t
    scores += new_scores


In [None]:
# Continue training for 128 more episodes (at most 1200 steps each), carrying
# over the best score so far, and append the new scores to the history.
new_scores = []
new_scores, high_score = ddpg(n_episodes=128, max_t=1200, max_score=high_score)#, max_score=-10000), experiences
scores += new_scores

In [None]:
# Save both local networks, then download the files from the Colab runtime.
actor_checkpoint = 'checkpoint_actorB_bip.pth'
critic_checkpoint = 'checkpoint_criticB_bip.pth'
torch.save(agent.actor_local.state_dict(), actor_checkpoint)
torch.save(agent.critic_local.state_dict(), critic_checkpoint)
for checkpoint in (critic_checkpoint, actor_checkpoint):
    files.download(checkpoint)

In [None]:
### Scores plot
# Per-episode training score, with episodes numbered from 1.
fig, ax = plt.subplots(figsize=(12, 6))
episode_numbers = np.arange(1, len(scores) + 1)
ax.plot(episode_numbers, scores)
ax.set_ylabel('Score')
ax.set_xlabel('Episode #')
plt.show()

## 4. Observe

In the next code cell, you will load the trained weights from file to watch a smart agent!

In [None]:
# map_location remaps the stored tensors onto the active device, so a
# checkpoint saved on a GPU runtime also loads on a CPU-only one.
agent.actor_local.load_state_dict(torch.load('highscore_actorA_bip.pth', map_location=device))
agent.critic_local.load_state_dict(torch.load('highscore_criticA_bip.pth', map_location=device))

In [None]:
# Roll out the current policy for one episode of at most n_steps steps and
# keep one rendered frame for the initial state and for every step.
n_steps = 300
frames = []

for _episode in range(1):
    state, info = env.reset(seed=seed)
    frames.append(env.render())
    for _step in range(n_steps):
        action = agent.act(state)
        state, reward, done, trun, info = env.step(action)
        frames.append(env.render())
        episode_over = done or trun
        if episode_over:
            break

# Stack the frames into a single array and report its shape.
frames = np.asarray(frames)
print(frames.shape)

In [None]:
# Play the recorded frames back in place: reuse one image artist and redraw
# the figure after swapping in each frame.
img = plt.imshow(frames[0])
remaining_frames = frames[1:]
for frame in remaining_frames:
    img.set_data(frame)
    plt.axis('off')
    figure = plt.gcf()
    display.display(figure)
    display.clear_output(wait=True)


    
#env.close()

In [None]:
# Show the last action produced by the rollout above.
action

In [None]:
#### Get data
# Roll out the current policy `tries` times and record per-episode statistics.
rewards = []        # per episode: summed reward, excluding the episode-ending step
final_rewards = []  # per episode: reward of the episode-ending step (0 if max_t was hit first)
steps = []          # per episode: number of steps taken before the episode-ending one
actions = []        # per episode: list of the actions taken
tries = 1000
max_t = 1000
for i in range(tries):
    step_count = 0
    reward_sum = 0
    final_reward = 0
    # Start a fresh action list per episode; sharing one list across episodes
    # made every entry of `actions` the same ever-growing object.
    epi_acts = []
    # Seed only the first reset: re-seeding every time would make all
    # episodes identical and the statistics below meaningless.
    state, info = env.reset(seed=seed if i == 0 else None)
    for j in range(max_t):
        action = agent.act(state)
        epi_acts.append(action)
        state, reward, done, trun, info = env.step(action)
        if done or trun:
            final_reward = reward
            break
        reward_sum += reward
        step_count += 1
    actions.append(epi_acts)
    steps.append(step_count)
    rewards.append(reward_sum)
    # Exactly one entry per episode keeps this list aligned with steps/rewards.
    final_rewards.append(final_reward)

# Episodes of different lengths cannot form a rectangular array; in that case
# `actions` stays a list of per-episode lists.
if len({len(episode) for episode in actions}) <= 1:
    actions = np.asarray(actions)
data = np.asarray([(int(s),int(r),int(f)) for s,r,f in zip(steps, np.round(rewards), final_rewards)])

In [None]:
# Tabulate the collected evaluation data and show summary statistics per column.
data_df = pd.DataFrame(data, columns=["Steps", "Rewards", "Final Rewards"])
data_df.describe()

In [None]:
# Per-episode evaluation reward, followed by a one-line summary.
fig = plt.figure(figsize=(12,6))
ax = fig.add_subplot(111)
plt.plot(np.arange(1, len(rewards)+1), rewards)
plt.ylabel('Score')
plt.xlabel('Episode')
plt.show()
# `data` holds one row per episode with columns (steps, reward, final reward),
# so the summary has to aggregate over columns; data[1] / data[-1] would pick
# single episodes instead.
print("Total Rewards[:-1]", sum(data[:, 1]), "Average Reward:", np.mean(data[:, 1]), "Avg. Final Rewards:", np.mean(data[:, -1]))

# 5. Explore

In this exercise, we have provided a sample DDPG agent and demonstrated how to use it to solve an OpenAI Gym environment.  To continue your learning, you are encouraged to complete any (or all!) of the following tasks:
- **Amend the various hyperparameters and network architecture to see if you can get your agent to solve the environment faster than this benchmark implementation.**  Once you build intuition for the hyperparameters that work well with this environment, try solving a different OpenAI Gym task!
- Write your own DDPG implementation.  Use this code as reference only when needed -- try as much as you can to write your own algorithm from scratch.
- You may also like to implement **prioritized experience replay**, to see if it speeds learning.  
- The current implementation adds Ornstein-Uhlenbeck noise to the action space.  However, it has [been shown](https://blog.openai.com/better-exploration-with-parameter-noise/) that **adding noise to the parameters of the neural network policy can improve performance.  Make this change to the code, to verify it for yourself!**
- Write a blog post explaining the intuition behind the DDPG algorithm and demonstrating how to use it to solve an RL environment of your choosing.  

## How well does DQN with Tile Coding work?
* Reuse DQN from Project 1, but use Tile Coding to turn continuous actions into discrete ones
* Implement improvements on DQN from Project 1 first