# Deep Deterministic Policy Gradients (DDPG)
---
In this notebook, we train DDPG with OpenAI Gym's BipedalWalker-v2 environment.

### 1. Import the Necessary Packages

In [None]:
!pip install gymnasium[box2d]
import gymnasium as gym
import random
import numpy as np
import torch
import pandas as pd
#from google.colab import files
from collections import deque
import matplotlib.pyplot as plt
%matplotlib inline
from IPython import display

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

from ddpg_agent_bip_X import Agent

In [14]:
#!nvcc --version
device

device(type='cpu')

### 2. Instantiate the Environment and Agent

In [None]:
#gymnasium/envs/box2d/bipedal_walker.py
#gym.pprint_registry()
#gym.spec("BipedalWalker-v3")

In [18]:
from ddpg_agent_bip_X import Agent

env = gym.make('BipedalWalker-v3', render_mode="rgb_array")
seed=10
state_size=env.observation_space.shape[0]
action_size=env.action_space.shape[0]
agent = Agent(state_size=state_size, action_size=action_size, random_seed=seed)
print(state_size, action_size)

#agent.actor_local.load_state_dict(torch.load('./data/highscore_actor_bip.pth'), map_location=torch.device('cpu'))
#agent.critic_local.load_state_dict(torch.load('./data/highscore_critic_bip.pth'), map_location=torch.device('cpu'))

24 4


In [19]:
def ddpg(n_episodes=2000, max_t=1600, max_score=-10000.):
    scores_deque = deque(maxlen=100)
    scores = []
    #max_score = -10000 #-np.Inf
    for i_episode in range(1, n_episodes+1):
        state, _ = env.reset(seed=seed)
        agent.reset()
        score = 0
        
        for step in range(max_t):
            action = agent.act(state)
            next_state, reward, done, trun, _ = env.step(action)
            if reward==-100.:
                prob = step/1600
                reward = -8*prob
            else:
                prob = (1.+reward+(1600-step)/1600)
                if reward>=0.: 
                    reward = prob
                    prob = 8*prob
                else:
                    reward = prob
                    prob = 4*prob-2

            agent.step(state, action, reward, next_state, done or trun, prob)
            state = next_state
            score += reward
            if done or trun or reward==-3.0:
                break 
        scores_deque.append(score)
        scores.append(score)
        print('\rEpisode {}\tAverage Score: {:.2f}\tScore: {:.2f}'.format(i_episode, np.mean(scores_deque), score), end="")
        if score >= max_score + int(np.round(np.abs(0.2*max_score))):
            torch.save(agent.actor_local.state_dict(), './data/highscore_actor_bip.pth')
            torch.save(agent.critic_local.state_dict(), './data/highscore_critic_bip.pth')
            print('\rEpisode {}\tNEW HIGH SCORE! {:.2f}'.format(i_episode, score))
            max_score = score           
        if i_episode % 100 == 0:
            torch.save(agent.actor_local.state_dict(), './data/checkpoint_actor_bip.pth')
            torch.save(agent.critic_local.state_dict(), './data/checkpoint_critic_bip.pth')
            print('\rEpisode {}\tAverage Score: {:.2f}\tHigh Score: {:.2f}'.format(i_episode, np.mean(scores_deque), max_score))   
    return scores, max_score


### 3. Train the Agent with DDPG

Run the code cell below to train the agent from scratch.  Alternatively, you can skip to the next code cell to load the pre-trained weights from file.

In [None]:
from ddpg_agent_bip_X import BATCH_SIZE
#agent = Agent(state_size=state_size, action_size=action_size, random_seed=seed)
high_score=-1000000

In [None]:
scores, high_score = ddpg(n_episodes=500, max_score=high_score)#-1000000)
#scores += ores

In [None]:
#scores = ddpg(n_episodes=2000, max_t=800)
# ~23 min for N=1000 T=500
#scores = []; new_scores = []; high_score = -10000.
n_episodes=[400, 200, 200, 400]#, 300, 300, 600,  300, 300, 600]
max_t=     [200, 400, 1200, 1600]#, 200, 600, 1200, 200, 400, 800]
#n_episodes=[ne//4 for ne in n_episodes]

for ne, mt in zip(n_episodes, max_t):
    print('\r### Episodes: {}\tTime Limit: {:.2f} ###'.format(ne,mt))
    new_scores, high_score = ddpg(n_episodes=ne, max_score=high_score)
    scores += new_scores


In [None]:
for ne, mt in zip(n_episodes, max_t):
    print('\r### Episodes: {}\tTime Limit: {:.2f} ###'.format(ne,mt))
    new_scores, high_score = ddpg(n_episodes=ne, max_score=high_score)
    scores += new_scores

In [None]:
torch.save(agent.actor_local.state_dict(), './data/checkpoint_actor_bip.pth')
torch.save(agent.critic_local.state_dict(), './data/checkpoint_critic_bip.pth')
#files.download('checkpoint_criticA_bip.pth')
#files.download('checkpoint_actorA_bip.pth')

In [None]:
### Scores plot
fig = plt.figure(figsize=(12,6))
ax = fig.add_subplot(111)
plt.plot(np.arange(1, len(scores)+1), scores, )
plt.ylabel('Score')
plt.xlabel('Episode #')
plt.show()

### Examination

In [None]:
experiences = random.choices(agent.memory.memory, k=BATCH_SIZE, cum_weights=agent.memory.priority_weights)
estates = [e.state for e in experiences if e is not None]
eactions = [e.action for e in experiences if e is not None]
erewards = [e.reward for e in experiences if e is not None]
enext_states = [e.next_state for e in experiences if e is not None]
edones = [e.done for e in experiences if e is not None]

emu = np.mean(np.asarray(estates + enext_states), axis=0)
esig = np.std(np.asarray(estates + enext_states), axis=0) + 1e-3

#if len(agent.memory.memory)>200:
estates = (np.asarray(estates)-emu) / esig
enext_states = (np.asarray(enext_states)-emu) / esig
emu, esig

In [None]:
len(experiences), len(agent.memory.memory), emu, esig, 

In [None]:
import pandas as pd

## Read data
states_df = pd.read_csv('./data/states.csv', index_col=0)
nexts_df = pd.read_csv('./data/next_states.csv', index_col=0)
actions_df = pd.read_csv('./data/actions.csv', index_col=0)
norm_states_df = pd.read_csv('./data/norm_states.csv', index_col=0)
norm_nexts_df = pd.read_csv('./data/norm_next_states.csv', index_col=0)
rewprodon_df = pd.read_csv('./data/rewprodon.csv', index_col=0)

In [None]:
agmem = agent.memory.memory
probs = [pw for pw in agent.memory.priority_weights if pw is not None]
states = [e.state for e in agmem if e is not None]
actions = [e.action for e in agmem if e is not None]
rewards = [e.reward for e in agmem if e is not None]
nexts = [e.next_state for e in agmem if e is not None]
dones = [e.done for e in agmem if e is not None]
      
mu = np.mean(np.asarray(states + nexts), axis=0)
sig = np.std(np.asarray(states + nexts), axis=0) + 1e-3
norm_states = (np.asarray(states)-mu) / sig
norm_nexts = (np.asarray(nexts)-mu) / sig

In [None]:
states_df = pd.DataFrame(states)
norm_states_df = pd.DataFrame(norm_states)
actions_df = pd.DataFrame(actions)
nexts_df = pd.DataFrame(nexts)
norm_nexts_df = pd.DataFrame(norm_nexts)
rewprodon_df = pd.DataFrame(np.asarray([rewards,probs,dones]).T, columns=["rewards","probs","dones"])

In [None]:
## Save data
states_df.to_csv('./data/states.csv')
norm_states_df.to_csv('./data/norm_states.csv')
actions_df.to_csv('./data/actions.csv')
nexts_df.to_csv('./data/next_states.csv')
norm_nexts_df.to_csv('./data/norm_next_states.csv')
rewprodon_df.to_csv('./data/rewprodon.csv')

In [None]:
rewprodon_df.describe()

In [None]:
nexts_hist = nexts_df.hist(figsize=(12,14))

In [None]:
states_hist = states_df.hist(figsize=(12,14))

In [None]:
states_df.describe()

In [None]:
norm_states_hist = norm_states_df.hist(figsize=(12,14))

In [None]:
actions_hist = actions_df.hist(figsize=(12,14))

In [None]:
rewprodon_hist = rewprodon_df.hist(figsize=(12,14))

In [None]:
fig = plt.figure(figsize=(12,6))
ax = fig.add_subplot(111)
plt.plot(np.arange(1, len(rewards)+1), rewards)
plt.ylabel('Score')
plt.xlabel('Step #')
plt.show()

In [31]:
rewprodon_df[['rewards','probs']]

Unnamed: 0,rewards,probs
0,-0.120621,1.758758
1,-0.106384,1.787232
2,-0.176796,1.646407
3,-0.249274,1.501451
4,-0.302058,1.395884
5,-0.363889,1.272223
6,-0.442580,1.114841
7,-0.486385,1.027231
8,-0.543145,0.913710
9,-0.574304,0.851391


In [49]:
#import torch.nn as nn
# With Learnable Parameters
m = nn.BatchNorm1d(state_size)
# Without Learnable Parameters
#m = nn.BatchNorm1d(100, affine=False)
batch_size = 32
input = torch.randn(batch_size, state_size)
output = m(input)
input.shape, output.shape

(torch.Size([32, 24]), torch.Size([32, 24]))

In [60]:
#states = 
samps = np.asarray(states_df.sample(n=32, replace=False, weights=rewprodon_df['probs']))
torch.from_numpy(samps)

tensor([[-0.1184,  0.0000, -0.0000, -0.0000, -0.8079, -0.0000,  0.9158,
          0.0000,  1.0000,  1.1192,  0.0000,  0.9319,  0.0000,  1.0000,
          0.2929,  0.2962,  0.3066,  0.3253,  0.3549,  0.4003,  0.4712,
          0.5886,  0.8083,  1.0000],
        [-0.1236, -0.0000,  0.0000, -0.0000, -0.8136, -0.0000,  0.9166,
          0.0000,  1.0000,  1.1346,  0.0000,  0.9309,  0.0000,  1.0000,
          0.2894,  0.2927,  0.3029,  0.3214,  0.3506,  0.3955,  0.4655,
          0.5816,  0.7986,  1.0000],
        [-0.1228,  0.0000, -0.0000, -0.0000, -0.8070, -0.0000,  0.9001,
          0.0000,  1.0000,  1.1346,  0.0000,  0.9320,  0.0000,  1.0000,
          0.2887,  0.2920,  0.3022,  0.3206,  0.3498,  0.3945,  0.4644,
          0.5802,  0.7967,  1.0000],
        [ 0.3165, -0.0207,  0.1434, -0.0353, -0.8344,  0.0000,  0.9017,
          0.0000,  0.0000, -0.8098, -0.0000,  0.9027, -0.0000,  1.0000,
          0.4357,  0.4407,  0.4561,  0.4839,  0.5279,  0.5955,  0.7010,
          0.8757,  1.0000

In [66]:
btchnrm = nn.BatchNorm1d(state_size)
s = torch.from_numpy(np.asarray(sampstates).astype(np.float))
btchnrm(s)#.shape#, btchnrm(s).shape
#m(s)

RuntimeError: Expected object of type torch.DoubleTensor but found type torch.FloatTensor for argument #2 'weight'

### 4. Watch a Smart Agent!

In the next code cell, you will load the trained weights from file to watch a smart agent!

In [None]:
agent.actor_local.load_state_dict(torch.load('./data/highscore_actor_bip.pth'))
agent.critic_local.load_state_dict(torch.load('./data/highscore_critic_bip.pth'))

In [None]:
#### Record Frames from Episodes
episodes = []
n_episodes = 3
for ep in range(n_episodes):
    epiframes = []
    epirew = 0.
    max_t = 1600
    state, info = env.reset(seed=seed)
    for t in range(max_t):
        frame = env.render()
        action = agent.act(state)
        state, steprew, done, trun, info = env.step(action)
        if steprew<=-3.: steprew = -3.
        epirew += steprew
        epiframes.append([t+1, np.round(steprew,3), np.round(epirew,3), frame])
        if done or trun:
            break 
    episodes.append(epiframes)
    print("Total episode ", ep+1," rewards: ", np.round(epirew,3))

In [None]:
fig = plt.Figure(figsize=(12,10))
plt.axis('off')
for epiframes in episodes:
    img = plt.imshow(epiframes[0][3])
    for step, steprew, epirew, frame in epiframes[1:]:
        img.set_data(frame) 
        title = "Step: "+str(step)+"  Step Reward: "+str(steprew)+"   Episode Reward: "+str(epirew)
        plt.title(title)
        display.display(plt.gcf())
        display.clear_output(wait=True)


In [None]:
len(epiframes)

In [None]:
#### Get data
rewards = []
final_rewards = []
steps = []
actions = []
tries = 100
max_t = 400
for i in range(tries):
    step_count = 0
    reward_sum = 0
    state, info = env.reset(seed=seed)
    for j in range(max_t):
        action = agent.act(state)
        actions += [action]
        state, reward, done, trun, info = env.step(action)
        if done or trun:
            final_rewards += [reward]
            break 
        else:
            final_rewards += [0]
            reward_sum += reward
            step_count += 1
    steps += [step_count]
    rewards += [reward_sum]
actions = np.asarray(actions)
data = np.asarray([(int(s),int(r),int(f)) for s,r,f in zip(steps, np.round(rewards), final_rewards)])

In [None]:
#data = np.asarray([(int(s),int(r),int(f)) for s,r,f in zip(steps, np.round(rewards), final_rewards)])
[d for d in data if d[1]<0 ]

In [None]:
actions[-10:]

In [None]:
np.mean(data[:,1]/data[:,0])

In [None]:
fig = plt.figure(figsize=(12,6))
ax = fig.add_subplot(111)
plt.plot(np.arange(1, len(rewards)+1), rewards)
plt.ylabel('Score')
plt.xlabel('Episode')
plt.show()
print("Total Rewards[:-1]", sum(rewards), "Average Reward:", np.mean(rewards))

In [None]:
fig = plt.figure(figsize=(12,6))
ax = fig.add_subplot(111)
plt.plot(np.arange(1, len(rewards[:-1])+1), rewards[:-1])
plt.ylabel('Score')
plt.xlabel('Step #')
plt.show()
print("Total Rewards[:-1]", sum(rewards[:-1]), "Final Reward:", rewards[-1])

# 5. Explore

In this exercise, we have provided a sample DDPG agent and demonstrated how to use it to solve an OpenAI Gym environment.  To continue your learning, you are encouraged to complete any (or all!) of the following tasks:
- **Amend the various hyperparameters and network architecture to see if you can get your agent to solve the environment faster than this benchmark implementation.**  Once you build intuition for the hyperparameters that work well with this environment, try solving a different OpenAI Gym task!
- Write your own DDPG implementation.  Use this code as reference only when needed -- try as much as you can to write your own algorithm from scratch.
- You may also like to implement **prioritized experience replay**, to see if it speeds learning.  
- The current implementation adds Ornsetein-Uhlenbeck noise to the action space.  However, it has [been shown](https://blog.openai.com/better-exploration-with-parameter-noise/) that **adding noise to the parameters of the neural network policy can improve performance.  Make this change to the code, to verify it for yourself!**
- Write a blog post explaining the intuition behind the DDPG algorithm and demonstrating how to use it to solve an RL environment of your choosing.  

## How well does DQN with Tile Coding how well does work?
* Reuse DQN from Project 1, but use Tile Coding to turn continuous into discrete actions
* Implement improvements on DQN from Project 1 first