In [None]:
!pip install pybullet
import torch
import torch.nn as nn
from torch.distributions import Normal
import numpy as np
import gym
import pybullet_envs
from IPython import display

# Run on the first CUDA device when one is available, otherwise on the CPU.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

# Clear this cell's output (e.g. the pip install log). The function has to be
# called; merely referencing `display.clear_output` does nothing.
display.clear_output()

Custom OpenAI Gym Walker2D environment extending the PyBullet implementation

In [None]:
from pybullet_envs.gym_locomotion_envs import WalkerBaseBulletEnv
from pybullet_envs.robot_locomotors import Walker2D

class Walker2DCustomBulletEnv(WalkerBaseBulletEnv):
  """Walker2D PyBullet environment with a commanded walking direction.

  The commanded direction is one of 1 (forward), 0 (stop) or -1 (reverse).
  `step` re-draws the direction at random intervals for training, while
  `enjoy` advances the simulation using whatever `self.direction` currently
  holds and reshapes the base reward according to that direction.
  """

  def __init__(self, render=False):
    """Build the walker robot and initialise the direction bookkeeping.

    Args:
      render: forwarded unchanged to `WalkerBaseBulletEnv.__init__`.
    """
    self.robot = Walker2D()
    WalkerBaseBulletEnv.__init__(self, self.robot, render)
    self.direction = 0      # direction forward:1 stop:0 reverse:-1
    self.mean_abs_p = 0     # running mean of |progress| per step
    self.m = 0.0            # sample count used by the running mean (capped at 100)
    self.counter = 0        # steps remaining before the direction is re-drawn

  def step(self, a):
    """Advance one step, re-drawing the commanded direction when its timer expires.

    Args:
      a: action forwarded to `enjoy`.

    Returns:
      The `(state, reward, done, info)` tuple produced by `enjoy`.
    """
    self.counter -= 1
    if self.counter <= 0:
      # The weights must be passed as `p=`; given positionally as the third
      # argument they are taken as `replace` and the draw becomes uniform.
      self.direction = np.random.choice([-1, 0, 1], 1, p=[0.4, 0.2, 0.4]).item()  # randomize direction for training
      self.counter = np.random.randint(60, 600)                                   # randomize direction durations
    return self.enjoy(a)

  def enjoy(self, a):
    """Advance one step with the current direction and reshape the reward.

    Args:
      a: action forwarded to the parent environment's `step`.

    Returns:
      `(state, reward, done, info)` from the parent `step`, with `state[2]`
      multiplied by the direction and the reward adjusted as commented below.
    """
    if self.m < 100:
      self.m += 1

    # Progress is the change in the parent's `potential` across this step.
    potential_old = self.potential
    state, reward, done, info = super().step(a)
    potential_new = self.potential
    progress = float(potential_new - potential_old)
    abs_p = abs(progress)

    # Incremental mean of |progress|; once m reaches 100 it stops growing, so
    # each new sample keeps a fixed 1/100 weight.
    self.mean_abs_p = (self.mean_abs_p * (self.m - 1) + abs_p) / self.m

    stopped = 1 - abs(self.direction)            # 1 when direction is 0, else 0
    directed_progress = progress * self.direction

    state[2] *= self.direction                   # target bearing observation
    reward -= 0.5                                # reduce stayalive bonus
    reward -= progress                           # extract underlying velocity reward
    reward += 1.5 * max(directed_progress, 0)    # correct for direction
    reward += min(directed_progress, 0)          # punish moving in wrong direction
    reward -= 3 * abs_p * stopped                # punish moving when direction 0
    reward += 1.5 * self.mean_abs_p * stopped    # reward not moving when direction 0

    return state, reward, done, info


In [None]:
%%capture --no-stderr
from gym.envs.registration import register
  
# Register the custom environment under its id so gym.make can construct it;
# max_episode_steps is the step limit requested for each episode.
register(id="Walker2DCustomBulletEnv-v0",
         entry_point=Walker2DCustomBulletEnv,
         max_episode_steps=1000)

Network Details

In [None]:
class ActorCritic(nn.Module):
  """Separate actor and critic MLPs with a learnable, state-independent action spread.

  The actor maps a state to the mean of a diagonal Normal distribution over
  actions; the critic maps a state to a scalar value estimate.

  Note: despite its name, `action_var` is passed to `Normal` as the `scale`
  argument, i.e. it acts as a standard deviation (initialised to 0.25).
  """

  def __init__(self, state_dim, action_dim, hidden_dim):
    """
    Args:
      state_dim: size of the observation vector.
      action_dim: size of the action vector.
      hidden_dim: width of the two hidden layers in both networks.
    """
    super().__init__()

    self.action_dim = action_dim
    # Register the spread as a genuine nn.Parameter. Calling `.to(device)` on a
    # Parameter returns a plain non-leaf tensor when the device differs, which
    # is neither registered on the module nor optimizable. The module is moved
    # as a whole by the caller's `.to(...)` instead.
    self.action_var = nn.Parameter(
        torch.full((action_dim,), 0.25, dtype=torch.float32), requires_grad=True)

    self.actor = nn.Sequential(
                    nn.Linear(state_dim, hidden_dim),
                    nn.ReLU(),
                    nn.Linear(hidden_dim, hidden_dim),
                    nn.ReLU(),
                    nn.Linear(hidden_dim, action_dim),
                )

    self.critic = nn.Sequential(
                    nn.Linear(state_dim, hidden_dim),
                    nn.ReLU(),
                    nn.Linear(hidden_dim, hidden_dim),
                    nn.ReLU(),
                    nn.Linear(hidden_dim, 1)
                )

  def act(self, state):
    """Sample an action for `state` without tracking gradients.

    Args:
      state: tensor whose last dimension is `state_dim`.

    Returns:
      `(action, logprob, value)`: the sampled action, its log-probability
      summed over the action dimension, and the critic's output for `state`.
    """
    with torch.no_grad():
      action_loci = self.actor(state)
      dist = Normal(action_loci, self.action_var)

      action = dist.sample()
      logprob = dist.log_prob(action).sum(dim=-1)
      value = self.critic(state)

    return action, logprob, value

  def evaluate(self, state, action):
    """Score given actions under the current policy, with gradients enabled.

    Args:
      state: tensor whose last dimension is `state_dim`.
      action: tensor whose last dimension is `action_dim`.

    Returns:
      `(logprobs, values)`: log-probabilities of `action` summed over the
      action dimension, and the critic's output for `state`.
    """
    action_loci = self.actor(state)
    dist = Normal(action_loci, self.action_var)

    logprobs = dist.log_prob(action).sum(dim=-1)
    values = self.critic(state)

    return logprobs, values

Continuous PPO Implemented with GAE Advantages, Clipped Surrogate Loss and Gradients


In [None]:
class PPO():
  """Clipped-surrogate PPO agent for continuous actions with GAE advantages.

  Transitions are accumulated in plain Python lists. `act` records the state,
  action, log-probability and value; the caller appends each reward to
  `self.rewards`. `traj_update` turns the newest, not yet processed stretch of
  rewards/values into advantages and discounted returns, and `update` runs
  the gradient steps and empties the buffers.
  """

  def __init__(self,
                state_dim,
                action_dim,
                learning_rate,
                gamma,
                gae_lambda,
                n_epochs,
                clip_rangege,
                max_grad_norm,
                hidden_dim,
                vf_coef):
    """
    Args:
      state_dim: size of the observation vector.
      action_dim: size of the action vector.
      learning_rate: Adam learning rate shared by all parameter groups.
      gamma: reward discount factor.
      gae_lambda: GAE decay factor.
      n_epochs: number of gradient steps per `update` call.
      clip_rangege: PPO ratio clipping range (stored as `self.clip_range`;
        the misspelled parameter name is kept so existing callers still work).
      max_grad_norm: gradient-norm clipping threshold.
      hidden_dim: hidden layer width of the actor and critic.
      vf_coef: weight of the value loss in the total loss.
    """
    super().__init__()

    self.gamma = gamma
    self.gae_lambda = gae_lambda
    # Use the constructor argument rather than a same-named notebook global.
    self.clip_range = clip_rangege
    self.max_grad_norm = max_grad_norm
    self.n_epochs = n_epochs
    self.vf_coef = vf_coef

    #Buffers
    self.actions = []
    self.states = []
    self.logprobs = []
    self.rewards = []
    self.gae_advs = []
    self.values = []
    self.traj_start = 0     # index of the first transition not yet processed by traj_update

    self.policy = ActorCritic(state_dim, action_dim, hidden_dim).to(device)
    self.optimizer = torch.optim.Adam([
                                        {'params':self.policy.actor.parameters()},
                                        {'params':self.policy.critic.parameters()},
                                        {'params':self.policy.action_var}
                                      ],lr =  learning_rate)

    self.value_loss = nn.MSELoss()

  def clear(self):
    """Empty every rollout buffer and reset the trajectory start index."""
    self.actions.clear()
    self.states.clear()
    self.logprobs.clear()
    self.rewards.clear()
    self.gae_advs.clear()
    self.values.clear()
    self.traj_start = 0

  def traj_update(self, padding):
    """Finalize the transitions collected since the previous call.

    Appends GAE advantages for the pending transitions to `self.gae_advs` and
    replaces their raw rewards in `self.rewards` with discounted returns.

    Args:
      padding: value estimate for the state following the last pending
        transition (0 for a terminal state). It is used for the advantages
        only; the discounted returns are computed from the rewards alone.
    """
    if self.traj_start >= len(self.rewards):
      return  # nothing pending

    traj_rewards = self.rewards[self.traj_start:]
    traj_values = self.values[self.traj_start:] + [padding]

    deltas = self.calculate_deltas(traj_rewards, traj_values, self.gamma)

    self.gae_advs += self.discounted_sum(deltas, self.gamma * self.gae_lambda)
    self.rewards[self.traj_start:] = self.discounted_sum(traj_rewards, self.gamma)

    self.traj_start = len(self.rewards)

  def discounted_sum(self, series, coef):
    """Return the reverse discounted cumulative sums of `series`.

    Element `k` of the result is `sum_j coef**j * series[k + j]`.

    Args:
      series: sequence of numbers or tensors.
      coef: discount applied per step.

    Returns:
      A list with the same length as `series`.
    """
    result = []
    accum = 0
    for term in reversed(series):
      # Build a new object each step. In-place `*=` / `+=` on a tensor would
      # mutate the entry already stored in `result`, leaving every element
      # aliased to one tensor holding only the final sum.
      accum = accum * coef + term
      result.append(accum)
    result.reverse()  # append + reverse instead of repeated insert(0, ...)
    return result

  def calculate_deltas(self, rew, vals, gamma):
    """Compute TD residuals `r_t + gamma * V_{t+1} - V_t`.

    Args:
      rew: rewards, length n.
      vals: value estimates, length n + 1 (the last entry is the value after
        the final transition).
      gamma: discount factor.

    Returns:
      List of n residuals.
    """
    gamma_v_1 = [v_i * gamma for v_i in vals[1:]]
    return [r + g_v_1 - v for r, g_v_1, v in zip(rew, gamma_v_1, vals[:-1])]

  def act(self, state):
    """Sample an action for `state` and record the transition data.

    Args:
      state: observation convertible to a float32 tensor.

    Returns:
      The sampled action as a NumPy array.
    """
    state = torch.tensor(state, dtype=torch.float32).to(device)
    self.states.append(state)

    with torch.no_grad():
      action, logprob, value = self.policy.act(state)

    self.actions.append(action)
    self.logprobs.append(logprob)
    self.values.append(value)

    return action.detach().cpu().numpy()

  def get_value(self, state):
    """Return the critic's output for `state` without tracking gradients."""
    with torch.no_grad():
      state = torch.tensor(state, dtype=torch.float32).to(device)
      value = self.policy.critic(state)
    return value

  def update(self):
    """Run `n_epochs` full-batch PPO gradient steps on the buffer, then clear it.

    Expects `traj_update` to have been called so that `self.rewards` holds
    discounted returns and `self.gae_advs` covers every buffered transition.
    """
    if not self.states:
      return  # empty buffer: nothing to learn from

    returns = torch.tensor(self.rewards, dtype=torch.float32).to(device)
    # Advantages may be plain numbers or one-element tensors; float() handles both.
    advantages = torch.tensor([float(adv) for adv in self.gae_advs],
                              dtype=torch.float32).to(device)
    advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-6)

    old_states = torch.stack(self.states).to(device)
    old_actions = torch.stack(self.actions).to(device)
    old_logprobs = torch.stack(self.logprobs).to(device)

    for _ in range(self.n_epochs):

      logprobs, new_values = self.policy.evaluate(old_states, old_actions)
      # Drop only the trailing value dimension so a batch of one keeps its batch axis.
      new_values = new_values.squeeze(-1)
      ratios = torch.exp(logprobs - old_logprobs)

      surrogate = ratios * advantages
      clipped_surrogate = torch.clamp(ratios, 1 - self.clip_range, 1 + self.clip_range) * advantages

      policy_loss = -torch.min(surrogate, clipped_surrogate).mean()
      value_loss = self.value_loss(new_values, returns)
      loss = policy_loss + self.vf_coef * value_loss

      self.optimizer.zero_grad()

      loss.backward()
      # Clip with the threshold given to the constructor, not a notebook global.
      nn.utils.clip_grad_norm_(self.policy.parameters(), self.max_grad_norm)
      self.optimizer.step()

    self.clear()


Hyper Parameters

In [None]:
%%capture --no-display
# Build the training environment from the id registered for the custom walker.
env = gym.make("Walker2DCustomBulletEnv-v0")

max_episode = env._max_episode_steps       # maximum steps per episode, read from the env
max_inters = int(3e6)                      # max environment interactions
print_freq = 10000                         # print a progress line every this many interactions
print_trials = 500                         # number of most recent episodes averaged for the mean reward
save_freq = int(1e5)                       # save the policy weights every this many interactions
hidden_dim = 256                           # hidden layer width of the actor and critic networks
buffer_size = 1024                        # number of interactions to buffer
n_epochs = 20                             # number of gradient descent steps
clip_range = 0.4                          # PPO clipped parameter
gamma = 0.99                              # reward discount factor
gae_lambda = .90                          # GAE factor
max_grad_norm = 0.5                       # Gradient clipping factor
learning_rate = 3e-5                      # learning rate
vf_coef = 0.5                             # Value function loss coef

# Network input/output sizes are taken from the environment's spaces.
state_dim = env.observation_space.shape[0]
action_dim = env.action_space.shape[0]

Instantiate PPO agent

In [None]:
# Create the PPO agent from the hyper-parameters defined above
# (arguments are positional, in the order PPO.__init__ declares them).
agent = PPO(
    state_dim, action_dim,
    learning_rate, gamma, gae_lambda,
    n_epochs, clip_range, max_grad_norm,
    hidden_dim, vf_coef,
)

Training

In [None]:
# Total reward of every finished training episode, in order of completion.
rewards_history = []

def train(start_iters, end_iters, start_episode):
    """Collect experience with the global `agent` in the global `env` and train it.

    Runs whole episodes until the interaction counter reaches `end_iters`
    (the episode in progress is always finished, so the counter may overshoot).
    Every `buffer_size` interactions the agent is updated, every `save_freq`
    interactions the policy weights are written to 'agent.pt', and every
    `print_freq` interactions a progress line is printed. Each episode's total
    reward is appended to `rewards_history`. The environment is closed on exit.

    Args:
      start_iters: initial value of the interaction counter.
      end_iters: interaction count at which no further episode is started.
      start_episode: initial value of the episode counter.
    """
    i = start_iters
    episode = start_episode

    while i < end_iters:
      episode_reward = 0

      state = env.reset()

      for _ in range(max_episode):

        action = agent.act(state)

        state, reward, done, info = env.step(action)

        agent.rewards.append(reward)

        i += 1
        episode_reward += reward

        if i % buffer_size == 0:                        #PPO Update once buffer Full
          # Bootstrap from the critic only when the episode is still running;
          # a finished episode is padded with 0, matching the `done` branch below.
          padding = 0 if done else agent.get_value(state)
          agent.traj_update(padding)
          agent.update()

        if i % save_freq == 0:
          torch.save(agent.policy.state_dict(),'agent.pt')

        if i % print_freq == 0:
          # No episode may have finished yet; avoid np.mean on an empty list.
          recent = rewards_history[-print_trials:]
          mean_reward = np.mean(recent) if recent else float('nan')

          print(f"Interaction :{i:7}\tEpisode :{episode:5}\tMean Reward : {mean_reward:5.1f}")

        if done:
          # No-op when the buffer was flushed on this very step.
          agent.traj_update(0)
          break

      episode += 1
      rewards_history.append(episode_reward)

    env.close()



In [None]:
# Train from scratch: the interaction and episode counters both start at 0.
train(0,max_inters,0)

In [None]:
import matplotlib.pyplot as plt

%matplotlib inline

def plot_rewards(reward_list,trials,start_epoch):
    """Plot the moving average of episode rewards.

    Args:
      reward_list: sequence of per-episode rewards.
      trials: moving-average window length in episodes; clamped to the range
        [1, len(reward_list)] so short histories still produce a valid curve.
      start_epoch: episode number used for the first plotted point.
    """
    data_list = np.asarray(reward_list, dtype=float)
    if data_list.size == 0:
        # np.convolve rejects empty input; there is nothing to draw.
        return

    # With a window longer than the data, 'valid' mode swaps the operands and
    # the result is no longer a moving average of the rewards.
    window = max(1, min(int(trials), data_list.size))
    mean_data_list = np.convolve(data_list, np.ones(window), 'valid') / window

    plt.figure(figsize=(8,8))
    x = np.arange(start_epoch, start_epoch + mean_data_list.size)
    plt.plot(x, mean_data_list, label='Mean Train Reward')
    plt.xlabel('Episode', fontsize=20)
    plt.ylabel('Mean Reward', fontsize=20)
    plt.legend(loc='lower right')
    plt.grid()

In [None]:
# Plot the training reward averaged over a window of print_trials episodes.
plot_rewards(rewards_history,print_trials,0)

Load Pretrained Model

In [None]:
# Discard the trained agent and build a fresh one with the same hyper-parameters.
del agent
agent = PPO(state_dim,
                action_dim,
                learning_rate,
                gamma,
                gae_lambda,
                n_epochs,
                clip_range,
                max_grad_norm,
                hidden_dim,
                vf_coef
                )
# Restore the saved weights. map_location remaps the stored tensors onto the
# current device, so a checkpoint written on a GPU also loads on a CPU-only machine.
agent.policy.load_state_dict(torch.load('agent.pt', map_location=device))

Rendering of Environment

In [None]:
from IPython import display
import time

# Roll the agent out in a fresh environment and show one rendered frame every
# 20 simulation steps, for at most 200 frames.
env = gym.make('Walker2DCustomBulletEnv-v0')
try:
  state = env.reset()
  img = plt.imshow(env.render(mode='rgb_array'))
  done = False
  for i in range(200):
    if done:
      break                                   # episode over: stop instead of stepping a finished env
    try:
      rendering = env.render(mode='rgb_array')
    except Exception:                         # rendering unavailable; do not swallow KeyboardInterrupt
      break
    if rendering.shape != (240,320,3):
      break
    img.set_data(rendering)
    display.display(plt.gcf())
    display.clear_output(wait=True)
    for _step in range(20):
      action = agent.act(state)
      state, r, done, _ = env.step(action)
      if done:
        break
      time.sleep(1/20.)
  # agent.act records every transition; drop them so playback does not leave
  # stale data in the training buffers.
  agent.clear()
finally:
  env.close()

# References

Barhate, N.. (2021). Minimal PyTorch Implementation of Proximal Policy Optimization. https://github.com/nikhilbarhate99/PPO-PyTorch.

Achiam, J. 2018. Spinning Up in Deep Reinforcement Learning. 

Antonin Raffin, Ashley Hill, Adam Gleave, Anssi Kanervisto, Maximilian Ernestus, and Noah Dormann 2021. Stable-Baselines3: Reliable Reinforcement Learning Implementations. Journal of Machine Learning Research, 22(268), p.1-8.

