<a href="https://colab.research.google.com/github/gauravjain14/rl-implementations-pytorch/blob/master/a2c_pytorchv2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
!apt install swig cmake libopenmpi-dev zlib1g-dev
!pip install stable-baselines[mpi]==2.9.0
# Stable Baselines only supports tensorflow 1.x for now
%tensorflow_version 1.x

In [0]:
from stable_baselines.common.policies import MlpPolicy
from stable_baselines.common.vec_env import DummyVecEnv, SubprocVecEnv
from stable_baselines.common import set_global_seeds
from stable_baselines.common.cmd_util import *
from stable_baselines.common.vec_env import VecFrameStack

In [0]:
import os
import numpy as np
from collections import deque
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torchvision.transforms as transforms
from torch.distributions.categorical import Categorical
import torch.distributed as dist
from torch.multiprocessing import Process

import gym

# Run configuration shared by the cells below.
render = False  # not referenced by any other code visible in this notebook
update_size = 5  # steps per rollout; Storage(max_depth) and trajectory(update_size) default to the same value
num_processes = 16  # number of parallel environments requested from make_atari_env() in train()
n_stack = 4  # number of frames stacked by VecFrameStack in train()
env_id = 'PongNoFrameskip-v4'  # Gym environment id handed to make_atari_env() in train()


In [0]:
# feed forward network as described in Asynchronous methods for deep reinforcement learning
class ActorCriticFF(nn.Module):
  """Convolutional actor-critic network with a shared trunk and two heads.

  Two unpadded convolutions followed by a 256-unit linear layer produce a
  feature vector that feeds a softmax policy head and a scalar value head.

  Args:
    inp_channels: number of channels in the input images.
    dimh: input image height.
    dimw: input image width.
    actor_dim: number of outputs of the policy head.
    critic_dim: accepted for interface compatibility but not used; the
      value head always has a single output.
  """

  def __init__(self,inp_channels,dimh,dimw,actor_dim,critic_dim):
    super().__init__()
    self.conv1 = nn.Conv2d(inp_channels, 32, kernel_size=8, stride=4)
    self.conv2 = nn.Conv2d(32, 32, kernel_size=4, stride=2)
    self.relu = nn.ReLU()

    # Push each spatial extent through both convolutions (no padding, no
    # dilation) to find the number of features the linear layer receives.
    flat_features = 32
    for extent in (dimh, dimw):
      for kernel, stride in ((8, 4), (4, 2)):
        extent = (extent - kernel) // stride + 1
      flat_features *= extent

    self.linear1 = nn.Linear(flat_features, 256)
    self.softmax = nn.Softmax(dim=1)
    self.actor = nn.Linear(256, actor_dim)
    self.critic = nn.Linear(256, 1)

  def forward(self,x):
    """Return ``(action_probabilities, state_value)`` for a batch of images.

    ``action_probabilities`` has one row per sample, normalised by softmax
    along dim 1; ``state_value`` is the raw output of the value head.
    """
    features = self.relu(self.conv2(self.relu(self.conv1(x))))
    flattened = features.view(features.size(0), -1)
    hidden = self.relu(self.linear1(flattened))

    policy = self.softmax(self.actor(hidden))
    value = self.critic(hidden)
    return policy, value


In [0]:
# Is any kind of preprocessing required?
def preprocess_pong(x):
  """Center-crop one image with CenterCrop(80) and return it as a batched tensor.

  The input is converted to a PIL image, cropped, converted back with
  ToTensor(), and finally given a leading dimension of size 1.
  """
  to_pil = transforms.ToPILImage()  # because pytorch tutorial does so
  crop = transforms.CenterCrop(80)
  to_tensor = transforms.ToTensor()

  pipeline = transforms.Compose([to_pil, crop, to_tensor])
  cropped = pipeline(x)
  return cropped.unsqueeze(0)

def average_gradients(model):
    """Average every parameter gradient of ``model`` across all ranks.

    Each gradient is summed over the default process group with
    ``all_reduce`` and then divided by the world size, in place.
    ``torch.distributed`` must already be initialised by the caller.

    Args:
        model: module whose ``parameters()`` gradients are averaged.
    """
    size = float(dist.get_world_size())
    for param in model.parameters():
        # A parameter that took no part in the backward pass has no gradient
        # to reduce. NOTE(review): this assumes every rank skips the same
        # parameters; otherwise the collective calls would not line up.
        if param.grad is None:
            continue
        # ReduceOp is the supported replacement for the deprecated
        # dist.reduce_op alias.
        dist.all_reduce(param.grad.data, op=dist.ReduceOp.SUM)
        param.grad.data /= size

def discount_rewards_per_proc(rewards,dones,last_value,gamma=0.99):
  """Compute discounted returns for one environment's rollout.

  Args:
    rewards: 1-D array of per-step rewards.
    dones: 1-D array of 0/1 termination flags, same length as ``rewards``.
    last_value: value used to bootstrap the return after the final step;
      a scalar or any array holding exactly one element. It is ignored
      when the final step is flagged done.
    gamma: discount factor.

  Returns:
    Array with the shape and dtype of ``rewards`` whose entry ``t`` is
    ``rewards[t] + (1 - dones[t]) * gamma * return[t + 1]``.
  """
  discounted_r = np.zeros_like(rewards) # should be 1-D
  if len(rewards) == 0:
    # Nothing to discount; also avoids indexing dones[-1] on an empty rollout.
    return discounted_r

  running_add = 0.
  if dones[-1] == 0:
    # Collapse e.g. a shape-(1,) value row to a plain scalar so the
    # per-step assignment below always stores a scalar.
    running_add = np.asarray(last_value).item()
  for t in reversed(range(len(rewards))):
    running_add = rewards[t] + (1-dones[t])*gamma*running_add
    discounted_r[t] = running_add

  return discounted_r

def discount_rewards_batch(rewards,dones,last_values,gamma=0.99):
  """Compute discounted returns independently for every row of a batch.

  Args:
    rewards: array of shape (num_proc, update_length).
    dones: 0/1 termination flags with the same shape as ``rewards``.
    last_values: one bootstrap value per row, indexed as ``last_values[i]``.
    gamma: discount factor, forwarded to ``discount_rewards_per_proc``.

  Returns:
    Array with the shape and dtype of ``rewards``.
  """
  # assume rewards shape - num_proc x update_length
  discounted_r = np.zeros_like(rewards)
  for i in range(rewards.shape[0]):
    # gamma must be passed on explicitly, otherwise the per-row helper
    # silently falls back to its own default.
    discounted_r[i] = discount_rewards_per_proc(rewards[i,:],dones[i,:], \
                                                last_values[i],gamma)
  return discounted_r

def discount_rewards(rewards,dones,last_value,gamma=0.99):
  """Compute discounted returns for a time-major batch of rollouts.

  Args:
    rewards: array indexed as ``rewards[t, env]``.
    dones: 0/1 termination flags with the same shape as ``rewards``.
    last_value: currently ignored; the running return starts from zero,
      so returns are not bootstrapped from a value estimate.
    gamma: discount factor.

  Returns:
    Array with the shape and dtype of ``rewards`` whose row ``t`` is
    ``rewards[t] + (1 - dones[t]) * gamma * return[t + 1]``.
  """
  discounted_r = np.zeros_like(rewards)
  running_add = np.zeros((1,rewards.shape[1]))
  # Walk the rollout's own length instead of the notebook-level update_size
  # global, so the function works for any number of steps.
  for t in reversed(range(rewards.shape[0])):
    running_add = rewards[t,:] + (1-dones[t,:])*(gamma*running_add)
    discounted_r[t] = running_add

  return discounted_r

In [0]:
class Storage():
  """Fixed-size rollout buffer for a batch of parallel environments.

  Buffers are time-major: the first axis is the step index and the second
  the environment index. The observation buffer has one extra slot so that
  index 0 can hold the observation a rollout starts from.

  Args:
    obs_dim: shape of a single observation.
    num_processes: number of parallel environments.
    max_depth: number of steps that can be stored before ``rollover()``.
    dtype: dtype of the observation, reward, log-prob and value buffers.
  """

  def __init__(self,obs_dim,num_processes=1,max_depth=5,dtype=np.float32):
    step_shape = (max_depth,num_processes)
    # Zero-initialised so that slots read before being written are defined.
    self.obs_buf = np.zeros((max_depth+1,num_processes,*obs_dim),dtype=dtype)
    self.rew_buf = np.zeros(step_shape,dtype=dtype)
    self.logp_buf = np.zeros(step_shape,dtype=dtype)
    self.values_buf = np.zeros(step_shape,dtype=dtype)
    # Builtin int is what the np.int alias stood for; the alias itself was
    # removed in NumPy 1.24.
    self.done_buf = np.zeros(step_shape,dtype=int)
    self.last_actions = None
    self.curr_idx,self.max_depth = 0,max_depth

  def store(self,obs,rew,logp,done,value):
    """Record one step; ``obs`` goes into the slot after the current index."""
    assert(self.curr_idx < self.max_depth)
    self.obs_buf[self.curr_idx+1] = obs
    self.rew_buf[self.curr_idx] = rew
    self.logp_buf[self.curr_idx] = logp
    self.done_buf[self.curr_idx] = done
    self.values_buf[self.curr_idx] = value
    self.curr_idx += 1

  def store_last_actions(self,last_actions):
    """Remember the most recent batch of actions."""
    self.last_actions = last_actions

  def get(self):
    """Return all buffers as tensors with the environment axis first.

    The 'obs' entry contains slots 1..max_depth, i.e. the observations
    written by ``store()``, not the starting observation in slot 0.
    """
    # create tensors
    data = {}
    data['obs'] = self.obs_buf[1:,].swapaxes(0,1)
    data['rew'] = self.rew_buf.swapaxes(0,1)
    data['logp'] = self.logp_buf.swapaxes(0,1)
    data['done'] = self.done_buf.swapaxes(0,1)
    data['value'] = self.values_buf.swapaxes(0,1)

    return {k: torch.tensor(v) for k,v in data.items()}

  def get_last_actions(self):
    """Return whatever was passed to the last ``store_last_actions`` call."""
    return self.last_actions

  # store only obs
  def store_obs(self,obs,pos=0):
    """Write ``obs`` into observation slot ``pos`` without advancing the index."""
    self.obs_buf[pos] = obs

  def get_obs(self,pos=0):
    """Return a tensor copy of observation slot ``pos``."""
    return torch.tensor(self.obs_buf[pos])

  def rollover(self):
    """Start a new rollout: the last observation becomes slot 0, index resets."""
    self.obs_buf[0,:] = self.obs_buf[-1,:]
    self.curr_idx = 0

In [0]:
def trajectory(env,model,preprocess_fn,storage,update_size=5):
  """Roll the policy forward for ``update_size`` steps and record each step.

  Args:
    env: object whose ``step(action)`` returns
      ``(next_obs, reward, done, infos)`` with ``infos`` iterable of dicts.
    model: callable mapping an observation batch to
      ``(action_probs, critic_value)``.
    preprocess_fn: applied to every raw next observation before storing.
    storage: buffer providing ``get_obs``, ``store`` and
      ``store_last_actions``; slot 0 must already hold the starting
      observation.
    update_size: number of environment steps to take.

  Returns:
    List of ``info['episode']['r']`` values for every info dict that
    carried an 'episode' entry during the rollout.
  """
  reward_infos = []
  action = None  # stays None when update_size is 0

  x = storage.get_obs(0)
  for i in range(update_size):
    with torch.no_grad():
      action_probs,critic_value = model(x)
      m = Categorical(action_probs)
      action = m.sample()

    next_x,rew,done,infos = env.step(action)
    for info in infos:
      if 'episode' in info:
        reward_infos.append(info['episode']['r'])

    storage.store(preprocess_fn(next_x), rew, -m.log_prob(action), \
                  done, critic_value.squeeze())
    # The observation just stored sits one slot after the one acted on
    # (assuming the buffer index was reset before this call), so read
    # slot i+1; reading slot i would replay the stale observation.
    x = storage.get_obs(i + 1)

  # store last actions
  storage.store_last_actions(action)
  return reward_infos


In [0]:
def update(vec_env,obs,rews,dones,model,optimizer,last_actions, \
            preprocess_fn,value_coeff=0.5,beta=0.01):
  """Run one actor-critic gradient step on a collected rollout.

  Args:
    vec_env: vectorised environment; it is stepped once here with
      ``last_actions`` to obtain the observation used for bootstrapping.
    obs: observation batch fed to ``model`` in a single forward pass.
    rews: reward tensor passed (as numpy) to ``discount_rewards_batch``.
    dones: termination-flag tensor with the same layout as ``rews``.
    model: callable returning ``(action_probs, values)``.
    optimizer: optimizer whose ``step()`` is called after ``backward()``.
      Gradients are not zeroed here; the caller is responsible for that.
    last_actions: actions used for the extra bootstrap step.
    preprocess_fn: applied to the raw observation from the extra step.
    value_coeff: weight of the critic loss in the total loss.
    beta: entropy coefficient; currently unused because the entropy bonus
      is not part of the loss.
  """
  action_probs,values = model(obs)
  values = values.squeeze()
  m = Categorical(action_probs)
  # NOTE(review): actions are sampled afresh here instead of re-using the
  # actions taken while collecting the rollout - confirm this is intended.
  actions = m.sample()
  neg_log_probs = -m.log_prob(actions)

  # what my understanding from A3C/A2C is
  # perform an action to obtain the next value - for non terminal state
  # NOTE(review): this advances the environment by one step whose
  # transition is not recorded anywhere.
  last_obs,_,_,_ = vec_env.step(last_actions)
  last_obs = torch.tensor(preprocess_fn(last_obs),dtype=torch.float32)
  with torch.no_grad():
    _,last_value = model(last_obs)

  last_value = last_value.numpy()
  dones = dones.numpy()
  returns = torch.as_tensor(discount_rewards_batch(rews.numpy(),dones, \
                                  last_value))
  returns = returns.view(-1).detach()
  advantage = returns - values

  # The policy-gradient loss is -log_prob * advantage. neg_log_probs is
  # already negated, so it must not be negated a second time. The advantage
  # is detached so the actor term does not train the critic.
  actor_loss = torch.mean(neg_log_probs*advantage.detach())
  mseLoss = nn.MSELoss()
  critic_loss = mseLoss(returns,values)

  total_loss = actor_loss + value_coeff*critic_loss
  total_loss.backward()

  optimizer.step()

In [0]:
def preprocess_reshape(inp):
  """Move the last axis of a 4-axis numpy array to position 1.

  Axis order (0, 1, 2, 3) becomes (0, 3, 1, 2), e.g. a batch laid out as
  (batch, height, width, channels) becomes (batch, channels, height, width).
  """
  # expects numpy array
  channels_first_order = (0, 3, 1, 2)
  return inp.transpose(*channels_first_order)

In [0]:
def train(rank,preprocess_fn, num_epochs):
  """Train an ActorCriticFF policy on the configured Atari environment.

  Each iteration collects one rollout with ``trajectory()``, performs one
  gradient step with ``update()``, zeroes the gradients and rolls the
  storage buffer over.

  Args:
    rank: not used by this function.
    preprocess_fn: maps raw environment observations to model input.
    num_epochs: number of rollout/update iterations to run.
  """
  # vec_env resets after done. So we need to do this only once
  # x is now - num_processes, height, width, num_channels
  env = make_atari_env(env_id=env_id,num_env=num_processes,seed=23456)
  vec_env = VecFrameStack(env,n_stack=n_stack)
  try:
    x = preprocess_fn(vec_env.reset())
    n_acts = vec_env.action_space.n

    # Size the buffer and the rollout from the notebook-level update_size
    # so that changing the configuration actually takes effect.
    storage = Storage(x.shape[1:],num_processes,max_depth=update_size)
    ac = ActorCriticFF(x.shape[1],x.shape[2],x.shape[3],n_acts,1)
    optimizer = optim.Adam(ac.parameters(),lr=1e-3)
    optimizer.zero_grad()

    storage.store_obs(x)

    for epoch in range(num_epochs):
      reward_info = trajectory(vec_env,ac,preprocess_fn,storage, \
                               update_size=update_size)
      if len(reward_info) > 0:
        print("reward after %d epochs %f "% \
              (epoch,sum(reward_info)/len(reward_info)))

      data = storage.get()
      obs,rews,dones = data['obs'],data['rew'],data['done']

      # Merge the environment and step axes into one batch axis.
      obs_shape = obs.shape[2:]
      update(vec_env,obs.view(-1,*obs_shape),rews,dones,ac,optimizer, \
             storage.get_last_actions(),preprocess_fn)
      optimizer.zero_grad()

      storage.rollover()
  finally:
    # Release the environments even if training is interrupted.
    vec_env.close()

In [0]:
# Launch training: rank 0 (train() does not use this argument), observations
# transposed by preprocess_reshape, 100000 rollout/update iterations.
train(0,preprocess_reshape,100000)