# Set up environment

In [1]:
import os
import sys

In [2]:
# The repository root is taken to be the parent of the notebook's working
# directory; it is put on sys.path so project-local modules can be imported.
ROOT_DIR = os.path.dirname(os.getcwd())
if ROOT_DIR not in sys.path:  # keep the cell idempotent when it is re-run
    sys.path.append(ROOT_DIR)
# NOTE(review): machine-specific absolute paths (conda env, MuJoCo, NVIDIA libs).
# This overwrites any existing LD_LIBRARY_PATH; adjust for your own machine.
os.environ["LD_LIBRARY_PATH"] ="/home/daniel/anaconda3/envs/dev/lib/python3.11/site-packages/cv2/../../lib64:/home/daniel/.mujoco/mujoco210/bin:/usr/lib/nvidia"

# Continuous Action Space

In [3]:
import torch.nn as nn
import torch
from torch.optim import AdamW
import torch.nn.functional as F
from torch.utils.data import  IterableDataset,DataLoader
import lightning as L
import numpy as np
from lightning.pytorch.loggers import TensorBoardLogger
from torch.distributions.normal import Normal
from utility import create_test_env, create_train_env, test_agent

In [4]:
from pyvirtualdisplay import Display
# Start an invisible 1500x1000 virtual display.
# Presumably needed so environment rendering / video recording works on a headless machine — TODO confirm.
Display(visible=False, size=(1500, 1000)).start()

<pyvirtualdisplay.display.Display at 0x7faffe39ad10>

In [5]:
# Identifiers used to namespace this run's artefacts on disk.
ALG_ID = 'a2c_continuous'
ENV_ID='Pendulum-v1'
# Alternative environment (disabled):
# ENV_ID='MountainCarContinuous-v0'
# Output locations: <ROOT_DIR>/videos/<ALG_ID>/<ENV_ID> and <ROOT_DIR>/tboard/<ALG_ID>/<ENV_ID>.
VIDEO_DIR =os.path.join(ROOT_DIR,'videos',ALG_ID,ENV_ID)
LOG_DIR = os.path.join(ROOT_DIR,'tboard',ALG_ID,ENV_ID)

In [6]:
# Number of environments requested from create_train_env.
NUM_ENVS=64
# Entropy-bonus weight. It is passed to A2C, but the entropy term in
# A2C.training_step is commented out, so it currently has no effect on the loss.
ENTROPY_COEFF = 0.01
# Discount applied to the bootstrapped next-state value in the TD target.
DISCOUNT_FACTOR = 0.99
# Environment steps collected per training epoch; also the step cap passed to test_agent.
MAX_STEP = 200
# Number of Lightning epochs to train for.
MAX_EPOCHS = 2001
# DataLoader batch size, counted in environment steps (each step carries one row per env).
BATCH_SIZE = 16
# AdamW learning rates for the value network and the policy network.
VALUE_LR = 0.001
POLICY_LR = 0.0001

# Models

## Policy Model

In [7]:
class Policy(nn.Module):
    """Gaussian policy: maps observations to the mean and std of a Normal per action dim.

    Args:
        num_features: Size of the observation vector.
        num_outputs: Number of action dimensions.
        hidden_size: Width of the two hidden layers.
        action_scale: The mean is ``tanh(.) * action_scale``, i.e. bounded to
            ``[-action_scale, action_scale]``. Defaults to 2.0 (the previously
            hard-coded value).
        min_sigma: Constant added to the softplus output so the standard
            deviation stays strictly positive. Defaults to 0.001.
    """

    def __init__(self, num_features, num_outputs, hidden_size=128, action_scale=2.0, min_sigma=0.001) -> None:
        super().__init__()
        self.input = nn.Linear(in_features=num_features, out_features=hidden_size)
        self.hidden = nn.Linear(in_features=hidden_size, out_features=hidden_size)
        self.mu = nn.Linear(in_features=hidden_size, out_features=num_outputs)
        self.sigma = nn.Linear(in_features=hidden_size, out_features=num_outputs)
        self.action_scale = action_scale
        self.min_sigma = min_sigma

    def forward(self, x):
        """Return ``(mu, sigma)`` for a batch of observations.

        ``x`` may be a tensor or anything ``torch.as_tensor`` accepts (e.g. a
        NumPy array). It is moved to the device and dtype of this module's
        parameters, so the network works on CPU as well as on GPU instead of
        unconditionally requiring CUDA.
        """
        weight = self.input.weight
        x = torch.as_tensor(x, dtype=weight.dtype, device=weight.device)
        x = F.relu(self.input(x))
        x = F.relu(self.hidden(x))

        mu = torch.tanh(self.mu(x)) * self.action_scale
        sigma = F.softplus(self.sigma(x)) + self.min_sigma
        return mu, sigma

    @torch.no_grad()
    def pi(self, state):
        """Sample actions from the current policy and return them as a NumPy array."""
        # forward() already handles tensor conversion and device placement.
        mu, sigma = self.forward(state)
        actions = torch.normal(mu, sigma)
        return actions.cpu().numpy()


## Value Model

In [8]:
class Value(nn.Module):
    """State-value network: maps observations to one scalar value per row.

    Args:
        num_features: Size of the observation vector.
        hidden_size: Width of the two hidden layers.
    """

    def __init__(self, num_features, hidden_size=128) -> None:
        super().__init__()
        self.input = nn.Linear(in_features=num_features, out_features=hidden_size)
        self.hidden = nn.Linear(in_features=hidden_size, out_features=hidden_size)
        self.out = nn.Linear(in_features=hidden_size, out_features=1)

    def forward(self, x):
        """Return the value estimate, shape ``(..., 1)``, for a batch of observations.

        ``x`` may be a tensor or anything ``torch.as_tensor`` accepts. It is
        moved to the device and dtype of this module's parameters rather than
        unconditionally to CUDA, so the network also runs on CPU.
        """
        weight = self.input.weight
        x = torch.as_tensor(x, dtype=weight.dtype, device=weight.device)
        x = F.relu(self.input(x))
        x = F.relu(self.hidden(x))
        return self.out(x)

# Dataset

In [9]:
class MyDataset(IterableDataset):
    """Iterable dataset that rolls the policy out in the environment.

    Every iteration resets the environment and yields ``max_step`` transitions
    ``(state, action, reward, done, next_state)``. ``done`` is cast to int32,
    every other element to float32. ``discount_factor`` is stored but not used
    while iterating.
    """

    def __init__(self, env, max_step, policy, discount_factor):
        super().__init__()
        self.env, self.max_step = env, max_step
        self.policy, self.discount_factor = policy, discount_factor

    def __iter__(self):
        def as_f32(array):
            # All real-valued pieces of a transition are emitted as float32.
            return array.astype(np.float32)

        observation, _reset_info = self.env.reset()
        for _ in range(self.max_step):
            chosen_action = self.policy(observation)
            # env.step returns: obs, rewards, terminateds, truncateds, infos
            successor, step_reward, terminated, _truncated, _infos = self.env.step(chosen_action)
            yield (
                as_f32(observation),
                as_f32(chosen_action),
                as_f32(step_reward),
                terminated.astype(np.int32),
                as_f32(successor),
            )
            observation = successor

In [10]:
import copy


class A2C(L.LightningModule):
    """Advantage actor-critic for continuous actions with manual optimization.

    Holds a Gaussian ``Policy``, a ``Value`` network and a periodically synced
    copy of the value network used to compute one-step TD targets. Training
    data comes from ``MyDataset``, which rolls the current policy out in the
    training environment.

    Args:
        env_id: Environment id passed to ``create_train_env`` / ``create_test_env``.
        num_envs: Number of training environments passed to ``create_train_env``.
        value_lr: AdamW learning rate for the value network.
        policy_lr: AdamW learning rate for the policy network.
        entropy_coeff: Stored as a hyperparameter; the entropy bonus is
            currently disabled, so it does not affect the loss.
        hidden_size: Hidden width of both networks.
        discount_factor: Discount applied to the bootstrapped next-state value.
        max_step: Environment steps collected per epoch.
        batch_size: DataLoader batch size (in environment steps).
    """

    def __init__(self,env_id, num_envs,value_lr = VALUE_LR,policy_lr=POLICY_LR, entropy_coeff=0.01, hidden_size=64, discount_factor=DISCOUNT_FACTOR, max_step=100, batch_size=64):
        super().__init__()
        self.env_id = env_id
        self.train_env = create_train_env(env_name=env_id,num_envs=num_envs)
        # The test env starts with the training env's obs_rms
        # (presumably running observation-normalisation statistics — TODO confirm).
        self.test_env = create_test_env(env_name=self.env_id, obs_rms=self.train_env.obs_rms, video_dir=VIDEO_DIR,episode_trigger=lambda e: e%100==0)
        num_features = self.train_env.unwrapped.single_observation_space.shape[-1]
        self.action_dims = self.train_env.unwrapped.single_action_space.shape[-1]
        self.policy_model=Policy(num_features, self.action_dims,hidden_size=hidden_size)
        self.value_model = Value(num_features, hidden_size)
        self.num_features = num_features
        # Copy of the value network used for TD targets; refreshed in on_train_epoch_end.
        self.target_value_model = copy.deepcopy(self.value_model)
        self.value_lr = value_lr
        self.policy_lr = policy_lr
        # Two optimizers are stepped by hand in training_step.
        self.automatic_optimization = False
        self.save_hyperparameters()

    def training_step(self, batch, batch_idx):
        """Run one critic update and one actor update on a batch of transitions."""
        state, action, reward, done, next_state = batch
        # Flatten the (step, env) leading dimensions into a single batch dimension.
        state = state.reshape(-1,self.num_features).to(self.device)
        action = action.reshape(-1,self.action_dims).to(self.device)
        reward = reward.reshape(-1,1).to(self.device)
        # `done` arrives as an integer tensor. It must be boolean to act as a
        # mask: indexing with an integer tensor selects ROWS 0/1 instead, which
        # would zero the wrong entries of the bootstrapped value.
        done = done.reshape(-1,1).to(self.device).bool()
        next_state = next_state.reshape(-1,self.num_features).to(self.device)
        v_opt, p_opt = self.optimizers()

        state_value = self.value_model(state)

        with torch.no_grad():
            next_state_value = self.target_value_model(next_state)
            # Terminal transitions do not bootstrap from the next state.
            next_state_value = next_state_value.masked_fill(done, 0.0)
            target = reward + self.hparams.discount_factor * next_state_value

        v_loss = F.smooth_l1_loss(state_value,target)
        self.log("episode/Value Loss", v_loss)

        # One-step TD advantage; detached so the policy loss does not train the critic.
        advantages = (target-state_value).detach()

        mu, sigma = self.policy_model(state)
        dist = Normal(mu, sigma)
        # Joint log-probability over action dimensions.
        log_prob = dist.log_prob(action).sum(dim=1, keepdim=True)

        policy_loss = - log_prob * advantages
        # Entropy regularisation is disabled (hparams.entropy_coeff is unused). To re-enable:
        # entropy = dist.entropy().sum(dim=-1, keepdim=True)
        # p_loss = (policy_loss - self.hparams.entropy_coeff*entropy).mean()
        # self.log("episode/Entropy", entropy.mean())

        p_loss = policy_loss.mean()
        self.log("episode/Policy Loss", p_loss)

        v_opt.zero_grad()
        self.manual_backward(v_loss)
        v_opt.step()

        p_opt.zero_grad()
        self.manual_backward(p_loss)
        p_opt.step()

    def on_train_epoch_end(self):
        """Evaluate the policy once and sync the target value network every 10 epochs."""
        # Keep the test env's obs_rms in step with the training env's.
        self.test_env.obs_rms = self.train_env.obs_rms
        average_return = test_agent(self.test_env, self.policy_model.pi, episodes=1, max_steps=MAX_STEP,video_dir=VIDEO_DIR)
        self.log("episode/Average Return", average_return)

        if self.current_epoch > 0 and self.current_epoch % 10 == 0:
            self.target_value_model.load_state_dict(self.value_model.state_dict())

    def train_dataloader(self):
        """Build a DataLoader over a fresh on-policy rollout dataset."""
        train_ds = MyDataset(env=self.train_env, discount_factor=self.hparams.discount_factor, max_step=self.hparams.max_step,policy=self.policy_model.pi,)
        train_dl = DataLoader(train_ds, batch_size=self.hparams.batch_size)
        return train_dl

    def configure_optimizers(self):
        """Return ``(value_optimizer, policy_optimizer)`` — the order training_step unpacks."""
        policy_optimizer = AdamW(self.policy_model.parameters(), lr=self.policy_lr)
        value_optimizer = AdamW(self.value_model.parameters(), lr=self.value_lr)
        return value_optimizer,policy_optimizer
    

In [11]:
# Build the agent from the notebook-level hyperparameters.
# hidden_size is not passed, so the A2C default (64) is used.
a2c = A2C(env_id=ENV_ID, 
                  policy_lr=POLICY_LR, 
                  value_lr=VALUE_LR,
                  num_envs=NUM_ENVS, 
                  discount_factor=DISCOUNT_FACTOR,
                  batch_size=BATCH_SIZE, 
                  max_step=MAX_STEP,
                  entropy_coeff=ENTROPY_COEFF)

  gym.logger.warn(
  logger.warn(
  logger.warn(
  logger.warn(


In [12]:
# Show the module tree (policy, value and target value networks).
a2c

A2C(
  (policy_model): Policy(
    (input): Linear(in_features=3, out_features=64, bias=True)
    (hidden): Linear(in_features=64, out_features=64, bias=True)
    (mu): Linear(in_features=64, out_features=1, bias=True)
    (sigma): Linear(in_features=64, out_features=1, bias=True)
  )
  (value_model): Value(
    (input): Linear(in_features=3, out_features=64, bias=True)
    (hidden): Linear(in_features=64, out_features=64, bias=True)
    (out): Linear(in_features=64, out_features=1, bias=True)
  )
  (target_value_model): Value(
    (input): Linear(in_features=3, out_features=64, bias=True)
    (hidden): Linear(in_features=64, out_features=64, bias=True)
    (out): Linear(in_features=64, out_features=1, bias=True)
  )
)

In [13]:
# Single-GPU trainer. The TensorBoard logger is rooted at the parent of LOG_DIR
# with name=ENV_ID, so runs should land under LOG_DIR itself (LOG_DIR ends in ENV_ID).
trainer = L.Trainer(
    accelerator='gpu',
    devices=1,
    max_epochs=MAX_EPOCHS,
    logger=TensorBoardLogger(save_dir=os.path.dirname(LOG_DIR), name=ENV_ID)
)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [14]:
!rm -r $VIDEO_DIR
!rm -r $LOG_DIR
!mkdir -p $LOG_DIR
!mkdir -p $VIDEO_DIR

In [15]:
# Load the TensorBoard notebook extension and open a dashboard on this run's log directory.
%load_ext tensorboard
%tensorboard --logdir $LOG_DIR

Reusing TensorBoard on port 6006 (pid 771948), started 9:05:27 ago. (Use '!kill 771948' to kill it.)

In [16]:
# Train for MAX_EPOCHS epochs; the agent's on_train_epoch_end hook runs an evaluation episode after each epoch.
trainer.fit(model=a2c,)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]

  | Name               | Type   | Params
----------------------------------------------
0 | policy_model       | Policy | 4.5 K 
1 | value_model        | Value  | 4.5 K 
2 | target_value_model | Value  | 4.5 K 
----------------------------------------------
13.5 K    Trainable params
0         Non-trainable params
13.5 K    Total params
0.054     Total estimated model params size (MB)
  rank_zero_warn(


Training: 0it [00:00, ?it/s]

  logger.warn(


Moviepy - Building video /home/daniel/src/rl4fun/videos/a2c_continuous/Pendulum-v1/rl-video-episode-0.mp4.
Moviepy - Writing video /home/daniel/src/rl4fun/videos/a2c_continuous/Pendulum-v1/rl-video-episode-0.mp4





Moviepy - Done !
Moviepy - video ready /home/daniel/src/rl4fun/videos/a2c_continuous/Pendulum-v1/rl-video-episode-0.mp4


  logger.warn(


Moviepy - Building video /home/daniel/src/rl4fun/videos/a2c_continuous/Pendulum-v1/rl-video-episode-100.mp4.
Moviepy - Writing video /home/daniel/src/rl4fun/videos/a2c_continuous/Pendulum-v1/rl-video-episode-100.mp4





Moviepy - Done !
Moviepy - video ready /home/daniel/src/rl4fun/videos/a2c_continuous/Pendulum-v1/rl-video-episode-100.mp4
Moviepy - Building video /home/daniel/src/rl4fun/videos/a2c_continuous/Pendulum-v1/rl-video-episode-200.mp4.
Moviepy - Writing video /home/daniel/src/rl4fun/videos/a2c_continuous/Pendulum-v1/rl-video-episode-200.mp4





Moviepy - Done !
Moviepy - video ready /home/daniel/src/rl4fun/videos/a2c_continuous/Pendulum-v1/rl-video-episode-200.mp4
Moviepy - Building video /home/daniel/src/rl4fun/videos/a2c_continuous/Pendulum-v1/rl-video-episode-300.mp4.
Moviepy - Writing video /home/daniel/src/rl4fun/videos/a2c_continuous/Pendulum-v1/rl-video-episode-300.mp4





Moviepy - Done !
Moviepy - video ready /home/daniel/src/rl4fun/videos/a2c_continuous/Pendulum-v1/rl-video-episode-300.mp4
Moviepy - Building video /home/daniel/src/rl4fun/videos/a2c_continuous/Pendulum-v1/rl-video-episode-400.mp4.
Moviepy - Writing video /home/daniel/src/rl4fun/videos/a2c_continuous/Pendulum-v1/rl-video-episode-400.mp4





Moviepy - Done !
Moviepy - video ready /home/daniel/src/rl4fun/videos/a2c_continuous/Pendulum-v1/rl-video-episode-400.mp4
Moviepy - Building video /home/daniel/src/rl4fun/videos/a2c_continuous/Pendulum-v1/rl-video-episode-500.mp4.
Moviepy - Writing video /home/daniel/src/rl4fun/videos/a2c_continuous/Pendulum-v1/rl-video-episode-500.mp4





Moviepy - Done !
Moviepy - video ready /home/daniel/src/rl4fun/videos/a2c_continuous/Pendulum-v1/rl-video-episode-500.mp4
Moviepy - Building video /home/daniel/src/rl4fun/videos/a2c_continuous/Pendulum-v1/rl-video-episode-600.mp4.
Moviepy - Writing video /home/daniel/src/rl4fun/videos/a2c_continuous/Pendulum-v1/rl-video-episode-600.mp4





Moviepy - Done !
Moviepy - video ready /home/daniel/src/rl4fun/videos/a2c_continuous/Pendulum-v1/rl-video-episode-600.mp4
Moviepy - Building video /home/daniel/src/rl4fun/videos/a2c_continuous/Pendulum-v1/rl-video-episode-700.mp4.
Moviepy - Writing video /home/daniel/src/rl4fun/videos/a2c_continuous/Pendulum-v1/rl-video-episode-700.mp4





Moviepy - Done !
Moviepy - video ready /home/daniel/src/rl4fun/videos/a2c_continuous/Pendulum-v1/rl-video-episode-700.mp4
Moviepy - Building video /home/daniel/src/rl4fun/videos/a2c_continuous/Pendulum-v1/rl-video-episode-800.mp4.
Moviepy - Writing video /home/daniel/src/rl4fun/videos/a2c_continuous/Pendulum-v1/rl-video-episode-800.mp4





Moviepy - Done !
Moviepy - video ready /home/daniel/src/rl4fun/videos/a2c_continuous/Pendulum-v1/rl-video-episode-800.mp4
Moviepy - Building video /home/daniel/src/rl4fun/videos/a2c_continuous/Pendulum-v1/rl-video-episode-900.mp4.
Moviepy - Writing video /home/daniel/src/rl4fun/videos/a2c_continuous/Pendulum-v1/rl-video-episode-900.mp4





Moviepy - Done !
Moviepy - video ready /home/daniel/src/rl4fun/videos/a2c_continuous/Pendulum-v1/rl-video-episode-900.mp4
Moviepy - Building video /home/daniel/src/rl4fun/videos/a2c_continuous/Pendulum-v1/rl-video-episode-1000.mp4.
Moviepy - Writing video /home/daniel/src/rl4fun/videos/a2c_continuous/Pendulum-v1/rl-video-episode-1000.mp4





Moviepy - Done !
Moviepy - video ready /home/daniel/src/rl4fun/videos/a2c_continuous/Pendulum-v1/rl-video-episode-1000.mp4
Moviepy - Building video /home/daniel/src/rl4fun/videos/a2c_continuous/Pendulum-v1/rl-video-episode-1100.mp4.
Moviepy - Writing video /home/daniel/src/rl4fun/videos/a2c_continuous/Pendulum-v1/rl-video-episode-1100.mp4





Moviepy - Done !
Moviepy - video ready /home/daniel/src/rl4fun/videos/a2c_continuous/Pendulum-v1/rl-video-episode-1100.mp4
Moviepy - Building video /home/daniel/src/rl4fun/videos/a2c_continuous/Pendulum-v1/rl-video-episode-1200.mp4.
Moviepy - Writing video /home/daniel/src/rl4fun/videos/a2c_continuous/Pendulum-v1/rl-video-episode-1200.mp4





Moviepy - Done !
Moviepy - video ready /home/daniel/src/rl4fun/videos/a2c_continuous/Pendulum-v1/rl-video-episode-1200.mp4
Moviepy - Building video /home/daniel/src/rl4fun/videos/a2c_continuous/Pendulum-v1/rl-video-episode-1300.mp4.
Moviepy - Writing video /home/daniel/src/rl4fun/videos/a2c_continuous/Pendulum-v1/rl-video-episode-1300.mp4





Moviepy - Done !
Moviepy - video ready /home/daniel/src/rl4fun/videos/a2c_continuous/Pendulum-v1/rl-video-episode-1300.mp4
Moviepy - Building video /home/daniel/src/rl4fun/videos/a2c_continuous/Pendulum-v1/rl-video-episode-1400.mp4.
Moviepy - Writing video /home/daniel/src/rl4fun/videos/a2c_continuous/Pendulum-v1/rl-video-episode-1400.mp4





Moviepy - Done !
Moviepy - video ready /home/daniel/src/rl4fun/videos/a2c_continuous/Pendulum-v1/rl-video-episode-1400.mp4
Moviepy - Building video /home/daniel/src/rl4fun/videos/a2c_continuous/Pendulum-v1/rl-video-episode-1500.mp4.
Moviepy - Writing video /home/daniel/src/rl4fun/videos/a2c_continuous/Pendulum-v1/rl-video-episode-1500.mp4





Moviepy - Done !
Moviepy - video ready /home/daniel/src/rl4fun/videos/a2c_continuous/Pendulum-v1/rl-video-episode-1500.mp4
Moviepy - Building video /home/daniel/src/rl4fun/videos/a2c_continuous/Pendulum-v1/rl-video-episode-1600.mp4.
Moviepy - Writing video /home/daniel/src/rl4fun/videos/a2c_continuous/Pendulum-v1/rl-video-episode-1600.mp4





Moviepy - Done !
Moviepy - video ready /home/daniel/src/rl4fun/videos/a2c_continuous/Pendulum-v1/rl-video-episode-1600.mp4
Moviepy - Building video /home/daniel/src/rl4fun/videos/a2c_continuous/Pendulum-v1/rl-video-episode-1700.mp4.
Moviepy - Writing video /home/daniel/src/rl4fun/videos/a2c_continuous/Pendulum-v1/rl-video-episode-1700.mp4





Moviepy - Done !
Moviepy - video ready /home/daniel/src/rl4fun/videos/a2c_continuous/Pendulum-v1/rl-video-episode-1700.mp4
Moviepy - Building video /home/daniel/src/rl4fun/videos/a2c_continuous/Pendulum-v1/rl-video-episode-1800.mp4.
Moviepy - Writing video /home/daniel/src/rl4fun/videos/a2c_continuous/Pendulum-v1/rl-video-episode-1800.mp4





Moviepy - Done !
Moviepy - video ready /home/daniel/src/rl4fun/videos/a2c_continuous/Pendulum-v1/rl-video-episode-1800.mp4
Moviepy - Building video /home/daniel/src/rl4fun/videos/a2c_continuous/Pendulum-v1/rl-video-episode-1900.mp4.
Moviepy - Writing video /home/daniel/src/rl4fun/videos/a2c_continuous/Pendulum-v1/rl-video-episode-1900.mp4





Moviepy - Done !
Moviepy - video ready /home/daniel/src/rl4fun/videos/a2c_continuous/Pendulum-v1/rl-video-episode-1900.mp4
Moviepy - Building video /home/daniel/src/rl4fun/videos/a2c_continuous/Pendulum-v1/rl-video-episode-2000.mp4.
Moviepy - Writing video /home/daniel/src/rl4fun/videos/a2c_continuous/Pendulum-v1/rl-video-episode-2000.mp4



`Trainer.fit` stopped: `max_epochs=2001` reached.


Moviepy - Done !
Moviepy - video ready /home/daniel/src/rl4fun/videos/a2c_continuous/Pendulum-v1/rl-video-episode-2000.mp4
