# Set up environment

In [1]:
import os
import sys

In [2]:
# Project root: the parent of the directory the notebook kernel was started in.
ROOT_DIR = os.path.abspath(os.pardir)

In [3]:
# Put the project root on the import path so project-local modules can be
# imported (presumably where `utility` lives — confirm). The membership check
# keeps the cell idempotent: re-running it no longer appends duplicate entries.
if ROOT_DIR not in sys.path:
    sys.path.append(ROOT_DIR)

In [4]:
# Shared-library search path, assembled from its three entries.
# NOTE(review): the first two entries are absolute paths under one user's home
# directory, so this cell only works unchanged on that machine. The assignment
# also replaces any LD_LIBRARY_PATH value inherited by the kernel.
os.environ["LD_LIBRARY_PATH"] = ":".join(
    (
        "/home/daniel/anaconda3/envs/dev/lib/python3.11/site-packages/cv2/../../lib64",
        "/home/daniel/.mujoco/mujoco210/bin",
        "/usr/lib/nvidia",
    )
)

# Continuous Action Space

In [5]:
import torch.nn as nn
import torch
from torch.optim import AdamW
import torch.nn.functional as F
from torch.utils.data import  IterableDataset,DataLoader
import lightning as L
import numpy as np
from lightning.pytorch.loggers import TensorBoardLogger
from torch.distributions.normal import Normal
from utility import create_test_env, create_train_env, test_agent

In [6]:
from pyvirtualdisplay import Display
# Start a hidden (visible=False) 1500x1000 virtual display. The started Display
# object is the cell's last expression, so its repr is echoed as the cell output.
# NOTE(review): presumably required so the environments can render frames for
# the recorded videos on a headless machine — confirm.
Display(visible=False, size=(1500, 1000)).start()

<pyvirtualdisplay.display.Display at 0x7f817c76f190>

In [7]:
# Identifiers that namespace this run's artefacts on disk.
ALG_ID = "reinforce_continuous"
ENV_ID = "InvertedDoublePendulum-v4"
# ENV_ID = "MountainCarContinuous-v0"

# <ROOT_DIR>/videos/<ALG_ID>/<ENV_ID> and <ROOT_DIR>/tboard/<ALG_ID>/<ENV_ID>.
VIDEO_DIR, LOG_DIR = (
    os.path.join(ROOT_DIR, top_level, ALG_ID, ENV_ID)
    for top_level in ("videos", "tboard")
)

In [8]:
# --- Rollout collection ---
NUM_ENVS = 24            # passed to create_train_env as num_envs
MAX_STEP = 10_000        # steps collected per rollout; also the step cap given to test_agent
DISCOUNT_FACTOR = 0.99   # gamma used for the discounted returns

# --- Optimisation ---
MAX_EPOCHS = 101         # Trainer max_epochs
BATCH_SIZE = 1024        # DataLoader batch size
LR = 1e-3                # AdamW learning rate
ENTROPY_COEFF = 0.01     # weight of the entropy term in the loss

# Policy Model

In [9]:
from typing import Any

class Policy(nn.Module):
    """Gaussian policy network for continuous action spaces.

    Two ReLU hidden layers feed two linear heads. For every action dimension
    the network outputs the mean (squashed into [-1, 1] by tanh) and the
    standard deviation (softplus plus a 0.001 floor, so always positive) of a
    Normal distribution.

    Args:
        num_features: Size of a single observation vector.
        num_outputs: Number of action dimensions.
        hidden_size: Width of both hidden layers.
    """

    def __init__(self, num_features, num_outputs,hidden_size=128) -> None:
        super().__init__()
        # Layers are created in a fixed order so seeded initialisation is reproducible.
        self.input = nn.Linear(in_features=num_features, out_features=hidden_size)
        self.hidden = nn.Linear(in_features=hidden_size, out_features=hidden_size)
        self.mu = nn.Linear(in_features=hidden_size, out_features=num_outputs)
        self.sigma = nn.Linear(in_features=hidden_size, out_features=num_outputs)

    def _as_input(self, x):
        """Return ``x`` as a float32 tensor on the same device as the weights.

        Accepts tensors and array-likes. A tensor that already has the right
        dtype and device is returned as-is, so autograd history is preserved.
        """
        return torch.as_tensor(x, dtype=torch.float32, device=self.input.weight.device)

    def forward(self,x):
        """Compute the action distribution parameters for a batch of states.

        Args:
            x: Observations as a tensor or array-like; the last dimension must
                be ``num_features``.

        Returns:
            Tuple ``(mu, sigma)``, each with last dimension ``num_outputs``.
        """
        x = self._as_input(x)
        x = F.relu(self.input(x))
        x = F.relu(self.hidden(x))

        mu = torch.tanh(self.mu(x))
        # The small constant keeps sigma strictly positive for Normal().
        sigma = F.softplus(self.sigma(x)) + 0.001
        return mu, sigma

    @torch.no_grad()
    def pi(self,state):
        """Sample actions for ``state`` without tracking gradients.

        Args:
            state: Observations as a tensor or array-like.

        Returns:
            ``numpy.ndarray`` of sampled actions (one row per observation).
        """
        # forward() already normalises dtype/device, so no separate conversion here.
        mu,sigma = self.forward(state)
        actions = torch.normal(mu, sigma)
        # .cpu() is a no-op on CPU and makes the NumPy conversion valid on GPU.
        return actions.cpu().numpy()


# Dataset

In [10]:
class MyDataset(IterableDataset):
    """On-policy rollout dataset yielding ``(state, action, return)`` samples.

    Every pass over the dataset collects a fresh rollout of ``max_step`` steps
    from a vectorised environment using ``policy``, computes the discounted
    return of every transition, and yields the transitions in random order.

    Args:
        env: Vectorised environment; ``step`` must return
            ``(obs, rewards, terminateds, truncateds, infos)`` with per-env
            arrays, and ``env.unwrapped.num_envs`` must exist.
        max_step: Number of vectorised steps to collect per rollout.
        policy: Callable mapping a batch of observations to a batch of actions.
        discount_factor: Discount (gamma) applied to future rewards.
    """

    def __init__(self,env,max_step,policy,discount_factor):
        super().__init__()
        self.env = env
        self.max_step = max_step
        self.policy = policy
        self.discount_factor = discount_factor

    def __iter__(self):
        states, actions, rewards, episode_ends = [], [], [], []

        state, _ = self.env.reset()
        for _ in range(self.max_step):
            action = self.policy(state)
            # obs, rews, terminateds, truncateds, infos
            next_state, reward, terminated, truncated, _ = self.env.step(action)

            states.append(state)
            actions.append(action)
            rewards.append(reward)
            # Termination and truncation both end the episode. The rewards that
            # follow belong to a new episode and must not flow into this return.
            episode_ends.append(np.logical_or(terminated, truncated))
            state = next_state

        if not rewards:
            # Nothing collected (max_step == 0): yield no samples rather than
            # failing in np.concatenate on an empty list.
            return

        # Backward pass: G_t = r_t + gamma * G_{t+1}, reset at episode boundaries.
        # Filling a preallocated list avoids the O(n^2) cost of insert(0, ...).
        returns = [None] * len(rewards)
        next_return = np.zeros(self.env.unwrapped.num_envs)
        for t in reversed(range(len(rewards))):
            continuing = 1.0 - np.asarray(episode_ends[t], dtype=np.float64)
            next_return = rewards[t] + continuing * self.discount_factor * next_return
            returns[t] = next_return

        # Flatten (time, env, ...) into one sample axis; all three use the same order.
        states = np.concatenate(states, axis=0).astype(np.float32)
        returns = np.concatenate(returns, axis=0).astype(np.float32)
        actions = np.concatenate(actions, axis=0).astype(np.float32)

        # Uses NumPy's global RNG, so np.random.seed still controls the order.
        indices = np.arange(returns.shape[0])
        np.random.shuffle(indices)

        for i in indices:
            yield states[i],actions[i],returns[i]
            

# Training Model

In [11]:
class Reinforce(L.LightningModule):
    """Lightning module that trains a Gaussian ``Policy`` with REINFORCE.

    The module owns a vectorised training environment and a test environment,
    streams rollouts through ``MyDataset``, and minimises
    ``-log_prob(action) * return`` minus an entropy term weighted by
    ``entropy_coeff``.

    Args:
        env_id: Environment name handed to the env factory helpers.
        num_envs: Number of parallel training environments.
        lr: AdamW learning rate.
        entropy_coeff: Weight of the entropy term in the loss.
        hidden_size: Hidden layer width of the policy network.
        discount_factor: Discount used when computing returns.
        max_step: Vectorised steps collected per rollout.
        batch_size: DataLoader batch size.
    """

    def __init__(
        self,
        env_id,
        num_envs,
        lr=1e-3,
        entropy_coeff=0.01,
        hidden_size=64,
        discount_factor=0.99,
        max_step=100,
        batch_size=64,
    ):
        super().__init__()
        self.env_id = env_id
        self.lr = lr

        # The test env is created with the training env's obs_rms.
        self.train_env = create_train_env(env_name=env_id, num_envs=num_envs)
        self.test_env = create_test_env(
            env_name=self.env_id,
            obs_rms=self.train_env.obs_rms,
            video_dir=VIDEO_DIR,
        )

        # Network sizes are read from the per-environment observation/action spaces.
        base_env = self.train_env.unwrapped
        obs_dim = base_env.single_observation_space.shape[-1]
        self.action_dims = base_env.single_action_space.shape[-1]
        self.policy = Policy(obs_dim, self.action_dims, hidden_size=hidden_size)

        # Exposes the constructor arguments as self.hparams.
        self.save_hyperparameters()

    def training_step(self, batch, batch_idx):
        """Return the REINFORCE loss for one batch of (state, action, return)."""
        obs, taken, ret = batch
        taken = taken.reshape(-1, self.action_dims)
        ret = ret.reshape(-1, 1)

        mean, std = self.policy(obs)
        gaussian = Normal(mean, std)

        # Per-dimension terms are summed over the action dimensions.
        log_prob = gaussian.log_prob(taken).sum(dim=1, keepdim=True)
        entropy = gaussian.entropy().sum(dim=-1, keepdim=True)
        policy_loss = -log_prob * ret

        self.log("episode/Policy Loss", policy_loss.mean())
        self.log("episode/Entropy", entropy.mean())

        objective = policy_loss - self.hparams.entropy_coeff * entropy
        return objective.mean()

    def on_train_epoch_end(self):
        """Evaluate the current policy for one test episode and log the result."""
        # Copy the training env's obs_rms onto the test env before evaluating.
        self.test_env.obs_rms = self.train_env.obs_rms
        score = test_agent(
            self.test_env,
            self.policy.pi,
            episodes=1,
            max_steps=MAX_STEP,
            video_dir=VIDEO_DIR,
        )
        self.log("episode/Average Return", score)

    def train_dataloader(self):
        """Build the DataLoader that streams rollouts sampled with ``policy.pi``."""
        rollouts = MyDataset(
            env=self.train_env,
            max_step=self.hparams.max_step,
            policy=self.policy.pi,
            discount_factor=self.hparams.discount_factor,
        )
        return DataLoader(rollouts, batch_size=self.hparams.batch_size)

    def configure_optimizers(self):
        """Optimise only the policy parameters, using AdamW."""
        return AdamW(self.policy.parameters(), lr=self.lr)

In [12]:
# Build the training module from the notebook-level configuration constants.
reinforce = Reinforce(
    env_id=ENV_ID,
    num_envs=NUM_ENVS,
    max_step=MAX_STEP,
    batch_size=BATCH_SIZE,
    discount_factor=DISCOUNT_FACTOR,
    entropy_coeff=ENTROPY_COEFF,
    lr=LR,
)

  gym.logger.warn(
  logger.warn(
  logger.warn(
  logger.warn(


In [13]:
# Echo the module so the cell output shows the layer structure of the policy.
reinforce

Reinforce(
  (policy): Policy(
    (input): Linear(in_features=11, out_features=64, bias=True)
    (hidden): Linear(in_features=64, out_features=64, bias=True)
    (mu): Linear(in_features=64, out_features=1, bias=True)
    (sigma): Linear(in_features=64, out_features=1, bias=True)
  )
)

In [14]:
# CPU-only trainer. The logger is given LOG_DIR's parent as save_dir and ENV_ID
# as the run name, which together resolve back to LOG_DIR.
trainer = L.Trainer(
    logger=TensorBoardLogger(name=ENV_ID, save_dir=os.path.dirname(LOG_DIR)),
    max_epochs=MAX_EPOCHS,
    accelerator="cpu",
)

GPU available: True (cuda), used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
  rank_zero_warn(


In [15]:
# Start the run from empty output directories.
# Pure-Python replacement for `rm -r` + `mkdir -p`: it does not print an error
# when a directory is missing on a fresh run, and it is not affected by spaces
# or shell metacharacters in the paths.
import shutil

for _output_dir in (VIDEO_DIR, LOG_DIR):
    shutil.rmtree(_output_dir, ignore_errors=True)
    os.makedirs(_output_dir, exist_ok=True)

In [16]:
# Load the TensorBoard notebook extension, then open a dashboard that reads
# event files from LOG_DIR ($LOG_DIR is expanded from the Python variable).
%load_ext tensorboard
%tensorboard --logdir $LOG_DIR

Launching TensorBoard...

In [17]:
# Run training; the module provides its own train_dataloader().
trainer.fit(model=reinforce)


  | Name   | Type   | Params
----------------------------------
0 | policy | Policy | 5.1 K 
----------------------------------
5.1 K     Trainable params
0         Non-trainable params
5.1 K     Total params
0.020     Total estimated model params size (MB)
  rank_zero_warn(


Training: 0it [00:00, ?it/s]

  logger.warn(


Moviepy - Building video /home/daniel/src/rl4fun/videos/reinforce_continuous/InvertedDoublePendulum-v4/rl-video-episode-0.mp4.
Moviepy - Writing video /home/daniel/src/rl4fun/videos/reinforce_continuous/InvertedDoublePendulum-v4/rl-video-episode-0.mp4





Moviepy - Done !
Moviepy - video ready /home/daniel/src/rl4fun/videos/reinforce_continuous/InvertedDoublePendulum-v4/rl-video-episode-0.mp4


  logger.warn(


Moviepy - Building video /home/daniel/src/rl4fun/videos/reinforce_continuous/InvertedDoublePendulum-v4/rl-video-episode-10.mp4.
Moviepy - Writing video /home/daniel/src/rl4fun/videos/reinforce_continuous/InvertedDoublePendulum-v4/rl-video-episode-10.mp4





Moviepy - Done !
Moviepy - video ready /home/daniel/src/rl4fun/videos/reinforce_continuous/InvertedDoublePendulum-v4/rl-video-episode-10.mp4
Moviepy - Building video /home/daniel/src/rl4fun/videos/reinforce_continuous/InvertedDoublePendulum-v4/rl-video-episode-20.mp4.
Moviepy - Writing video /home/daniel/src/rl4fun/videos/reinforce_continuous/InvertedDoublePendulum-v4/rl-video-episode-20.mp4





Moviepy - Done !
Moviepy - video ready /home/daniel/src/rl4fun/videos/reinforce_continuous/InvertedDoublePendulum-v4/rl-video-episode-20.mp4
Moviepy - Building video /home/daniel/src/rl4fun/videos/reinforce_continuous/InvertedDoublePendulum-v4/rl-video-episode-30.mp4.
Moviepy - Writing video /home/daniel/src/rl4fun/videos/reinforce_continuous/InvertedDoublePendulum-v4/rl-video-episode-30.mp4





Moviepy - Done !
Moviepy - video ready /home/daniel/src/rl4fun/videos/reinforce_continuous/InvertedDoublePendulum-v4/rl-video-episode-30.mp4
Moviepy - Building video /home/daniel/src/rl4fun/videos/reinforce_continuous/InvertedDoublePendulum-v4/rl-video-episode-40.mp4.
Moviepy - Writing video /home/daniel/src/rl4fun/videos/reinforce_continuous/InvertedDoublePendulum-v4/rl-video-episode-40.mp4





Moviepy - Done !
Moviepy - video ready /home/daniel/src/rl4fun/videos/reinforce_continuous/InvertedDoublePendulum-v4/rl-video-episode-40.mp4
Moviepy - Building video /home/daniel/src/rl4fun/videos/reinforce_continuous/InvertedDoublePendulum-v4/rl-video-episode-50.mp4.
Moviepy - Writing video /home/daniel/src/rl4fun/videos/reinforce_continuous/InvertedDoublePendulum-v4/rl-video-episode-50.mp4





Moviepy - Done !
Moviepy - video ready /home/daniel/src/rl4fun/videos/reinforce_continuous/InvertedDoublePendulum-v4/rl-video-episode-50.mp4
Moviepy - Building video /home/daniel/src/rl4fun/videos/reinforce_continuous/InvertedDoublePendulum-v4/rl-video-episode-60.mp4.
Moviepy - Writing video /home/daniel/src/rl4fun/videos/reinforce_continuous/InvertedDoublePendulum-v4/rl-video-episode-60.mp4





Moviepy - Done !
Moviepy - video ready /home/daniel/src/rl4fun/videos/reinforce_continuous/InvertedDoublePendulum-v4/rl-video-episode-60.mp4
Moviepy - Building video /home/daniel/src/rl4fun/videos/reinforce_continuous/InvertedDoublePendulum-v4/rl-video-episode-70.mp4.
Moviepy - Writing video /home/daniel/src/rl4fun/videos/reinforce_continuous/InvertedDoublePendulum-v4/rl-video-episode-70.mp4





Moviepy - Done !
Moviepy - video ready /home/daniel/src/rl4fun/videos/reinforce_continuous/InvertedDoublePendulum-v4/rl-video-episode-70.mp4
