In [None]:
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.tensorboard import SummaryWriter
import gym
from copy import deepcopy
from tqdm.auto import tqdm
import os
import random
import math

In [None]:
# --- Training configuration -------------------------------------------------
# Upper bound on the number of training episodes passed to DDPG.learn.
EPISODES = 1000
# Number of transitions drawn from the replay buffer per gradient step.
BATCH_SIZE = 128
# Seed shared by every random number generator used in this notebook.
SEED = 88

# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

In [None]:
# Show which compute device was selected by the configuration cell.
device

'cuda'

# Utils

In [None]:
def seed_everything(seed):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random`` module, NumPy's global generator and PyTorch
    (CPU and all CUDA devices), and configures cuDNN for repeatable results.

    Args:
        seed: Integer seed applied to all generators.
    """
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

    # Deterministic kernels alone are not enough: the cuDNN autotuner may
    # benchmark and pick a different algorithm on each run, so switch it off.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

    # NOTE: the running interpreter fixed its hash seed at start-up; this
    # value is only picked up by Python processes launched afterwards.
    os.environ["PYTHONHASHSEED"] = str(seed)


class AverageMeter:
    """Tracks the latest value, weighted sum, sample count and running mean."""

    # Names of the statistics that are zeroed by ``reset``.
    _FIELDS = ("val", "avg", "sum", "count")

    def __init__(self):
        self.reset()

    def reset(self):
        """Set every tracked statistic back to zero."""
        for field in self._FIELDS:
            setattr(self, field, 0)

    def update(self, val, n=1):
        """Record ``val`` as observed ``n`` times and refresh the mean.

        Args:
            val: The newly observed value.
            n: Weight (number of occurrences) of ``val``.
        """
        self.val = val
        contribution = val * n
        self.sum += contribution
        self.count += n
        self.avg = self.sum / self.count

# Actor and Critic Networks

In [None]:
class Actor(nn.Module):
    """Deterministic policy: maps an observation to a bounded action.

    Two ReLU hidden layers (400 and 300 units) feed a tanh output that is
    scaled by ``action_high``.

    Args:
        in_features: Size of the observation vector.
        action_size: Number of action dimensions.
        action_high: Scale applied to the tanh output.
    """

    def __init__(self, in_features, action_size, action_high):
        super().__init__()
        # Layer names are part of the state_dict keys used by checkpoints.
        self.l1 = nn.Linear(in_features, 400)
        self.l2 = nn.Linear(400, 300)
        self.l3 = nn.Linear(300, action_size)
        self.action_high = action_high

    def forward(self, x):
        """Return the action for observation batch ``x``."""
        hidden = torch.relu(self.l1(x))
        hidden = torch.relu(self.l2(hidden))
        squashed = self.l3(hidden).tanh()
        # Keep the tensor on the left so a NumPy scalar bound is handled by torch.
        return squashed * self.action_high


class Critic(nn.Module):
    """Q-network: scores a (state, action) pair with a single scalar.

    The state and action are concatenated along the last dimension and passed
    through two ReLU hidden layers (400 and 300 units) to a linear output.

    Args:
        in_features: Size of the observation vector.
        action_size: Number of action dimensions.
    """

    def __init__(self, in_features, action_size):
        super().__init__()
        # Layer names are part of the state_dict keys used by checkpoints.
        self.l1 = nn.Linear(in_features + action_size, 400)
        self.l2 = nn.Linear(400, 300)
        self.l3 = nn.Linear(300, 1)

    def forward(self, s, a):
        """Return the Q-value for state batch ``s`` and action batch ``a``."""
        joint = torch.cat((s, a), dim=-1)
        hidden = torch.relu(self.l1(joint))
        hidden = torch.relu(self.l2(hidden))
        return self.l3(hidden)

# Replay Buffer

In [None]:
class ReplayBuffer:
    """Fixed-capacity store of transitions with uniform random sampling.

    Transitions are kept in five parallel lists. Once ``buffer_size`` entries
    are stored, each new transition overwrites the oldest one in place, so
    storing is O(1). After the buffer wraps, list order is no longer
    chronological; sampling is uniform, so this does not affect training.

    Args:
        buffer_size: Maximum number of transitions retained (must be >= 1).

    Raises:
        ValueError: If ``buffer_size`` is smaller than 1.
    """

    def __init__(self, buffer_size=100000):
        if buffer_size < 1:
            raise ValueError("buffer_size must be a positive integer")
        self.state = []
        self.action = []
        self.next_state = []
        self.reward = []
        self.done_bool = []
        self.buffer_size = buffer_size
        # Slot that the next transition overwrites once the buffer is full.
        self._write_idx = 0

    def store(self, state, action, next_state, reward, done_bool):
        """Add one transition, evicting the oldest when the buffer is full."""
        fields = (self.state, self.action, self.next_state, self.reward, self.done_bool)
        values = (state, action, next_state, reward, done_bool)
        if len(self.state) < self.buffer_size:
            for field, value in zip(fields, values):
                field.append(value)
        else:
            # Overwrite in place instead of re-slicing every list (O(n) per step).
            for field, value in zip(fields, values):
                field[self._write_idx] = value
        self._write_idx = (self._write_idx + 1) % self.buffer_size

    @staticmethod
    def _gather(field, idxs, per_sample_vector):
        """Stack the selected entries of ``field`` into a float tensor.

        Only the sampled entries are converted, rather than the whole buffer.
        When ``per_sample_vector`` is true the result is always 2-D
        ``(batch, features)``: scalars gain a feature axis and extra axes
        (e.g. a leading singleton per sample) are flattened.
        """
        batch = torch.tensor(np.array([field[i] for i in idxs]), dtype=torch.float)
        if per_sample_vector:
            if batch.dim() == 1:
                batch = batch.unsqueeze(1)
            elif batch.dim() > 2:
                batch = batch.flatten(start_dim=1)
        return batch

    def sample_batch(self, batch_size):
        """Draw ``batch_size`` transitions uniformly at random with replacement.

        Returns:
            Tuple ``(state, action, next_state, reward, done_bool)`` of float
            tensors. ``state``, ``action`` and ``next_state`` are 2-D
            ``(batch, features)`` even when there is a single feature;
            ``reward`` and ``done_bool`` keep the shape of the stored values
            stacked along the batch axis.

        Raises:
            ValueError: If the buffer is empty.
        """
        size = len(self.state)
        if size == 0:
            raise ValueError("cannot sample from an empty replay buffer")
        idxs = np.random.randint(low=0, high=size, size=batch_size)
        state = self._gather(self.state, idxs, True)
        action = self._gather(self.action, idxs, True)
        next_state = self._gather(self.next_state, idxs, True)
        reward = self._gather(self.reward, idxs, False)
        done_bool = self._gather(self.done_bool, idxs, False)
        return (state, action, next_state, reward, done_bool)

    def __len__(self):
        """Return the number of transitions currently stored."""
        return len(self.state)

# Deep Deterministic Policy Gradient (DDPG)

In [None]:
class DDPG:
    """Deep Deterministic Policy Gradient agent for continuous action spaces.

    Holds an actor and a critic, a slowly-updated target copy of each, their
    Adam optimizers and a replay buffer.

    Args:
        env: Environment exposing ``observation_space`` and ``action_space``
            with ``shape``, ``high`` and ``low`` attributes.
        env_name: Label used in checkpoint and GIF file names.
        device: Torch device (or device string) the networks live on.
        gamma: Discount factor.
        tau: Interpolation weight for the soft target update.
        act_lr: Learning rate of the actor optimizer.
        critic_lr: Learning rate of the critic optimizer.
    """

    def __init__(
        self,
        env,
        env_name,
        device,
        gamma=0.99,
        tau=0.02,
        act_lr=3e-4,
        critic_lr=3e-4,
    ):
        self.env_name = env_name
        self.obs_size = env.observation_space.shape[0]
        self.action_size = env.action_space.shape[0]
        # Only the first dimension's bounds are read, so every action
        # dimension is clipped and scaled with the same limits.
        self.upper_bound = env.action_space.high[0]
        self.lower_bound = env.action_space.low[0]

        self.actor = Actor(self.obs_size, self.action_size, self.upper_bound).to(device)
        self.actor_target = deepcopy(self.actor)
        self.actor_target.eval()

        self.critic = Critic(self.obs_size, self.action_size).to(device)
        self.critic_target = deepcopy(self.critic)
        self.critic_target.eval()

        self.actor_optim = torch.optim.Adam(self.actor.parameters(), lr=act_lr)
        self.critic_optim = torch.optim.Adam(self.critic.parameters(), lr=critic_lr)

        self.buffer = ReplayBuffer()

        self.gamma = gamma
        self.tau = tau
        self.device = device
        # Total reward of every episode seen by ``learn``.
        self.rewards = []

    def critic_criterion(self, pred, target):
        """Return the smooth L1 loss between predicted and target Q-values."""
        return F.smooth_l1_loss(pred, target)

    def eval(self):
        """Put the online actor and critic into evaluation mode."""
        self.actor.eval()
        self.critic.eval()

    def train(self):
        """Put the online actor and critic into training mode."""
        self.actor.train()
        self.critic.train()

    def save_checkpoint(self):
        """Write all network and optimizer states to ``DDPG_<env_name>.pth``."""
        torch.save(
            {
                "actor": self.actor.state_dict(),
                "actor_target": self.actor_target.state_dict(),
                "actor_optim": self.actor_optim.state_dict(),
                "critic": self.critic.state_dict(),
                "critic_target": self.critic_target.state_dict(),
                "critic_optim": self.critic_optim.state_dict(),
            },
            f"DDPG_{self.env_name}.pth",
        )

    def load_checkpoint(self, path):
        """Restore all network and optimizer states from ``path``."""
        ckpt = torch.load(path, map_location=self.device)
        self.actor.load_state_dict(ckpt["actor"])
        self.actor_target.load_state_dict(ckpt["actor_target"])
        self.actor_optim.load_state_dict(ckpt["actor_optim"])
        self.critic.load_state_dict(ckpt["critic"])
        self.critic_target.load_state_dict(ckpt["critic_target"])
        self.critic_optim.load_state_dict(ckpt["critic_optim"])
        print("Checkpoint loaded successfuly")

    def evaluate_policy(self, env, render=False, create_gif=False):
        """Run one episode with the noise-free actor and print its return.

        Args:
            env: Environment to roll out in.
            render: If true, call ``env.render()`` every step.
            create_gif: If true, collect RGB frames and save every second one
                to ``<env_name>_agent.gif`` (requires ``imageio``).
        """
        obs = env.reset()
        if create_gif:
            import imageio

            img = env.render(mode="rgb_array")
            images = [img]

        done = False
        steps = 0
        rewards = 0.0
        while not done:
            with torch.no_grad():
                obs = torch.FloatTensor(obs).unsqueeze(0).to(self.device)
                # Drop only the batch axis; a bare squeeze() would also
                # collapse a one-dimensional action into a 0-d array.
                action = self.actor(obs).squeeze(0).cpu().numpy()

            if render:
                env.render()

            if create_gif:
                img = env.render(mode="rgb_array")
                images.append(img)

            obs, r, done, _ = env.step(action)
            rewards += r
            steps += 1

        print(f"Steps: {steps}, Total reward: {rewards}")
        if create_gif:
            imageio.mimsave(
                f"{self.env_name}_agent.gif",
                [np.array(img) for img in images[::2]],
                fps=29,
            )
            print("saved gif")

    def _soft_update(self, source, target):
        """Move ``target`` parameters a fraction ``tau`` towards ``source``."""
        for src_param, tgt_param in zip(source.parameters(), target.parameters()):
            tgt_param.data.copy_(
                self.tau * src_param.data + (1 - self.tau) * tgt_param.data
            )

    def update_target(self):
        """Soft-update both target networks from their online counterparts.

        The online networks are switched to eval mode for the copy and are
        always left in training mode afterwards.
        """
        self.eval()
        self._soft_update(self.actor, self.actor_target)
        self._soft_update(self.critic, self.critic_target)
        self.train()

    def update(self, iterations, batch_size):
        """Run ``iterations`` critic/actor gradient steps on sampled batches.

        Args:
            iterations: Number of gradient steps to perform.
            batch_size: Number of transitions sampled per step.

        Returns:
            Mean of (critic loss + actor loss) over the steps, or ``0.0`` when
            ``iterations`` is not positive (avoids a division by zero).
        """
        if iterations <= 0:
            return 0.0

        loss_tot = 0.0
        for _ in range(iterations):
            state, action, next_state, reward, done_bool = self.buffer.sample_batch(
                batch_size
            )
            state = state.to(self.device)
            action = action.to(self.device)
            next_state = next_state.to(self.device)
            # Add a trailing axis so reward/done broadcast against (batch, 1) Q-values.
            reward = reward.unsqueeze(1).to(self.device)
            done_bool = done_bool.unsqueeze(1).to(self.device)

            Q = self.critic(state, action)

            # The bootstrap target is a constant for the critic step.
            with torch.no_grad():
                next_action = self.actor_target(next_state)
                Q_next = self.critic_target(next_state, next_action)
                Q_target = reward + (1 - done_bool) * self.gamma * Q_next

            self.critic_optim.zero_grad()
            critic_loss = self.critic_criterion(Q, Q_target)
            critic_loss.backward()
            self.critic_optim.step()

            # Actor ascends the critic's estimate of its own actions. Gradients
            # that leak into the critic here are cleared by the next
            # critic_optim.zero_grad().
            self.actor_optim.zero_grad()
            actor_loss = -self.critic(state, self.actor(state))
            actor_loss = actor_loss.mean()
            actor_loss.backward()
            self.actor_optim.step()

            self.update_target()

            loss_tot += critic_loss.item() + actor_loss.item()
        return loss_tot / iterations

    def _explore_action(self, obs):
        """Return the actor's action for ``obs`` plus clipped Gaussian noise."""
        with torch.no_grad():
            action = self.actor(torch.FloatTensor(obs).unsqueeze(0).to(self.device))
        action = action.squeeze(0).cpu().numpy()
        noise = np.random.normal(0, 0.1, size=self.action_size)
        return (action + noise).clip(self.lower_bound, self.upper_bound)

    def learn(self, env, episodes, batch_size):
        """Train the agent, logging rewards and checkpointing every episode.

        Actions are sampled uniformly from the action space for the first
        10,000 environment steps; afterwards the actor is used with Gaussian
        exploration noise. Once the buffer holds at least 10,000 transitions,
        each episode is followed by one gradient step per environment step
        taken in that episode. Training stops early when the mean reward of
        the last (up to) 100 episodes reaches 200.

        Args:
            env: Environment with the 4-tuple ``step`` API and a
                ``_max_episode_steps`` attribute.
            episodes: Maximum number of episodes to run.
            batch_size: Batch size for each gradient step.
        """
        warmup_steps = int(1e4)  # random-action phase / minimum buffer fill
        writer = SummaryWriter()
        steps = 0
        try:
            for eps in range(episodes):
                loss_tracker = AverageMeter()
                reward_tracker = AverageMeter()
                obs = env.reset()
                t = 0  # environment steps taken in this episode
                while t < env._max_episode_steps:
                    if steps < warmup_steps:
                        action = env.action_space.sample()
                    else:
                        action = self._explore_action(obs)
                    next_obs, reward, done, info = env.step(action)

                    # An episode cut off by Gym's time limit is not a true
                    # terminal state, so keep bootstrapping from next_obs.
                    truncated = bool(info.get("TimeLimit.truncated", False))
                    self.buffer.store(
                        obs, action, next_obs, reward, float(done and not truncated)
                    )
                    reward_tracker.update(reward)
                    steps += 1
                    # Count the step before the early exit so ``t`` is exact
                    # whether the episode ends via ``done`` or the loop bound.
                    t += 1

                    if done:
                        break

                    obs = next_obs

                self.rewards.append(reward_tracker.sum)
                if len(self.buffer) >= warmup_steps:
                    loss = self.update(iterations=t, batch_size=batch_size)
                    loss_tracker.update(loss)

                writer.add_scalar("reward", reward_tracker.sum, eps + 1)
                print(
                    "Episode: {}/{}, steps: {}/{}, total steps: {}, loss: {:.2f}, current reward: {:.2f}, running reward: {:.2f}".format(
                        eps,
                        episodes,
                        t,
                        env._max_episode_steps,
                        steps,
                        loss_tracker.avg,
                        reward_tracker.sum,
                        np.mean(self.rewards[-100:]),
                    )
                )

                self.save_checkpoint()

                if np.mean(self.rewards[-100:]) >= 200:
                    print("########## LunarLander Env Solved ###########")
                    break
        finally:
            # Flush and release the event file even if training is interrupted.
            writer.close()

In [None]:
# Load the TensorBoard notebook extension and open it on the "runs" directory
# to visualize the per-episode "reward" scalar logged during training.
%load_ext tensorboard
%tensorboard --logdir runs

In [13]:
# Seed the global RNGs first so network initialization is repeatable.
seed_everything(seed=SEED)
env = gym.make("LunarLanderContinuous-v2")
env.seed(SEED)
# The action space draws from its own RNG, which env.seed() does not touch;
# seed it too so the random warm-up actions are reproducible.
env.action_space.seed(SEED)
agent = DDPG(env, "LunarLanderContinuous", device=device)
agent.learn(env, EPISODES, BATCH_SIZE)

Episode: 0/1000, steps: 80/1000, total steps: 80, loss: 0.00, current reward: -122.35, running reward: -122.35
Episode: 1/1000, steps: 107/1000, total steps: 187, loss: 0.00, current reward: -362.11, running reward: -242.23
Episode: 2/1000, steps: 160/1000, total steps: 347, loss: 0.00, current reward: -298.99, running reward: -261.15
Episode: 3/1000, steps: 78/1000, total steps: 425, loss: 0.00, current reward: -50.61, running reward: -208.52
Episode: 4/1000, steps: 156/1000, total steps: 581, loss: 0.00, current reward: -61.30, running reward: -179.07
Episode: 5/1000, steps: 90/1000, total steps: 671, loss: 0.00, current reward: -48.57, running reward: -157.32
Episode: 6/1000, steps: 86/1000, total steps: 757, loss: 0.00, current reward: -214.01, running reward: -165.42
Episode: 7/1000, steps: 83/1000, total steps: 840, loss: 0.00, current reward: -162.67, running reward: -165.08
Episode: 8/1000, steps: 116/1000, total steps: 956, loss: 0.00, current reward: -420.57, running reward: 