# Debug VPG

In [49]:
from datetime import datetime
import gym
import gym.spaces
import numpy as np
import os
import random
import torch
from torch.distributions.categorical import Categorical
from torch.distributions.normal import Normal
import torch.nn as nn
import torch.nn.functional as F
from torch.optim import Adam
from typing import Any, Dict, List, Tuple, Union
import wandb


def shape(space: gym.Space):
    if isinstance(space, gym.spaces.Discrete):
        return tuple()
    elif isinstance(space, gym.spaces.Box):
        return space.sample().shape
    else:
        raise Exception(f"Unsupported space type: {type(space)}")


class ScaleLayer1d(nn.Module):
    def __init__(self, scale: torch.Tensor):
        super(ScaleLayer1d, self).__init__()

        self.scale = scale

    def forward(self, x: torch.Tensor):
        return torch.matmul(x, self.scale)


class OneHot1d(nn.Module):
    def __init__(self, num_classes: int):
        super(OneHot1d, self).__init__()

        self.num_classes = num_classes

    def forward(self, x: torch.Tensor):
        y = torch.as_tensor(x, dtype=torch.int64)
        return F.one_hot(y, num_classes=self.num_classes).to(torch.float32)


class VPGModel(nn.Module):
    def __init__(
        self,
        env: gym.Env,
        layers: List[Tuple[str, int, str]],
        device: torch.DeviceObjType,
    ):
        super(VPGModel, self).__init__()

        self.env = env
        self.device = device

        if isinstance(env.observation_space, gym.spaces.Box):
            self.layers = [("input", shape(env.observation_space)[0])] + layers
        elif isinstance(env.observation_space, gym.spaces.Discrete):
            self.layers = [
                ("input", 1),
            ] + layers

        self.model = None

    def forward(self, x: torch.Tensor):
        return self.model(x)

    def build_model(self, conf: List[Union[Tuple[str, int], Tuple[str, int, str]]]):
        layers = []
        t, prev_size = conf[0]
        assert t == "input"
        for layer in conf[1:]:
            if layer[0] == "linear":
                size = layer[1]
                layers.append(nn.Linear(prev_size, size))
                if layer[2] == "relu":
                    layers.append(nn.ReLU())
                elif layer[2] == "none":
                    pass
                elif layer[2] == "tanh":
                    layers.append(nn.Tanh())
                else:
                    raise NotImplementedError(
                        f"Unrecognized activation type: {layer[2]}"
                    )
            elif layer[0] == "scaling":
                assert isinstance(layer[1], torch.Tensor)
                layers.append(ScaleLayer1d(layer[1]))
            elif layer[0] == "onehot":
                size = layer[1]
                layers.append(OneHot1d(size))
            elif layer[0] == "embed":
                num_classes = layer[1]
                size = layer[2]
                layers.append(
                    nn.Embedding(
                        num_classes,
                        size,
                        sparse=False,
                        dtype=torch.float32,
                        device=self.device,
                    )
                )
            else:
                raise ValueError(f"Unrecognized layer type: {layer[0]}")
            prev_size = size
        return nn.Sequential(*layers).to(self.device)


class VPGValueModel(VPGModel):
    def __init__(
        self,
        env: gym.Env,
        layers: List[Tuple[str, int, str]],
        device: torch.DeviceObjType,
    ):
        super(VPGValueModel, self).__init__(env, layers, device)

        self.layers += [("linear", 1, "none")]
        self.model = self.build_model(self.layers)


class VPGPolicyModel(VPGModel):
    def __init__(
        self,
        env: gym.Env,
        layers: List[Tuple[str, int, str]],
        device: torch.DeviceObjType,
    ):
        super(VPGPolicyModel, self).__init__(env, layers, device)


class VPGGaussianPolicyModel(VPGPolicyModel):
    def __init__(
        self,
        env: gym.Env,
        layers: List[Tuple[str, int, str]],
        std_logits: float,
        device: torch.torch.DeviceObjType,
    ):
        super(VPGGaussianPolicyModel, self).__init__(env, layers, device)

        assert isinstance(env.action_space, gym.spaces.Box)

        signal_count = env.action_space.shape[0]

        output_layers = [("linear", signal_count, "tanh")]

        # TOOD: Support infinite sized boxes.
        signal_scale = torch.as_tensor(
            (env.action_space.high - env.action_space.low) / 2.0,
            dtype=torch.float32,
            device=device,
        )
        if not torch.all(torch.isclose(signal_scale, torch.ones_like(signal_scale))):
            output_layers.append(("scaling", signal_scale))

        if not np.all(
            np.isclose(env.action_space.high, -env.action_space.low, equal_nan=True)
        ):
            # TODO: add offset layer.
            raise NotImplementedError(
                "Box ranges which are not centered at 0 are not yet implemented."
            )

        self.layers += output_layers
        self.model = self.build_model(self.layers)

        self.std = torch.exp(
            std_logits * torch.ones((signal_count,), dtype=torch.float32, device=device)
        )

    def distribution(self, output: torch.Tensor):
        return Normal(output, self.std)


class VPGCategoricalPolicyModel(VPGPolicyModel):
    def __init__(
        self,
        env: gym.Env,
        layers: List[Tuple[str, int, str]],
        device: torch.torch.DeviceObjType,
    ):
        super(VPGCategoricalPolicyModel, self).__init__(env, layers, device)

        assert isinstance(env.action_space, gym.spaces.Discrete)

        output_layers = [("linear", env.action_space.n, "none")]
        self.layers += output_layers
        self.model = self.build_model(self.layers)

    def distribution(self, output: torch.Tensor):
        return Categorical(logits=output)

In [50]:
conf = {
    "cartpole": {
        "env": "CartPole-v0",
        "save_max_eps": False,
        "vf_layers": [
            ("linear", 16, "relu"),
            ("linear", 16, "relu"),
            ("linear", 16, "relu"),
            ("linear", 16, "relu"),
        ],
        "pi_layers": [
            ("linear", 128, "relu"),
            ("linear", 128, "relu"),
            ("linear", 128, "relu"),
        ],
        "pi_lr": 0.003,
        "vf_lr": 0.0015,
        "vf_train_iters": 80,
        "gamma": 0.995,
        "lambda": 0.95,
        "batch_size": 1024,
        "steps": 163840,
        "log_step": 8192,
        "seed": 42,
    },
    "invertedpendulum": {
        "env": "InvertedPendulum-v2",
        "save_max_eps": False,
        "vf_layers": [
            ("linear", 128, "relu"),
            ("linear", 128, "relu"),
        ],
        "pi_layers": [
            ("linear", 64, "relu"),
            ("linear", 64, "relu"),
            ("linear", 64, "relu"),
        ],
        "pi_lr": 0.004,
        "vf_lr": 0.1,
        "vf_train_iters": 320,
        "std_logits": -0.5,
        "gamma": 0.995,
        "lambda": 0.99,
        "batch_size": 1024,
        "steps": 163840,
        "log_step": 8192,
        "seed": 42,
    },
    "frozenlake": {
        "env": "FrozenLake-v1",
        "env_args": {
            "is_slippery": False,
        },
        "save_max_eps": False,
        "vf_layers": [
            ("embed", 16, 8),
            ("linear", 128, "relu"),
            ("linear", 128, "relu"),
            ("linear", 128, "relu"),
            ("linear", 128, "relu"),
        ],
        "pi_layers": [
            ("embed", 16, 8),
            ("linear", 256, "relu"),
            ("linear", 256, "relu"),
            ("linear", 256, "relu"),
        ],
        "pi_lr": 0.01,
        "vf_lr": 0.004,
        "vf_train_iters": 80,
        "gamma": 0.99,
        "lambda": 0.97,
        "batch_size": 4096,
        "steps": 163840,
        "log_step": 8192,
        "seed": 42,
    },
    "halfcheetah": {
        "env": "HalfCheetah-v2",
        "save_max_eps": False,
        "vf_layers": [
            ("linear", 128, "relu"),
            ("linear", 128, "relu"),
        ],
        "pi_layers": [
            ("linear", 64, "relu"),
            ("linear", 64, "relu"),
            ("linear", 64, "relu"),
        ],
        "pi_lr": 0.004,
        "vf_lr": 0.1,
        "vf_train_iters": 320,
        "std_logits": -0.5,
        "gamma": 0.995,
        "lambda": 0.99,
        "batch_size": 1024,
        "steps": 163840,
        "log_step": 8192,
        "seed": 42,
    },
}

In [51]:
import argparse
import gym
import random
import spin_class.algos.vpg as vpg
import torch
import wandb

from typing import Any, Dict

import spin_class.utils as utils
# import spin_class.config as conf


env_name = 'frozenlake'

i = random.randrange(torch.cuda.device_count())
device = torch.device(f"cuda:{i}")
print("Using GPU", i)

config = conf[env_name]

kwargs = config["env_args"] if "env_args" in config else {}
env = gym.make(config["env"], **kwargs)

config["seed"] = 0

# run = wandb.init(
#     project=f"vpg-{args.env}",
#     config=config,
#     name=f"reproducibility-{seed}",
# )

run_id = ''
run_name = 'debug'

Using GPU 1


In [52]:
# Make the training reproducible.
env.seed(config["seed"])
random.seed(config["seed"])
np.random.seed(config["seed"])
torch.random.manual_seed(config["seed"])
torch.cuda.manual_seed_all(config["seed"])
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

if isinstance(env.action_space, gym.spaces.Box):
    pi = VPGGaussianPolicyModel(
        env, config["pi_layers"], config["std_logits"], device
    )
elif isinstance(env.action_space, gym.spaces.Discrete):
    pi = VPGCategoricalPolicyModel(env, config["pi_layers"], device)
else:
    raise NotImplementedError(
        f"Action space type not yet supported: {type(env.action_space)}"
    )
vf = VPGValueModel(env, config["vf_layers"], device)

# wandb.watch(pi)
# wandb.watch(vf)

pi_opt = Adam(pi.parameters(), lr=config["pi_lr"])
vf_opt = Adam(vf.parameters(), lr=config["vf_lr"])

save_max_eps = config["save_max_eps"]
gamma = config["gamma"]
lam = config["lambda"]
batch_size = config["batch_size"]
avg_eps_len = 0
max_performance = False
epsilon = 1e-6
obs_dtype = (
    torch.int64
    if isinstance(env.observation_space, gym.spaces.Discrete)
    else torch.float32
)


model_dir = f"models/vpg/{env.spec.id.lower()}"
os.makedirs(f"{model_dir}/pi", mode=0o755, exist_ok=True)
os.makedirs(f"{model_dir}/vf", mode=0o755, exist_ok=True)

for k in range(0, config["steps"], batch_size):
    done = False
    obs = env.reset()
    obss = np.zeros([batch_size] + list(shape(env.observation_space)))
    if len(obss.shape) == 1:
        obss = np.expand_dims(obss, axis=1)
    rets = torch.zeros(batch_size, dtype=torch.float32, device=device)
    advs = torch.zeros(batch_size, dtype=torch.float32, device=device)
    as_ = torch.zeros(batch_size, dtype=torch.float32, device=device)
    rs = torch.zeros(batch_size, dtype=torch.float32, device=device)
    eps_vs, eps_rs = torch.zeros(
        batch_size, dtype=torch.float32, device=device
    ), torch.zeros(batch_size, dtype=torch.float32, device=device)
    ptr = 0
    eps_lens = []
    eps_len = 0
    total_rews = []
    total_rew = 0
    for i in range(batch_size):
        eps_len += 1

        obss[i, :] = obs

        with torch.no_grad():
            obs_t = torch.as_tensor(
                obs, dtype=obs_dtype, device=device
            ).unsqueeze(0)
            p = pi(obs_t)[0]
            v = vf(obs_t)[0]
            dist = pi.distribution(p)
            a = dist.sample()
            obs, r, done, _ = env.step(a.item())

        eps_vs[eps_len - 1] = v
        as_[i] = a
        eps_rs[eps_len - 1] = r
        rs[i] = r

        total_rew += r

        if done or i == batch_size - 1:
            if done:
                eps_lens.append(eps_len)
                total_rews.append(total_rew)
            ret = 0
            for i in range(eps_len - 1, -1, -1):
                ret = eps_rs[i] + gamma * ret
                rets[ptr + i] = ret
            adv = 0
            for i in range(eps_len - 1, 0, -1):
                adv = (
                    eps_rs[i - 1] + gamma * eps_vs[i] - eps_vs[i - 1]
                ) + lam * gamma * adv
                advs[ptr + i] = adv
            advs[ptr + eps_len - 1] = eps_rs[-1] - eps_vs[-1]
            ptr += eps_len
            done = False
            eps_len = 0
            total_rew = 0
            obs = env.reset()

    step = k + batch_size

    avg_total_rew = sum(total_rews) / len(total_rews)
    max_total_rew = max(total_rews)
    min_total_rew = min(total_rews)
    std_total_rew = np.std(total_rews).tolist()
    avg_eps_len = sum(eps_lens) / len(eps_lens)
    max_eps_len = max(eps_lens)
    min_eps_len = min(eps_lens)
    std_eps_len = np.std(eps_lens).tolist()

    obs_b = torch.as_tensor(obss.squeeze(), dtype=obs_dtype, device=device)
    a_b = as_
    ret_b = rets

    adv_b = advs
    std, mean = adv_b.std(dim=0), adv_b.mean()
    adv_b = (adv_b - mean) / (std + epsilon)
    
    pi_opt.zero_grad()

    logp_b = pi.distribution(pi(obs_b)).log_prob(a_b)
    pi_loss = -(logp_b * adv_b).mean()
    pi_loss.backward()
    pi_opt.step()

    for i in range(config["vf_train_iters"]):
        vf_opt.zero_grad()
        v_b = vf(obs_b).squeeze()
        vf_loss = ((v_b - ret_b) ** 2).mean()
        vf_loss.backward()
        vf_opt.step()

    if step % config["log_step"] == 0:
        pass
        # wandb.log(
        #     {
        #         "avg_total_rew": avg_total_rew,
        #         "max_total_rew": max_total_rew,
        #         "min_total_rew": min_total_rew,
        #         "std_total_rew": std_total_rew,
        #         "avg_eps_len": avg_eps_len,
        #         "max_eps_len": max_eps_len,
        #         "min_eps_len": min_eps_len,
        #         "std_eps_len": std_eps_len,
        #         "pi_loss": pi_loss.item(),
        #         "vf_loss": vf_loss.item(),
        #         "adv_std": std,
        #         "adv_mean": mean,
        #         "steps": step,
        #     }
        # )

    print(
        f"steps: {step}, avg total rew: {avg_total_rew:.4f}, avg eps length: {avg_eps_len:.2f}, min eps length: {min_eps_len}, pi loss: {pi_loss.item():.6f}, vf_loss: {vf_loss:.6f}"
    )

steps: 4096, avg total rew: 0.0203, avg eps length: 7.54, min eps length: 2, pi loss: 0.002584, vf_loss: 0.021279
steps: 8192, avg total rew: 0.0000, avg eps length: 6.67, min eps length: 2, pi loss: 0.041287, vf_loss: 0.000000
steps: 12288, avg total rew: 0.0070, avg eps length: 7.10, min eps length: 2, pi loss: 0.088946, vf_loss: 0.014411
steps: 16384, avg total rew: 0.0028, avg eps length: 11.44, min eps length: 2, pi loss: 0.015576, vf_loss: 0.001599
steps: 20480, avg total rew: 0.0699, avg eps length: 10.98, min eps length: 2, pi loss: -0.024131, vf_loss: 0.069208
steps: 24576, avg total rew: 0.3925, avg eps length: 8.98, min eps length: 2, pi loss: -0.074563, vf_loss: 0.209422
steps: 28672, avg total rew: 0.7974, avg eps length: 7.54, min eps length: 2, pi loss: -0.091247, vf_loss: 0.123361
steps: 32768, avg total rew: 0.9799, avg eps length: 6.34, min eps length: 4, pi loss: -0.052225, vf_loss: 0.016370
steps: 36864, avg total rew: 1.0000, avg eps length: 6.03, min eps length: 6

KeyboardInterrupt: 

In [18]:
l = OneHot1d(num_classes=16)
l.forward(obs_b)

tensor([[1., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [1., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [1., 0., 0.,  ..., 0., 0., 0.],
        [1., 0., 0.,  ..., 0., 0., 0.]], device='cuda:1')

In [9]:
print(vf.model)

Sequential(
  (0): OneHot1d()
  (1): Linear(in_features=16, out_features=128, bias=True)
  (2): ReLU()
  (3): Linear(in_features=128, out_features=128, bias=True)
  (4): ReLU()
  (5): Linear(in_features=128, out_features=128, bias=True)
  (6): ReLU()
  (7): Linear(in_features=128, out_features=128, bias=True)
  (8): ReLU()
  (9): Linear(in_features=128, out_features=1, bias=True)
)


In [16]:
num_eps = 10000
rets = []
for i in range(num_eps):
    obs = env.reset()
    ret = 0
    done = False
    while not done:
        obs, r, done, _ = env.step(env.action_space.sample())
        ret += r
    rets.append(ret)
    
print(f'average return: {sum(rets) / len(rets)}')

average return: 0.0149


In [54]:
obs_dtype = torch.int64 if isinstance(env.observation_space, gym.spaces.Discrete) else torch.float32
obs = env.reset()
env.render()
done = False
while not done:
    with torch.no_grad():
        obs_t = torch.as_tensor(obs, dtype=obs_dtype, device=device).unsqueeze(0)
        p = pi(obs_t)[0]
        print(p.cpu().numpy().tolist())
        dist = pi.distribution(p)
        print(dist.probs)
        a = dist.sample()
        obs, r, done, info = env.step(a.item())
        print(info)
        env.render()


[41mS[0mFFF
FHFH
FFFH
HFFG
[-4.136449813842773, 20.60943031311035, 4.27929162979126, -24.616153717041016]
tensor([1.7906e-11, 1.0000e+00, 8.0893e-08, 2.2844e-20], device='cuda:1')
{'prob': 1.0}
  (Down)
SFFF
[41mF[0mHFH
FFFH
HFFG
[-4.060048580169678, 15.172551155090332, 4.42076301574707, -19.144397735595703]
tensor([4.4400e-09, 9.9998e-01, 2.1407e-05, 1.2483e-15], device='cuda:1')
{'prob': 1.0}
  (Down)
SFFF
FHFH
[41mF[0mFFH
HFFG
[-1.4209874868392944, -8.478757858276367, 8.982975006103516, -6.1250128746032715]
tensor([3.0311e-05, 2.6089e-08, 9.9997e-01, 2.7458e-07], device='cuda:1')
{'prob': 1.0}
  (Right)
SFFF
FHFH
F[41mF[0mFH
HFFG
[-1.3930362462997437, -8.16796875, 9.203563690185547, -6.526035308837891]
tensor([2.5000e-05, 2.8551e-08, 9.9997e-01, 1.4747e-07], device='cuda:1')
{'prob': 1.0}
  (Right)
SFFF
FHFH
FF[41mF[0mH
HFFG
[-5.386871337890625, 26.826488494873047, 4.038366794586182, -30.372730255126953]
tensor([1.0231e-14, 1.0000e+00, 1.2684e-10, 1.4411e-25], device='cud

In [20]:
dir(env.spec)

['__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__slotnames__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_env_name',
 '_kwargs',
 'entry_point',
 'id',
 'make',
 'max_episode_steps',
 'nondeterministic',
 'order_enforce',
 'reward_threshold']