# Vanilla Policy Gradient (VPG)

In [5]:
import os

# Put the MuJoCo (mjpro150) and NVIDIA library directories at the front of
# LD_LIBRARY_PATH while keeping whatever entries the kernel already had, so
# other libraries on the path are not lost. dict.fromkeys de-duplicates while
# preserving order, so re-running this cell does not grow the variable.
# NOTE(review): presumably this is read by mujoco-py when it is first imported;
# confirm it is set before any cell that imports gym/mujoco.
os.environ['LD_LIBRARY_PATH'] = ':'.join(dict.fromkeys(
    ['/home/user/.mujoco/mjpro150/bin', '/usr/lib/nvidia']
    + [entry for entry in os.environ.get('LD_LIBRARY_PATH', '').split(':') if entry]
))

In [42]:
import json
import torch

# Load the saved training checkpoint and show the hyperparameters it was
# trained with.
# map_location='cpu' lets a checkpoint that was saved from a GPU be loaded on a
# machine without CUDA; evaluation in this notebook runs on the CPU anyway.
# NOTE: torch.load unpickles the file, so only load checkpoints you trust.
state = torch.load(
    'models/vpg/halfcheetah-v2/generous-sweep-83_rt9y9o20_1000000_20220130T034020.pth',
    map_location='cpu',
)
print(json.dumps(state['config'], indent=2, sort_keys=True))

{
  "batch_size": 4000,
  "entropy_eta": 0,
  "env": "HalfCheetah-v2",
  "env_args": {},
  "gamma": 0.965848101705409,
  "lambda": 0.97,
  "log_step": 8000,
  "max_episode_steps": 200,
  "pi_activation": "tanh",
  "pi_embed": false,
  "pi_embed_size": 0,
  "pi_layer_size": 24,
  "pi_lr": 0.0033899641429487636,
  "pi_num_layers": 4,
  "save_final": true,
  "save_max_eps": false,
  "seed": 42,
  "std_logits": -0.3459644251042939,
  "steps": 1000000,
  "trunk_activation": "relu",
  "trunk_embed": false,
  "trunk_embed_size": 0,
  "trunk_layer_size": 256,
  "trunk_num_layers": 0,
  "trunk_shared": false,
  "vf_activation": "tanh",
  "vf_embed": false,
  "vf_embed_size": 0,
  "vf_layer_size": 328,
  "vf_lr": 0.0001363970382540586,
  "vf_num_layers": 4,
  "vf_train_iters": 294
}


In [43]:
import gym
from spin_class.algos.vpg import make_models

# Evaluation in this notebook runs on the CPU.
device = torch.device('cpu')

# Recreate the environment named in the checkpoint config, forwarding the
# optional 'env_args' entry (empty when the config has none).
kwargs = state['config'].get('env_args', {})
env = gym.make(state['config']['env'], **kwargs)

# Build the policy and value networks from the saved config, then restore
# their trained weights from the checkpoint.
pi, vf = make_models(env, device, state['config'])
pi.load_state_dict(state['pi_state_dict'])
vf.load_state_dict(state['vf_state_dict'])

# Switch both networks to eval mode; the resulting tuple is the cell output.
(pi.eval(), vf.eval())

(VPGGaussianPolicyMLP(
   (head): Sequential(
     (0): Linear(in_features=24, out_features=6, bias=True)
     (1): Tanh()
   )
   (trunk): VPGTrunkMLP(
     (head): Sequential(
       (0): Linear(in_features=17, out_features=24, bias=True)
       (1): Tanh()
       (2): Linear(in_features=24, out_features=24, bias=True)
       (3): Tanh()
       (4): Linear(in_features=24, out_features=24, bias=True)
       (5): Tanh()
       (6): Linear(in_features=24, out_features=24, bias=True)
       (7): Tanh()
     )
   )
 ),
 VPGValueMLP(
   (head): Sequential(
     (0): Linear(in_features=328, out_features=1, bias=True)
   )
   (trunk): VPGTrunkMLP(
     (head): Sequential(
       (0): Linear(in_features=17, out_features=328, bias=True)
       (1): Tanh()
       (2): Linear(in_features=328, out_features=328, bias=True)
       (3): Tanh()
       (4): Linear(in_features=328, out_features=328, bias=True)
       (5): Tanh()
       (6): Linear(in_features=328, out_features=328, bias=True)
       (7

In [44]:
import gym
from gym import wrappers
import io
import base64
from IPython.display import HTML
from IPython import display as ipythondisplay
from pyvirtualdisplay import Display
from torch.distributions.normal import Normal

# Start an off-screen virtual display (1400x900, not visible) for rendering.
# The globals() guard makes the cell safe to re-run: the display is created
# and started only the first time, when no global named `display` exists yet.
# NOTE(review): the global name `display` may shadow IPython's own `display`
# helper for later cells — confirm nothing below relies on the bare name.
if 'display' not in globals():
    display = Display(visible=False, size=(1400, 900))
    display.start()

def play(env, pi, vf, steps=1000, video_dir="./video"):
    """Roll out ``pi`` in ``env``, record the episode and show the video inline.

    The environment is wrapped in ``wrappers.Monitor`` (with ``force=True``)
    writing to ``video_dir``. Actions are sampled from the policy for at most
    ``steps`` steps, or until the environment reports ``done``, in which case
    the index of the final step is printed. The recorded MP4 is then read,
    base64-encoded and displayed as an HTML ``<video>`` element.

    Reads the notebook-level ``device`` when building observation tensors.

    Args:
        env: Gym environment to roll out in.
        pi: Policy. Called on a batched observation tensor; must also provide
            ``distribution(...)`` returning an object with ``sample()``.
        vf: Value function. Not used here; kept so existing calls still work.
        steps: Maximum number of environment steps to take.
        video_dir: Directory the Monitor wrapper records into.
    """
    env = wrappers.Monitor(env, video_dir, force=True)
    # Discrete observations are integer indices; everything else is fed as float32.
    obs_dtype = (
        torch.int64
        if isinstance(env.observation_space, gym.spaces.Discrete)
        else torch.float32
    )
    try:
        obs = env.reset()
        for step in range(steps):
            with torch.no_grad():
                # Add a leading batch dimension before calling the policy.
                obs_t = torch.as_tensor(obs, dtype=obs_dtype, device=device).unsqueeze(0)
                p = pi(obs_t)[0]
                # NOTE(review): the meaning of the second argument (0.99) is
                # defined by pi.distribution, which is not visible here — confirm.
                dist = pi.distribution(p, 0.99)
                action = dist.sample().cpu().numpy().tolist()
            obs, _, done, _ = env.step(action)
            if done:
                print(step)
                break
    finally:
        # Always close the Monitor wrapper, even if the rollout raised.
        env.close()

    # Read the recording through a context manager so the handle is released;
    # read-only binary mode is all that is needed.
    video_path = '%s/openaigym.video.%s.video000000.mp4' % (video_dir, env.file_infix)
    with io.open(video_path, 'rb') as video_file:
        encoded = base64.b64encode(video_file.read())
    ipythondisplay.display(HTML(data='''
        <video alt="test" autoplay loop controls style="height: 400px;">
            <source src="data:video/mp4;base64,{0}" type="video/mp4" />
        </video>'''.format(encoded.decode('ascii'))))

In [45]:
# Record and display a rollout of the loaded policy, capped at 500 steps.
play(env, pi, vf, steps=500)

In [16]:
# Baseline: run episodes with uniformly sampled actions and report the mean
# undiscounted return, for comparison with the trained policy.
num_eps = 100
rets = []
for i in range(num_eps):
    obs, ret, done = env.reset(), 0, False
    # Step with random actions until the environment signals the episode end.
    while True:
        obs, r, done, _ = env.step(env.action_space.sample())
        ret = ret + r
        if done:
            break
    rets.append(ret)

print(f'average return of random agent: {sum(rets) / len(rets)}')

average return: 0.0149


In [None]:
import random
import torch

def play_frozenlake(env, pi, max_steps=100):
    """Roll out ``pi`` in ``env``, rendering the environment after every step.

    Prints a ``====== step N ======`` banner and calls ``env.render()`` for
    the initial state and after each action. Stops after ``max_steps`` actions
    or as soon as the environment reports ``done``; in the latter case the
    zero-based index of the final step is printed. The environment is closed
    on exit, including when the rollout raises.

    Reads the notebook-level ``device`` when building observation tensors.

    Args:
        env: Gym environment to roll out in.
        pi: Policy. Called on a batched observation tensor; must also provide
            ``distribution(p)`` returning an object with ``sample()``.
        max_steps: Maximum number of actions to take.
    """
    # Discrete observations are integer indices; everything else is fed as float32.
    obs_dtype = (
        torch.int64
        if isinstance(env.observation_space, gym.spaces.Discrete)
        else torch.float32
    )
    try:
        obs = env.reset()
        print('====== step 0 ======')
        env.render()
        for i in range(max_steps):
            with torch.no_grad():
                # Add a leading batch dimension before calling the policy.
                obs_t = torch.as_tensor(obs, dtype=obs_dtype, device=device).unsqueeze(0)
                p = pi(obs_t)[0]
                dist = pi.distribution(p)
                a = dist.sample()
            obs, _, done, _ = env.step(a.item())
            print(f'====== step {i + 1} ======')
            env.render()
            if done:
                # Report the loop index; the previous code printed an unrelated `_`.
                print(i)
                break
    finally:
        env.close()

In [54]:
# Step through one episode, rendering the environment after each action.
# NOTE(review): the cells visible above load a HalfCheetah-v2 checkpoint into
# `env`/`pi`, while the output below is a FrozenLake grid — confirm which
# checkpoint must be loaded before this cell runs.
play_frozenlake(env, pi)


[41mS[0mFFF
FHFH
FFFH
HFFG
[-4.136449813842773, 20.60943031311035, 4.27929162979126, -24.616153717041016]
tensor([1.7906e-11, 1.0000e+00, 8.0893e-08, 2.2844e-20], device='cuda:1')
{'prob': 1.0}
  (Down)
SFFF
[41mF[0mHFH
FFFH
HFFG
[-4.060048580169678, 15.172551155090332, 4.42076301574707, -19.144397735595703]
tensor([4.4400e-09, 9.9998e-01, 2.1407e-05, 1.2483e-15], device='cuda:1')
{'prob': 1.0}
  (Down)
SFFF
FHFH
[41mF[0mFFH
HFFG
[-1.4209874868392944, -8.478757858276367, 8.982975006103516, -6.1250128746032715]
tensor([3.0311e-05, 2.6089e-08, 9.9997e-01, 2.7458e-07], device='cuda:1')
{'prob': 1.0}
  (Right)
SFFF
FHFH
F[41mF[0mFH
HFFG
[-1.3930362462997437, -8.16796875, 9.203563690185547, -6.526035308837891]
tensor([2.5000e-05, 2.8551e-08, 9.9997e-01, 1.4747e-07], device='cuda:1')
{'prob': 1.0}
  (Right)
SFFF
FHFH
FF[41mF[0mH
HFFG
[-5.386871337890625, 26.826488494873047, 4.038366794586182, -30.372730255126953]
tensor([1.0231e-14, 1.0000e+00, 1.2684e-10, 1.4411e-25], device='cud