In [6]:
import os
import gym
import pickle
import argparse
import numpy as np
from collections import deque

import torch
import torch.nn as nn
import torch.optim as optim
from torch.distributions import Categorical
from tensorboardX import SummaryWriter 

from utils.utils import *

import matplotlib.pyplot as plt

# Report which PyTorch build this kernel is running.
print('you are using PyTorch version ',torch.__version__)

# Pick the compute device: the first CUDA GPU when one is usable, else the CPU.
use_cuda = torch.cuda.is_available()
if not use_cuda:
    print('no GPUs detected')
    device = torch.device("cpu")
else:
    print("you have", torch.cuda.device_count(), "GPUs")
    device = torch.device("cuda:0")
    print(device)

%load_ext autoreload
%autoreload 2
%matplotlib inline

you are using PyTorch version  1.4.0
you have 2 GPUs
cuda:0
The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Variational Adversarial Inverse Reinforcement Learning

## Environment

`env` is the environment and will take our actions and return the next state

## Running mean and variance

`ZFilter`  incorporates the input into a running estimate of mean and variance, then 
returns the z-score of the input 

When a large number of inputs $x$ arrive in sequence and we cannot store every $x$, we can still update $M$ and $V$, the running mean and variance, incrementally as each new input arrives.

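Initialize:

$M_1 = x_1$

$S_1 = 0$

Then, for each new input $x_t$ with $t \geq 2$:

$M_t = M_{t-1} + \frac{x_t - M_{t-1}}{t}$

$S_t = S_{t-1} + (x_t - M_{t-1})(x_t - M_t)$

The running variance is $V_t = \frac{S_t}{t - 1}$ (with $V_1 = 0$).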


In [7]:
# Create the environment; the commented-out line is the alternative task.
env = gym.make('BipedalWalker-v3')
#env = gym.make('Hopper-v2')

# Seed both PyTorch and the environment.
torch.manual_seed(0)
env.seed(0)

print("state space", env.observation_space, "action space", env.action_space)

# Sizes of the observation and action vectors (first dimension of each space).
num_inputs, num_actions = (
    space.shape[0] for space in (env.observation_space, env.action_space)
)

# Running normaliser applied to every observation before it reaches the networks.
running_state = ZFilter((num_inputs,), clip=5)

state space Box(24,) action space Box(4,)


In [8]:
class Actor(nn.Module):
    """Policy network that maps a state to the mean and std of an action distribution.

    Two tanh hidden layers feed a linear head that outputs the mean. The
    standard deviation is not learned: the log-std is fixed at zero, so the
    returned std is 1 for every action dimension.

    Args:
        num_inputs: size of the state vector.
        num_outputs: size of the action vector.
        hidden_size: width of both hidden layers.
    """

    def __init__(self, num_inputs, num_outputs, hidden_size):
        super().__init__()
        self.fc1 = nn.Linear(num_inputs, hidden_size)
        self.fc2 = nn.Linear(hidden_size, hidden_size)
        self.fc3 = nn.Linear(hidden_size, num_outputs)

        # Scale down the output head: weights by 0.1, bias to zero.
        head = self.fc3
        head.weight.data.mul_(0.1)
        head.bias.data.mul_(0.0)

    def forward(self, x):
        """Return ``(mu, std)`` for the batch of states ``x``."""
        hidden = x
        for layer in (self.fc1, self.fc2):
            hidden = torch.tanh(layer(hidden))
        mu = self.fc3(hidden)
        # exp(0) == 1: a constant unit standard deviation shaped like mu.
        std = torch.zeros_like(mu).exp()
        return mu, std


class Critic(nn.Module):
    """Value network that maps a state to a single scalar output.

    Two tanh hidden layers are followed by a one-unit linear head.

    Args:
        num_inputs: size of the state vector.
        hidden_size: width of both hidden layers.
    """

    def __init__(self, num_inputs, hidden_size):
        super().__init__()
        self.fc1 = nn.Linear(num_inputs, hidden_size)
        self.fc2 = nn.Linear(hidden_size, hidden_size)
        self.fc3 = nn.Linear(hidden_size, 1)

        # Scale down the output head: weights by 0.1, bias to zero.
        head = self.fc3
        head.weight.data.mul_(0.1)
        head.bias.data.mul_(0.0)

    def forward(self, x):
        """Return the value estimate for the batch of states ``x``."""
        hidden = x
        for layer in (self.fc1, self.fc2):
            hidden = torch.tanh(layer(hidden))
        return self.fc3(hidden)


class VDB(nn.Module):
    """Discriminator that classifies through a stochastic latent code.

    The input is encoded into the mean and log-variance of a latent vector
    ``z``; a sample of ``z`` is drawn with the reparameterisation trick and
    then mapped to a probability by a small sigmoid-terminated network.

    Args:
        num_inputs: size of the input vector.
        hidden_size: width of the encoder and discriminator hidden layers.
        z_size: dimensionality of the latent code.
    """

    def __init__(self, num_inputs, hidden_size, z_size):
        super().__init__()
        self.fc1 = nn.Linear(num_inputs, hidden_size)   # encoder trunk
        self.fc2 = nn.Linear(hidden_size, z_size)       # latent mean head
        self.fc3 = nn.Linear(hidden_size, z_size)       # latent log-variance head
        self.fc4 = nn.Linear(z_size, hidden_size)       # discriminator hidden layer
        self.fc5 = nn.Linear(hidden_size, 1)            # discriminator output

        # Scale down the output layer: weights by 0.1, bias to zero.
        out_layer = self.fc5
        out_layer.weight.data.mul_(0.1)
        out_layer.bias.data.mul_(0.0)

    def encoder(self, x):
        """Return ``(mu, logvar)`` of the latent code for input ``x``."""
        hidden = torch.tanh(self.fc1(x))
        mu = self.fc2(hidden)
        logvar = self.fc3(hidden)
        return mu, logvar

    def reparameterize(self, mu, logvar):
        """Draw ``z = mu + std * eps`` with ``eps`` sampled from a standard normal."""
        std = (logvar / 2).exp()
        noise = torch.randn_like(std)
        return mu + noise * std

    def discriminator(self, z):
        """Map a latent code ``z`` to a probability in (0, 1)."""
        hidden = torch.tanh(self.fc4(z))
        logit = self.fc5(hidden)
        return torch.sigmoid(logit)

    def forward(self, x):
        """Encode ``x``, sample a latent code, and return ``(prob, mu, logvar)``."""
        mu, logvar = self.encoder(x)
        prob = self.discriminator(self.reparameterize(mu, logvar))
        return prob, mu, logvar

In [9]:
# Optimizer hyperparameters.
learning_rate = 3e-4
l2_rate = 1e-3

# Networks. The discriminator's input width is num_inputs + num_actions.
actor = Actor(num_inputs, num_actions, hidden_size=128)
critic = Critic(num_inputs, hidden_size=128)
vdb = VDB(num_inputs + num_actions, hidden_size=128, z_size=4)

# One Adam optimizer per network; only the critic uses weight decay.
actor_optim = optim.Adam(params=actor.parameters(), lr=learning_rate)
critic_optim = optim.Adam(
    params=critic.parameters(),
    lr=learning_rate,
    weight_decay=l2_rate,
)
vdb_optim = optim.Adam(params=vdb.parameters(), lr=learning_rate)

In [10]:
# load demonstrations
# NOTE: unpickling can execute arbitrary code; only load demo files from a
# trusted source.
# The `with` block guarantees the file handle is closed once loading finishes.
with open('expert_demo.p', "rb") as demo_file:
    # The pickle holds a 2-tuple; only its first element is used here.
    expert_demo, _ = pickle.load(demo_file)
demonstrations = np.array(expert_demo)
# NOTE(review): the recorded output shows rows of width 14, while the
# environment cell reports a 24-dim observation space and a 4-dim action space
# (24 + 4 = 28). Confirm that expert_demo.p was generated for the environment
# selected above.
print("demonstrations.shape", demonstrations.shape, demonstrations[:2])

demonstrations.shape (50000, 14) [[-0.48224705 -1.18786003  1.84605944  0.62223241 -0.39152268 -3.21709328
   0.05523458 -0.0175782   0.14056332  0.08432692  0.01398241  2.57012254
   2.16022653  1.25368368]
 [-0.48457226 -1.11279922  1.86942212  0.62266743 -0.38204572 -3.11900995
  -0.0192135   0.62860545  0.72578217  0.08719617  0.25489085  2.5566931
   2.40988924  1.14469644]]


In [12]:
# Loop configuration and bookkeeping.
max_iter_num = 4
total_sample_size = 512
episodes = 0
train_discrim_flag = True

# NOTE: the loop variable `iter` shadows Python's builtin of the same name.
for iter in range(max_iter_num):
    # Switch both networks to evaluation mode before collecting samples.
    actor.eval()
    critic.eval()

    # Per-iteration buffers and counters.
    scores = []
    steps = 0
    memory = deque()

    while steps < total_sample_size:
        score = 0

        # Start a new episode and normalise its first observation.
        state = running_state(env.reset())
        print(state)
        break  # stop after the first normalised observation
    break  # stop after the first iteration

[-0.69259548  0.70470214 -0.70708828 -0.45938691  0.70703177  0.70709881
 -0.70703864 -0.70709676  0.          0.70700392  0.70709881 -0.70695971
 -0.70709507  0.         -0.65528151 -0.65528151 -0.66221509 -0.66221509
 -0.66751234 -0.66751234 -0.67507248 -0.68020843  0.          0.        ]
