In [7]:
import os
import gym
import pickle
import argparse
import numpy as np
from collections import deque

import torch
import torch.nn as nn
import torch.optim as optim
from tensorboardX import SummaryWriter 

from utils.utils import *

# Variational Adversarial Inverse Reinforcement Learning

## Environment

`env` is the environment: it takes our actions and returns the next state.

## Running mean and variance

`ZFilter` incorporates each input into a running estimate of the mean and variance, then
returns the z-score of that input.

When a large number of inputs $x_1, x_2, \dots$ arrive in sequence and we cannot store every $x_t$, we can still keep the running mean $M$ and variance $V$ up to date incrementally.

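Initialize:

$M_1 = x_1$

$S_1 = 0$

Then, for each new input $x_t$ with $t \ge 2$:

$M_t = M_{t-1} + \frac{x_t - M_{t-1}}{t}$

$S_t = S_{t-1} + (x_t - M_{t-1})(x_t - M_t)$

Here $S_t$ is the running sum of squared deviations from the mean, so the variance estimate is $V_t = \frac{S_t}{t-1}$ for $t \ge 2$.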


In [6]:
# Build the environment and seed every random number generator in play so
# that a fresh "Restart & Run All" reproduces the same results.
SEED = 0

env = gym.make('BipedalWalker-v3')
env.seed(SEED)
np.random.seed(SEED)  # NumPy's global RNG was previously left unseeded
torch.manual_seed(SEED)
print("state space", env.observation_space, "action space", env.action_space)

# Network input/output widths are read off the first axis of the observation
# and action space shapes.
num_inputs = env.observation_space.shape[0]
num_actions = env.action_space.shape[0]

# Running normaliser for observations. ZFilter is not defined in this notebook
# (presumably it comes from the `utils.utils` star import -- confirm).
# NOTE(review): `clip=5` looks like a bound on the normalised value; verify
# against the ZFilter implementation.
running_state = ZFilter((num_inputs,), clip=5)

state space Box(24,) action space Box(4,)


In [12]:
class Actor(nn.Module):
    """Policy network that maps a state to the mean and std of an action distribution.

    Two tanh-activated hidden layers feed a linear output layer that produces
    the mean. The standard deviation is not learned: it is always
    ``exp(0) = 1`` for every output element.
    """

    def __init__(self, num_inputs, num_outputs, hidden_size):
        """Create the three linear layers.

        Args:
            num_inputs: width of the input (state) vector.
            num_outputs: width of the output (action mean) vector.
            hidden_size: width of both hidden layers.
        """
        super().__init__()
        self.fc1 = nn.Linear(num_inputs, hidden_size)
        self.fc2 = nn.Linear(hidden_size, hidden_size)
        self.fc3 = nn.Linear(hidden_size, num_outputs)

        # Scale the output layer down (weights x0.1, bias zeroed) so the
        # initial means are small.
        with torch.no_grad():
            self.fc3.weight.mul_(0.1)
            self.fc3.bias.mul_(0.0)

    def forward(self, x):
        """Return ``(mu, std)`` for input ``x``; ``std`` is all ones, shaped like ``mu``."""
        hidden = x
        for layer in (self.fc1, self.fc2):
            hidden = torch.tanh(layer(hidden))
        mean = self.fc3(hidden)
        # Log-std is fixed at zero, so the std below is exactly one everywhere.
        log_std = torch.zeros_like(mean)
        return mean, log_std.exp()


class Critic(nn.Module):
    """Network that maps an input vector to a single scalar output.

    Two tanh-activated hidden layers are followed by a linear layer of width 1.
    """

    def __init__(self, num_inputs, hidden_size):
        """Create the three linear layers.

        Args:
            num_inputs: width of the input (state) vector.
            hidden_size: width of both hidden layers.
        """
        super().__init__()
        self.fc1 = nn.Linear(num_inputs, hidden_size)
        self.fc2 = nn.Linear(hidden_size, hidden_size)
        self.fc3 = nn.Linear(hidden_size, 1)

        # Scale the output layer down (weights x0.1, bias zeroed) so the
        # initial outputs are small.
        with torch.no_grad():
            self.fc3.weight.mul_(0.1)
            self.fc3.bias.mul_(0.0)

    def forward(self, x):
        """Return the scalar output ``v`` for input ``x`` (last dimension has size 1)."""
        hidden = x
        for layer in (self.fc1, self.fc2):
            hidden = torch.tanh(layer(hidden))
        return self.fc3(hidden)


class VDB(nn.Module):
    """Discriminator with a stochastic bottleneck.

    The input is encoded into a mean and log-variance of size ``z_size``, a
    latent code ``z`` is sampled from them, and ``z`` is passed through a
    small head that ends in a sigmoid.
    """

    def __init__(self, num_inputs, hidden_size, z_size):
        """Create the encoder and discriminator layers.

        Args:
            num_inputs: width of the input vector.
            hidden_size: width of the hidden layer in the encoder and in the
                discriminator head.
            z_size: width of the latent code.
        """
        super().__init__()
        # Encoder: shared hidden layer, then separate heads for mu and logvar.
        self.fc1 = nn.Linear(num_inputs, hidden_size)
        self.fc2 = nn.Linear(hidden_size, z_size)
        self.fc3 = nn.Linear(hidden_size, z_size)
        # Discriminator head operating on the latent code.
        self.fc4 = nn.Linear(z_size, hidden_size)
        self.fc5 = nn.Linear(hidden_size, 1)

        # Scale the final layer down (weights x0.1, bias zeroed), so the
        # initial pre-sigmoid output is small.
        with torch.no_grad():
            self.fc5.weight.mul_(0.1)
            self.fc5.bias.mul_(0.0)

    def encoder(self, x):
        """Return ``(mu, logvar)`` of the latent code for input ``x``."""
        features = torch.tanh(self.fc1(x))
        mu = self.fc2(features)
        logvar = self.fc3(features)
        return mu, logvar

    def reparameterize(self, mu, logvar):
        """Sample ``z = mu + std * eps`` with ``eps ~ N(0, I)`` and ``std = exp(logvar / 2)``."""
        std = (0.5 * logvar).exp()
        noise = torch.randn_like(std)
        return noise * std + mu

    def discriminator(self, z):
        """Map a latent code ``z`` to a value in (0, 1) via a tanh layer and a sigmoid."""
        hidden = torch.tanh(self.fc4(z))
        logit = self.fc5(hidden)
        return torch.sigmoid(logit)

    def forward(self, x):
        """Encode ``x``, sample a latent code, and return ``(prob, mu, logvar)``."""
        mu, logvar = self.encoder(x)
        prob = self.discriminator(self.reparameterize(mu, logvar))
        return prob, mu, logvar

In [13]:
# Layer widths used when building the networks.
HIDDEN_SIZE = 128
Z_SIZE = 4  # width of the VDB latent code

# Actor and critic take a state-sized input.
actor = Actor(num_inputs, num_actions, hidden_size=HIDDEN_SIZE)
critic = Critic(num_inputs, hidden_size=HIDDEN_SIZE)

# The VDB input width is the state width plus the action width
# (presumably a concatenated state-action vector -- confirm against the caller).
vdb = VDB(num_inputs + num_actions, hidden_size=HIDDEN_SIZE, z_size=Z_SIZE)