In [1]:
import matplotlib.pyplot as plt
%matplotlib inline

ENVS = ['Ant-v2', 'Hopper-v2', 'HalfCheetah-v2', 'Humanoid-v2', 'Reacher-v2', 'Walker2d-v2']
EXPERTS_DIR = 'experts/'

In [2]:
import tensorflow as tf
import tf_util
import gym
import warnings

REFERENCE_SEED = 42

def simulation(task_name, policy_fn, max_steps=None, max_episodes=1, seed=REFERENCE_SEED):
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        env = gym.make(task_name)
        with tf.Session():
            tf_util.initialize()
            env.seed(seed)
            episode_ix = 0
            step_ix = 0
            while not max_episodes or episode_ix < max_episodes:
                observation = env.reset()
                done = False                

                while not done:
                    action = policy_fn(observation[None, :])

                    sample = {
                        'observation': observation,
                        'action': action.reshape(-1)
                    }

                    observation, reward, done, _ = env.step(action)

                    sample['reward'] = reward

                    yield sample

                    if max_steps and step_ix >= max_steps:
                        break

                    step_ix += 1

                if max_steps and step_ix >= max_steps:
                    break

                episode_ix += 1

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [3]:
from load_policy import load_policy
import os
import pickle


DEMONSTRATION_STEPS = 100000
DEMONSTRATIONS_DIR = 'demonstrations'


def load_expert_policy(task_name):
    policy_file_path = os.path.join(EXPERTS_DIR, f'{task_name}.pkl')
    return load_policy(policy_file_path)


def save_demonstration(demonstration_file_path, demonstration):
    with open(demonstration_file_path, 'wb') as fp:
        pickle.dump(demonstration, fp)

        
def load_demonstration(demonstration_file_path):
    with open(demonstration_file_path, 'rb') as fp:
        return pickle.load(fp)


def get_demonstration(task_name, steps=100000, seed=42):
    os.makedirs(DEMONSTRATIONS_DIR, exist_ok=True)
    demonstration_file_path = os.path.join(DEMONSTRATIONS_DIR, f'{task_name}-demonstration-steps-{steps}-seed-{seed}.pkl')
    
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        if not os.path.exists(demonstration_file_path):
            demonstration = simulation(task_name, load_expert_policy(task_name), max_episodes=None, max_steps=steps, seed=seed)
            save_demonstration(demonstration_file_path, list(demonstration))
    
    return load_demonstration(demonstration_file_path)

In [4]:
import torch

def get_model(observation_size, action_size, hidden_sizes=None, divider=5):   
    if hidden_sizes is None:
        hidden_sizes = []
        hidden_size = int(observation_size / divider)
        while hidden_size > action_size*1.5:
            hidden_sizes.append(hidden_size)
            hidden_size = int(hidden_size / divider)
        
    shapes = [observation_size] + hidden_sizes + [action_size]
    
    layers = []
    for ix in range(len(shapes) - 2):
        layers.append(torch.nn.Linear(shapes[ix], shapes[ix + 1]))
        layers.append(torch.nn.ReLU())
        
    layers.append(torch.nn.Linear(shapes[-2], shapes[-1]))
    
    return torch.nn.Sequential(*layers)

In [56]:
from torch.utils.data import Dataset

class Simulation(Dataset):
    def __init__(self, simulation_data):
        self.simulation_data = simulation_data
        
    def __len__(self):
        return len(self.simulation_data)

    def __getitem__(self, idx):
        return {
            'observation': self.simulation_data[idx]['observation'].astype('float32'),
            'action': self.simulation_data[idx]['action'].astype('float32')
        }
    
    def __add__(self, other):
        return Simulation(self.simulation_data + other.simulation_data)
    
    def __iadd__(self, other):
        return self + other

In [43]:
from torch.utils.data import DataLoader

def model_to_policy(model):
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    def policy_fn(observation):
        with torch.no_grad():
            return model(torch.tensor(observation.reshape(1, -1).astype('float32')).to(device)).cpu().numpy().reshape(1, -1)
    
    return policy_fn

def get_eval_fn(task_name, seed=111):
    # TODO - evaluate on multiple seeds
    def eval_fn(model):
        results = simulation(task_name, model_to_policy(model), seed=seed, max_episodes=1)
        rewards = [result['reward'] for result in results]
        return sum(rewards)
    return eval_fn

def train_model(model, dataset, epochs, batch_size=1024, shuffle=True, learning_rate=1e-3, loss_fn=None, eval_fn=None, verbose=True):
    loader = DataLoader(dataset, batch_size=batch_size, shuffle=shuffle, drop_last=True)
    
    if loss_fn is None:
        loss_fn = torch.nn.MSELoss(reduction='sum')
        
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
    
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    model = model.to(device)
    
    losses = []
    test_rewards = []
    for epoch in range(epochs):
        epoch_losses = []
        for batch in loader:
            action_pred = model(batch['observation'].to(device))
            loss = loss_fn(action_pred, batch['action'].to(device))
            optimizer.zero_grad()
            loss.backward()        
            optimizer.step()
            epoch_losses.append(loss.item())
        
        epoch_loss = sum(epoch_losses)/len(epoch_losses) if len(epoch_losses) > 0 else 0
        losses.append(epoch_loss)
        
        if eval_fn is not None:
            test_reward = eval_fn(model)
            test_rewards.append(test_reward)
            if verbose:
                print(f'Epoch {epoch + 1}: loss = {round(epoch_loss, 2)}, test_reward = {round(test_reward, 2)}')
        else:
            if verbose:
                print(f'Epoch {epoch + 1}: loss = {round(epoch_loss, 2)}')
        
    return losses, test_rewards
    

def train_behavioral_cloning(demonstration, epochs, batch_size=1024, learning_rate=1e-3, eval_fn=None, verbose=True):
    observation_size = demonstration[0]['observation'].shape[0]
    action_size = demonstration[0]['action'].shape[0]
    dataset = Simulation(demonstration)
    model = get_model(observation_size, action_size)
    
    lossed, test_rewards = train_model(model, dataset, epochs, batch_size=batch_size, learning_rate=learning_rate, eval_fn=eval_fn, verbose=verbose)
    return model, lossed, test_rewards

In [7]:
models_bc = {}
for env in ENVS:
    print(f'Training {env}')
    model, losses, test_rewards = train_behavioral_cloning(get_demonstration(env), epochs=15, eval_fn=get_eval_fn(env), verbose=True)
    test_results = simulation(env, model_to_policy(model), seed=REFERENCE_SEED)
                              
    models_bc[env] = {
        'model': model,
        'losses': losses,
        'test_reward': sum([result['reward'] for result in test_results])
    }
    
    expert_results = simulation(env, load_expert_policy(env), max_episodes=1, seed=REFERENCE_SEED)
    expert_reward = sum([result['reward'] for result in expert_results])
    
    print(f'Trained with test reward {models_bc[env]["test_reward"]} (expert has {expert_reward}).')

W0816 14:06:36.326490 140180404811200 deprecation_wrapper.py:119] From /home/michaldvorak/projects/courses/cs294/cs294-hws/hw1/load_policy.py:57: The name tf.placeholder is deprecated. Please use tf.compat.v1.placeholder instead.



Training Ant-v2


W0816 14:06:37.212540 140180404811200 deprecation.py:323] From /home/michaldvorak/projects/courses/cs294/cs294-hws/hw1/tf_util.py:118: all_variables (from tensorflow.python.ops.variables) is deprecated and will be removed after 2017-03-02.
Instructions for updating:
Please use tf.global_variables instead.
W0816 14:06:37.213543 140180404811200 deprecation_wrapper.py:119] From /home/michaldvorak/projects/courses/cs294/cs294-hws/hw1/tf_util.py:97: The name tf.get_default_session is deprecated. Please use tf.compat.v1.get_default_session instead.

W0816 14:06:37.214074 140180404811200 deprecation.py:323] From /home/michaldvorak/.cache/pypoetry/virtualenvs/cs294-hw1-py3.7/lib/python3.7/site-packages/tensorflow/python/util/tf_should_use.py:193: initialize_variables (from tensorflow.python.ops.variables) is deprecated and will be removed after 2017-03-02.
Instructions for updating:
Use `tf.variables_initializer` instead.


Epoch 1: loss = 381.93, test_reward = 981.31
Epoch 2: loss = 159.68, test_reward = 1272.92
Epoch 3: loss = 117.01, test_reward = 124.19
Epoch 4: loss = 96.12, test_reward = 1678.24
Epoch 5: loss = 83.5, test_reward = 1928.89
Epoch 6: loss = 73.5, test_reward = 2082.23
Epoch 7: loss = 64.66, test_reward = 1849.33
Epoch 8: loss = 56.92, test_reward = 3510.02
Epoch 9: loss = 50.51, test_reward = 3782.06
Epoch 10: loss = 45.31, test_reward = 4443.77
Epoch 11: loss = 41.16, test_reward = 4276.66
Epoch 12: loss = 37.87, test_reward = 4217.24
Epoch 13: loss = 35.15, test_reward = 4307.15
Epoch 14: loss = 32.92, test_reward = 4668.71
Epoch 15: loss = 31.31, test_reward = 4626.72
Trained with test reward 4632.286540747935 (expert has 4916.8448828056).
Training Hopper-v2
Epoch 1: loss = 12514.63, test_reward = 176.38
Epoch 2: loss = 6587.25, test_reward = 57.58
Epoch 3: loss = 4309.12, test_reward = 38.8
Epoch 4: loss = 3483.54, test_reward = 40.26
Epoch 5: loss = 3151.11, test_reward = 40.14
Ep

In [61]:
import random


def train_dagger(task_name, episodes=20):
    # Set up an expert policy for the task
    expert_policy = load_expert_policy(task_name)
    
    # Train a model using behavioral cloning first
    expert_results = list(simulation(task_name,  expert_policy, max_episodes=1, seed=random.randint(0,1000)))
    observation_size = expert_results[0]['observation'].shape[0]
    action_size = expert_results[0]['action'].shape[0]
    dataset = Simulation(expert_results)
    model = get_model(observation_size, action_size)
    dataset = Simulation(expert_results)
    batch_size = min(512, 2 ** int(np.log2(len(dataset))))
    train_model(model, dataset, batch_size=batch_size, epochs=15, eval_fn=get_eval_fn(task_name), verbose=False)
    
    # Compute reference reward for the task
    reference_results = simulation(env, load_expert_policy(env), max_episodes=1, seed=REFERENCE_SEED)
    reference_reward = sum([result['reward'] for result in expert_results])
    
    # For several episodes do
    for episode in range(episodes):
        # Simulate the models behaviour in the gym
        model_results = simulation(task_name,  model_to_policy(model), max_episodes=1, seed=random.randint(0,1000))
        
        # For the observations encountered in the simulation, find ground truths using an expert policy
        expert_labeling = [
            {
                'observation': result['observation'],
                'action': expert_policy(result['observation'].reshape(1, -1)).reshape(-1)
            }
            for result in model_results
        ]
        
        # Aggregate datasets
        dataset += Simulation(expert_labeling)
        
        # Train model using the aggregated dataset
        batch_size = min(512, 2 ** int(np.log2(len(dataset))))
        total_steps = 100000
        epochs = int(total_steps / len(dataset)) + 1
        train_model(model, dataset, batch_size=batch_size, epochs=epochs, eval_fn=get_eval_fn(task_name), verbose=False)
        
        test_results = simulation(task_name, model_to_policy(model), seed=REFERENCE_SEED)
    
        print(f'Current test reward {sum([result["reward"] for result in test_results])} (expert has {expert_reward}).')

In [63]:
for env in ENVS:
    print(f'Training DAgger {env}')
    train_dagger(env)

Training Ant-v2
Current test reward 1024.1180524975766 (expert has 5573.229261637097).
Current test reward 875.364491336192 (expert has 5573.229261637097).
Current test reward 1117.3069519722367 (expert has 5573.229261637097).
Current test reward 3606.7900773617393 (expert has 5573.229261637097).
Current test reward 4224.554977761744 (expert has 5573.229261637097).
Current test reward 4509.974596031048 (expert has 5573.229261637097).
Current test reward 4462.199129061074 (expert has 5573.229261637097).
Current test reward 4413.0848901239315 (expert has 5573.229261637097).
Current test reward 4607.194981554653 (expert has 5573.229261637097).
Current test reward 4165.257273123266 (expert has 5573.229261637097).
Current test reward 4721.180581602416 (expert has 5573.229261637097).
Current test reward 4756.178484701388 (expert has 5573.229261637097).
Current test reward 4588.653749438652 (expert has 5573.229261637097).
Current test reward 4620.150873353496 (expert has 5573.229261637097).
C