Scratch development for continual learning loop

What I want to do:
- There is a continual learning env that lets an agent learn on a stream of tasks
- At specified intervals, we can evaluate the continual learner on _every_ task in the training set

Things to consider:
- can I use the ML10 environments in this setting? goals are obscured, what does that mean?
- how many steps per env do I want? I think CW uses 1 million? check this, also, with PPO is this enough? (CW uses SAC)
- randomisation of task goals - CW seems to have randomisation settings for the benchmark. MT and ML have randomised environments. How should we handle these tasks?

### Other thoughts
Next step is to integrate the continual environment and evaluation loop into a training loop. We want to use the PPO algorithm provided (or something similar), but provide it with arbitrary policy / value networks (i.e. our own).

Testing learning in this way might require setting up some command line 'args' stuff - might be easier to match it with PPO
But is this the simplest testing method?

Should look at policy storage - how can I use that?

Also - continue to look at how to get continual environments to work. Continual world has randomization handlers, success counters etc.



In [1]:
import gym
import metaworld
import random
import numpy as np
import torch

from copy import deepcopy
from typing import Any, Dict, List, Tuple

In [5]:
from algorithms.custom_ppo import PPO

In [3]:
import sys
sys.path.append('./algorithms/')

In [None]:
#1. get prior at start for base latent
# (does this reset the hidden state? I think so)
#2. feed policy observation + latent -> gets action, obs, reward, done
#3. feed encoder action, obs, reward, done and hidden state to get next action


In [None]:
        
# evaluation loop across all environments
for env_name, env, in environments.items():

    # reset encoder to prior
    latent, hidden_state = self.get_prior_latent()

    for episode in range(num_episodes):
        obs, _ = env.reset()

        done = False
        episode_reward = 0
        episode_steps = 0
        success = 0

        while not done:

            _, act = self.policy.act(obs, latent, None, None)
            action = act.cpu().detach().numpy()[0]

            next_obs, reward, terminated, truncated, info = env.step(action)
            obs = next_obs
            done = terminated or truncated
            episode_reward += reward
            episode_steps += 1
            success = 1 if info['success']==1 else 0

            latent, hidden_state = self.update_latent(obs, act, reward, hidden_state)

            # break loop during eval if task success
            if success == 1:
                done = True

        episode_log[env_name]['episode_reward'].append(episode_reward / episode_steps)
        episode_log[env_name]['episode_len'].append(episode_steps)
        episode_log[env_name]['successes'].append(success)



In [3]:
### also some wrappers used 
# pop successes records the successes during training
# randomisation settings - seems that they set all tasks at the start
class ContinualEnv(gym.Env):
    """
    Based on continual world env design:
    https://github.com/awarelab/continual_world/blob/main/continualworld/envs.py
    """
    def __init__(self, envs: List[gym.Env], steps_per_env: int):

        ## good check to do
        for i in range(len(envs)):
            assert envs[0].action_space == envs[i].action_space

        self.action_space = envs[0].action_space
        self.observation_space = deepcopy(envs[0].observation_space)
        # what is remove goal bounds? don't think need for meta-learning

        self.envs = envs
        self.num_envs = len(envs)
        self.steps_per_env = steps_per_env
        self.steps_limit = self.num_envs * self.steps_per_env
        self.cur_step = 0
        self.cur_seq_idx = 0

    def _get_envs(self):
        return self.envs

    def step(self, action: Any) -> Tuple[np.ndarray, float, bool, Dict]:
        obs, reward, terminated, truncated, info = self.envs[self.cur_seq_idx].step(action)
        done = terminated or truncated
        info["seq_idx"] = self.cur_seq_idx

        self.cur_step += 1
        if self.cur_step % self.steps_per_env == 0:
            done = True
            info["TimeLimit.truncated"] = True

            self.cur_seq_idx += 1

        return obs, reward, done, info

    def reset(self) -> np.ndarray:
        obs, _, self.envs[self.cur_seq_idx].reset()
        return obs


In [4]:
import metaworld
import random

ml10 = metaworld.ML10() # Construct the benchmark, sampling tasks

training_envs = []
for name, env_cls in ml10.test_classes.items():
  env = env_cls()
  task = random.choice([task for task in ml10.test_tasks
                        if task.env_name == name])
  env.set_task(task)
  training_envs.append(env)

In [5]:
from a2c_ppo_acktr.algo import ppo
from a2c_ppo_acktr import storage, model

# get RL2 trained policy for example
RUN_FOLDER = './logs/logs_ML10-v2/rl2_73__25:10_21:13:08'

encoder_net = torch.load(RUN_FOLDER + '/models/encoder.pt')



In [6]:
import copy
import glob
import os
import time
from collections import deque

import gym
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

In [7]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# example training
### set the environment
env = training_envs[0]
### create a policy / actor critic
# policy_net = torch.load(RUN_FOLDER + '/models/policy.pt')
# we can update this
actor_critic = model.Policy(
    env.observation_space.shape,
    env.action_space,
    base_kwargs={'recurrent': True})
actor_critic.to(device)


### set algorithm
ppo_agent = ppo.PPO(
    actor_critic = actor_critic,
    clip_param = 0.2,
    ppo_epoch = 4,
    num_mini_batch = 10,
    value_loss_coef = 0.5,
    entropy_coef = 1.0e-5,
    lr = 1.0e-4,
    eps = 0.99,
    max_grad_norm = 0.5,
    use_clipped_value_loss = False
)

## create rollouts
rollouts = storage.RolloutStorage(
    10, 1, env.observation_space.shape, env.action_space, actor_critic.recurrent_hidden_state_size
)

# don't know
obs, _ = env.reset()
rollouts.obs[0].copy_(torch.from_numpy(obs))
rollouts.to(device)
episode_rewards = deque(maxlen = 10)

# actual training loop
start = time.time()
num_updates = 10
for j in range(num_updates):
    done = False
    step = 0
    while not done:
        with torch.no_grad():
            # this might be funny
            value, action, action_log_prob, recurrent_hidden_states = actor_critic.act(rollouts.obs[step], rollouts.recurrent_hidden_states[step], rollouts.masks[step], None)
            step += 1

        # step the env
        obs, reward, truncated, terminated, info = env.step(action.detach().to('cpu').numpy()[0])
        done = truncated or terminated


IndexError: index 11 is out of bounds for dimension 0 with size 11

In [None]:
value
action
action_log_prob
recurrent_hidden_states

In [None]:
def evaluate_all_envs(envs_to_do, num_episodes = 10, agent = None):

    ## TODO: find a way to insert the agent
    results = {i: dict() for i in range(len(envs_to_do))}
    for i, env in enumerate(envs_to_do):
        results[i] = evaluate_single_env(env, num_episodes)
        _, _ = env.reset()

    return results

def evaluate_single_env(env, num_episodes):
    results = {'episode_reward': [], 'success': []} 
    for episode in range(num_episodes):
        done = False
        episode_reward = 0
        success = 0
        episode_len = 0
        obs, _ = env.reset()

        while not done:

            action = env.action_space.sample()
            next_obs, reward, truncated, terminated, info = env.step(action)
            done = truncated or terminated
            obs = next_obs

            episode_reward += reward
            success = info['success']
            episode_len += 1

            # stop on success for eval
            if success == 1:
                done = True

        results['episode_reward'].append(episode_reward / episode_len)
        results['success'].append(success)

    return results



In [None]:
## training loop
# while the whole continual env is not done
# train on each env sequentially
# periodically evaluate on all envs
cont_env = ContinualEnv(training_envs, 10)
eval_freq = 10
num_steps = 0
eval_results = dict()
while cont_env.cur_step < cont_env.steps_limit:
    # do each env
    obs = cont_env.reset()
    done = False
    while not done:
        action = cont_env.action_space.sample()
        next_obs, reward, done, info = cont_env.step(action)
        obs = next_obs

        # periodically evaluate
        if  (cont_env.cur_step + 1) % eval_freq == 0:
            print(cont_env.cur_step, 'EVALUATING')
            all_envs = cont_env._get_envs()
            eval_results[cont_env.cur_step] = evaluate_all_envs(all_envs, num_episodes = 3)
            eval_results[cont_env.cur_step]['task'] = cont_env.cur_seq_idx
        
    print(cont_env.cur_step, cont_env.cur_seq_idx)


        

In [None]:
def unpack_results(results_dict, _iter):
    df = pd.DataFrame(results_dict[_iter])
    task = np.unique(df.task).item()
    res =df\
        .loc[:,[col for col in df.columns if col != 'task']]\
        .reset_index(
            names = 'metric'
        )\
        .melt(
            id_vars = 'metric',
            var_name = 'task'
        )\
        .explode('value')
    res.loc[:, 'iter'] = _iter
    res.loc[:, 'current_task'] = task
    return res


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

res = pd.concat([unpack_results(eval_results, i) for i in eval_results.keys()])

successes = res\
.query('metric=="success"')\
.groupby(['task', 'iter'])\
.agg(
    {
        'current_task':'max',
        'value': lambda x: sum(x) / len(x)
    }
)\
.reset_index()

rewards = res.query('metric=="episode_reward"')

fig, ax = plt.subplots(1,2, figsize = (15, 7))

sns.lineplot(
    data = rewards,
    x = 'iter',
    y = 'value',
    hue = 'task',
    ax = ax[0]
)

sns.lineplot(
    data = successes,
    x = 'iter',
    y = 'value',
    hue = 'task',
    ax = ax[1]
)

plt.show();

In [None]:
pd.json_normalize(eval_results)

In [None]:
import pandas as pd
df = pd.DataFrame(eval_results)
df

In [None]:
pd.json_normalize(df.iloc[0, 0], max_level=0)

In [None]:
[(k, [(i, pd.v) for i, v in v.items()]) for k, v in eval_results.items()]

In [None]:
{{'iter': k, 'value': pd.json_normalize(v)} for k, v in eval_results.items()}

In [None]:
eval_results.items()

In [None]:
obs, _ = env.reset()

action = env.action_space.sample()
env.step(action)