Scratch development for continual learning loop

What I want to do:
- There is a continual learning env that lets an agent learn on a stream of tasks
- At specified intervals, we can evaluate the continual learner on _every_ task in the training set

Things to consider:
- can I use the ML10 environments in this setting? goals are obscured, what does that mean?
- how many steps per env do I want? I think CW uses 1 million per task - check this. Also, is that enough when training with PPO? (CW uses SAC)
- randomisation of task goals - CW seems to have randomisation settings for the benchmark. MT and ML have randomised environments. How should we handle these tasks?

In [21]:
import gym
import metaworld
import random
import numpy as np

from copy import deepcopy
from typing import Any, Dict, List, Tuple

In [41]:
### also some wrappers used 
# pop successes records the successes during training
# randomisation settings - seems that they set all tasks at the start
class ContinualEnv(gym.Env):
    """Present a fixed sequence of environments as one training stream.

    Based on continual world env design:
    https://github.com/awarelab/continual_world/blob/main/continualworld/envs.py

    Each environment in ``envs`` receives ``steps_per_env`` calls to
    :meth:`step`. When that budget is used up the current episode is ended
    and the wrapper moves on to the next environment in the list.

    The wrapped environments are called with the 5-tuple step API
    ``(obs, reward, terminated, truncated, info)`` and a ``reset`` that
    returns ``(obs, info)``. This wrapper itself exposes a 4-tuple ``step``
    and an obs-only ``reset``.

    Args:
        envs: Environments to train on, in order. All must share the same
            action space.
        steps_per_env: Number of ``step`` calls spent on each environment.

    Raises:
        ValueError: If ``envs`` is empty or ``steps_per_env`` is not positive.
    """

    def __init__(self, envs: List[gym.Env], steps_per_env: int):
        if not envs:
            raise ValueError("ContinualEnv needs at least one environment")
        if steps_per_env <= 0:
            # A zero budget would make the modulo in step() divide by zero.
            raise ValueError("steps_per_env must be a positive integer")

        ## good check to do
        for env in envs:
            assert envs[0].action_space == env.action_space, (
                "all environments must share the same action space"
            )

        self.action_space = envs[0].action_space
        self.observation_space = deepcopy(envs[0].observation_space)
        # what is remove goal bounds? don't think need for meta-learning

        self.envs = envs
        self.num_envs = len(envs)
        self.steps_per_env = steps_per_env
        self.steps_limit = self.num_envs * self.steps_per_env
        self.cur_step = 0
        self.cur_seq_idx = 0

    def _get_envs(self):
        """Return the list of wrapped environments."""
        return self.envs

    def _check_steps_bound(self) -> None:
        """Raise if the total step budget has already been consumed.

        Without this check, stepping or resetting after the last environment
        is finished would index past the end of ``self.envs``.
        """
        if self.cur_step >= self.steps_limit:
            raise RuntimeError("Steps limit exceeded for ContinualEnv!")

    def step(self, action: Any) -> Tuple[np.ndarray, float, bool, Dict]:
        """Step the current environment and advance the task schedule.

        Returns:
            ``(obs, reward, done, info)`` where ``done`` is true if the
            wrapped env terminated or truncated, or if the per-env step
            budget was just exhausted. ``info["seq_idx"]`` holds the index
            of the environment that produced this transition.

        Raises:
            RuntimeError: If called after ``steps_limit`` steps were taken.
        """
        self._check_steps_bound()
        obs, reward, terminated, truncated, info = self.envs[self.cur_seq_idx].step(action)
        done = terminated or truncated
        info["seq_idx"] = self.cur_seq_idx

        self.cur_step += 1
        if self.cur_step % self.steps_per_env == 0:
            # Budget for this env is spent: cut the episode short so the
            # next reset() starts on the next environment.
            done = True
            info["TimeLimit.truncated"] = True

            self.cur_seq_idx += 1

        return obs, reward, done, info

    def reset(self) -> np.ndarray:
        """Reset the current environment and return its first observation.

        Raises:
            RuntimeError: If called after ``steps_limit`` steps were taken.
        """
        self._check_steps_bound()
        # The wrapped env returns (obs, info); only the observation is
        # passed on.
        obs, _ = self.envs[self.cur_seq_idx].reset()
        return obs


In [26]:
import metaworld
import random

# Construct the ML10 benchmark; doing so samples the tasks.
ml10 = metaworld.ML10()

# One environment instance per class, each fixed to a single randomly chosen
# task belonging to that class.
# NOTE(review): these are drawn from the *test* split (test_classes /
# test_tasks) even though the list is called training_envs - confirm whether
# train_classes / train_tasks was intended.
training_envs = []
for name, env_cls in ml10.test_classes.items():
    env = env_cls()
    candidates = [t for t in ml10.test_tasks if t.env_name == name]
    task = random.choice(candidates)
    env.set_task(task)
    training_envs.append(env)

In [40]:
# Sanity check: reset() on a raw env returns an (observation, info) tuple.
training_envs[0].reset()

(array([0.00615235, 0.60018983, 0.19430118, 1.        , 0.0357309 ,
        0.72999998, 0.09      , 1.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.00615235, 0.60018983,
        0.19430118, 1.        , 0.0357309 , 0.72999998, 0.09      ,
        1.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        ]),
 {})

In [34]:
def evaluate_envs(envs_to_do, eval_episodes=10, agent=None):
    """Evaluate a policy on every environment in ``envs_to_do``.

    Each environment is run for ``eval_episodes`` episodes. An episode ends
    when the env reports terminated or truncated, or as soon as
    ``info['success']`` equals 1.

    Args:
        envs_to_do: Iterable of environments using the 5-tuple step API
            ``(obs, reward, terminated, truncated, info)`` and a ``reset``
            returning ``(obs, info)``.
        eval_episodes: Number of episodes to run per environment.
        agent: Optional callable mapping an observation to an action. When
            ``None``, actions are sampled from the env's action space.

    Returns:
        A list with one entry per environment (same order as
        ``envs_to_do``). Each entry maps the episode index to
        ``{'episode_reward': ..., 'success': ...}``.
    """
    all_results = []
    for env in envs_to_do:
        env_results = {}
        for episode in range(eval_episodes):
            obs, _ = env.reset()
            episode_reward = 0
            success = 0
            done = False

            while not done:
                if agent is None:
                    action = env.action_space.sample()
                else:
                    action = agent(obs)

                # Step the env under evaluation itself, never the global
                # continual env, so evaluation cannot consume training steps.
                obs, reward, terminated, truncated, info = env.step(action)
                episode_reward += reward
                success = info['success']

                # stop on success for eval
                done = terminated or truncated or success == 1

            env_results[episode] = {
                'episode_reward': episode_reward,
                'success': success,
            }
        all_results.append(env_results)

    return all_results


# Backward-compatible alias for the original (misspelled) function name.
evalute_envs = evaluate_envs



def evaluate_env(env, num_episodes):
    """Run random-action evaluation episodes on a single environment.

    An episode ends when the env reports terminated or truncated, or as soon
    as ``info['success']`` equals 1.

    Args:
        env: Environment using the 5-tuple step API
            ``(obs, reward, terminated, truncated, info)`` and a ``reset``
            returning ``(obs, info)``.
        num_episodes: Number of episodes to run.

    Returns:
        Dict mapping the episode index to
        ``{'episode_reward': ..., 'success': ...}``, where ``success`` is the
        value reported by the last step of that episode.
    """
    results = {}
    for episode in range(num_episodes):
        env.reset()
        episode_reward = 0
        success = 0
        done = False

        while not done:
            action = env.action_space.sample()
            _, reward, terminated, truncated, info = env.step(action)

            episode_reward += reward
            success = info['success']

            # stop on success for eval
            done = terminated or truncated or success == 1

        results[episode] = {
            'episode_reward': episode_reward,
            'success': success,
        }

    return results



50

In [38]:
## training loop
# while the whole continual env is not done
# train on each env sequentially
# periodically evaluate on all envs
cont_env = ContinualEnv(training_envs, 10)
num_steps = 0
while cont_env.cur_step < cont_env.steps_limit:
    # Run one episode on whichever env is current. The episode ends when the
    # underlying env finishes or the per-env step budget runs out.
    obs = cont_env.reset()
    done = False
    while not done:
        action = cont_env.action_space.sample()
        next_obs, reward, done, info = cont_env.step(action)
        obs = next_obs

    print(cont_env.cur_step, cont_env.cur_seq_idx)

    # periodically evaluate
    # NOTE: as written this runs after every training episode; there is no
    # evaluation interval yet.
    all_envs = cont_env._get_envs()
    # The helper is defined under the spelling `evalute_envs`; calling
    # `evaluate_envs` here raised a NameError.
    eval_results = evalute_envs(all_envs)
        

10 1
20 2
30 3
40 4
50 5


In [7]:
# Take the first env and display its action space.
env = training_envs[0]
env.action_space

Box(-1.0, 1.0, (4,), float64)

In [10]:
# Reset the env, then take a single random action to inspect what step()
# returns (a 5-element tuple, per the output below).
obs, _ = env.reset()

action = env.action_space.sample()
env.step(action)

(array([ 5.71689176e-03,  6.00775945e-01,  1.94656428e-01,  1.00000000e+00,
         2.16917725e-02,  6.75859067e-01,  1.99142488e-02,  3.53522718e-04,
         1.89404037e-04,  2.59762152e-09,  9.99999920e-01,  0.00000000e+00,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00,  0.00000000e+00,  6.15235164e-03,  6.00189803e-01,
         1.94301175e-01,  1.00000000e+00,  2.16845459e-02,  6.75872549e-01,
         1.99999996e-02,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         1.00000000e+00,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
        -7.60647727e-02,  8.09090702e-01,  2.14756129e-01]),
 1.4191217005275105,
 False,
 False,
 {'success': 0.0,
  'near_object': 0.2327727045644655,
  'grasp_success': 1.0,
  'grasp_reward': 0.2327727045644655,
  'in_place_reward': 0.14191217005275106,
  'obj_to_target': 0.2327727045644655,
  'unscaled_reward':