Scratch development for continual learning loop

What I want to do:
- There is a continual learning env that lets an agent learn on a stream of tasks
- At specified intervals, we can evaluate the continual learner on _every_ task in the training set

Things to consider:
- can I use the ML10 environments in this setting? goals are obscured, what does that mean?
- how many steps per env do I want? I think CW uses 1 million? check this, also, with PPO is this enough? (CW uses SAC)
- randomisation of task goals - CW seems to have randomisation settings for the benchmark. MT and ML have randomised environments. How should we handle these tasks?

### Other thoughts
Next step is to integrate the continual environment and evaluation loop into a training loop. We want to use the PPO algorithm provided (or something similar), but provide it with arbitrary policy / value networks (i.e. our own).

Testing learning in this way might require setting up some command line 'args' stuff - might be easier to match it with PPO
But is this the simplest testing method?

Should look at policy storage - how can I use that?

Also - continue to look at how to get continual environments to work. Continual world has randomization handlers, success counters etc.



In [26]:
# import sys
# sys.path.append('./algorithms/')
import gym
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F

from copy import deepcopy
from typing import Any, Dict, List, Tuple

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [2]:
from algorithms.custom_ppo import CustomPPO
from models.combined_actor_critic import ActorCritic
RUN_FOLDER = './rl2_baseline/rl2_double_baseline'
policy_net_agent = torch.load(RUN_FOLDER + '/models/policy.pt', map_location=device)
encoder_net_agent = torch.load(RUN_FOLDER + '/models/encoder.pt', map_location=device)
actor_critic = ActorCritic(policy_net_agent, encoder_net_agent)

# agent = CustomPPO(
#                  actor_critic=actor_critic,
#                  value_loss_coef=1,
#                  entropy_coef=1,
#                  policy_optimiser='Adam',
#                  lr=0.1,
#                  clip_param=0.2,
#                  ppo_epoch=5,
#                  num_mini_batch=5,
#                  max_grad_norm = 0.5,
#                  eps=None,
#                  use_huber_loss=True,
#                  use_clipped_value_loss=True,
#                  context_window = None
#                  )

In [117]:
## create envs
from environments.custom_env_utils import prepare_parallel_envs, prepare_base_envs
from environments.custom_metaworld_benchmark import ML3
num_processes=2
raw_envs = prepare_base_envs(
    ['push-v2'], benchmark=ML3(), task_set='train', randomization='deterministic')
    # task_names, benchmark = MT50, task_set = 'train', randomization="random_init_fixed20"
envs = prepare_parallel_envs(
    envs = raw_envs,
    steps_per_env=500,
    num_processes=num_processes,
    gamma=0.99,
    normalise_rew=True,
    device=device,
    seed = 73
)

In [28]:
class dummy_agent1:
    def __init__(self, actor_critic):
        self.actor_critic = actor_critic

    def act(self, state, latent, belief, task, deterministic = False):
        return self.actor_critic.act(state, latent, belief, task, deterministic)
    
    def get_value(self, state, latent, belief, task):
        return self.actor_critic.get_value(state, latent, belief, task)
    
    def get_latent(self, action, state, reward, hidden_state, return_prior = False):
        _, latent_mean, latent_logvar, hidden_state = self.actor_critic.encoder(action, state, reward, hidden_state, return_prior = return_prior)
        latent = torch.cat((latent_mean.clone(), latent_logvar.clone()), dim = -1).squeeze()
        ## assume always add non-linearity to latent
        # return F.relu(latent[None,:]), hidden_state
        return F.relu(latent), hidden_state
    
    def get_prior(self, num_processes):
        _, latent_mean, latent_logvar, hidden_state = self.actor_critic.encoder.prior(num_processes)
        latent = torch.cat((latent_mean.clone(), latent_logvar.clone()), dim=-1).squeeze()
        ## assume always add non-linearity to latent
        return F.relu(latent), hidden_state

class dummy_agent2:
    def __init__(self, actor_critic):
        self.actor_critic = actor_critic

    def act(self, state, latent, belief, task, deterministic = False):
        return self.actor_critic.act(state, latent, belief, task, deterministic)
    
    def get_value(self, state, latent, belief, task):
        return self.actor_critic.get_value(state, latent, belief, task)
    
    def get_latent(self, action, state, reward, hidden_state, return_prior = False):
        _, latent_mean, latent_logvar, hidden_state = self.actor_critic.encoder(action, state, reward, hidden_state, return_prior = return_prior)
        latent = torch.cat((latent_mean.clone(), latent_logvar.clone()), dim = -1)
        ## assume always add non-linearity to latent
        return F.relu(latent[None,:]), hidden_state
    
    def get_prior(self, num_processes):
        _, latent_mean, latent_logvar, hidden_state = self.actor_critic.encoder.prior(num_processes)
        latent = torch.cat((latent_mean.clone(), latent_logvar.clone()), dim=-1)
        ## assume always add non-linearity to latent
        return F.relu(latent), hidden_state

In [119]:
RUN_FOLDER = './rl2_baseline/rl2_double_baseline'
policy_net_agent = torch.load(RUN_FOLDER + '/models/policy.pt', map_location=device)
encoder_net_agent = torch.load(RUN_FOLDER + '/models/encoder.pt', map_location=device)
actor_critic = ActorCritic(policy_net_agent, encoder_net_agent)
agent = dummy_agent1(actor_critic)
eps = 0
episode_reward=[]
episode_norm_reward=[]

# steps limit is parameter for whole continual env
while envs.get_env_attr('cur_step') < envs.get_env_attr('steps_limit'):

    step = 0
    obs = envs.reset() # we reset all at once as metaworld is time limited
    current_task = envs.get_env_attr("cur_seq_idx")
    episode_reward = []
    successes = []
    gating_values = []
    done = [False for _ in range(num_processes)]

    with torch.no_grad():

        latent, hidden_state = agent.get_prior(num_processes)
        # print(latent.size(), hidden_state.size())

    while not all(done):
        with torch.no_grad():
            value, action = agent.act(None, latent, None, None, deterministic=True)

        next_obs, (rew_raw, rew_normalised), done, info = envs.step(action.squeeze(0))
        assert all(done) == any(done), "Metaworld envs should all end simultaneously"
        print(next_obs.size(), rew_raw.size(), done, info, action.size())
        raise ValueError

        # create mask for episode ends
        masks_done = torch.FloatTensor([[0.0] if _done else [1.0] for _done in done]).to(device)

        ## combine all rewards
        episode_reward.append(rew_raw)
        episode_norm_reward.append(rew_normalised)
        # if we succeed at all then the task is successful
        successes.append(torch.tensor([i['success'] for i in info]))

        latent, hidden_state = agent.get_latent(
            action, next_obs, rew_raw, hidden_state, return_prior = False
        )
        # _, mu, logvar, hidden_state = (
        #     agent
        #     .actor_critic
        #     .encoder(action, next_obs, rew_raw, hidden_state, return_prior=False)
        # )
        # latent = F.relu(torch.cat((mu, logvar), dim = -1).squeeze())
            

        obs = next_obs

        step += 1
    eps+=1
envs.close()

torch.Size([2, 40]) torch.Size([2, 1]) [False False] ({'success': 0.0, 'near_object': 0.0, 'grasp_success': 0.0, 'grasp_reward': 0.01785945696830044, 'in_place_reward': 0.16334190586893652, 'obj_to_target': 0.2035419559095121, 'unscaled_reward': 0.03571891393660088, 'seq_idx': 0, 'env_name': 'push-v2', 'env': '<metaworld.envs.mujoco.sawyer_xyz.v2.sawyer_push_v2.SawyerPushEnvV2 object at 0x0000021AE2F8F2B0>'}, {'success': 0.0, 'near_object': 0.0, 'grasp_success': 0.0, 'grasp_reward': 0.01785945696830044, 'in_place_reward': 0.16334190586893652, 'obj_to_target': 0.2035419559095121, 'unscaled_reward': 0.03571891393660088, 'seq_idx': 0, 'env_name': 'push-v2', 'env': '<metaworld.envs.mujoco.sawyer_xyz.v2.sawyer_push_v2.SawyerPushEnvV2 object at 0x0000020DD986F2B0>'}) torch.Size([2, 4])


ValueError: 

In [114]:
torch.stack(episode_reward).mean()

tensor(0.0368)

In [115]:
torch.stack(successes).mean()

tensor(0.)

In [105]:
import random
## try creating a base env
env_name='pick-place-v2'
ml3 = ML3()
envs = ml3.test_classes[env_name]()
task = random.choice([task for task in ml3.test_tasks if task.env_name==env_name])
envs.set_task(task)

In [106]:
RUN_FOLDER = './rl2_baseline/rl2_double_baseline'
policy_net_agent = torch.load(RUN_FOLDER + '/models/policy.pt', map_location=device)
encoder_net_agent = torch.load(RUN_FOLDER + '/models/encoder.pt', map_location=device)
actor_critic = ActorCritic(policy_net_agent, encoder_net_agent)
agent = dummy_agent1(actor_critic)
for i in range(10):
    episode_reward=[]

    step = 0
    obs = envs.reset() # we reset all at once as metaworld is time limited
    episode_reward = []
    successes = []
    done=False


    with torch.no_grad():

        latent, hidden_state = agent.get_prior(1)
        # print(latent.size(), hidden_state.size())

    while not done:
        with torch.no_grad():
            value, action = agent.act(None, latent, None, None, deterministic=False)

        next_obs, reward,truncated, terminated, info = envs.step(action.numpy())
        done = truncated or terminated
        next_obs = np.concatenate((next_obs,[0.0]))

        # create mask for episode ends
        # masks_done = torch.FloatTensor([[0.0] if _done else [1.0] for _done in done]).to(device)

        ## combine all rewards
        episode_reward.append(reward)
        # if we succeed at all then the task is successful
        successes.append(info['success'])

        latent, hidden_state = agent.get_latent(
            torch.tensor(action).float().unsqueeze(0), 
            torch.tensor(next_obs).float().unsqueeze(0), 
            torch.tensor(reward).float().unsqueeze(0).unsqueeze(0), 
            hidden_state, 
            return_prior = False
        )
            

        obs = next_obs

        step += 1
    print(torch.tensor(episode_reward).mean())

envs.close()


  torch.tensor(action).float().unsqueeze(0),


tensor(0.2656)
tensor(0.0224, dtype=torch.float64)
tensor(0.0783)
tensor(0.0195, dtype=torch.float64)
tensor(0.0249)
tensor(0.0410)
tensor(0.0130)
tensor(0.0562, dtype=torch.float64)
tensor(0.0373)
tensor(0.0290, dtype=torch.float64)


In [101]:
torch.tensor(episode_reward).mean()

tensor(9.2883)

In [97]:
print(
    torch.tensor(action).float().unsqueeze(0).size(), 
    torch.tensor(next_obs).float().unsqueeze(0).size(), 
    torch.tensor(reward).float().unsqueeze(0).unsqueeze(0).size()
)

torch.Size([1, 4]) torch.Size([1, 40]) torch.Size([1, 1])


  torch.tensor(action).float().unsqueeze(0).size(),


In [88]:
np.concatenate((next_obs,[0.0]))

array([ 6.24888659e-03,  5.99651668e-01,  1.94001019e-01,  1.00000000e+00,
        4.65829553e-02,  6.01204335e-01,  1.99124146e-02, -1.95931737e-04,
       -2.96176911e-04, -1.62033349e-08,  9.99999937e-01,  0.00000000e+00,
        0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
        0.00000000e+00,  0.00000000e+00,  6.15235164e-03,  6.00189803e-01,
        1.94301175e-01,  1.00000000e+00,  4.65942126e-02,  6.01196870e-01,
        1.99999996e-02,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
        1.00000000e+00,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
        0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
        0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00])

In [130]:
random.choice([task for task in ml3.train_classes.keys() if task == 'reach-v2'])

'reach-v2'

In [140]:
import gym
import metaworld
import random

from environments.custom_metaworld_benchmark  import ML3

class ML3SingleEnv(gym.Env):

    def __init__(self, env_name, train=False):
        self.benchmark= ML3()
        self.train = train
        self.env_name = env_name
        if train:
            self.tasks = [task for task in self.benchmark.train_tasks if task.env_name==self.env_name]
            self.env_cls = self.benchmark.train_classes[self.env_name]
        else:
            self.tasks = [task for task in self.benchmark.test_tasks if task.env_name==self.env_name]
            self.env_cls = self.benchmark.test_classes[self.env_name]

        ## set the task
        self.env = self.env_cls()
        self.task = random.choice(self.tasks)
        self.env.set_task(self.task)
        self.observation_space = self.env.observation_space
        self.action_space = self.env.action_space

        # metaworld max steps - hardcoded
        self._max_episode_steps = 500

    def step(self, action):
        obs, reward, terminated, truncated, info = self.env.step(action)
        done = terminated or truncated
        info['task'] = self.task
        return obs, reward, done, info
    
    def reset(self):
        obs, _ = self.env.reset()
        return obs
    
    def get_task(self):
        return self.task
    
    ## reset_task is automatically created in make_env using set_task
    def set_task(self, task = None):
        if task is None:
            if self.train:
                task = random.choice(
                    [task for task in self.benchmark.train_tasks if task.env_name==self.env_name]
                    )
            else: 
                task = random.choice(
                    [task for task in self.benchmark.test_tasks if task.env_name==self.env_name]
                    )

        self.task = task
        self.env.set_task(self.task)

    # duplicated for varibad temporarily
    def reset_task(self, task = None):
        if task is None:
            if self.train:
                task = random.choice(
                    [task for task in self.benchmark.train_tasks if task.env_name==self.env_name]
                    )
            else: 
                task = random.choice(
                    [task for task in self.benchmark.test_tasks if task.env_name==self.env_name]
                    )

        self.task = task
        self.env.set_task(self.task)


In [141]:
train_ml3_reach = ML3SingleEnv('reach-v2', train=False)

In [143]:
train_ml3_reach.get_task()

Task(env_name='reach-v2', data=b'\x80\x04\x954\x01\x00\x00\x00\x00\x00\x00}\x94(\x8c\x08rand_vec\x94\x8c\x15numpy.core.multiarray\x94\x8c\x0c_reconstruct\x94\x93\x94\x8c\x05numpy\x94\x8c\x07ndarray\x94\x93\x94K\x00\x85\x94C\x01b\x94\x87\x94R\x94(K\x01K\x06\x85\x94h\x05\x8c\x05dtype\x94\x93\x94\x8c\x02f8\x94\x89\x88\x87\x94R\x94(K\x03\x8c\x01<\x94NNNJ\xff\xff\xff\xffJ\xff\xff\xff\xffK\x00t\x94b\x89C0\xba\xf1\xc6\xc4}L\xb7?\x0f\xbdUV\x05f\xe5?\x00\x00\x00@\xe1z\x94?dM/\x18~\xb5\xa1?\x9a\xf4\x93(Mf\xeb?\xe0b\xebH<\x96\xce?\x94t\x94b\x8c\x07env_cls\x94\x8c3metaworld.envs.mujoco.sawyer_xyz.v2.sawyer_reach_v2\x94\x8c\x10SawyerReachEnvV2\x94\x93\x94\x8c\x14partially_observable\x94\x89u.')

In [5]:
import torch
from environments.parallel_envs import make_vec_envs, make_env
from environments.env_utils.running_mean_std import RunningMeanStd

# from models import encoder, policy
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [12]:
# env_id, seed, rank, episodes_per_task, tasks, add_done_info, **kwargs
env_fn = [make_env(
    env_id='ML_3_single-v2',
    seed=73,
    rank=0+i,
    episodes_per_task=1,
    tasks=None,
    add_done_info=True,
    **{'task_name':'reach-v2', 'train':True}
    ) for i in range(5)]


In [16]:
from environments.parallel_envs import SubprocVecEnv

envs = SubprocVecEnv(env_fn)

In [18]:
envs.close()

In [11]:
env.reset()

array([ 0.00615235,  0.6001898 ,  0.19430117,  1.        , -0.02596853,
        0.68842153,  0.02      ,  0.        ,  0.        ,  0.        ,
        1.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.00615235,  0.6001898 ,
        0.19430117,  1.        , -0.02596853,  0.68842153,  0.02      ,
        0.        ,  0.        ,  0.        ,  1.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ])

In [35]:
envs.close()

In [58]:
from algorithms.custom_ppo import CustomPPO
from models.combined_actor_critic import ActorCritic

num_processes=5
envs = make_vec_envs(
    env_name='ML_3_single-v2',
    seed=73, 
    num_processes=num_processes, 
    gamma=0.99,
    device=device, 
    episodes_per_task=1,
    normalise_rew=True, 
    ret_rms=None, 
    tasks=None,
    rank_offset=0,
    add_done_info=True,
    **{'task_name':'pick-place-v2', 'train':False}
)

RUN_FOLDER = './rl2_baseline/rl2_double_baseline'
policy_net_agent = torch.load(RUN_FOLDER + '/models/policy.pt', map_location=device)
encoder_net_agent = torch.load(RUN_FOLDER + '/models/encoder.pt', map_location=device)
actor_critic = ActorCritic(policy_net_agent, encoder_net_agent)
agent = dummy_agent1(actor_critic)
for i in range(10):
    episode_reward=[]

    step = 0
    obs = envs.reset() # we reset all at once as metaworld is time limited
    episode_reward = []
    successes = []
    done=[False for _ in range(num_processes)]


    with torch.no_grad():

        latent, hidden_state = agent.get_prior(num_processes)

    while not all(done):
        with torch.no_grad():
            value, action = agent.act(None, latent, None, None, deterministic=False)

        next_obs, (rew_raw, rew_normalised),done, info = envs.step(action)


        # create mask for episode ends
        # masks_done = torch.FloatTensor([[0.0] if _done else [1.0] for _done in done]).to(device)

        ## combine all rewards
        episode_reward.append(rew_raw)
        # if we succeed at all then the task is successful
        successes.append(torch.tensor([i['success'] for i in info]))

        latent, hidden_state = agent.get_latent(
            action, 
            next_obs, 
            rew_raw, 
            hidden_state, 
            return_prior = False
        )
            

        obs = next_obs

        step += 1
    print("REW:", torch.stack(episode_reward).mean(0))
    print("SUCC:", torch.stack(successes).max(0)[0])
    print("===============================================")


envs.close()

REW: tensor([[0.0141],
        [0.0207],
        [0.0176],
        [0.0136],
        [0.2409]])
SUCC: tensor([0., 0., 0., 0., 0.])
REW: tensor([[1.6174],
        [0.0083],
        [0.0157],
        [5.6966],
        [3.5217]])
SUCC: tensor([1., 0., 0., 1., 1.])
REW: tensor([[0.0202],
        [0.0129],
        [0.1073],
        [0.0165],
        [0.0139]])
SUCC: tensor([0., 0., 0., 0., 0.])
REW: tensor([[0.0461],
        [0.0147],
        [0.0148],
        [0.0150],
        [0.0148]])
SUCC: tensor([0., 0., 0., 0., 0.])
REW: tensor([[0.0142],
        [0.0144],
        [0.0385],
        [0.0182],
        [0.0163]])
SUCC: tensor([0., 0., 0., 0., 0.])
REW: tensor([[0.0156],
        [0.0151],
        [0.2548],
        [0.0350],
        [0.0187]])
SUCC: tensor([0., 0., 0., 0., 0.])
REW: tensor([[0.0357],
        [0.0313],
        [0.0153],
        [0.0258],
        [0.0143]])
SUCC: tensor([0., 0., 0., 0., 0.])
REW: tensor([[0.0289],
        [0.0118],
        [0.0138],
        [0.0179],
      

In [55]:
torch.stack(episode_reward).mean(0)
torch.stack(successes).max(0)[0]

tensor([[0.0152],
        [2.6538],
        [0.0139],
        [0.3873],
        [0.0132]])

In [57]:
torch.stack(successes).max(0)[0]

tensor([0., 0., 0., 0., 0.])

In [47]:
torch.tensor(successes).sum()

tensor(223.)

In [34]:
torch.stack(episode_reward).mean()

tensor(9.6111)