In [1]:
import copy
import sys

from torch import nn
from torch.nn import functional as F

sys.path.append('..')

from src.agents import RainbowAgent, EzExplorerAgent
from src.agents import SFPredictor
from src.agents.Rainbow import DEFAULT_RAINBOW_ARGS
from src.envs import *
from src.training import *
from src.models import *

In [2]:
# env = create_simple_gridworld_env(True, 100)
env = create_crazy_climber_env()

In [3]:
custom_encoder = None
if env.observation_space.shape[1] <= 42:
  custom_encoder = create_gridworld_convs(env.observation_space.shape[0])

In [4]:
class SFNetwork(nn.Module):
    def __init__(self, obs_dim, embed_dim=64):
        super().__init__()
        convs = create_atari_convs(obs_dim[0])
        # self.add_module('convs', self.convs)

        test_input = torch.zeros(1, *obs_dim)
        with torch.no_grad():
            self.encoder_output_size = convs(test_input).view(-1).shape[0]

        self.encoder = nn.Sequential(
            convs,
            # nn.Dropout2d(0.6),
            nn.Flatten(),
            nn.Linear(self.encoder_output_size, embed_dim))


        self.sf_predictor = nn.Sequential(
        nn.Linear(embed_dim, embed_dim),
        # nn.Dropout(0.6),
        nn.ReLU(),
        nn.Linear(embed_dim, embed_dim),
        nn.ReLU(),
        nn.Linear(embed_dim, embed_dim))

    def forward(self, x):
        z = self.encoder(x)
        embeds = F.normalize(self.sf_predictor(z), dim=-1)
        sfs = self.sf_predictor(embeds)
        return embeds, sfs

    def encode_obs(self, x):
        return F.normalize(self.encoder(x), dim=-1)

# sf_model = SFNetwork(list(env.observation_space.shape), 64)
# lstate, sfs = sf_model(torch.zeros([2] + list(env.observation_space.shape)))
# print(lstate.shape, sfs.shape)

In [12]:
embed_dim = 256
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

rainbow_args = copy.copy(DEFAULT_RAINBOW_ARGS)
rainbow_args.device = device
# rainbow_args.replay_frequency = 8

sf_model = SFNetwork(list(env.observation_space.shape), embed_dim)
repr_learner = SFPredictor(
    sf_model,
    batch_size = 32,
    update_freq = 16,
    log_freq = 200,
    target_net_update_freq = 64,
    discount_factor = 0.99,
    lr = 1e-4)

Target model updated!


In [13]:
explore_agent = EzExplorerAgent(env, repr_learner=repr_learner)
train_exploration_model(explore_agent, env, int(1e5))

  [torch.stack([torch.tensor(se, dtype=torch.float32) for se in e], \
  [torch.tensor(e, dtype=torch.float32).to(device) for e in batch_data]


Target model updated!
Representation loss: 0.0015
Target model updated!
Target model updated!
Representation loss: 0.0006
Target model updated!
Representation loss: 0.0004
Target model updated!
Target model updated!
Representation loss: 0.0004
Target model updated!
Representation loss: 0.0002
Target model updated!
Target model updated!
Representation loss: 0.0004
Target model updated!
Representation loss: 0.0002
Target model updated!
Target model updated!
Representation loss: 0.0003
Target model updated!
Target model updated!
Representation loss: 0.0002
Target model updated!
Representation loss: 0.0002
Target model updated!
Target model updated!
Representation loss: 0.0003
Target model updated!
Representation loss: 0.0002
Target model updated!
Target model updated!
Representation loss: 0.0003
Target model updated!
Representation loss: 0.0002
Target model updated!
Target model updated!
Representation loss: 0.0003
Target model updated!
Target model updated!
Representation loss: 0.0001
Ta

KeyboardInterrupt: 

In [None]:
agent = RainbowAgent(rainbow_args, env, sf_model.encoder, None) # repr_learner)
sf_model = sf_model.to(device)
repr_learner._update_target_model()

In [6]:
train_task_model(agent, env, int(1e5))

KeyboardInterrupt: 

In [5]:
print('-'*16 + '\nWith representation learning\n' + '-'*16)
for _ in range(5):
    print('Starting new training loop')

    embed_dim = 64
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    rainbow_args = copy.copy(DEFAULT_RAINBOW_ARGS)
    rainbow_args.device = device
    # rainbow_args.replay_frequency = 8

    sf_model = SFNetwork(list(env.observation_space.shape), embed_dim)
    repr_learner = SFPredictor(sf_model, lr=1e-4)

    agent = RainbowAgent(rainbow_args, env, sf_model.encoder, repr_learner)
    sf_model = sf_model.to(device)
    repr_learner._update_target_model()

    train_task_model(agent, env, int(1e5))

print('-'*16 + '\nWithout representation learning\n' + '-'*16)
for _ in range(5):
    print('Starting new training loop')

    embed_dim = 64
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    rainbow_args = copy.copy(DEFAULT_RAINBOW_ARGS)
    rainbow_args.device = device
    # rainbow_args.replay_frequency = 8

    sf_model = SFNetwork(list(env.observation_space.shape), embed_dim)
    repr_learner = SFPredictor(sf_model, lr=1e-4)

    agent = RainbowAgent(rainbow_args, env, sf_model.encoder, None)
    sf_model = sf_model.to(device)
    repr_learner._update_target_model()

    train_task_model(agent, env, int(1e5))

----------------
With representation learning
----------------
Starting new training loop


  [torch.stack([torch.tensor(se, dtype=torch.float32) for se in e], \
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


Step: 5000	# Episodes: 0	Avg ep reward: nan
Step: 10000	# Episodes: 2	Avg ep reward: 5400.00
Step: 15000	# Episodes: 2	Avg ep reward: 6000.00
Step: 20000	# Episodes: 1	Avg ep reward: 4900.00
Step: 25000	# Episodes: 2	Avg ep reward: 10200.00
Step: 30000	# Episodes: 2	Avg ep reward: 5900.00
Step: 35000	# Episodes: 2	Avg ep reward: 6650.00
Step: 40000	# Episodes: 2	Avg ep reward: 5350.00
Step: 45000	# Episodes: 2	Avg ep reward: 5300.00
Step: 50000	# Episodes: 2	Avg ep reward: 6400.00
Step: 55000	# Episodes: 1	Avg ep reward: 6800.00
Step: 60000	# Episodes: 1	Avg ep reward: 11000.00
Step: 65000	# Episodes: 1	Avg ep reward: 8100.00
Step: 70000	# Episodes: 1	Avg ep reward: 7200.00
Step: 75000	# Episodes: 2	Avg ep reward: 3850.00
Step: 80000	# Episodes: 1	Avg ep reward: 4100.00
Step: 85000	# Episodes: 1	Avg ep reward: 3400.00
Step: 90000	# Episodes: 2	Avg ep reward: 5300.00
Step: 95000	# Episodes: 1	Avg ep reward: 4400.00
Step: 100000	# Episodes: 1	Avg ep reward: 3400.00
Starting new training 

In [28]:
# agent = TestRL(agent)
# agent.start_task(1000)
# obs = env.reset()
# act = agent.sample_act(obs)
# print('Act:', act)
# obs, reward, done, _ = env.step(act)
# agent.process_step_data((obs, act, reward, obs, done))
# agent.end_step()