In [1]:
import copy
import sys

from torch import nn
from torch.nn import functional as F

sys.path.append('..')

from src.agents import RainbowAgent, EzExplorerAgent, SurprisalExplorerAgent
from src.agents import SFPredictor
from src.agents.Rainbow import DEFAULT_RAINBOW_ARGS
from src.envs import *
from src.training import *
from src.models import *

In [2]:
# Alternative environment, currently disabled:
# env = create_simple_gridworld_env(True, 100)
# `env` is the single environment object shared by every later cell: the
# networks read its observation/action spaces and the training calls step it.
env = create_crazy_climber_env()

In [3]:
# Optional encoder override: a gridworld conv stack is built only when the
# second observation dimension is at most 42; otherwise this stays None.
# NOTE(review): no later cell shown here reads `custom_encoder`.
custom_encoder = (
    create_gridworld_convs(env.observation_space.shape[0])
    if env.observation_space.shape[1] <= 42
    else None
)

In [4]:
class PolicyNetwork(nn.Module):
    """Conv stack followed by a two-layer head with ``n_acts`` outputs.

    Args:
        obs_dim: Observation shape without the batch dimension. ``obs_dim[0]``
            is passed to ``create_atari_convs``.
        n_acts: Width of the final linear layer (the notebook passes
            ``env.action_space.n``).

    Attributes:
        encoder_output_size: Number of values the conv stack emits for a
            single observation, measured with a dummy forward pass.
    """

    def __init__(self, obs_dim, n_acts):
        super().__init__()
        conv_stack = create_atari_convs(obs_dim[0])

        # Probe the conv stack with one all-zero observation to learn how many
        # features the first linear layer has to accept.
        with torch.no_grad():
            probe_out = conv_stack(torch.zeros(1, *obs_dim))
        self.encoder_output_size = probe_out.numel()

        head = [
            nn.Flatten(),
            nn.Linear(self.encoder_output_size, 128),
            nn.ReLU(),
            nn.Linear(128, n_acts),
        ]
        # Conv stack first, so sub-module indices match conv -> flatten -> MLP.
        self.layers = nn.Sequential(conv_stack, *head)

    def forward(self, x):
        """Return the output of the full conv + head pipeline for ``x``."""
        return self.layers(x)

class SFNetwork(nn.Module):
    """Observation encoder plus an MLP head applied to the encoder's output.

    ``forward`` returns both the embedding and the head's prediction. The
    "SF" in the name presumably stands for successor features (confirm
    against ``SFPredictor``).

    Args:
        obs_dim: Observation shape without the batch dimension. ``obs_dim[0]``
            is passed to ``create_atari_convs``.
        embed_dim: Width of the embedding and of every predictor layer.

    Attributes:
        encoder_output_size: Number of values the conv stack emits for a
            single observation, measured with a dummy forward pass.
        encoder: conv stack -> flatten -> linear -> layer norm.
        sf_predictor: Three ``embed_dim``-wide linear layers with ReLU
            between them.
    """

    def __init__(self, obs_dim, embed_dim=64):
        super().__init__()
        conv_stack = create_atari_convs(obs_dim[0])

        # A single all-zero observation is enough to measure the flattened
        # size of the conv output.
        with torch.no_grad():
            probe_out = conv_stack(torch.zeros(1, *obs_dim))
        self.encoder_output_size = probe_out.numel()

        self.encoder = nn.Sequential(
            conv_stack,
            nn.Flatten(),
            nn.Linear(self.encoder_output_size, embed_dim),
            nn.LayerNorm(embed_dim),
        )

        # Linear, ReLU, Linear, ReLU, Linear: no activation after the last one.
        predictor_layers = []
        for depth in range(3):
            if depth:
                predictor_layers.append(nn.ReLU())
            predictor_layers.append(nn.Linear(embed_dim, embed_dim))
        self.sf_predictor = nn.Sequential(*predictor_layers)

    def forward(self, x):
        """Return ``(embedding, prediction)`` for the batch ``x``."""
        latent = self.encoder(x)
        return latent, self.sf_predictor(latent)

# sf_model = SFNetwork(list(env.observation_space.shape), 64)
# lstate, sfs = sf_model(torch.zeros([2] + list(env.observation_space.shape)))
# print(lstate.shape, sfs.shape)

In [5]:
# Rainbow hyper-parameters: work on a shallow copy of the defaults and point
# it at the GPU when one is available, otherwise the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
rainbow_args = copy.copy(DEFAULT_RAINBOW_ARGS)
rainbow_args.device = device
# rainbow_args.replay_frequency = 8

# Representation model and its learner.
embed_dim = 256
sf_model = SFNetwork(list(env.observation_space.shape), embed_dim)
repr_learner = SFPredictor(
    sf_model,
    batch_size=32,
    update_freq=16,
    log_freq=200,
    target_net_update_freq=64,
    discount_factor=0.99,
    lr=1e-4,
)

In [6]:
# Policy network sized from the environment's observation and action spaces,
# handed to the surprisal explorer together with the representation learner.
policy_net = PolicyNetwork(
    obs_dim=list(env.observation_space.shape),
    n_acts=env.action_space.n,
)
explore_agent = SurprisalExplorerAgent(
    env,
    policy_net,
    repr_learner,
    log_freq=100,
)

In [7]:
# Alternative explorer, currently disabled:
# explore_agent = EzExplorerAgent(env, repr_learner=repr_learner)
# Run the exploration phase with a budget of one million (third argument).
train_exploration_model(explore_agent, env, 1_000_000)

  obs = torch.tensor(obs, dtype=torch.float32, device=self.policy_device)
  [torch.stack([torch.tensor(se, dtype=torch.float32) for se in e], \
  [torch.tensor(e, dtype=torch.float32).to(self.policy_device) \
  act_idxs = torch.tensor(act_idxs, dtype=torch.long).to(self.policy_device)
  [torch.tensor(e, dtype=torch.float32).to(device) for e in batch_data]


torch.Size([32]) torch.Size([32]) SHOULD BE THE SAME SHAPE
torch.Size([32]) torch.Size([32]) SHOULD BE THE SAME SHAPE
torch.Size([32]) torch.Size([32]) SHOULD BE THE SAME SHAPE
torch.Size([32]) torch.Size([32]) SHOULD BE THE SAME SHAPE
torch.Size([32]) torch.Size([32]) SHOULD BE THE SAME SHAPE
torch.Size([32]) torch.Size([32]) SHOULD BE THE SAME SHAPE
torch.Size([32]) torch.Size([32]) SHOULD BE THE SAME SHAPE
torch.Size([32]) torch.Size([32]) SHOULD BE THE SAME SHAPE
torch.Size([32]) torch.Size([32]) SHOULD BE THE SAME SHAPE
torch.Size([32]) torch.Size([32]) SHOULD BE THE SAME SHAPE
Step: 160 | Avg policy loss: 234.2974
torch.Size([32]) torch.Size([32]) SHOULD BE THE SAME SHAPE
torch.Size([32]) torch.Size([32]) SHOULD BE THE SAME SHAPE
torch.Size([32]) torch.Size([32]) SHOULD BE THE SAME SHAPE
torch.Size([32]) torch.Size([32]) SHOULD BE THE SAME SHAPE
torch.Size([32]) torch.Size([32]) SHOULD BE THE SAME SHAPE
torch.Size([32]) torch.Size([32]) SHOULD BE THE SAME SHAPE
torch.Size([32]) t

KeyboardInterrupt: 

In [7]:
# Task agent built on the SF model's encoder. The last argument is None, so no
# representation learner is attached (swap in `repr_learner` to attach one).
agent = RainbowAgent(
    rainbow_args,
    env,
    sf_model.encoder,
    None,  # repr_learner
)
# Move the model to the chosen device, then refresh the learner's target
# model (presumably so it mirrors the moved weights; confirm in SFPredictor).
sf_model = sf_model.to(device)
repr_learner._update_target_model()

In [None]:
# Train the Rainbow agent on the task with a budget of 100k (third argument).
train_task_model(agent, env, 100_000)

Step: 5000	# Episodes: 1	Avg ep reward: 3900.00
Step: 10000	# Episodes: 1	Avg ep reward: 8500.00
Step: 15000	# Episodes: 1	Avg ep reward: 7700.00
Step: 20000	# Episodes: 2	Avg ep reward: 5550.00
Step: 25000	# Episodes: 2	Avg ep reward: 5550.00
Step: 30000	# Episodes: 1	Avg ep reward: 3100.00
Step: 35000	# Episodes: 2	Avg ep reward: 8000.00
Step: 40000	# Episodes: 1	Avg ep reward: 6100.00
Step: 45000	# Episodes: 1	Avg ep reward: 11000.00
Step: 50000	# Episodes: 1	Avg ep reward: 11000.00
Step: 55000	# Episodes: 2	Avg ep reward: 11000.00
Step: 60000	# Episodes: 1	Avg ep reward: 11000.00
Step: 65000	# Episodes: 2	Avg ep reward: 11000.00
Step: 70000	# Episodes: 1	Avg ep reward: 11300.00
Step: 75000	# Episodes: 1	Avg ep reward: 17100.00
Step: 80000	# Episodes: 2	Avg ep reward: 20400.00
Step: 85000	# Episodes: 2	Avg ep reward: 14950.00
Step: 90000	# Episodes: 1	Avg ep reward: 13900.00
Step: 95000	# Episodes: 1	Avg ep reward: 17900.00
Step: 100000	# Episodes: 1	Avg ep reward: 18300.00


In [21]:
# Fresh SF model and learner. `embed_dim`, `device` and `rainbow_args` are
# whatever an earlier cell left in the kernel.
sf_model = SFNetwork(list(env.observation_space.shape), embed_dim)
repr_learner = SFPredictor(
    sf_model,
    batch_size=32,
    update_freq=16,
    log_freq=200,
    target_net_update_freq=64,
    discount_factor=0.99,
    lr=1e-4,
)

# Stage 1: train the representation while the EZ explorer drives the env.
explore_agent = EzExplorerAgent(env, repr_learner=repr_learner)
train_exploration_model(explore_agent, env, 100_000)

# Stage 2: build the task agent on the pretrained encoder, with no
# representation learner attached (last argument is None).
agent = RainbowAgent(
    rainbow_args,
    env,
    sf_model.encoder,
    None,  # repr_learner
)
sf_model = sf_model.to(device)
repr_learner._update_target_model()

Step #3199 | Repr loss: 0.2400
Step #6399 | Repr loss: 0.0778
Step #9599 | Repr loss: 0.0721
Step #12799 | Repr loss: 0.0637
Step #15999 | Repr loss: 0.0634
Step #19199 | Repr loss: 0.0572
Step #22399 | Repr loss: 0.0541
Step #25599 | Repr loss: 0.0511
Step #28799 | Repr loss: 0.0568
Step #31999 | Repr loss: 0.0477
Step #35199 | Repr loss: 0.0433
Step #38399 | Repr loss: 0.0404
Step #41599 | Repr loss: 0.0376
Step #44799 | Repr loss: 0.0358
Step #47999 | Repr loss: 0.0334
Step #51199 | Repr loss: 0.0316
Step #54399 | Repr loss: 0.0355
Step #57599 | Repr loss: 0.0303
Step #60799 | Repr loss: 0.0276
Step #63999 | Repr loss: 0.0257
Step #67199 | Repr loss: 0.0240
Step #70399 | Repr loss: 0.0223
Step #73599 | Repr loss: 0.0209
Step #76799 | Repr loss: 0.0200
Step #79999 | Repr loss: 0.0225
Step #83199 | Repr loss: 0.0196
Step #86399 | Repr loss: 0.0176
Step #89599 | Repr loss: 0.0162
Step #92799 | Repr loss: 0.0150
Step #95999 | Repr loss: 0.0142
Step #99199 | Repr loss: 0.0136


In [22]:
# Train the agent built in the previous cell with a budget of 100k (third
# argument).
train_task_model(agent, env, 100_000)

Step: 5000	# Episodes: 1	Avg ep reward: 800.00
Step: 10000	# Episodes: 1	Avg ep reward: 1200.00
Step: 15000	# Episodes: 1	Avg ep reward: 1300.00
Step: 20000	# Episodes: 2	Avg ep reward: 5200.00
Step: 25000	# Episodes: 1	Avg ep reward: 4300.00
Step: 30000	# Episodes: 2	Avg ep reward: 1850.00
Step: 35000	# Episodes: 1	Avg ep reward: 2700.00
Step: 40000	# Episodes: 1	Avg ep reward: 3400.00
Step: 45000	# Episodes: 2	Avg ep reward: 4000.00
Step: 50000	# Episodes: 1	Avg ep reward: 6200.00
Step: 55000	# Episodes: 1	Avg ep reward: 7800.00
Step: 60000	# Episodes: 2	Avg ep reward: 7100.00
Step: 65000	# Episodes: 2	Avg ep reward: 7100.00
Step: 70000	# Episodes: 1	Avg ep reward: 9600.00
Step: 75000	# Episodes: 1	Avg ep reward: 9100.00
Step: 80000	# Episodes: 2	Avg ep reward: 10950.00
Step: 85000	# Episodes: 1	Avg ep reward: 9300.00
Step: 90000	# Episodes: 2	Avg ep reward: 11000.00
Step: 95000	# Episodes: 2	Avg ep reward: 11000.00
Step: 100000	# Episodes: 1	Avg ep reward: 11000.00


In [23]:
# Ablation: five independent Rainbow runs with the SF representation learner
# attached to the agent, then five runs without it.
#
# The two arms used to be copy-pasted loops that differed only in the last
# RainbowAgent argument. They are now one loop driven by a flag, so the arms
# cannot drift apart when a hyper-parameter is edited. Printed text and the
# per-run construction order are unchanged.

# Loop-invariant settings, hoisted out of the run loop.
embed_dim = 64
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

for banner, use_repr_learner in (
        ('With representation learning', True),
        ('Without representation learning', False)):
    print('-'*16 + '\n' + banner + '\n' + '-'*16)

    for _ in range(5):
        print('Starting new training loop')

        # Fresh copy of the defaults for every run.
        rainbow_args = copy.copy(DEFAULT_RAINBOW_ARGS)
        rainbow_args.device = device
        # rainbow_args.replay_frequency = 8

        sf_model = SFNetwork(list(env.observation_space.shape), embed_dim)
        # Built in both arms, exactly as before, so the only difference
        # between the arms is whether the agent receives the learner.
        repr_learner = SFPredictor(sf_model, lr=1e-4)

        agent = RainbowAgent(
            rainbow_args,
            env,
            sf_model.encoder,
            repr_learner if use_repr_learner else None,
        )
        sf_model = sf_model.to(device)
        repr_learner._update_target_model()

        train_task_model(agent, env, int(1e5))

----------------
With representation learning
----------------
Starting new training loop
Step: 5000	# Episodes: 1	Avg ep reward: 10400.00
Step: 10000	# Episodes: 1	Avg ep reward: 7000.00
Step: 15000	# Episodes: 1	Avg ep reward: 4100.00
Step: 20000	# Episodes: 2	Avg ep reward: 7900.00
Step: 25000	# Episodes: 1	Avg ep reward: 6000.00
Step: 30000	# Episodes: 1	Avg ep reward: 3100.00
Step: 35000	# Episodes: 2	Avg ep reward: 3850.00
Step: 40000	# Episodes: 1	Avg ep reward: 6000.00
Step: 45000	# Episodes: 2	Avg ep reward: 6250.00
Step: 50000	# Episodes: 2	Avg ep reward: 10950.00
Step: 55000	# Episodes: 2	Avg ep reward: 12600.00
Step: 60000	# Episodes: 2	Avg ep reward: 7300.00
Step: 65000	# Episodes: 1	Avg ep reward: 24400.00
Step: 70000	# Episodes: 1	Avg ep reward: 24800.00
Step: 75000	# Episodes: 2	Avg ep reward: 24600.00
Step: 80000	# Episodes: 2	Avg ep reward: 15700.00


  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


Step: 85000	# Episodes: 0	Avg ep reward: nan
Step: 90000	# Episodes: 2	Avg ep reward: 14900.00
Step: 95000	# Episodes: 2	Avg ep reward: 24800.00
Step: 100000	# Episodes: 1	Avg ep reward: 24900.00
Starting new training loop
Step: 5000	# Episodes: 1	Avg ep reward: 3400.00
Step: 10000	# Episodes: 1	Avg ep reward: 5200.00
Step: 15000	# Episodes: 2	Avg ep reward: 5200.00
Step: 20000	# Episodes: 1	Avg ep reward: 3500.00
Step: 25000	# Episodes: 2	Avg ep reward: 3550.00
Step: 30000	# Episodes: 2	Avg ep reward: 3450.00
Step: 35000	# Episodes: 1	Avg ep reward: 4200.00
Step: 40000	# Episodes: 2	Avg ep reward: 4100.00
Step: 45000	# Episodes: 1	Avg ep reward: 5300.00
Step: 50000	# Episodes: 2	Avg ep reward: 5350.00
Step: 55000	# Episodes: 1	Avg ep reward: 1400.00
Step: 60000	# Episodes: 1	Avg ep reward: 4100.00
Step: 65000	# Episodes: 1	Avg ep reward: 9500.00
Step: 70000	# Episodes: 2	Avg ep reward: 3400.00
Step: 75000	# Episodes: 1	Avg ep reward: 3400.00
Step: 80000	# Episodes: 2	Avg ep reward: 46

KeyboardInterrupt: 

In [None]:
# agent = TestRL(agent)
# agent.start_task(1000)
# obs = env.reset()
# act = agent.sample_act(obs)
# print('Act:', act)
# obs, reward, done, _ = env.step(act)
# agent.process_step_data((obs, act, reward, obs, done))
# agent.end_step()