## TODO

[X] Implement/Port Continual Backprop

[X] Implement "unit tests" for initialization, buffer behavior, training, ...

[X] investigate inconsistency between envs

[X] investigate gradient inconsistencies (it seems that adaptive max pooling inherently adds numerical errors to the gradients)

In [1]:
import time, random
from collections import deque
from pathlib import Path
from types import SimpleNamespace as sn

import torch
import numpy as np
from tqdm import trange
from rich import print

import wandb
from wandb.integration.sb3 import WandbCallback

In [27]:
torch.backends.cudnn.deterministic = True

In [106]:
grads = []
losses = []

In [107]:
# Experiment: run the exact same forward/backward pass ten times on a small
# conv net and record the loss and the parameter gradients of every run, so
# that run-to-run differences can be inspected afterwards.
# Results are appended to the module-level `losses` and `grads` lists.
random.seed(42)
np.random.seed(42)
torch.manual_seed(42)

# With the default stride/padding the 3x3 conv turns the 6x6 input into a 4x4
# map, so the adaptive max pool produces an 8x8 output from a smaller input.
module = torch.nn.Sequential(
    torch.nn.Conv2d(4, 4, 3),
    # torch.nn.ReLU(),  (activation disabled for this experiment)
    torch.nn.AdaptiveMaxPool2d((8, 8)),
    torch.nn.Flatten(),
    torch.nn.Linear(4*8*8, 4)
).cuda()

# The optimizer is only used for zero_grad(); step() is never called below, so
# the weights stay fixed and every iteration sees an identical model and batch.
opt = torch.optim.Adam(module.parameters())
loss_fn = torch.nn.HuberLoss()

batch_n = 128
inp = torch.rand(batch_n, 4, 6, 6).cuda()  # alternative input shape noted by the author: 4, 84, 84
targ = torch.rand(batch_n, 4).cuda()
    
for _ in range(10):
    # re-seed all RNGs before each repetition
    random.seed(42)
    np.random.seed(42)
    torch.manual_seed(42)

    ret = module(inp)
    
    opt.zero_grad()
    
    loss = loss_fn(ret, targ)
    losses.append(loss.item())
    
    loss.backward()
    
    # store detached copies so later backward passes cannot overwrite them
    grads.append(list(map(lambda x: x.grad.clone().detach(), module.parameters())))

In [108]:
(np.array(losses) == losses[0]).all() # are all losses equal?

True

In [109]:
# zip(*grads) regroups the recorded gradients per parameter: `w` holds the
# gradient of one parameter from every repetition, `n` its (name, parameter).
for n, w in zip(module.named_parameters(), zip(*grads)):
    print(n[0], (torch.stack(w) - w[0]).abs().max())  # largest absolute deviation from the first run; 0 means all repetitions agree

In [101]:
module

Sequential(
  (0): Conv2d(4, 4, kernel_size=(3, 3), stride=(1, 1))
  (1): AdaptiveMaxPool2d(output_size=(8, 8))
  (2): Flatten(start_dim=1, end_dim=-1)
  (3): Linear(in_features=256, out_features=4, bias=True)
)

The `AdaptiveMaxPool2d` layer is prone to introducing small numerical discrepancies into the gradient calculation. These errors propagate through the backward pass and quickly cause the module weights to diverge. In conclusion, because these discrepancies compound with every update, we cannot expect two models trained with the same seed to end up with the same parameters, or even to follow the same training progression.

In [2]:
from torch_layers import FactorizedNoisyLinear
from old_rainbow.common.networks import FactorizedNoisyLinear as oldFactorizedNoisyLinear

def same_params(old_rainbow, new_rainbow):
    """Check whether both implementations hold (numerically close) parameters.

    The old implementation's policy and target parameters are flattened into a
    single vector (policy first) and compared against the vector returned by
    ``new_rainbow.policy.parameters_to_vector()``.

    Returns False (after printing both lengths) if the vectors differ in
    length, otherwise the result of ``np.allclose``.
    """
    old_tensors = [
        *old_rainbow.q_policy.parameters(),
        *old_rainbow.q_target.parameters(),
    ]
    old_flat = torch.nn.utils.parameters_to_vector(old_tensors).detach().cpu().numpy()
    new_flat = new_rainbow.policy.parameters_to_vector()

    n_new, n_old = len(new_flat), len(old_flat)
    if n_new != n_old:
        print('Unequal lengths:', n_new, n_old)
        return False
    return np.allclose(old_flat, new_flat)

def sync_norm_buffers(old_rainbow, new_rainbow):
    """
    Copy the spectral-norm buffers from old_rainbow into new_rainbow.

    torch.nn.utils.spectral_norm adds buffers (weight_u / weight_v) to the Conv
    modules. They influence the forward pass, so both implementations must
    share them before their outputs can be compared. Policy and target
    networks are both handled; the new convs receive clones of the old tensors.
    """
    network_pairs = (
        (old_rainbow.q_policy, new_rainbow.q_net),
        (old_rainbow.q_target, new_rainbow.q_net_target),
    )
    # every block is expected to expose residual_{0,1}.conv_{0,1}
    conv_paths = [(f"residual_{i}", f"conv_{j}") for i in range(2) for j in range(2)]

    for source_net, target_net in network_pairs:
        source_blocks = source_net.main[:-1]
        target_blocks = target_net.features_extractor.main[:-3]
        for source_block, target_block in zip(source_blocks, target_blocks):
            for residual_name, conv_name in conv_paths:
                source_conv = getattr(getattr(source_block, residual_name), conv_name)
                target_conv = getattr(getattr(target_block, residual_name), conv_name)

                target_conv.weight_u = source_conv.weight_u.clone()
                target_conv.weight_v = source_conv.weight_v.clone()
                
def same_buffers(old_rainbow, new_rainbow):
    """Check that the buffers of both implementations are exactly equal.

    For the policy and the target network this compares
      * the spectral-norm buffers (weight_u / weight_v) of every residual conv,
      * the noise buffers (weight_epsilon / bias_epsilon) of every second
        layer in the dueling value and advantage branches.

    The location of the first mismatch is printed and False is returned;
    True means no difference was found.
    """
    networks = (
        ('policy', old_rainbow.q_policy, new_rainbow.q_net),
        ('target', old_rainbow.q_target, new_rainbow.q_net_target),
    )
    for label, old_q, new_q in networks:
        block_pairs = zip(old_q.main[:-1], new_q.features_extractor.main[:-3])
        for old_block, new_block in block_pairs:
            for i in (0, 1):
                old_residual = getattr(old_block, f"residual_{i}")
                new_residual = getattr(new_block, f"residual_{i}")
                for j in (0, 1):
                    old_conv = getattr(old_residual, f"conv_{j}")
                    new_conv = getattr(new_residual, f"conv_{j}")

                    u_differs = (new_conv.weight_u != old_conv.weight_u).any()
                    if u_differs or (new_conv.weight_v != old_conv.weight_v).any():
                        print(f"{label}.residual_{i}.conv_{j}")
                        return False

        for branch in ('value_branch', 'advantage_branch'):
            old_layers = getattr(old_q.dueling, branch)
            new_layers = getattr(new_q.dueling, branch)
            for position, (old_layer, new_layer) in enumerate(zip(old_layers, new_layers)):
                if position % 2:
                    # only every second layer (starting with the first) is compared
                    continue
                for buffer in ('weight_epsilon', 'bias_epsilon'):
                    old_val = getattr(old_layer, buffer)
                    new_val = getattr(new_layer, buffer)

                    if (old_val != new_val).any():
                        print(f"{label}.{branch}.{buffer}")
                        return False
    return True

In [3]:
def same_states(old_env, new_env):
    """Check that the sub-environments of both vec envs share their RNG state.

    The ``np_random`` state of each pair of sub-environments is compared: the
    key array (index 1) element-wise and the remaining entries (index 2
    onwards) as a tuple. Prints 'Not Same' and returns False at the first
    mismatch, True otherwise.
    """
    env_pairs = zip(old_env.venv.envs, new_env.venv.envs)
    for old_sub, new_sub in env_pairs:
        old_rng = old_sub.np_random.get_state()
        new_rng = new_sub.np_random.get_state()

        key_differs = (new_rng[1] != old_rng[1]).any()
        if key_differs or new_rng[2:] != old_rng[2:]:
            print('Not Same')
            return False
    return True

In [4]:
from rainbow import Rainbow, reset_noise

In [5]:
from env_wrappers import create_env
from utils import get_mean_ep_length
from sb3_logger import configure_logger, WandbOutputFormat

In [6]:
# Shared configuration for both implementations. A plain Namespace stands in
# for the parsed command-line arguments.
from argparse import Namespace
args = Namespace(env_name='gym:Breakout',
                 parallel_envs=64,
                 subproc_vecenv=False,
                 time_limit=108_000,
                 frame_stack=4,
                 frame_skip=4,
                 grayscale=True,
                 gamma=0.99,
                 resolution=(84, 84),
                 save_dir='tmp',
                 record_every=60*50,
                 decorr=True,
                 seed=3605)

# Scaled-down values for quick comparison runs; the trailing comments keep the
# alternative values noted by the author (presumably the full-scale settings).
args.burnin = 500 #100_000
args.buffer_size = 2**12 #2**19
args.batch_size = 256
args.sync_dqn_target_every = 320 #32_000
args.training_frames = 64*11 #1000 #2_000_000

args.noisy_linear = True
if args.noisy_linear:
    # linear_kwargs only exists when noisy layers are enabled
    args.linear_kwargs = {'sigma_0': 0.5}
# None is replaced by 0.005 / batch_size in add_default_args
args.adam_eps = None
args.model_size = 2

In [7]:
args.buffer_size, args.sync_dqn_target_every%args.parallel_envs == 0, args.training_frames

(4096, True, 704)

In [8]:
random.seed(args.seed)
np.random.seed(args.seed)
torch.manual_seed(args.seed)

<torch._C.Generator at 0x1e585eb1230>

In [9]:
# Create the vectorized environment for the new (SB3-based) implementation.
print(f'Creating', args.parallel_envs, 'and decorrelating environment instances. This may take up to a few minutes.. ', end='')
decorr_steps = None
if args.env_name == 'gym:Breakout':
    # fixed budget of 160 steps for Breakout, split across the parallel envs
    decorr_steps = 160 // args.parallel_envs
if args.decorr and not args.env_name.startswith('procgen:') and decorr_steps is None:
    # otherwise derive the number of steps from the mean episode length (never for procgen envs)
    decorr_steps = get_mean_ep_length(args) // args.parallel_envs
print('Decorr steps:', decorr_steps)
env = create_env(args, decorr_steps=decorr_steps)
# no explicit reset here (left disabled by the author): states = env.reset()
print('Done.')

In [10]:
# Build the SB3-based Rainbow agent on the environment created above.
model = Rainbow(
    'CnnPolicy',
    env,
    buffer_size=args.buffer_size,
    batch_size=args.batch_size,
    learning_starts=args.burnin,
    target_update_interval=args.sync_dqn_target_every,
    policy_kwargs={
        'noisy_linear': True,
        'linear_kwargs': {'sigma_0': 0.5},
        'optimizer_kwargs': {'eps': None},
        'features_extractor_kwargs': {'model_size': 2},
    },
)

In [68]:
# Logging setup for the SB3 run: an offline wandb run plus an SB3 logger that
# also writes to wandb through WandbOutputFormat.
args.wandb_mode = 'offline'
args.wandb_group = 'baseline'
args.name = 'no_train_test'
args.log_dir = './runs/' + args.name + '/'
args.wandb_dir = args.log_dir # 'wandb' added automatically
args.tb_log_dir = args.log_dir + 'tensorboard'
args.wandb_grad_freq = 0

# the whole args namespace is recorded as the run config
run = wandb.init(
    project="Rainbow",
    group=args.wandb_group,
    name=args.name,
    config=vars(args),
    dir=args.wandb_dir,
    #monitor_gym=True,  # auto-upload the videos of agents playing the game
    mode=args.wandb_mode,
    #reinit=True
)

model.set_logger(configure_logger(0, args.tb_log_dir, args.name, True, extra_formats=[WandbOutputFormat]))
    
callbacks = [WandbCallback(gradient_save_freq=args.wandb_grad_freq)]

In [69]:
model.learn(args.training_frames, callback=callbacks, tb_log_name=args.name, progress_bar=True)

Output()

<rainbow.Rainbow at 0x23e7dbc97c0>

In [70]:
env.close()
model.logger.close()
run.finish()

0,1
rollout/ep_len_mean,▁▁▁▁
rollout/ep_rew_mean,▁▁▁▁
rollout/exploration_rate,▁▁▁▁▁
rollout/prioritized_er_beta,▁▄▅▇█
time/episodes,▁▃▅▆█
time/fps,▁▄▅▇█
time/time_elapsed,▁████
time/total_timesteps,▁▄▅▇█
timestep,▁▄▅▇█

0,1
rollout/ep_len_mean,130.0
rollout/ep_rew_mean,0.0
rollout/exploration_rate,0.0
rollout/prioritized_er_beta,1.0
time/episodes,20.0
time/fps,55.0
time/time_elapsed,12.0
time/total_timesteps,704.0
timestep,704.0


In [None]:
def same_named(old_named, new_named):
    """Compare two sequences of ``(name, tensor)`` pairs position by position.

    Args:
        old_named: iterable of ``(name, tensor)`` pairs, e.g. the result of
            ``module.named_parameters()`` or ``module.named_buffers()``.
        new_named: iterable of the same kind for the other network.

    Returns:
        True if both iterables have the same number of entries and every pair
        of tensors has the same shape and is close according to
        ``torch.allclose``. Every mismatch is printed, so a single call
        reports all differing entries.
    """
    # materialize first: generators can neither be measured nor replayed
    old_items = list(old_named)
    new_items = list(new_named)

    same = True
    if len(old_items) != len(new_items):
        # zip() below would silently ignore the surplus entries
        same = False
        print(f'Unequal number of entries: {len(old_items)} != {len(new_items)}')

    for (old_name, old_value), (new_name, new_value) in zip(old_items, new_items):
        if old_value.shape != new_value.shape:
            # torch.allclose raises for shapes that cannot be broadcast
            same = False
            print(f'{old_name} != {new_name}: shapes {tuple(old_value.shape)} vs {tuple(new_value.shape)}')
        elif not torch.allclose(old_value, new_value):
            same = False
            print(f'{old_name} != {new_name}: {(old_value - new_value).abs().max().item()}')
    return same

def same_net(old_net, new_net):
    """Return True if two networks agree in both parameters and buffers.

    Both comparisons are always executed (no short-circuit), so mismatching
    buffers are reported even when the parameters already differ.
    """
    params_match = same_named(old_net.named_parameters(), new_net.named_parameters())
    buffers_match = same_named(old_net.named_buffers(), new_net.named_buffers())
    if not params_match:
        return params_match
    return buffers_match

def same_model(old_rainbow, new_rainbow):
    """Return True if policy and target networks of both implementations match.

    Args:
        old_rainbow: old implementation, exposing ``q_policy`` and ``q_target``.
        new_rainbow: new implementation, exposing ``q_net`` and ``q_net_target``.

    Both network pairs are always compared (via ``same_net``), so differences
    in the target network are printed even if the policy networks already
    differ.
    """
    # the networks live on the `new_rainbow` argument; the previous code read
    # them from an undefined name
    same_current = same_net(old_rainbow.q_policy, new_rainbow.q_net)
    same_target = same_net(old_rainbow.q_target, new_rainbow.q_net_target)
    return same_current and same_target

def same_replays(old_rainbow, new_rainbow):
    """Check that the replay buffers of both implementations hold the same data.

    Args:
        old_rainbow: old implementation; ``buffer.size`` is an attribute and
            ``buffer.data`` a sequence of
            ``(obs, next_obs, action, reward, done)`` entries.
        new_rainbow: new implementation; ``replay_buffer.size()`` is a method
            and the transitions are stored in the ``observations``,
            ``next_observations``, ``actions``, ``rewards`` and ``dones``
            arrays.

    Returns:
        True if sizes, buffer kind (uniform/prioritized), the stored
        transitions and - for prioritized buffers - the priority statistics
        agree. The first difference found is printed and False is returned.
    """
    old_buffer = old_rainbow.buffer
    new_buffer = new_rainbow.replay_buffer

    size = new_buffer.size()
    if old_buffer.size != size:
        print(f'Old replay buffer size {old_buffer.size} != new size {size}')
        return False
    if hasattr(old_buffer, 'max_priority') != hasattr(new_buffer, 'max_priority'):
        print('Using different replay buffers (Uniform/Priority)')
        return False
    priority = hasattr(old_buffer, 'max_priority')

    def stack(items):
        # one column of the old buffer: tensors are stacked, everything else
        # (e.g. lazy frames) is converted to numpy element by element
        if all(torch.is_tensor(item) for item in items):
            return torch.stack(list(items)).detach().cpu().numpy()
        return np.array([np.array(item) for item in items])

    def to_numpy(values):
        # one field of the new buffer
        if torch.is_tensor(values):
            return values.detach().cpu().numpy()
        return np.asarray(values)

    # an empty buffer has no columns to unpack, so there is nothing to compare
    if size > 0:
        o_obs, o_nobs, o_a, o_r, o_d = zip(*old_buffer.data[:size])
        old_fields = [stack(column) for column in (o_obs, o_nobs, o_a, o_r, o_d)]

        new_data = (new_buffer.observations, new_buffer.next_observations,
                    new_buffer.actions, new_buffer.rewards, new_buffer.dones)
        # slice (not index): only the first `size` entries are filled
        new_fields = [to_numpy(field[:size]) for field in new_data]

        names = ('observations', 'next_observations', 'actions', 'rewards', 'dones')
        for name, old_v, new_v in zip(names, old_fields, new_fields):
            # drop singleton axes so e.g. (n,) and (n, 1) layouts are comparable;
            # without this `==` would broadcast or raise on differing shapes
            old_v, new_v = np.squeeze(old_v), np.squeeze(new_v)
            if old_v.shape != new_v.shape:
                print(f'{name} have different shapes: {old_v.shape} vs {new_v.shape}')
                return False
            if not (old_v == new_v).all():
                print(f'{name} are not the same')
                return False

    if priority:
        if not np.allclose(old_buffer.max_priority, new_buffer.max_priority):
            print('Max priorities are not close')
            return False
        if not np.allclose(old_buffer.priority_sum, new_buffer.priority_sum):
            print('Priority sums are not close')
            return False
        if not np.allclose(old_buffer.priority_min, new_buffer.priority_min):
            print('Priority mins are not close')
            return False
    return True

In [None]:
def set_random(args):
    """Seed Python's, NumPy's and PyTorch's global RNGs with ``args.seed``."""
    for seed_rng in (random.seed, np.random.seed, torch.manual_seed):
        seed_rng(args.seed)

In [None]:
def set_up_env(args, old=False):
    """Seed all RNGs and create the vectorized environment.

    Args:
        args: configuration namespace; reads ``seed``, ``env_name``,
            ``parallel_envs`` and ``decorr``.
        old: if True, the environment factory and episode-length helper of the
            old implementation are used, otherwise those of the new one.

    Returns:
        Tuple ``(env, decorr_steps)``; ``decorr_steps`` is None when no
        decorrelation steps were computed.
    """
    set_random(args)

    # pick the helpers of the requested implementation; previously they were
    # selected but the new-implementation helpers were called unconditionally
    create = old_create_env if old else create_env
    ep_length = old_get_mean_ep_length if old else get_mean_ep_length

    decorr_steps = None
    if args.env_name == 'gym:Breakout':
        # fixed budget of 160 steps for Breakout, split across the parallel envs
        decorr_steps = 160 // args.parallel_envs
    if args.decorr and not args.env_name.startswith('procgen:') and decorr_steps is None:
        decorr_steps = ep_length(args) // args.parallel_envs
    env = create(args, decorr_steps=decorr_steps)

    return env, decorr_steps

In [None]:
def initialize_sb3(args):
    """Create a freshly seeded environment and the SB3-based Rainbow agent on it.

    All hyperparameters are taken from ``args``; the agent is returned without
    any training.
    """
    env = set_up_env(args)[0]

    policy_kwargs = dict(
        noisy_linear=args.noisy_linear,
        linear_kwargs=args.linear_kwargs,
        optimizer_kwargs={'eps': args.adam_eps},
        features_extractor_kwargs={'model_size': args.model_size},
    )

    return Rainbow(
        'CnnPolicy',
        env,
        buffer_size=args.buffer_size,
        batch_size=args.batch_size,
        learning_starts=args.burnin,
        target_update_interval=args.sync_dqn_target_every,
        policy_kwargs=policy_kwargs,
    )

In [None]:
def add_default_args(args):
    """Add the settings required by the old implementation to ``args``.

    ``args`` is modified in place and returned. Values derived from existing
    attributes: the schedule lengths (from ``training_frames``), the network
    architecture (from ``model_size``), the noisy-net settings (from
    ``noisy_linear`` / ``linear_kwargs``) and ``adam_eps``, which is only
    replaced when it is None.
    """
    adam_eps = args.adam_eps
    if adam_eps is None:
        adam_eps = 0.005/args.batch_size

    # key order matters: it determines the attribute order seen by vars(args)
    defaults = {
        'init_eps': 0.002,
        'final_eps': 0.0,
        'eps_decay_frames': max(int(0.002 * args.training_frames), 1),
        'prioritized_er_beta0': 0.45,
        'prioritized_er_time': args.training_frames,
        'use_amp': False,
        'network_arch': f'impala_large:{args.model_size}',
        'spectral_norm': 'all',
        'noisy_dqn': args.noisy_linear,
        'noisy_sigma0': args.linear_kwargs['sigma_0'],
        'double_dqn': True,
        'prioritized_er': True,
        'n_step': 3,
        'max_grad_norm': 10,
        'lr': 0.00025,
        'adam_eps': adam_eps,
        'lr_decay_steps': None,
        'loss_fn': 'huber',
        'train_count': 2,
    }
    for key, value in defaults.items():
        setattr(args, key, value)
    return args

In [None]:
def initialize_old(args):
    """Create the old environment and the old Rainbow agent on top of it.

    ``args`` is extended in place with the defaults the old implementation
    needs. Returns the tuple ``(rainbow, env)``.
    """
    old_env, _ = set_up_env(args, old=True)
    full_args = add_default_args(args)
    return old_Rainbow(old_env, full_args), old_env

In [None]:
def compare_initialization(args):
    """Assert that both implementations start from identical networks."""
    sb3_model = initialize_sb3(args)
    legacy_model = initialize_old(args)[0]

    assert same_model(legacy_model, sb3_model)

In [None]:
def train_sb3(args):
    """Create the SB3-based Rainbow agent, train it and return it.

    The agent is trained for ``args.training_frames`` steps with wandb
    (offline mode) and SB3 logging enabled. The logging-related attributes of
    ``args`` (``wandb_mode``, ``wandb_group``, ``name``, ``log_dir``,
    ``wandb_dir``, ``tb_log_dir``, ``wandb_grad_freq``) are overwritten.

    Returns:
        The trained model. Its environment and logger are closed and the wandb
        run is finished before returning, also if training raises.
    """
    model = initialize_sb3(args)

    args.wandb_mode = 'offline'
    args.wandb_group = 'baseline'
    args.name = 'no_train_test'
    args.log_dir = './runs/' + args.name + '/'
    args.wandb_dir = args.log_dir # 'wandb' added automatically
    args.tb_log_dir = args.log_dir + 'tensorboard'
    args.wandb_grad_freq = 0

    run = wandb.init(
        project="Rainbow",
        group=args.wandb_group,
        name=args.name,
        config=vars(args),
        dir=args.wandb_dir,
        mode=args.wandb_mode,
    )

    model.set_logger(configure_logger(0, args.tb_log_dir, args.name, True, extra_formats=[WandbOutputFormat]))

    callbacks = [WandbCallback(gradient_save_freq=args.wandb_grad_freq)]

    try:
        model.learn(args.training_frames, callback=callbacks, tb_log_name=args.name, progress_bar=True)
    finally:
        # Close the environment this model was created with. The previous code
        # closed the notebook-global `env`, which is a different instance.
        # NOTE(review): relies on Rainbow providing SB3's get_env() - confirm.
        model.get_env().close()
        model.logger.close()
        run.finish()

    return model

In [None]:
def train_old(args):
    """Run the training loop of the old Rainbow implementation.

    Creates the old environment and agent from ``args``, then steps all
    parallel environments until ``args.training_frames`` frames have been
    played. Once the replay buffer reports ``burnedin``, every iteration
    performs ``args.train_count`` training steps.

    Returns:
        The old Rainbow agent after the loop has finished.
    """
    rainbow, old_env = initialize_old(args)

    eps_schedule = LinearSchedule(0, initial_value=args.init_eps, final_value=args.final_eps, decay_time=args.eps_decay_frames)
    per_beta_schedule = LinearSchedule(0, initial_value=args.prioritized_er_beta0, final_value=1.0, decay_time=args.prioritized_er_time)

    # Initial observations. Without this reset `states` would be read before
    # assignment in the first loop iteration.
    states = old_env.reset()

    episode_count = 0
    returns = deque(maxlen=100)
    discounted_returns = deque(maxlen=10)
    losses = deque(maxlen=10)
    q_values = deque(maxlen=10)
    grad_norms = deque(maxlen=10)
    iter_times = deque(maxlen=10)
    reward_density = 0

    returns_all = []
    q_values_all = []

    # one iteration advances every parallel env by one step
    t = trange(0, args.training_frames + 1, args.parallel_envs)
    for game_frame in t:
        iter_start = time.time()
        eps = eps_schedule(game_frame)
        per_beta = per_beta_schedule(game_frame)

        # reset the noisy-nets noise in the policy
        if args.noisy_dqn:
            rainbow.reset_noise(rainbow.q_policy)

        # compute actions to take in all parallel envs, asynchronously start environment step
        actions = rainbow.act(states, eps)
        old_env.step_async(actions)

        # if training has started, perform args.train_count training steps, each on a batch of size args.batch_size
        if rainbow.buffer.burnedin:
            print('Trained')
            for train_iter in range(args.train_count):
                if args.noisy_dqn and train_iter > 0:
                    rainbow.reset_noise(rainbow.q_policy)
                q, loss, grad_norm = rainbow.train(args.batch_size, beta=per_beta)
                losses.append(loss)
                grad_norms.append(grad_norm)
                q_values.append(q)
                q_values_all.append((game_frame, q))

        # copy the Q-policy weights over to the Q-target net
        # (see also https://github.com/spragunr/deep_q_rl/blob/master/deep_q_rl/launcher.py#L155)
        if game_frame % args.sync_dqn_target_every == 0 and rainbow.buffer.burnedin:
            rainbow.sync_Q_target()

        # block until environments are ready, then collect transitions and add them to the replay buffer
        next_states, rewards, dones, infos = old_env.step_wait()

        for j, (state, action, reward, done) in enumerate(zip(states, actions, rewards, dones)):
            # exponential moving average of "this step had a non-zero reward"
            reward_density = 0.999 * reward_density + (1 - 0.999) * (reward != 0)
            rainbow.buffer.put(state, action, reward, done, j=j)
        states = next_states

        # collect statistics of every env that finished an episode
        for info in infos:
            if 'episode_metrics' in info:
                episode_metrics = info['episode_metrics']
                returns.append(episode_metrics['return'])
                returns_all.append((game_frame, episode_metrics['return']))
                discounted_returns.append(episode_metrics['discounted_return'])

                episode_count += 1

        if game_frame % (10_000-(10_000 % args.parallel_envs)) == 0:
            # np.mean of an empty sequence emits a RuntimeWarning; print nan directly instead
            mean_return = np.mean(returns) if returns else float('nan')
            print(f' [{game_frame:>8} frames, {episode_count:>5} episodes] running average return = {mean_return}')
            torch.cuda.empty_cache()

        iter_times.append(time.time() - iter_start)
        t.set_description(f' [{game_frame:>8} frames, {episode_count:>5} episodes]', refresh=False)

    return rainbow

In [None]:
def compare_pre_optimization(args):
    """Assert both implementations agree right before the first update.

    ``args.training_frames`` is overwritten with ``args.burnin - 5`` so that
    the procedure stops before training starts; afterwards networks and replay
    buffers of both implementations must match.
    """
    frames_before_training = args.burnin - 5
    args.training_frames = frames_before_training

    sb3_model = train_sb3(args)
    legacy_model = train_old(args)

    assert same_model(legacy_model, sb3_model)
    assert same_replays(legacy_model, sb3_model)

In [None]:
def compare_first_update(args):
    """Assert both implementations agree after the first update.

    ``args.training_frames`` is overwritten with ``args.burnin`` so that the
    procedure stops after the first update; afterwards networks and replay
    buffers of both implementations must match.
    """
    frames_until_first_update = args.burnin
    args.training_frames = frames_until_first_update

    sb3_model = train_sb3(args)
    legacy_model = train_old(args)

    assert same_model(legacy_model, sb3_model)
    assert same_replays(legacy_model, sb3_model)

In [9]:
#from old_rainbow.common import argp
from old_rainbow.common.rainbow import Rainbow as old_Rainbow
from old_rainbow.common.env_wrappers import create_env as old_create_env #, BASE_FPS_ATARI, BASE_FPS_PROCGEN
from old_rainbow.common.utils import LinearSchedule
from old_rainbow.common.utils import get_mean_ep_length as old_get_mean_ep_length

In [10]:
random.seed(args.seed)
np.random.seed(args.seed)
torch.manual_seed(args.seed)

<torch._C.Generator at 0x1e585eb1230>

In [11]:
args.init_eps = 0.002
args.final_eps = 0.0
args.eps_decay_frames = max(int(0.002 * args.training_frames), 1)
args.prioritized_er_beta0 = 0.45
args.prioritized_er_time = args.training_frames

In [12]:
eps_schedule = LinearSchedule(0, initial_value=args.init_eps, final_value=args.final_eps, decay_time=args.eps_decay_frames)
per_beta_schedule = LinearSchedule(0, initial_value=args.prioritized_er_beta0, final_value=1.0, decay_time=args.prioritized_er_time)

In [13]:
# Create the vectorized environment for the old implementation. This mirrors
# the env creation for the new implementation but uses the old_* helpers.
print(f'Creating', args.parallel_envs, 'and decorrelating environment instances. This may take up to a few minutes.. ', end='')
decorr_steps = None
if args.env_name == 'gym:Breakout':
    # fixed budget of 160 steps for Breakout, split across the parallel envs
    decorr_steps = 160 // args.parallel_envs
if args.decorr and not args.env_name.startswith('procgen:') and decorr_steps is None:
    decorr_steps = old_get_mean_ep_length(args) // args.parallel_envs
old_env = old_create_env(args, decorr_steps=decorr_steps)
# initial observations consumed by the manual training loop further down
states = old_env.reset()
print('Done.')

In [14]:
# Additional settings read by the old implementation (same values as the
# defaults assigned in add_default_args for the current configuration).
args.use_amp = False
args.network_arch = 'impala_large:2'
args.spectral_norm = 'all'
args.noisy_dqn = True
args.noisy_sigma0 = 0.5
args.double_dqn = True
args.prioritized_er = True
args.n_step = 3
args.max_grad_norm = 10
args.lr = 0.00025
# overwrites the None assigned in the shared configuration cell
args.adam_eps = 0.005/args.batch_size
args.lr_decay_steps = None
args.loss_fn = 'huber'
args.train_count = 2

In [15]:
rainbow = old_Rainbow(old_env, args)

In [78]:
# Manual training loop for the old implementation. It relies on `rainbow`,
# `old_env`, `states`, `eps_schedule` and `per_beta_schedule` created in the
# cells above.
# Re-enable the two `transitions` lines to record every step for later replay.
#transitions = []

episode_count = 0
returns = deque(maxlen=100)
discounted_returns = deque(maxlen=10)
losses = deque(maxlen=10)
q_values = deque(maxlen=10)
grad_norms = deque(maxlen=10)
iter_times = deque(maxlen=10)
reward_density = 0

returns_all = []
q_values_all = []

# one iteration advances every parallel env by one step, hence the stride
t = trange(0, args.training_frames + 1, args.parallel_envs)
for game_frame in t:
    iter_start = time.time()
    eps = eps_schedule(game_frame)
    per_beta = per_beta_schedule(game_frame)

    # reset the noisy-nets noise in the policy
    if args.noisy_dqn:
        rainbow.reset_noise(rainbow.q_policy)

    # compute actions to take in all parallel envs, asynchronously start environment step
    actions = rainbow.act(states, eps)
    old_env.step_async(actions)

    # if training has started, perform args.train_count training steps, each on a batch of size args.batch_size
    if rainbow.buffer.burnedin:
        for train_iter in range(args.train_count):
            # the noise was already reset above for the first training step
            if args.noisy_dqn and train_iter > 0: rainbow.reset_noise(rainbow.q_policy)
            q, loss, grad_norm = rainbow.train(args.batch_size, beta=per_beta)
            losses.append(loss)
            grad_norms.append(grad_norm)
            q_values.append(q)
            q_values_all.append((game_frame, q))

    # copy the Q-policy weights over to the Q-target net
    # (see also https://github.com/spragunr/deep_q_rl/blob/master/deep_q_rl/launcher.py#L155)
    if game_frame % args.sync_dqn_target_every == 0 and rainbow.buffer.burnedin:
        rainbow.sync_Q_target()

    # block until environments are ready, then collect transitions and add them to the replay buffer
    next_states, rewards, dones, infos = old_env.step_wait()
    
    #transitions.append((states, actions, rewards, dones))
    
    for state, action, reward, done, j in zip(states, actions, rewards, dones, range(args.parallel_envs)):
        # exponential moving average of "this step had a non-zero reward"
        reward_density = 0.999 * reward_density + (1 - 0.999) * (reward != 0)
        rainbow.buffer.put(state, action, reward, done, j=j)
    states = next_states

    # if any of the envs finished an episode, collect its statistics
    for info, j in zip(infos, range(args.parallel_envs)):
        if 'episode_metrics' in info.keys():
            episode_metrics = info['episode_metrics']
            returns.append(episode_metrics['return'])
            returns_all.append((game_frame, episode_metrics['return']))
            discounted_returns.append(episode_metrics['discounted_return'])

            episode_count += 1
            
    # periodic progress report; `returns` may still be empty here, in which
    # case np.mean warns and yields nan
    if game_frame % (10_000-(10_000 % args.parallel_envs)) == 0:
        print(f' [{game_frame:>8} frames, {episode_count:>5} episodes] running average return = {np.mean(returns)}')
        torch.cuda.empty_cache()

    iter_times.append(time.time() - iter_start)
    t.set_description(f' [{game_frame:>8} frames, {episode_count:>5} episodes]', refresh=False)

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


 [     704 frames,     1 episodes]: 100%|██████████████████████████████████████████████| 12/12 [00:01<00:00,  7.25it/s]


In [79]:
same_params(rainbow, model)

True

In [80]:
same_buffers(rainbow, model)

True

In [25]:
model.q_net.features_extractor.main[0].residual_0.conv_0.weight_u

tensor([-0.3160,  0.0357,  0.3253, -0.0654,  0.1095, -0.2212,  0.0019, -0.1697,
         0.2777,  0.0009, -0.0990,  0.1113,  0.1754,  0.0104, -0.1125,  0.2511,
        -0.2908, -0.1716,  0.2703, -0.0557, -0.0961, -0.0856,  0.1631,  0.1496,
        -0.0656, -0.1673, -0.1235, -0.0628, -0.3446,  0.0097,  0.0038,  0.2590],
       device='cuda:0')

In [26]:
model.q_net_target.features_extractor.main[0].residual_0.conv_0.weight_u

tensor([-0.3962, -0.0636,  0.1503, -0.2544,  0.1020, -0.0771,  0.1308, -0.0972,
         0.1890,  0.1150, -0.1789,  0.0398, -0.0186,  0.0095, -0.0066,  0.0507,
         0.0041,  0.1236,  0.2921,  0.2957, -0.0081,  0.1791,  0.1737,  0.1036,
        -0.1971, -0.0676, -0.4756, -0.0704,  0.0238, -0.1666, -0.2429, -0.0118],
       device='cuda:0')

In [27]:
rainbow.q_policy.main[0].residual_0.conv_0.weight_u

tensor([-0.3160,  0.0357,  0.3253, -0.0654,  0.1095, -0.2212,  0.0019, -0.1697,
         0.2777,  0.0009, -0.0990,  0.1113,  0.1754,  0.0104, -0.1125,  0.2511,
        -0.2908, -0.1716,  0.2703, -0.0557, -0.0961, -0.0856,  0.1631,  0.1496,
        -0.0656, -0.1673, -0.1235, -0.0628, -0.3446,  0.0097,  0.0038,  0.2590],
       device='cuda:0')

In [28]:
rainbow.q_target.main[0].residual_0.conv_0.weight_u

tensor([-0.3962, -0.0636,  0.1503, -0.2544,  0.1020, -0.0771,  0.1308, -0.0972,
         0.1890,  0.1150, -0.1789,  0.0398, -0.0186,  0.0095, -0.0066,  0.0507,
         0.0041,  0.1236,  0.2921,  0.2957, -0.0081,  0.1791,  0.1737,  0.1036,
        -0.1971, -0.0676, -0.4756, -0.0704,  0.0238, -0.1666, -0.2429, -0.0118],
       device='cuda:0')

In [25]:
ysdssw

NameError: name 'ysdssw' is not defined

In [None]:
# Bring the buffers of the new model in line with the old one before
# comparing their behavior.
sync_norm_buffers(rainbow, model)

# reset noise as it changes during transition generation

# Both implementations are re-seeded identically right before their noise is
# reset and their target network is synchronized.
random.seed(args.seed)
np.random.seed(args.seed)
torch.manual_seed(args.seed)

rainbow.reset_noise(rainbow.q_policy)
rainbow.sync_Q_target()

random.seed(args.seed)
np.random.seed(args.seed)
torch.manual_seed(args.seed)

reset_noise(model.q_net)
# load_state_dict returns a report of missing/unexpected keys, which is printed
print(model.q_net_target.load_state_dict(model.q_net.state_dict()))

same_buffers(rainbow, model)

In [None]:
# add all transitions to the buffer

# NOTE(review): `transitions` is only populated when the commented-out
# `transitions` lines in the manual training loop are re-enabled.
# The loop rebinds the global `states`.
for states, actions, rewards, dones in transitions:
    # next observation and infos are passed as None
    model.replay_buffer.add(states, None, actions, rewards, dones, None)

In [27]:
model.replay_buffer.size() == rainbow.buffer.size

True

In [28]:
model.replay_buffer._sum() == rainbow.buffer._sum()

True

In [None]:
# Put the new model at the same point in training as the manual loop of the
# old implementation (`game_frame` is the loop variable left over from it).
model._game_frame = game_frame
model.num_timesteps = model._game_frame
model._total_timesteps = args.training_frames

# recompute the remaining progress and derive both schedule values from it
model._update_current_progress_remaining(model.num_timesteps, model._total_timesteps)
model.exploration_rate = model.exploration_schedule(model._current_progress_remaining)
model.per_beta = model.per_beta_schedule(model._current_progress_remaining)

In [25]:
model.exploration_rate == eps

True

In [26]:
model.per_beta == per_beta

True

In [23]:
random.seed(args.seed)
np.random.seed(args.seed)
torch.manual_seed(args.seed)

samps = rainbow.buffer.sample(args.batch_size, per_beta)
samps = rainbow.buffer.sample(args.batch_size, per_beta)

In [24]:
random.seed(args.seed)
np.random.seed(args.seed)
torch.manual_seed(args.seed)

samples = model.replay_buffer.sample(args.batch_size, per_beta, env=model._vec_normalize_env)
samples = model.replay_buffer.sample(args.batch_size, per_beta, env=model._vec_normalize_env)

In [27]:
len(samps)

3

In [28]:
len(samples)

3

In [29]:
(samps[0] == samples[0]).all()

True

In [30]:
(samps[1] == samples[1]).all()

True

In [31]:
len(samps[2])

5

In [32]:
for a, b in zip(samps[2], samples[2]):
    print(a.shape, b.shape)

In [33]:
(samps[2][0] == samples[2][0]).all()

tensor(True, device='cuda:0')

In [34]:
(samps[2][1] == samples[2][2]).all()

tensor(True, device='cuda:0')

In [35]:
(samps[2][2] == samples[2][1].squeeze()).all()

tensor(True, device='cuda:0')

In [36]:
(samps[2][3] == samples[2][4].squeeze()).all()

tensor(True, device='cuda:0')

In [37]:
(samps[2][4] == samples[2][3].squeeze()).all()

tensor(True, device='cuda:0')

In [25]:
pres = rainbow.q_policy(samps[2][0])

In [26]:
preds = model.q_net(samples[2].observations)

In [27]:
(pres == preds).all()

tensor(True, device='cuda:0')

In [41]:
rainbow.q_policy.training

True

In [3]:
import numpy as np

In [8]:
a = np.full(10, None)

In [9]:
a

array([None, None, None, None, None, None, None, None, None, None],
      dtype=object)

In [10]:
a[1] = np.random.randn(3, 3)

In [11]:
a[3] = np.random.randn(3, 3)

In [12]:
a[4] = np.random.randn(3, 3)

In [None]:
def get_arrays(x):
    """Placeholder that is not implemented yet; ignores ``x`` and returns None."""
    return None

In [25]:
import torch

In [None]:
torch.stack()

In [54]:
np.stack(a[[1, 3, 4]])

array([[[-2.26861735,  0.03342497, -0.25643305],
        [-0.48733942, -0.54318875, -0.06592383],
        [-0.34461205, -0.46507621,  0.2386487 ]],

       [[ 0.39604212, -1.18159655,  0.58223131],
        [ 1.30900018, -1.35385212, -0.09264297],
        [-0.80046118,  1.35699888, -1.34499777]],

       [[ 0.85492775, -1.15575657, -2.68788771],
        [-0.49871643, -0.28261214, -1.16439925],
        [ 1.51580178,  0.52357223, -0.85979991]]])

In [28]:
temp_np = np.random.randn(10000)
list_temp_np = [temp_np for _ in range(10000)]

In [33]:
%timeit torch.stack([torch.from_numpy(l) for l in list_temp_np])

208 ms ± 17.5 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [34]:
%timeit torch.from_numpy(np.stack(list_temp_np))

189 ms ± 6.46 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [35]:
j = torch.stack([torch.from_numpy(l) for l in list_temp_np])

In [55]:
k = torch.from_numpy(np.stack(a[[1, 3, 4]]))

In [46]:
(j == k).all()

tensor(False)

In [50]:
k[0,0] += 1

In [51]:
j

tensor([[ 1.2796,  1.2687,  0.4557,  ..., -0.5271, -0.3413,  0.6196],
        [ 0.2796,  1.2687,  0.4557,  ..., -0.5271, -0.3413,  0.6196],
        [ 0.2796,  1.2687,  0.4557,  ..., -0.5271, -0.3413,  0.6196],
        ...,
        [ 0.2796,  1.2687,  0.4557,  ..., -0.5271, -0.3413,  0.6196],
        [ 0.2796,  1.2687,  0.4557,  ..., -0.5271, -0.3413,  0.6196],
        [ 0.2796,  1.2687,  0.4557,  ..., -0.5271, -0.3413,  0.6196]],
       dtype=torch.float64)

In [56]:
k

tensor([[[-2.2686,  0.0334, -0.2564],
         [-0.4873, -0.5432, -0.0659],
         [-0.3446, -0.4651,  0.2386]],

        [[ 0.3960, -1.1816,  0.5822],
         [ 1.3090, -1.3539, -0.0926],
         [-0.8005,  1.3570, -1.3450]],

        [[ 0.8549, -1.1558, -2.6879],
         [-0.4987, -0.2826, -1.1644],
         [ 1.5158,  0.5236, -0.8598]]], dtype=torch.float64)

In [53]:
list_temp_np[:3]

[array([ 0.2796441 ,  1.2687103 ,  0.45571017, ..., -0.52710646,
        -0.3413414 ,  0.61959864]),
 array([ 0.2796441 ,  1.2687103 ,  0.45571017, ..., -0.52710646,
        -0.3413414 ,  0.61959864]),
 array([ 0.2796441 ,  1.2687103 ,  0.45571017, ..., -0.52710646,
        -0.3413414 ,  0.61959864])]

In [45]:
pres = rainbow.q_target(samps[2][0])

tensor(746709., device='cuda:0', grad_fn=<SumBackward0>)
tensor(250476.9688, device='cuda:0', grad_fn=<SumBackward0>)
tensor(841444.4375, device='cuda:0', grad_fn=<SumBackward0>)
tensor(-193947.7500, device='cuda:0', grad_fn=<SumBackward0>)
tensor(275521.9375, device='cuda:0', grad_fn=<SumBackward0>)
tensor(119160.1172, device='cuda:0', grad_fn=<SumBackward0>)
tensor(336543.1250, device='cuda:0', grad_fn=<SumBackward0>)
tensor(8973.5264, device='cuda:0', grad_fn=<SumBackward0>)
tensor(60527.5430, device='cuda:0', grad_fn=<SumBackward0>)
tensor(-1717.0886, device='cuda:0', grad_fn=<SumBackward0>)
tensor(72680.1641, device='cuda:0', grad_fn=<SumBackward0>)
tensor(33962.7812, device='cuda:0', grad_fn=<SumBackward0>)


In [46]:
preds = model.q_net_target(samples[2].observations)

tensor(746709., device='cuda:0', grad_fn=<SumBackward0>)
tensor(250476.9688, device='cuda:0', grad_fn=<SumBackward0>)
tensor(841444.4375, device='cuda:0', grad_fn=<SumBackward0>)
tensor(-193947.7500, device='cuda:0', grad_fn=<SumBackward0>)
tensor(275521.9375, device='cuda:0', grad_fn=<SumBackward0>)
tensor(119160.1172, device='cuda:0', grad_fn=<SumBackward0>)
tensor(336543.1250, device='cuda:0', grad_fn=<SumBackward0>)
tensor(8973.5264, device='cuda:0', grad_fn=<SumBackward0>)
tensor(60527.5430, device='cuda:0', grad_fn=<SumBackward0>)
tensor(-1717.0886, device='cuda:0', grad_fn=<SumBackward0>)
tensor(72680.1641, device='cuda:0', grad_fn=<SumBackward0>)
tensor(33962.7812, device='cuda:0', grad_fn=<SumBackward0>)


In [47]:
(pres == preds).all()

tensor(True, device='cuda:0')

In [33]:
same_params(rainbow, model), same_buffers(rainbow, model)

(True, True)

In [36]:
(model.replay_buffer.priority_min == rainbow.buffer.priority_min)#.all()

False

In [37]:
(model.replay_buffer.priority_sum == rainbow.buffer.priority_sum)#.all()

False

In [30]:
model.replay_buffer.priority_sum[1]

392.9686983525753

392.9686983637512

392.9686983637512

392.9686983525753

In [31]:
rainbow.buffer.priority_sum[1]

392.9686984429062

392.9686984429062

392.9686984429062

In [33]:
new_p1 == new_p2

tensor([True, True, True, True, True, True, True, True, True, True, True, True,
        True, True, True, True, True, True, True, True, True, True, True, True,
        True, True, True, True, True, True, True, True, True, True, True, True,
        True, True, True, True, True, True, True, True, True, True, True, True,
        True, True, True, True, True, True, True, True, True, True, True, True,
        True, True, True, True, True, True, True, True, True, True, True, True,
        True, True, True, True, True, True, True, True, True, True, True, True,
        True, True, True, True, True, True, True, True, True, True, True, True,
        True, True, True, True, True, True, True, True, True, True, True, True,
        True, True, True, True, True, True, True, True, True, True, True, True,
        True, True, True, True, True, True, True, True, True, True, True, True,
        True, True, True, True, True, True, True, True, True, True, True, True,
        True, True, True, True, True, Tr

In [81]:
def same_params(old_rainbow, new_rainbow, limit=9173844 // 20):
    """Count parameter entries that differ between the old and the new agent.

    The parameters of ``old_rainbow.q_policy`` and ``old_rainbow.q_target``
    are flattened (in that order) into one vector and compared element-wise
    with ``new_rainbow.policy.parameters_to_vector()``.

    Args:
        old_rainbow: Object exposing ``q_policy`` and ``q_target`` modules.
        new_rainbow: Object whose ``policy.parameters_to_vector()`` returns
            the flattened parameters as an array.
        limit: Only the first ``limit`` entries are compared. The default,
            ``9173844 // 20``, is a twentieth of the vector length observed
            in this notebook; pass ``None`` to compare the whole vector.

    Returns:
        ``False`` if the two vectors differ in length (a message is printed),
        otherwise the number of compared entries that are not exactly equal,
        so ``0`` means the compared prefix is identical.
    """
    old_modules = (old_rainbow.q_policy, old_rainbow.q_target)
    old_params = torch.nn.utils.parameters_to_vector(
        [param for module in old_modules for param in module.parameters()]
    ).detach().cpu().numpy()
    new_params = new_rainbow.policy.parameters_to_vector()
    if len(new_params) != len(old_params):
        print('Unequal lengths:', len(new_params), len(old_params))
        return False
    # A count (rather than a bool) shows how far the two models have drifted.
    return ((old_params - new_params)[:limit] != 0).sum()

def same_buffers(old_rainbow, new_rainbow):
    """Check whether the old and the new agent hold identical buffers.

    The buffers of ``old_rainbow.q_policy`` and ``old_rainbow.q_target`` are
    flattened (in that order) into one vector and compared element-wise with
    the flattened buffers of ``new_rainbow.policy``.

    Args:
        old_rainbow: Object exposing ``q_policy`` and ``q_target`` modules.
        new_rainbow: Object exposing a ``policy`` module.

    Returns:
        ``False`` if the two vectors differ in length (a message is printed),
        otherwise whether every pair of entries is exactly equal.
    """
    old_buffers = torch.nn.utils.parameters_to_vector(
        list(old_rainbow.q_policy.buffers()) + list(old_rainbow.q_target.buffers())
    ).detach().cpu().numpy()
    # Read from the ``new_rainbow`` argument. The earlier version used the
    # notebook-global ``model`` here, silently ignoring the argument.
    new_buffers = torch.nn.utils.parameters_to_vector(
        new_rainbow.policy.buffers()
    ).detach().cpu().numpy()
    if len(new_buffers) != len(old_buffers):
        print('Unequal lengths:', len(new_buffers), len(old_buffers))
        return False
    return (old_buffers == new_buffers).all()

In [82]:
same_params(rainbow, model), same_buffers(rainbow, model)

(0, True)

In [120]:
a = torch.nn.utils.parameters_to_vector(model.q_net.features_extractor.main[2].residual_1.conv_1.parameters())

In [125]:
b = torch.nn.utils.parameters_to_vector(rainbow.q_policy.main[2].residual_1.conv_1.parameters())

In [122]:
(a != b).sum()

tensor(205, device='cuda:0')

In [135]:
list(rainbow.q_policy.main[2].residual_1.conv_1.parameters())[0].grad == list(model.q_net.features_extractor.main[2].residual_1.conv_1.parameters())[0].grad

tensor([ True, False,  True,  True,  True,  True,  True,  True,  True,  True,
        False,  True, False, False,  True, False,  True, False,  True,  True,
         True, False, False, False,  True, False,  True,  True,  True,  True,
         True, False, False,  True,  True,  True,  True,  True,  True, False,
         True, False,  True,  True,  True, False,  True,  True, False,  True,
         True,  True,  True,  True,  True,  True,  True, False,  True,  True,
         True,  True,  True,  True], device='cuda:0')

In [31]:
list(map(lambda l: ((l[0][1].grad - l[1].grad).abs().max().item(), l[0][0]), zip(model.q_net.named_parameters(), rainbow.q_policy.parameters())))

[(1.4551915228366852e-10, 'features_extractor.main.0.conv.weight'),
 (1.7462298274040222e-10, 'features_extractor.main.0.conv.bias'),
 (8.731149137020111e-11, 'features_extractor.main.0.residual_0.conv_0.bias'),
 (2.3283064365386963e-10,
  'features_extractor.main.0.residual_0.conv_0.weight_orig'),
 (1.3096723705530167e-10, 'features_extractor.main.0.residual_0.conv_1.bias'),
 (2.0372681319713593e-10,
  'features_extractor.main.0.residual_0.conv_1.weight_orig'),
 (1.1641532182693481e-10, 'features_extractor.main.0.residual_1.conv_0.bias'),
 (2.0372681319713593e-10,
  'features_extractor.main.0.residual_1.conv_0.weight_orig'),
 (2.3283064365386963e-10, 'features_extractor.main.0.residual_1.conv_1.bias'),
 (1.2369127944111824e-10,
  'features_extractor.main.0.residual_1.conv_1.weight_orig'),
 (1.7462298274040222e-10, 'features_extractor.main.1.conv.weight'),
 (1.7462298274040222e-10, 'features_extractor.main.1.conv.bias'),
 (1.1641532182693481e-10, 'features_extractor.main.1.residual_0.c

In [32]:
def 

In [32]:
grad1 =  rainbow.q_policy.main[2].residual_1.conv_1.bias.grad.clone()

In [44]:
grad1

tensor([-1.6236e-03, -5.3778e-04, -5.6266e-05,  0.0000e+00, -5.2314e-05,
        -8.2292e-05,  0.0000e+00,  1.1867e-03,  6.7422e-04,  6.6862e-04,
        -4.6065e-04, -7.3275e-05, -5.4489e-04,  1.1104e-03,  5.7649e-04,
        -9.3309e-04, -2.3133e-04,  9.2959e-06,  1.2393e-04,  0.0000e+00,
        -5.2042e-04, -6.1884e-05, -7.4515e-05, -3.7119e-04, -1.6360e-04,
        -1.8515e-04,  1.4474e-03, -2.5232e-03, -1.9661e-04, -2.3318e-04,
        -3.5856e-06,  5.1181e-04, -1.6746e-06,  8.1364e-04,  7.9730e-09,
         1.7285e-05, -4.4675e-04, -7.5733e-04, -2.0996e-05,  8.6999e-05,
        -9.0510e-05, -4.4699e-04,  1.8645e-03, -7.1447e-04, -4.2238e-04,
         3.9382e-04,  8.4147e-05, -6.8688e-04,  3.4405e-05, -1.7494e-04,
        -9.3331e-04, -8.2519e-04,  0.0000e+00, -6.2001e-04, -1.2787e-03,
         8.1953e-05,  5.9411e-04, -4.7243e-04, -1.0874e-03, -1.0010e-03,
        -3.4161e-04,  1.9612e-04,  3.1001e-04,  4.2435e-04], device='cuda:0')

In [43]:
grad2 =  rainbow.q_policy.main[2].residual_1.conv_1.bias.grad.clone()

In [45]:
grad1 - grad2

tensor([ 0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
         0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
         0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
         0.0000e+00,  0.0000e+00,  7.2760e-12,  0.0000e+00,  0.0000e+00,
         5.8208e-11, -7.2760e-12,  0.0000e+00,  0.0000e+00,  0.0000e+00,
         0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
        -7.2760e-12,  0.0000e+00,  0.0000e+00,  5.8208e-11,  0.0000e+00,
         0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00, -7.2760e-12,
         0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
         0.0000e+00,  0.0000e+00,  5.8208e-11,  0.0000e+00,  0.0000e+00,
         0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
         0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
         0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00], device='cuda:0')

In [155]:
model.q_net.features_extractor.main[2].residual_1.conv_1.weight_orig.grad - rainbow.q_policy.main[2].residual_1.conv_1.weight_orig.grad

tensor([[[[ 0.0000e+00,  0.0000e+00,  0.0000e+00],
          [ 0.0000e+00,  0.0000e+00,  0.0000e+00],
          [-1.4552e-11,  0.0000e+00,  1.4552e-11]],

         [[ 0.0000e+00,  0.0000e+00,  0.0000e+00],
          [ 0.0000e+00,  0.0000e+00,  0.0000e+00],
          [ 0.0000e+00,  0.0000e+00,  0.0000e+00]],

         [[-3.4106e-13, -6.8212e-13,  0.0000e+00],
          [ 0.0000e+00,  1.4211e-13,  0.0000e+00],
          [ 0.0000e+00,  0.0000e+00, -2.2737e-13]],

         ...,

         [[ 0.0000e+00,  0.0000e+00,  0.0000e+00],
          [ 7.2760e-12,  7.2760e-12,  0.0000e+00],
          [ 0.0000e+00,  0.0000e+00,  1.3642e-12]],

         [[ 0.0000e+00,  0.0000e+00,  0.0000e+00],
          [ 0.0000e+00,  0.0000e+00,  0.0000e+00],
          [ 0.0000e+00,  0.0000e+00,  0.0000e+00]],

         [[ 0.0000e+00,  0.0000e+00,  0.0000e+00],
          [ 0.0000e+00,  0.0000e+00,  7.2760e-12],
          [ 3.6380e-12,  0.0000e+00,  0.0000e+00]]],


        [[[ 0.0000e+00,  0.0000e+00,  0.0000e+00],
  

In [112]:
rainbow.q_policy.main

Sequential(
  (0): ImpalaCNNBlock(
    (conv): Conv2d(4, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (max_pool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
    (residual_0): ImpalaCNNResidual(
      (relu): ReLU()
      (conv_0): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (conv_1): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    )
    (residual_1): ImpalaCNNResidual(
      (relu): ReLU()
      (conv_0): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (conv_1): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    )
  )
  (1): ImpalaCNNBlock(
    (conv): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (max_pool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
    (residual_0): ImpalaCNNResidual(
      (relu): ReLU()
      (conv_0): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
     

In [97]:
len(a)

389024

In [61]:
len(model.policy.parameters_to_vector())

9173844

In [78]:
9173844//20

458692

In [83]:
class LO:
    """Minimal logger stand-in whose ``record`` prints whatever it receives."""

    def record(*values, **options):
        # There is deliberately no explicit ``self``: when called on an
        # instance, the instance is the first positional value and is
        # printed along with the rest. Keyword arguments print as one dict.
        print(*values, options)
model.set_logger(LO())

In [84]:
# Run a single training step of the old `rainbow` agent from a freshly seeded
# RNG state (Python, NumPy and torch are all re-seeded with args.seed).
# NOTE(review): presumably seeded identically to the `model.train(...)` cell
# so that both implementations can be compared step by step — confirm.
random.seed(args.seed)
np.random.seed(args.seed)
torch.manual_seed(args.seed)

# train_iter is pinned to 0, so the noise reset on the next line is skipped.
train_iter = 0
if args.noisy_dqn and train_iter > 0: rainbow.reset_noise(rainbow.q_policy)
# One update on a batch of args.batch_size; the three results are displayed.
q1, loss1, grad_norm1 = rainbow.train(args.batch_size, beta=per_beta)
q1, loss1, grad_norm1
# Disabled second step (train_iter = 1 would trigger the noise reset) and the
# averaging of both steps' results:
#train_iter = 1
#if args.noisy_dqn and train_iter > 0: rainbow.reset_noise(rainbow.q_policy)
#q2, loss2, grad_norm2 = rainbow.train(args.batch_size, beta=per_beta)
#
#np.mean([q1, q2]), np.mean([loss1, loss2]), np.mean([grad_norm1, grad_norm2])

ValueError: not enough values to unpack (expected 4, got 3)

In [85]:
random.seed(args.seed)
np.random.seed(args.seed)
torch.manual_seed(args.seed)

model.train(1, args.batch_size)

In [32]:
stuff1[0] == stuff2[0]

tensor([True, True, True, True, True, True, True, True, True, True, True, True,
        True, True, True, True, True, True, True, True, True, True, True, True,
        True, True, True, True, True, True, True, True, True, True, True, True,
        True, True, True, True, True, True, True, True, True, True, True, True,
        True, True, True, True, True, True, True, True, True, True, True, True,
        True, True, True, True, True, True, True, True, True, True, True, True,
        True, True, True, True, True, True, True, True, True, True, True, True,
        True, True, True, True, True, True, True, True, True, True, True, True,
        True, True, True, True, True, True, True, True, True, True, True, True,
        True, True, True, True, True, True, True, True, True, True, True, True,
        True, True, True, True, True, True, True, True, True, True, True, True,
        True, True, True, True, True, True, True, True, True, True, True, True,
        True, True, True, True, True, Tr

In [33]:
stuff1[1] == stuff2[1]

tensor([True, True, True, True, True, True, True, True, True, True, True, True,
        True, True, True, True, True, True, True, True, True, True, True, True,
        True, True, True, True, True, True, True, True, True, True, True, True,
        True, True, True, True, True, True, True, True, True, True, True, True,
        True, True, True, True, True, True, True, True, True, True, True, True,
        True, True, True, True, True, True, True, True, True, True, True, True,
        True, True, True, True, True, True, True, True, True, True, True, True,
        True, True, True, True, True, True, True, True, True, True, True, True,
        True, True, True, True, True, True, True, True, True, True, True, True,
        True, True, True, True, True, True, True, True, True, True, True, True,
        True, True, True, True, True, True, True, True, True, True, True, True,
        True, True, True, True, True, True, True, True, True, True, True, True,
        True, True, True, True, True, Tr

In [34]:
stuff1[2] == stuff2[2]

tensor([True, True, True, True, True, True, True, True, True, True, True, True,
        True, True, True, True, True, True, True, True, True, True, True, True,
        True, True, True, True, True, True, True, True, True, True, True, True,
        True, True, True, True, True, True, True, True, True, True, True, True,
        True, True, True, True, True, True, True, True, True, True, True, True,
        True, True, True, True, True, True, True, True, True, True, True, True,
        True, True, True, True, True, True, True, True, True, True, True, True,
        True, True, True, True, True, True, True, True, True, True, True, True,
        True, True, True, True, True, True, True, True, True, True, True, True,
        True, True, True, True, True, True, True, True, True, True, True, True,
        True, True, True, True, True, True, True, True, True, True, True, True,
        True, True, True, True, True, True, True, True, True, True, True, True,
        True, True, True, True, True, Tr

In [35]:
stuff1[3] == stuff2[3]

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,

In [86]:
same_params(rainbow, model), same_buffers(rainbow, model)

(10168, True)

In [34]:
mp = list(model.q_net.features_extractor.main[0].parameters())
rp = list(rainbow.q_policy.main[0].parameters())

In [35]:
torch.allclose(mp[0], rp[0])

True

In [38]:
rainbow.scaler()

False

tensor([[[-0.0953,  0.1084,  0.0024],
         [-0.0296, -0.0273,  0.0071],
         [-0.0108,  0.0708,  0.0487]],

        [[ 0.1056, -0.0464,  0.0657],
         [-0.1111,  0.1646,  0.1293],
         [-0.0690, -0.0660, -0.1419]],

        [[ 0.0789, -0.0522,  0.0628],
         [-0.0917,  0.1513,  0.1576],
         [ 0.1217,  0.1185, -0.0790]],

        [[ 0.1305,  0.0805,  0.1509],
         [-0.1165,  0.0761,  0.1590],
         [-0.0063, -0.0776,  0.0744]]], device='cuda:0',
       grad_fn=<SelectBackward0>)

In [33]:
class DummyModel:
    """Adapter that offers a ``predict`` method on top of an object with ``act``."""

    def __init__(self, rb):
        # Wrapped agent; only its ``act`` method is used by this class.
        self.rb = rb

    def predict(self, observation, state=None, episode_start=None, deterministic=False):
        """Return ``(actions, None)`` for ``observation``.

        ``state``, ``episode_start`` and ``deterministic`` are accepted but
        ignored; the wrapped ``act`` is always called with ``0`` as its
        second argument.
        """
        return self.rb.act(observation, 0), None

In [26]:
# Evaluate the old agent (wrapped in DummyModel) on a freshly created old-style
# environment with the time limit lowered to 10_000 for the duration of the run.
# NOTE(review): the third argument (64) is presumably the number of evaluation
# episodes — confirm against evaluate_policy's signature.
args.time_limit = 10_000
print(evaluate_policy(DummyModel(rainbow), old_create_env(args, decorr_steps=decorr_steps), 64))
# Set the limit to 108_000 afterwards; this line is skipped if the evaluation raises.
args.time_limit = 108_000

Truncated episode due to time limit!
Truncated episode due to time limit!
Truncated episode due to time limit!
Truncated episode due to time limit!
Truncated episode due to time limit!
Truncated episode due to time limit!
Truncated episode due to time limit!
Truncated episode due to time limit!
Truncated episode due to time limit!
Truncated episode due to time limit!
Truncated episode due to time limit!
Truncated episode due to time limit!
Truncated episode due to time limit!
Truncated episode due to time limit!
Truncated episode due to time limit!
Truncated episode due to time limit!
Truncated episode due to time limit!
Truncated episode due to time limit!
Truncated episode due to time limit!
Truncated episode due to time limit!
Truncated episode due to time limit!
Truncated episode due to time limit!
Truncated episode due to time limit!
Truncated episode due to time limit!
Truncated episode due to time limit!
Truncated episode due to time limit!
Truncated episode due to time limit!
T

In [18]:
# same as long as no training

In [60]:
same_params(rainbow, model)

False

In [61]:
same_buffers(rainbow, model)

False

In [62]:
same_states(old_env, env)

False

In [59]:
state = env.venv.envs[0].np_random.get_state()

In [60]:
old_state = old_env.venv.envs[0].np_random.get_state()

In [62]:
(state[1] == old_state[1]).all()

True

In [61]:
state[2:] == old_state[2:]

False

In [63]:
state[2:]

(5, 0, 0.0)

In [64]:
old_state[2:]

(3, 0, 0.0)

In [66]:
np.array(no).shape

(188, 4, 84, 84, 1)

In [26]:
from collections import deque

In [27]:
q = deque(maxlen=3+1)

In [37]:
q.append(5)

In [40]:
q[-1]

5

In [45]:
o, no, a, r, d = list(zip(*rainbow.buffer.data[:rainbow.buffer.size]))

In [46]:
(np.array(o) == model.replay_buffer.observations[:model.replay_buffer.size()]).all()

True

In [47]:
(np.array([np.array(h) for h in no]) == model.replay_buffer.next_observations[:model.replay_buffer.size()]).all()

True

In [48]:
(torch.stack(a).detach().cpu().numpy() == model.replay_buffer.actions[:model.replay_buffer.size()]).all()

True

In [49]:
(torch.concat(r).detach().cpu().numpy() == model.replay_buffer.rewards[:model.replay_buffer.size()]).all()

True

False

In [48]:
torch.concat(r).detach().cpu().numpy()[:10]

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0.], dtype=float32)

In [46]:
model.replay_buffer.rewards[:10]

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0.], dtype=float32)

In [30]:
np.array(o).shape

(482, 4, 84, 84, 1)

In [31]:
model.replay_buffer.observations.shape

(512, 4, 84, 84, 1)

In [31]:
(len(transitions)-3)*64

128

188

188

In [34]:
model.replay_buffer.actions[:188]

array([[3],
       [3],
       [3],
       [3],
       [3],
       [3],
       [3],
       [3],
       [3],
       [3],
       [3],
       [3],
       [3],
       [3],
       [3],
       [3],
       [0],
       [3],
       [3],
       [3],
       [3],
       [3],
       [3],
       [3],
       [3],
       [3],
       [3],
       [3],
       [3],
       [3],
       [3],
       [3],
       [3],
       [3],
       [3],
       [3],
       [3],
       [3],
       [3],
       [3],
       [3],
       [3],
       [3],
       [3],
       [3],
       [3],
       [3],
       [3],
       [3],
       [3],
       [3],
       [3],
       [0],
       [3],
       [3],
       [3],
       [3],
       [3],
       [3],
       [3],
       [3],
       [3],
       [3],
       [3],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
    

In [26]:
(torch.stack(a).detach().cpu().numpy() == model.replay_buffer.actions[:125])#[:20]

array([[ True],
       [ True],
       [ True],
       [ True],
       [ True],
       [ True],
       [ True],
       [ True],
       [ True],
       [ True],
       [ True],
       [ True],
       [ True],
       [ True],
       [ True],
       [ True],
       [ True],
       [ True],
       [ True],
       [ True],
       [ True],
       [ True],
       [ True],
       [ True],
       [ True],
       [ True],
       [ True],
       [ True],
       [ True],
       [ True],
       [ True],
       [ True],
       [ True],
       [ True],
       [ True],
       [ True],
       [ True],
       [ True],
       [ True],
       [ True],
       [ True],
       [ True],
       [ True],
       [ True],
       [ True],
       [ True],
       [ True],
       [ True],
       [ True],
       [ True],
       [ True],
       [ True],
       [ True],
       [ True],
       [ True],
       [ True],
       [ True],
       [ True],
       [ True],
       [ True],
       [ True],
       [ True],
       [

In [82]:
# Step the environment 500 times with the constant action 2 and check that the
# new model and the old agent pick identical actions for each observation.
for _ in range(500):
    o, r, d, i = env.step([2]*args.parallel_envs)
    # Python's RNG is re-seeded before each of the two calls so both start
    # from the same random state (presumably for exploration — confirm).
    random.seed(args.seed)
    model_pred = model.predict(o, None, None, False)[0]
    random.seed(args.seed)
    rain_pred = rainbow.act(o, model.exploration_rate).numpy()
    # Stop at the first step where any action differs.
    if not (model_pred == rain_pred).all():
        print('Not Same!')
        break

In [81]:
model.exploration_rate

0.002

In [54]:
rainbow.q_policy#.parameters()

ImpalaCNNLarge(
  (main): Sequential(
    (0): ImpalaCNNBlock(
      (conv): Conv2d(4, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (max_pool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
      (residual_0): ImpalaCNNResidual(
        (relu): ReLU()
        (conv_0): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
        (conv_1): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      )
      (residual_1): ImpalaCNNResidual(
        (relu): ReLU()
        (conv_0): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
        (conv_1): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      )
    )
    (1): ImpalaCNNBlock(
      (conv): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (max_pool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
      (residual_0): ImpalaCNNResidual(
        (relu): ReLU()
        (conv_0): Conv2

In [16]:
model.q_net

RainbowNetwork(
  (features_extractor): ImpalaCNNLarge(
    (main): Sequential(
      (0): ImpalaCNNBlock(
        (conv): Conv2d(4, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
        (max_pool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
        (residual_0): ImpalaCNNResidual(
          (relu): ReLU()
          (conv_0): Conv2d(16, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
          (conv_1): Conv2d(16, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
        )
        (residual_1): ImpalaCNNResidual(
          (relu): ReLU()
          (conv_0): Conv2d(16, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
          (conv_1): Conv2d(16, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
        )
      )
      (1): ImpalaCNNBlock(
        (conv): Conv2d(16, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
        (max_pool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
        (

In [17]:
old_params = list(rainbow.q_policy.main.parameters())

In [18]:
new_params = list(model.q_net.features_extractor.main[:-2].parameters())

In [19]:
len(old_params) == len(new_params)

True

In [20]:
get_shapes = lambda l: list(map(lambda a: a.shape, l))

In [21]:
get_shapes(old_params) == get_shapes(new_params)

True

In [28]:
all(map(lambda c: torch.equal(*c), zip(old_params, new_params)))

True

In [41]:
same_params(rainbow, model)

False

In [None]:
actions = [2]*env.num_envs

In [159]:
old_env.step(actions)

([<gym.wrappers.frame_stack.LazyFrames at 0x1549b66bf90>,
  <gym.wrappers.frame_stack.LazyFrames at 0x1549b66b360>],
 array([0., 0.], dtype=float32),
 array([False, False]),
 [{'lives': 5,
   'episode_frame_number': 187,
   'frame_number': 187,
   'actual_rewards': [0.0, 0.0, 0.0, 0.0]},
  {'lives': 5,
   'episode_frame_number': 176,
   'frame_number': 176,
   'actual_rewards': [0.0, 0.0, 0.0, 0.0]}])

In [160]:
env.step(actions)

([<gym.wrappers.frame_stack.LazyFrames at 0x1549b6521d0>,
  <gym.wrappers.frame_stack.LazyFrames at 0x1549bf444a0>],
 array([0., 0.], dtype=float32),
 array([False, False]),
 [{'lives': 5,
   'episode_frame_number': 187,
   'frame_number': 187,
   'actual_rewards': [0.0, 0.0, 0.0, 0.0]},
  {'lives': 5,
   'episode_frame_number': 176,
   'frame_number': 176,
   'actual_rewards': [0.0, 0.0, 0.0, 0.0]}])

In [24]:
model.policy.parameters_to_vector()[:10]

array([-0.09638923,  0.10707786,  0.00097233, -0.03044413, -0.02866514,
        0.00550009, -0.01170665,  0.06939051,  0.04697701,  0.1045445 ],
      dtype=float32)

In [None]:
import numpy as np

In [7]:
np.ndarray

  self.pbar = tqdm(total=self.locals["total_timesteps"] - self.model.num_timesteps)


Output()

<rainbow.Rainbow at 0x18838858550>

In [3]:
original_weights = model.policy.parameters_to_vector()

In [4]:
import cProfile

In [5]:
cProfile.run('model.learn(750, progress_bar=True)', 'learnstats2')

Output()

In [6]:
import pstats
from pstats import SortKey

In [7]:
p = pstats.Stats('learnstats2')

In [15]:
p.strip_dirs().sort_stats(SortKey.CUMULATIVE).print_stats(10)

Mon Jul 17 09:33:27 2023    learnstats2

         15766431 function calls (14198453 primitive calls) in 239.984 seconds

   Ordered by: cumulative time
   List reduced from 1186 to 10 due to restriction <10>

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
      7/1    0.000    0.000  239.988  239.988 {built-in method builtins.exec}
        1    0.000    0.000  239.988  239.988 rainbow.py:339(learn)
        1    0.056    0.056  239.988  239.988 off_policy_algorithm.py:327(learn)
      494    0.292    0.001  232.484    0.471 rainbow.py:254(train)
    48412   80.382    0.002   80.382    0.002 {method 'item' of 'torch._C._TensorBase' objects}
    18844   69.879    0.004   69.879    0.004 {method 'copy_' of 'torch._C._TensorBase' objects}
187226/4446    0.821    0.000   32.304    0.007 module.py:1124(_call_impl)
     3458    0.039    0.000   32.154    0.009 policies.py:77(forward)
     8892    0.134    0.000   30.419    0.003 container.py:137(forward)
     3458    0

<pstats.Stats at 0x207228b02e0>

Mon Jul 17 08:35:53 2023    learnstats

         14820821 function calls (13252843 primitive calls) in 235.965 seconds

   Ordered by: cumulative time
   List reduced from 1180 to 10 due to restriction <10>

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
      7/1    0.000    0.000  235.968  235.968 {built-in method builtins.exec}
        1    0.000    0.000  235.968  235.968 rainbow.py:334(learn)
        1    0.052    0.052  235.968  235.968 off_policy_algorithm.py:327(learn)
      494    0.406    0.001  228.386    0.462 rainbow.py:251(train)
    48412   81.469    0.002   81.469    0.002 {method 'item' of 'torch._C._TensorBase' objects}
     1482   71.051    0.048   71.051    0.048 {method 'cpu' of 'torch._C._TensorBase' objects}
187226/4446    0.840    0.000   32.029    0.007 module.py:1124(_call_impl)
     3458    0.037    0.000   31.725    0.009 policies.py:77(forward)
     8892    0.129    0.000   30.065    0.003 container.py:137(forward)
     3458    0.03

<pstats.Stats at 0x12736138550>

In [14]:
p.strip_dirs().print_callees('rainbow.py:254\(train\)')

   Random listing order was used
   List reduced from 1186 to 1 due to restriction <'rainbow.py:254\\(train\\)'>

Function               called...
                           ncalls  tottime  cumtime
rainbow.py:254(train)  ->    1482    0.003    0.062  <__array_function__ internals>:177(mean)
                              988    0.011    7.128  _tensor.py:340(backward)
                             1976    0.001    0.001  base_class.py:273(logger)
                              494    0.003    0.004  base_class.py:306(_update_learning_rate)
                              988    1.945    9.936  buffer.py:288(update_priorities)
                              988    2.044   22.416  buffer.py:295(sample)
                              988    0.208    4.224  clip_grad.py:9(clip_grad_norm_)
                             1482    0.013   18.935  grad_mode.py:24(decorate_context)
                              988    0.004    0.004  grad_mode.py:126(__init__)
                              988    0.002 

<pstats.Stats at 0x207228b02e0>

   Random listing order was used
   List reduced from 1180 to 1 due to restriction <'rainbow.py:251\\(train\\)'>

Function               called...
                           ncalls  tottime  cumtime
rainbow.py:251(train)  ->    1482    0.003    0.059  <__array_function__ internals>:177(mean)
                              988    0.011    6.959  _tensor.py:340(backward)
                             1976    0.001    0.001  base_class.py:273(logger)
                              494    0.002    0.004  base_class.py:306(_update_learning_rate)
                              988    0.480    4.561  buffer.py:288(update_priorities)
                              988    1.999   21.942  buffer.py:295(sample)
                              988    0.210    4.148  clip_grad.py:9(clip_grad_norm_)
                             1482    0.013   18.281  grad_mode.py:24(decorate_context)
                             1976    0.002    0.002  logger.py:491(record)
                             1976    0.020   11.

<pstats.Stats at 0x12736138550>

In [28]:
model2 = Rainbow('CnnPolicy', 'BreakoutDeterministic-v4', buffer_size=512, learning_starts=256, policy_kwargs={'noisy_linear': True, 'linear_kwargs': {'sigma_0': 0.5}, 'optimizer_kwargs': {'eps': None}, 'features_extractor_kwargs': {'model_size': 1}})

In [29]:
new_weights = model2.policy.parameters_to_vector()

In [None]:
cProfile.run('model2.learn(750, progress_bar=True)', 'learnstats2')

Output()

In [10]:
# Roll out `model` for 1000 steps with deterministic actions on its own
# environment, rendering every step in "human" mode.
vec_env = model.get_env()
obs = vec_env.reset()
for i in range(1000):
    action, _state = model.predict(obs, deterministic=True)
    obs, reward, done, info = vec_env.step(action)
    vec_env.render("human")
    # VecEnv resets automatically
    # if done:
    #   obs = vec_env.reset()

  logger.warn(


In [9]:
model.

466