In [1]:
"""
Copyright (c) Meta Platforms, Inc. and affiliates.

This source code is licensed under the CC BY-NC license found in the
LICENSE.md file in the root directory of this source tree.
"""

#from torch.utils.tensorboard import SummaryWriter
import argparse
import pickle
import random
import time
import gym

import torch
import numpy as np

from datasets import load_from_disk
import datasets

import utils
from replay_buffer import ReplayBuffer
from lamb import Lamb
#from stable_baselines3.common.vec_env import SubprocVecEnv
from pathlib import Path
from data import create_dataloader
from decision_transformer.models.decision_transformer import DecisionTransformer
from evaluation import create_vec_eval_episodes_fn, vec_evaluate_episode_rtg
from trainer import SequenceTrainer, SequenceTrainerCustom
from logger import Logger
from wrappers_custom import *
from utils_.helpers import *

from citylearn.citylearn import CityLearnEnv
from citylearn.wrappers import *
from utils_.variant_dict import variant


In [2]:
class self:
    a = 3
self = self()

In [18]:
env = CityLearnEnv(schema="citylearn_challenge_2022_phase_2")
env.central_agent = True
env = NormalizedObservationWrapper(env)
env = StableBaselines3WrapperCustom(env)

In [19]:
env.reset()

(array([0.0669873 , 0.8535534 , 1.        , 0.54135334, 0.47744358,
        0.6466165 , 0.54135334, 0.82222223, 0.7888889 , 0.64444447,
        0.7888889 , 0.        , 0.0245821 , 0.94788593, 0.        ,
        0.        , 0.10493179, 0.85519415, 0.        , 0.47462252,
        0.03030304, 0.03030304, 0.03030304, 0.03030304, 0.22008501,
        0.        , 0.        , 0.5091632 , 0.10645503, 0.        ,
        0.        , 0.4610448 , 0.        , 0.        , 0.        ,
        0.40333527, 0.0835057 , 0.        , 0.        , 0.47982168,
        0.06343947, 0.        , 0.        , 0.4165131 ], dtype=float32),
 {})

In [20]:
def _get_env_spec(env):
    state_dim = env.observation_space.shape[0]
    act_dim = env.action_space.shape[0]
    action_range = [
            float(env.action_space.low.min()) ,
            float(env.action_space.high.max()) ,
        ]
    return state_dim,act_dim, action_range

In [21]:
self.state_dim, self.act_dim, self.action_range = _get_env_spec(env)

In [22]:
self.action_range

[-0.78125, 0.78125]

## Load Dataset

In [23]:
def _load_dataset(trajectories):
    states, traj_lens, returns = [], [], []
    for path in trajectories:
        states.append(path["observations"])
        traj_lens.append(len(path["observations"]))
        returns.append(np.array(path["rewards"]).sum())
    traj_lens, returns = np.array(traj_lens), np.array(returns)

        # used for input normalization
    states = np.concatenate(states, axis=0)
    state_mean, state_std = np.mean(states, axis=0), np.std(states, axis=0) + 1e-6
    num_timesteps = sum(traj_lens)

    print("=" * 50)
    print(f"Starting new experiment: city_learn")
    print(f"{len(traj_lens)} trajectories, {num_timesteps} timesteps found")
    print(f"Average return: {np.mean(returns):.2f}, std: {np.std(returns):.2f}")
    print(f"Max return: {np.max(returns):.2f}, min: {np.min(returns):.2f}")
    print(f"Average length: {np.mean(traj_lens):.2f}, std: {np.std(traj_lens):.2f}")
    print(f"Max length: {np.max(traj_lens):.2f}, min: {np.min(traj_lens):.2f}")
    print("=" * 50)

    sorted_inds = np.argsort(returns)  # lowest to highest
    num_trajectories = 1
    timesteps = traj_lens[sorted_inds[-1]]
    ind = len(trajectories) - 2
    while ind >= 0 and timesteps + traj_lens[sorted_inds[ind]] < num_timesteps:
        timesteps += traj_lens[sorted_inds[ind]]
        num_trajectories += 1
        ind -= 1
    sorted_inds = sorted_inds[-num_trajectories:]
    print(sorted_inds)
    #print(trajectories[1])
    for ii in sorted_inds:
        print(ii)
    #print(trajectories[0].keys())
    trajectories = [trajectories[int(ii)] for ii in sorted_inds]

    for trajectory in trajectories:
        for key in trajectory.keys():
            trajectory[key] = np.array(trajectory[key])


    return trajectories, state_mean, state_std

In [24]:
dataset_path = "./data_interactions/sac_dataset.pkl"

In [25]:
dataset = load_from_disk(dataset_path)

In [26]:
dataset

Dataset({
    features: ['observations', 'next_observations', 'actions', 'rewards', 'dones', 'info'],
    num_rows: 30000
})

In [27]:
dataset,_ = segment_v2(dataset["observations"],dataset["actions"],dataset["rewards"],dataset["dones"])
   

Segmenting:   0%|          | 0/30000 [00:00<?, ?it/s]

In [28]:
dataset

[{'observations': array([[0.0669873 , 0.85355341, 1.        , ..., 0.        , 0.        ,
          0.48985395],
         [0.25      , 0.85355341, 0.98296291, ..., 0.        , 0.55495465,
          0.68397069],
         [0.25      , 0.85355341, 0.93301272, ..., 0.        , 0.59836298,
          0.49188823],
         ...,
         [0.0669873 , 0.85355341, 0.75      , ..., 0.        , 0.51978582,
          0.69413728],
         [0.0669873 , 0.85355341, 0.85355341, ..., 0.        , 0.96642524,
          0.73476398],
         [0.0669873 , 0.85355341, 0.93301272, ..., 0.        , 0.98913306,
          0.58467376]]),
  'actions': array([[-0.10103911, -0.6676929 ,  0.62797022, -0.46519962,  0.58497357],
         [ 0.45946574, -0.42295289,  0.39063358, -0.02685601,  0.04764682],
         [ 0.24261248,  0.40671182,  0.58524048, -0.32961681, -0.64024198],
         ...,
         [-0.39171493, -0.70435035, -0.42655581, -0.51389134,  0.53532851],
         [-0.46240404,  0.04107219,  0.13878572,  0

In [29]:
trajectories = datasets.Dataset.from_dict({k: [s[k] for s in dataset] for k in dataset[0].keys()})


In [30]:
trajectories

Dataset({
    features: ['observations', 'actions', 'rewards', 'dones'],
    num_rows: 4
})

In [31]:
self.offline_trajs, self.state_mean, self.state_std = _load_dataset(trajectories)

Starting new experiment: city_learn
4 trajectories, 30000 timesteps found
Average return: -6148.22, std: 1638.95
Max return: -3352.98, min: -7544.23
Average length: 7500.00, std: 2180.65
Max length: 8759.00, min: 3723.00
[1 2 3]
1
2
3


In [34]:
state_mean = torch.tensor(self.state_mean, device="cpu", dtype=torch.float32)
state_mean

tensor([0.5225, 0.4299, 0.5000, 0.4262, 0.4261, 0.4261, 0.4260, 0.6998, 0.6998,
        0.6997, 0.6997, 0.2019, 0.2019, 0.2018, 0.2018, 0.2111, 0.2111, 0.2111,
        0.2111, 0.4074, 0.1916, 0.1915, 0.1915, 0.1915, 0.1463, 0.2082, 0.0495,
        0.4175, 0.1566, 0.1927, 0.4486, 0.4479, 0.1336, 0.1925, 0.5922, 0.4744,
        0.1789, 0.1932, 0.4688, 0.4846, 0.2032, 0.2085, 0.6247, 0.4693])

In [36]:
self.replay_buffer = ReplayBuffer(1, self.offline_trajs)

In [37]:
self.replay_buffer

<replay_buffer.ReplayBuffer at 0x16cb1a070>

In [38]:
len(self.replay_buffer.trajectories)

1

In [39]:
self.aug_trajs = []

In [40]:
self.device = "cpu"
self.target_entropy = -self.act_dim
MAX_EPISODE_LEN = 8760

In [41]:
self.target_entropy

-5

In [42]:
variant["embed_dim"]

512

In [43]:
self.model = DecisionTransformer(
            state_dim=self.state_dim,
            act_dim=self.act_dim,
            action_range=self.action_range,
            max_length=variant["K"],
            eval_context_length=variant["eval_context_length"],
            max_ep_len=MAX_EPISODE_LEN,
            hidden_size=variant["embed_dim"],
            n_layer=variant["n_layer"],
            n_head=variant["n_head"],
            n_inner=4 * variant["embed_dim"],
            activation_function=variant["activation_function"],
            n_positions=1024,
            resid_pdrop=variant["dropout"],
            attn_pdrop=variant["dropout"],
            n_ctx = 60,
            stochastic_policy=True,
            ordering=variant["ordering"],
            init_temperature=variant["init_temperature"],
            target_entropy=self.target_entropy,
        ).to(device=self.device)

In [44]:
self.optimizer = Lamb(
            self.model.parameters(),
            lr=variant["learning_rate"],
            weight_decay=variant["weight_decay"],
            eps=1e-8,
        )
self.scheduler = torch.optim.lr_scheduler.LambdaLR(
            self.optimizer, lambda steps: min((steps + 1) / variant["warmup_steps"], 1)
        )
self.log_temperature_optimizer = torch.optim.Adam(
            [self.model.log_temperature],
            lr=1e-4,
            betas=[0.9, 0.999],
        )

In [45]:
self.pretrain_iter = 0
self.online_iter = 0
self.total_transitions_sampled = 0
self.variant = variant
self.reward_scale = 1.0 
self.logger = Logger(variant)

Experiment log path: ./exp/2024.06.08/160923-default


In [46]:
def loss_fn(
            a_hat_dist,
            a,
            attention_mask,
            entropy_reg,
        ):
            # a_hat is a SquashedNormal Distribution
    log_likelihood = a_hat_dist.log_likelihood(a)[attention_mask > 0].mean()

    entropy = a_hat_dist.entropy().mean()
    loss = -(log_likelihood + entropy_reg * entropy)

    return (
                loss,
                -log_likelihood,
                entropy,
            )

In [47]:
def pretrain(self, eval_envs, loss_fn):
    print("\n\n\n*** Pretrain ***")

   

    trainer = SequenceTrainer(
            model=self.model,
            optimizer=self.optimizer,
            log_temperature_optimizer=self.log_temperature_optimizer,
            scheduler=self.scheduler,
            device=self.device,
        )

    
    while self.pretrain_iter < 1:
            # in every iteration, prepare the data loader
        dataloader = create_dataloader(
                trajectories=self.offline_trajs,
                num_iters=self.variant["num_updates_per_pretrain_iter"],
                batch_size=self.variant["batch_size"],
                max_len=self.variant["K"],
                state_dim=self.state_dim,
                act_dim=self.act_dim,
                state_mean=self.state_mean,
                state_std=self.state_std,
                reward_scale=self.reward_scale,
                action_range=self.action_range,
            )

        train_outputs = trainer.train_iteration(
                loss_fn=loss_fn,
                dataloader=dataloader,
            )
        eval_outputs, eval_reward = self.evaluate(eval_fns)
        outputs = {"time/total": time.time() - self.start_time}
        outputs.update(train_outputs)
        outputs.update(eval_outputs)
        self.logger.log_metrics(
                outputs,
                iter_num=self.pretrain_iter,
                total_transitions_sampled=self.total_transitions_sampled,
                writer=writer,
            )

        self._save_model(
                path_prefix=self.logger.log_path,
                is_pretrain_model=True,
            )

        self.pretrain_iter += 1

## Pretrain

In [48]:
trainer = SequenceTrainerCustom(
            model=self.model,
            optimizer=self.optimizer,
            log_temperature_optimizer=self.log_temperature_optimizer,
            scheduler=self.scheduler,
            device=self.device,
        )

In [49]:
dataloader = create_dataloader(
                trajectories=self.offline_trajs,
                num_iters=self.variant["num_updates_per_pretrain_iter"],
                batch_size=self.variant["batch_size"],
                max_len=self.variant["K"],
                state_dim=self.state_dim,
                act_dim=self.act_dim,
                state_mean=self.state_mean,
                state_std=self.state_std,
                reward_scale=self.reward_scale,
                action_range=self.action_range,
            )
## Remember n_ctx have to follow the query, key, value 



In [50]:
"""
train_outputs = trainer.train_iteration(
                loss_fn=loss_fn,
                dataloader=dataloader,
            )
"""

'\ntrain_outputs = trainer.train_iteration(\n                loss_fn=loss_fn,\n                dataloader=dataloader,\n            )\n'

In [51]:
self._save_model(path_prefix = self.logger.log_path, is_pretrain_model = True)

AttributeError: 'self' object has no attribute '_save_model'

In [52]:
self.log_temperature_optimizer.state_dict()

{'state': {},
 'param_groups': [{'lr': 0.0001,
   'betas': [0.9, 0.999],
   'eps': 1e-08,
   'weight_decay': 0,
   'amsgrad': False,
   'maximize': False,
   'foreach': None,
   'capturable': False,
   'differentiable': False,
   'fused': None,
   'params': [0]}]}

In [53]:
self.variant["max_pretrain_iters"]

1

In [54]:
len(dataloader)

5000

In [55]:
total_params = sum(p.numel() for p in self.model.parameters())
total_params

17152567

In [56]:
self.variant["online_rtg"]

7200

In [57]:
self.variant["num_online_rollouts"]

1

## Augment Trajectories 


In [58]:
target_return = -6000

## Evaluate Episode RTG

In [59]:
self.model.eval()
self.model.to(device="cpu")

DecisionTransformer(
  (transformer): GPT2Model(
    (wte): Embedding(1, 512)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-3): 4 x Block(
        (ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (attn): Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (mlp): MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): ReLU()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
  )
  (embed_timestep): Embedding(8760, 512)
  (embed_return): Linear(in_features=1, out_features=512, bias=True)
  (embed_state): Linear(in_features=44, out_features=512, bias=True)
  (embed_action): Linear(in_features=5, out_features=512, bias=

In [60]:
state_mean = torch.from_numpy(self.state_mean).to(device=self.device)
state_std = torch.from_numpy(self.state_std).to(device=self.device)

In [61]:
aug_env = CityLearnEnv(schema="citylearn_challenge_2022_phase_2")
aug_env.central_agent = True
aug_env = NormalizedObservationWrapper(aug_env)
aug_env = StableBaselines3WrapperCustom(aug_env)

In [62]:
state,_ = aug_env.reset()

In [63]:
num_envs = 1 

In [64]:
states = (
        torch.from_numpy(state)
        .reshape(num_envs, self.state_dim)
        .to(device=self.device, dtype=torch.float32)
    ).reshape(num_envs, -1, self.state_dim)

In [65]:
states

tensor([[[0.0670, 0.8536, 1.0000, 0.5414, 0.4774, 0.6466, 0.5414, 0.8222,
          0.7889, 0.6444, 0.7889, 0.0000, 0.0246, 0.9479, 0.0000, 0.0000,
          0.1049, 0.8552, 0.0000, 0.4746, 0.0303, 0.0303, 0.0303, 0.0303,
          0.2201, 0.0000, 0.0000, 0.5092, 0.1065, 0.0000, 0.0000, 0.4610,
          0.0000, 0.0000, 0.0000, 0.4033, 0.0835, 0.0000, 0.0000, 0.4798,
          0.0634, 0.0000, 0.0000, 0.4165]]])

In [66]:
actions = torch.zeros(0, device=self.device, dtype=torch.float32)
rewards = torch.zeros(0, device=self.device, dtype=torch.float32)

ep_return = target_return
target_return = torch.tensor(ep_return, device=self.device, dtype=torch.float32).reshape(
        num_envs, -1, 1
    )
timesteps = torch.tensor([0] * num_envs, device=self.device, dtype=torch.long).reshape(
        num_envs, -1
    )

In [67]:
timesteps

tensor([[0]])

In [68]:
episode_return = np.zeros((num_envs, 1)).astype(float)
episode_length = np.full(num_envs, np.inf)

In [69]:
episode_length

array([inf])

In [70]:
unfinished = np.ones(num_envs).astype(bool)

In [71]:
max_ep_len = 10

In [72]:
act_dim = self.act_dim
state_dim = self.state_dim 
device = self.device
model = self.model
use_mean = True
mode="normal"
reward_scale = 1 

In [73]:
for t in range(max_ep_len):
        # add padding
    actions = torch.cat(
            [
                actions,
                torch.zeros((num_envs, act_dim), device=device).reshape(
                    num_envs, -1, act_dim
                ),
            ],
            dim=1,
        )
    #print(actions)
    rewards = torch.cat(
            [
                rewards,
                torch.zeros((num_envs, 1), device=device).reshape(num_envs, -1, 1),
            ],
            dim=1,
        )

    state_pred, action_dist, reward_pred = model.get_predictions(
            (states.to(dtype=torch.float32) - state_mean) / state_std,
            actions.to(dtype=torch.float32),
            rewards.to(dtype=torch.float32),
            target_return.to(dtype=torch.float32),
            timesteps.to(dtype=torch.long),
            num_envs=num_envs,
        )
    state_pred = state_pred.detach().cpu().numpy().reshape(num_envs, -1)
    reward_pred = reward_pred.detach().cpu().numpy().reshape(num_envs)

        # the return action is a SquashNormal distribution
    action = action_dist.sample().reshape(num_envs, -1, act_dim)[:, -1]
    if use_mean:
        action = action_dist.mean.reshape(num_envs, -1, act_dim)[:, -1]
    action = action.clamp(*model.action_range)
    #print(action.detach().cpu().numpy())

    state, reward, done, _,_ = aug_env.step(action.detach().cpu().numpy()[0])

        # eval_env.step() will execute the action for all the sub-envs, for those where
        # the episodes have terminated, the envs will be reset. Hence we use
        # "unfinished" to track whether the first episode we roll out for each sub-env is
        # finished. In contrast, "done" only relates to the current episode
    print(reward)
    episode_return += reward

    actions[:, -1] = action
    print(actions)
    state = (
            torch.from_numpy(state).to(device=device).reshape(num_envs, -1, state_dim)
        )
    states = torch.cat([states, state], dim=1)
    #reward = torch.from_numpy(reward).to(device=device).reshape(num_envs, 1)
    rewards[:, -1] = reward

    if mode != "delayed":
        pred_return = target_return[:, -1] - (reward * reward_scale)
    else:
        pred_return = target_return[:, -1]
    target_return = torch.cat(
            [target_return, pred_return.reshape(num_envs, -1, 1)], dim=1
        )

    timesteps = torch.cat(
            [
                timesteps,
                torch.ones((num_envs, 1), device=device, dtype=torch.long).reshape(
                    num_envs, 1
                )
                * (t + 1),
            ],
            dim=1,
        )

    if t == max_ep_len - 1:
        done = done
        ind = 0
        episode_length[ind] = np.minimum(episode_length[ind], t + 1)
        
    if np.any(done):
        ind = np.where(done)[0]
        print("ind " + str(ind))
        unfinished[ind] = False
        episode_length[ind] = np.minimum(episode_length[ind], t + 1)

    if not np.any(unfinished):
        break

-1.185349941253662
tensor([[[-0.3673, -0.6925, -0.1066,  0.0391, -0.6722]]], grad_fn=<CopySlices>)
-1.2106000185012817
tensor([[[-0.3673, -0.6925, -0.1066,  0.0391, -0.6722],
         [-0.0409, -0.7217, -0.1225,  0.1506, -0.6952]]], grad_fn=<CopySlices>)
-4.137480735778809
tensor([[[-0.3673, -0.6925, -0.1066,  0.0391, -0.6722],
         [-0.0409, -0.7217, -0.1225,  0.1506, -0.6952],
         [ 0.4102, -0.4528, -0.4400,  0.1944, -0.2847]]], grad_fn=<CopySlices>)
-5.337705612182617
tensor([[[-0.3673, -0.6925, -0.1066,  0.0391, -0.6722],
         [-0.0409, -0.7217, -0.1225,  0.1506, -0.6952],
         [ 0.4102, -0.4528, -0.4400,  0.1944, -0.2847],
         [ 0.4343,  0.3511, -0.4904, -0.4765, -0.1629]]], grad_fn=<CopySlices>)
-2.2948639392852783
tensor([[[-0.3673, -0.6925, -0.1066,  0.0391, -0.6722],
         [-0.0409, -0.7217, -0.1225,  0.1506, -0.6952],
         [ 0.4102, -0.4528, -0.4400,  0.1944, -0.2847],
         [ 0.4343,  0.3511, -0.4904, -0.4765, -0.1629],
         [ 0.6255,  0.6

In [74]:
action_dist

SquashedNormal()

In [73]:
trajectories = []
for ii in range(num_envs):
    ep_len = episode_length[ii].astype(int)
    terminals = np.zeros(ep_len)
    terminals[-1] = 1
    traj = {
            "observations": states[ii].detach().cpu().numpy()[:ep_len],
            "actions": actions[ii].detach().cpu().numpy()[:ep_len],
            "rewards": rewards[ii].detach().cpu().numpy()[:ep_len],
            "terminals": terminals,
        }
    trajectories.append(traj)

In [74]:
#trajectories

In [125]:
returns,lengths,trajs = episode_return.reshape(num_envs),episode_length.reshape(num_envs),trajectories

In [128]:
np.mean(lengths)

10.0

In [77]:
trajs

[{'observations': array([[6.69872984e-02, 8.53553414e-01, 1.00000000e+00, 5.41353345e-01,
          4.77443576e-01, 6.46616518e-01, 5.41353345e-01, 8.22222233e-01,
          7.88888872e-01, 6.44444466e-01, 7.88888872e-01, 0.00000000e+00,
          2.45821048e-02, 9.47885931e-01, 0.00000000e+00, 0.00000000e+00,
          1.04931794e-01, 8.55194151e-01, 0.00000000e+00, 4.74622518e-01,
          3.03030442e-02, 3.03030442e-02, 3.03030442e-02, 3.03030442e-02,
          2.20085010e-01, 0.00000000e+00, 0.00000000e+00, 5.09163201e-01,
          1.06455028e-01, 0.00000000e+00, 0.00000000e+00, 4.61044788e-01,
          0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 4.03335273e-01,
          8.35057050e-02, 0.00000000e+00, 0.00000000e+00, 4.79821682e-01,
          6.34394661e-02, 0.00000000e+00, 0.00000000e+00, 4.16513085e-01],
         [2.50000000e-01, 8.53553414e-01, 9.82962906e-01, 5.45112789e-01,
          5.18796980e-01, 6.46616518e-01, 5.18796980e-01, 7.66666651e-01,
          7.66666651e

In [115]:
self.replay_buffer.add_new_trajs(trajs)

In [116]:
self.aug_trajs += trajs

In [117]:
self.replay_buffer

<replay_buffer.ReplayBuffer at 0x161d366a0>

In [118]:
len(self.replay_buffer.trajectories)

6

In [119]:
self.variant["num_updates_per_online_iter"]

300

In [120]:
self.replay_buffer.trajectories

[{'observations': array([[0.0669873 , 0.85355341, 1.        , ..., 0.        , 0.        ,
          0.48985395],
         [0.25      , 0.85355341, 0.98296291, ..., 0.        , 0.        ,
          0.47563303],
         [0.25      , 0.85355341, 0.93301272, ..., 0.        , 0.        ,
          0.47491887],
         ...,
         [0.0669873 , 0.85355341, 0.75      , ..., 0.        , 0.97657079,
          0.60107756],
         [0.0669873 , 0.85355341, 0.85355341, ..., 0.        , 0.99151504,
          0.57269001],
         [0.0669873 , 0.85355341, 0.93301272, ..., 0.        , 0.99284434,
          0.5763483 ]]),
  'actions': array([[-0.16516054, -0.30771428, -0.04694301,  0.18486977, -0.58982462],
         [-0.397286  ,  0.11535633, -0.29753679,  0.11361456, -0.3242428 ],
         [-0.3931337 ,  0.32019639, -0.49844468,  0.60656881, -0.06641924],
         ...,
         [-0.49676207,  0.08235592, -0.25964403, -0.15479851,  0.60946286],
         [-0.20605201, -0.35296762,  0.2366128 ,  0

In [121]:
dataloader = create_dataloader(
                trajectories=self.replay_buffer.trajectories,
                num_iters=self.variant["num_updates_per_online_iter"],
                batch_size=self.variant["batch_size"],
                max_len=self.variant["K"],
                state_dim=self.state_dim,
                act_dim=self.act_dim,
                state_mean=self.state_mean,
                state_std=self.state_std,
                reward_scale=self.reward_scale,
                action_range=self.action_range,
            )

In [122]:
#train_outputs = trainer.train_iteration(
#                loss_fn=loss_fn,
#                dataloader=dataloader,
#            )

In [123]:
def _save_model(self, path_prefix, is_pretrain_model=False):
        to_save = {
            "model_state_dict": self.model.state_dict(),
            "optimizer_state_dict": self.optimizer.state_dict(),
            "scheduler_state_dict": self.scheduler.state_dict(),
            "pretrain_iter": self.pretrain_iter,
            "online_iter": self.online_iter,
            "args": self.variant,
            "total_transitions_sampled": self.total_transitions_sampled,
            "np": np.random.get_state(),
            "python": random.getstate(),
            "pytorch": torch.get_rng_state(),
            "log_temperature_optimizer_state_dict": self.log_temperature_optimizer.state_dict(),
        }

        with open(f"{path_prefix}/model.pt", "wb") as f:
            torch.save(to_save, f)
        print(f"\nModel saved at {path_prefix}/model.pt")

        if is_pretrain_model:
            with open(f"{path_prefix}/pretrain_model.pt", "wb") as f:
                torch.save(to_save, f)
            print(f"Model saved at {path_prefix}/pretrain_model.pt")


In [124]:
_save_model(self,
                path_prefix=self.logger.log_path,
                is_pretrain_model=False,
            )


Model saved at ./exp/2024.06.06/085724-default/model.pt


## Create Dataloader Testing

In [94]:
self.max_len = 10

In [95]:
traj = self.replay_buffer.trajectories[2]

In [96]:
trajectories

[{'observations': array([[6.69872984e-02, 8.53553414e-01, 1.00000000e+00, 5.41353345e-01,
          4.77443576e-01, 6.46616518e-01, 5.41353345e-01, 8.22222233e-01,
          7.88888872e-01, 6.44444466e-01, 7.88888872e-01, 0.00000000e+00,
          2.45821048e-02, 9.47885931e-01, 0.00000000e+00, 0.00000000e+00,
          1.04931794e-01, 8.55194151e-01, 0.00000000e+00, 4.74622518e-01,
          3.03030442e-02, 3.03030442e-02, 3.03030442e-02, 3.03030442e-02,
          2.20085010e-01, 0.00000000e+00, 0.00000000e+00, 5.09163201e-01,
          1.06455028e-01, 0.00000000e+00, 0.00000000e+00, 4.61044788e-01,
          0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 4.03335273e-01,
          8.35057050e-02, 0.00000000e+00, 0.00000000e+00, 4.79821682e-01,
          6.34394661e-02, 0.00000000e+00, 0.00000000e+00, 4.16513085e-01],
         [2.50000000e-01, 8.53553414e-01, 9.82962906e-01, 5.45112789e-01,
          5.18796980e-01, 6.46616518e-01, 5.18796980e-01, 7.66666651e-01,
          7.66666651e

In [97]:
si = random.randint(0, traj["rewards"].shape[0] - 1)
si

786

In [98]:
ss = traj["observations"][si : si + self.max_len].reshape(-1, self.state_dim)
aa = traj["actions"][si : si + self.max_len].reshape(-1, self.act_dim)
rr = traj["rewards"][si : si + self.max_len].reshape(-1, 1)

In [99]:
#ss traj

In [100]:
dd = traj["dones"][si : si + self.max_len]

In [101]:
si

786

In [102]:
tlen = ss.shape[0]

In [103]:
timesteps = np.arange(si, si + tlen)

In [104]:
ordering = np.arange(tlen)

In [105]:
ordering[timesteps >= MAX_EPISODE_LEN] = -1
ordering[ordering == -1] = ordering.max()
timesteps[timesteps >= MAX_EPISODE_LEN] = MAX_EPISODE_LEN - 1  # padding cutoff


In [106]:
rtg = discount_cumsum(traj["rewards"][si:], gamma=1.0)[: tlen + 1].reshape(
            -1, 1
        )

NameError: name 'discount_cumsum' is not defined

## Evaluation

In [107]:
def evaluate_episode_rtg(
    state_dim,
    act_dim,
    model,
    target_return,
    max_ep_len = 8760,
    reward_scale = 1 ,
    state_mean = 0.0,
    state_std = 1.0,
    device = "cuda",
    mode="normal",
    use_mean = False,
    schema="citylearn_challenge_2022_phase_2"
):
    model.eval()
    model.to(device="cpu")

    num_envs = 1 

    env = CityLearnEnv(schema=schema)
    env.central_agent = True
    env = NormalizedObservationWrapper(env)
    env = StableBaselines3WrapperCustom(env)

    state,_ = env.reset()

    states = (
        torch.from_numpy(state)
        .reshape(num_envs, state_dim)
        .to(device=device, dtype=torch.float32)
    ).reshape(num_envs, -1, state_dim)

    actions = torch.zeros(0, device=device, dtype=torch.float32)
    rewards = torch.zeros(0, device=device, dtype=torch.float32)
    
    ep_return = target_return

    ep_return = target_return
    target_return = torch.tensor(ep_return, device=device, dtype=torch.float32).reshape(
            num_envs, -1, 1
        )
    timesteps = torch.tensor([0] * num_envs, device=device, dtype=torch.long).reshape(
            num_envs, -1
        )
    episode_return = np.zeros((num_envs, 1)).astype(float)
    episode_length = np.full(num_envs, np.inf)

    unfinished = np.ones(num_envs).astype(bool)

    for t in range(max_ep_len):
            # add padding
        actions = torch.cat(
                [
                    actions,
                    torch.zeros((num_envs, act_dim), device=device).reshape(
                        num_envs, -1, act_dim
                    ),
                ],
                dim=1,
            )
        #print(actions)
        rewards = torch.cat(
                [
                    rewards,
                    torch.zeros((num_envs, 1), device=device).reshape(num_envs, -1, 1),
                ],
                dim=1,
            )
    
        state_pred, action_dist, reward_pred = model.get_predictions(
                (states.to(dtype=torch.float32) - state_mean) / state_std,
                actions.to(dtype=torch.float32),
                rewards.to(dtype=torch.float32),
                target_return.to(dtype=torch.float32),
                timesteps.to(dtype=torch.long),
                num_envs=num_envs,
            )
        state_pred = state_pred.detach().cpu().numpy().reshape(num_envs, -1)
        reward_pred = reward_pred.detach().cpu().numpy().reshape(num_envs)
    
            # the return action is a SquashNormal distribution
        action = action_dist.sample().reshape(num_envs, -1, act_dim)[:, -1]
        if use_mean:
            action = action_dist.mean.reshape(num_envs, -1, act_dim)[:, -1]
        action = action.clamp(*model.action_range)
        #print(action.detach().cpu().numpy())
    
        state, reward, done, _,_ = aug_env.step(action.detach().cpu().numpy()[0])
    
            # eval_env.step() will execute the action for all the sub-envs, for those where
            # the episodes have terminated, the envs will be reset. Hence we use
            # "unfinished" to track whether the first episode we roll out for each sub-env is
            # finished. In contrast, "done" only relates to the current episode
        #print(reward)
        episode_return += reward
    
        actions[:, -1] = action
        #print(actions)
        state = (
                torch.from_numpy(state).to(device=device).reshape(num_envs, -1, state_dim)
            )
        states = torch.cat([states, state], dim=1)
        #reward = torch.from_numpy(reward).to(device=device).reshape(num_envs, 1)
        rewards[:, -1] = reward
    
        if mode != "delayed":
            pred_return = target_return[:, -1] - (reward * reward_scale)
        else:
            pred_return = target_return[:, -1]
        target_return = torch.cat(
                [target_return, pred_return.reshape(num_envs, -1, 1)], dim=1
            )
    
        timesteps = torch.cat(
                [
                    timesteps,
                    torch.ones((num_envs, 1), device=device, dtype=torch.long).reshape(
                        num_envs, 1
                    )
                    * (t + 1),
                ],
                dim=1,
            )
    
        if t == max_ep_len - 1:
            done = done
            ind = 0
            episode_length[ind] = np.minimum(episode_length[ind], t + 1)
            
        if np.any(done):
            ind = np.where(done)[0]
            #print("ind " + str(ind))
            unfinished[ind] = False
            episode_length[ind] = np.minimum(episode_length[ind], t + 1)
    
        if not np.any(unfinished):
            break


    trajectories = []
    for ii in range(num_envs):
        ep_len = episode_length[ii].astype(int)
        terminals = np.zeros(ep_len)
        terminals[-1] = 1
        traj = {
                "observations": states[ii].detach().cpu().numpy()[:ep_len],
                "actions": actions[ii].detach().cpu().numpy()[:ep_len],
                "rewards": rewards[ii].detach().cpu().numpy()[:ep_len],
                "terminals": terminals,
            }
        trajectories.append(traj)
    return (
        episode_return.reshape(num_envs),
        episode_length.reshape(num_envs),
        trajectories,
    )
    

In [108]:
returns,lengths,_ = evaluate_episode_rtg(self.state_dim,self.act_dim,self.model, -6000, max_ep_len = 10, device = "cpu", use_mean = True)

In [109]:
lengths

array([10.])

In [110]:
MAX_EPISODE_LEN = 10 

In [111]:
def create_eval_episodes_fn(eval_rtg,
                            state_dim,
                            act_dim,
                            state_mean, 
                            state_std, 
                            device, 
                            use_mean = False, 
                            reward_scale = 1 ,
                            schema = "citylearn_challenge_2022_phase_2"):

    def eval_episodes_fn(model):
        target_return = eval_rtg * reward_scale
        returns,lengths,_ = evaluate_episode_rtg(state_dim,act_dim,model, -6000, MAX_EPISODE_LEN, device = "cpu", use_mean = True)

        return {
                f"evaluation/return": returns,
                f"evaluation/length": lengths,
            }
    return eval_episodes_fn
    

In [112]:
eval_fn = create_eval_episodes_fn(-6000,self.state_dim, self.act_dim, 0,1,self.device, use_mean = True)

In [113]:
def evaluate(self,eval_fn):
    eval_start = time.time()
    self.model.eval()
    outputs = {}
    
    o = eval_fn(self.model)
    outputs.update(o)
    outputs["time/evaluation"] = time.time() - eval_start

    eval_reward = outputs["evaluation/return"]
    return outputs, eval_reward


In [114]:
evaluate(self,eval_fn)

({'evaluation/return': array([-12.78699154]),
  'evaluation/length': array([10.]),
  'time/evaluation': 0.6521708965301514},
 array([-12.78699154]))