In [72]:
import os
import torch
import numpy as np
import wandb

import pickle
from tqdm.auto import trange, tqdm
from torch.utils.data import Dataset
from dataclasses import dataclass
from datasets import load_from_disk
from omegaconf import OmegaConf
import torch.nn.functional as F

from citylearn.agents.rbc import HourRBC
from citylearn.agents.q_learning import TabularQLearning
from citylearn.citylearn import CityLearnEnv
from citylearn.data import DataSet
from citylearn.reward_function import RewardFunction
from citylearn.wrappers import NormalizedObservationWrapper
from citylearn.wrappers import StableBaselines3Wrapper
from citylearn.wrappers import TabularQLearningWrapper

from stable_baselines3.a2c import A2C

from torch.utils.data import DataLoader
from trajectory.models.gpt import GPT, GPTTrainer

from trajectory.utils.common import pad_along_axis
from trajectory.utils.discretization import KBinsDiscretizer
from trajectory.utils.env import create_env

%matplotlib inline
import matplotlib.pyplot as plt


In [2]:
# Location of the offline interaction data; it is opened with `load_from_disk` in the next cell.
offline_data_path = "data_interactions/PPO/model_PPO_timesteps_100000_seed_572.pkl"

In [3]:
# Load the offline dataset; later cells read its "observations", "actions", "rewards" and "dones" entries.
dataset = load_from_disk(offline_data_path)

In [4]:
def join_trajectory(states, actions, rewards, discount=0.99, verbose=True):
    """Join one trajectory into rows of ``[state, action, reward, value]``.

    ``value`` at step ``t`` is the discounted return-to-go that starts at the
    *next* reward: ``r[t+1] + discount * r[t+2] + discount**2 * r[t+3] + ...``.
    The last step of the trajectory therefore always gets a value of 0.

    Args:
        states: array whose first axis is the time axis.
        actions: array with the same first axis; a 3-D array is reshaped to
            its first two dimensions.
        rewards: array of shape ``[traj_len]`` or ``[traj_len, 1]``.
        discount: discount factor used for the return-to-go.
        verbose: when True (the default, matching the previous behaviour)
            print the discount and the shapes of the joined parts.

    Returns:
        The concatenation of states, actions, rewards and values along the
        last axis.

    Raises:
        ValueError: if ``rewards`` and ``states`` disagree on the trajectory
            length.
    """
    traj_length = states.shape[0]

    if actions.ndim == 3:
        actions = actions.reshape(actions.shape[0], actions.shape[1])

    if rewards.ndim == 1:
        rewards = rewards.reshape(rewards.shape[0], 1)

    if rewards.shape[0] != traj_length:
        raise ValueError(
            f"rewards has {rewards.shape[0]} steps but states has {traj_length}"
        )

    if verbose:
        print("Discount "+str(discount))

    # Accumulate in float64 regardless of the reward dtype; with integer
    # rewards an accumulator of the same dtype would truncate the returns.
    step_rewards = rewards.reshape(traj_length, -1).sum(axis=1, dtype=np.float64)

    # Backward recursion G[t] = r[t+1] + discount * G[t+1], with G[last] = 0.
    # This is the same quantity as the explicit discounted sum, computed in
    # O(traj_length) instead of O(traj_length ** 2).
    returns_to_go = np.zeros(traj_length, dtype=np.float64)
    for t in range(traj_length - 2, -1, -1):
        returns_to_go[t] = step_rewards[t + 1] + discount * returns_to_go[t + 1]

    # Keep the reward dtype when it is already floating point so the joined
    # array's dtype is unchanged for float inputs; otherwise use float64.
    if np.issubdtype(rewards.dtype, np.floating):
        value_dtype = rewards.dtype
    else:
        value_dtype = np.float64
    values = np.empty(rewards.shape, dtype=value_dtype)
    values[...] = returns_to_go.reshape(-1, 1)

    if verbose:
        print(states.shape)
        print(actions.shape)
        print(rewards.shape)
        print(values.shape)

    joined_transition = np.concatenate([states, actions, rewards, values], axis=-1)

    return joined_transition

def segment(states, actions, rewards, terminals):
    """Split a flat stream of transitions into per-episode arrays.

    A step whose ``terminals`` flag is truthy closes the current episode and
    the following step opens the next one. Steps after the last truthy flag
    form a final episode of their own.

    Returns:
        A pair ``(trajectories, trajectories_lens)``. ``trajectories`` maps an
        episode index to a dict with "states", "actions" and "rewards", each
        stacked along axis 0. ``trajectories_lens`` lists the number of steps
        per episode, in episode order.
    """
    assert len(states) == len(terminals)

    field_names = ("states", "actions", "rewards")
    trajectories = {}
    current_episode = 0

    for step in trange(len(terminals), desc="Segmenting"):
        episode = trajectories.get(current_episode)
        if episode is None:
            # first step of a new episode: create its buffers
            episode = {name: [] for name in field_names}
            trajectories[current_episode] = episode

        episode["states"].append(states[step])
        episode["actions"].append(actions[step])
        episode["rewards"].append(rewards[step])

        if terminals[step]:
            # the next step belongs to a new episode
            current_episode += 1

    trajectories_lens = [len(episode["states"]) for episode in trajectories.values()]

    # turn the per-step lists into arrays, one per field
    for episode in trajectories.values():
        for name in field_names:
            episode[name] = np.stack(episode[name], axis=0)

    return trajectories, trajectories_lens


In [25]:
# Split the flat dataset into per-episode arrays; `traj_lengths` holds the number of steps in each episode.
trajectories,traj_lengths = segment(dataset["observations"],dataset["actions"],dataset["rewards"],dataset["dones"])
# Container for the per-episode joined arrays built in the following cell.
joined_transitions=[]

Segmenting:   0%|          | 0/100352 [00:00<?, ?it/s]

In [27]:
# Build the list inside this cell (instead of appending to a list created in
# another cell) so that re-running the cell cannot duplicate trajectories.
joined_transitions = [
    join_trajectory(
        trajectories[t]["states"],
        trajectories[t]["actions"],
        trajectories[t]["rewards"],
        discount=0.99,
    )
    for t in tqdm(trajectories, desc="Joining transitions")
]

Joining transitions:   0%|          | 0/12 [00:00<?, ?it/s]

Discount 0.99
(8759, 26)
(8759, 5)
(8759, 1)
(8759, 1)
Discount 0.99
(8759, 26)
(8759, 5)
(8759, 1)
(8759, 1)
Discount 0.99
(8759, 26)
(8759, 5)
(8759, 1)
(8759, 1)
Discount 0.99
(8759, 26)
(8759, 5)
(8759, 1)
(8759, 1)
Discount 0.99
(8759, 26)
(8759, 5)
(8759, 1)
(8759, 1)
Discount 0.99
(8759, 26)
(8759, 5)
(8759, 1)
(8759, 1)
Discount 0.99
(8759, 26)
(8759, 5)
(8759, 1)
(8759, 1)
Discount 0.99
(8759, 26)
(8759, 5)
(8759, 1)
(8759, 1)
Discount 0.99
(8759, 26)
(8759, 5)
(8759, 1)
(8759, 1)
Discount 0.99
(8759, 26)
(8759, 5)
(8759, 1)
(8759, 1)
Discount 0.99
(8759, 26)
(8759, 5)
(8759, 1)
(8759, 1)
Discount 0.99
(4003, 26)
(4003, 5)
(4003, 1)
(4003, 1)


In [32]:
# Build one discretizer over every joined transition (all episodes stacked
# along axis 0), with `num_bins` bins and the "uniform" strategy.
num_bins = 100
strategy = "uniform"
discretizer = KBinsDiscretizer(
            np.concatenate(joined_transitions, axis=0),
            num_bins=num_bins,
            strategy=strategy
        )

In [33]:
discretizer

<trajectory.utils.discretization.KBinsDiscretizer at 0x7f3922529130>

In [34]:
class DiscretizedDataset(Dataset):
    """Windows of discretized ``[state, action, reward, value]`` transitions.

    The raw dataset is split into episodes with ``segment``, every episode is
    turned into joined transition rows with ``join_trajectory``, and a
    ``KBinsDiscretizer`` is built over all rows. Each item is a flattened
    window of up to ``seq_len`` transitions, shifted by one token to form
    ``(inputs, targets)``, plus a mask that is 0 on padded positions.
    """

    def __init__(self, dataset,env_name="city_learn", num_bins=100, seq_len=10, discount=0.99, strategy="uniform", cache_path="data"):
        """Segment, join and discretize ``dataset`` and index its windows.

        Args:
            dataset: mapping providing "observations", "actions", "rewards"
                and "dones".
            env_name: label stored on the instance and used in ``cache_name``.
            num_bins: number of bins passed to the discretizer.
            seq_len: number of transitions in one window.
            discount: discount factor passed to ``join_trajectory``.
            strategy: binning strategy passed to the discretizer.
            cache_path: stored on the instance; caching itself is currently
                disabled (see the note in the body).
        """
        self.seq_len = seq_len
        self.discount = discount
        self.num_bins = num_bins
        self.dataset = dataset
        self.env_name = env_name

        trajectories, traj_lengths = segment(self.dataset["observations"],self.dataset["actions"],self.dataset["rewards"],self.dataset["dones"])
        self.trajectories = trajectories
        self.traj_lengths = traj_lengths
        self.cache_path = cache_path
        self.cache_name = f"{env_name}_{num_bins}_{seq_len}_{strategy}_{discount}"

        # NOTE: on-disk caching of the joined transitions is disabled; they
        # are recomputed on every construction. `cache_path` and `cache_name`
        # are kept as attributes only.
        self.joined_transitions = []
        for t in tqdm(trajectories, desc="Joining transitions"):
            self.joined_transitions.append(
                    join_trajectory(trajectories[t]["states"], trajectories[t]["actions"], trajectories[t]["rewards"],discount = self.discount)
                )

        self.discretizer = KBinsDiscretizer(
            np.concatenate(self.joined_transitions, axis=0),
            num_bins=num_bins,
            strategy=strategy
        )

        # get valid indices for seq_len sampling: one window per start step,
        # except that the last step of an episode never starts a window
        indices = []
        for path_ind, length in enumerate(traj_lengths):
            for start in range(length - 1):
                indices.append((path_ind, start, start + self.seq_len))
        self.indices = np.array(indices)

    def get_env_name(self):
        """Return the environment label given at construction."""
        # The name is stored as `self.env_name`; there is no `self.env`.
        return self.env_name

    def get_discretizer(self):
        """Return the discretizer built over all joined transitions."""
        return self.discretizer

    def __len__(self):
        """Number of windows that can be sampled."""
        return len(self.indices)

    def __getitem__(self, idx):
        """Return ``(inputs, targets, loss_pad_mask)`` for window ``idx``.

        ``inputs`` and ``targets`` are the flattened discrete tokens of the
        window shifted by one position; ``loss_pad_mask`` is aligned with
        ``inputs`` and is 0 where the window was padded.
        """
        traj_idx, start_idx, end_idx = self.indices[idx]

        # the slice may run past the end of the episode and come back short
        joined = self.joined_transitions[traj_idx][start_idx:end_idx]

        loss_pad_mask = np.ones((self.seq_len, joined.shape[-1]))
        if joined.shape[0] < self.seq_len:
            # pad to seq_len if at the end of trajectory, mask for padding
            loss_pad_mask[joined.shape[0]:] = 0
            joined = pad_along_axis(joined, pad_to=self.seq_len, axis=0)

        joined_discrete = self.discretizer.encode(joined).reshape(-1).astype(np.longlong)
        loss_pad_mask = loss_pad_mask.reshape(-1)

        return joined_discrete[:-1], joined_discrete[1:], loss_pad_mask[:-1]


In [35]:
# Load the experiment config and start a Weights & Biases run that records
# the fully resolved config.
config = OmegaConf.load("configs/medium/city_learn.yaml")
wandb.init(
        **config.wandb,
        config=dict(OmegaConf.to_container(config, resolve=True))
    )
# Use the first GPU when torch can see one, otherwise fall back to the CPU so
# the notebook still runs on machines without CUDA.
device = "cuda:0" if torch.cuda.is_available() else "cpu"

  return LooseVersion(v) >= LooseVersion(check)


In [36]:
# Build the training dataset; every argument other than `discount` keeps its
# default (num_bins=100, seq_len=10, strategy="uniform").
datasets = DiscretizedDataset(dataset,discount = 0.99)

Segmenting:   0%|          | 0/100352 [00:00<?, ?it/s]

Joining transitions:   0%|          | 0/12 [00:00<?, ?it/s]

Discount 0.99
(8759, 26)
(8759, 5)
(8759, 1)
(8759, 1)
Discount 0.99
(8759, 26)
(8759, 5)
(8759, 1)
(8759, 1)
Discount 0.99
(8759, 26)
(8759, 5)
(8759, 1)
(8759, 1)
Discount 0.99
(8759, 26)
(8759, 5)
(8759, 1)
(8759, 1)
Discount 0.99
(8759, 26)
(8759, 5)
(8759, 1)
(8759, 1)
Discount 0.99
(8759, 26)
(8759, 5)
(8759, 1)
(8759, 1)
Discount 0.99
(8759, 26)
(8759, 5)
(8759, 1)
(8759, 1)
Discount 0.99
(8759, 26)
(8759, 5)
(8759, 1)
(8759, 1)
Discount 0.99
(8759, 26)
(8759, 5)
(8759, 1)
(8759, 1)
Discount 0.99
(8759, 26)
(8759, 5)
(8759, 1)
(8759, 1)
Discount 0.99
(8759, 26)
(8759, 5)
(8759, 1)
(8759, 1)
Discount 0.99
(4003, 26)
(4003, 5)
(4003, 1)
(4003, 1)


In [38]:
datasets.joined_transitions[0][0].shape

(33,)

In [39]:
# batch_size = 1 yields one window per batch (the progress bar below reports 100340 batches).
# NOTE(review): the recorded training log shows 1568 batches per epoch, so that run presumably
# used a different batch size than this cell — confirm before relying on the saved outputs.
batch_size = 1
dataloader = DataLoader(datasets, batch_size=batch_size, shuffle=True, num_workers=8, pin_memory=True)


In [41]:
device = "cpu"

In [42]:
# Pull a single batch for inspection: move its tensors to `device`, then stop after the first iteration.
for i, batch in enumerate(tqdm(dataloader, desc="Epoch", leave=False)):
    batch = [b.to(device) for b in batch]
    break

Epoch:   0%|          | 0/100340 [00:00<?, ?it/s]



In [43]:
batch

[tensor([[81, 43, 64, 45, 85,  0, 27, 78, 75, 31, 52, 78, 99, 45, 12, 77,  0, 29,
           3, 56, 99, 35,  2, 55, 98, 34, 99, 88, 61, 91, 92, 89, 40, 81, 47, 64,
          45, 85,  0, 28, 78, 97, 47, 58, 82, 99, 47, 40, 82, 20, 46,  3, 67, 99,
          32,  2, 67, 99, 27, 77, 12, 92,  0, 75, 84, 40, 81, 52, 65, 47, 82,  0,
          36, 72, 99, 43,  7, 80, 82, 23, 33, 80, 92, 62,  3, 73, 82, 25,  2, 73,
          99, 26, 66, 48, 99, 99, 67, 99, 40, 81, 56, 66, 47, 85,  0, 34, 59, 99,
          44,  4, 71, 78, 28, 11, 71, 99, 32,  3, 72, 98, 38,  2, 72, 99, 26, 30,
          99,  0, 99, 29, 99, 39, 81, 60, 67, 47, 85,  0, 35, 41, 81, 42,  7, 57,
          97, 40, 28, 56, 79, 33,  3, 65, 99, 33, 33, 65, 81, 35, 99,  9,  0, 63,
          77, 80, 39, 81, 65, 68, 45, 85, 87, 36, 17, 98, 59,  0, 26, 73, 29, 25,
          26,  1, 20,  3, 34, 99, 39, 36, 36, 98, 52, 51, 99, 22, 47, 42, 87, 39,
          81, 69, 68, 45, 85, 87, 31,  4, 99, 54,  0,  9, 97, 48,  6,  9,  0, 39,
           3, 11

### Training Part

In [44]:
# Keep the config location in one place: `path` is the file that is loaded.
path = "configs/medium/city_learn_traj.yaml"
config = OmegaConf.load(path)
# Sections of the config used by the training cells below.
trainer_conf = config.trainer
data_conf = config.dataset

In [45]:
# Instantiate the GPT from the `model` section of the config and move it to `device`.
model = GPT(**config.model)
model.to(device)

GPT(
  (tok_emb): Embedding(3300, 128)
  (drop_emb): Dropout(p=0.1, inplace=False)
  (blocks): ModuleList(
    (0-3): 4 x TransformerBlock(
      (norm1): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
      (norm2): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
      (drop): Dropout(p=0.1, inplace=False)
      (attention): MultiheadAttention(
        (out_proj): NonDynamicallyQuantizableLinear(in_features=128, out_features=128, bias=True)
      )
      (mlp): Sequential(
        (0): Linear(in_features=128, out_features=512, bias=True)
        (1): GELU(approximate='none')
        (2): Linear(in_features=512, out_features=128, bias=True)
        (3): Dropout(p=0.1, inplace=False)
      )
    )
  )
  (norm): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
  (head): EinLinear(n_models=33, in_features=128, out_features=100, bias=False)
)

In [46]:
tokens, targets, loss_pad_mask = batch

In [47]:
logits,state = model(tokens)

In [80]:
tokens.shape

torch.Size([1, 329])

In [54]:
logits.reshape(-1,logits.size(-1)).shape

torch.Size([329, 100])

In [75]:
 F.cross_entropy(logits.reshape(-1, logits.size(-1)), targets.reshape(-1), reduction="none")

tensor([5.0381, 4.6269, 4.7293, 4.8253, 5.7284, 5.0677, 5.0340, 6.3195, 4.5720,
        4.3980, 6.2600, 4.6921, 4.9512, 4.5500, 4.4854, 4.9931, 4.8249, 5.0680,
        5.2321, 5.3744, 3.8377, 4.7963, 5.0063, 5.1536, 5.0111, 4.5762, 5.0376,
        5.3119, 5.0973, 4.7026, 4.5403, 5.0938, 5.5464, 4.6989, 5.0340, 4.5975,
        5.1440, 5.9169, 4.9770, 5.0263, 4.2698, 5.0621, 5.1759, 4.5404, 4.7906,
        4.2730, 4.1245, 4.1844, 5.6344, 5.2610, 5.1651, 4.5114, 5.2583, 4.7403,
        4.2205, 5.1672, 4.1532, 4.5078, 3.8340, 3.9620, 5.1510, 4.2050, 4.4064,
        3.6954, 4.9853, 5.4893, 4.2473, 3.9271, 5.3503, 4.2636, 5.4463, 5.2832,
        3.8971, 6.5259, 4.1326, 4.3753, 5.1215, 5.2528, 5.3798, 5.0849, 4.1875,
        3.7527, 5.3062, 5.3100, 4.8814, 4.1292, 4.5666, 4.1819, 5.9207, 4.2503,
        4.9299, 4.8595, 4.9214, 5.1158, 4.6382, 4.3642, 5.9857, 4.9086, 6.1734,
        5.4338, 5.5926, 5.3877, 5.3025, 5.6039, 3.3215, 3.2692, 5.7723, 4.6353,
        5.3754, 5.1430, 4.8481, 5.6612, 

In [15]:
# Scale the reference epoch count by 3e4 / len(datasets), truncated to an int.
# NOTE(review): the 3e4 constant is not explained anywhere in this notebook — confirm its origin.
num_epochs = int(3e4 / len(datasets) * trainer_conf.num_epochs_ref)

# samples * seq_len * transition_dim is used as the warmup length; multiplied by the
# epoch count it gives `final_tokens`. Both are passed to GPTTrainer below.
# NOTE(review): `datasets` was built with the default seq_len=10; presumably
# data_conf.seq_len matches — confirm.
warmup_tokens = len(datasets) * data_conf.seq_len * config.model.transition_dim
final_tokens = warmup_tokens * num_epochs

In [16]:
num_epochs

14

In [17]:
# Configure the trainer: token budgets come from the cell above, every other
# hyper-parameter is read from the `trainer` section of the config, except
# `save_every`, which is fixed to 1 here.
trainer = GPTTrainer(
        final_tokens=final_tokens,
        warmup_tokens=warmup_tokens,
        action_weight=trainer_conf.action_weight,
        value_weight=trainer_conf.value_weight,
        reward_weight=trainer_conf.reward_weight,
        learning_rate=trainer_conf.lr,
        betas=trainer_conf.betas,
        weight_decay=trainer_conf.weight_decay,
        clip_grad=trainer_conf.clip_grad,
        eval_seed=trainer_conf.eval_seed,
        eval_every=trainer_conf.eval_every,
        eval_episodes=trainer_conf.eval_episodes,
        eval_temperature=trainer_conf.eval_temperature,
        eval_discount=trainer_conf.eval_discount,
        eval_plan_every=trainer_conf.eval_plan_every,
        eval_beam_width=trainer_conf.eval_beam_width,
        eval_beam_steps=trainer_conf.eval_beam_steps,
        eval_beam_context=trainer_conf.eval_beam_context,
        eval_sample_expand=trainer_conf.eval_sample_expand,
        eval_k_obs=trainer_conf.eval_k_obs,  # as in original implementation
        eval_k_reward=trainer_conf.eval_k_reward,
        eval_k_act=trainer_conf.eval_k_act,
        checkpoints_path=trainer_conf.checkpoints_path,
        save_every=1,
        device=device
    )

In [18]:
# Run training for `num_epochs` passes over `dataloader`; the output below logs one "EPOCH n" value per epoch.
trainer.train(
        model=model,
        dataloader=dataloader,
        num_epochs=num_epochs
    )

Training:   0%|          | 0/14 [00:00<?, ?it/s]

Epoch:   0%|          | 0/1568 [00:00<?, ?it/s]

   EPOCH 1: 5.208583519399168


Epoch:   0%|          | 0/1568 [00:00<?, ?it/s]

   EPOCH 2: 4.204621164260852


Epoch:   0%|          | 0/1568 [01:20<?, ?it/s]

Exception ignored in: Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x7fb7f1ac45e0>
<function _MultiProcessingDataLoaderIter.__del__ at 0x7fb7f1ac45e0>Exception ignored in: Traceback (most recent call last):

  File "/home/ml-stud15/anaconda3/envs/stable_env/lib/python3.9/site-packages/torch/utils/data/dataloader.py", line 1479, in __del__
<function _MultiProcessingDataLoaderIter.__del__ at 0x7fb7f1ac45e0>Traceback (most recent call last):
    
  File "/home/ml-stud15/anaconda3/envs/stable_env/lib/python3.9/site-packages/torch/utils/data/dataloader.py", line 1479, in __del__
self._shutdown_workers()Traceback (most recent call last):

      File "/home/ml-stud15/anaconda3/envs/stable_env/lib/python3.9/site-packages/torch/utils/data/dataloader.py", line 1479, in __del__
  File "/home/ml-stud15/anaconda3/envs/stable_env/lib/python3.9/site-packages/torch/utils/data/dataloader.py", line 1462, in _shutdown_workers
self._shutdown_workers()        
if w.is_alive():s

   EPOCH 3: 3.937248721385705


Epoch:   0%|          | 0/1568 [00:00<?, ?it/s]

   EPOCH 4: 3.7893836208994514


Epoch:   0%|          | 0/1568 [00:00<?, ?it/s]

   EPOCH 5: 3.70218578502506


Epoch:   0%|          | 0/1568 [00:00<?, ?it/s]

   EPOCH 6: 3.642668913940583


Epoch:   0%|          | 0/1568 [00:00<?, ?it/s]

   EPOCH 7: 3.59653997517662


Epoch:   0%|          | 0/1568 [00:00<?, ?it/s]

Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x7fb7f1ac45e0>
Traceback (most recent call last):
  File "/home/ml-stud15/anaconda3/envs/stable_env/lib/python3.9/site-packages/torch/utils/data/dataloader.py", line 1479, in __del__
    self._shutdown_workers()
  File "/home/ml-stud15/anaconda3/envs/stable_env/lib/python3.9/site-packages/torch/utils/data/dataloader.py", line 1462, in _shutdown_workers
    if w.is_alive():
  File "/home/ml-stud15/anaconda3/envs/stable_env/lib/python3.9/multiprocessing/process.py", line 160, in is_alive
    assert self._parent_pid == os.getpid(), 'can only test a child process'
AssertionError: can only test a child process
Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x7fb7f1ac45e0>
Traceback (most recent call last):
  File "/home/ml-stud15/anaconda3/envs/stable_env/lib/python3.9/site-packages/torch/utils/data/dataloader.py", line 1479, in __del__
    self._shutdown_workers()
  File "/home/ml-stud15/a

   EPOCH 8: 3.5569109584974266


Epoch:   0%|          | 0/1568 [00:00<?, ?it/s]

   EPOCH 9: 3.5215758410977123


Epoch:   0%|          | 0/1568 [00:00<?, ?it/s]

   EPOCH 10: 3.4915563310790403


Epoch:   0%|          | 0/1568 [00:00<?, ?it/s]

   EPOCH 11: 3.467680165991402


Epoch:   0%|          | 0/1568 [00:00<?, ?it/s]

   EPOCH 12: 3.4512651699067414


Epoch:   0%|          | 0/1568 [00:00<?, ?it/s]

Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x7fb7f1ac45e0>
Traceback (most recent call last):
  File "/home/ml-stud15/anaconda3/envs/stable_env/lib/python3.9/site-packages/torch/utils/data/dataloader.py", line 1479, in __del__
    self._shutdown_workers()
  File "/home/ml-stud15/anaconda3/envs/stable_env/lib/python3.9/site-packages/torch/utils/data/dataloader.py", line 1462, in _shutdown_workers
    if w.is_alive():
  File "/home/ml-stud15/anaconda3/envs/stable_env/lib/python3.9/multiprocessing/process.py", line 160, in is_alive
    assert self._parent_pid == os.getpid(), 'can only test a child process'
AssertionError: can only test a child process
Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x7fb7f1ac45e0>
Traceback (most recent call last):
  File "/home/ml-stud15/anaconda3/envs/stable_env/lib/python3.9/site-packages/torch/utils/data/dataloader.py", line 1479, in __del__
    self._shutdown_workers()
  File "/home/ml-stud15/a

   EPOCH 13: 3.444154729218725


Epoch:   0%|          | 0/1568 [00:00<?, ?it/s]

Exception ignored in: Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x7fb7f1ac45e0><function _MultiProcessingDataLoaderIter.__del__ at 0x7fb7f1ac45e0>

Traceback (most recent call last):
  File "/home/ml-stud15/anaconda3/envs/stable_env/lib/python3.9/site-packages/torch/utils/data/dataloader.py", line 1479, in __del__
Traceback (most recent call last):
      File "/home/ml-stud15/anaconda3/envs/stable_env/lib/python3.9/site-packages/torch/utils/data/dataloader.py", line 1479, in __del__
Exception ignored in: self._shutdown_workers()    <function _MultiProcessingDataLoaderIter.__del__ at 0x7fb7f1ac45e0>
self._shutdown_workers()
  File "/home/ml-stud15/anaconda3/envs/stable_env/lib/python3.9/site-packages/torch/utils/data/dataloader.py", line 1462, in _shutdown_workers

Traceback (most recent call last):
      File "/home/ml-stud15/anaconda3/envs/stable_env/lib/python3.9/site-packages/torch/utils/data/dataloader.py", line 1462, in _shutdown_workers
if w.is_ali

   EPOCH 14: 3.4392617026875443


GPT(
  (tok_emb): Embedding(3300, 128)
  (drop_emb): Dropout(p=0.1, inplace=False)
  (blocks): ModuleList(
    (0-3): 4 x TransformerBlock(
      (norm1): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
      (norm2): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
      (drop): Dropout(p=0.1, inplace=False)
      (attention): MultiheadAttention(
        (out_proj): NonDynamicallyQuantizableLinear(in_features=128, out_features=128, bias=True)
      )
      (mlp): Sequential(
        (0): Linear(in_features=128, out_features=512, bias=True)
        (1): GELU(approximate='none')
        (2): Linear(in_features=512, out_features=128, bias=True)
        (3): Dropout(p=0.1, inplace=False)
      )
    )
  )
  (norm): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
  (head): EinLinear(n_models=33, in_features=128, out_features=100, bias=False)
)

In [None]:
device 

In [None]:
torch.cuda.is_available()