In [142]:
import os
import torch
import numpy as np
import wandb

import pickle
from tqdm.auto import trange, tqdm
from torch.utils.data import Dataset
from dataclasses import dataclass
from datasets import load_from_disk
from omegaconf import OmegaConf
from replay_buffer import ReplayBuffer

from citylearn.agents.rbc import HourRBC
from citylearn.agents.q_learning import TabularQLearning
from citylearn.citylearn import CityLearnEnv
from citylearn.data import DataSet
from citylearn.reward_function import RewardFunction
from citylearn.wrappers import NormalizedObservationWrapper
from citylearn.wrappers import StableBaselines3Wrapper
from citylearn.wrappers import TabularQLearningWrapper

from stable_baselines3.a2c import A2C

from torch.utils.data import DataLoader
from trajectory.models.gpt import GPT, GPTTrainer

from trajectory.utils.common import pad_along_axis
from trajectory.utils.discretization import KBinsDiscretizer
from trajectory.utils.env import create_env
import datasets


In [143]:
def sort_trajectory(trajectories):
    states, traj_lens, returns = [], [], []
    for i in trajectories:
        states.append(trajectories[i]["states"])
        traj_lens.append(len(trajectories[i]["states"]))
        returns.append(np.array(trajectories[i]["rewards"]).sum())
    num_timesteps = sum(traj_lens)
    sorted_inds = np.argsort(returns)  # lowest to highest
    num_trajectories = 1
    timesteps = traj_lens[sorted_inds[-1]]
    ind = len(trajectories) - 2
    while ind >= 0 and timesteps + traj_lens[sorted_inds[ind]] < num_timesteps:
        timesteps += traj_lens[sorted_inds[ind]]
        num_trajectories += 1
        ind -= 1
    sorted_inds = sorted_inds[-num_trajectories:]
    print(sorted_inds)
            #print(trajectories[1])
    for ii in sorted_inds:
        print(ii)
            #print(trajectories[0].keys())
    trajectories = [trajectories[int(ii)] for ii in sorted_inds]
    for trajectory in trajectories:
        for key in trajectory.keys():
            trajectory[key] = np.array(trajectory[key])
    return trajectories

In [144]:
def join_trajectory(states, actions, rewards, discount=0.99):
    traj_length = states.shape[0]
    # I can vectorize this for all dataset as once,
    # but better to be safe and do it once and slow and right (and cache it)
    
    if actions.ndim == 3 :
        actions = actions.reshape(actions.shape[0],actions.shape[1])
    
    if rewards.ndim == 1 :
        rewards = rewards.reshape(rewards.shape[0],1)
        
    print("Discount "+str(discount))
    discounts = (discount ** np.arange(traj_length))

    values = np.zeros_like(rewards)
    for t in range(traj_length):
        # discounted return-to-go from state s_t:
        # r_{t+1} + y * r_{t+2} + y^2 * r_{t+3} + ...
        # .T as rewards of shape [len, 1], see https://github.com/Howuhh/faster-trajectory-transformer/issues/9
        values[t] = (rewards[t + 1:].T * discounts[:-t - 1]).sum()
    print(states.shape)
    print(actions.shape)
    print(rewards.shape)
    print(values.shape)

    joined_transition = np.concatenate([states, actions, rewards, values], axis=-1)

    return joined_transition

def segment(states, actions, rewards, terminals):
    assert len(states) == len(terminals)
    
    trajectories = {}

    episode_num = 0
    for t in trange(len(terminals), desc="Segmenting"):
        if episode_num not in trajectories:
            trajectories[episode_num] = {
                "states": [],
                "actions": [],
                "rewards": []
            }
        
        trajectories[episode_num]["states"].append(states[t])
        trajectories[episode_num]["actions"].append(actions[t])
        trajectories[episode_num]["rewards"].append(rewards[t])

        if terminals[t]:
            # next episode
            episode_num = episode_num + 1

    trajectories_lens = [len(v["states"]) for k, v in trajectories.items()]

    for t in trajectories:
        trajectories[t]["states"] = np.stack(trajectories[t]["states"], axis=0)
        trajectories[t]["actions"] = np.stack(trajectories[t]["actions"], axis=0)
        trajectories[t]["rewards"] = np.stack(trajectories[t]["rewards"], axis=0)

    return trajectories, trajectories_lens

class DiscretizedDataset(Dataset):
    def __init__(self, trajectories,traj_lengths, env_name="city_learn", num_bins=100, seq_len=10, discount=0.99, strategy="uniform", cache_path="data"):
        self.seq_len = seq_len
        self.discount = discount
        self.num_bins = num_bins
        self.dataset = dataset
        self.env_name = env_name
        
        #trajectories, traj_lengths = segment(self.dataset["observations"],self.dataset["actions"],self.dataset["rewards"],self.dataset["dones"])
        #trajectories = self.sort_trajectory(trajectories)
        self.trajectories = trajectories
        self.traj_lengths = traj_lengths
        self.cache_path = cache_path
        self.cache_name = f"{env_name}_{num_bins}_{seq_len}_{strategy}_{discount}"
        
        self.joined_transitions = []
        for i,t in enumerate(tqdm(trajectories, desc="Joining transitions")):
            self.joined_transitions.append(
                    join_trajectory(trajectories[i]["states"], trajectories[i]["actions"], trajectories[i]["rewards"],discount = self.discount)
                )
        """
        if cache_path is None or not os.path.exists(os.path.join(cache_path, self.cache_name)):
            self.joined_transitions = []
            for t in tqdm(trajectories, desc="Joining transitions"):
                self.joined_transitions.append(
                    join_trajectory(trajectories[t]["states"], trajectories[t]["actions"], trajectories[t]["rewards"],discount = self.discount)
                )

            os.makedirs(os.path.join(cache_path), exist_ok=True)
            # save cached version
            with open(os.path.join(cache_path, self.cache_name), "wb") as f:
                pickle.dump(self.joined_transitions, f)
        else:
            with open(os.path.join(cache_path, self.cache_name), "rb") as f:
                self.joined_transitions = pickle.load(f)
        """

        self.discretizer = KBinsDiscretizer(
            np.concatenate(self.joined_transitions, axis=0),
            num_bins=num_bins,
            strategy=strategy
        )

        # get valid indices for seq_len sampling
        indices = []
        for path_ind, length in enumerate(traj_lengths):
            end = length - 1
            for i in range(end):
                indices.append((path_ind, i, i + self.seq_len))
        self.indices = np.array(indices)

    def get_env_name(self):
        return self.env.name

   

    def get_discretizer(self):
        return self.discretizer

    def __len__(self):
        return len(self.indices)

    def __getitem__(self, idx):
        #print(idx)
        traj_idx, start_idx, end_idx = self.indices[idx]
        
        joined = self.joined_transitions[traj_idx][start_idx:end_idx]
        

        loss_pad_mask = np.ones((self.seq_len, joined.shape[-1]))
        if joined.shape[0] < self.seq_len:
            # pad to seq_len if at the end of trajectory, mask for padding
            loss_pad_mask[joined.shape[0]:] = 0
            joined = pad_along_axis(joined, pad_to=self.seq_len, axis=0)

        joined_discrete = self.discretizer.encode(joined).reshape(-1).astype(np.longlong)
        loss_pad_mask = loss_pad_mask.reshape(-1)

        return joined_discrete[:-1], joined_discrete[1:], loss_pad_mask[:-1]

In [145]:
## Training

In [148]:
def train(config_path = "configs/medium/city_learn_traj.yaml",offline_data_path = None,device = "cpu"):
    config = OmegaConf.load(config_path)

    if offline_data_path is None :
        offline_data_path = config.trainer.offline_data_path
    if torch.cuda.is_available():
        device = "cuda:1"

  
    wandb.init(
            **config.wandb,
            config=dict(OmegaConf.to_container(config, resolve=True))
        )
    
    offline_data_path = offline_data_path
    dataset = load_from_disk(offline_data_path)

    trajectories, traj_lengths = segment(dataset["observations"],dataset["actions"],dataset["rewards"],dataset["dones"])
    offline_trajectories = sort_trajectory(trajectories)

    replay_buffer = ReplayBuffer(5, offline_trajectories)


    datasets = DiscretizedDataset(offline_trajectories,traj_lengths,discount = config.dataset.discount, seq_len = config.dataset.seq_len, strategy = config.dataset.strategy)
    dataloader = DataLoader(datasets,  batch_size=config.dataset.batch_size, shuffle=True, num_workers=8, pin_memory=True)

    trainer_conf = config.trainer
    data_conf = config.dataset

    model = GPT(**config.model)
    model.to(device)
    

    num_epochs = config.trainer.num_epochs

    warmup_tokens = len(datasets) * data_conf.seq_len * config.model.transition_dim
    final_tokens = warmup_tokens * num_epochs

    trainer = GPTTrainer(
            final_tokens=final_tokens,
            warmup_tokens=warmup_tokens,
            action_weight=trainer_conf.action_weight,
            value_weight=trainer_conf.value_weight,
            reward_weight=trainer_conf.reward_weight,
            learning_rate=trainer_conf.lr,
            betas=trainer_conf.betas,
            weight_decay=trainer_conf.weight_decay,
            clip_grad=trainer_conf.clip_grad,
            eval_seed=trainer_conf.eval_seed,
            eval_every=trainer_conf.eval_every,
            eval_episodes=trainer_conf.eval_episodes,
            eval_temperature=trainer_conf.eval_temperature,
            eval_discount=trainer_conf.eval_discount,
            eval_plan_every=trainer_conf.eval_plan_every,
            eval_beam_width=trainer_conf.eval_beam_width,
            eval_beam_steps=trainer_conf.eval_beam_steps,
            eval_beam_context=trainer_conf.eval_beam_context,
            eval_sample_expand=trainer_conf.eval_sample_expand,
            eval_k_obs=trainer_conf.eval_k_obs,  # as in original implementation
            eval_k_reward=trainer_conf.eval_k_reward,
            eval_k_act=trainer_conf.eval_k_act,
            checkpoints_path=trainer_conf.checkpoints_path,
            save_every=1,
            device=device
        )
    
    trainer.train(
        model=model,
        dataloader=dataloader,
        num_epochs=num_epochs
    )

In [149]:
train(config_path = "configs/medium/city_learn_ott.yaml")

Exception ignored in: <function tqdm.__del__ at 0x115259310>
Traceback (most recent call last):
  File "/Users/danieljonatan/miniconda3/envs/stable3/lib/python3.9/site-packages/tqdm/std.py", line 1148, in __del__
    self.close()
  File "/Users/danieljonatan/miniconda3/envs/stable3/lib/python3.9/site-packages/tqdm/notebook.py", line 279, in close
    self.disp(bar_style='danger', check_delay=False)
AttributeError: 'tqdm' object has no attribute 'disp'


Segmenting:   0%|          | 0/30000 [00:00<?, ?it/s]

[1 2 3]
1
2
3


Joining transitions:   0%|          | 0/3 [00:00<?, ?it/s]

Discount 0.99
(8759, 44)
(8759, 5)
(8759, 1)
(8759, 1)
Discount 0.99
(8759, 44)
(8759, 5)
(8759, 1)
(8759, 1)
Discount 0.99
(3723, 44)
(3723, 5)
(3723, 1)
(3723, 1)


Training:   0%|          | 0/20 [00:00<?, ?it/s]

Traceback (most recent call last):
  File "<string>", line 1, in <module>
  File "/Users/danieljonatan/miniconda3/envs/stable3/lib/python3.9/multiprocessing/spawn.py", line 116, in spawn_main
    exitcode = _main(fd, parent_sentinel)
  File "/Users/danieljonatan/miniconda3/envs/stable3/lib/python3.9/multiprocessing/spawn.py", line 126, in _main
    self = reduction.pickle.load(from_parent)
AttributeError: Can't get attribute 'DiscretizedDataset' on <module '__main__' (built-in)>


KeyboardInterrupt: 

## Online Tuning Part