## install minigrid

In [None]:
!pip install minigrid
!pip install gym==0.26.2

Collecting minigrid
  Downloading minigrid-2.3.1-py3-none-any.whl (103 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/103.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m [32m102.4/103.8 kB[0m [31m3.4 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m103.8/103.8 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
Collecting gymnasium>=0.28.1 (from minigrid)
  Downloading gymnasium-0.29.1-py3-none-any.whl (953 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m953.9/953.9 kB[0m [31m43.1 MB/s[0m eta [36m0:00:00[0m
Collecting farama-notifications>=0.0.1 (from gymnasium>=0.28.1->minigrid)
  Downloading Farama_Notifications-0.0.4-py3-none-any.whl (2.5 kB)
Installing collected packages: farama-notifications, gymnasium, minigrid
Successfully installed farama-notifications-0.0.4 gymnasium-0.29.1 minigrid-2.3.1
Collecting gym==0.26.2
  Downloading

In [None]:
# Mount Google Drive at /content/drive; the dataset and log paths configured
# later in this notebook live under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# import libs


In [None]:

import os
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

import sys
import random
import csv
from datetime import datetime
import pickle
import collections
import math

import numpy as np
import gym

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

from gym.envs.registration import registry, register


from __future__ import annotations

from minigrid.core.constants import COLOR_NAMES
from minigrid.core.grid import Grid
from minigrid.core.mission import MissionSpace
from minigrid.core.world_object import Door, Goal, Key, Wall
from minigrid.manual_control import ManualControl
from minigrid.minigrid_env import MiniGridEnv
import numpy as np




# training parameters

In [None]:
env_name = 'KeyToDoorEnv-v0'
rtg_target = 1
dataset_sz = 1659           # number of trajectories produced by the generation cell
# NOTE(review): the file name says 1610 while dataset_sz is 1659 -- confirm which is intended.
dataset_path = '/content/drive/MyDrive/6_8200_project/decision_transformer/new_key_to_door_1610.pkl'

T = 225                     # episode length

max_eval_ep_len = T         # max len of one evaluation episode
num_eval_ep = 10            # num of evaluation episodes per iteration

batch_size = 1              # training batch size
lr = 1e-4                   # learning rate
wt_decay = 1e-4             # weight decay
warmup_steps = 10000        # warmup steps for lr scheduler

# total updates = max_train_iters x num_updates_per_iter
max_train_iters = 200
num_updates_per_iter = 100

context_len = T             # K in decision transformer

act_dim = 7                 # num of actions
n_blocks = 3                # num of transformer blocks
embed_dim = 128             # embedding (hidden) dim of transformer
n_heads = 1                 # num of transformer heads
dropout_p = 0.1             # dropout probability


# saves model and csv in this directory
log_dir = '/content/drive/MyDrive/6_8200_project/decision_transformer/dt_runs_keytodoor_1k/'
# exist_ok=True already covers the "directory exists" case, so no extra guard is needed.
os.makedirs(log_dir, exist_ok=True)


# training and evaluation device: prefer the GPU, fall back to CPU when CUDA is unavailable
device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
device = torch.device(device_name)
print("device set to: ", device)



device set to:  cuda


# KeyToDoor environment

In [None]:
class KeyToDoorEnv(MiniGridEnv):
    """Three-phase key/door task on a 25x25 MiniGrid.

    Walls at columns 6-7 and 17-18 and at row 6 partition the grid. A key is
    placed at (4, 2) and a locked door of the same colour at (19, 5). The agent
    starts at (2, 5), is moved to (12, 5) when ``step_count`` reaches
    ``phase_2_begin`` and to (22, 1) when it reaches ``phase_3_begin``.

    Toggling a door terminates the episode; the reward is 1 if the door is not
    locked after the toggle and 0 otherwise.

    Args:
        phase_2_begin: step count at which the agent is moved to the phase-2 cell.
        phase_3_begin: step count at which the agent is moved to the phase-3 cell.
        max_steps: episode step limit; defaults to ``4 * size**2`` when ``None``.
        **kwargs: forwarded to ``MiniGridEnv.__init__``.
    """

    def __init__(
        self,
        phase_2_begin = 30,
        phase_3_begin = 60,
        max_steps: int | None = None,
        **kwargs,
    ):
        size = 25

        # Resolve the default BEFORE validating: comparing None with an int
        # raises TypeError, which made the default max_steps unusable.
        if max_steps is None:
            max_steps = 4 * size**2

        assert phase_2_begin < phase_3_begin, "Phase 2 must start before Phase 3"
        assert max_steps > phase_3_begin, "Episode must end after 3 phases"

        mission_space = MissionSpace(mission_func=self._gen_mission)
        self.agent_start_pos=(2, 5)
        self.agent_start_dir=0

        self.phase_2_iter_begin = phase_2_begin
        self.phase_3_iter_begin = phase_3_begin
        self.end_iter = max_steps

        super().__init__(
            mission_space=mission_space,
            grid_size=size,
            # Set this to True for maximum speed
            see_through_walls=False,
            max_steps=max_steps,
            agent_view_size=9,
            **kwargs,
        )

    @staticmethod
    def _gen_mission():
        """Return the fixed mission string."""
        return "grand mission"

    def _gen_grid(self, width, height):
        """Build the walls, place the key and the locked door, and position the agent."""
        # Create an empty grid
        self.grid = Grid(width, height)

        # Generate the surrounding walls
        self.grid.wall_rect(0, 0, width, height)

        # Generate the double vertical separation walls
        for i in range(0, height):
            self.grid.set(6, i, Wall())
            self.grid.set(7, i, Wall())
            self.grid.set(17, i, Wall())
            self.grid.set(18, i, Wall())
        # Horizontal wall along row 6
        for i in range(0, width):
            self.grid.set(i, 6, Wall())
        # Individual wall cells
        self.grid.set(1, 3, Wall())
        self.grid.set(5, 3, Wall())
        self.grid.set(19, 3, Wall())
        self.grid.set(23, 3, Wall())

        # Place key
        self.grid.set(4, 2, Key(COLOR_NAMES[1]))

        # Place the locked door (same colour as the key)
        self.put_obj(Door(COLOR_NAMES[1], is_locked=True), 19, 5)

        # Place the agent
        if self.agent_start_pos is not None:
            self.agent_pos = self.agent_start_pos
            self.agent_dir = self.agent_start_dir
        else:
            self.place_agent()

        self.mission = "grand mission"

    def step(self, action):
        """Advance one step.

        On the two phase-transition steps the agent is repositioned and the
        supplied action is ignored. Otherwise the action is applied as below.

        Returns:
            ``(obs, reward, terminated, truncated, info)`` with an empty ``info`` dict.

        Raises:
            ValueError: if ``action`` matches none of the known actions.
        """
        reward = 0
        terminated = False
        truncated = False

        self.step_count += 1

        # Phase transitions: move the agent and return immediately.
        if self.step_count == self.phase_2_iter_begin:
            self.agent_pos = (12, 5)
            self.agent_dir = self.agent_start_dir
            obs = self.gen_obs()
            return obs, reward, terminated, truncated, {}
        elif self.step_count == self.phase_3_iter_begin:
            self.agent_pos = (22,1)
            self.agent_dir = self.agent_start_dir
            obs = self.gen_obs()
            return obs, reward, terminated, truncated, {}

        # Get the position in front of the agent
        fwd_pos = self.front_pos

        # Get the contents of the cell in front of the agent
        fwd_cell = self.grid.get(*fwd_pos)

        # Rotate left
        if action == self.actions.left:
            self.agent_dir -= 1
            if self.agent_dir < 0:
                self.agent_dir += 4

        # Rotate right
        elif action == self.actions.right:
            self.agent_dir = (self.agent_dir + 1) % 4

        # Move forward
        elif action == self.actions.forward:
            if fwd_cell is None or fwd_cell.can_overlap():
                self.agent_pos = tuple(fwd_pos)

        # Pick up an object
        elif action == self.actions.pickup:
            if fwd_cell and fwd_cell.can_pickup():
                if self.carrying is None:
                    self.carrying = fwd_cell
                    self.carrying.cur_pos = np.array([-1, -1])
                    self.grid.set(fwd_pos[0], fwd_pos[1], None)

        # Drop an object
        elif action == self.actions.drop:
            if not fwd_cell and self.carrying:
                self.grid.set(fwd_pos[0], fwd_pos[1], self.carrying)
                self.carrying.cur_pos = fwd_pos
                self.carrying = None

        # Toggle/activate an object
        elif action == self.actions.toggle:
            if fwd_cell:
                fwd_cell.toggle(self, fwd_pos)

            # Any toggle aimed at a door ends the episode; it is rewarded only
            # when the door is no longer locked after the toggle.
            if fwd_cell is not None and fwd_cell.type == "door":
                terminated = True
                if not fwd_cell.is_locked:
                    reward = 1
                else:
                    reward = 0
        # Done action (not used by default)
        elif action == self.actions.done:
            pass

        else:
            raise ValueError(f"Unknown action: {action}")

        if self.step_count >= self.max_steps:
            truncated = True

        if self.render_mode == "human":
            self.render()

        obs = self.gen_obs()

        return obs, reward, terminated, truncated, {}




# Dataset generation (skip if you already have a dataset)

In [None]:
DROP_ACTION = 4  # action index excluded from the first three quarters of the dataset


def _collect_random_trajectory(env, max_len, banned_actions=()):
    """Roll out one episode of uniformly sampled actions.

    Args:
        env: environment exposing ``reset``, ``step`` and ``action_space.sample``.
        max_len: maximum number of steps to take.
        banned_actions: actions that are re-sampled until a different one is drawn.

    Returns:
        dict with ``rewards``, ``dones``, ``actions`` (one ``[action]`` row per
        step) and ``observations`` (the image seen before each action) arrays.
    """
    first_obs, _ = env.reset()
    images = [first_obs['image']]
    actions, rewards, dones = [], [], []

    for _ in range(max_len):
        action = env.action_space.sample()
        while action in banned_actions:
            action = env.action_space.sample()

        obs, reward, done, truncated, _info = env.step(np.array([action]))
        images.append(obs['image'])
        actions.append([action])
        rewards.append(reward)
        dones.append(done)

        if done or truncated:
            break

    return {
        'rewards': np.array(rewards),
        'dones': np.array(dones),
        'actions': np.array(actions),
        # drop the final observation so observations align one-to-one with actions
        'observations': np.array(images[:-1]),
    }


env = KeyToDoorEnv(phase_2_begin=100, phase_3_begin=125, max_steps=T)
trajectories = []
good = 0  # number of steps with a non-zero reward across the dataset

# First three quarters: random actions, never "drop".
num_no_drop = dataset_sz // 4 * 3
for _ in range(num_no_drop):
    traj = _collect_random_trajectory(env, T, banned_actions=(DROP_ACTION,))
    good += int(np.count_nonzero(traj['rewards']))
    trajectories.append(traj)

print(good)

# Remainder: random actions over the full action space.
for _ in range(dataset_sz - num_no_drop):
    traj = _collect_random_trajectory(env, T)
    good += int(np.count_nonzero(traj['rewards']))
    trajectories.append(traj)

print(good)
with open(dataset_path, 'wb') as f:
  pickle.dump(trajectories, f)

11
11


In [None]:
# Count trajectories whose total reward is positive.
a = 0
for t in trajectories:
  a += int(t['rewards'].sum() > 0)
print("Num successful trajs", a)

Num successful trajs 11


# decision transformer model

In [None]:

"""
this extremely minimal GPT model is based on:
Misha Laskin's tweet:
https://twitter.com/MishaLaskin/status/1481767788775628801?cxt=HHwWgoCzmYD9pZApAAAA

and its corresponding notebook:
https://colab.research.google.com/drive/1NUBqyboDcGte5qAJKOl8gaJC28V_73Iv?usp=sharing

the above colab has a bug while applying masked_fill which is fixed in the
following code

"""

class MaskedCausalAttention(nn.Module):
    """Multi-head self-attention restricted by a lower-triangular (causal) mask.

    Args:
        h_dim: width of the input and output features.
        max_T: largest sequence length the stored mask supports.
        n_heads: number of attention heads; ``h_dim`` is split evenly across them.
        drop_p: dropout probability for the attention output and the projection.
    """

    def __init__(self, h_dim, max_T, n_heads, drop_p):
        super().__init__()

        self.n_heads = n_heads
        self.max_T = max_T

        # query / key / value projections
        self.q_net = nn.Linear(h_dim, h_dim)
        self.k_net = nn.Linear(h_dim, h_dim)
        self.v_net = nn.Linear(h_dim, h_dim)

        # output projection
        self.proj_net = nn.Linear(h_dim, h_dim)

        self.att_drop = nn.Dropout(drop_p)
        self.proj_drop = nn.Dropout(drop_p)

        # Stored as a buffer so it follows the module across devices
        # without being treated as a trainable parameter.
        causal = torch.tril(torch.ones((max_T, max_T)))
        self.register_buffer('mask', causal.view(1, 1, max_T, max_T))

    def forward(self, x):
        """Apply causal self-attention to ``x`` of shape (batch, seq, h_dim)."""
        batch, seq_len, channels = x.shape
        heads = self.n_heads
        head_dim = channels // heads

        def split_heads(t):
            # (batch, seq, channels) -> (batch, heads, seq, head_dim)
            return t.view(batch, seq_len, heads, head_dim).transpose(1, 2)

        q = split_heads(self.q_net(x))
        k = split_heads(self.k_net(x))
        v = split_heads(self.v_net(x))

        # scaled dot-product scores, (batch, heads, seq, seq)
        scores = q @ k.transpose(2, 3) / math.sqrt(head_dim)

        # positions above the diagonal get -inf and therefore zero weight after softmax
        blocked = self.mask[..., :seq_len, :seq_len] == 0
        scores = scores.masked_fill(blocked, float('-inf'))
        probs = F.softmax(scores, dim=-1)

        # weighted values, (batch, heads, seq, head_dim)
        mixed = self.att_drop(probs @ v)

        # merge heads back: (batch, heads, seq, head_dim) -> (batch, seq, heads * head_dim)
        merged = mixed.transpose(1, 2).contiguous().view(batch, seq_len, heads * head_dim)

        return self.proj_drop(self.proj_net(merged))


class Block(nn.Module):
    """Pre-LayerNorm transformer block: causal attention then an MLP, each with a residual."""

    def __init__(self, h_dim, max_T, n_heads, drop_p):
        super().__init__()
        self.attention = MaskedCausalAttention(h_dim, max_T, n_heads, drop_p)

        hidden_dim = 4 * h_dim
        self.mlp = nn.Sequential(
            nn.Linear(h_dim, hidden_dim),
            nn.GELU(),
            nn.Linear(hidden_dim, h_dim),
            nn.Dropout(drop_p),
        )
        self.ln1 = nn.LayerNorm(h_dim)
        self.ln2 = nn.LayerNorm(h_dim)

    def forward(self, x):
        # LayerNorm is applied before each sub-layer rather than after it
        # (discussion: https://github.com/openai/CLIP/issues/91).
        attended = x + self.attention(self.ln1(x))
        return attended + self.mlp(self.ln2(attended))


class DecisionTransformer(nn.Module):
    """Decision transformer over image observations and discrete actions.

    Each timestep contributes three tokens (return-to-go, state, action); the
    action logits are read from the state token.

    Args:
        act_dim: number of discrete actions.
        n_blocks: number of transformer blocks.
        h_dim: embedding width.
        context_len: number of timesteps per input window (sequence is 3x this).
        n_heads: attention heads per block.
        drop_p: dropout probability.
        rtg_dim: accepted for interface compatibility; not used.
        max_timestep: size of the timestep embedding table.
    """

    def __init__(self, act_dim, n_blocks, h_dim, context_len,
                 n_heads, drop_p, rtg_dim, max_timestep=4096):
        super().__init__()

        self.act_dim = act_dim
        self.h_dim = h_dim

        ### transformer blocks
        input_seq_len = 3 * context_len
        blocks = [Block(h_dim, input_seq_len, n_heads, drop_p) for _ in range(n_blocks)]
        self.transformer = nn.Sequential(*blocks)

        ### projection heads (project to embedding)
        self.embed_ln = nn.LayerNorm(h_dim)
        self.embed_timestep = nn.Embedding(max_timestep, h_dim)
        self.embed_rtg = torch.nn.Linear(1, h_dim)

        # Three unpadded 3x3 convolutions shrink a 9x9 image to 3x3 with 64
        # channels, hence the 576 inputs of the linear layer.
        self.state_encoder = nn.Sequential(nn.Conv2d(3, 16, (3,3)),
                                           nn.ReLU(),
                                           nn.Conv2d(16,32,(3,3)),
                                           nn.ReLU(),
                                           nn.Conv2d(32, 64, (3,3)),
                                           nn.ReLU(),
                                           nn.Flatten(),
                                           nn.Linear(576, h_dim),
                                           nn.Tanh())
        # Prediction head
        self.head = nn.Linear(h_dim, act_dim, bias=False)

        # discrete actions
        self.embed_action = torch.nn.Embedding(act_dim, h_dim)

        self.loss = nn.CrossEntropyLoss()


    def forward(self, timesteps, states, actions, targets, returns_to_go, traj_mask):
        """Compute action logits and the cross-entropy loss on unmasked steps.

        Args:
            timesteps: (B, T) integer timestep indices.
            states: (B, T, H, W, C) channels-last images.
            actions: integer action indices, one per step.
            targets: integer target actions, one per step.
            returns_to_go: (B, T, 1) returns-to-go.
            traj_mask: B*T entries in total; entries > 0 mark real (non-padded) steps.

        Returns:
            ``(logits, loss)`` where ``logits`` has one row per unmasked step.
        """
        B, seq_length, H, W, chan = states.shape

        # Conv2d expects channels-first input. The axes must be permuted: a
        # plain reshape to (-1, C, H, W) reinterprets memory and scrambles the
        # image. NOTE: weights trained with the old reshape saw a different
        # pixel layout and need retraining.
        states = states.reshape(B * seq_length, H, W, chan).permute(0, 3, 1, 2)

        time_embeddings = self.embed_timestep(timesteps)
        # time embeddings are treated similar to positional embeddings
        state_embeddings = (
            self.state_encoder(states.to(torch.float32)).float().reshape(B, seq_length, -1)
            + time_embeddings
        )
        # Explicit reshape instead of a bare squeeze(), which also removed the
        # batch/sequence axis whenever that axis had size 1.
        action_embeddings = (
            self.embed_action(actions.long()).float().reshape(B, seq_length, self.h_dim)
            + time_embeddings
        )
        returns_embeddings = self.embed_rtg(returns_to_go) + time_embeddings


        # stack rtg, states and actions and reshape sequence as
        # (r1, s1, a1, r2, s2, a2 ...)
        h = torch.stack(
            (returns_embeddings, state_embeddings, action_embeddings), dim=1
        ).permute(0, 2, 1, 3).reshape(B, 3 * seq_length, self.h_dim)

        h = self.embed_ln(h)

        # transformer and prediction
        h = self.transformer(h)

        # get h reshaped such that its size = (B x 3 x T x h_dim) and
        # h[:, 0, t] is conditioned on r_0, s_0, a_0 ... r_t
        # h[:, 1, t] is conditioned on r_0, s_0, a_0 ... r_t, s_t
        # h[:, 2, t] is conditioned on r_0, s_0, a_0 ... r_t, s_t, a_t
        h = h.reshape(B, seq_length, 3, self.h_dim).permute(0, 2, 1, 3)

        # Predict action based on (r, s)
        logits = self.head(h[:, 1])

        # only consider non padded elements; use the module's own act_dim
        # rather than a notebook-level global
        keep = traj_mask.reshape(-1) > 0
        logits = logits.reshape(-1, self.act_dim)[keep]
        targets = targets.reshape(-1)[keep]

        loss = self.loss(logits, targets)
        return logits, loss


# s4 model

In [None]:
def split_train_val(train, val_split):
    """Randomly split ``train`` into (train, val) subsets with a fixed seed of 42.

    ``val_split`` is the fraction of items that goes to the validation subset.
    """
    n_train = int(len(train) * (1.0-val_split))
    n_val = len(train) - n_train
    seeded = torch.Generator().manual_seed(42)
    train_subset, val_subset = torch.utils.data.random_split(
        train, (n_train, n_val), generator=seeded
    )
    return train_subset, val_subset


class S4Model(nn.Module):
    """Decision-transformer-style sequence model built from stacked S4D layers.

    Relies on ``S4D``, ``dropout_fn`` and the notebook-level ``lr`` being
    defined elsewhere in the notebook.

    NOTE(review): the original ``forward`` could not run (undefined ``x``,
    ``self.encoder``, ``self.decoder`` and ``self.h_dim``, and a malformed
    ``print``). This version routes the interleaved token embeddings through
    the S4 stack and returns the three prediction heads -- confirm that this is
    the intended design.

    Args:
        state_dim: size of a flat state vector.
        act_dim: size of a (continuous) action vector.
        h_dim: embedding width.
        context_len: accepted for interface compatibility; not used here.
        dropout: dropout rate passed to each S4D layer and to ``dropout_fn``.
        n_layers: number of S4D residual layers.
        max_timestep: size of the timestep embedding table.
        prenorm: apply LayerNorm before (True) or after (False) each S4 layer.
    """

    def __init__(
        self,
        state_dim,
        act_dim,
        h_dim,
        context_len,
        dropout=0.2,
        n_layers=4,
        max_timestep=4096,
        prenorm=False,
    ):
        super().__init__()

        self.h_dim = h_dim  # read by forward() when reshaping the token sequence
        self.prenorm = prenorm

        ### projection heads (project to embedding)
        self.embed_ln = nn.LayerNorm(h_dim)
        self.embed_timestep = nn.Embedding(max_timestep, h_dim)
        self.embed_rtg = torch.nn.Linear(1, h_dim)
        self.embed_state = torch.nn.Linear(state_dim, h_dim)

        # continuous actions
        self.embed_action = torch.nn.Linear(act_dim, h_dim)
        use_action_tanh = True # True for continuous actions

        # Stack S4 layers as residual blocks
        self.s4_layers = nn.ModuleList()
        self.norms = nn.ModuleList()
        self.dropouts = nn.ModuleList()
        for _ in range(n_layers):
            self.s4_layers.append(
                S4D(h_dim, dropout=dropout, transposed=True, lr=min(0.001, lr))
            )
            self.norms.append(nn.LayerNorm(h_dim))
            self.dropouts.append(dropout_fn(dropout))

        ### prediction heads
        self.predict_rtg = torch.nn.Linear(h_dim, 1)
        self.predict_state = torch.nn.Linear(h_dim, state_dim)
        self.predict_action = nn.Sequential(
            *([nn.Linear(h_dim, act_dim)] + ([nn.Tanh()] if use_action_tanh else []))
        )

    def forward(self, timesteps, states, actions, returns_to_go):
        """Run the model on one batch of windows.

        Args:
            timesteps: (B, T) integer timestep indices.
            states: (B, T, state_dim).
            actions: (B, T, act_dim).
            returns_to_go: (B, T, 1).

        Returns:
            ``(state_preds, action_preds, return_preds)``.
        """
        B, T, _ = states.shape

        time_embeddings = self.embed_timestep(timesteps)

        # time embeddings are treated similar to positional embeddings
        state_embeddings = self.embed_state(states) + time_embeddings
        action_embeddings = self.embed_action(actions) + time_embeddings
        returns_embeddings = self.embed_rtg(returns_to_go) + time_embeddings

        # stack rtg, states and actions and reshape sequence as
        # (r1, s1, a1, r2, s2, a2 ...)
        h = torch.stack(
            (returns_embeddings, state_embeddings, action_embeddings), dim=1
        ).permute(0, 2, 1, 3).reshape(B, 3 * T, self.h_dim)

        h = self.embed_ln(h)  # (B, 3T, h_dim)

        # The S4D layers are built with transposed=True, so feed (B, h_dim, 3T).
        x = h.transpose(-1, -2)

        for layer, norm, dropout in zip(self.s4_layers, self.norms, self.dropouts):
            # Each iteration maps (B, h_dim, 3T) -> (B, h_dim, 3T)
            z = x
            if self.prenorm:
                # Prenorm
                z = norm(z.transpose(-1, -2)).transpose(-1, -2)

            # Apply S4 block: we ignore the state input and output
            z, _ = layer(z)

            # Dropout on the output of the S4 block
            z = dropout(z)

            # Residual connection
            x = z + x

            if not self.prenorm:
                # Postnorm
                x = norm(x.transpose(-1, -2)).transpose(-1, -2)

        # back to (B, 3T, h_dim), then split the three token types:
        # h[:, 0] = return tokens, h[:, 1] = state tokens, h[:, 2] = action tokens
        h = x.transpose(-1, -2).reshape(B, T, 3, self.h_dim).permute(0, 2, 1, 3)

        # get predictions
        return_preds = self.predict_rtg(h[:,2])     # predict next rtg given r, s, a
        state_preds = self.predict_state(h[:,2])    # predict next state given r, s, a
        action_preds = self.predict_action(h[:,1])  # predict action given r, s

        return state_preds, action_preds, return_preds



def setup_optimizer(model, lr, weight_decay, epochs):
    """
    S4 requires a specific optimizer setup.

    The S4 layer (A, B, C, dt) parameters typically
    require a smaller learning rate (typically 0.001), with no weight decay.

    The rest of the model can be trained with a higher learning rate (e.g. 0.004, 0.01)
    and weight decay (if desired).

    Parameters carrying an ``_optim`` dict attribute are placed in their own
    param groups using that dict as hyperparameters; all other parameters use
    ``lr`` and ``weight_decay``.

    Args:
        model: module whose parameters are optimized.
        lr: learning rate for the general parameters.
        weight_decay: weight decay for the general parameters.
        epochs: ``T_max`` of the cosine annealing schedule.

    Returns:
        ``(optimizer, scheduler)``: an AdamW optimizer and a CosineAnnealingLR scheduler.
    """

    # All parameters in the model
    all_parameters = list(model.parameters())

    # General parameters don't contain the special _optim key
    params = [p for p in all_parameters if not hasattr(p, "_optim")]

    # Create an optimizer with the general parameters.
    # Referenced through torch.optim because no bare `optim` name is imported.
    optimizer = torch.optim.AdamW(params, lr=lr, weight_decay=weight_decay)

    # Add parameters with special hyperparameters
    hps = [getattr(p, "_optim") for p in all_parameters if hasattr(p, "_optim")]
    hps = [
        dict(s) for s in sorted(list(dict.fromkeys(frozenset(hp.items()) for hp in hps)))
    ]  # Unique dicts
    for hp in hps:
        params = [p for p in all_parameters if getattr(p, "_optim", None) == hp]
        optimizer.add_param_group(
            {"params": params, **hp}
        )

    # Create a lr scheduler
    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, epochs)

    # Print optimizer info
    keys = sorted(set([k for hp in hps for k in hp.keys()]))
    for i, g in enumerate(optimizer.param_groups):
        group_hps = {k: g.get(k, None) for k in keys}
        print(' | '.join([
            f"Optimizer group {i}",
            f"{len(g['params'])} tensors",
        ] + [f"{k} {v}" for k, v in group_hps.items()]))

    return optimizer, scheduler


# utils

In [None]:


def discount_cumsum(x, gamma):
    """Return the discounted reverse cumulative sum of ``x`` along axis 0.

    ``out[t] = x[t] + gamma * out[t + 1]`` with ``out[-1] = x[-1]``. The result
    has the same shape and dtype as ``x`` (so fractional values are truncated
    when ``x`` is an integer array). An empty input yields an empty output.
    """
    disc_cumsum = np.zeros_like(x)
    if x.shape[0] == 0:
        # nothing to accumulate; x[-1] would raise IndexError
        return disc_cumsum

    disc_cumsum[-1] = x[-1]
    for t in reversed(range(x.shape[0]-1)):
        disc_cumsum[t] = x[t] + gamma * disc_cumsum[t+1]
    return disc_cumsum

def evaluate_on_env(model, device, context_len, env, rtg_target,
                    num_eval_ep=10, max_test_ep_len=1000,
                    state_mean=None, state_std=None, render=False):
    """Roll out ``model`` greedily in ``env`` and report average reward and length.

    Args:
        model: object with ``eval()`` and ``forward(timesteps, states, actions,
            targets, returns_to_go, traj_mask)`` returning ``(logits, loss)``
            with one logits row per unmasked step.
        device: torch device for the rollout tensors.
        context_len: number of most recent timesteps fed to the model.
        env: environment whose observations carry a 9x9x3 ``'image'`` entry.
        rtg_target: return-to-go used at the first step of every episode.
        num_eval_ep: number of episodes to run.
        max_test_ep_len: step limit per episode.
        state_mean, state_std: accepted for interface compatibility; states are
            not normalized here.
        render: call ``env.render()`` after every step.

    Returns:
        dict with ``'eval/avg_reward'`` and ``'eval/avg_ep_len'``.
    """
    eval_batch_size = 1  # required for forward pass
    act_dim = 1          # one discrete action index is stored per step

    results = {}
    total_reward = 0
    total_timesteps = 0

    # same as timesteps used for training the transformer
    # also, crashes if device is passed to arange()
    timesteps = torch.arange(start=0, end=max_test_ep_len, step=1)
    timesteps = timesteps.repeat(eval_batch_size, 1).to(device)

    model.eval()

    with torch.no_grad():

        for _ in range(num_eval_ep):

            # zeros place holders; actions are kept as integer indices so no
            # per-step dtype conversion is needed
            actions = torch.zeros((eval_batch_size, max_test_ep_len, act_dim),
                                dtype=torch.long, device=device)

            states = torch.zeros((eval_batch_size, max_test_ep_len, 9, 9, 3),
                                dtype=torch.float32, device=device)

            rewards_to_go = torch.zeros((eval_batch_size, max_test_ep_len, 1),
                                dtype=torch.float32, device=device)

            # init episode
            running_state, _ = env.reset()
            running_reward = 0
            running_rtg = rtg_target

            for t in range(max_test_ep_len):

                total_timesteps += 1

                # add state in placeholder
                states[0, t] = torch.from_numpy(running_state['image']).to(device)

                # calculate running rtg and add in placeholder
                running_rtg = running_rtg - (running_reward)
                rewards_to_go[0, t] = running_rtg

                if t < context_len:
                    # window starts at step 0; steps after t are padding
                    window = slice(0, context_len)
                    valid_len = t + 1
                else:
                    # sliding window ending at the current step, no padding
                    window = slice(t - context_len + 1, t + 1)
                    valid_len = context_len

                window_len = states[:, window].shape[1]
                traj_mask = torch.cat([
                    torch.ones(valid_len, dtype=torch.long),
                    torch.zeros(window_len - valid_len, dtype=torch.long),
                ]).to(device)

                act_preds, _ = model.forward(timesteps[:, window],
                                             states[:, window],
                                             actions[:, window],
                                             actions[:, window],
                                             rewards_to_go[:, window],
                                             traj_mask)

                # Masked rows are dropped by the model, so the last row is the
                # current step in both cases. (Indexing with t overflowed once
                # t >= context_len.)
                act = int(act_preds[-1].argmax().item())

                # Step with the chosen action itself; a second argmax over the
                # scalar index always evaluated to 0.
                running_state, running_reward, done, truncated, _ = env.step(act)

                # add action in placeholder
                actions[0, t] = act

                total_reward += running_reward

                if render:
                    env.render()
                if done or truncated:
                    break

    results['eval/avg_reward'] = total_reward / num_eval_ep
    results['eval/avg_ep_len'] = total_timesteps / num_eval_ep

    return results

from gym.wrappers import RecordVideo

def evaluate_on_env_video(model, device, context_len, env, rtg_target,
                    num_eval_ep=10, max_test_ep_len=1000,
                    state_mean=None, state_std=None, render=False):
    """Roll out ``model`` greedily in ``env`` while recording a video of each episode.

    Every episode wraps ``env`` in a ``RecordVideo`` wrapper that writes to the
    hard-coded Drive folder below with a per-episode name prefix.

    Args:
        model: object with ``eval()`` and ``forward(timesteps, states, actions,
            targets, returns_to_go, traj_mask)`` returning ``(logits, loss)``
            with one logits row per unmasked step.
        device: torch device for the rollout tensors.
        context_len: number of most recent timesteps fed to the model.
        env: environment whose observations carry a 9x9x3 ``'image'`` entry.
        rtg_target: return-to-go used at the first step of every episode.
        num_eval_ep: number of episodes to run.
        max_test_ep_len: step limit per episode.
        state_mean, state_std: accepted for interface compatibility; unused.
        render: call ``render()`` on the wrapped env after every step.

    Returns:
        dict with ``'eval/avg_reward'`` and ``'eval/avg_ep_len'``.
    """
    eval_batch_size = 1  # required for forward pass
    act_dim = 1          # one discrete action index is stored per step

    results = {}
    total_reward = 0
    total_timesteps = 0

    # same as timesteps used for training the transformer
    # also, crashes if device is passed to arange()
    timesteps = torch.arange(start=0, end=max_test_ep_len, step=1)
    timesteps = timesteps.repeat(eval_batch_size, 1).to(device)

    model.eval()

    with torch.no_grad():

        for episode_i in range(num_eval_ep):

            # zeros place holders; actions are kept as integer indices so no
            # per-step dtype conversion is needed
            actions = torch.zeros((eval_batch_size, max_test_ep_len, act_dim),
                                dtype=torch.long, device=device)

            states = torch.zeros((eval_batch_size, max_test_ep_len, 9, 9, 3),
                                dtype=torch.float32, device=device)

            rewards_to_go = torch.zeros((eval_batch_size, max_test_ep_len, 1),
                                dtype=torch.float32, device=device)

            # init episode
            wrapped_env = RecordVideo(env, episode_trigger=lambda x: True, video_folder='/content/drive/MyDrive/6_8200_project/decision_transformer/keytodoor_video', name_prefix=f'1k_traj_ep{episode_i}')
            try:
                running_state, _ = wrapped_env.reset()
                wrapped_env.start_video_recorder()
                running_reward = 0
                running_rtg = rtg_target

                for t in range(max_test_ep_len):

                    total_timesteps += 1

                    # add state in placeholder
                    states[0, t] = torch.from_numpy(running_state['image']).to(device)

                    # calculate running rtg and add in placeholder
                    running_rtg = running_rtg - (running_reward)
                    rewards_to_go[0, t] = running_rtg

                    if t < context_len:
                        # window starts at step 0; steps after t are padding
                        window = slice(0, context_len)
                        valid_len = t + 1
                    else:
                        # sliding window ending at the current step, no padding
                        window = slice(t - context_len + 1, t + 1)
                        valid_len = context_len

                    window_len = states[:, window].shape[1]
                    traj_mask = torch.cat([
                        torch.ones(valid_len, dtype=torch.long),
                        torch.zeros(window_len - valid_len, dtype=torch.long),
                    ]).to(device)

                    act_preds, _ = model.forward(timesteps[:, window],
                                                 states[:, window],
                                                 actions[:, window],
                                                 actions[:, window],
                                                 rewards_to_go[:, window],
                                                 traj_mask)

                    # Masked rows are dropped by the model, so the last row is
                    # the current step in both cases. (Indexing with t
                    # overflowed once t >= context_len.)
                    act = int(act_preds[-1].argmax().item())

                    # Step with the chosen action itself; a second argmax over
                    # the scalar index always evaluated to 0.
                    running_state, running_reward, done, truncated, _ = wrapped_env.step(act)

                    # add action in placeholder
                    actions[0, t] = act

                    total_reward += running_reward

                    if render:
                        wrapped_env.render()
                    if done or truncated:
                        break
            finally:
                # finalize the video even when the episode raises
                wrapped_env.close_video_recorder()
                wrapped_env.close()

    results['eval/avg_reward'] = total_reward / num_eval_ep
    results['eval/avg_ep_len'] = total_timesteps / num_eval_ep

    return results




# dataset

In [None]:
## check data

# load dataset
# NOTE: pickle.load executes arbitrary code from the file; only load datasets you created.
with open(dataset_path, 'rb') as f:
    trajectories = pickle.load(f)

# Shortest trajectory in the dataset, starting from a sentinel of 10**4.
min_len = 10**4
for traj in trajectories:
    if traj['observations'].shape[0] < min_len:
        min_len = traj['observations'].shape[0]

print(dataset_path)
print("num of trajectories in dataset: ", len(trajectories))
print("minimum trajectory length in dataset: ", min_len)



/content/drive/MyDrive/6_8200_project/decision_transformer/key_to_door_random_1000.pkl
num of trajectories in dataset:  1000
minimum trajectory length in dataset:  142


In [None]:


class D4RLTrajectoryDataset(Dataset):
    """Fixed-length windows cut from a pickled collection of trajectories.

    Every item is a ``(timesteps, states, actions, returns_to_go, traj_mask)``
    tuple whose leading dimension is ``context_len``. Trajectories with at
    least ``context_len`` steps yield a randomly positioned window; shorter
    ones are returned whole and zero-padded at the end, with ``traj_mask``
    holding 1 for real steps and 0 for padding.
    """

    def __init__(self, dataset_path, context_len):
        """Load the trajectories and precompute returns-to-go and state stats.

        Args:
            dataset_path: path of a pickle file containing a sequence of
                dicts; this class reads the 'observations', 'actions' and
                'rewards' entries of each dict.
            context_len: number of steps in every window returned by
                ``__getitem__``.
        """
        self.context_len = context_len

        # NOTE: pickle.load can execute arbitrary code while deserializing;
        # only point this at dataset files from a trusted source.
        with open(dataset_path, 'rb') as f:
            self.trajectories = pickle.load(f)

        states = []
        for traj in self.trajectories:
            states.append(traj['observations'])
            # discount_cumsum is defined elsewhere in this notebook; it is
            # called with a discount argument of 1.0 and its result is stored
            # as-is (no rescaling) -- presumably the undiscounted
            # reward-to-go of every step, TODO confirm.
            traj['returns_to_go'] = discount_cumsum(traj['rewards'], 1.0)

        # Per-feature statistics over all steps of all trajectories; the
        # added 1e-6 keeps the std strictly positive.
        states = np.concatenate(states, axis=0)
        self.state_mean, self.state_std = np.mean(states, axis=0), np.std(states, axis=0) + 1e-6

    def get_state_stats(self):
        """Return the ``(state_mean, state_std)`` pair computed in ``__init__``."""
        return self.state_mean, self.state_std

    def __len__(self):
        """Number of trajectories (one sampled window per trajectory)."""
        return len(self.trajectories)

    @staticmethod
    def _pad_to_context(array, padding_len):
        """Convert ``array`` to a tensor and append ``padding_len`` zero rows."""
        tensor = torch.from_numpy(array)
        filler = torch.zeros([padding_len] + list(tensor.shape[1:]),
                             dtype=tensor.dtype)
        return torch.cat([tensor, filler], dim=0)

    def __getitem__(self, idx):
        """Return one ``context_len``-long window of trajectory ``idx``.

        Returns:
            Tuple ``(timesteps, states, actions, returns_to_go, traj_mask)``.
            ``timesteps`` are the step indices of the window inside the
            trajectory (``0 .. context_len-1`` for padded trajectories).
        """
        traj = self.trajectories[idx]
        traj_len = traj['observations'].shape[0]

        if traj_len >= self.context_len:
            # sample random index to slice trajectory (both ends inclusive)
            si = random.randint(0, traj_len - self.context_len)
            window = slice(si, si + self.context_len)

            states = torch.from_numpy(traj['observations'][window])
            actions = torch.from_numpy(traj['actions'][window])
            returns_to_go = torch.from_numpy(traj['returns_to_go'][window])
            timesteps = torch.arange(start=si, end=si + self.context_len, step=1)

            # all ones since no padding
            traj_mask = torch.ones(self.context_len, dtype=torch.long)

        else:
            padding_len = self.context_len - traj_len

            # padding with zeros
            states = self._pad_to_context(traj['observations'], padding_len)
            actions = self._pad_to_context(traj['actions'], padding_len)
            returns_to_go = self._pad_to_context(traj['returns_to_go'], padding_len)

            timesteps = torch.arange(start=0, end=self.context_len, step=1)

            traj_mask = torch.cat([torch.ones(traj_len, dtype=torch.long),
                                   torch.zeros(padding_len, dtype=torch.long)],
                                  dim=0)

        return timesteps, states, actions, returns_to_go, traj_mask




In [None]:
# Merge two pickled trajectory collections into one file: the contents of
# the "good_traj" pickle followed by the dataset stored at `dataset_path`.
# NOTE: pickle.load can execute arbitrary code; only open files you trust.
with open('/content/drive/MyDrive/6_8200_project/decision_transformer/keytodoor_610_good_traj.pkl', 'rb') as f:
    gt = pickle.load(f)

with open(dataset_path, 'rb') as f:
    dt = pickle.load(f)

# In-place append, so `gt` now holds both collections.
gt += dt
print(len(gt))

with open('/content/drive/MyDrive/6_8200_project/decision_transformer/new_key_to_door_1610.pkl', 'wb') as f:
    pickle.dump(gt, f)

1659


# train

In [None]:

# Checkpoint whose weights initialise the model before training; set to None
# to start from freshly initialised weights.
model_load = '/content/drive/MyDrive/6_8200_project/decision_transformer/dt_runs_keytodoor_1k/dt_keytodoor_model_24-05-07-15-56-27_best.pt'
start_time = datetime.now().replace(microsecond=0)

start_time_str = start_time.strftime("%y-%m-%d-%H-%M-%S")

prefix = "dt_keytodoor"

# Every run writes to files stamped with its start time, so earlier runs are
# never overwritten.
save_model_name = prefix + "_model_" + start_time_str + ".pt"
save_model_path = os.path.join(log_dir, save_model_name)
save_best_model_path = save_model_path[:-3] + "_best.pt"

log_csv_name = prefix + "_log_" + start_time_str + ".csv"
log_csv_path = os.path.join(log_dir, log_csv_name)

csv_header = (["duration", "num_updates", "action_loss",
               "eval_avg_reward", "eval_avg_ep_len", "eval_d4rl_score"])


print("=" * 60)
print("start time: " + start_time_str)
print("=" * 60)

print("device set to: " + str(device))
print("dataset path: " + dataset_path)
print("model save path: " + save_model_path)
print("log csv save path: " + log_csv_path)


def collate_fn(data):
    """Transpose a batch: a list of per-sample tuples becomes one tuple per field."""
    return list(zip(*data))


traj_dataset = D4RLTrajectoryDataset(dataset_path, context_len)

traj_data_loader = DataLoader(traj_dataset,
                              batch_size=batch_size,
                              shuffle=True,
                              pin_memory=True,
                              drop_last=True, collate_fn=collate_fn)

data_iter = iter(traj_data_loader)

env = KeyToDoorEnv(phase_2_begin=100, phase_3_begin=125, max_steps=T)


model = DecisionTransformer(
    act_dim=act_dim,
    n_blocks=n_blocks,
    h_dim=embed_dim,
    context_len=context_len,
    n_heads=n_heads,
    drop_p=dropout_p,
    rtg_dim=2
).to(device)

if model_load is not None:
    print("loading model from: " + model_load)
    # map_location lets a checkpoint saved on another device (e.g. CUDA) be
    # restored on the current one, matching how the test cell loads it.
    model.load_state_dict(torch.load(model_load, map_location=device))

optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=lr,
    weight_decay=wt_decay
)

# Learning-rate multiplier grows linearly to 1 over the first `warmup_steps`
# scheduler steps and then stays at 1.
scheduler = torch.optim.lr_scheduler.LambdaLR(
    optimizer,
    lambda steps: min((steps+1)/warmup_steps, 1)
)

max_d4rl_score = -np.inf
total_updates = 0

# The log file stays open for the whole training run and is closed when the
# block exits, including on an exception or a manual interrupt. newline=''
# is what the csv module expects; buffering=1 flushes every written row.
with open(log_csv_path, 'a', 1, newline='') as log_csv_file:
    csv_writer = csv.writer(log_csv_file)
    csv_writer.writerow(csv_header)

    for i_train_iter in range(max_train_iters):

        log_action_losses = []
        model.train()

        for _ in range(num_updates_per_iter):
            try:
                timesteps, states, actions, returns_to_go, traj_mask = next(data_iter)
            except StopIteration:
                # loader exhausted: start a new (reshuffled) pass over the data
                data_iter = iter(traj_data_loader)
                timesteps, states, actions, returns_to_go, traj_mask = next(data_iter)

            # collate_fn yields one tuple of per-sample tensors per field;
            # stacking adds the batch dimension in front.
            timesteps = torch.stack(timesteps, dim=0).to(device)    # B x T
            states = torch.stack(states, dim=0).to(device)          # B x T x (observation dims)
            actions = torch.stack(actions, dim=0).type(torch.LongTensor).to(device)
            # trailing feature axis added here; B x T x 1 assuming per-step scalar returns
            returns_to_go = torch.stack(returns_to_go, dim=0).to(device).unsqueeze(dim=-1).float()
            traj_mask = torch.stack(traj_mask, dim=0).to(device)    # B x T

            # the dataset actions are passed both as input and as targets
            logits, loss = model.forward(
                timesteps=timesteps,
                states=states,
                actions=actions,
                targets=actions,
                returns_to_go=returns_to_go,
                traj_mask=traj_mask
            )

            optimizer.zero_grad()
            loss.backward()
            # torch.nn.utils.clip_grad_norm_(model.parameters(), 0.25)
            optimizer.step()
            scheduler.step()

            log_action_losses.append(loss.detach().cpu().item())

        # evaluate on env
        results = evaluate_on_env(model, device, context_len, env, rtg_target,
                                  num_eval_ep, max_eval_ep_len)
        eval_avg_reward = results['eval/avg_reward']
        eval_avg_ep_len = results['eval/avg_ep_len']
        # the average reward itself is used as the model-selection score
        eval_d4rl_score = eval_avg_reward

        mean_action_loss = np.mean(log_action_losses)
        time_elapsed = str(datetime.now().replace(microsecond=0) - start_time)

        total_updates += num_updates_per_iter

        log_str = ("=" * 60 + '\n' +
                   "time elapsed: " + time_elapsed + '\n' +
                   "num of updates: " + str(total_updates) + '\n' +
                   "action loss: " + format(mean_action_loss, ".5f") + '\n' +
                   "eval avg reward: " + format(eval_avg_reward, ".5f") + '\n' +
                   "eval avg ep len: " + format(eval_avg_ep_len, ".5f") + '\n'
                   )

        print(log_str)

        # one value per csv_header column, in the same order
        log_data = [time_elapsed, total_updates, mean_action_loss,
                    eval_avg_reward, eval_avg_ep_len, eval_d4rl_score]

        csv_writer.writerow(log_data)

        # save model
        print("max score: " + format(max_d4rl_score, ".5f"))
        # ">=" means the most recent model wins ties
        if eval_d4rl_score >= max_d4rl_score:
            print("saving max score model at: " + save_best_model_path)
            torch.save(model.state_dict(), save_best_model_path)
            max_d4rl_score = eval_d4rl_score

        print("saving current model at: " + save_model_path)
        torch.save(model.state_dict(), save_model_path)


print("=" * 60)
print("finished training!")
print("=" * 60)
end_time = datetime.now().replace(microsecond=0)
time_elapsed = str(end_time - start_time)
end_time_str = end_time.strftime("%y-%m-%d-%H-%M-%S")
print("started training at: " + start_time_str)
print("finished training at: " + end_time_str)
print("total training time: " + time_elapsed)
print("max score: " + format(max_d4rl_score, ".5f"))
print("saved max score model at: " + save_best_model_path)
print("saved last updated model at: " + save_model_path)
print("=" * 60)




start time: 24-05-07-16-12-47
device set to: cuda
dataset path: /content/drive/MyDrive/6_8200_project/decision_transformer/new_key_to_door_1610.pkl
model save path: /content/drive/MyDrive/6_8200_project/decision_transformer/dt_runs_keytodoor_1k/dt_keytodoor_model_24-05-07-16-12-47.pt
log csv save path: /content/drive/MyDrive/6_8200_project/decision_transformer/dt_runs_keytodoor_1k/dt_keytodoor_log_24-05-07-16-12-47.csv
loading model from: /content/drive/MyDrive/6_8200_project/decision_transformer/dt_runs_keytodoor_1k/dt_keytodoor_model_24-05-07-15-56-27_best.pt
time elapsed: 0:00:15
num of updates: 100
action loss: 1.69467
eval avg reward: 0.00000
eval avg ep len: 225.00000

max score: -inf
saving max score model at: /content/drive/MyDrive/6_8200_project/decision_transformer/dt_runs_keytodoor_1k/dt_keytodoor_model_24-05-07-16-12-47_best.pt
saving current model at: /content/drive/MyDrive/6_8200_project/decision_transformer/dt_runs_keytodoor_1k/dt_keytodoor_model_24-05-07-16-12-47.pt
tim

# test

In [None]:

num_test_eval_ep = 10           # num of evaluation episodes
eval_max_eval_ep_len = T        # max len of one episode


# Model hyper-parameters; they have to match the ones the checkpoint was
# trained with, otherwise load_state_dict fails on mismatching shapes.
context_len = T        # K in decision transformer
n_blocks = 3            # num of transformer blocks
embed_dim = 128         # embedding (hidden) dim of transformer
n_heads = 1             # num of transformer heads
dropout_p = 0.1         # dropout probability
# act_dim is reused unchanged from the training-parameters cell.


eval_chk_pt_dir = '/content/drive/MyDrive/6_8200_project/decision_transformer'
# Must stay relative to eval_chk_pt_dir: os.path.join drops every earlier
# component when a later one starts with '/', which would lose the directory.
eval_chk_pt_name = "dt_runs_keytodoor_1k/dt_keytodoor_model_24-05-07-15-14-46_best.pt"


eval_chk_pt_list = [eval_chk_pt_name]

eval_env = KeyToDoorEnv(phase_2_begin=75, phase_3_begin=150, max_steps=T, render_mode='rgb_array')
eval_rtg_target = 0

all_scores = []

for eval_chk_pt_name in eval_chk_pt_list:

    # NOTE(review): the training cell constructs the model with rtg_dim=2
    # while rtg_dim=T is used here -- confirm which value the checkpoint expects.
    eval_model = DecisionTransformer(
        act_dim=act_dim,
        n_blocks=n_blocks,
        h_dim=embed_dim,
        context_len=context_len,
        n_heads=n_heads,
        drop_p=dropout_p,
        rtg_dim=T
    ).to(device)

    eval_chk_pt_path = os.path.join(eval_chk_pt_dir, eval_chk_pt_name)

    # load checkpoint
    eval_model.load_state_dict(torch.load(eval_chk_pt_path, map_location=device))

    print("model loaded from: " + eval_chk_pt_path)

    # evaluate on env
    results = evaluate_on_env_video(eval_model, device, context_len,
                                    eval_env, eval_rtg_target,
                                    num_test_eval_ep,
                                    max_test_ep_len=eval_max_eval_ep_len)
    print(results)

    # the raw average reward is reported as the score
    norm_score = results['eval/avg_reward']
    print("score: ", norm_score)

    all_scores.append(norm_score)

print("=" * 60)
all_scores = np.array(all_scores)
print("evaluated on env: keytodoor-v0 ")
print("total num of checkpoints evaluated: " + str(len(eval_chk_pt_list)))
print("score mean: " + format(all_scores.mean(), ".5f"))
print("score std: " + format(all_scores.std(), ".5f"))
print("=" * 60)




model loaded from: /content/drive/MyDrive/6_8200_project/decision_transformer/dt_runs_keytodoor_1k/dt_keytodoor_model_24-05-07-12-57-23_best.pt
Moviepy - Building video /content/drive/MyDrive/6_8200_project/decision_transformer/keytodoor_video/1k_traj_ep0-episode-0.mp4.
Moviepy - Writing video /content/drive/MyDrive/6_8200_project/decision_transformer/keytodoor_video/1k_traj_ep0-episode-0.mp4



                                                  

Moviepy - Done !
Moviepy - video ready /content/drive/MyDrive/6_8200_project/decision_transformer/keytodoor_video/1k_traj_ep0-episode-0.mp4




Moviepy - Building video /content/drive/MyDrive/6_8200_project/decision_transformer/keytodoor_video/1k_traj_ep0-episode-0.mp4.
Moviepy - Writing video /content/drive/MyDrive/6_8200_project/decision_transformer/keytodoor_video/1k_traj_ep0-episode-0.mp4





Moviepy - Done !
Moviepy - video ready /content/drive/MyDrive/6_8200_project/decision_transformer/keytodoor_video/1k_traj_ep0-episode-0.mp4
Moviepy - Building video /content/drive/MyDrive/6_8200_project/decision_transformer/keytodoor_video/1k_traj_ep1-episode-0.mp4.
Moviepy - Writing video /content/drive/MyDrive/6_8200_project/decision_transformer/keytodoor_video/1k_traj_ep1-episode-0.mp4



                                                  

Moviepy - Done !
Moviepy - video ready /content/drive/MyDrive/6_8200_project/decision_transformer/keytodoor_video/1k_traj_ep1-episode-0.mp4




Moviepy - Building video /content/drive/MyDrive/6_8200_project/decision_transformer/keytodoor_video/1k_traj_ep1-episode-0.mp4.
Moviepy - Writing video /content/drive/MyDrive/6_8200_project/decision_transformer/keytodoor_video/1k_traj_ep1-episode-0.mp4





Moviepy - Done !
Moviepy - video ready /content/drive/MyDrive/6_8200_project/decision_transformer/keytodoor_video/1k_traj_ep1-episode-0.mp4
Moviepy - Building video /content/drive/MyDrive/6_8200_project/decision_transformer/keytodoor_video/1k_traj_ep2-episode-0.mp4.
Moviepy - Writing video /content/drive/MyDrive/6_8200_project/decision_transformer/keytodoor_video/1k_traj_ep2-episode-0.mp4



                                                  

Moviepy - Done !
Moviepy - video ready /content/drive/MyDrive/6_8200_project/decision_transformer/keytodoor_video/1k_traj_ep2-episode-0.mp4




Moviepy - Building video /content/drive/MyDrive/6_8200_project/decision_transformer/keytodoor_video/1k_traj_ep2-episode-0.mp4.
Moviepy - Writing video /content/drive/MyDrive/6_8200_project/decision_transformer/keytodoor_video/1k_traj_ep2-episode-0.mp4





Moviepy - Done !
Moviepy - video ready /content/drive/MyDrive/6_8200_project/decision_transformer/keytodoor_video/1k_traj_ep2-episode-0.mp4
Moviepy - Building video /content/drive/MyDrive/6_8200_project/decision_transformer/keytodoor_video/1k_traj_ep3-episode-0.mp4.
Moviepy - Writing video /content/drive/MyDrive/6_8200_project/decision_transformer/keytodoor_video/1k_traj_ep3-episode-0.mp4



                                                  

Moviepy - Done !
Moviepy - video ready /content/drive/MyDrive/6_8200_project/decision_transformer/keytodoor_video/1k_traj_ep3-episode-0.mp4




Moviepy - Building video /content/drive/MyDrive/6_8200_project/decision_transformer/keytodoor_video/1k_traj_ep3-episode-0.mp4.
Moviepy - Writing video /content/drive/MyDrive/6_8200_project/decision_transformer/keytodoor_video/1k_traj_ep3-episode-0.mp4





Moviepy - Done !
Moviepy - video ready /content/drive/MyDrive/6_8200_project/decision_transformer/keytodoor_video/1k_traj_ep3-episode-0.mp4
Moviepy - Building video /content/drive/MyDrive/6_8200_project/decision_transformer/keytodoor_video/1k_traj_ep4-episode-0.mp4.
Moviepy - Writing video /content/drive/MyDrive/6_8200_project/decision_transformer/keytodoor_video/1k_traj_ep4-episode-0.mp4



                                                  

Moviepy - Done !
Moviepy - video ready /content/drive/MyDrive/6_8200_project/decision_transformer/keytodoor_video/1k_traj_ep4-episode-0.mp4




Moviepy - Building video /content/drive/MyDrive/6_8200_project/decision_transformer/keytodoor_video/1k_traj_ep4-episode-0.mp4.
Moviepy - Writing video /content/drive/MyDrive/6_8200_project/decision_transformer/keytodoor_video/1k_traj_ep4-episode-0.mp4





Moviepy - Done !
Moviepy - video ready /content/drive/MyDrive/6_8200_project/decision_transformer/keytodoor_video/1k_traj_ep4-episode-0.mp4
Moviepy - Building video /content/drive/MyDrive/6_8200_project/decision_transformer/keytodoor_video/1k_traj_ep5-episode-0.mp4.
Moviepy - Writing video /content/drive/MyDrive/6_8200_project/decision_transformer/keytodoor_video/1k_traj_ep5-episode-0.mp4



                                                  

Moviepy - Done !
Moviepy - video ready /content/drive/MyDrive/6_8200_project/decision_transformer/keytodoor_video/1k_traj_ep5-episode-0.mp4




Moviepy - Building video /content/drive/MyDrive/6_8200_project/decision_transformer/keytodoor_video/1k_traj_ep5-episode-0.mp4.
Moviepy - Writing video /content/drive/MyDrive/6_8200_project/decision_transformer/keytodoor_video/1k_traj_ep5-episode-0.mp4





Moviepy - Done !
Moviepy - video ready /content/drive/MyDrive/6_8200_project/decision_transformer/keytodoor_video/1k_traj_ep5-episode-0.mp4
Moviepy - Building video /content/drive/MyDrive/6_8200_project/decision_transformer/keytodoor_video/1k_traj_ep6-episode-0.mp4.
Moviepy - Writing video /content/drive/MyDrive/6_8200_project/decision_transformer/keytodoor_video/1k_traj_ep6-episode-0.mp4



                                                  

Moviepy - Done !
Moviepy - video ready /content/drive/MyDrive/6_8200_project/decision_transformer/keytodoor_video/1k_traj_ep6-episode-0.mp4




Moviepy - Building video /content/drive/MyDrive/6_8200_project/decision_transformer/keytodoor_video/1k_traj_ep6-episode-0.mp4.
Moviepy - Writing video /content/drive/MyDrive/6_8200_project/decision_transformer/keytodoor_video/1k_traj_ep6-episode-0.mp4





Moviepy - Done !
Moviepy - video ready /content/drive/MyDrive/6_8200_project/decision_transformer/keytodoor_video/1k_traj_ep6-episode-0.mp4
Moviepy - Building video /content/drive/MyDrive/6_8200_project/decision_transformer/keytodoor_video/1k_traj_ep7-episode-0.mp4.
Moviepy - Writing video /content/drive/MyDrive/6_8200_project/decision_transformer/keytodoor_video/1k_traj_ep7-episode-0.mp4



                                                  

Moviepy - Done !
Moviepy - video ready /content/drive/MyDrive/6_8200_project/decision_transformer/keytodoor_video/1k_traj_ep7-episode-0.mp4




Moviepy - Building video /content/drive/MyDrive/6_8200_project/decision_transformer/keytodoor_video/1k_traj_ep7-episode-0.mp4.
Moviepy - Writing video /content/drive/MyDrive/6_8200_project/decision_transformer/keytodoor_video/1k_traj_ep7-episode-0.mp4





Moviepy - Done !
Moviepy - video ready /content/drive/MyDrive/6_8200_project/decision_transformer/keytodoor_video/1k_traj_ep7-episode-0.mp4
Moviepy - Building video /content/drive/MyDrive/6_8200_project/decision_transformer/keytodoor_video/1k_traj_ep8-episode-0.mp4.
Moviepy - Writing video /content/drive/MyDrive/6_8200_project/decision_transformer/keytodoor_video/1k_traj_ep8-episode-0.mp4



                                                  

Moviepy - Done !
Moviepy - video ready /content/drive/MyDrive/6_8200_project/decision_transformer/keytodoor_video/1k_traj_ep8-episode-0.mp4




Moviepy - Building video /content/drive/MyDrive/6_8200_project/decision_transformer/keytodoor_video/1k_traj_ep8-episode-0.mp4.
Moviepy - Writing video /content/drive/MyDrive/6_8200_project/decision_transformer/keytodoor_video/1k_traj_ep8-episode-0.mp4





Moviepy - Done !
Moviepy - video ready /content/drive/MyDrive/6_8200_project/decision_transformer/keytodoor_video/1k_traj_ep8-episode-0.mp4
Moviepy - Building video /content/drive/MyDrive/6_8200_project/decision_transformer/keytodoor_video/1k_traj_ep9-episode-0.mp4.
Moviepy - Writing video /content/drive/MyDrive/6_8200_project/decision_transformer/keytodoor_video/1k_traj_ep9-episode-0.mp4



                                                  

Moviepy - Done !
Moviepy - video ready /content/drive/MyDrive/6_8200_project/decision_transformer/keytodoor_video/1k_traj_ep9-episode-0.mp4




Moviepy - Building video /content/drive/MyDrive/6_8200_project/decision_transformer/keytodoor_video/1k_traj_ep9-episode-0.mp4.
Moviepy - Writing video /content/drive/MyDrive/6_8200_project/decision_transformer/keytodoor_video/1k_traj_ep9-episode-0.mp4





Moviepy - Done !
Moviepy - video ready /content/drive/MyDrive/6_8200_project/decision_transformer/keytodoor_video/1k_traj_ep9-episode-0.mp4
{'eval/avg_reward': 0.0, 'eval/avg_ep_len': 225.0}
score:  0.0
evaluated on env: keytodoor-v0 
total num of checkpoints evaluated: 1
score mean: 0.00000
score std: 0.00000


In [None]:
from IPython.display import HTML
from base64 import b64encode

# Embed one recorded evaluation episode directly in the notebook output.
file = '/content/drive/MyDrive/6_8200_project/decision_transformer/keytodoor_video/1k_traj_ep8-episode-0.mp4'

# Read inside a context manager so the file handle is closed right away.
with open(file, 'rb') as video_file:
    mp4 = video_file.read()

# The whole video is inlined as a base64 data URL, so the rendered player
# does not depend on the file staying reachable.
data_url = "data:video/mp4;base64," + b64encode(mp4).decode()

# Last expression of the cell: its rich HTML repr is what gets displayed.
HTML("""
<video width=400 controls>
      <source src="%s" type="video/mp4">
</video>
""" % data_url)