## Device check

In [1]:
!nvidia-smi

Tue Oct 24 13:42:44 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.60.13    Driver Version: 525.60.13    CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla V100-SXM2...  On   | 00000000:B3:00.0 Off |                    0 |
| N/A   32C    P0    39W / 300W |      0MiB / 32768MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

## Packages

In [2]:
import warnings
warnings.filterwarnings(action="ignore")

In [3]:
import numpy as np

import logging
logging.basicConfig(level="INFO")
import os

import pandas as pd
from pprint import pprint

import sys
sys.path.append("../src")

import torch
import torch.nn as nn
import torch.nn.functional as F

from torchinfo import summary

import lightning.pytorch as pl

INFO:torch.distributed.nn.jit.instantiator:Created a temporary directory at /tmp/tmpdmcndn60
INFO:torch.distributed.nn.jit.instantiator:Writing /tmp/tmpdmcndn60/_remote_module_non_scriptable.py


In [4]:
os.environ["TOKENIZERS_PARALLELISM"] = "false"
%load_ext autoreload
%autoreload 2

In [19]:
import config
from dataloader import BEDataset, BEDataModule
import token_learner
from transformer import PositionalEncoder,TransformerDecoder, generate_masks

from film_layers import FiLMBlockV2, FiLMEncoder, ResBlockDWConv
from rt1 import RT1Encoder
from utils.model_utils import TextEncoder, ImageFeatureExtractor

## Load data summary

In [7]:
csv = pd.read_csv(os.path.join(config.DATASET_PATH, "train.csv"))

csv.head()

Unnamed: 0,sample_ID,in_state,goal_state,action_description,motor_cmd,len_action_desc,len_motor_cmd,version
0,7294,0,10,put the fork to the right of buttermilk,:FORK GREEN POSE-9 :BUTTERMILK GREEN POSE-2 :F...,8,11,v2
1,405,0,8,move the bottle backwards,:BOTTLE RED POSE-2 :BOTTLE #'*backward-transf...,4,8,v1
2,4235,0,10,put the bottle to the left of breakfast-cereal,:BOTTLE RED POSE-7 :BREAKFAST-CEREAL BLUE POSE...,8,11,v2
3,6990,0,10,put the milk in front of bottle,:MILK BLUE POSE-8 :BOTTLE RED POSE-4 :MILK #'...,7,11,v2
4,7096,0,10,put the cup in front of glasses,:CUP GREEN POSE-6 :GLASSES RED POSE-2 :CUP #'...,7,11,v2


In [8]:
# building data object
ds = BEDataset(
    df=csv    
)

len(ds)

4876

In [9]:
# fetching example
rand_idx = np.random.randint(low=0, high=len(ds))
ex = ds[rand_idx]

print("Dataset size: ", len(ds))
print("="*100)
print("ID\t: ", ex["sample_id"])
print(">> InState\t: ", ex["in_state"].shape)
print(">> Desc\t:")
pprint(ex["action_desc"])
print(">> Cmd\t:")
pprint(ex["motor_cmd"])
print("="*100)

Dataset size:  4876
ID	:  9401
>> InState	:  torch.Size([3, 224, 224])
>> Desc	:
{'ids': tensor([  101,  2404,  1996,  5442,  2000,  1996,  2157,  1997, 12256, 17130,
         2378,   102,     0,     0,     0,     0]),
 'length': 8,
 'mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0]),
 'raw': 'put the knife to the right of mondamin',
 'token_type_ids': tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])}
>> Cmd	:
{'ids': tensor([ 0, 17, 41, 30, 18, 41, 36, 17, 48, 18,  2,  2,  2,  2,  2,  2]),
 'length': 11,
 'mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0]),
 'raw': ':KNIFE GREEN POSE-10 :MONDAMIN GREEN POSE-3 :KNIFE  '
        "#'*rightward-transformation*  :MONDAMIN"}


## Data Module

In [10]:
dm = BEDataModule()
dm.setup()

Total # examples: 4876


INFO:root:Training on 3610 samples.
INFO:root:Validating on 1266 samples.


In [11]:
# print("="*100)
# logging.info("\n>> train data loader")
# print(f"# train batches\t: {len(dm.train_dataloader())}")
# for data in dm.train_dataloader():
#     # pprint(data)
#     sample_id, in_state, ad, cmd = data["sample_id"], data["in_state"], data["action_desc"], data["motor_cmd"]
#     print("In \t\t\t: ", in_state.shape)
#     print("Action desc \t\t: ", ad["ids"].shape)
#     print("Action desc (len) \t: ", ad["length"].shape)
#     print("CMD \t\t\t: ", cmd["ids"].shape)
#     print("CMD(len) \t\t: ", cmd["length"].shape)
#     break

# print("\nIDs & decided tokens")
# for data in dm.train_dataloader():
#     print(data["action_desc"]["ids"][0].tolist())
#     print(dm.train_ds._decode_inputs(data["action_desc"]["ids"][0].tolist()))
#     print()
#     print(data["motor_cmd"]["ids"][0].tolist())
#     print(dm.train_ds._decode_outputs(data["motor_cmd"]["ids"][0].tolist()))

#     break
    
# print("="*100)

## Fetch batch

In [12]:
%%time
sample = next(iter(dm.train_dataloader()))
sample["in_state"].shape

CPU times: user 1.24 s, sys: 1.49 s, total: 2.72 s
Wall time: 12 s


torch.Size([8, 3, 224, 224])

## Model Design

<!-- ![RT1 model architecture](../../imgs/rt1+.png) -->
<center>
    <img src="../imgs/rt1+.png" alt="RT1 model architecture" width="300" height="400">

<center>

### Encoder

#### Test Text Encoder

In [None]:
# te = TextEncoder(freeze=True).cuda()
# summary(model=te, col_names=["num_params", "trainable"])

In [None]:
# emb = te(
#     inp_ids=sample["action_desc"]["ids"].cuda(),
#     mask=sample["action_desc"]["mask"].cuda(),
#     tok_type_ids=sample["action_desc"]["token_type_ids"].cuda()
# )

# emb.shape

#### Test Img Feature Extractor

In [None]:
# fe = ImageFeatureExtractor(pretrained=True, arch="resnet34").cuda()

# summary(fe, col_names=["num_params", "trainable"])

In [None]:
# img_ftrs = fe(sample["in_state"].cuda())

# img_ftrs.shape

#### Test FiLM Block

In [None]:
# film_block = FiLMBlockV2().cuda()
# print(film_block)
# summary(model=film_block)

In [None]:
# text_cond_ftrs = film_block(
#     img_features=img_ftrs, 
#     conditioning=emb
# )

# text_cond_ftrs.shape

#### Test Residual FiLM Block

In [None]:
# dw_res = ResBlockDWConv(512, 512).cuda()
# summary(model=dw_res)

In [None]:
# text_cond_ftrs_res = dw_res(
#     img_features=img_ftrs, 
#     conditioning=emb
# )

# text_cond_ftrs_res.shape

#### Test FiLM Encoder

In [None]:
# film_encoder = FiLMEncoder(
#     arch="resnet34",
#     n_res_blocks=6,
# ).cuda()

# # print(film_encoder)
# summary(model=film_encoder)

In [None]:
# %%time

# out = film_encoder(
#     x= sample["in_state"].cuda(),
#     conditioning= emb
# )

# out.shape

#### Token Learner

In [None]:
# N, C, H_W = out.shape
# N, C, H_W

In [None]:
# tokL_v11 = token_learner.TokenLearnerModuleV11(feature_shape=(N, H_W, C)).cuda()
# print(tokL_v11)
# summary(model=tokL_v11)

In [None]:
# learned_tokens = tokL_v11(out.view(N, H_W, C))
# learned_tokens.shape

#### RT-1 Encoder

In [13]:
encoder = RT1Encoder(cnn_bacnbone="resnet34").to(config.DEVICE)
summary(model=encoder, col_names=["num_params", "trainable"])

Layer (type:depth-idx)                                       Param #                   Trainable
RT1Encoder                                                   --                        Partial
├─TextEncoder: 1-1                                           --                        False
│    └─BertModel: 2-1                                        --                        False
│    │    └─BertEmbeddings: 3-1                              (15,891,456)              False
│    │    └─BertEncoder: 3-2                                 (12,609,536)              False
│    │    └─BertPooler: 3-3                                  (262,656)                 False
│    └─Dropout: 2-2                                          --                        --
├─FiLMEncoder: 1-2                                           --                        Partial
│    └─ImageFeatureExtractor: 2-3                            --                        Partial
│    │    └─ResNet: 3-4                                      21

In [55]:
%%time 

src_enc, tokens = encoder(
    input_ids=sample["action_desc"]["ids"].cuda(),
    attn_mask=sample["action_desc"]["mask"].cuda(),
    token_type_ids=sample["action_desc"]["token_type_ids"].cuda(),
    imgs=sample["in_state"].cuda()
)

src_enc.shape, tokens.shape

CPU times: user 56.7 ms, sys: 34.8 ms, total: 91.5 ms
Wall time: 7.02 s


torch.Size([8, 512, 8])

### Decoder

#### Transformer Decoder

In [65]:
dec = TransformerDecoder().cuda()
summary(model=dec)

Layer (type:depth-idx)                   Param #
TransformerDecoder                       --
├─Embedding: 1-1                         26,624
├─ModuleList: 1-2                        --
│    └─TransformerDecoderLayer: 2-1      --
│    │    └─MultiHeadAttention: 3-1      1,049,088
│    │    └─LayerNorm: 3-2               1,024
│    │    └─MultiHeadAttention: 3-3      1,049,088
│    │    └─LayerNorm: 3-4               1,024
│    │    └─Sequential: 3-5              2,099,712
│    │    └─LayerNorm: 3-6               1,024
│    │    └─Dropout: 3-7                 --
│    └─TransformerDecoderLayer: 2-2      --
│    │    └─MultiHeadAttention: 3-8      1,049,088
│    │    └─LayerNorm: 3-9               1,024
│    │    └─MultiHeadAttention: 3-10     1,049,088
│    │    └─LayerNorm: 3-11              1,024
│    │    └─Sequential: 3-12             2,099,712
│    │    └─LayerNorm: 3-13              1,024
│    │    └─Dropout: 3-14                --
│    └─TransformerDecoderLayer: 2-3      --
│    │ 

In [71]:
dec_out = dec(
    src=src_enc, 
    encoder_outputs=tokens,
    y=None #sample["motor_cmd"]["ids"].cuda()
)

q: torch.Size([8, 1, 1, 512]) - k: torch.Size([8, 1, 1, 512]) - v: torch.Size([8, 1, 1, 512]) 
q: torch.Size([8, 8, 1, 512]) - k: torch.Size([8, 512, 8]) - v: torch.Size([8, 512, 8]) 
mask: torch.Size([8, 16, 16])


RuntimeError: mat1 and mat2 shapes cannot be multiplied (4096x8 and 512x512)

In [67]:
src_enc.shape

torch.Size([8, 512])

#### Action Generator

In [37]:
class ActionGenerator(nn.Module):
    def __init__(
        self,
        d_model:int=config.D_MODEL, 
        vocab_size:int=len(config.TARGETS)
    ):
        super().__init__()

        self.proj = nn.Linear(in_features=d_model, out_features=vocab_size)
        self._softmax = nn.LogSoftmax(dim=-1)

    def forward(self, tokens):
        return self._softmax(self.proj(tokens))
    

In [41]:
generator = ActionGenerator().cuda()
print(generator)
summary(generator)

ActionGenerator(
  (proj): Linear(in_features=512, out_features=52, bias=True)
  (_softmax): LogSoftmax(dim=-1)
)


Layer (type:depth-idx)                   Param #
ActionGenerator                          --
├─Linear: 1-1                            26,676
├─LogSoftmax: 1-2                        --
Total params: 26,676
Trainable params: 26,676
Non-trainable params: 0

In [43]:
N, C, T = tokens.shape

N, C, T

(8, 512, 8)

In [44]:
actions = generator(tokens.view(N, T, C))

actions.shape

torch.Size([8, 8, 52])

### RT-1 Decoder

In [None]:
class RT1Decoder(nn.Module):
    def __init__(self):
        super().__init__()
        
        self.positional_encoder = PositionalEncoder()
        self.transformer = TransformerDecoder()
        # 
        self.action_generator = ActionGenerator()
        

    def _positional_encoding(
        self,
        seq, 
        dim, 
        temperature = 10000, 
        device = None, 
        dtype = torch.float32
    ):
        n = torch.arange(seq, device = device)
        omega = torch.arange(dim // 2, device = device) / (dim // 2 - 1)
        omega = 1. / (temperature ** omega)

        n = n[:, None] * omega[None, :]
        pos_emb = torch.cat((n.sin(), n.cos()), dim = 1)
        
        return pos_emb.type(dtype)

    
    def forward(self, instructions, imgs):
        pass

### RT-1 

In [None]:
class RT1(pl.LightningModule):
    def __init__(
        self
    ):
        super().__init__()
        self.encoder = RT1Encder()
        self.decoder = RT1Decoder()
        
    def forward(self, input_ids, attn_mask, token_type_ids, imgs):
        
        tokens = self.encode(input_ids, attn_mask, token_type_ids, imgs)
        out = self.decode(tokens, )
    
    def encode(self, input_ids, attn_mask, token_type_ids, imgs):
        return self.encoder(input_ids, attn_mask, token_type_ids, imgs)
    
    def decode(self, enc_outputs, x_mask, y, y_mask):
        return self.decoder(y, enc_outputs, src_mask, tgt_mask)
    
    def configure_optimizers(self):
        pass
    
    def training_step(self, batch, batch_idx):
        pass
    
    def validation_step(self, batch, batch_idx):
        pass
    
    def test_step(self, batch, batch_idx):
        pass
    
    def compute_loss(self, outputs, targets):
        pass