The agent receives visual inputs (165x220
RGB pixels) and language inputs (an example input is shown
in Appendix Fig. 9). The pixel inputs pass through a series
of four ResNet blocks, with 3×3 kernels, strides of 2, 2, 2,
2, and an increasing number of output channels (32, 128,
256, 512). This results in 14×11 feature vectors, which we
flatten into a list of 154 tokens.

In [4]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class ResBlock(nn.Module):
    """Residual block: two 3x3 conv + batch-norm stages plus a projected skip path.

    The main branch applies ``conv1`` (which carries the stride) followed by
    ``conv2`` (stride 1). The skip branch is always a strided 1x1 convolution
    with batch-norm, so its output matches the main branch in both channel
    count and spatial size before the two are summed.

    Args:
        in_channels: number of channels of the input feature map.
        out_channels: number of channels produced by the block.
        stride: stride used by ``conv1`` and by the 1x1 skip convolution.
    """

    def __init__(self, in_channels, out_channels, stride):
        super().__init__()

        # Main branch (biases are omitted because each conv feeds a BatchNorm).
        self.conv1 = nn.Conv2d(
            in_channels, out_channels,
            kernel_size=3, stride=stride, padding=1, bias=False,
        )
        self.bn1 = nn.BatchNorm2d(out_channels)
        self.conv2 = nn.Conv2d(
            out_channels, out_channels,
            kernel_size=3, stride=1, padding=1, bias=False,
        )
        self.bn2 = nn.BatchNorm2d(out_channels)

        # Skip branch: 1x1 projection so the residual can be added element-wise.
        projection = nn.Conv2d(
            in_channels, out_channels,
            kernel_size=1, stride=stride, bias=False,
        )
        self.shortcut = nn.Sequential(projection, nn.BatchNorm2d(out_channels))

    def forward(self, x):
        """Return ``relu(main_branch(x) + shortcut(x))``."""
        hidden = F.relu(self.bn1(self.conv1(x)))
        main_branch = self.bn2(self.conv2(hidden))
        skip_branch = self.shortcut(x)
        return F.relu(main_branch + skip_branch)

class ResNet(nn.Module):
    """Four-stage convolutional encoder built from stride-2 ``ResBlock`` stages.

    Channel widths grow 3 -> 32 -> 128 -> 256 -> 512. The final feature map is
    reshaped to ``(batch, 512, H*W)``, i.e. the channel axis stays in front of
    the flattened spatial axis.
    """

    def __init__(self):
        super().__init__()
        widths = (3, 32, 128, 256, 512)
        # Registers layer1 .. layer4, each a single stride-2 ResBlock.
        for index, (c_in, c_out) in enumerate(zip(widths[:-1], widths[1:]), start=1):
            setattr(
                self,
                f"layer{index}",
                ResBlock(in_channels=c_in, out_channels=c_out, stride=2),
            )

    def forward(self, x):
        """Encode ``x`` and flatten the spatial grid into one trailing axis."""
        features = x
        for stage in (self.layer1, self.layer2, self.layer3, self.layer4):
            features = stage(features)
        # The original comment gives (512, 11, 14) per sample here for a
        # 165x220 input, which flattens to 154 positions.
        batch = features.size(0)
        return features.view(batch, 512, -1)

# Instantiate the model
model = ResNet()  # Four stages with one ResBlock each (layer1..layer4)
print('# paramters =', sum(p.numel() for p in model.parameters()))
# Smoke test on a single random 3x165x220 input.
print('output size =', model(torch.randn((1,3,165,220))).size())
print(model)



# paramters = 4791680
output size = torch.Size([1, 512, 154])
ResNet(
  (layer1): ResBlock(
    (conv1): Conv2d(3, 32, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
    (bn1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (conv2): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
    (bn2): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (shortcut): Sequential(
      (0): Conv2d(3, 32, kernel_size=(1, 1), stride=(2, 2), bias=False)
      (1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
  )
  (layer2): ResBlock(
    (conv1): Conv2d(32, 128, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
    (bn1): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (conv2): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
    (bn2): BatchNorm2d(128, eps=1e-05, moment

In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import hashlib

class LanguageTransformer(nn.Module):
    """Single attention layer that encodes a token sequence into one 512-d vector.

    Tokens are embedded, attended to with a learned query embedding, passed
    through a feed-forward block (each followed by a residual connection and
    LayerNorm), and the first sequence position is projected to 512 features.

    Args:
        vocab_size: number of in-vocabulary token ids (``0 .. vocab_size-1``).
        embed_size: width of the token embeddings and attention layer.
        extended_vocab_size: number of extra embedding rows reserved for
            hashed out-of-vocabulary words.
        transformer_heads: number of attention heads.
        transformer_hidden: hidden width of the feed-forward block.
    """

    def __init__(self, vocab_size=370, embed_size=64, extended_vocab_size=1000, transformer_heads=4, transformer_hidden=4*64):
        super().__init__()
        self.vocab_size = vocab_size
        self.embed_size = embed_size
        self.extended_vocab_size = extended_vocab_size
        # Learnable query embedding (similar to the CLS token in BERT).
        self.extra_embbeding = nn.Parameter(torch.randn(1, 1, embed_size))

        self.embedding = nn.Embedding(vocab_size + extended_vocab_size, embed_size)
        # forward() works on (batch, seq, embed) tensors, so the attention layer
        # must be batch-first; with the default layout it would attend across
        # batch elements instead of across the tokens of each sequence.
        self.MHA = nn.MultiheadAttention(embed_size, transformer_heads, batch_first=True)
        self.LN1 = nn.LayerNorm(embed_size, eps=1e-06)
        self.ffn = nn.Sequential(
            nn.Linear(embed_size, transformer_hidden),
            nn.GELU(),
            nn.Linear(transformer_hidden, embed_size),
        )
        self.LN2 = nn.LayerNorm(embed_size, eps=1e-06)
        self.output_proj = nn.Linear(embed_size, 512)

    def forward(self, tokens):
        """Encode ``tokens`` of shape ``(batch, seq)`` into a ``(batch, 512)`` tensor."""
        x = self.embedding(tokens)
        # Open design question carried over from the original note: should this
        # be one shared embedding repeated, or one per token / batch element?
        # Here the single learned embedding is broadcast to every position.
        x_extra = self.extra_embbeding.expand(tokens.size(0), tokens.size(1), -1)
        attended, _ = self.MHA(query=x_extra, key=x, value=x)
        x = self.LN1(x + attended)
        x = self.LN2(x + self.ffn(x))
        # Only the first sequence position is used as the summary vector.
        final_output = self.output_proj(x[:, 0, :])
        return final_output

    def hash_to_index(self, word):
        """Map an out-of-vocabulary word to a stable id in the extended range.

        Returns:
            An int in ``[vocab_size, vocab_size + extended_vocab_size)``;
            with the defaults that is 370..1369.
        """
        hash_digest = hashlib.sha256(word.encode()).digest()
        hash_int = int.from_bytes(hash_digest, 'big')
        # Offset by the configured vocabulary size rather than a literal 370,
        # so non-default vocab sizes still index valid embedding rows.
        return self.vocab_size + (hash_int % self.extended_vocab_size)
    

model = LanguageTransformer()
print('# parameters =', sum(p.numel() for p in model.parameters()))
# Example token indices (assuming preprocessing is already done): 3 sequences of 4 tokens
tokens = torch.tensor([[1, 2, 34, 5], [4, 3, 2, 9], [6, 5, 9, 15]])
output_embedding = model(tokens)
print('output size =', output_embedding.shape)  # Should print: torch.Size([3, 512])
print(output_embedding)
print(model)




# parameters = 171008
output size = torch.Size([3, 512])
tensor([[ 0.1027,  0.6116,  0.4241,  ...,  0.0612, -0.2628,  0.0431],
        [-0.1940,  1.1913, -0.3832,  ..., -0.5313,  0.2483, -0.6485],
        [ 0.4718,  0.7512, -0.5684,  ..., -0.5354, -1.1729,  0.9062]],
       grad_fn=<AddmmBackward0>)
LanguageTransformer(
  (embedding): Embedding(1370, 64)
  (MHA): MultiheadAttention(
    (out_proj): NonDynamicallyQuantizableLinear(in_features=64, out_features=64, bias=True)
  )
  (LN1): LayerNorm((64,), eps=1e-06, elementwise_affine=True)
  (ffn): Sequential(
    (0): Linear(in_features=64, out_features=256, bias=True)
    (1): GELU(approximate='none')
    (2): Linear(in_features=256, out_features=64, bias=True)
  )
  (LN2): LayerNorm((64,), eps=1e-06, elementwise_affine=True)
  (output_proj): Linear(in_features=64, out_features=512, bias=True)
)


In [4]:
from torch.nn import TransformerEncoder, TransformerEncoderLayer

# Stand-alone multimodal transformer: 8 encoder layers, 512-d model, 8 heads.
transformer_layer = TransformerEncoderLayer(d_model=512, nhead=8)
multimodal_transformer = TransformerEncoder(transformer_layer, num_layers=8)

print('# paramters =', sum(p.numel() for p in multimodal_transformer.parameters()))
# NOTE(review): batch_first is not set, so the (10, 2, 512) input is presumably
# read as (seq_len, batch, d_model) — confirm this is the intended layout.
print('output size =', multimodal_transformer(torch.randn((10,2,512))).size())
print(multimodal_transformer)


# paramters = 25219072
output size = torch.Size([10, 2, 512])
TransformerEncoder(
  (layers): ModuleList(
    (0-7): 8 x TransformerEncoderLayer(
      (self_attn): MultiheadAttention(
        (out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
      )
      (linear1): Linear(in_features=512, out_features=2048, bias=True)
      (dropout): Dropout(p=0.1, inplace=False)
      (linear2): Linear(in_features=2048, out_features=512, bias=True)
      (norm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
      (norm2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
      (dropout1): Dropout(p=0.1, inplace=False)
      (dropout2): Dropout(p=0.1, inplace=False)
    )
  )
)


In [10]:
import torch
import torch.nn as nn

# NOTE: it applies control to the output gate!

class ResidualLSTM(nn.Module):
    """Two stacked single-layer LSTMs, each wrapped in a residual connection.

    The first residual goes through a bias-free linear projection because the
    input width differs from the hidden width; the second is a plain identity
    skip.

    Args:
        input_size: feature width of the input.
        hidden_size: hidden (and output) width of both LSTMs.
        num_layers: accepted for interface compatibility; it is not read, the
            module always builds exactly two LSTM layers.
    """

    def __init__(self, input_size, hidden_size, num_layers=2):
        super().__init__()
        self.lstm1 = nn.LSTM(input_size, hidden_size, num_layers=1, batch_first=True)
        # Projects the input to hidden_size so it can be added to lstm1's output.
        self.Residualfc1 = nn.Linear(input_size, hidden_size, bias=False)
        self.lstm2 = nn.LSTM(hidden_size, hidden_size, num_layers=1, batch_first=True)

    def forward(self, x):
        """Return the second LSTM's output plus both residual contributions."""
        first = self.lstm1(x)[0] + self.Residualfc1(x)
        second = self.lstm2(first)[0]
        return second + first

# NOTE: ResidualLSTM.__init__ accepts num_layers but never reads it; the module
# always contains exactly two LSTM layers.
dual_lstm = ResidualLSTM(input_size=1536, hidden_size=512, num_layers=2)

print('# paramters =', sum(p.numel() for p in dual_lstm.parameters()))
# NOTE(review): the (10, 1536) input has no batch axis; presumably nn.LSTM treats
# it as one unbatched sequence of 10 steps — confirm this is intended.
print('output size =', dual_lstm(torch.randn((10,1536))).size())
print(dual_lstm)


# paramters = 7086080
output size = torch.Size([10, 512])
ResidualLSTM(
  (lstm1): LSTM(1536, 512, batch_first=True)
  (Residualfc1): Linear(in_features=1536, out_features=512, bias=False)
  (lstm2): LSTM(512, 512, batch_first=True)
)


In [1]:
import time
import gymnasium
import miniwob
from miniwob.action import ActionTypes, ActionSpaceConfig
import numpy as np 
#https://github.com/Farama-Foundation/miniwob-plusplus
# https://miniwob.farama.org/
gymnasium.register_envs(miniwob)

env = gymnasium.make('miniwob/click-test-2-v1', render_mode='human', action_space_config = "all_supported")
# Available action-space presets: "humphreys22", "all_supported", "shi17", "liu18"
for action_type in ActionTypes:
    print(action_type)


def _print_observation(obs):
    """Print every observation entry, showing only the shape for NumPy arrays."""
    for key, value in obs.items():
        summary = value.shape if isinstance(value, np.ndarray) else value
        print(key, summary)


# Wrap the episode in try-finally to ensure the environment is always closed.
try:
    # Start a new episode.
    obs, info = env.reset()
    print(info)
    _print_observation(obs)

    assert obs["utterance"] == "Click button ONE."
    assert obs["fields"] == (("target", "ONE"),)
    time.sleep(4)       # Only here to let you look at the environment.

    # Find the HTML element with text "ONE". Fail loudly if it is missing
    # instead of silently clicking whichever element happened to be last.
    element = next((el for el in obs["dom_elements"] if el["text"] == "ONE"), None)
    if element is None:
        raise RuntimeError('No DOM element with text "ONE" found in the observation')

    action = env.unwrapped.create_action(ActionTypes.CLICK_ELEMENT, ref=element["ref"])
    # Action types: NONE, MOVE_COORDS, CLICK_COORDS, DBLCLICK_COORDS, MOUSEDOWN_COORDS,
    # MOUSEUP_COORDS, SCROLL_UP_COORDS, SCROLL_DOWN_COORDS, PRESS_KEY, TYPE_TEXT,
    # CLICK_ELEMENT, TYPE_FIELD, FOCUS_ELEMENT_AND_TYPE_TEXT, FOCUS_ELEMENT_AND_TYPE_FIELD
    print('action', action)
    obs, reward, terminated, truncated, info = env.step(action)
    print(reward, terminated, truncated, info)
    _print_observation(obs)

    # Check if the action was correct.
    # NOTE(review): the original note expected ~0.8 after 2 seconds, but the
    # sleep above is 4 seconds, so the reward is presumably lower — confirm.
    print(reward)
    assert terminated is True
    time.sleep(2)

finally:
    env.close()



ActionTypes.NONE
ActionTypes.MOVE_COORDS
ActionTypes.CLICK_COORDS
ActionTypes.DBLCLICK_COORDS
ActionTypes.MOUSEDOWN_COORDS
ActionTypes.MOUSEUP_COORDS
ActionTypes.SCROLL_UP_COORDS
ActionTypes.SCROLL_DOWN_COORDS
ActionTypes.CLICK_ELEMENT
ActionTypes.PRESS_KEY
ActionTypes.TYPE_TEXT
ActionTypes.TYPE_FIELD
ActionTypes.FOCUS_ELEMENT_AND_TYPE_TEXT
ActionTypes.FOCUS_ELEMENT_AND_TYPE_FIELD
{'done': False, 'env_reward': 0, 'raw_reward': 0, 'reason': None, 'root_dom': [1] body @ (0, 0) classes=[] children=1}
utterance Click button ONE.
dom_elements ({'ref': 1, 'parent': 0, 'left': array([0.], dtype=float32), 'top': array([0.], dtype=float32), 'width': array([500.], dtype=float32), 'height': array([210.], dtype=float32), 'tag': 'body', 'text': '', 'value': '', 'id': '', 'classes': '', 'bg_color': array([0.33333334, 0.33333334, 0.33333334, 1.        ], dtype=float32), 'fg_color': array([0., 0., 0., 1.], dtype=float32), 'flags': array([1, 0, 0, 0], dtype=int8)}, {'ref': 2, 'parent': 1, 'left': array

In [None]:
import gymnasium
import miniwob
gymnasium.register_envs(miniwob)
env = gymnasium.make('miniwob/click-test-2-v1', render_mode='human')
# try-finally guarantees the environment is closed even if a step raises.
try:
    observation, info = env.reset(seed=42)
    for _ in range(1000):
        action = policy(observation)  # User-defined policy function
        observation, reward, terminated, truncated, info = env.step(action)
        # An episode is over when it terminates OR is truncated (e.g. a time
        # limit); in both cases the environment must be reset before stepping.
        if terminated or truncated:
            observation, info = env.reset()
finally:
    env.close()

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torchvision.models import resnet18
from torch.nn import TransformerEncoder, TransformerEncoderLayer

class CCNet(nn.Module):
    """Draft multimodal agent network: image + language encoders, LSTM core, action heads.

    forward() returns four tensors: action-type scores, cursor coordinates,
    key-index scores and the output of the task-field attention.

    NOTE(review): this looks like an unfinished sketch. Several tensor shapes in
    forward() appear inconsistent with each other (see the inline notes), so it
    should be checked end-to-end before it is relied upon.

    Args:
        num_tokens: number of rows of the shared embedding table.
        num_actions: output width of the action-type head.
        num_cursor: the cursor head emits ``num_cursor * 2`` values.
        num_keys: output width of the key-index head.
        num_fields: accepted but never read inside this class.
        embed_dim: embedding width, also used as d_model of both transformers
            and as the LSTM input size.
        num_heads: number of attention heads.
        hidden_dim: LSTM hidden size and input width of the output heads.
    """

    def __init__(self, num_tokens, num_actions, num_cursor, num_keys, num_fields, embed_dim, num_heads, hidden_dim):
        super(CCNet, self).__init__()
        
        # Image processing path
        # Keeps every child module of resnet18 except the last two
        # (presumably the pooling and classification head — TODO confirm).
        self.resnet_blocks = nn.Sequential(*list(resnet18(pretrained=True).children())[:-2])
        self.flatten = nn.Flatten()
        
        # Language processing path
        self.embedding = nn.Embedding(num_tokens, embed_dim)
        transformer_layer = TransformerEncoderLayer(d_model=embed_dim, nhead=num_heads)
        self.language_transformer = TransformerEncoder(transformer_layer, num_layers=1)
        
        # Multimodal Transformer
        # Built from the same layer template as the language transformer, so it
        # also uses d_model=embed_dim.
        self.multimodal_transformer = TransformerEncoder(transformer_layer, num_layers=8)
        
        # LSTM for integrating sequence information
        self.lstm = nn.LSTM(input_size=embed_dim, hidden_size=hidden_dim, num_layers=2)
        
        # Output heads
        self.action_type_head = nn.Linear(hidden_dim, num_actions)
        self.cursor_head = nn.Linear(hidden_dim, num_cursor*2)  # x and y coordinates
        self.key_index_head = nn.Linear(hidden_dim, num_keys)
        
        # Attention for task field index
        self.task_field_attention = nn.MultiheadAttention(embed_dim=embed_dim, num_heads=num_heads)
        self.query_embed = nn.Parameter(torch.rand(embed_dim))
        
    def forward(self, images, token_indices, previous_actions):
        """Run the network and return (action_type, cursor_coords, key_index, task_field_logits)."""
        # Process images through ResNet and flatten
        # NOTE(review): nn.Flatten() with default arguments presumably yields a
        # 2-D (batch, features) tensor here — confirm.
        image_features = self.flatten(self.resnet_blocks(images))
        
        # Process language tokens through embedding and transformer
        token_embeddings = self.embedding(token_indices)
        language_features = self.language_transformer(token_embeddings)
        
        # Concatenate processed image and language features
        # NOTE(review): image_features looks 2-D while language_features looks
        # 3-D, so this concatenation appears to fail as written — confirm.
        multimodal_features = torch.cat((image_features, language_features), dim=1)
        
        # Process multimodal features through transformer
        multimodal_features = self.multimodal_transformer(multimodal_features)
        
        # Prepare for LSTM
        # Previous actions are looked up in the same embedding table as the tokens.
        prev_action_embedding = self.embedding(previous_actions)
        # NOTE(review): concatenating along dim=2 widens the feature axis beyond
        # embed_dim, but self.lstm was built with input_size=embed_dim — confirm.
        lstm_input = torch.cat((multimodal_features, prev_action_embedding.unsqueeze(0)), dim=2)
        
        # LSTM output
        _, (hidden, _) = self.lstm(lstm_input)
        
        # Action type prediction
        # hidden[-1] selects the last entry along the first axis of the LSTM's
        # hidden state (presumably the top layer — TODO confirm).
        action_type = self.action_type_head(hidden[-1])
        
        # Cursor coordinates prediction
        cursor_coords = self.cursor_head(hidden[-1]).view(-1, 2)  # Reshape to (x, y)
        
        # Keyboard key index prediction
        key_index = self.key_index_head(hidden[-1])
        
        # Task field index prediction using attention
        # NOTE(review): task_field_attention is not batch-first, while query and
        # token_embeddings look batch-leading here — confirm the layout. The
        # value kept below is the first element returned by the attention call.
        query = self.query_embed.unsqueeze(0).expand(token_indices.size(0), -1, -1)
        task_field_logits, _ = self.task_field_attention(query, token_embeddings, token_embeddings)
        
        return action_type, cursor_coords, key_index, task_field_logits

# Example usage
num_tokens = 1370  # Your vocab size + additional OOV tokens
num_actions = 10   # Number of possible actions (from your architecture)
num_cursor = 51    # Assuming this is the number of bins for cursor coordinates
num_keys = 512     # Assuming this is the number of possible keys (this might vary)
num_fields = 512   # For task fields (passed to CCNet, which currently never reads it)
embed_dim = 64     # Embedding dimension from your architecture
num_heads = 4      # Number of heads in multi-head attention mechanisms
hidden_dim = 512   # Hidden dimension size for LSTMs

# Arguments are positional, in the order of CCNet.__init__'s signature.
model = CCNet(num_tokens, num_actions, num_cursor, num_keys, num_fields, embed_dim, num_heads, hidden_dim)

# Create dummy data for testing
dummy_images = torch.randn
