# Testing vision and language encoders

In this notebook, we test the vision and language encoders from the MobileVLM model. The vision encoder consists of a ViT-based CLIP vision tower, while the language encoder is based on the MobileLLaMA-1.4B decoder.

In [3]:
%load_ext autoreload
%autoreload 2
import torch
from encoders import MobileVLMVisionEncoder, MobileVLMLanguageEncoder

#### Testing vision encoder

In [4]:
# Build the vision encoder with its default constructor arguments.
# NOTE(review): presumably this loads pretrained weights — confirm in `encoders`.
vision_encoder = MobileVLMVisionEncoder()
# Bare expression as the last line: the notebook displays the module's repr.
vision_encoder

MobileLlamaForCausalLM(
  (model): MobileLlamaModel(
    (embed_tokens): Embedding(32000, 2048, padding_idx=0)
    (layers): ModuleList(
      (0-23): 24 x LlamaDecoderLayer(
        (self_attn): LlamaSdpaAttention(
          (q_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (k_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (v_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (o_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=2048, out_features=5632, bias=False)
          (up_proj): Linear(in_features=2048, out_features=5632, bias=False)
          (down_proj): Linear(in_features=5632, out_features=2048, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm()
        (post_attention_layernorm): LlamaRMSNorm()
      )
    )
    (norm): LlamaRM

MobileVLMVisionEncoder(
  (vision_tower): CLIPVisionTower(
    (vision_tower): CLIPVisionModel(
      (vision_model): CLIPVisionTransformer(
        (embeddings): CLIPVisionEmbeddings(
          (patch_embedding): Conv2d(3, 1024, kernel_size=(14, 14), stride=(14, 14), bias=False)
          (position_embedding): Embedding(577, 1024)
        )
        (pre_layrnorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (encoder): CLIPEncoder(
          (layers): ModuleList(
            (0-23): 24 x CLIPEncoderLayer(
              (self_attn): CLIPAttention(
                (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
                (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
                (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
                (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
              )
              (layer_norm1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
  

In [3]:
# Smoke-test the vision encoder on one random RGB image.
#
# The CLIP tower printed above uses 14x14 patches and 577 position embeddings
# (24 * 24 patches + 1 class token), so its native input resolution is 336x336.
# A 340x340 input only got through because the strided patch convolution
# silently drops the trailing 4 rows/columns; newer `transformers` versions
# validate the input size against the model's image size and reject such a
# mismatch, so feed the native resolution instead.
#
# A locally seeded generator keeps the random image (and therefore the
# displayed embedding) reproducible without touching the global RNG state.
rng = torch.Generator().manual_seed(0)
emb = vision_encoder(torch.rand((1, 3, 336, 336), generator=rng))
# Bare expression as the last line: display the resulting embedding tensor.
emb

torch.Size([1, 576, 1024])
torch.Size([1, 144, 2048])


tensor([[[ 0.0901, -0.2929, -0.0329,  ..., -0.3193,  0.0277, -0.1266],
         [ 0.0996, -0.0598,  0.0642,  ..., -0.2292,  0.1782, -0.1177],
         [ 0.1956,  0.0206,  0.0917,  ..., -0.1801,  0.2091, -0.1433],
         ...,
         [-0.1992, -0.1344,  0.1641,  ..., -0.2346, -0.0174, -0.1702],
         [-0.0494,  0.0761,  0.1609,  ..., -0.0809, -0.0032, -0.0728],
         [-0.0798, -0.3504,  0.0542,  ..., -0.3592,  0.0347, -0.2036]]],
       grad_fn=<TransposeBackward0>)

In [28]:
# Check the vision embedding shape with an assertion instead of only eyeballing it.
# The expected value is the shape recorded in this notebook's saved output:
# batch of 1, 144 positions, 2048 features (2048 is also the width of the
# language model's token embedding shown in its repr).
# NOTE(review): 144 presumably comes from the projector downsampling the
# 576 CLIP patch tokens — confirm in `encoders`.
assert emb.shape == (1, 144, 2048), f"unexpected vision embedding shape: {tuple(emb.shape)}"
# Still display the shape as the cell result.
emb.shape

[autoreload of encoders failed: Traceback (most recent call last):
  File "C:\Users\dev-sys\AppData\Roaming\Python\Python311\site-packages\IPython\extensions\autoreload.py", line 276, in check
    superreload(m, reload, self.old_objects)
  File "C:\Users\dev-sys\AppData\Roaming\Python\Python311\site-packages\IPython\extensions\autoreload.py", line 500, in superreload
    update_generic(old_obj, new_obj)
  File "C:\Users\dev-sys\AppData\Roaming\Python\Python311\site-packages\IPython\extensions\autoreload.py", line 397, in update_generic
    update(a, b)
  File "C:\Users\dev-sys\AppData\Roaming\Python\Python311\site-packages\IPython\extensions\autoreload.py", line 349, in update_class
    if update_generic(old_obj, new_obj):
       ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\dev-sys\AppData\Roaming\Python\Python311\site-packages\IPython\extensions\autoreload.py", line 397, in update_generic
    update(a, b)
  File "C:\Users\dev-sys\AppData\Roaming\Python\Python311\site-packages\IPy

torch.Size([1, 144, 2048])

#### Testing language encoder

In [4]:
# Build the language encoder with its default constructor arguments.
# NOTE(review): presumably this loads the pretrained MobileLLaMA weights — confirm in `encoders`.
lang_encoder = MobileVLMLanguageEncoder()
# Bare expression as the last line: the notebook displays the module's repr.
lang_encoder

  return self.fget.__get__(instance, owner)()


MobileVLMLanguageEncoder(
  (llama): MobileLlamaForCausalLM(
    (model): MobileLlamaModel(
      (embed_tokens): Embedding(32000, 2048, padding_idx=0)
      (layers): ModuleList(
        (0-23): 24 x LlamaDecoderLayer(
          (self_attn): LlamaSdpaAttention(
            (q_proj): Linear(in_features=2048, out_features=2048, bias=False)
            (k_proj): Linear(in_features=2048, out_features=2048, bias=False)
            (v_proj): Linear(in_features=2048, out_features=2048, bias=False)
            (o_proj): Linear(in_features=2048, out_features=2048, bias=False)
            (rotary_emb): LlamaRotaryEmbedding()
          )
          (mlp): LlamaMLP(
            (gate_proj): Linear(in_features=2048, out_features=5632, bias=False)
            (up_proj): Linear(in_features=2048, out_features=5632, bias=False)
            (down_proj): Linear(in_features=5632, out_features=2048, bias=False)
            (act_fn): SiLU()
          )
          (input_layernorm): LlamaRMSNorm()
          (

In [6]:
# Encode a short text prompt with the language encoder.
# Note: this rebinds `emb`, which previously held the vision embedding.
prompt = 'Hello there'
emb = lang_encoder(prompt)

In [7]:
# Display the embedding tensor returned by the language encoder for the prompt.
emb

tensor([[[ 0.4087, -0.1623,  0.1283,  ...,  0.3256,  0.1529,  0.0899],
         [ 2.2792,  3.0339, -2.9533,  ..., -5.0880,  6.3237,  3.3738],
         [ 3.2055,  3.2210, -4.0507,  ..., -5.3091,  7.8863,  4.1120]]],
       grad_fn=<MulBackward0>)

In [8]:
# Check the language embedding shape with an assertion instead of only eyeballing it.
# One prompt was encoded, so the batch dimension must be 1, and the feature
# width must match the 2048-wide token embedding shown in the model repr.
# The middle (sequence) dimension depends on how the tokenizer splits the
# prompt, so it is deliberately not pinned here (the saved run shows 3).
assert emb.ndim == 3, f"expected a 3-D embedding, got {emb.ndim}-D"
assert emb.shape[0] == 1 and emb.shape[-1] == 2048, (
    f"unexpected language embedding shape: {tuple(emb.shape)}"
)
# Still display the shape as the cell result.
emb.shape

torch.Size([1, 3, 2048])