In [60]:
from transformers import AutoTokenizer, AutoModelForCausalLM, CLIPImageProcessor, CLIPModel, CLIPVisionModel, AutoConfig
import numpy as np
from PIL import Image
import requests
import torch
import torch.nn as nn
from torchvision.transforms import PILToTensor


In [61]:
# grab LMSYS Vicuna v1.5 7B, takes up roughly 20GB of RAM if you don't have a CUDA enabled GPU available
model_name = "lmsys/vicuna-7b-v1.5"

model = AutoModelForCausalLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]



In [62]:
# test generate code from vicuna using prompt
prompt = "2+2="
inputs = tokenizer(prompt, return_tensors='pt')
response = model.generate(**inputs, max_new_tokens=50)
outputs = tokenizer.decode(response[0], skip_special_tokens=True)
print(outputs)

2+2=4


In [63]:
# grab some image from COCO
url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw)

# convert from PIL.Image to Tensor (might be useful later)
im_tensor = PILToTensor()(image)

# grab CLIP's image processor

CLIPURL = "openai/clip-vit-base-patch32"
clipimage = CLIPImageProcessor.from_pretrained(CLIPURL)

# process image through CLIP
input = clipimage(image)

# grab pixel values from encoded image, convert to tensor
encoded_image = clipimage(image)['pixel_values'][0]
encoded_tensor = torch.from_numpy(encoded_image)

# grab CLIPVisionModel, after processing images they are fed through CLIPVisionModel
clipmodel = CLIPVisionModel.from_pretrained(CLIPURL)

In [64]:

# make prompt and tokenize, most of these settings are copied from LLaVA's github
pr = "please just work"
tokenized_pr = tokenizer(pr, 
                   return_tensors='pt', 
                   padding="longest", 
                   max_length=tokenizer.model_max_length, 
                   truncation=True)

# grab the input_ids and attention_mask from tokenized prompt
tokenized_pr_input_ids = tokenized_pr['input_ids']
tokenized_pr_attention_mask = tokenized_pr['attention_mask']

In [65]:
# like mentioned above, send processed image through CLIPVisionModel for feature extraction
encoded_im = clipmodel(encoded_tensor.unsqueeze(0))

In [66]:
# instantiate the linear projection layer for conneting CLIP outputs to Vicuna,
# CLIP outputs a (batch_size, no. of 'tokens', 768) sized output and vicuna's embedding dimension is 4096
linear_proj = nn.Linear(768, 4096)


# grab the input embedding layer from vicuna
vicuna_embed = model.get_input_embeddings()

# to connect CLIP to vicuna we need to:
#   1) pull this embedding out from the model (vicuna_embed)
#       a. replace the embedding architecture in vicuna with an nn.Identity layer
#   2) embed the prompt (embedded_pr)
#   3) linearly transform of image features (projected_im)
#   4) concatenate projected_im and embedded_pr along no. of tokens dimension (im_and_pr_input_ids and im_and_pr_attention_mask)
#       a. i.e. adding the image 'tokens' to the context window
#   5) use these for prediction using vicuna

In [67]:
# linear projection of encoded image
projected_im = linear_proj(encoded_im['last_hidden_state'])

# embed prompt using vicuna's default embedding
embedded_pr = vicuna_embed(tokenized_pr_input_ids)

# concatenate projected input and embedded prompt
im_and_pr_input_ids = torch.cat((projected_im, embedded_pr), dim=1)
im_and_pr_attention_mask = torch.cat((torch.ones(1,projected_im.size()[1]), tokenized_pr_attention_mask),dim=1)

# replace input embedding with Identity layer, **THE MODEL CAN NO LONGER BE USED WITH .GENERATE NOW**
model.set_input_embeddings(nn.Identity())

In [68]:
# see layers of vicuna (i think this model is basically just an open-source, instruction-tuned version of llama)
model

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Identity()
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaSdpaAttention(
          (q_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (v_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (o_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=4096, out_features=11008, bias=False)
          (up_proj): Linear(in_features=4096, out_features=11008, bias=False)
          (down_proj): Linear(in_features=11008, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((4096,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((4096,), eps=1e-05)
      )
    )
    (norm): LlamaRM

In [69]:
# do forward pass through model for token prediction
preds = model(input_ids=im_and_pr_input_ids)




In [70]:
# looks like the output is as tuple of its loss in [0] and logits in [1]?
preds

CausalLMOutputWithPast(loss=None, logits=tensor([[[-5.3354,  4.5074,  6.1575,  ..., -0.4120, -1.7000, -0.7496],
         [-5.1094,  4.5691,  6.2315,  ..., -0.1829, -1.3269, -0.6546],
         [-4.9676,  4.6907,  6.2402,  ..., -0.0197, -1.1277, -0.4788],
         ...,
         [-6.2304,  4.8186, 15.7422,  ..., -2.5759, -5.2334, -3.6471],
         [-6.8357,  3.6749, 12.0847,  ..., -1.2094, -3.9155, -3.7350],
         [-5.1064,  5.0415, 13.6683,  ..., -1.4907, -3.3639, -2.3184]]],
       grad_fn=<UnsafeViewBackward0>), past_key_values=((tensor([[[[ 1.3170e-01, -1.2153e-01,  6.5412e-02,  ..., -1.8024e-01,
            7.7437e-02, -1.7841e-01],
          [ 1.1753e-01, -3.6808e-02,  3.5797e-02,  ...,  2.5615e-01,
           -2.0749e-01,  1.5803e-01],
          [ 4.4114e-02, -2.2856e-03,  9.8386e-02,  ...,  3.5180e-01,
           -3.3276e-01,  2.1212e-01],
          ...,
          [-3.9328e-01,  2.1629e-01,  1.3041e-02,  ...,  2.8709e-01,
           -6.9314e-03,  4.8594e-01],
          [ 1.150