In [12]:
from transformers import AutoTokenizer, AutoModelForCausalLM, CLIPImageProcessor, CLIPModel, CLIPVisionModel, AutoConfig
import numpy as np
from PIL import Image
import requests
import torch
import torch.nn as nn
from torchvision.transforms import PILToTensor


In [None]:
# Load LMSYS Vicuna v1.5 7B as a causal language model.
# Per the original note: takes up roughly 20GB of RAM if you don't have a CUDA enabled GPU available.
# `model_name` is reused by the tokenizer cell, so keep the two in sync.
model_name = "lmsys/vicuna-7b-v1.5"

# The progress bar printed below this cell reports the checkpoint being loaded in 2 shards.
model = AutoModelForCausalLM.from_pretrained(model_name)


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]



In [13]:
# Tokenizer for the same checkpoint as the model (`model_name` comes from the model-loading cell,
# so that cell must have been run first).
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [6]:
# sanity check: have vicuna complete a trivial arithmetic prompt
prompt = "2+2="

# tokenize to PyTorch tensors so the result can be splatted straight into generate()
inputs = tokenizer(prompt, return_tensors='pt')

# cap the continuation at 50 newly generated tokens
response = model.generate(max_new_tokens=50, **inputs)

# only one prompt was submitted, so take the first (and only) generated sequence
first_sequence = response[0]
outputs = tokenizer.decode(first_sequence, skip_special_tokens=True)
print(outputs)

2+2=4


In [14]:
# grab some image from COCO
url = "http://images.cocodataset.org/val2017/000000039769.jpg"

# use a timeout so a stalled download cannot hang the kernel, and fail loudly on an
# HTTP error instead of handing an error page to PIL
image_response = requests.get(url, stream=True, timeout=30)
image_response.raise_for_status()
image = Image.open(image_response.raw)

# convert from PIL.Image to Tensor (might be useful later)
im_tensor = PILToTensor()(image)

# grab CLIP's image processor
CLIPURL = "openai/clip-vit-base-patch32"
clipimage = CLIPImageProcessor.from_pretrained(CLIPURL)

# process image through CLIP's preprocessor exactly once
# (named `clip_inputs` rather than `input` so the builtin input() is not shadowed)
clip_inputs = clipimage(image)

# grab pixel values of the single processed image, convert to tensor
encoded_image = clip_inputs['pixel_values'][0]
encoded_tensor = torch.from_numpy(encoded_image)

# grab CLIPVisionModel, after processing images they are fed through CLIPVisionModel
clipmodel = CLIPVisionModel.from_pretrained(CLIPURL)

In [15]:

# build the text prompt and tokenize it; the tokenizer options mirror LLaVA's github
pr = "please just work"

# collect the tokenizer options in one place so the call itself stays short
tokenizer_options = dict(
    return_tensors='pt',
    padding="longest",
    max_length=tokenizer.model_max_length,
    truncation=True,
)
tokenized_pr = tokenizer(pr, **tokenizer_options)

# pull the token ids and the attention mask out of the tokenizer output
tokenized_pr_input_ids, tokenized_pr_attention_mask = (
    tokenized_pr['input_ids'],
    tokenized_pr['attention_mask'],
)

In [16]:
# feature extraction: run the preprocessed image through CLIPVisionModel.
# unsqueeze(0) adds a leading dimension so the single image is passed as a batch of one
pixel_batch = encoded_tensor.unsqueeze(0)
encoded_im = clipmodel(pixel_values=pixel_batch)

In [11]:
# instantiate the linear projection layer for connecting CLIP outputs to Vicuna.
# CLIP outputs a (batch_size, no. of 'tokens', CLIP hidden size) sized output which has to be mapped
# to vicuna's embedding dimension. Both widths are read from the model configs (768 and 4096 for the
# checkpoints used in this notebook) so the layer stays correct if either checkpoint is swapped.
linear_proj = nn.Linear(clipmodel.config.hidden_size, model.config.hidden_size)


# grab the input embedding layer from vicuna
vicuna_embed = model.get_input_embeddings()

# a later cell replaces vicuna's input embedding with nn.Identity; if this cell is re-run after
# that swap we would silently capture the Identity layer instead of the real embedding
if isinstance(vicuna_embed, nn.Identity):
    raise RuntimeError(
        "model's input embedding has already been replaced by nn.Identity; "
        "reload the model before re-running this cell"
    )

# to connect CLIP to vicuna we need to:
#   1) pull this embedding out from the model (vicuna_embed)
#       a. replace the embedding architecture in vicuna with an nn.Identity layer
#   2) embed the prompt (embedded_pr)
#   3) linearly transform the image features (projected_im)
#   4) concatenate projected_im and embedded_pr along no. of tokens dimension (im_and_pr_input_ids and im_and_pr_attention_mask)
#       a. i.e. adding the image 'tokens' to the context window
#   5) use these for prediction using vicuna

In [12]:
# linear projection of encoded image into vicuna's embedding space
projected_im = linear_proj(encoded_im['last_hidden_state'])

# embed prompt using vicuna's default embedding
embedded_pr = vicuna_embed(tokenized_pr_input_ids)

# concatenate projected image 'tokens' and embedded prompt along the token dimension
# (despite its name, im_and_pr_input_ids holds embeddings, not integer token ids)
im_and_pr_input_ids = torch.cat((projected_im, embedded_pr), dim=1)

# every image 'token' should be attended to, so its mask is all ones.
# size it from projected_im's own (batch, tokens) dimensions instead of hard-coding a batch of 1,
# and match the tokenizer mask's dtype/device so the concatenation does not silently promote
# the integer mask to float
image_attention_mask = torch.ones(
    projected_im.shape[:2],
    dtype=tokenized_pr_attention_mask.dtype,
    device=tokenized_pr_attention_mask.device,
)
im_and_pr_attention_mask = torch.cat((image_attention_mask, tokenized_pr_attention_mask), dim=1)

# replace input embedding with Identity layer, **THE MODEL CAN NO LONGER BE USED WITH .GENERATE NOW**
# (vicuna_embed still references the original embedding layer, so it can be restored with
# model.set_input_embeddings(vicuna_embed))
model.set_input_embeddings(nn.Identity())

In [13]:
# Display vicuna's module tree. The repr below shows a LlamaForCausalLM, i.e. vicuna uses the
# llama architecture (i think it is basically an open-source, instruction-tuned version of llama).
# Note that `embed_tokens` is listed as Identity() because the input embedding was swapped out
# in the previous cell.
model

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Identity()
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaSdpaAttention(
          (q_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (v_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (o_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=4096, out_features=11008, bias=False)
          (up_proj): Linear(in_features=4096, out_features=11008, bias=False)
          (down_proj): Linear(in_features=11008, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((4096,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((4096,), eps=1e-05)
      )
    )
    (norm): LlamaRM

In [14]:
# do forward pass through model for token prediction.
# im_and_pr_input_ids already contains embeddings (projected image features + embedded prompt),
# so hand it over through `inputs_embeds`, the argument intended for pre-computed embeddings,
# rather than through `input_ids`, which only worked because the embedding layer had been
# replaced by nn.Identity. This call works whether or not that replacement has been made.
preds = model(inputs_embeds=im_and_pr_input_ids, attention_mask=im_and_pr_attention_mask)


We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.43. Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/v4.41.3/en/internal/generation_utils#transformers.Cache)


In [15]:
# Inspect the raw forward-pass output. As the repr below shows, it is a CausalLMOutputWithPast
# object rather than a plain tuple, with the fields `loss` (None here), `logits` and `past_key_values`.
# NOTE(review): loss is presumably None because no labels were passed to the model — confirm.
# NOTE(review): this prints every tensor in the output; something like `preds.logits.shape`
# would be a much more compact check.
preds

CausalLMOutputWithPast(loss=None, logits=tensor([[[-4.9704,  4.3489,  6.7879,  ..., -0.1526, -1.6258, -0.3482],
         [-5.0072,  4.2940,  6.5516,  ..., -0.0140, -1.3542, -0.3899],
         [-4.9991,  4.2175,  6.5360,  ...,  0.1383, -1.2136, -0.3374],
         ...,
         [-8.1040, -0.8108,  9.5464,  ..., -3.5729, -4.3248, -4.6205],
         [-8.6032, -2.8227,  7.7578,  ..., -4.1981, -3.8631, -5.2058],
         [-8.7752, -2.2112,  7.9513,  ..., -3.9506, -4.7060, -5.5724]]],
       grad_fn=<UnsafeViewBackward0>), past_key_values=((tensor([[[[ 0.1341, -0.1808,  0.0871,  ...,  0.1256, -0.1313,  0.0391],
          [-0.1920,  0.1285, -0.0380,  ..., -0.0385,  0.1217,  0.0073],
          [-0.1298,  0.0690,  0.0740,  ...,  0.0141,  0.0592,  0.0473],
          ...,
          [-0.3933,  0.2163,  0.0130,  ...,  0.2871, -0.0069,  0.4859],
          [ 0.1151, -0.2967, -0.0633,  ...,  0.2012, -0.0244,  0.1896],
          [-0.0119, -0.1757, -0.6171,  ...,  0.0641,  0.1820,  0.1058]],

         [[

In [6]:
from transformers import AutoModelForCausalLM, AutoConfig
import torch.nn as nn
import torch


In [1]:
from vicuna_llava import vicuna_llava

In [4]:
# name of the checkpoint whose configuration is reused for the custom model
model_name = "lmsys/vicuna-7b-v1.5"

# load only the configuration object for that checkpoint
config = AutoConfig.from_pretrained(model_name)

# build the project-local vicuna_llava model from the config
# NOTE(review): presumably this constructs the architecture without loading vicuna's
# pretrained weights — confirm against vicuna_llava's implementation
testmodel = vicuna_llava(config)

In [5]:
# Call the custom model with the CLIP vision output and the tokenized prompt.
# NOTE: `encoded_im` and `tokenized_pr` are created by earlier cells of this notebook (the CLIP
# forward-pass cell and the prompt tokenization cell). The NameError recorded below shows this
# cell was executed in a kernel where `encoded_im` had not been defined; run those cells first.
testmodel(encoded_im, tokenized_pr)

NameError: name 'encoded_im' is not defined