In [1]:
from transformers import AutoConfig
import numpy as np
import torch
from torch.utils.data import DataLoader, Subset
from vicuna_llava import vicuna_llava, dataset_llava
from accelerate import Accelerator
# Single Accelerator instance shared by the whole notebook: later cells use it to
# prepare the model, dataloader, optimizer and LR scheduler, and to run backward().
accelerator = Accelerator()

Log in to Hugging Face (Llama checkpoints are gated, so you must first request access to the model on the Hugging Face Hub).

In [2]:

# Authenticate with the Hugging Face Hub. In a notebook, login() renders an
# interactive widget that asks for an access token.
from huggingface_hub import login
login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

### Instantiate vicunallava

In [3]:
# Hub identifier of the language model passed to the LLaVA wrapper
# (earlier choice, kept for reference: "lmsys/vicuna-7b-v1.5").
model_name = "meta-llama/Llama-3.2-1B"
config = AutoConfig.from_pretrained(model_name)

# Prefer the GPU when one is visible; otherwise stay on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Build the multimodal model and let accelerate prepare it.
vicunallava = accelerator.prepare(
    vicuna_llava(config, llmURL=model_name, accelerator=accelerator)
)


Optionally load a pretrained projector (currently disabled: every line of the cell below is commented out)

In [4]:
#linear_llava_proj = torch.load('mm_projector.bin', weights_only=True)
#linear_llava_weights = linear_llava_proj['model.mm_projector.weight']
#linear_llava_biases = linear_llava_proj['model.mm_projector.bias']

#with torch.no_grad():
#    vicunallava.im_embedding.weight.copy_(linear_llava_weights)
#    vicunallava.im_embedding.bias.copy_(linear_llava_biases)

### Instantiate Dataset

In [5]:
# Paths to the chat JSON and the image directory. The dataset is stored locally
# to work around some images that are missing from the original source.
chat = 'CC3M/chat.json'
im_dir = "CC3M/images/"

# Train on the first `num_samples` examples only.
num_samples = 20000
subset_idcs = list(range(num_samples))
batch_size = 10

# Datasets are not passed through accelerator.prepare(): it only wraps DataLoaders,
# models, optimizers and schedulers, and returns any other object unchanged.
cc3m_dataset = Subset(dataset_llava(chat, im_dir), subset_idcs)
cc3m_dataloader = accelerator.prepare(DataLoader(cc3m_dataset, batch_size=batch_size))

Test generation on a sample from the dataset

In [6]:
#testprompt, testimage, _ = cc3m_dataset[0]
#testimage = testimage.to(device)
#transform = T.ToPILImage()

#output = vicunallava.generate(testimage, testprompt, max_new_tokens=15)
#print(f'input prompt: {testprompt}')
#transform(testimage).show()

#print(f'response: {output}')

### Training (Stage 1)

Select which parameters are optimized

In [7]:
# Stage 1 trains only the im_embedding layer: switch gradients off for every
# parameter of the wrapper and of its inner model, then switch them back on
# for im_embedding alone.
for module in (vicunallava, vicunallava.model):
    for param in module.parameters():
        param.requires_grad = False

for param in vicunallava.im_embedding.parameters():
    param.requires_grad = True

In [8]:
from torch.optim import Adam
from transformers import get_scheduler
from tqdm.auto import tqdm



# Stage-1 training: optimise only the im_embedding layer with Adam, a cosine
# learning-rate schedule with 3% warm-up, and manual gradient accumulation.

# Hand the optimizer exactly the parameters that are trained in this stage.
optimizer = accelerator.prepare(Adam(vicunallava.im_embedding.parameters(), lr=2e-3))

num_epochs = 1
gradient_accumulation_steps = 8
num_dataset_samples = len(cc3m_dataset)

# Integer step counts; a trailing partial batch or partial accumulation window
# still counts as one step.
batches_per_epoch = int(np.ceil(num_dataset_samples / batch_size))
optim_steps_per_epoch = int(np.ceil(batches_per_epoch / gradient_accumulation_steps))
num_batch_steps = num_epochs * batches_per_epoch
num_optim_steps = num_epochs * optim_steps_per_epoch
num_warmup_steps = int(np.ceil(0.03 * num_optim_steps))

# The scheduler is stepped once per optimizer update, so its horizon must be the
# number of optimizer updates rather than the number of batches.
lr_scheduler = accelerator.prepare(get_scheduler("cosine",
                                                 optimizer=optimizer,
                                                 num_warmup_steps=num_warmup_steps,
                                                 num_training_steps=num_optim_steps))

progress_bar = tqdm(range(num_batch_steps))
# Per-batch loss values (already divided by gradient_accumulation_steps) as plain floats.
losses = []

vicunallava.train()
for epoch in range(num_epochs):
    batchiter = 0
    # Clear gradients that an interrupted earlier run of this cell may have left behind.
    optimizer.zero_grad()

    for batchprompt, batchimage, batchresp in cc3m_dataloader:
        # Size everything from the batch itself so a short final batch cannot raise IndexError.
        current_batch_size = len(batchprompt)
        texts = [prompt + '###' + resp for prompt, resp in zip(batchprompt, batchresp)]
        texts = vicunallava.process_prompt(texts)
        outs = vicunallava(batchimage, texts, batch_size=current_batch_size)

        # Scale each micro-batch so gradients summed over the accumulation window are averaged.
        loss = outs['loss'] / gradient_accumulation_steps
        accelerator.backward(loss)

        # Store a Python float; storing the tensor would keep device memory and
        # autograd state referenced for the whole run.
        loss_value = loss.item()
        losses.append(loss_value)

        batchiter += 1
        progress_bar.update(1)
        print(f"batchiter:{batchiter}, loss:{loss_value}")

        if batchiter % gradient_accumulation_steps == 0:
            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad()

        # Periodic checkpoint of the trained layer.
        # NOTE(review): this pickles the whole module; saving im_embedding.state_dict()
        # would be more portable, but would change the file format for existing loaders.
        if batchiter % 10 == 0:
            torch.save(vicunallava.im_embedding, 'vicunallava_im_embedding_stage1.pt')

    # Apply gradients from a trailing, incomplete accumulation window instead of dropping them.
    if batchiter % gradient_accumulation_steps != 0:
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()

# Ensure the final weights are on disk even if the last batch was not a checkpoint step.
torch.save(vicunallava.im_embedding, 'vicunallava_im_embedding_stage1.pt')



  0%|          | 0/2000 [00:00<?, ?it/s]

  with autocast():


batchiter:1, loss:1.4684319496154785
batchiter:2, loss:1.4685208797454834
batchiter:3, loss:1.4692022800445557
batchiter:4, loss:1.4685019254684448
batchiter:5, loss:1.4695931673049927
batchiter:6, loss:1.4693340063095093
batchiter:7, loss:1.4690731763839722


KeyboardInterrupt: 

In [None]:
# Save the model under the name "ece598-llava-recreated" via its save_pretrained method
# (presumably a local directory, following the Hugging Face convention — confirm against vicuna_llava).
vicunallava.save_pretrained("ece598-llava-recreated")

In [None]:
# Upload the model to the Hub repository "cjschul/ece598-llava-recreated".
# Relies on the Hugging Face login performed earlier in the notebook.
vicunallava.push_to_hub("cjschul/ece598-llava-recreated")

model-00002-of-00002.safetensors:   0%|          | 0.00/2.22G [00:00<?, ?B/s]

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]