In [1]:
# coding=utf-8
# Copyright 2023-present the HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import torch
from datasets import load_dataset
from evaluate import load
from torch.utils.data import DataLoader, Dataset
from transformers import AutoModelForVision2Seq, AutoProcessor
from PIL import Image

from peft import LoraConfig, get_peft_model
from tqdm import tqdm
import csv


Welcome to bitsandbytes. For bug reports, please run

python -m bitsandbytes

 and submit this information together with your error trace to: https://github.com/TimDettmers/bitsandbytes/issues
bin /opt/conda/envs/torch2.0/lib/python3.9/site-packages/bitsandbytes/libbitsandbytes_cuda117.so
CUDA SETUP: CUDA runtime path found: /opt/conda/envs/torch2.0/lib/libcudart.so.11.0
CUDA SETUP: Highest compute capability among GPUs detected: 8.6
CUDA SETUP: Detected CUDA version 117
CUDA SETUP: Loading binary /opt/conda/envs/torch2.0/lib/python3.9/site-packages/bitsandbytes/libbitsandbytes_cuda117.so...


In [2]:
model_name_or_path = 'Salesforce/blip2-flan-t5-xl'
cache_dir = "./" + model_name_or_path.split('/')[-1]

dtype = torch.float16

# We load our model and processor using `transformers`
processor = AutoProcessor.from_pretrained(model_name_or_path,cache_dir=cache_dir)
model = AutoModelForVision2Seq.from_pretrained(model_name_or_path,cache_dir=cache_dir)

device = "cuda" if torch.cuda.is_available() else "cpu"

# for param in model.language_model.parameters():
#     param.requires_grad=False

for param in model.vision_model.parameters():
    param.requires_grad=False


model.to(device)


# Get our peft model and print the number of trainable parameters
# model = get_peft_model(model, config)
# model.print_trainable_parameters()

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Blip2ForConditionalGeneration(
  (vision_model): Blip2VisionModel(
    (embeddings): Blip2VisionEmbeddings(
      (patch_embedding): Conv2d(3, 1408, kernel_size=(14, 14), stride=(14, 14))
    )
    (encoder): Blip2Encoder(
      (layers): ModuleList(
        (0-38): 39 x Blip2EncoderLayer(
          (self_attn): Blip2Attention(
            (dropout): Dropout(p=0.0, inplace=False)
            (qkv): Linear(in_features=1408, out_features=4224, bias=True)
            (projection): Linear(in_features=1408, out_features=1408, bias=True)
          )
          (layer_norm1): LayerNorm((1408,), eps=1e-06, elementwise_affine=True)
          (mlp): Blip2MLP(
            (activation_fn): GELUActivation()
            (fc1): Linear(in_features=1408, out_features=6144, bias=True)
            (fc2): Linear(in_features=6144, out_features=1408, bias=True)
          )
          (layer_norm2): LayerNorm((1408,), eps=1e-06, elementwise_affine=True)
        )
      )
    )
    (post_layernorm): LayerNorm((

In [3]:
train_dataset = load_dataset("csv", data_files={"train" : "./train_6400_same_true_aug_v2.csv"}, split="train")
test_dataset = load_dataset("csv", data_files={"test" : "./test_367_same_true_v2.csv"}, split="test")

print(train_dataset)
print(test_dataset)

Using custom data configuration default-8def742d50488994
Found cached dataset csv (/root/.cache/huggingface/datasets/csv/default-8def742d50488994/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317)
Using custom data configuration default-be7081244b66250f
Found cached dataset csv (/root/.cache/huggingface/datasets/csv/default-be7081244b66250f/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317)


Dataset({
    features: ['q_id', 'image_id', 'ambiguous_question', 'ambiguous_entity', 'intermediate_question', 'intermediate_answer', 'entity_id', 'labels'],
    num_rows: 5369
})
Dataset({
    features: ['q_id', 'image_id', 'ambiguous_question', 'ambiguous_entity', 'intermediate_question', 'intermediate_answer', 'entity_id', 'labels'],
    num_rows: 367
})


In [4]:
class ImageTextClassificationDataset(Dataset):
    def __init__(self, dataset, processor):
        self.dataset = dataset
        self.processor = processor

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        item = self.dataset[idx]
        
        image = Image.open("./images/"+str(item['image_id'])+".jpg")
        encoding = self.processor(images=image, padding="max_length", return_tensors="pt")
        # remove batch dimension
        encoding = {k: v.squeeze() for k, v in encoding.items()}
        encoding["text"] = "Ambiguous question: " + item["ambiguous_question"] +" Ambigous entity: " + item["ambiguous_entity"] + " Intermediate question: " + item["intermediate_question"] # + " Intermediate answer: " + item["intermediate_answer"]
        encoding["text"] = encoding['text'] + " Is the intermediate question effective to clarify the ambiguous entity in the ambiguous question? Classify yes or no. Short answer: "
        if 'effectiveness' in item.keys():
            encoding['label'] = "yes" if item['effectiveness'] == "O" else 'no' # torch.tensor(1) if item['effectiveness'] == "O" else torch.tensor(0)
        elif 'labels' in item.keys():
            encoding['label'] = "Yes" if item['labels'] == "O" else 'No' # item['labels']
        else:
            encoding['label'] = encoding['text']
            
            
        if "t5" in self.processor.tokenizer.name_or_path:
            encoding['decoder_input_ids'] = torch.tensor([self.processor.tokenizer.pad_token_id])
        
        return encoding


def collator(batch):
    # pad the input_ids and attention_mask
    processed_batch = {}
    for key in batch[0].keys():
        if key == "text":
            text_inputs = processor.tokenizer(
                [example["text"] for example in batch], padding=True, return_tensors="pt"
            )
            processed_batch["input_ids"] = text_inputs["input_ids"]
            processed_batch["attention_mask"] = text_inputs["attention_mask"]
            
        elif key == "label":
            labels = processor.tokenizer([example['label'] for example in batch], padding=True, add_special_tokens=True, return_tensors='pt', max_length=128)
            processed_batch['label'] = labels['input_ids']
        else:
            processed_batch[key] = torch.stack([example[key] for example in batch])
     
    return processed_batch

In [5]:

train_dataset = ImageTextClassificationDataset(train_dataset, processor)
train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=2, collate_fn=collator)

test_dataset = ImageTextClassificationDataset(test_dataset, processor)
test_dataloader = DataLoader(test_dataset, shuffle=False, batch_size=4, collate_fn=collator)

optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)



In [6]:
import numpy as np
epoch_loss_list = []

# with open("./ambiguous_questions_test.csv", 'r') as f:
#     reader = csv.reader(f)
#     lines = [line for line in reader]

def compute_acc(predictions, references):
    
    total_len = len(predictions)
    same_count = 0
    for prediction, reference in zip(predictions, references):
        if prediction == reference:
            same_count += 1
    
    return same_count / total_len


model.train()

for epoch in range(10):
    print("Epoch:", epoch)
    epoch_loss = []
    for idx, batch in enumerate(tqdm(train_dataloader)):
        
        input_ids = batch.pop("input_ids").to(device)
        pixel_values = batch.pop("pixel_values").to(device, dtype=dtype)
        labels = batch.pop("label").to(device)
    
        if "t5" in model_name_or_path:
            decoder_input_ids = batch.pop("decoder_input_ids").to(device)
            outputs = model(pixel_values=pixel_values, input_ids=input_ids, labels=labels)
        
        else:
            outputs = model(pixel_values=pixel_values, input_ids=input_ids, labels=labels)
        
        #print(labels)
        #print(outputs)
        
        # loss = criterion(outputs, labels)
        loss = outputs.loss
        #print(loss.item())
        #loss = torch.mean(outputs)
        
        epoch_loss.append(loss.item())

        loss.backward()

        optimizer.step()
        optimizer.zero_grad()

        #if idx % 10 == 0:
        #    generated_output = model.generate(pixel_values=pixel_values, input_ids=input_ids)
        #    print(processor.batch_decode(generated_output, skip_special_tokens=True))
    
    print(np.mean(epoch_loss))
    
    
    model.eval()
    with torch.no_grad():
        epoch_outputs = []
        gold_references = []
        metric = load("accuracy")
        for idx, batch in enumerate(tqdm(test_dataloader)):
            input_ids = batch.pop("input_ids").to(device)
            pixel_values = batch.pop("pixel_values").to(device)
            labels = batch.pop("label").to(device)
            # if "t5" in model_name_or_path:
            #     decoder_input_ids = batch.pop("decoder_input_ids").to(device)
            #     logits = model(pixel_values, input_ids, decoder_input_ids)
            # else:
            outputs = model.generate(pixel_values=pixel_values, input_ids=input_ids)
            predictions = processor.batch_decode(outputs, skip_special_tokens=True)
            references = processor.batch_decode(labels, skip_special_tokens=True)
            # metric.add_batch(predictions=predictions, references=references)
            
            epoch_outputs += predictions #processor.batch_decode(generated_output, skip_special_tokens=True)
            gold_references += references
            
        #accuracy = metric.compute()
        print(epoch_outputs)
        print(gold_references)
        print(compute_acc(epoch_outputs , gold_references))
        
    # with open ("./test_{}.csv".format(epoch), 'w') as f:
        
    #     writer = csv.writer(f)
    #     for idx, line in enumerate(lines):
    #         if idx == 0:
    #             writer.writerow(line)
    #         else:
    #             line.append(epoch_outputs[idx-1])
    #             writer.writerow(line)
                
    model.train()            
                

Epoch: 0


  0%|          | 0/2685 [00:02<?, ?it/s]


OutOfMemoryError: CUDA out of memory. Tried to allocate 40.00 MiB (GPU 0; 47.53 GiB total capacity; 45.83 GiB already allocated; 13.44 MiB free; 46.19 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF