In [1]:
# coding=utf-8
# Copyright 2023-present the HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import torch
from datasets import load_dataset
from evaluate import load
from torch.utils.data import DataLoader, Dataset
from transformers import AutoModelForVision2Seq, AutoProcessor
from transformers import InstructBlipProcessor, InstructBlipForConditionalGeneration
from PIL import Image

from peft import LoraConfig, get_peft_model
from tqdm import tqdm
import csv
import random
import os
import numpy as np


Welcome to bitsandbytes. For bug reports, please run

python -m bitsandbytes

 and submit this information together with your error trace to: https://github.com/TimDettmers/bitsandbytes/issues
bin /opt/conda/envs/torch2.0/lib/python3.9/site-packages/bitsandbytes/libbitsandbytes_cuda117.so
CUDA SETUP: CUDA runtime path found: /opt/conda/envs/torch2.0/lib/libcudart.so.11.0
CUDA SETUP: Highest compute capability among GPUs detected: 8.6
CUDA SETUP: Detected CUDA version 117
CUDA SETUP: Loading binary /opt/conda/envs/torch2.0/lib/python3.9/site-packages/bitsandbytes/libbitsandbytes_cuda117.so...


In [2]:

seed = 42
# Fixed RandomSeed
random.seed(seed)
os.environ['PYTHONHASHSEED'] = str(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)

torch.backends.cudnn.deterministic = True

torch.backends.cudnn.benchmark = True

In [3]:
model_name_or_path = 'Salesforce/instructblip-flan-t5-xl'
cache_dir = "./" + model_name_or_path.split('/')[-1]

dtype = torch.float16

# We load our model and processor using `transformers`
processor = InstructBlipProcessor.from_pretrained(model_name_or_path,cache_dir=cache_dir)
model = InstructBlipForConditionalGeneration.from_pretrained(model_name_or_path,cache_dir=cache_dir, torch_dtype=dtype)

# Get our peft model and print the number of trainable parameters
print([name for name,p in model.named_parameters()])

for param in model.vision_model.parameters():
    param.requires_grad=False


device = "cuda:1" if torch.cuda.is_available() else "cpu"

# model = Model(model)

model.to(device)
model.train()

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

['query_tokens', 'vision_model.embeddings.class_embedding', 'vision_model.embeddings.position_embedding', 'vision_model.embeddings.patch_embedding.weight', 'vision_model.embeddings.patch_embedding.bias', 'vision_model.encoder.layers.0.self_attn.qkv.weight', 'vision_model.encoder.layers.0.self_attn.qkv.bias', 'vision_model.encoder.layers.0.self_attn.projection.weight', 'vision_model.encoder.layers.0.self_attn.projection.bias', 'vision_model.encoder.layers.0.layer_norm1.weight', 'vision_model.encoder.layers.0.layer_norm1.bias', 'vision_model.encoder.layers.0.mlp.fc1.weight', 'vision_model.encoder.layers.0.mlp.fc1.bias', 'vision_model.encoder.layers.0.mlp.fc2.weight', 'vision_model.encoder.layers.0.mlp.fc2.bias', 'vision_model.encoder.layers.0.layer_norm2.weight', 'vision_model.encoder.layers.0.layer_norm2.bias', 'vision_model.encoder.layers.1.self_attn.qkv.weight', 'vision_model.encoder.layers.1.self_attn.qkv.bias', 'vision_model.encoder.layers.1.self_attn.projection.weight', 'vision_mod

InstructBlipForConditionalGeneration(
  (vision_model): InstructBlipVisionModel(
    (embeddings): InstructBlipVisionEmbeddings(
      (patch_embedding): Conv2d(3, 1408, kernel_size=(14, 14), stride=(14, 14))
    )
    (encoder): InstructBlipEncoder(
      (layers): ModuleList(
        (0-38): 39 x InstructBlipEncoderLayer(
          (self_attn): InstructBlipAttention(
            (dropout): Dropout(p=0.0, inplace=False)
            (qkv): Linear(in_features=1408, out_features=4224, bias=True)
            (projection): Linear(in_features=1408, out_features=1408, bias=True)
          )
          (layer_norm1): LayerNorm((1408,), eps=1e-06, elementwise_affine=True)
          (mlp): InstructBlipMLP(
            (activation_fn): GELUActivation()
            (fc1): Linear(in_features=1408, out_features=6144, bias=True)
            (fc2): Linear(in_features=6144, out_features=1408, bias=True)
          )
          (layer_norm2): LayerNorm((1408,), eps=1e-06, elementwise_affine=True)
        

In [4]:
train_dataset = load_dataset("csv", data_files={"train" : "./train_6400_same_true_aug_v2.csv"}, split="train")
test_dataset = load_dataset("csv", data_files={"test" : "./test_367_same_true_v2.csv"}, split="test")

print(train_dataset)
print(test_dataset)

Using custom data configuration default-86ee3f83d7c3c7b3


Downloading and preparing dataset csv/default to /root/.cache/huggingface/datasets/csv/default-86ee3f83d7c3c7b3/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

  return pd.read_csv(xopen(filepath_or_buffer, "rb", use_auth_token=use_auth_token), **kwargs)


Dataset csv downloaded and prepared to /root/.cache/huggingface/datasets/csv/default-86ee3f83d7c3c7b3/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317. Subsequent calls will reuse this data.


Using custom data configuration default-168447e99049af9a


Downloading and preparing dataset csv/default to /root/.cache/huggingface/datasets/csv/default-168447e99049af9a/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating test split: 0 examples [00:00, ? examples/s]

Dataset csv downloaded and prepared to /root/.cache/huggingface/datasets/csv/default-168447e99049af9a/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317. Subsequent calls will reuse this data.
Dataset({
    features: ['q_id', 'image_id', 'ambiguous_question', 'ambiguous_entity', 'intermediate_question', 'intermediate_answer', 'entity_id', 'labels'],
    num_rows: 3343
})
Dataset({
    features: ['q_id', 'image_id', 'ambiguous_question', 'ambiguous_entity', 'intermediate_question', 'intermediate_answer', 'entity_id', 'labels'],
    num_rows: 367
})


  return pd.read_csv(xopen(filepath_or_buffer, "rb", use_auth_token=use_auth_token), **kwargs)


In [5]:
class ImageTextClassificationDataset(Dataset):
    def __init__(self, dataset, processor):
        self.dataset = dataset
        self.processor = processor

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        item = self.dataset[idx]
        
        image = Image.open("./images/"+str(item['image_id'])+".jpg")
      
        # remove batch dimension
        encoding = {}
        encoding['image']=image
        encoding["text"] = "Given an ambiguous quesiton, an ambigous entity and an intermediate question, your task is to classify whether the intermediate question clarifies the ambigious entity in the ambiguous quesiton." # The ambiguous entity means that it appears multiple times in the image and cannot be distinctly identified. A good intermediate question is one that clarify a specific entity among same entities. Additionaly, A bad intermediate question is one that can't determine one entity among the entities through the intermediate quesiton.  If you think the given intermediate question is good, indicate it by answering \"Yes\". Otherwise, answer \"No\".There are only two types of answers possible: \"Yes\" and \"No\"."
        encoding["text"] = encoding["text"] + " Ambiguous question: " + item["ambiguous_question"] + "Ambiguous entity: " + item["ambiguous_entity"] + " Intermediate question: " + item["intermediate_question"] + " Short answer:"
        
        # encoding["text"] = "Ambiguous question: " + item["ambiguous_question"] +" Ambigous entity: " + item["ambiguous_entity"] + " Intermediate question: " + item["intermediate_question"] # + " Intermediate answer: " + item["intermediate_answer"]
        # encoding["text"] = encoding['text'] + " Is the intermediate question effective to clarify the ambiguous entity in the ambiguous question? Classify yes or no. Short answer: "
        
        
        if 'effectiveness' in item.keys():
            encoding['label'] = "yes" if item['effectiveness'] == "O" else 'no' # torch.tensor(1) if item['effectiveness'] == "O" else torch.tensor(0)
        elif 'labels' in item.keys():
            encoding['label'] =  "Yes" if item['labels'] == "O" else 'No' # item['labels']
        else:
            encoding['label'] = encoding['text']

        
        if "t5" in self.processor.tokenizer.name_or_path:
            encoding['decoder_input_ids'] = torch.tensor([self.processor.tokenizer.pad_token_id])
        
        inputs = processor(images=encoding['image'],text=encoding['text'],return_tensors="pt", max_length=128, padding='max_length' ,truncation=True)
        encoding.pop('image')
        encoding.pop('text')
        encoding.update(inputs)
        return encoding


def collator(batch):
    # pad the input_ids and attention_mask
    processed_batch = {}
    for key in batch[0].keys():       
        if key == "label":
            labels = processor.tokenizer([example['label'] for example in batch], padding='max_length', add_special_tokens=True, return_tensors='pt')
            processed_batch['labels'] = labels['input_ids']
        else:
            processed_batch[key] = torch.stack([example[key].squeeze() for example in batch])
     
    return processed_batch

In [6]:

train_dataset = ImageTextClassificationDataset(train_dataset, processor)
train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=1, collate_fn=collator)

test_dataset = ImageTextClassificationDataset(test_dataset, processor)
test_dataloader = DataLoader(test_dataset, shuffle=False, batch_size=4, collate_fn=collator)

optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)



In [7]:
import numpy as np
from torch import nn
epoch_loss_list = []

#criterion = nn.CrossEntropyLoss(reduction='mean')

# with open("./ambiguous_questions_test.csv", 'r') as f:
#     reader = csv.reader(f)
#     lines = [line for line in reader]

def compute_acc(predictions, references):
    
    total_len = len(predictions)
    same_count = 0
    for prediction, reference in zip(predictions, references):
        if prediction == reference:
            same_count += 1
    
    return same_count / total_len

for epoch in range(10):
    print("Epoch:", epoch)
    epoch_loss = []
    for idx, batch in enumerate(tqdm(train_dataloader)):
        for key in batch.keys():
            batch[key] = batch[key].to(device)
            
        if "t5" in model_name_or_path:
            decoder_input_ids = batch.pop("decoder_input_ids").to(device)
            outputs = model(**batch)
        
        else:
            outputs = model(**batch)
        
        #print(labels)
        #print(outputs)
        
        # loss = criterion(outputs, labels)
        loss = outputs.loss
        #print(loss.item())
        #loss = torch.mean(outputs)
        
        epoch_loss.append(loss.item())

        loss.backward()

        optimizer.step()
        optimizer.zero_grad()

        #if idx % 10 == 0:
        #    generated_output = model.generate(pixel_values=pixel_values, input_ids=input_ids)
        #    print(processor.batch_decode(generated_output, skip_special_tokens=True))
    
    print(np.mean(epoch_loss))
    
    
    model.eval()
    with torch.no_grad():
        epoch_outputs = []
        gold_references = []
        # metric = load("accuracy")
        for idx, batch in enumerate(tqdm(test_dataloader)):
            for key in batch.keys():
                batch[key] = batch[key].to(device)
            
            # if "t5" in model_name_or_path:
            #     decoder_input_ids = batch.pop("decoder_input_ids").to(device)
            #     logits = model(pixel_values, input_ids, decoder_input_ids)
            # else:
            outputs = model.generate(**batch)
            predictions = processor.batch_decode(outputs, skip_special_tokens=True)
            references = processor.batch_decode(batch['labels'], skip_special_tokens=True)
            # metric.add_batch(predictions=predictions, references=references)
            
            epoch_outputs += predictions #processor.batch_decode(generated_output, skip_special_tokens=True)
            gold_references += references
            
        #accuracy = metric.compute()
        print(epoch_outputs[:10])
        print(gold_references[:10])
        print(compute_acc(epoch_outputs , gold_references))
        
    # with open ("./test_{}.csv".format(epoch), 'w') as f:
        
    #     writer = csv.writer(f)
    #     for idx, line in enumerate(lines):
    #         if idx == 0:
    #             writer.writerow(line)
    #         else:
    #             line.append(epoch_outputs[idx-1])
    #             writer.writerow(line)
                
    model.train()            
                

Epoch: 0



[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
  1%|          | 25/3343 [00:13<30:23,  1.82it/s]


KeyboardInterrupt: 