In [1]:
# coding=utf-8
# Copyright 2023-present the HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import torch
from datasets import load_dataset
from torch.utils.data import DataLoader, Dataset
from transformers import AutoModelForVision2Seq, AutoProcessor
from transformers import InstructBlipProcessor, InstructBlipForConditionalGeneration
from PIL import Image

from tqdm import tqdm
import csv

In [2]:
# import os 
# os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"
# os.environ["CUDA_VISIBLE_DEVICES"]="0,1"

In [3]:
model_name_or_path = 'Salesforce/instructblip-flan-t5-xxl'
cache_dir = "./" + model_name_or_path.split('/')[-1]

# We load our model and processor using `transformers`
processor = AutoProcessor.from_pretrained(model_name_or_path,cache_dir=cache_dir)
model = AutoModelForVision2Seq.from_pretrained(model_name_or_path,cache_dir=cache_dir,torch_dtype=torch.float16)

Loading checkpoint shards:   0%|          | 0/5 [00:00<?, ?it/s]

In [4]:
device = "cuda" if torch.cuda.is_available() else "cpu"

model.to(device)
model.eval()


InstructBlipForConditionalGeneration(
  (vision_model): InstructBlipVisionModel(
    (embeddings): InstructBlipVisionEmbeddings(
      (patch_embedding): Conv2d(3, 1408, kernel_size=(14, 14), stride=(14, 14))
    )
    (encoder): InstructBlipEncoder(
      (layers): ModuleList(
        (0-38): 39 x InstructBlipEncoderLayer(
          (self_attn): InstructBlipAttention(
            (dropout): Dropout(p=0.0, inplace=False)
            (qkv): Linear(in_features=1408, out_features=4224, bias=True)
            (projection): Linear(in_features=1408, out_features=1408, bias=True)
          )
          (layer_norm1): LayerNorm((1408,), eps=1e-06, elementwise_affine=True)
          (mlp): InstructBlipMLP(
            (activation_fn): GELUActivation()
            (fc1): Linear(in_features=1408, out_features=6144, bias=True)
            (fc2): Linear(in_features=6144, out_features=1408, bias=True)
          )
          (layer_norm2): LayerNorm((1408,), eps=1e-06, elementwise_affine=True)
        

In [5]:
import pandas as pd
import json
import numpy as np

few_shot_examples = {}

intermediate_questions =  pd.read_excel("intermediate_questions_samples_32.xlsx", dtype={'qid' : str})

with open("./ambiguous_questions_rebuilt_ptb.json") as f:
    ambiguous_questions = json.load(f)

with open("./ambiguous_questions.json") as f:
    original_questions = json.load(f)

print(len(ambiguous_questions))
for idx in intermediate_questions.index:
    intermediate_question_example = intermediate_questions.iloc[idx]
    qid = intermediate_question_example['qid']

    ambiguous_question_example = ambiguous_questions[qid]
    
    ambiguous_question = ambiguous_question_example['question']
    intermediate_question = intermediate_question_example['intermediate question']
    original_question_example = original_questions.pop(qid)
    
    few_shot_examples[qid] = original_question_example
    few_shot_examples[qid]['ambiguous_question'] = ambiguous_question
    few_shot_examples[qid]['intermediate_question'] = intermediate_question
    few_shot_examples[qid].pop("addtional_question")
    
    entities_list = set()
    for object_name in few_shot_examples[qid]['irrelated_object_names'].values():
        if object_name in few_shot_examples[qid]['ambiguous_question']:
             entities_list.add(object_name)
    few_shot_examples[qid]['question_entities'] = list(entities_list)
   
np.random.seed(42) 
test_keys = np.random.choice(list(original_questions.keys()), 32, replace=False)
test_examples = {}

for qid in test_keys:
    original_questions[qid].pop("addtional_question")
    test_examples[qid] =  original_questions[qid]
    
    ambiguous_question_example = ambiguous_questions[qid]
    ambiguous_question = ambiguous_question_example['question']
    
    test_examples[qid]["ambiguous_question"] = ambiguous_question
    
    entities_list = set()
    for object_name in test_examples[qid]['irrelated_object_names'].values():
        if object_name in test_examples[qid]['ambiguous_question']:
             entities_list.add(object_name)
    test_examples[qid]['question_entities'] = list(entities_list)

125854


In [6]:

promt = f"Instructions : Classify the following into ambigous and definite question. An ambiguous question is unanswerable question considering image. "
text = ""

predictions = []
csv_lines = [["AQ", "IQ"]]
for idx,(qid, example) in tqdm(enumerate(test_examples.items())):
    if idx == 0:
        continue
    question = example["ambiguous_question"]
    image_id = example['imageId']
    image = Image.open("./images/" + str(image_id) + '.jpg').convert('RGB')
    
    input_prompt = text + \
    f"Main question is {example['ambiguous_question']} " +  \
    "The ambiguous entities of the main question: " +  " ".join(example['question_entities']) + ". " + \
    "Write a intermediate question to clarify the ambiguous entities of the main question. " + \
    f"Question: "

    # "the intermediate question is not same with the main question." + \
    
    
    print(input_prompt)
    inputs = processor(image, input_prompt, return_tensors="pt").to(device, torch.float16)
    out = model.generate(**inputs,
                         do_sample=False,
                        num_beams=5,
                        max_length=256,
                        min_length=3,
                        top_p=0.9,
                        repetition_penalty=1.5,
                        length_penalty=1.0,
                        temperature=1,)
    prediction = processor.decode(out[0], skip_special_tokens=True)
    example['intermediate_question'] = prediction
    print(prediction)
    predictions.append(prediction)
    csv_lines.append([question, prediction])
    
with open (f"./test_fewshot_{model_name_or_path.split('/')[-1]}.json" , 'w') as f:
     
    json.dump(test_examples, f)   
    # writer = csv.writer(f)
    # for idx, line in enumerate(test_lines):
    #     if idx == 0:
    #         writer.writerow(line)
    #     else:
    #         line.append(predictions[idx-1])
    #         writer.writerow(line)

with open (f"./test_fewshot_{model_name_or_path.split('/')[-1]}.csv" , 'w') as f:
    writer = csv.writer(f)
    writer.writerows(csv_lines)       
            
                

0it [00:00, ?it/s]

Main question is Is the bus to the left or to the right of the pedestrians? The ambiguous entities of the main question: pedestrians. Write a intermediate question to clarify the ambiguous entities of the main question. Question: 


2it [00:01,  1.01it/s]

What side of the street are the pedestrians on?
Main question is What is the food to the left of the doughnut the bucket is to the right of? The ambiguous entities of the main question: bucket. Write a intermediate question to clarify the ambiguous entities of the main question. Question: 


3it [00:02,  1.33it/s]

What is to the right of the doughnut?
Main question is How large is the bear? The ambiguous entities of the main question: bear. Write a intermediate question to clarify the ambiguous entities of the main question. Question: 


4it [00:02,  1.67it/s]

what kind of bear is this?
Main question is Is the bus to the right or to the left of the driver? The ambiguous entities of the main question: bus. Write a intermediate question to clarify the ambiguous entities of the main question. Question: 


5it [00:03,  1.74it/s]

Is the bus to the right or to the left of the driver?
Main question is Is the blue car to the right or to the left of the van? The ambiguous entities of the main question: van. Write a intermediate question to clarify the ambiguous entities of the main question. Question: 


6it [00:03,  1.71it/s]

Is the blue car to the right or to the left of the van?
Main question is Is the elephant? The ambiguous entities of the main question: elephant. Write a intermediate question to clarify the ambiguous entities of the main question. Question: 


7it [00:04,  1.91it/s]

Is the elephant in water?
Main question is Which side is the picture on? The ambiguous entities of the main question: picture. Write a intermediate question to clarify the ambiguous entities of the main question. Question: 


8it [00:04,  2.03it/s]

picture is on which side of tv?
Main question is What device is the girl holding , a phone or a Wii controller? The ambiguous entities of the main question: girl. Write a intermediate question to clarify the ambiguous entities of the main question. Question: 


9it [00:05,  1.88it/s]

What device is the girl holding, a phone or a Wii controller?
Main question is What is the food on the plate called? The ambiguous entities of the main question: plate. Write a intermediate question to clarify the ambiguous entities of the main question. Question: 


10it [00:05,  2.07it/s]

What is the food on the plate called?
Main question is Is the person to the left of the laptop looking at a television? The ambiguous entities of the main question: laptop. Write a intermediate question to clarify the ambiguous entities of the main question. Question: 


11it [00:06,  1.91it/s]

Is the person to the right of the laptop looking at a television?
Main question is Is the chair to the left of the remote control that is to the left of the girl? The ambiguous entities of the main question: remote control. Write a intermediate question to clarify the ambiguous entities of the main question. Question: 


12it [00:06,  1.90it/s]

What is to the left of the girl?
Main question is What is the person to the left of the man holding? The ambiguous entities of the main question: man. Write a intermediate question to clarify the ambiguous entities of the main question. Question: 


13it [00:07,  1.98it/s]

What is the person to the left of the man holding?
Main question is Is the house to the right or to the left of the cow? The ambiguous entities of the main question: cow. Write a intermediate question to clarify the ambiguous entities of the main question. Question: 


14it [00:07,  2.02it/s]

Is the cow to the left or right of the house?
Main question is What color is the zebra? The ambiguous entities of the main question: zebra. Write a intermediate question to clarify the ambiguous entities of the main question. Question: 


15it [00:08,  2.07it/s]

What color are the stripes on the zebra?
Main question is What is the elephant? The ambiguous entities of the main question: elephant. Write a intermediate question to clarify the ambiguous entities of the main question. Question: 


16it [00:08,  2.20it/s]

What is the elephant doing?
Main question is What is the vegetable to the left of the cucumber called? The ambiguous entities of the main question: cucumber. Write a intermediate question to clarify the ambiguous entities of the main question. Question: 


17it [00:09,  2.18it/s]

What is the vegetable to the left of the cucumber called?
Main question is Is she to the left of a picture? The ambiguous entities of the main question: picture. Write a intermediate question to clarify the ambiguous entities of the main question. Question: 


18it [00:09,  2.23it/s]

Is she to the right of a picture?
Main question is What shape does the table have? The ambiguous entities of the main question: table. Write a intermediate question to clarify the ambiguous entities of the main question. Question: 


19it [00:09,  2.49it/s]

What shape is the table?
Main question is What is the item of furniture to the left of the speaker? The ambiguous entities of the main question: speaker. Write a intermediate question to clarify the ambiguous entities of the main question. Question: 


20it [00:10,  2.29it/s]

What is the item of furniture to the right of the speaker?
Main question is Are the glasses to the right or to the left of the person? The ambiguous entities of the main question: person glasses. Write a intermediate question to clarify the ambiguous entities of the main question. Question: 


21it [00:10,  2.28it/s]

are glasses on right or left of person?
Main question is Which side of the picture is the car on? The ambiguous entities of the main question: car. Write a intermediate question to clarify the ambiguous entities of the main question. Question: 


22it [00:11,  2.34it/s]

Which side of the picture is the car on?
Main question is What is the appliance to the right of the woman sitting on top of? The ambiguous entities of the main question: woman. Write a intermediate question to clarify the ambiguous entities of the main question. Question: 


23it [00:11,  2.30it/s]

What is the woman sitting on top of?
Main question is What is on the building behind the window? The ambiguous entities of the main question: window. Write a intermediate question to clarify the ambiguous entities of the main question. Question: 


24it [00:11,  2.41it/s]

What is on the building behind the window?
Main question is Are the people? The ambiguous entities of the main question: people. Write a intermediate question to clarify the ambiguous entities of the main question. Question: 


25it [00:12,  2.59it/s]

are there people in picture
Main question is What color is the shirt? The ambiguous entities of the main question: shirt. Write a intermediate question to clarify the ambiguous entities of the main question. Question: 


26it [00:12,  2.47it/s]

What color is the shirt?
Main question is What is the player wearing? The ambiguous entities of the main question: player. Write a intermediate question to clarify the ambiguous entities of the main question. Question: 


27it [00:13,  2.48it/s]

what is player wearing?
Main question is Is the person in the snow wearing a backpack? The ambiguous entities of the main question: snow. Write a intermediate question to clarify the ambiguous entities of the main question. Question: 


28it [00:13,  2.37it/s]

Is the person in the snow wearing a backpack?
Main question is What type of device is open , the cell phone or the microphone? The ambiguous entities of the main question: cell phone. Write a intermediate question to clarify the ambiguous entities of the main question. Question: 


29it [00:13,  2.56it/s]

What type of phone is this?
Main question is Is the man to the right of the people sitting on a bench? The ambiguous entities of the main question: bench. Write a intermediate question to clarify the ambiguous entities of the main question. Question: 


30it [00:14,  2.21it/s]

Is the man to the left of the people sitting on a bench?
Main question is What is the item of furniture that is to the right of the curtains? The ambiguous entities of the main question: curtains. Write a intermediate question to clarify the ambiguous entities of the main question. Question: 


31it [00:14,  2.32it/s]

What is to the left of the curtains?
Main question is Is the car? The ambiguous entities of the main question: car. Write a intermediate question to clarify the ambiguous entities of the main question. Question: 


32it [00:15,  2.08it/s]

Is the car in front of bus?



