In [1]:
import os
import sys

# Add the parent of the current working directory to the import path
# (presumably where the project's `modules` package lives — verify).
sys.path.append("./..")

# Expose only GPU 0 to CUDA. This runs before `torch` is imported in a later cell.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

In [2]:
import numpy as np
import torch
import random
import json

In [3]:
# Device string; this cell only defines it.
device = 'cuda'

# One seed value shared by every random number generator seeded below.
seed_value = 10

# Export the hash seed as an environment variable.
# NOTE(review): Python reads PYTHONHASHSEED at interpreter start-up, so setting it
# here likely only affects subprocesses, not this kernel — confirm.
os.environ['PYTHONHASHSEED'] = str(seed_value)

# Seed Python's built-in RNG and NumPy's global RNG with the same value.
for seed_fn in (random.seed, np.random.seed):
    seed_fn(seed_value)

# Seed PyTorch last: as the cell's final expression, the Generator it returns
# is what the notebook displays as this cell's output.
torch.manual_seed(seed_value)

<torch._C.Generator at 0x7f923c13d590>

## Load model for prediction

In [4]:
# Model identifier (looks like a Hugging Face Hub repo id — confirm). The wrapper
# class is chosen further down from substrings of this value.
model_path = "meta-llama/Llama-3.2-11B-Vision-Instruct"
# Forwarded to the model wrapper as its `max_new_tokens` argument.
max_new_tokens = 1024

In [6]:
# Pick the wrapper class from a substring of the lower-cased model path.
# Each import stays inside its branch, so only the selected backend module is imported.
model_key = model_path.lower()
if 'cogvlm' in model_key:
    from modules.models.cogvlm_models import CogVLMModel
    model_cls = CogVLMModel
elif 'contactdoctor' in model_key:
    from modules.models.biomedllama_models import BioMedLlamaModel
    model_cls = BioMedLlamaModel
elif 'liuhaotian' in model_key:
    from modules.models.llava_models import HuggingfaceModel as LlavaModel
    model_cls = LlavaModel
else:  # Any other model (LLaVA, Llama, etc.)
    from modules.models.vision_models import VisionModel
    model_cls = VisionModel

# Every wrapper is constructed with the same keyword arguments.
model = model_cls(model_name=model_path, stop_sequences=[], max_new_tokens=max_new_tokens)

2025-07-15 10:46:25.077635: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1752547585.090902 3736010 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1752547585.094777 3736010 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1752547585.106802 3736010 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1752547585.106814 3736010 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1752547585.106815 3736010 computation_placer.cc:177] computation placer alr

Loading checkpoint shards:   0%|          | 0/5 [00:00<?, ?it/s]

## Input data


The input data (an image and a text question) should follow a format like this:
- ```image_path = "data/vqav2/val2014/COCO_val2014_000000525732.jpg"```
- ```question = "What color strip does the surfboard have?"```

In [7]:
# Load the VQAv2 validation questions (a JSONL file: one JSON object per line).
# Note: point `data_dir` (or `question_file` / `image_dir` directly) at your own dataset.
data_dir = "/home/daohieu/maplecg_nfs/research/VLM/su_vlm/data/vqav2"
question_file = os.path.join(data_dir, "llava_OpenEnded_mscoco_val2014_questions.jsonl")
image_dir = os.path.join(data_dir, "val2014")

# `with` closes the file handle as soon as parsing finishes. Blank lines (for
# example a trailing newline at the end of the file) are skipped instead of being
# passed to json.loads, which would raise on them.
with open(os.path.expanduser(question_file), "r", encoding="utf-8") as question_fh:
    questions = [json.loads(line) for line in question_fh if line.strip()]

In [8]:
# Display the first record to show which fields each question entry carries.
questions[0]

{'question_id': 262148000,
 'image': 'COCO_val2014_000000262148.jpg',
 'text': 'Where is he looking?',
 'category': 'default',
 'answers': ['down',
  'down',
  'at table',
  'skateboard',
  'down',
  'table',
  'down',
  'down',
  'down',
  'down']}

In [9]:
# Pick one example and pull out the fields used in the rest of the notebook.
idx = 0
sample = questions[idx]
image_path = os.path.join(image_dir, sample["image"])
question = sample["text"]
answer = sample["answers"]

# Echo the selected example.
print("idx: ", idx)
print(f"Image path: {image_path}")
print(f"Question: {question}")
print(f"Answer: {answer}")


idx:  0
Image path: /home/daohieu/maplecg_nfs/research/VLM/su_vlm/data/vqav2/val2014/COCO_val2014_000000262148.jpg
Question: Where is he looking?
Answer: ['down', 'down', 'at table', 'skateboard', 'down', 'table', 'down', 'down', 'down', 'down']


## Model Prediction

### U1: Sampling

In [10]:
# Instruction prepended to every question for the VQA task.
# Change this text to fit your own task.
prefix_prompt = 'Answer this question in only a word or a phrase. '

In [11]:
# Full text prompt: the instruction prefix followed directly by the question.
prompt = prefix_prompt + question

In [12]:
# Most likely generation, obtained with a low temperature.
# It is used for evaluation: its correctness is the reference against which UQ
# methods are scored (with metrics such as AUROC, ECE, CPC, etc.).
(
    most_likely_generation_output_text,
    most_likely_generation_log_likelihood,
    most_likely_generation_embedding,
) = model.predict_prompt_image(prompt, image_path, temperature=0.1, top_p=0.9)
print(f"Most likely generation output text: {most_likely_generation_output_text}")

From v4.47 onwards, when a model cache is to be returned, `generate` will return a `Cache` instance instead by default (as opposed to the legacy tuple of tuples format). If you want to keep returning the legacy format, please set `return_legacy_cache=True`.


Most likely generation output text: Above the table.


In [13]:
# Show the low-temperature prediction next to the dataset's answers for this question.
print("Prediction: ", most_likely_generation_output_text)
print("Ground truth: ", answer)

Prediction:  Above the table.
Grouth truth:  ['down', 'down', 'at table', 'skateboard', 'down', 'table', 'down', 'down', 'down', 'down']


In [14]:
# Draw k samples at a high temperature and keep, for each sample, its text,
# its log-likelihoods and its embedding (these are used to compute the UQ metrics).
num_samples = 10
temperature = 1.0
top_p = 0.9
generation_list = []
generation_log_likelihood_list = []
embedding_list = []  # per-sample embedding tensors, stacked after the loop
for i in range(num_samples):
    generation, generation_log_likelihood, generation_embedding = model.predict_prompt_image(
        prompt, image_path, temperature=temperature, top_p=top_p
    )
    generation_list.append(generation)
    generation_log_likelihood_list.append(generation_log_likelihood)
    embedding_list.append(generation_embedding)

# Stack the per-sample tensors and convert straight to a float64 NumPy array.
# This gives the same values as going through `.tolist()` but skips the
# nested-Python-list round trip. `embedding` is now bound exactly once, as an
# ndarray, instead of first being a list and then being rebound.
embedding = (
    torch.stack(embedding_list)
    .detach()
    .to(device="cpu", dtype=torch.float64)
    .numpy()
)

In [15]:
# Show the text of every sampled generation.
print(f"Generation list: {generation_list}")

Generation list: ['above.', 'Above the table.', 'The ground.', 'The sky.', 'Above the skateboard.', 'Above the bench.', 'The ground.', 'Above the bench.', 'The skateboarder appears to be indoors.', 'Out of the window.']


In [16]:
# Shape of the stacked embedding array (one row per sampled generation).
print("Embedding shape:", embedding.shape)

Embedding shape: (10, 4096)


In [17]:
# Log-likelihood values recorded for the first sampled response
# (presumably one value per generated token — TODO confirm against the model wrapper).
print("Log likelihoods of a response:", generation_log_likelihood_list[0])

Log likelihoods of a response: [-3.1181602478027344, -2.1971194744110107, 0.0]


## Compute UMPIRE

### U2-4

In [20]:
def normalize_embedding(x):
    """
    Scale each embedding vector to unit L2 norm.

    x: array-like of shape (n_samples, dim); vectors lie along the last axis.

    Returns an ndarray of the same shape. A row whose norm is exactly zero is
    returned as all zeros instead of NaN (dividing by a zero norm would otherwise
    silently propagate NaN into everything computed downstream).
    """
    x = np.asarray(x)
    # Integer/bool input must become floating point before the in-place style divide.
    if not np.issubdtype(x.dtype, np.floating):
        x = x.astype(np.float64)
    # One vectorised norm over the last axis replaces the per-row Python loop.
    norms = np.linalg.norm(x, ord=2, axis=-1, keepdims=True)
    # `where` skips zero-norm rows (they keep the zeros from `out`); NaN norms are
    # not skipped, so NaN inputs still surface as NaN.
    return np.divide(x, norms, out=np.zeros_like(x), where=norms != 0)

def compute_incoherence_score(x):
    """
    Compute one incoherence score per response as 1 - p_sequence.

    x: iterable in which every item holds the log-likelihood values of one response.
       p_sequence is exp(sum of those values).

    Returns a NumPy array with one score per item of x.
    """
    sequence_probs = []
    for log_likelihoods in x:
        # The sum of log-likelihoods is the log of the sequence probability.
        sequence_probs.append(np.exp(np.sum(log_likelihoods)))
    return 1 - np.array(sequence_probs)

from sklearn.utils.extmath import fast_logdet
def compute_logdet(K, alpha=1e-8):
    """
    Log-determinant of K after adding alpha to its diagonal.

    K: square matrix (its first dimension sets the size of the identity added).
    alpha: value added to every diagonal entry before taking the log-determinant.

    Returns whatever `fast_logdet` returns for K + alpha * I.
    """
    n = K.shape[0]
    diagonal_shift = np.identity(n) * alpha
    return fast_logdet(K + diagonal_shift)

def compute_umpire(V, C, alpha=30):
    """
    Compute the UMPIRE value V + alpha * ||C||_1.

    V: logdet value
    C: incoherence score(s); for a 1-D input the L1 norm is the sum of absolute values
    alpha: hyperparameter weighting the incoherence term

    Returns the combined value V_tilde.
    """
    incoherence_l1 = np.linalg.norm(C, ord=1)
    return V + alpha * incoherence_l1

In [26]:
# Normalize the sampled embeddings; phi is the name used for the normalized embedding.
phi = normalize_embedding(embedding)
norm_embedding = phi
print(r"Normalized embedding vector $\phi$ shape:", phi.shape)

# Incoherence score of every sampled generation, computed from its log-likelihoods.
C = compute_incoherence_score(generation_log_likelihood_list)
print("Incoherence score C:", C)

Normalized embedding vector $\phi$ shape: (10, 4096)
Incoherence score C: [0.9950841  0.92494653 0.9538083  0.96402587 0.98102351 0.94508971
 0.9538083  0.94508971 0.99997448 0.99974541]


In [28]:
# Logdet of the Gram matrix of the normalized embeddings
gram_matrix = phi @ phi.T
V = compute_logdet(gram_matrix)

# UMPIRE: combine the logdet term with the incoherence scores
alpha = 30
V_tilde = compute_umpire(V, C, alpha)
# Author's note: the range of V_tilde should be (-inf, 2xx).
print("UMPIRE V_tilde:", V_tilde)

UMPIRE V_tilde: 232.42780981658478
