# Model Evaluation

This notebook writes a single Markdown file, `report.md`, containing an evaluation section for each model in the `models` list below.

In [1]:
# Models to evaluate. Per-entry flags (consumed by load_model):
#   int8       - load the weights in 8-bit
#   peft       - wrap the model with the PEFT adapter stored under the same name
#   noToDevice - skip the final `model.to(device)` call
models = [
  # Salesforce BLIP model in int8, no PEFT
  dict(name="blip-image-captioning-base-8bit-pre-peft", int8=True, peft=False, noToDevice=True),
  # Salesforce BLIP int8+PEFT
  dict(name="blip-image-captioning-base-quantized", int8=True, peft=True, noToDevice=False),
  # Model from training
  dict(name="blip-image-captioning-base-finetuned", int8=True, peft=True, noToDevice=False),
  # Base model (full not-int8)
  dict(name="Salesforce/blip-image-captioning-base", int8=False, peft=False, noToDevice=False),
]

# CSV file that lists the test examples.
test_filename = "ids_test.csv"

# Number of example rows shown in each sample table of the report.
report_sample_size = 5


In [2]:
import torch
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')


## Prep & Common Functions

In [3]:
from datasets import Dataset, Image

def prep_data(df):
  """Build a Dataset of image paths and reference texts from a DataFrame.

  Reads the `media_id` and `description` columns of `df`. Each media id is
  mapped to the path `media/<media_id>.jpg`. The resulting Dataset has an
  `image` column (cast to the `Image` feature) and a `text` column.
  """
  columns = {
    "image": [f"media/{media_id}.jpg" for media_id in df['media_id'].to_list()],
    "text": df['description'].to_list(),
  }
  dataset = Dataset.from_dict(columns)
  return dataset.cast_column("image", Image())


  from .autonotebook import tqdm as notebook_tqdm


In [4]:
import pandas as pd

# Load the CSV named by `test_filename` into a DataFrame; later cells read its
# `media_id` and `description` columns.
df_test = pd.read_csv(test_filename)


In [5]:
#def get_dataset(processor):
#  print("Loading Dataset & setting processor")
#
#  # Create Dataset from DataFrame
#  ds_test = prep_data(df_test)
#
#  # Define transform function using processor
#  # https://huggingface.co/docs/transformers/tasks/image_captioning
#  def transforms(example_batch):
#      images = [x.convert("RGB").resize((100,100)) for x in example_batch["image"]]
#      captions = [x for x in example_batch["text"]]
#      inputs = processor(images=images, text=captions, padding="max_length", return_tensors="pt")
#      #inputs.update({"labels": inputs["input_ids"]})
#      #inputs = processor(images, return_tensors="pt").to(device)
#      return inputs
#
#  # Set transform
#  ds_test.set_transform(transforms)
#
#  # Return Dataset with transform
#  return ds_test


In [6]:
# Build the evaluation Dataset (image + reference text columns) from the test DataFrame.
ds_test = prep_data(df_test)


In [7]:
# Handle for the generated report; it stays open across cells and is closed in the Cleanup cell.
file = open('report.md', mode='w', encoding="utf-8")

def out(text):
  """Write `text` followed by a newline to the report file."""
  line = text + "\n"
  file.write(line)

# Report title; the embedded "\n" leaves a blank line after the heading.
out("# Model Evaluation Report\n")


In [8]:
out("## Dataset Sample")
out("| Image | Description |")
out("|-------|-------------|")

# Take both columns from the same `report_sample_size` rows so the two lists
# always have the same length (the description count used to be fixed at 5,
# which broke the loop for any sample size above 5).
sample_rows = df_test.head(report_sample_size)
sample_images = sample_rows['media_id'].to_list()
sample_descriptions = sample_rows['description'].to_list()

# One table row per sampled example.
for media_id, description in zip(sample_images, sample_descriptions):
  out(f"| ![](media/{media_id}.jpg) | {description} |")


## Model Evaluation

In [9]:
from transformers import BlipProcessor, BlipForConditionalGeneration
from peft import LoraConfig, LoraConfig, get_peft_model, PeftConfig, PeftModel

# LoRA settings; these should match the values used for training.
# NOTE(review): nothing in the visible cells reads this object - load_model()
# obtains its config from the checkpoint via PeftConfig.from_pretrained.
peft_config = LoraConfig(
  r=32,
  lora_alpha=64,
  target_modules=["qkv"],
  lora_dropout=0.05,
  bias="none",
)

def load_model(model_data):
  """Load a BLIP captioning model and its processor as described by `model_data`.

  Args:
    model_data: dict with the key `name` (model id or path handed to
      `from_pretrained`) and the optional boolean flags `int8` (load the
      weights in 8-bit), `peft` (wrap the model with the PEFT adapter stored
      under `name`) and `noToDevice` (skip the final move to `device`).
      Flags that are missing are treated as False.

  Returns:
    A `(model, processor)` tuple.
  """
  model_name = model_data['name']
  use_int8 = model_data.get('int8', False)
  use_peft = model_data.get('peft', False)
  skip_to_device = model_data.get('noToDevice', False)

  print(f"Loading model {model_name}")

  processor = BlipProcessor.from_pretrained(model_name)
  model = BlipForConditionalGeneration.from_pretrained(model_name, load_in_8bit=use_int8)

  if use_peft:
    # Named `adapter_config` so it does not shadow the module-level `peft_config`.
    adapter_config = PeftConfig.from_pretrained(model_name)
    model = PeftModel.from_pretrained(model, model_name, config=adapter_config)
    print(model.device)

  if not skip_to_device:
    model = model.to(device)

  return model, processor


In [10]:
import evaluate

# Load the four text metrics once, in this order; evaluate_descriptions() reads them as globals.
bleu, rouge, wer, meteor = (
  evaluate.load(metric_name) for metric_name in ("bleu", "rouge", "wer", "meteor")
)


2023-11-29 01:49:53.482358: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-11-29 01:49:53.504439: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2023-11-29 01:49:53.504457: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2023-11-29 01:49:53.505034: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2023-11-29 01:49:53.508576: I tensorflow/core/platform/cpu_feature_guar

In [11]:
def generate_description(model, processor, data, image_size=(100, 100), max_length=200):
  """Generate a text description for one dataset example.

  Args:
    model: captioning model exposing `.device` and `.generate(...)`.
    processor: callable that turns an image into model inputs (including
      `pixel_values`) and provides `batch_decode`.
    data: mapping whose `image` entry supports `.convert()` and `.resize()`.
    image_size: size tuple passed to `.resize()` before preprocessing.
      Defaults to the previously hard-coded `(100, 100)`.
    max_length: forwarded to `model.generate` as `max_length`.
      Defaults to the previously hard-coded `200`.

  Returns:
    The first decoded sequence, with special tokens skipped.
  """
  image = data['image'].convert('RGB').resize(image_size)
  inputs = processor(image, return_tensors="pt")
  # Move the pixel tensor to the model's device before generating.
  pixels = inputs['pixel_values'].to(model.device)
  generated_tensor = model.generate(pixel_values=pixels, max_length=max_length)
  return processor.batch_decode(generated_tensor, skip_special_tokens=True)[0]


In [12]:
def evaluate_descriptions(descriptions_human, descriptions_generated):
  """Score generated descriptions against the human references.

  Runs the module-level `rouge`, `bleu`, `wer` and `meteor` metrics, in that
  order, with the generated texts as predictions and the human texts as
  references.

  Returns:
    A flat dict: ROUGE and METEOR entries keep their own keys, BLEU entries
    are prefixed with `bleu_`, and the WER result is stored under `wer`.
  """
  scoring_args = {"predictions": descriptions_generated, "references": descriptions_human}

  metrics = dict(rouge.compute(**scoring_args))
  metrics.update(("bleu_" + key, value) for key, value in bleu.compute(**scoring_args).items())
  metrics['wer'] = wer.compute(**scoring_args)
  metrics.update(meteor.compute(**scoring_args))

  return metrics


In [13]:
def evaluate_model(model_data):
  """Evaluate one model on `ds_test` and append its section to the report.

  Loads the model via `load_model`, generates a description for every example
  in `ds_test`, writes a sample table (at most `report_sample_size` rows) and
  a metrics table through `out`.

  Args:
    model_data: model entry as accepted by `load_model`; its `name` is used
      for the section heading.
  """
  model_name = model_data['name']
  out(f"## Model: {model_name}")

  model, processor = load_model(model_data)

  print("Model device:",model.device)

  print('Generating Descriptions')
  descriptions_human = []
  descriptions_generated = []

  for example in ds_test:
    descriptions_human.append(example['text'])
    descriptions_generated.append(generate_description(model, processor, example))

  out("Sample Descriptions:")
  out("| Image | Generated Description | Human Description |")
  out("|-|-|-|")
  # zip() stops at the shortest input, so a dataset with fewer rows than
  # `report_sample_size` yields a shorter table instead of an IndexError.
  sample_rows = zip(sample_images[:report_sample_size], descriptions_generated, descriptions_human)
  for media_id, generated, human in sample_rows:
    out(f"| ![](media/{media_id}.jpg) | {generated} | {human} |")

  # Newline
  out("")

  print("Evaluating")
  metrics = evaluate_descriptions(descriptions_human, descriptions_generated)

  out("Metrics:")
  out("| Metric | Score |")
  out("|--------|-------|")
  for metric_name, metric_score in metrics.items():
    out(f"| {metric_name} | `{metric_score}` |")


In [14]:
import gc

# Debug shortcut: swap in the line below to evaluate only the first model.
#for model_metadata in [models[0]]:
# Evaluate every configured model in turn. gc.collect() runs after each one,
# presumably so the previous model's objects are released before the next load.
for model_metadata in models:
  evaluate_model(model_metadata)
  gc.collect()


You passed `quantization_config` to `from_pretrained` but the model you're loading already has a `quantization_config` attribute. The `quantization_config` attribute will be overwritten with the one you passed to `from_pretrained`.


Loading model blip-image-captioning-base-8bit-pre-peft
Model device: cuda:0
Generating Descriptions
Evaluating
Loading model blip-image-captioning-base-quantized
cuda:0
Model device: cuda:0
Generating Descriptions
Evaluating
Loading model blip-image-captioning-base-finetuned
cuda:0
Model device: cuda:0
Generating Descriptions
Evaluating
Loading model Salesforce/blip-image-captioning-base
Model device: cuda:0
Generating Descriptions
Evaluating


## Cleanup

In [15]:
# Close the report handle so everything written through out() is flushed to report.md.
file.close()
