In [1]:
from blip_quantizer import BlipQuantizer, QuantConfig, ModelPart, LayerGroup, LayerType
from quant_functions import uniform_quantization
import torch
from transformers import Blip2Processor, Blip2ForConditionalGeneration
from datasets import COCODataset
from tqdm import tqdm
from PIL import Image
from torch.utils.data import DataLoader
from utils import print_model_structure

# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the pretrained BLIP-2 checkpoint and place it on the selected device.
model = Blip2ForConditionalGeneration.from_pretrained("Salesforce/blip2-opt-2.7b").to(device)

# One uniform-quantization config per model part, all targeting the MIDDLE
# layer group's MLP layers: 8 bits for the ViT, 4 bits for the Q-Former and LLM.
quantizer = BlipQuantizer(model)
configs = [
    QuantConfig(part, LayerGroup.MIDDLE, LayerType.MLP,
                uniform_quantization, num_bits=bits)
    for part, bits in (
        (ModelPart.VIT, 8),
        (ModelPart.QFORMER, 4),
        (ModelPart.LLM, 4),
    )
]

print("Quantizing model...")
quantizer.apply_quantization(configs)

# Uncomment to inspect the model's module tree:
# print_model_structure(model)



Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Quantizing model...


In [2]:
from evaluation_pipeline import EvaluationPipeline

# Build the COCO val2017 captioning dataset from the local annotation file
# and image directory.
coco_dataset = COCODataset(ann_file='./data/coco/annotations/captions_val2017.json',
                           img_dir='./data/coco/val2017')

model_name = "Salesforce/blip2-opt-2.7b"
processor = Blip2Processor.from_pretrained(model_name)

# Create evaluator
evaluator = EvaluationPipeline(model, processor, device)

# Evaluate
print("Starting evaluation...")
coco_results = evaluator.evaluate(coco_dataset, task='image_captioning', max_samples=1000)

# Save results
evaluator.save_results(coco_results, './results/coco_quantized_evaluation.json')

# Print overall CIDEr score
print(f"COCO CIDEr score: {coco_results['overall_cider']}")

# Print a few example predictions.
# Cap the count at the number of predictions actually returned so the cell
# does not raise IndexError when fewer than five samples were evaluated.
print("\nExample predictions:")
num_examples = min(5, len(coco_results['predictions']))
for i in range(num_examples):
    print(f"Image ID: {coco_results['predictions'][i]['image_id']}")
    print(f"Prediction: {coco_results['predictions'][i]['caption']}")
    print(f"References: {coco_results['references'][i]}")
    print(f"Individual CIDEr score: {coco_results['individual_cider'][i]}")
    print()


loading annotations into memory...
Done (t=0.05s)
creating index...
index created!
Starting evaluation...


100%|███████████████████████████████████████████████████████████████████████| 1000/1000 [03:31<00:00,  4.73it/s]
PTBTokenizer tokenized 61766 tokens at 310803.25 tokens per second.
PTBTokenizer tokenized 5489 tokens at 55702.28 tokens per second.


COCO CIDEr score: 0.6399606432560109

Example predictions:
Image ID: 397133
Prediction: a woman in a kitchen
References: ['A man is in a kitchen making pizzas.', 'Man in apron standing on front of oven with pans and bakeware', 'A baker is working in the kitchen rolling dough.', 'A person standing by a stove in a kitchen.', 'A table with pies being made and a person standing near a wall with pots and pans hanging on the wall.']
Individual CIDEr score: 0.7323661412056455

Image ID: 37777
Prediction: kitchen
References: ['The dining table near the kitchen has a bowl of fruit on it.', 'A small kitchen has various appliances and a table.', 'The kitchen is clean and ready for us to see.', 'A kitchen and dining area decorated in white.', 'A kitchen that has a bowl of fruit on the table.']
Individual CIDEr score: 0.3339097712588217

Image ID: 252219
Prediction: man and woman walking down the street
References: ['a person with a shopping cart on a city street ', 'City dwellers walk by as a home

In [4]:
from inference_pipeline import InferencePipeline

# Load the processor that matches the BLIP-2 checkpoint.
model_name = "Salesforce/blip2-opt-2.7b"
processor = Blip2Processor.from_pretrained(model_name)

# Build the COCO val2017 captioning dataset from the local annotation file
# and image directory.
coco_dataset = COCODataset(ann_file='./data/coco/annotations/captions_val2017.json',
                           img_dir='./data/coco/val2017')

# Run inference on a small sample and persist the raw results so they can be
# scored separately from the saved JSON file.
inferencer = InferencePipeline(model, processor, device)
print("Starting inference...")
results = inferencer.run_inference(coco_dataset, task='image_captioning', max_samples=20)
inferencer.save_results(results, './results/coco_quantized_inference.json')

loading annotations into memory...
Done (t=0.04s)
creating index...
index created!
Starting inference...


100%|███████████████████████████████████████████████████████████████████████████| 20/20 [00:03<00:00,  5.41it/s]


Tokenizing...


PTBTokenizer tokenized 1235 tokens at 13214.43 tokens per second.
PTBTokenizer tokenized 92 tokens at 1219.63 tokens per second.


Computing scores...
Computing ['Bleu_1', 'Bleu_2', 'Bleu_3', 'Bleu_4'] score...
{'testlen': 73, 'reflen': 173, 'guess': [73, 53, 35, 20], 'correct': [63, 25, 10, 2]}
ratio: 0.42196531791663605
Computing METEOR score...
Computing ROUGE_L score...
Computing CIDEr score...
Computing SPICE score...


Parsing reference captions
Parsing test captions
Initiating Stanford parsing pipeline
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator tokenize
[main] INFO edu.stanford.nlp.pipeline.TokenizerAnnotator - TokenizerAnnotator: No tokenizer type provided. Defaulting to PTBTokenizer.
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator ssplit
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator parse
[main] INFO edu.stanford.nlp.parser.common.ParserGrammar - Loading parser from serialized file edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz ... 
done [0.5 sec].
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator lemma
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator ner
Loading classifier from edu/stanford/nlp/models/ner/english.all.3class.distsim.crf.ser.gz ... done [1.1 sec].
Loading classifier from edu/stanford/nlp/models/ner/english.muc.7class.distsim.crf.ser.gz ... done [0.7

SPICE evaluation took: 5.914 s
Bleu_1: 0.21932782985573562
Bleu_2: 0.162150004990946
Bleu_3: 0.12405491633945606
Bleu_4: 0.0834602148134694
METEOR: 0.15735645084223648
ROUGE_L: 0.35939849127554524
CIDEr: 0.6174471337969347
SPICE: 0.15014796021888632

Example predictions:
Image ID: 397133
Prediction: a woman in a kitchen
References: ['A man is in a kitchen making pizzas.', 'Man in apron standing on front of oven with pans and bakeware', 'A baker is working in the kitchen rolling dough.', 'A person standing by a stove in a kitchen.', 'A table with pies being made and a person standing near a wall with pots and pans hanging on the wall.']
Individual CIDEr score: 0.689941342710598

Image ID: 37777
Prediction: kitchen
References: ['The dining table near the kitchen has a bowl of fruit on it.', 'A small kitchen has various appliances and a table.', 'The kitchen is clean and ready for us to see.', 'A kitchen and dining area decorated in white.', 'A kitchen that has a bowl of fruit on the tabl

In [3]:
from scoring_pipeline import ScoringPipeline

# Load the saved inference results from disk and score them for the
# image-captioning task.
scorer = ScoringPipeline()
loaded_results = scorer.load_results('./results/coco_quantized_inference.json')
scores = scorer.compute_scores(
    loaded_results,
    task='image_captioning',
)

# Report every entry except those whose key ends in '_per_caption'.
for metric, score in scores.items():
    if metric.endswith('_per_caption'):
        continue
    print(f"{metric}: {score}")

Tokenizing...


PTBTokenizer tokenized 1235 tokens at 13131.68 tokens per second.
PTBTokenizer tokenized 92 tokens at 1169.08 tokens per second.


Computing scores...
Computing METEOR score...
Computing CIDEr score...
METEOR: 0.15735645084223648
CIDEr: 0.6174471337969347

Example predictions:


NameError: name 'results' is not defined

In [3]:
import gc

# Move the model's weights to the CPU before dropping the references.
# Guarded so the cell still runs when `model` was never created or has
# already been deleted in this kernel.
if 'model' in globals():
    model.to('cpu')

# A plain `del model, evaluator` raises NameError when either name is missing
# (e.g. the evaluation cell was not run in this kernel), which would abort the
# cell before the collection and cache-release steps below; pop() tolerates
# missing names.
# NOTE(review): other objects built from the model (e.g. a quantizer or an
# inference pipeline) presumably keep their own reference to it - confirm and
# delete them too if the memory must really be released.
for _name in ('model', 'evaluator'):
    globals().pop(_name, None)

gc.collect()
torch.cuda.empty_cache()