# Qwen2.5-VL su CM1-COVER

In questa sezione utilizziamo **Qwen2.5-VL-3B-Instruct** (modello multimodale) per estrarre i campi `Name`, `Vorname`, `Geb-Dat` dalle stesse 10 immagini già testate con GPT, DONUT e PaliGemma.


In [7]:
!pip install transformers accelerate pillow




In [8]:
import gc

import torch

# Run the garbage collector first, then ask PyTorch to release the CUDA
# memory it is still caching.
gc.collect()
torch.cuda.empty_cache()


In [19]:
import os

# Debugging aid suggested by the CUDA error text ("For debugging consider
# passing CUDA_LAUNCH_BLOCKING=1").
# NOTE(review): this probably has to be set before CUDA is first initialised
# in the process to take effect - confirm.
os.environ.update({"CUDA_LAUNCH_BLOCKING": "1"})


In [9]:
from transformers import AutoProcessor, AutoModelForVision2Seq
import torch

model_id = "Qwen/Qwen2.5-VL-3B-Instruct"

# The processor is used later for chat templating, for building the model
# inputs from text + images, and for decoding the generated ids.
processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)

# `dtype` replaces the `torch_dtype` keyword, which transformers reports as
# deprecated ("`torch_dtype` is deprecated! Use `dtype` instead!").
# NOTE(review): the generation cell in this notebook recorded a CUDA
# device-side assert; float16 numerics are one possible cause - consider
# dtype="auto" and confirm.
model = AutoModelForVision2Seq.from_pretrained(
    model_id,
    dtype=torch.float16,
    device_map="auto",
    trust_remote_code=True,
)

# Device the model reports; inputs are moved here before generation.
device = model.device
print("Modello caricato su:", device)


Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

The image processor of type `Qwen2VLImageProcessor` is now loaded as a fast processor by default, even if the model checkpoint was saved with a slow processor. This is a breaking change and may produce slightly different outputs. To continue using the slow processor, instantiate this class with `use_fast=False`. Note that this behavior will be extended to all models in a future release.


Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

`torch_dtype` is deprecated! Use `dtype` instead!


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Modello caricato su: cuda:0


In [10]:
from google.colab import files
import os

# Ask the user to upload the 10 images through the Colab file picker.
uploaded = files.upload()
os.makedirs("cm1_images", exist_ok=True)

# Move every uploaded file from the working directory into cm1_images/.
for uploaded_name in uploaded:
    os.rename(uploaded_name, f"cm1_images/{uploaded_name}")

print("Immagini salvate in cm1_images/")


Immagini salvate in cm1_images/


In [11]:
# Document ID -> image file name inside cm1_images/.
# In every entry below the file number is the document ID plus one.
# NOTE(review): presumably a dataset naming convention - confirm before
# adding entries.
id_to_file = {
    "78867961": "78867962.jpg",
    "78867179": "78867180.jpg",
    "78867223": "78867224.jpg",
    "78867406": "78867407.jpg",
    "78867679": "78867680.jpg",
    "78912579": "78912580.jpg",
    "78950724": "78950725.jpg",
    "78938299": "78938300.jpg",
    "78982956": "78982957.jpg",
    "78917020": "78917021.jpg",
}


In [12]:
!pip install qwen-vl-utils



In [16]:
from PIL import Image
from transformers import AutoProcessor, Qwen2_5_VLForConditionalGeneration

def predict_qwen(
    img_path,
    question="Estrai i campi: Name, Vorname, Geb-Dat in formato JSON.",
    max_side=448,
    max_new_tokens=256,
):
    """Ask the loaded Qwen vision-language model a question about one image.

    Relies on the module-level ``processor``, ``model`` and ``device``
    objects created when the checkpoint is loaded.

    Args:
        img_path: Path of the image file to analyse.
        question: Text prompt sent together with the image.
        max_side: Upper bound, in pixels, for the longer image side. The
            image is shrunk in place to fit a ``max_side`` x ``max_side``
            box before it is handed to the processor.
        max_new_tokens: Maximum number of tokens the model may generate.

    Returns:
        The decoded model reply as a string, with the prompt tokens removed.
    """
    from qwen_vl_utils import process_vision_info

    # Close the file handle deterministically; the RGB copy stays usable.
    with Image.open(img_path) as source:
        image = source.convert("RGB")
    image.thumbnail((max_side, max_side))

    # Multimodal chat turn: the image itself plus the text question.
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image},
                {"type": "text", "text": question},
            ],
        }
    ]

    # Render the chat template to a prompt string ending with the
    # generation prompt.
    text = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )

    # Pull the visual inputs back out of the message structure.
    image_inputs, video_inputs = process_vision_info(messages)

    # Build the combined text + vision tensors and move them to the model
    # device.
    inputs = processor(
        text=[text],
        images=image_inputs,
        videos=video_inputs,
        return_tensors="pt",
    ).to(device)

    output_ids = model.generate(**inputs, max_new_tokens=max_new_tokens)

    # The generated sequences start with the prompt ids; keep only the
    # newly generated tail of each sequence.
    generated_ids = [
        out_ids[len(in_ids):]
        for in_ids, out_ids in zip(inputs.input_ids, output_ids)
    ]
    return processor.batch_decode(generated_ids, skip_special_tokens=True)[0]

In [17]:
import re
import json

def parse_output(raw_output):
    """Extract ``Name``, ``Vorname`` and ``Geb-Dat`` from a model reply.

    The reply is first treated as JSON: the whole text is tried, then the
    span from the first ``{`` to the last ``}`` (which covers replies that
    wrap the object in code fences or surrounding prose). If no JSON object
    can be decoded, each field is searched for with a regular expression.

    Args:
        raw_output: Text produced by the model. ``None`` is treated as an
            empty reply; bytes are decoded as UTF-8.

    Returns:
        A dict with exactly the keys ``"Name"``, ``"Vorname"`` and
        ``"Geb-Dat"``. Fields that cannot be found are empty strings.
    """
    fields = ("Name", "Vorname", "Geb-Dat")

    if raw_output is None:
        text = ""
    elif isinstance(raw_output, (bytes, bytearray)):
        text = raw_output.decode("utf-8", errors="replace")
    else:
        text = str(raw_output)

    candidates = [text]
    start, end = text.find("{"), text.rfind("}")
    if 0 <= start < end:
        candidates.append(text[start:end + 1])

    for candidate in candidates:
        try:
            data = json.loads(candidate)
        except ValueError:  # json.JSONDecodeError is a ValueError subclass
            continue
        # Only a JSON object can carry the named fields; lists, numbers or
        # strings fall through to the regex search below.
        if isinstance(data, dict):
            return {key: data.get(key, "") for key in fields}

    # Regex fallback. The accented-letter range (U+00C0-U+00FF) is written
    # with escapes so that a file re-encoding cannot corrupt it again. An
    # optional quote on either side of the separator lets the patterns match
    # JSON-like fragments (e.g. truncated output) as well as plain text.
    letters = r"([A-Za-z\u00C0-\u00FF\-]+)"
    # Digit groups joined by '.', '/' or '-', so 01.02.1990 is kept whole.
    date = r"([0-9]+(?:[./\-][0-9]+)*)"
    separator = r"[\"']?[: ]+[\"']?"
    patterns = {
        "Name": r"Name" + separator + letters,
        "Vorname": r"Vorname" + separator + letters,
        "Geb-Dat": r"Geb[- ]?Dat" + separator + date,
    }

    result = {}
    for key in fields:
        match = re.search(patterns[key], text)
        result[key] = match.group(1) if match else ""
    return result


In [20]:
# Run the model on every mapped image and collect the parsed fields per
# document ID.
predictions = {}
for doc_id, filename in id_to_file.items():
    img_path = f"cm1_images/{filename}"
    raw = predict_qwen(img_path)
    parsed = parse_output(raw)
    # NOTE(review): stored as a one-element list - presumably to match the
    # prediction format used for the other models; confirm.
    predictions[doc_id] = [parsed]
    # \u2192 is a right arrow; written as an escape so the message cannot be
    # garbled by a file re-encoding.
    print(f"ID {doc_id} \u2192 {parsed}")



AcceleratorError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [30]:
# Write all collected predictions to disk as indented UTF-8 JSON, keeping
# non-ASCII characters unescaped.
with open("predictions_qwen.json", mode="w", encoding="utf-8") as out_file:
    json.dump(predictions, out_file, indent=2, ensure_ascii=False)

print(" File salvato: predictions_qwen.json")


 File salvato: predictions_qwen.json
