#  Test con DONUT su CM1-COVER

In questa sezione useremo **DONUT (Document Understanding Transformer)**
per estrarre le informazioni dalle stesse 10 immagini già processate con GPT,
e confronteremo i risultati.  
DONUT è un modello OCR-free, quindi lavora direttamente sulle immagini senza passare da un motore OCR tradizionale.


In [1]:
# Installazione librerie
!pip install transformers accelerate datasets
!pip install pillow




In [3]:
# Upload delle immagini (10 file dal tuo PC)
from google.colab import files
import os

uploaded = files.upload()  # seleziona i 10 file immagine
os.makedirs("cm1_images", exist_ok=True)

for fname in uploaded.keys():
    os.rename(fname, f"cm1_images/{fname}")

print("Immagini salvate in cm1_images/")


KeyboardInterrupt: 

In [15]:
# Import DONUT fine-tunato
import torch
from transformers import DonutProcessor, VisionEncoderDecoderModel

# Usiamo il checkpoint fine-tunato su DocVQA
processor = DonutProcessor.from_pretrained("naver-clova-ix/donut-base-finetuned-docvqa")
model = VisionEncoderDecoderModel.from_pretrained("naver-clova-ix/donut-base-finetuned-docvqa")

device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)
print("Device:", device)



Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

preprocessor_config.json:   0%|          | 0.00/359 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/535 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/1.30M [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

added_tokens.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/478 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/803M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/803M [00:00<?, ?B/s]

Device: cuda


In [16]:
# Funzione predizione con DONUT
def predict_with_donut(img_path):
    image = Image.open(img_path).convert("RGB")
    pixel_values = processor(image, return_tensors="pt").pixel_values.to(device)
    outputs = model.generate(pixel_values, max_length=512)
    result = processor.batch_decode(outputs, skip_special_tokens=True)[0]
    return result

# 🔍 Parsing output DONUT in formato Name/Vorname/Geb-Dat
def parse_donut_output(raw_output):
    try:
        data = json.loads(raw_output)
        name = data.get("Name", "")
        vorname = data.get("Vorname", "")
        gebdat = data.get("Geb-Dat", data.get("GebDat", ""))
    except:
        name_match = re.search(r"(?:Name|<name>)([^<}]*)", raw_output, re.IGNORECASE)
        vorname_match = re.search(r"(?:Vorname|<vorname>)([^<}]*)", raw_output, re.IGNORECASE)
        gebdat_match = re.search(r"(?:Geb[- ]?Dat|<gebdat>)([^<}]*)", raw_output, re.IGNORECASE)

        name = name_match.group(1).strip() if name_match else ""
        vorname = vorname_match.group(1).strip() if vorname_match else ""
        gebdat = gebdat_match.group(1).strip() if gebdat_match else ""

    return {"Name": name, "Vorname": vorname, "Geb-Dat": gebdat}


In [17]:
id_to_file = {
    "78867961": "78867962.jpg",
    "78867179": "78867180.jpg",
    "78867223": "78867224.jpg",
    "78867406": "78867407.jpg",
    "78867679": "78867680.jpg",
    "78912579": "78912580.jpg",
    "78950724": "78950725.jpg",
    "78938299": "78938300.jpg",
    "78982956": "78982957.jpg",
    "78917020": "78917021.jpg"
}



In [19]:
from PIL import Image

def predict_with_prompt(img_path, question="What is the Name, Vorname and Geb-Dat?"):
    # Apri immagine
    image = Image.open(img_path).convert("RGB")

    # Prepara immagine per il modello
    pixel_values = processor(image, return_tensors="pt").pixel_values.to(device)

    # Prompt in stile DocVQA
    task_prompt = f"<s_docvqa><s_question>{question}</s_question><s_answer>"
    decoder_input_ids = processor.tokenizer(
        task_prompt,
        add_special_tokens=False,
        return_tensors="pt"
    ).input_ids.to(device)

    # Genera output
    outputs = model.generate(
        pixel_values,
        decoder_input_ids=decoder_input_ids,
        max_length=512
    )
    result = processor.batch_decode(outputs, skip_special_tokens=True)[0]
    return result


In [7]:
# Genera predizioni per tutte le immagini
predictions = {}

for doc_id, filename in id_to_file.items():
    img_path = f"cm1_images/{filename}"
    try:
        raw_output = predict_with_donut(img_path)
        parsed = parse_donut_output(raw_output)
        predictions[doc_id] = [parsed]
        print(f"ID {doc_id}: {parsed}")
    except Exception as e:
        print(f"Errore con {doc_id}: {e}")


✅ ID 78867961: {'Name': '', 'Vorname': '', 'Geb-Dat': ''}
✅ ID 78867179: {'Name': '', 'Vorname': '', 'Geb-Dat': ''}
✅ ID 78867223: {'Name': '', 'Vorname': '', 'Geb-Dat': ''}
✅ ID 78867406: {'Name': '', 'Vorname': '', 'Geb-Dat': ''}
✅ ID 78867679: {'Name': '', 'Vorname': '', 'Geb-Dat': ''}
✅ ID 78912579: {'Name': '', 'Vorname': '', 'Geb-Dat': ''}
✅ ID 78950724: {'Name': '', 'Vorname': '', 'Geb-Dat': ''}
✅ ID 78938299: {'Name': '', 'Vorname': '', 'Geb-Dat': ''}
✅ ID 78982956: {'Name': '', 'Vorname': '', 'Geb-Dat': ''}
✅ ID 78917020: {'Name': '', 'Vorname': '', 'Geb-Dat': ''}


In [8]:
# Salva le predizioni in JSON
with open("predictions_donut.json", "w", encoding="utf-8") as f:
    json.dump(predictions, f, ensure_ascii=False, indent=2)

print(" File salvato: predictions_donut.json")


📂 File salvato: predictions_donut.json
