In [1]:
from data import Data
from pathlib import Path
from train import CollateFn
from transformers import AutoProcessor, AutoModelForVision2Seq
import torch
from llamore import SchemaPrompter, F1
from rich.progress import track

In [2]:
# Load the annotated dataset; JSON is read explicitly as UTF-8 so the result
# does not depend on the locale's default encoding.
data_path = Path("./data/data.json")
data = Data.model_validate_json(data_path.read_text(encoding="utf-8"))

# Keep only the examples whose `refs` field is truthy
# (presumably a non-empty list of references -- confirm against `Data`).
examples = [ex for ex in data.examples if ex.refs]

# The last 200 filtered examples are held out for validation, the rest is training data.
train_data, valid_data = examples[:-200], examples[-200:]

In [3]:
# Processor for the NuExtract base model; its `.tokenizer` is used further
# down to decode generated and label token ids.
processor = AutoProcessor.from_pretrained(
    "numind/NuExtract-2.0-2B",
    use_fast=True,
    padding_side="right",  # right-side padding, as needed for training
    trust_remote_code=True,
)

You have video processor config saved in `preprocessor.json` file which is deprecated. Video processor configs should be saved in their own `video_preprocessor.json` file. You can rename the file or load and save the processor back which renames it automatically. Loading from `preprocessor.json` will be removed in v5.0.


In [4]:
# Collator that builds model batches from examples.
# NOTE(review): `input_dir` is an absolute, machine-specific path, while other
# cells use the relative "data/PLOS_1000/" -- confirm whether they can be unified.
collate = CollateFn(
    processor=processor,
    input_dir="/raven/u/dcfidalgo/projects/cupido/data/PLOS_1000/",
)

In [15]:
# Load the checkpoint saved under ./finetune_lora in bfloat16 with
# FlashAttention 2, letting `device_map="auto"` decide the placement.
# Options that were previously experimented with here (use_cache=False for
# training, a custom config, ignore_mismatched_sizes, a quantization_config)
# are left out for this evaluation run.
model = AutoModelForVision2Seq.from_pretrained(
    "./finetune_lora/checkpoint-998",
    device_map="auto",
    attn_implementation="flash_attention_2",
    torch_dtype=torch.bfloat16,
    trust_remote_code=True,
)

In [16]:
# Wrap the model with torch.compile using the "max-autotune" mode.
# NOTE(review): only `.generate(...)` is called on this wrapper later on --
# confirm that generation actually runs through the compiled forward pass.
compiled_model = torch.compile(
    model,
    mode="max-autotune",
)

In [17]:
# Schema model exposed by llamore's SchemaPrompter; it is used below to parse
# JSON text via `model_validate_json` and exposes the result's `.references`.
schema_prompter = SchemaPrompter()
refs_model = schema_prompter.schema_model

In [18]:
def predict(example, max_new_tokens: int = 10000):
    """Generate references for one example and return them with the gold ones.

    The example is collated into a single-item batch, the sequence is cut
    right after the first labelled position and passed to ``generate``, and
    both the generated text and the label text are parsed with ``refs_model``.

    Args:
        example: A single example accepted by ``collate``.
        max_new_tokens: Upper bound on the number of newly generated tokens.

    Returns:
        A tuple ``(predicted_references, gold_references)``. The prediction is
        an empty list when the generated text cannot be parsed by
        ``refs_model``.

    Raises:
        IndexError: If every label position equals -100, so there is no
            position to cut the prompt at.
        Exception: Whatever ``refs_model.model_validate_json`` raises when the
            gold label text cannot be parsed; gold parsing is deliberately not
            guarded.
    """
    batch = collate([example])
    batch = {k: v.to("cuda") for k, v in batch.items()}

    # Labels are only needed to locate the answer and to recover the gold
    # text. They are not a generation input and no longer match the truncated
    # `input_ids`, so they are removed from the kwargs forwarded to `generate`.
    labels = batch.pop("labels")[0]
    is_target = labels != -100  # presumably -100 marks prompt/ignored positions -- confirm in CollateFn

    # Generate output
    # Index of the first labelled position; everything after it is dropped.
    # NOTE(review): the slice `[: idx + 1]` keeps the token AT that position in
    # the prompt -- confirm this is intended rather than an off-by-one.
    idx = is_target.nonzero()[0][0].item()
    batch["input_ids"] = batch["input_ids"][:, : idx + 1]
    batch["attention_mask"] = batch["attention_mask"][:, : idx + 1]
    with torch.inference_mode():
        output = compiled_model.generate(**batch, max_new_tokens=max_new_tokens)

    # Decode from the first labelled position onwards.
    generated = processor.tokenizer.decode(output[0][idx:], skip_special_tokens=True)

    # Parse starting at the first "{". When no brace is found, `find` returns
    # -1, the slice is just the last character, parsing fails and the
    # prediction falls back to an empty list (best effort on purpose).
    start = generated.find("{")
    try:
        refs = refs_model.model_validate_json(generated[start:].strip())
    except Exception:
        predicted_references = []
    else:
        predicted_references = refs.references

    # Get gold references from the labelled positions only.
    gold_text = processor.tokenizer.decode(labels[is_target], skip_special_tokens=True)
    gold_start = gold_text.find("{")
    gold_references = refs_model.model_validate_json(gold_text[gold_start:].strip())

    return predicted_references, gold_references.references

In [38]:
# Run the fine-tuned model on the first 10 validation examples, collecting
# predictions and gold references in parallel lists.
my_references = []
gold_references = []
for example in track(valid_data[:10]):
    predicted, expected = predict(example, max_new_tokens=10000)
    my_references.append(predicted)
    gold_references.append(expected)

Output()

In [39]:
# Macro-averaged F1 of the fine-tuned model's predictions against the gold references.
F1().compute_macro_average(
    my_references,
    gold_references,
    num_processes=0,
)

Output()

0.5935766743096721

In [7]:
import os

# Read the Gemini API key from the first line of `.env`, which is expected to
# look like NAME="value" (quotes optional).
with open(".env", "r", encoding="utf-8") as f:
    # Split on the FIRST "=" only, so a value that itself contains "=" is kept whole.
    env_name, env_sep, env_value = f.readline().partition("=")

if not env_sep:
    raise ValueError("The first line of .env must have the form NAME=VALUE")

GEMINI_API_KEY = env_value.strip()
# Drop surrounding quotes only when they are really there; an unquoted value
# must not lose its first and last character.
if (
    len(GEMINI_API_KEY) >= 2
    and GEMINI_API_KEY[0] == GEMINI_API_KEY[-1]
    and GEMINI_API_KEY[0] in "'\""
):
    GEMINI_API_KEY = GEMINI_API_KEY[1:-1]

In [30]:
import pymupdf
from llamore import GeminiExtractor


# Gemini-backed extractor from llamore, authenticated with the key read from `.env`.
extractor = GeminiExtractor(
    api_key=GEMINI_API_KEY,
)


def predict_gemini(example):
    """Extract references from the example's PDF page with the Gemini extractor.

    The page ``example.page`` of ``data/PLOS_1000/<file>/<file>.pdf`` is copied
    into a one-page PDF written to ``test.pdf`` in the working directory, and
    that file is handed to ``extractor``.

    Args:
        example: Object providing ``file`` (used as both directory and file
            stem) and ``page`` (presumably 1-based, since 1 is subtracted to
            get pymupdf's 0-based page index -- confirm).

    Returns:
        Whatever ``extractor(pdf="test.pdf")`` returns.
    """
    pdfs_dir = Path("data/PLOS_1000/")
    pdf_path = pdfs_dir / example.file / f"{example.file}.pdf"
    page_index = example.page - 1

    # Both documents are used as context managers so their handles are closed
    # even when copying or saving fails.
    with pymupdf.open(pdf_path) as doc, pymupdf.open() as new_doc:
        new_doc.insert_pdf(doc, from_page=page_index, to_page=page_index)
        # NOTE: "test.pdf" is overwritten on every call.
        new_doc.save("test.pdf")

    return extractor(pdf="test.pdf")


In [40]:
# Run the Gemini extractor on the same first 10 validation examples.
gemini_references = []
for example in track(valid_data[:10]):
    extracted = predict_gemini(example)
    gemini_references.append(extracted)

Output()

In [41]:
# Macro-averaged F1 of the Gemini predictions against the gold references
# collected during the fine-tuned model's evaluation loop.
F1().compute_macro_average(
    gemini_references,
    gold_references,
    num_processes=0,
)

Output()

0.6698353357578595