# Prepare data

In [None]:
from data import Data, split, export_page_as_pdf
from pathlib import Path
from grobid_client.grobid_client import GrobidClient


# Root directory holding the dataset file and every artefact derived from it.
data_path = Path("./data")

data_path.mkdir(exist_ok=True)

# Decode explicitly as UTF-8: a bare read_text() uses the platform's locale
# encoding, which can garble or reject non-ASCII characters in the JSON
# (e.g. on Windows with a legacy code page).
data = Data.model_validate_json((data_path / "data.json").read_text(encoding="utf-8"))
# Only the second value returned by split() is kept; it is used as the
# validation set throughout this notebook. The first value is discarded.
_, valid_data = split(data.examples, only_with_refs=True)

In [None]:
# Write one single-page PDF per validation example into `valid_path`.
pdfs_path = data_path.joinpath("PLOS_1000")
valid_path = data_path.joinpath("pdfs_valid")

for directory in (pdfs_path, valid_path):
    directory.mkdir(exist_ok=True)

for i, example in enumerate(valid_data):
    export_page_as_pdf(
        # Source PDFs sit in a per-document folder named like the file itself.
        pdfs_path / example.file / f"{example.file}.pdf",
        example.page,
        # The zero-padded running index becomes the output file name.
        valid_path / f"{i:03d}.pdf",
    )

In [None]:
# Directory that receives the GROBID results for the validation PDFs.
output_path = data_path.joinpath("grobid_valid")
output_path.mkdir(exist_ok=True)

# Inference

In [7]:
# Client for a GROBID server expected at localhost:8070.
# One way to start such a server locally (port 8070 is published by `-p 8070:8070`):
# sudo docker run --rm --gpus all --init --ulimit core=0 -p 8070:8070 grobid/grobid:0.8.1
client = GrobidClient(grobid_server="http://localhost:8070")

GROBID server is up and running


In [None]:
# Run GROBID's "processReferences" service on the exported validation PDFs;
# results are written below `output_path`. Absolute paths are passed for both
# the input and the output directory.
client.process(
    "processReferences",
    input_path=valid_path.absolute(),
    output=output_path.absolute(),
)

Processing of /home/david/mpcdf/mplhlt/cupido/data/pdfs_valid/082.pdf failed with error 204 , 
Processing of /home/david/mpcdf/mplhlt/cupido/data/pdfs_valid/065.pdf failed with error 204 , 
Processing of /home/david/mpcdf/mplhlt/cupido/data/pdfs_valid/113.pdf failed with error 204 , 
Processing of /home/david/mpcdf/mplhlt/cupido/data/pdfs_valid/074.pdf failed with error 204 , 
Processing of /home/david/mpcdf/mplhlt/cupido/data/pdfs_valid/141.pdf failed with error 204 , 
Processing of /home/david/mpcdf/mplhlt/cupido/data/pdfs_valid/066.pdf failed with error 204 , 
Processing of /home/david/mpcdf/mplhlt/cupido/data/pdfs_valid/032.pdf failed with error 204 , 
Processing of /home/david/mpcdf/mplhlt/cupido/data/pdfs_valid/024.pdf failed with error 204 , 
Processing of /home/david/mpcdf/mplhlt/cupido/data/pdfs_valid/192.pdf failed with error 204 , 
Processing of /home/david/mpcdf/mplhlt/cupido/data/pdfs_valid/030.pdf failed with error 204 , 
Processing of /home/david/mpcdf/mplhlt/cupido/data

# Results

In [20]:
from llamore import References


# Gold-standard references, one `References` object per validation example.
references_gold = [References(ex.refs) for ex in valid_data]

# Pair GROBID results with gold examples explicitly by index rather than by
# position in the sorted directory listing. With positional pairing, a single
# missing or extra entry in `output_path` (stale file from an earlier run,
# checkpoint folder, an input for which nothing was written) silently shifts
# every later pair and corrupts the metrics.
#
# The input PDFs were exported as f"{i:03d}.pdf"; this assumes GROBID keeps that
# stem as the part of the output file name before the first dot
# -- TODO confirm against the grobid client's output naming.
files_sorted = sorted(output_path.iterdir())
xml_by_index = {
    file.name.split(".", 1)[0]: file
    for file in files_sorted
    if file.suffix == ".xml"
}

references_grobid = []
for i in range(len(references_gold)):
    xml_file = xml_by_index.get(f"{i:03d}")
    if xml_file is None:
        # No XML result for this example (e.g. GROBID reported a failure):
        # score it against an empty prediction.
        references_grobid.append(References([]))
    else:
        references_grobid.append(References.from_xml(xml_file))

In [21]:
from llamore import F1


# Macro-averaged F1 of the GROBID predictions against the gold references.
# NOTE(review): num_processes=0 presumably disables multiprocessing -- confirm in llamore.
F1().compute_macro_average(
    references_grobid,
    references_gold,
    num_processes=0,
)

Output()

0.5090445861402239

In [22]:
# Keep only the examples whose gold annotation holds more than one reference,
# then score that subset with the same macro-averaged F1 as above.
kept_pairs = [
    (grobid_refs, gold_refs)
    for grobid_refs, gold_refs in zip(references_grobid, references_gold)
    if len(gold_refs) > 1
]
non_single_grobid = [pair[0] for pair in kept_pairs]
non_single_gold = [pair[1] for pair in kept_pairs]

F1().compute_macro_average(
    non_single_grobid,
    non_single_gold,
    num_processes=0,
)

Output()

0.5235358974896732

In [23]:
# Micro-averaged scores over all validation examples; as the output of this cell
# shows, the result is a precision/recall/F1 breakdown per reference field.
F1().compute_micro_average(references_grobid, references_gold)

Output()

{'Reference.analytic_title': {'precision': 0.5020219526285384,
  'recall': 0.42744712247909494,
  'f1': 0.4617428267800213},
 'Reference.journal_title': {'precision': 0.9037780401416765,
  'recall': 0.7221698113207548,
  'f1': 0.8028316727844783},
 'Reference.authors.Person.first_name': {'precision': 0.5614358683314415,
  'recall': 0.46580341377280754,
  'f1': 0.5091681142636557},
 'Reference.authors.Person.surname': {'precision': 0.9033830275229358,
  'recall': 0.7198172472872644,
  'f1': 0.8012205199923716},
 'Reference.editors.Person.first_name': {'precision': 0.0,
  'recall': 0.0,
  'f1': 0.0},
 'Reference.editors.Person.surname': {'precision': 0.0,
  'recall': 0.0,
  'f1': 0.0},
 'Reference.publisher': {'precision': 0.41,
  'recall': 0.3565217391304348,
  'f1': 0.38139534883720927},
 'Reference.publication_date': {'precision': 0.8406113537117904,
  'recall': 0.7247058823529412,
  'f1': 0.7783674500884509},
 'Reference.publication_place': {'precision': 0.16216216216216217,
  'recal