In [19]:
import json
import os
import re

from gliner import GLiNER

In [20]:
# Load the pretrained "urchade/gliner_small" GLiNER checkpoint; `model` is reused
# by the tokenization cell further down.
model = GLiNER.from_pretrained("urchade/gliner_small")

Fetching 4 files: 100%|██████████| 4/4 [00:00<00:00, 14413.42it/s]


In [2]:
def tokenize_text(text):
    """Split ``text`` into word and punctuation tokens.

    A token is either a run of word characters, optionally extended by
    further runs joined with ``-`` or ``_`` (so ``48-year-old`` stays in one
    piece), or any single non-whitespace character. Whitespace is discarded.

    Args:
        text: The string to split.

    Returns:
        A list of token strings in order of appearance.
    """
    token_pattern = r"\w+(?:[-_]\w+)*|\S"
    return [match.group(0) for match in re.finditer(token_pattern, text)]

In [None]:
# Parse the JSON file into `data`; the next cell indexes it by position (data[0]).
with open("./data/interim/maccrobat2020.json", mode="r", encoding="utf8") as json_file:
    data = json.load(json_file)

In [4]:
# Work with the first record only; later cells read its "text" and "entities" keys.
example = data[0]

In [10]:
# Show the whole record as the cell output.
example

{'text': "A 48-year-old man presented at our hospital complaining of intracranial bruits.\nThe cranial nerve and laboratory examinations were normal.\nA DVAF was considered, conventional digital subtraction angiography was performed, which showed a Cognard II DVAF in the location of right hypoglossal canal (HC).\nThe DVAF was fed by the meningeal branches of the bilateral ascending pharyngeal arteries, and the right anterior condylar veins within the hypoglossal canal that was the fistulous point mainly drained into the jugular vein.\nEndovascular treatment was insisted upon by the patient because of the associated intolerable intracranial bruit.\nTreatment was performed by transarterial approach under general anesthesia using a biplane angiographic unit.\nStandard coaxial techniques were used.\nThe guide catheter was navigated into the left ascendingtrawt pharyngeal artery.\nMarathon flow directed catheter (eV3) was subsequently navigated over a Mirage.008 microwire (eV3) to reach as 

In [42]:
text = example["text"]
# GLiNER's words_splitter hands back a one-shot generator of (token, start, end)
# tuples. Materialise it so `tokenized_text` can be iterated more than once;
# otherwise re-running the next cell on its own would see an exhausted generator
# and silently build an empty `token_offsets`.
tokenized_text = list(model.data_processor.words_splitter(text))

In [43]:
# Flatten every token triple into a 4-tuple that also carries the token's
# position in the sequence: (token text, start offset, end offset, token index).
token_offsets = []
for idx, token in enumerate(tokenized_text):
    surface, char_start, char_end = token[0], token[1], token[2]
    token_offsets.append((surface, char_start, char_end, idx))

In [44]:
# Show the (token text, start, end, token index) tuples as the cell output.
token_offsets

[('A', 0, 1, 0),
 ('48-year-old', 2, 13, 1),
 ('man', 14, 17, 2),
 ('presented', 18, 27, 3),
 ('at', 28, 30, 4),
 ('our', 31, 34, 5),
 ('hospital', 35, 43, 6),
 ('complaining', 44, 55, 7),
 ('of', 56, 58, 8),
 ('intracranial', 59, 71, 9),
 ('bruits', 72, 78, 10),
 ('.', 78, 79, 11),
 ('The', 80, 83, 12),
 ('cranial', 84, 91, 13),
 ('nerve', 92, 97, 14),
 ('and', 98, 101, 15),
 ('laboratory', 102, 112, 16),
 ('examinations', 113, 125, 17),
 ('were', 126, 130, 18),
 ('normal', 131, 137, 19),
 ('.', 137, 138, 20),
 ('A', 139, 140, 21),
 ('DVAF', 141, 145, 22),
 ('was', 146, 149, 23),
 ('considered', 150, 160, 24),
 (',', 160, 161, 25),
 ('conventional', 162, 174, 26),
 ('digital', 175, 182, 27),
 ('subtraction', 183, 194, 28),
 ('angiography', 195, 206, 29),
 ('was', 207, 210, 30),
 ('performed', 211, 220, 31),
 (',', 220, 221, 32),
 ('which', 222, 227, 33),
 ('showed', 228, 234, 34),
 ('a', 235, 236, 35),
 ('Cognard', 237, 244, 36),
 ('II', 245, 247, 37),
 ('DVAF', 248, 252, 38),
 ('in',

In [50]:
# Turn the character-level annotations of `example` into token-level spans of
# the form [first_token_idx, last_token_idx, label], both indices inclusive.
gliner_entities = []
entities = example["entities"]
for annotation in entities:
    span_start, span_end, raw_label = (
        annotation["start"],
        annotation["end"],
        annotation["label"],
    )
    # Indices of the tokens that lie entirely inside the annotated character range.
    covered = [
        tok_idx
        for _tok_text, tok_start, tok_end, tok_idx in token_offsets
        if span_start <= tok_start and tok_end <= span_end
    ]
    if not covered:
        # No token fits inside the range, so this annotation is left out.
        continue
    # Underscores become spaces; capitalize() upper-cases the first character
    # and lower-cases the rest.
    pretty_label = raw_label.replace("_", " ").capitalize()
    gliner_entities.append([covered[0], covered[-1], pretty_label])

In [52]:
# Display the spans ordered by their first token index. sorted() returns a new
# list, so `gliner_entities` itself keeps its original order.
sorted(gliner_entities, key=lambda x: x[0])

[[1, 1, 'Age'],
 [2, 2, 'Sex'],
 [3, 3, 'Clinical event'],
 [6, 6, 'Nonbiological location'],
 [9, 9, 'Biological structure'],
 [10, 10, 'Sign symptom'],
 [13, 14, 'Diagnostic procedure'],
 [16, 17, 'Diagnostic procedure'],
 [19, 19, 'Lab value'],
 [22, 22, 'Disease disorder'],
 [26, 29, 'Diagnostic procedure'],
 [36, 38, 'Disease disorder'],
 [43, 45, 'Biological structure'],
 [47, 47, 'Biological structure'],
 [51, 51, 'Coreference'],
 [56, 57, 'Biological structure'],
 [60, 63, 'Biological structure'],
 [67, 70, 'Biological structure'],
 [73, 74, 'Biological structure'],
 [78, 79, 'Sign symptom'],
 [81, 85, 'Detailed description'],
 [87, 88, 'Therapeutic procedure'],
 [99, 99, 'Detailed description'],
 [100, 100, 'Biological structure'],
 [101, 101, 'Coreference'],
 [107, 108, 'Therapeutic procedure'],
 [110, 111, 'Medication'],
 [114, 116, 'Therapeutic procedure'],
 [118, 120, 'Therapeutic procedure'],
 [125, 128, 'Therapeutic procedure'],
 [131, 134, 'Biological structure'],
 [136