In [49]:
# imports
import tqdm as notebook_tqdm
import os
import re
import logging
import pandas as pd
from pprint import pprint
import numpy as np
from seqeval.metrics import (precision_score,
                             recall_score,
                             f1_score,
                             classification_report)

from datasets import (load_dataset,
                      DatasetDict, 
                      Features, 
                      Sequence, 
                      ClassLabel, 
                      Value, 
                      interleave_datasets, 
                      get_dataset_config_names, 
                      load_dataset, 
                      load_from_disk
)

from transformers import (
    AutoTokenizer,
    AutoModelForTokenClassification,
    DataCollatorForTokenClassification,
    Trainer,
    TrainingArguments,
    set_seed
)

from utility_functions import (split_sources,
                            #    whitespace_tokens_with_spans,
                            #    spans_to_bio_labels,
                               build_bio_label_list_from_sources, 
                            #    make_to_features, 
                            #    process_all,
                            #    normalize_types
)

## 1. Data Loading and Exploration

In [50]:
# Enumerate every configuration (subset) that the hub exposes for this dataset.
# (Earlier hub location, kept for reference: "bigbio/swedish_medical_ner")
configs = get_dataset_config_names("community-datasets/swedish_medical_ner")
# One header line followed by one "- name" line per configuration.
print("Available configurations:", *(f"- {config}" for config in configs), sep="\n")

Available configurations:
- 1177
- lt
- wiki


In [51]:
# Every configuration is loaded, preferring the local on-disk copy over the hub.

def _load_config(config):
    """Return the dataset for ``config``, read from disk when cached, else downloaded.

    The local copy lives in ``data/swedish_medical_ner_<config>`` and is considered
    present when its ``train`` sub-directory exists. If reading that copy fails, the
    error is reported and the configuration is downloaded from the hub instead, so a
    broken cache can no longer make a configuration silently disappear from
    ``kb_datasets``. A downloaded configuration is written to the local directory.

    The configuration name is attached to the returned object as ``config_name``
    because later cells read it back from there.
    """
    local_dir = f"data/swedish_medical_ner_{config}"
    print(f"Attempting to load from: {local_dir}")

    ds = None
    if os.path.isdir(f"{local_dir}/train"):
        print(f"Loading configuration: {config} from disk")
        try:
            ds = load_from_disk(local_dir)
        except Exception as e:
            # Report and fall through to the hub download below instead of skipping.
            print(f"Failed to load dataset from disk for config {config}: {e}")

    if ds is None:
        print(f"Loading configuration: {config} from the huggingface hub")
        ds = load_dataset("community-datasets/swedish_medical_ner", config)
        print(f"- {config}")
        # Saving to disk so the next run can skip the download.
        ds.save_to_disk(local_dir)

    ds.config_name = config  # Attach config name as an attribute for later access.
    return ds


kb_datasets = []
for config in configs:
    ds = _load_config(config)
    kb_datasets.append(ds)
    pprint(ds["train"].features)  # Display schema

Attempting to load from: data/swedish_medical_ner_1177
Loading configuration: 1177 from disk
{'entities': Sequence(feature={'end': Value(dtype='int32', id=None),
                               'start': Value(dtype='int32', id=None),
                               'text': Value(dtype='string', id=None),
                               'type': ClassLabel(names=['Disorder and Finding',
                                                         'Pharmaceutical Drug',
                                                         'Body Structure'],
                                                  id=None)},
                      length=-1,
                      id=None),
 'sentence': Value(dtype='string', id=None),
 'sid': Value(dtype='string', id=None)}
Attempting to load from: data/swedish_medical_ner_lt
Loading configuration: lt from disk
{'entities': Sequence(feature={'end': Value(dtype='int32', id=None),
                               'start': Value(dtype='int32', id=None),
                   

In [71]:
# A first look at each loaded configuration: size, column count and a small preview.

# Never truncate cell contents, so whole sentences and entity dicts stay readable.
pd.set_option('display.max_colwidth', None)

for i, dataset in enumerate(kb_datasets):
    train_split = dataset["train"]

    print(f"### Configuration {i + 1}: {dataset.config_name}")
    print(f"Rows: {train_split.num_rows}")  # number of examples in the train split
    print(f"Columns: {len(train_split.features)}")  # number of columns/features

    # Render the first ten rows as a pandas dataframe.
    first_ten = train_split.select(range(10))
    example = first_ten.to_pandas()
    display(example)

### Configuration 1: 1177
Rows: 927
Columns: 3


Unnamed: 0,sid,sentence,entities
0,1177_0,Memantin ( Ebixa ) ger sällan några biverkningar.,"{'start': [9], 'end': [18], 'text': ['Ebixa'], 'type': [0]}"
1,1177_1,Det är också lättare att dosera [ flytande medicin ] än att dela på tabletter.,"{'start': [32], 'end': [52], 'text': ['flytande medicin'], 'type': [1]}"
2,1177_2,( Förstoppning ) är ett vanligt problem hos äldre.,"{'start': [0], 'end': [16], 'text': ['Förstoppning'], 'type': [0]}"
3,1177_3,[ Medicinen ] kan också göra att man blöder lättare eftersom den påverkar { blodets } förmåga att levra sig.,"{'start': [0, 74], 'end': [13, 85], 'text': ['Medicinen', 'blodets'], 'type': [1, 2]}"
4,1177_4,Barn har större möjligheter att samarbeta om de i förväg får veta vad som ska hända.,"{'start': [], 'end': [], 'text': [], 'type': []}"
5,1177_5,Eftersom de påverkar hela kroppen mer än övriga mediciner bör man bara ta dem när olika kombinationer av receptfria mediciner inte hjälper.,"{'start': [], 'end': [], 'text': [], 'type': []}"
6,1177_6,För att få ett skydd mot ( hepatit B ) behövs tre doser vaccin.,"{'start': [25], 'end': [38], 'text': ['hepatit B'], 'type': [0]}"
7,1177_7,Effekten av naproxen sitter i längre och varar cirka 12 timmar jämfört med cirka 6 timmar för ibuprofen.,"{'start': [], 'end': [], 'text': [], 'type': []}"
8,1177_8,[ Cox-hämmare ] finns även som gel och sprej.,"{'start': [0], 'end': [15], 'text': ['Cox-hämmare'], 'type': [1]}"
9,1177_9,"Det är bra om ett litet barn är mätt och utsövt, eftersom de flesta påfrestningar då känns mindre.","{'start': [], 'end': [], 'text': [], 'type': []}"


### Configuration 2: lt
Rows: 745753
Columns: 3


Unnamed: 0,sid,sentence,entities
0,lt_0,", (hjärtinfarkt) och (syndrom) som vi nu år 1999 inte ens vet na","{'start': [2, 21], 'end': [16, 30], 'text': ['hjärtinfarkt', 'syndrom'], 'type': [0, 0]}"
1,lt_1,"tinernas goda effekt på morbiditeten är välkänd, och data hi","{'start': [], 'end': [], 'text': [], 'type': []}"
2,lt_2,"[sukralfat], [lakrits] och vismut) som kunde utgöra ett skydd öv","{'start': [0, 13], 'end': [11, 22], 'text': ['sukralfat', 'lakrits'], 'type': [1, 1]}"
3,lt_3,och tveksamhet {vad} gäller operationsindikationen kan man ha,"{'start': [16], 'end': [21], 'text': ['vad'], 'type': [2]}"
4,lt_4,1989 blev en anmälningspliktig (sjukdom) enligt Smittskyddsla,"{'start': [32], 'end': [41], 'text': ['sjukdom'], 'type': [0]}"
5,lt_5,kombinerat med remodellering av (hjärtat). Detta säkras genom,"{'start': [32], 'end': [41], 'text': ['hjärtat'], 'type': [0]}"
6,lt_6,olyckshändelse radikalt förändrat deras liv. {Sigmoideum} är,"{'start': [46], 'end': [58], 'text': ['Sigmoideum'], 'type': [2]}"
7,lt_7,ra att hon samtidigt ordinerade [Cyklokapron] i en mängd av 5,"{'start': [32], 'end': [45], 'text': ['Cyklokapron'], 'type': [1]}"
8,lt_8,till vara erfarenheterna och föra ut kunskapen till sjukvård,"{'start': [], 'end': [], 'text': [], 'type': []}"
9,lt_9,es kring behandling med betablockad vid (kronisk hjärtsvikt).,"{'start': [40], 'end': [60], 'text': ['kronisk hjärtsvikt'], 'type': [0]}"


### Configuration 3: wiki
Rows: 48720
Columns: 3


Unnamed: 0,sid,sentence,entities
0,wiki_0,"{kropp} beskrivs i till exempel människokroppen, anatomi och f","{'start': [0], 'end': [7], 'text': ['kropp'], 'type': [2]}"
1,wiki_1,"sju miljoner år gammalt hominint {kranium}, klassificerad som","{'start': [33], 'end': [42], 'text': ['kranium'], 'type': [2]}"
2,wiki_2,autosomer och ett par könskromosomer. Varje {kromosom} består,"{'start': [45], 'end': [55], 'text': ['kromosom'], 'type': [2]}"
3,wiki_3,{kromosom} består av en DNA-molekyl och {protein}. En DNA-molek,"{'start': [1], 'end': [50], 'text': ['kromosom} består av en DNA-molekyl och {protein'], 'type': [2]}"
4,wiki_4,tikel:Människans {skelett} Människans skelett är det skelett s,"{'start': [17], 'end': [26], 'text': ['skelett'], 'type': [2]}"
5,wiki_5,os människor. En vuxen människas {skelett} består av 206 till,"{'start': [33], 'end': [42], 'text': ['skelett'], 'type': [2]}"
6,wiki_6,"{lett} består av 206 till 220 {ben}, beroende på hur man räknar.","{'start': [0], 'end': [35], 'text': ['lett} består av 206 till 220 {ben'], 'type': [2]}"
7,wiki_7,v kroppsvikten.Ett nyfött barn har ca 300 {ben} i kroppen vilk,"{'start': [42], 'end': [47], 'text': ['ben'], 'type': [2]}"
8,wiki_8,kollektivet i mindre bitar såsom länder > städer > orter {Hud},"{'start': [57], 'end': [62], 'text': ['Hud'], 'type': [2]}"
9,wiki_9,sdjur. {Huden} utgör ett mekaniskt skydd mot omvärlden och bid,"{'start': [7], 'end': [14], 'text': ['Huden'], 'type': [2]}"


We have three configurations (subsets) with varying size

The data of interest are:
* Passage text (a full sentence (1177) or part of a sentence (wiki, lt)).
* The Named Entities (bracketed using: (), [] {}),
* their starting and ending positions: start, end,
* and their types (0,1 and 2).

The types refer to the different types of named entities (we may also call them labels or classes). The ids follow the order of the `ClassLabel` names in the schema printed above:
* 'Disorder and Finding': 0
* 'Pharmaceutical Drug': 1
* 'Body Structure': 2


## The task: NER


The task of Named Entity Recognition (NER) is one of sequence labeling, where each token in a sentence must be tagged.

The following sentence contains two Named Entities of different types (1,2):

`[ Medicinen ] kan också göra att man blöder lättare eftersom den påverkar { blodets } förmåga att levra sig.	{'start': [0, 74], 'end': [13, 85], 'text': ['Medicinen', 'blodets'], 'type': [1, 2]}`

As we can see, the raw data gives the named entities as *spans* with start/end positions.
The logical next step is to convert the spans to *per-token labels*, i.e. to associate each token within a sentence with its corresponding type label.

First however, we split the configurations into training and validation sets

### 1) Split each config into train/val

In [53]:
# Split every configuration into train/validation; 5% is held out and the seed is fixed.
split_kwargs = {"val_fraction": 0.05, "seed": 42}
per_source_raw = split_sources(kb_datasets, **split_kwargs)
print(per_source_raw)

{'1177': DatasetDict({
    train: Dataset({
        features: ['sid', 'sentence', 'entities', 'source'],
        num_rows: 880
    })
    validation: Dataset({
        features: ['sid', 'sentence', 'entities', 'source'],
        num_rows: 47
    })
}), 'lt': DatasetDict({
    train: Dataset({
        features: ['sid', 'sentence', 'entities', 'source'],
        num_rows: 708465
    })
    validation: Dataset({
        features: ['sid', 'sentence', 'entities', 'source'],
        num_rows: 37288
    })
}), 'wiki': DatasetDict({
    train: Dataset({
        features: ['sid', 'sentence', 'entities', 'source'],
        num_rows: 46284
    })
    validation: Dataset({
        features: ['sid', 'sentence', 'entities', 'source'],
        num_rows: 2436
    })
})}


### 2) Convert entities to list-of-dicts format.

We keep `type` as an `int` (the `ClassLabel` id) in order to keep the downstream code cleaner and to simplify iteration.

In [54]:
def dict_of_lists_to_list_of_dicts(entities_dict):
    """Turn the column-wise entity dict into one dict per entity.

    ``entities_dict`` holds the parallel lists ``start``, ``end``, ``text`` and
    ``type``; the i-th items of those lists together describe the i-th entity.
    Returns a list of ``{"start", "end", "text", "type"}`` dicts in the same order.
    """
    fields = ("start", "end", "text", "type")
    columns = [entities_dict[field] for field in fields]
    # Walk the four lists in lockstep and re-attach the field names to each row.
    return [dict(zip(fields, row)) for row in zip(*columns)]

# Apply to every split in the already-split dict: per_source_raw (1177/lt/wiki)
def _with_entity_records(ex):
    """Return the example with `entities` rewritten as a list of per-entity dicts."""
    return {
        **ex,
        "entities": dict_of_lists_to_list_of_dicts(ex["entities"]),
    }


per_source_norm = {}
for cfg, ds in per_source_raw.items():
    per_source_norm[cfg] = ds.map(_with_entity_records)
# We inspect the first few examples 
#per_source_norm["1177"]["train"].select(range(3)).to_pandas()[["sid","sentence","entities"]]
#per_source_norm["1177"]["validation"].select(range(3)).to_pandas()[["sid","sentence","entities"]]


In [55]:
# Sanity check: show the converted entities of the first example in every split.
for cfg, ds in per_source_norm.items():
    print(f"Config: {cfg}")
    for split in ds:
        first_entities = ds[split][0]["entities"]
        print(f"  Split: {split}, first example 'entities': {first_entities}")
    print("---")


Config: 1177
  Split: train, first example 'entities': [{'start': 0, 'end': 11, 'text': 'Alvedon', 'type': 1}, {'start': 46, 'end': 56, 'text': 'munnen', 'type': 2}, {'start': 70, 'end': 101, 'text': 'munsönderfallande tabletter', 'type': 1}]
  Split: validation, first example 'entities': [{'start': 0, 'end': 10, 'text': 'Demens', 'type': 0}]
---
Config: lt
  Split: train, first example 'entities': [{'start': 11, 'end': 20, 'text': 'syndrom', 'type': 0}]
  Split: validation, first example 'entities': [{'start': 41, 'end': 52, 'text': 'läkemedel', 'type': 1}]
---
Config: wiki
  Split: train, first example 'entities': [{'start': 34, 'end': 43, 'text': 'lysosom', 'type': 2}]
  Split: validation, first example 'entities': [{'start': 13, 'end': 32, 'text': 'limbiska systemet', 'type': 2}]
---


We now have the entities as a list: `list[{"start","end","text","type"}]`, and the type as an `int` (ClassLabel id), matching the dataset’s schema.


### 3) Build global BIO labels (union over configs)
We'll keep the BIO tags readable (B-body_structure, etc.) by mapping the int codes to the official names during featurization. No dataset mutation needed.

In [56]:
# 
# BIO label inventory plus the two lookup tables (label -> id and id -> label).
label_list = build_bio_label_list_from_sources(per_source_raw)
label2id = {label: index for index, label in enumerate(label_list)}
id2label = {index: label for label, index in label2id.items()}

for title, value in (
    ("label_list", label_list),
    ("label2id", label2id),
    ("id2label", id2label),
):
    print(f"{title}: {value}")


label_list: ['O', 'B-body_structure', 'I-body_structure', 'B-disorder_finding', 'I-disorder_finding', 'B-pharmaceutical_drug', 'I-pharmaceutical_drug']
label2id: {'O': 0, 'B-body_structure': 1, 'I-body_structure': 2, 'B-disorder_finding': 3, 'I-disorder_finding': 4, 'B-pharmaceutical_drug': 5, 'I-pharmaceutical_drug': 6}
id2label: {0: 'O', 1: 'B-body_structure', 2: 'I-body_structure', 3: 'B-disorder_finding', 4: 'I-disorder_finding', 5: 'B-pharmaceutical_drug', 6: 'I-pharmaceutical_drug'}


### 4) Normalize entities (shape + types)

In [57]:
###
# 3a) Iterator that yields (start, end, type_token) regardless of shape
# Official ClassLabel names -> short tokens used inside the BIO tags.
NAME_TO_TOKEN = {
    "Disorder and Finding": "disorder_finding",
    "Pharmaceutical Drug": "pharmaceutical_drug",
    "Body Structure": "body_structure",
}

def iter_entities(example, type_names):
    """Yield ``(start, end, type_token)`` for every entity of an example.

    Args:
        example: Mapping with an optional ``"entities"`` entry holding a list of
            dicts with the keys ``start``, ``end`` and ``type``. A missing or
            ``None`` entry is treated as "no entities".
        type_names: Sequence of class names, indexed by the integer type id.

    Yields:
        Tuples ``(start, end, token)``. ``start``/``end`` are passed through
        unchanged. ``token`` is the ``NAME_TO_TOKEN`` entry for the type name, or
        the lower-cased name with spaces replaced by underscores if unmapped.
    """
    # `or []` covers both a missing key and an explicit None, which used to raise
    # TypeError when iterated.
    entities = example.get("entities") or []

    for d in entities:
        t = d.get("type")
        # Accept any integer-like id (anything usable as an index), not only `int`;
        # otherwise such ids would be stringified and never match a class name.
        is_class_id = isinstance(t, int) or hasattr(t, "__index__")
        name = type_names[t] if is_class_id else str(t)
        yield (d.get("start"), d.get("end"), NAME_TO_TOKEN.get(name, name.lower().replace(" ", "_")))




In [73]:
# Characters that may pad an annotated span but are not part of the entity itself.
WHITESPACE_OR_BRACKETS = set(" \n\t()[]{}")

def trim_spans(sentence, start, end):
    """Shrink ``[start, end)`` so it no longer begins or ends with padding.

    Leading and trailing characters found in ``WHITESPACE_OR_BRACKETS`` (space,
    newline, tab and the three bracket pairs) are stripped from the span.

    Args:
        sentence: The text the offsets refer to.
        start: Inclusive start offset of the span.
        end: Exclusive end offset of the span.

    Returns:
        ``(start, end)`` after trimming. When the span holds nothing but padding,
        the two values are equal (an empty span).
    """
    # Clamp to the sentence first: an `end` past the text used to raise IndexError
    # and a negative `start` silently indexed from the end of the string.
    start = max(start, 0)
    end = min(end, len(sentence))

    while start < end and sentence[start] in WHITESPACE_OR_BRACKETS:
        start += 1
    while start < end and sentence[end - 1] in WHITESPACE_OR_BRACKETS:
        end -= 1
    return (start, end)

def make_to_features_offset(tokenizer, label2id, type_names, max_length=256):
    """Build a featurizer that turns one example into token ids plus BIO label ids.

    Args:
        tokenizer: Callable tokenizer; it is called with
            ``return_offsets_mapping=True`` and must return a mapping that contains
            ``"offset_mapping"`` (one ``(start, end)`` character pair per token).
        label2id: Mapping from BIO label string (``"O"``, ``"B-..."``, ``"I-..."``)
            to integer id. Must contain ``"O"``.
        type_names: Sequence of entity class names, indexed by integer type id.
        max_length: Truncation length handed to the tokenizer.

    Returns:
        ``to_features(ex, TRIM_SPANS=True)``: takes an example with ``"sentence"``
        and optional ``"entities"`` (list of dicts with ``start``, ``end``,
        ``type``) and returns the tokenizer output without ``"offset_mapping"``
        and with an added ``"labels"`` list.
    """
    NAME_TO_TOKEN = {
        "Disorder and Finding": "disorder_finding",
        "Pharmaceutical Drug": "pharmaceutical_drug",
        "Body Structure": "body_structure",
    }

    def type_token(t):
        """Map an entity type (integer id or class name) to its BIO tag token."""
        # Accept any integer-like id, not only `int`: an id of another integer type
        # used to be stringified, produced an unknown label and was then silently
        # labelled "O" by the fallback further down.
        is_class_id = isinstance(t, int) or hasattr(t, "__index__")
        name = type_names[t] if is_class_id else str(t)
        return NAME_TO_TOKEN.get(name, name.lower().replace(" ", "_"))

    def to_features(ex, TRIM_SPANS=True):
        """Tokenize one example and label every token from the entity spans.

        With ``TRIM_SPANS`` set, each span is first passed through ``trim_spans``
        so that surrounding brackets/whitespace are excluded from the entity.
        """
        text = ex["sentence"]
        entities = ex.get("entities", []) or []

        enc = tokenizer(text, truncation=True, max_length=max_length, return_offsets_mapping=True)
        offsets = enc["offset_mapping"]

        # Work on local (start, end, token) copies; the example itself is not modified.
        ent_spans = []
        for e in entities:
            est, eend = e["start"], e["end"]
            if TRIM_SPANS:
                est, eend = trim_spans(text, est, eend)   # trim away brackets
            if est >= eend:
                # Nothing left of the span, so it cannot label any token.
                continue
            etok = type_token(e["type"])
            ent_spans.append((est, eend, etok))

        labels = []
        for (ts, te) in offsets:
            if ts == te:
                # Zero-width offset (special tokens): -100 is the conventional
                # "ignore this position" label id.
                labels.append(-100)
                continue
            lab = "O"
            for (est, eend, etok) in ent_spans:
                if not (te <= est or ts >= eend):    # token overlaps the span
                    # The token covering the span start opens the entity (B-);
                    # every other overlapping token continues it (I-).
                    lab = f"B-{etok}" if ts <= est < te else f"I-{etok}"
                    break   # first matching span wins
            # Labels missing from label2id fall back to "O".
            labels.append(label2id.get(lab, label2id["O"]))

        enc.pop("offset_mapping")
        enc["labels"] = labels
        return enc

    return to_features


In [59]:
# # 3b) Minimal offset-based featurizer (no whitespace pre-tokens)
# def make_to_features_offset(tokenizer, label2id, type_names, max_length=256):
#     def to_features(ex):
#         text = ex["sentence"]
#         enc = tokenizer(text, truncation=True, max_length=max_length, return_offsets_mapping=True)
#         offsets = enc["offset_mapping"]

#         # collect entity spans once
#         ent_spans = list(iter_entities(ex, type_names))  # [(start, end, 'pharmaceutical_drug'), ...]

#         labels = []
#         for (ts, te) in offsets:
#             if ts == te:          # special tokens
#                 labels.append(-100)
#                 continue
#             lab = "O"
#             for (est, eend, etok) in ent_spans:
#                 if not (te <= est or ts >= eend):   # overlap
#                     lab = f"B-{etok}" if ts <= est < te else f"I-{etok}"
#                     break
#             labels.append(label2id.get(lab, label2id["O"]))

#         enc.pop("offset_mapping")
#         enc["labels"] = labels
#         return enc
#     return to_features


In [74]:
from transformers import AutoTokenizer
from datasets import DatasetDict

# The featurizer asks the tokenizer for offset mappings, hence the fast tokenizer.
tokenizer = AutoTokenizer.from_pretrained("KB/bert-base-swedish-cased", use_fast=True)

# Class names are read from the RAW split, which still carries the ClassLabel feature.
# Ad hoc: only the 1177 configuration is consulted.
# Expected: ['Disorder and Finding', 'Pharmaceutical Drug', 'Body Structure']
raw_validation = per_source_raw["1177"]["validation"]
type_names = raw_validation.features["entities"].feature["type"].names

# The normalized split (entities as a list of dicts) is the one that gets featurized.
split = per_source_norm["1177"]["validation"]

to_features = make_to_features_offset(tokenizer, label2id, type_names, max_length=256)

map_options = dict(
    batched=False,
    remove_columns=split.column_names,
    desc="[1177] validation featurization",
)
ds_1177_val_feats = split.map(to_features, **map_options)



[1177] validation featurization: 100%|██████████| 47/47 [00:00<00:00, 2129.48 examples/s]


In [None]:
#split.select(range(2)).to_pandas()[["sid", "sentence", "entities"]]

Unnamed: 0,sid,sentence,entities
0,1177_650,( Demens ) innebär att man på olika sätt får svårt att minnas och att tolka sin omgivning.,"[{'start': 0, 'end': 10, 'text': 'Demens', 'type': 0}]"
1,1177_408,"Rökare har stor risk att utveckla sjukdomen ( kol ) , ( kronisk obstruktiv lungsjukdom ) .","[{'start': 44, 'end': 51, 'text': 'kol', 'type': 0}, {'start': 54, 'end': 88, 'text': 'kronisk obstruktiv lungsjukdom', 'type': 0}]"


In [76]:
# Inspect one validation example: raw spans first, then every token with its label.
ex0 = per_source_norm["1177"]["validation"][2]
print(ex0["sentence"])
print("entities:", list(iter_entities(ex0, type_names)))

# Tokenize once for display (tokens + offsets) and featurize once for the labels.
enc0 = tokenizer(ex0["sentence"], return_offsets_mapping=True, truncation=True, max_length=256)
featurize = make_to_features_offset(tokenizer, label2id, type_names)
labs0 = featurize(ex0)["labels"]
tokens = tokenizer.convert_ids_to_tokens(enc0["input_ids"])

for tok, off, lab_id in zip(tokens, enc0["offset_mapping"], labs0):
    # Positions labelled -100 are not shown.
    if lab_id != -100:
        print(f"{tok:15} {off}  {id2label[lab_id]}")


Nysningar och ( nästäppa ) kan ofta dämpas av [ nässprej ] , kliande och ( svullna ögon ) går att behandla med [ ögondroppar ] .
entities: [(14, 26, 'disorder_finding'), (46, 58, 'pharmaceutical_drug'), (73, 89, 'disorder_finding'), (111, 126, 'pharmaceutical_drug')]
Ny              (0, 2)  O
##sn            (2, 4)  O
##ingar         (4, 9)  O
och             (10, 13)  O
(               (14, 15)  O
näst            (16, 20)  B-disorder_finding
##äpp           (20, 23)  I-disorder_finding
##a             (23, 24)  I-disorder_finding
)               (25, 26)  O
kan             (27, 30)  O
ofta            (31, 35)  O
dämpa           (36, 41)  O
##s             (41, 42)  O
av              (43, 45)  O
[               (46, 47)  O
näs             (48, 51)  B-pharmaceutical_drug
##spre          (51, 55)  I-pharmaceutical_drug
##j             (55, 56)  I-pharmaceutical_drug
]               (57, 58)  O
,               (59, 60)  O
kli             (61, 64)  O
##ande          (64, 68)  O
och        

Note that the brackets `()`, `[]`, `{}` are now labeled as O. The start and end positions stored in the data are unchanged, however, and still point to the brackets; only the BIO labels are aligned with the real entity content. More importantly, the text has been further tokenized into subword tokens using `AutoTokenizer`.