# package requirements & setup

In [1]:
import json  # Work with JSON files (load, save, parse)
import re  # Use regular expressions for pattern matching and text cleaning
from pathlib import Path  # Handle file/folder paths in an OS-independent way
import pandas as pd  # Load, manipulate, and analyze tabular data

from sklearn.model_selection import train_test_split  # Split data into training and testing sets
from sklearn.metrics import classification_report, precision_score, recall_score, f1_score  # Evaluate model performance

from transformers import AutoTokenizer, AutoModelForTokenClassification, Trainer
# AutoTokenizer: prepares text for model input
# AutoModelForTokenClassification: loads a pre-trained model for tasks like NER
# Trainer: simplifies training and evaluation of Hugging Face models

from datasets import Dataset, DatasetDict
# Dataset: a Hugging Face object for structured data (like a DataFrame)
# DatasetDict: stores multiple datasets (e.g., train/test splits)

import evaluate  # Load prebuilt evaluation metrics (like accuracy, precision, etc.)

import torch  # Deep learning framework used by Hugging Face models (handles tensors, GPU support)

import os # For mainflow parts
from transformers import TrainingArguments # from model setup part
import accelerate # for model setup part

  from .autonotebook import tqdm as notebook_tqdm


In [2]:

# Define directories
BASE_DIR = Path().resolve()  # resolves to current working directory
TXT_DIR = BASE_DIR / "Inclusion_Criteria_Text_500_File"
SEMANTIC_DIR = BASE_DIR / "Semantic_Entity_Dictionary"
OUTPUT_DIR = BASE_DIR / "outputs"
OUTPUT_DIR.mkdir(exist_ok=True)

ENTITY_OUTPUT_FILE = BASE_DIR / "entity_extraction_BERT.xlsx"
REPORT_FILE = BASE_DIR / "training_report_BERT.txt"


# ------------------ TEXT CLEANING & LOADING ------------------ #

In [3]:

# Load semantic dictionary
def load_semantics(directory):
    semantics = {}
    for file in directory.glob("*.txt"):
        with open(file, "r", encoding="utf-8") as f:
            semantics[file.stem] = [line.strip().lower() for line in f]
    return semantics


# Clean text for model input
def clean_text(text):
    # Extract inclusion criteria section
    pattern = r"inclusion criteria.*?(?=(exclusion criteria|key exclusion criteria|$))"
    match = re.search(pattern, text, re.IGNORECASE | re.DOTALL)
    extracted = match.group(0) if match else text

    # Clean up slashes and line breaks
    extracted = re.sub(r'\\', '', extracted)
    extracted = extracted.replace('\n', '~').replace('\r', '~')
    extracted = re.sub(r'\s+', ' ', extracted)
    return extracted.strip()


# Preprocess Text to Replace <240 with a Placeholder
def preprocess_text_for_bert(text):
    text = re.sub(r'\s*([<>≤≥=])\s*(\d+)', r' \1 \2', text)  # Normalize spaces, keep numeric expressions intact
    return text


# Load txt and apply text cleaning
def load_txt(directory):
    data = []
    for file in directory.glob("*.txt"):
        print(f"Loading file: {file}")  # Debugging line to check which files are being loaded
        with open(file, encoding="utf-8") as f:
            content = f.read()
            print(f"File content preview: {content[:100]}")  # Preview first 100 characters
            text = clean_text(content)
            text = preprocess_text_for_bert(text)
            data.append({"filename": file.name, "text": text})
    return data




# ------------------ TOKENIZATION AND LABELING ------------------ #

In [4]:
# Tokenization using regex, label with BIO format
def tokenize_and_label(text, semantics, filename):
    # Tokenization using regex (custom defined to include < and handle complex values)
    tokens = re.findall(r"(?:[<>≤≥=]\s?\d+\s?mg/dl)|[a-zA-Z0-9\(\)\-\'%≤≥=<>/.]+|\b\d{4}\b", text.lower())  # Add \b\d{4}\b for year capture
    labels = ["O"] * len(tokens)
    entities_found = []
    captured_ranges = []
    lowered_text = text.lower()

    # Build entity list with pre-tokenized versions
    sorted_semantics = []
    for group, entities in semantics.items():
        for entity in entities:
            sorted_semantics.append({
                "group": group,
                "entity": entity,
                "tokens": re.findall(r"[a-zA-Z0-9\(\)\-\'%≤≥=<>/.]+|[ENTITY_NUMBER]+", entity.lower())
                # Updated regex for < symbols
            })

    # Sort entities by length for accurate BIO tagging
    sorted_semantics = sorted(sorted_semantics, key=lambda x: len(x["entity"]), reverse=True)

    for entity_info in sorted_semantics:
        group = entity_info["group"]
        entity = entity_info["entity"]
        pattern = fr"(?<!\w){re.escape(entity)}(?!\w)"

        for match in re.finditer(pattern, lowered_text):
            start_idx, end_idx = match.start(), match.end()

            # Skip overlapping ranges
            if any(start_idx < e and end_idx > s for s, e in captured_ranges):
                continue

            entity_tokens = entity_info["tokens"]
            for i in range(len(tokens) - len(entity_tokens) + 1):
                if tokens[i:i + len(entity_tokens)] == entity_tokens:
                    labels[i] = f"B-{group}"
                    for j in range(1, len(entity_tokens)):
                        labels[i + j] = f"I-{group}"
                    break

            entities_found.append({
                "semantic_group": group,
                "entity": entity,
                "filename": filename,
                "start": start_idx,
                "end": end_idx
            })
            captured_ranges.append((start_idx, end_idx))

    return tokens, labels, entities_found


# Apply tokenizer and labeler to full dataset
def prepare_dataset(txt_data, semantics):
    data = []
    all_entities = []
    for item in txt_data:
        tokens, labels, entities_found = tokenize_and_label(item["text"], semantics, item["filename"])
        data.append({
            "filename": item["filename"],
            "tokens": tokens,
            "ner_tags": labels
        })
        all_entities.extend(entities_found)
    return data, all_entities


# Filter out overlaps and invalid entities
def filter_entities(entities):
    entities.sort(key=lambda x: (x["filename"], x["semantic_group"], x["start"], x["end"]))
    filtered_entities = []
    seen_groups = {}

    for entity in entities:
        key = (entity["filename"], entity["semantic_group"], entity["entity"])
        if key not in seen_groups:
            filtered_entities.append(entity)
            seen_groups[key] = 1

    # Remove entities with negative indices
    filtered_entities = [e for e in filtered_entities if e["start"] >= 0 and e["end"] >= 0]
    return filtered_entities


# ------------------ MAIN FLOW ------------------ #

In [5]:

# Load semantics and txt
semantics = load_semantics(SEMANTIC_DIR)
txt_data = load_txt(TXT_DIR)


# Save clean text preview
cleaned_text_df = pd.DataFrame(txt_data)
CLEAN_TEXT_FILE = BASE_DIR / "clean_text_BERT.xlsx"
cleaned_text_df.to_excel(CLEAN_TEXT_FILE, index=False)
print(f"✅ Cleaned text saved to: {CLEAN_TEXT_FILE}")




Loading file: C:\Users\duco.veen\OneDrive - Julius Clinical\Documents\github\AlzheimerNER_txt\Inclusion_Criteria_Text_500_File\NCT00000172.txt
File content preview: Inclusion Criteria:~* Probable Alzheimer's disease~* Mini-Mental State Examination (MMSE) 10-22 and 
Loading file: C:\Users\duco.veen\OneDrive - Julius Clinical\Documents\github\AlzheimerNER_txt\Inclusion_Criteria_Text_500_File\NCT00000173.txt
File content preview: Inclusion Criteria:~* Memory complaints and memory difficulties which are verified by an informant.~
Loading file: C:\Users\duco.veen\OneDrive - Julius Clinical\Documents\github\AlzheimerNER_txt\Inclusion_Criteria_Text_500_File\NCT00000174.txt
File content preview: Inclusion Criteria:~* Are aged 55-85 years, inclusive. Subjects older than 85 years may be eligible 
Loading file: C:\Users\duco.veen\OneDrive - Julius Clinical\Documents\github\AlzheimerNER_txt\Inclusion_Criteria_Text_500_File\NCT00001662.txt
File content preview: INCLUSION CRITERIA - ALZHEIMER'S DISE

In [6]:
# Check if the directory exists
if not os.path.exists(TXT_DIR):
    print(f"❌ Directory does not exist: {TXT_DIR}")
else:
    print(f"✔️ Directory found: {TXT_DIR}")
    # List all files in the directory
    print("Files in TXT_DIR:", os.listdir(TXT_DIR))

if not txt_data:
    raise ValueError("❌ No data loaded from TXT files.")

✔️ Directory found: C:\Users\duco.veen\OneDrive - Julius Clinical\Documents\github\AlzheimerNER_txt\Inclusion_Criteria_Text_500_File
Files in TXT_DIR: ['NCT00000172.txt', 'NCT00000173.txt', 'NCT00000174.txt', 'NCT00001662.txt', 'NCT00001933.txt', 'NCT00004845.txt', 'NCT00006187.txt', 'NCT00007189.txt', 'NCT00010803.txt', 'NCT00013923.txt', 'NCT00018382.txt', 'NCT00024531.txt', 'NCT00031018.txt', 'NCT00036114.txt', 'NCT00041678.txt', 'NCT00051909.txt', 'NCT00053599.txt', 'NCT00056225.txt', 'NCT00069849.txt', 'NCT00071721.txt', 'NCT00074529.txt', 'NCT00082602.txt', 'NCT00083421.txt', 'NCT00083590.txt', 'NCT00087724.txt', 'NCT00088387.txt', 'NCT00088673.txt', 'NCT00090116.txt', 'NCT00093951.txt', 'NCT00095719.txt', 'NCT00096473.txt', 'NCT00097916.txt', 'NCT00099242.txt', 'NCT00100334.txt', 'NCT00103649.txt', 'NCT00104013.txt', 'NCT00104273.txt', 'NCT00105105.txt', 'NCT00105547.txt', 'NCT00112073.txt', 'NCT00130429.txt', 'NCT00142805.txt', 'NCT00151333.txt', 'NCT00151398.txt', 'NCT00153010

In [7]:
# Create training data
dataset, all_entities = prepare_dataset(txt_data, semantics)
filtered_entities = filter_entities(all_entities)

# Save entities
entities_df = pd.DataFrame(filtered_entities).sort_values(by=["filename", "start"]).reset_index(drop=True)
entities_df.to_excel(ENTITY_OUTPUT_FILE, index=False)
print(f"✅ Filtered entity extraction results saved to: {ENTITY_OUTPUT_FILE}")


✅ Filtered entity extraction results saved to: C:\Users\duco.veen\OneDrive - Julius Clinical\Documents\github\AlzheimerNER_txt\entity_extraction_BERT.xlsx


# ------------------ MODEL SETUP ------------------ #

In [8]:
train, test = train_test_split(dataset, test_size=0.2, random_state=42)
dataset_dict = DatasetDict({
    "train": Dataset.from_list(train),
    "test": Dataset.from_list(test)
})
print(f"Training data size: {len(train)}")
print(f"Testing data size: {len(test)}")
print(f"Training data preview: {train[:3]}")  # Preview of the first 3 rows
print(f"Testing data preview: {test[:3]}")    # Preview of the first 3 rows

Training data size: 400
Testing data size: 100
Training data preview: [{'filename': 'NCT02244541.txt', 'tokens': ['inclusion', 'criteria', '1.', 'diagnosis', 'of', 'probable', 'ad', 'in', 'accordance', 'with', 'nincds-adrda', 'criteria.', '2.', 'a', 'brain', 'ct', 'or', 'mri', 'scan', 'performed', 'within', 'last', '12', 'months', 'from', 'day', 'of', 'screening', 'consistent', 'with', 'the', 'clinical', 'diagnosis', 'of', 'probable', 'ad.', '3.', 'age', 'from', '55', 'to', '85', 'years', 'inclusive.', '4.', 'mmse', 'score', 'of', '16-28', 'inclusive.', '5.', 'rosen', 'modified', 'hachinski', 'ischemic', 'score', '≤', '4.', '6.', 'community', 'dwelling', 'with', 'caregiver', 'who', 'has', 'regular', 'contact', 'with', 'the', 'subject', 'for', 'at', 'least', '10', 'hours', 'per', 'week', 'and', 'is', 'able', 'to', 'oversee', 'the', "patient's", 'compliance', 'with', 'study', 'medication', 'and', 'participate', 'in', 'the', "patient's", 'clinical', 'assessment', 'and', 'is', 'capable', '

In [9]:
label_list = sorted({label for row in dataset for label in row["ner_tags"]})
label2id = {l: i for i, l in enumerate(label_list)}
id2label = {i: l for l, i in label2id.items()}

model_checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

## ⚠️ This is where BERT tokenizer aligns our tokenized words to original labels

In [10]:
# ⚠️ This is where BERT tokenizer aligns our tokenized words to original labels
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(
        examples["tokens"],
        truncation=True,
        padding=True,
        is_split_into_words=True
    )

    labels = []
    for i, label in enumerate(examples["ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:
                label_ids.append(label2id[label[word_idx]])
            else:
                label_ids.append(-100)
            previous_word_idx = word_idx
        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs


tokenized_datasets = dataset_dict.map(tokenize_and_align_labels, batched=True)

Map: 100%|██████████| 400/400 [00:00<00:00, 735.25 examples/s]
Map: 100%|██████████| 100/100 [00:00<00:00, 688.88 examples/s]


In [11]:
# Load model
model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    num_labels=len(label_list),
    id2label=id2label,
    label2id=label2id
)


Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [12]:
# Evaluation metric
metric = evaluate.load("seqeval")

def compute_metrics(p):
    predictions, labels = p
    predictions = predictions.argmax(-1)

    true_predictions = [
        [id2label[p] for (p, l) in zip(pred, label) if l != -100]
        for pred, label in zip(predictions, labels)
    ]
    true_labels = [
        [id2label[l] for (p, l) in zip(pred, label) if l != -100]
        for pred, label in zip(predictions, labels)
    ]

    # Flatten lists for evaluation
    flat_predictions = [item for sublist in true_predictions for item in sublist]
    flat_labels = [item for sublist in true_labels for item in sublist]

    # Generate classification report
    report = classification_report(flat_labels, flat_predictions, output_dict=True)

    return report

# don't run - load instead

In [None]:
from transformers import AutoModelForTokenClassification, AutoTokenizer

model = AutoModelForTokenClassification.from_pretrained(OUTPUT_DIR)
tokenizer = AutoTokenizer.from_pretrained(OUTPUT_DIR)

In [13]:
training_args = TrainingArguments(
    output_dir="./results",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=5,
    weight_decay=0.01,
    do_eval=True  # You can manually control evaluation if needed
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

# Train and evaluate
train_result = trainer.train()

  trainer = Trainer(


Step,Training Loss


In [14]:
# Save model
trainer.save_model(OUTPUT_DIR)
trainer.save_state()
print(f"✅ Pretrained model is saved to: {OUTPUT_DIR}")


✅ Pretrained model is saved to: C:\Users\duco.veen\OneDrive - Julius Clinical\Documents\github\AlzheimerNER_txt\outputs


In [15]:
# Save tokenizer
tokenizer.save_pretrained(OUTPUT_DIR)
print(f"✅ Tokenizer saved to: {OUTPUT_DIR}")

✅ Tokenizer saved to: C:\Users\duco.veen\OneDrive - Julius Clinical\Documents\github\AlzheimerNER_txt\outputs


In [16]:
# Run predictions on test set
results = trainer.predict(tokenized_datasets["test"])



In [17]:
# Save full classification report as plain .txt
REPORT_FILE = BASE_DIR / "classification_report.txt"
report = compute_metrics((results.predictions, results.label_ids))

with open(REPORT_FILE, "w", encoding="utf-8") as f:
    for key, value in report.items():
        f.write(f"{key}: {value}\n")

print(f"✅ Classification report saved to: {REPORT_FILE}")

# Save label list as plain .txt
with open(OUTPUT_DIR / "label_list.txt", "w", encoding="utf-8") as f:
    for label in label_list:
        f.write(f"{label}\n")

# Save label mappings as plain .txt
with open(OUTPUT_DIR / "label_mappings.txt", "w", encoding="utf-8") as f:
    f.write("label2id:\n")
    for label, idx in label2id.items():
        f.write(f"  {label}: {idx}\n")
    f.write("\nid2label:\n")
    for idx, label in id2label.items():
        f.write(f"  {idx}: {label}\n")


print(f"✅ label_list, label2id, and id2label saved successfully to : {OUTPUT_DIR}.")

✅ Classification report saved to: C:\Users\duco.veen\OneDrive - Julius Clinical\Documents\github\AlzheimerNER_txt\classification_report.txt
✅ label_list, label2id, and id2label saved successfully to : C:\Users\duco.veen\OneDrive - Julius Clinical\Documents\github\AlzheimerNER_txt\outputs.


# ------------------ SAVE TRAINING AND TESTING OUTPUT ------------------ #

In [18]:
# Save training and testing outputs
print("Sample of `train`:", train[0])

train_filenames = [item["filename"] for item in train]
test_filenames = [item["filename"] for item in test]

train_entities = [e for e in filtered_entities if e["filename"] in train_filenames]
test_entities = [e for e in filtered_entities if e["filename"] in test_filenames]


train_output_df = pd.DataFrame(train_entities).sort_values(by=["filename", "start"]).reset_index(drop=True)
train_output_df.to_excel(BASE_DIR / "train_output_BERT.xlsx", index=False)
print("✅ Training is saved as train_output_bert2")
print("Training output preview:")
print(train_output_df.head())

test_output_df = pd.DataFrame(test_entities).sort_values(by=["filename", "start"]).reset_index(drop=True)
test_output_df.to_excel(BASE_DIR / "test_output_BERT.xlsx", index=False)
print("✅ Testing is saved as test_output_bert2")
print("Testing output preview:")
print(test_output_df.head())



Sample of `train`: {'filename': 'NCT02244541.txt', 'tokens': ['inclusion', 'criteria', '1.', 'diagnosis', 'of', 'probable', 'ad', 'in', 'accordance', 'with', 'nincds-adrda', 'criteria.', '2.', 'a', 'brain', 'ct', 'or', 'mri', 'scan', 'performed', 'within', 'last', '12', 'months', 'from', 'day', 'of', 'screening', 'consistent', 'with', 'the', 'clinical', 'diagnosis', 'of', 'probable', 'ad.', '3.', 'age', 'from', '55', 'to', '85', 'years', 'inclusive.', '4.', 'mmse', 'score', 'of', '16-28', 'inclusive.', '5.', 'rosen', 'modified', 'hachinski', 'ischemic', 'score', '≤', '4.', '6.', 'community', 'dwelling', 'with', 'caregiver', 'who', 'has', 'regular', 'contact', 'with', 'the', 'subject', 'for', 'at', 'least', '10', 'hours', 'per', 'week', 'and', 'is', 'able', 'to', 'oversee', 'the', "patient's", 'compliance', 'with', 'study', 'medication', 'and', 'participate', 'in', 'the', "patient's", 'clinical', 'assessment', 'and', 'is', 'capable', 'of', 'accompanying', 'the', 'participant', 'on', 'al

# ------------------ EVALUATION ------------------ #

In [19]:

# Save evaluation result in text file
# After the training and evaluation
evaluation_result = compute_metrics((trainer.predict(tokenized_datasets["test"]).predictions, tokenized_datasets["test"]["labels"]))

# Convert the evaluation result into a pandas DataFrame for better readability
report_df = pd.DataFrame(evaluation_result).transpose()

# Save the report to a text file in a readable format
with open(REPORT_FILE, "w") as f:
    f.write("Evaluation Results:\n")
    report_df_str = report_df.to_string()
    f.write(report_df_str)


# Optionally, also print to the console
print("✅ Evaluation Results:")
print(report_df)




✅ Evaluation Results:
               precision    recall  f1-score       support
B-caregiver     0.721649  0.823529  0.769231     85.000000
B-condition     0.733945  0.800000  0.765550    200.000000
B-demography    0.888889  0.914286  0.901408    105.000000
B-drug          0.810000  0.952941  0.875676     85.000000
B-measurement   0.831250  0.917241  0.872131    145.000000
B-procedure     0.606061  0.833333  0.701754     24.000000
B-time          0.666667  0.657534  0.662069     73.000000
B-value         0.629630  0.739130  0.680000     92.000000
I-caregiver     0.722222  0.866667  0.787879     30.000000
I-condition     0.607143  0.708333  0.653846    192.000000
I-demography    0.830986  0.867647  0.848921     68.000000
I-drug          0.781250  0.961538  0.862069     26.000000
I-measurement   0.681818  0.859031  0.760234    227.000000
I-procedure     0.585366  0.941176  0.721805     51.000000
I-time          0.709845  0.728723  0.719160    188.000000
I-value         0.611111  0.619718

# Data exploration Duco

evaluation results seem to be at token level, now see for entity level.


In [None]:
# Recompute predictions and align them with true labels and tokens
predictions = results.predictions.argmax(-1)
labels = results.label_ids

# Get test tokens from original (pre-tokenized) test set
test_tokens = dataset_dict["test"]["tokens"]

# Store per-token comparison
comparison = []

for i in range(len(labels)):  # For each example in the test set
    word_ids = tokenized_datasets["test"].features["labels"].feature  # sequence features
    input_tokens = test_tokens[i]
    true_label_seq = labels[i]
    pred_label_seq = predictions[i]

    tokenized = tokenizer(input_tokens, is_split_into_words=True, truncation=True)
    aligned_word_ids = tokenized.word_ids()

    previous_word_idx = None
    token_index = 0
    for j, word_idx in enumerate(aligned_word_ids):
        if word_idx is None:
            continue
        if word_idx != previous_word_idx:
            token_text = input_tokens[word_idx]
            true_label = id2label[true_label_seq[j]] if true_label_seq[j] != -100 else "IGN"
            pred_label = id2label[pred_label_seq[j]] if pred_label_seq[j] != -100 else "IGN"

            if "I-value" in (true_label, pred_label):  # Only track I-value-related tokens
                comparison.append({
                    "token": token_text,
                    "true_label": true_label,
                    "pred_label": pred_label,
                    "match": true_label == pred_label,
                    "sequence_id": i
                })
        previous_word_idx = word_idx

# Convert to DataFrame for inspection
i_value_df = pd.DataFrame(comparison)

# Save to Excel or CSV
i_value_df.to_excel(BASE_DIR / "i_value_token_comparison.xlsx", index=False)
print(f"✅ I-value comparison saved to: {BASE_DIR / 'i_value_token_comparison.xlsx'}")


✅ I-value comparison saved to: C:\Users\duco.veen\OneDrive - Julius Clinical\Documents\github\AlzheimerNER_txt\i_value_token_comparison.xlsx


In [21]:
def extract_entities_from_labels(tokens, labels, target="I-value"):
    entities = []
    current = []
    for token, label in zip(tokens, labels):
        if label.endswith(target):
            current.append(token)
        else:
            if current:
                entities.append(" ".join(current))
                current = []
    if current:
        entities.append(" ".join(current))
    return entities

# Step 1: collect predicted and true I-value phrases per sample
records = []

for i in range(len(test)):
    tokens = test[i]["tokens"]
    label_ids = results.label_ids[i]
    pred_ids = results.predictions[i].argmax(-1)

    true_labels = [id2label[id_] if id_ != -100 else "O" for id_ in label_ids]
    pred_labels = [id2label[id_] if id_ != -100 else "O" for id_ in pred_ids]

    true_entities = extract_entities_from_labels(tokens, true_labels, target="I-value")
    pred_entities = extract_entities_from_labels(tokens, pred_labels, target="I-value")

    for ent in set(true_entities + pred_entities):
        records.append({
            "sequence_id": i,
            "entity": ent,
            "in_true": ent in true_entities,
            "in_pred": ent in pred_entities,
            "status": (
                "TP" if ent in true_entities and ent in pred_entities else
                "FN" if ent in true_entities else
                "FP"
            )
        })

# Convert to DataFrame
i_value_entity_df = pd.DataFrame(records)
i_value_entity_df.to_excel(BASE_DIR / "i_value_entity_comparison.xlsx", index=False)
print(f"✅ Entity-level I-value comparison saved to: {BASE_DIR / 'i_value_entity_comparison.xlsx'}")


✅ Entity-level I-value comparison saved to: C:\Users\duco.veen\OneDrive - Julius Clinical\Documents\github\AlzheimerNER_txt\i_value_entity_comparison.xlsx


In [22]:
tp = (i_value_entity_df["status"] == "TP").sum()
fp = (i_value_entity_df["status"] == "FP").sum()
fn = (i_value_entity_df["status"] == "FN").sum()

precision = tp / (tp + fp) if (tp + fp) > 0 else 0
recall = tp / (tp + fn) if (tp + fn) > 0 else 0
f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0

print(f"Entity-level precision: {precision:.3f}")
print(f"Entity-level recall:    {recall:.3f}")
print(f"Entity-level F1-score:  {f1:.3f}")


Entity-level precision: 0.306
Entity-level recall:    0.441
Entity-level F1-score:  0.361


check if > < etc is consistently classified as 0, B, or I

In [25]:
# Count how often operator tokens are labeled as part of a value entity
operator_tokens = {'>', '<', '≥', '≤', '='}
operator_stats = []

for item in dataset_dict["test"]:
    tokens = item["tokens"]
    labels = item["ner_tags"]
    for token, label in zip(tokens, labels):
        if token in operator_tokens:
            operator_stats.append({"token": token, "label": label})

pd.DataFrame(operator_stats).value_counts()


token  label  
≥      O          43
≤      O          22
>      O          17
=      O          14
<      O          13
≥      B-value     8
<      B-value     6
≥      B-time      5
≤      B-value     3
       I-value     2
>      B-value     1
Name: count, dtype: int64

# Save and restore variables

In [None]:
from transformers import AutoModelForTokenClassification, AutoTokenizer

model = AutoModelForTokenClassification.from_pretrained(OUTPUT_DIR)
tokenizer = AutoTokenizer.from_pretrained(OUTPUT_DIR)