In [None]:
import json
from datasets import Dataset, DatasetDict
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import TrainingArguments, Trainer, pipeline, EarlyStoppingCallback
from sklearn.model_selection import train_test_split
from seqeval.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score
import matplotlib.pyplot as plt
import pandas as pd

# ========== 1. Load Data ==========
# Read the annotated samples. Later cells read "tokens" and "labels" from each
# sample, so the file is expected to hold a list of such records.
# The encoding is pinned to UTF-8 so the read does not depend on the platform's
# locale codec (e.g. cp1252 on Windows), which can fail or mis-decode non-ASCII
# characters in the annotated text.
with open("labels_for_bert_training.json", "r", encoding="utf-8") as f:
    data = json.load(f)
# Optional: stratified split keyed on each sample's first entity label
def get_primary_label(sample):
    """Return the first label in ``sample["labels"]`` that is not ``"O"``.

    Falls back to ``"O"`` when the sample has no entity labels at all
    (including an empty label list). The result is used as the
    stratification key for the train/test split.
    """
    entity_tags = (tag for tag in sample["labels"] if tag != "O")
    return next(entity_tags, "O")

# One stratification key per sample, in the same order as `data`.
labels_for_split = list(map(get_primary_label, data))

# 80/20 split, stratified on the key above; the fixed seed keeps it repeatable.
train_data, test_data = train_test_split(
    data,
    stratify=labels_for_split,
    test_size=0.2,
    random_state=42,
)

# Create HuggingFace dataset: wrap each split as a Dataset under its split name.
dataset = DatasetDict({
    split_name: Dataset.from_list(split_rows)
    for split_name, split_rows in (("train", train_data), ("test", test_data))
})

# ========== 2. Label Mapping ==========
# BIO tag inventory; a tag's position in this list is its integer class id.
label_list = [
    "B-DOSAGE",
    "I-DOSAGE",
    "B-FREQUENCY",
    "I-FREQUENCY",
    "B-INSTRUCTION",
    "I-INSTRUCTION",
    "B-MEDICATION_NAME",
    "I-MEDICATION_NAME",
    "B-NOTE",
    "I-NOTE",
    "O",
]
num_labels = len(label_list)

# Forward (tag -> id) and reverse (id -> tag) lookups derived from list order.
id_to_label = dict(enumerate(label_list))
label_to_id = dict(zip(label_list, range(num_labels)))

# ========== 3. Tokenization and Alignment ==========
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-cased")
max_length = 128

def tokenize_and_align_labels(example):
    """Tokenize one pre-split example and align its word-level tags to word-pieces.

    Args:
        example: a mapping with "tokens" (list of words) and "labels"
            (one BIO tag string per word, each a key of ``label_to_id``).

    Returns:
        The tokenizer output (padded/truncated to ``max_length``) with an added
        "labels" list holding one class id per word-piece:
          * -100 for positions that map to no word (special tokens, padding),
            which the loss and ``compute_metrics`` skip;
          * the word's own tag for the first piece of each word;
          * for continuation pieces of a word tagged ``B-X``, the matching
            ``I-X`` tag (when it exists in ``label_to_id``).

    Why the B- -> I- conversion: repeating ``B-X`` on every piece of a word
    makes one entity look like several consecutive entities. That inflates
    entity counts in seqeval and teaches the model to emit ``B-`` on
    continuation pieces, which the NER pipeline's "simple" aggregation then
    splits into separate entities.
    """
    tokenized_inputs = tokenizer(
        example["tokens"],
        is_split_into_words=True,
        padding='max_length',
        truncation=True,
        max_length=max_length
    )
    word_ids = tokenized_inputs.word_ids()

    labels = []
    previous_word_idx = None
    for word_idx in word_ids:
        if word_idx is None:
            labels.append(-100)
        else:
            tag = example["labels"][word_idx]
            # Continuation piece of the same word: a new entity cannot begin here.
            if word_idx == previous_word_idx and tag.startswith("B-"):
                inside_tag = "I-" + tag[2:]
                # Keep the original tag if the scheme has no matching I- label.
                if inside_tag in label_to_id:
                    tag = inside_tag
            labels.append(label_to_id[tag])
        previous_word_idx = word_idx

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

# Map and preprocess: run the tokenize/align function over every split.
tokenized_datasets = dataset.map(tokenize_and_align_labels)

# ========== 4. Load Model ==========
# Pretrained DistilBERT (cased) with a token-classification head sized to the
# tag inventory; the id<->tag maps are stored in the model config.
model = AutoModelForTokenClassification.from_pretrained(
    "distilbert-base-cased",
    label2id=label_to_id,
    id2label=id_to_label,
    num_labels=num_labels,
)

# ========== 5. Metrics ==========
def compute_metrics(p):
    """Score one evaluation pass with seqeval.

    Args:
        p: a (predictions, labels) pair; predictions carry per-class scores
           on the last axis, labels carry class ids with -100 marking
           positions to ignore.

    Returns:
        dict with "accuracy", "precision", "recall" and "f1", each computed
        over the tag-string sequences after dropping ignored positions.
    """
    scores, gold_ids = p
    predicted_ids = scores.argmax(axis=-1)

    true_labels, true_predictions = [], []
    for predicted_row, gold_row in zip(predicted_ids, gold_ids):
        # Keep only positions that carry a real label, as (predicted, gold) tags.
        kept = [
            (id_to_label[pred_id], id_to_label[gold_id])
            for pred_id, gold_id in zip(predicted_row, gold_row)
            if gold_id != -100
        ]
        true_predictions.append([pred_tag for pred_tag, _ in kept])
        true_labels.append([gold_tag for _, gold_tag in kept])

    scorers = (
        ("accuracy", accuracy_score),
        ("precision", precision_score),
        ("recall", recall_score),
        ("f1", f1_score),
    )
    return {name: scorer(true_labels, true_predictions) for name, scorer in scorers}

# ========== 6. Training Setup ==========
# NOTE(review): the "test" split is also the eval set used for early stopping
# and best-checkpoint selection below, so metrics later reported on that same
# split are optimistic. Consider carving out a separate validation split.
training_args = TrainingArguments(
    # --- where things are written ---
    output_dir="./ner_model_distilled",
    logging_dir="./logs",
    report_to="none",
    save_safetensors=False,
    # --- schedule ---
    num_train_epochs=20,
    max_steps=-1,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    # --- evaluate, checkpoint and log once per epoch ---
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="epoch",
    # --- checkpoint selection: lowest eval loss wins ---
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
)

# Stop once the selection metric has not improved for 3 consecutive evaluations.
early_stopping = EarlyStoppingCallback(early_stopping_patience=3)

trainer = Trainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    compute_metrics=compute_metrics,
    callbacks=[early_stopping],
)

# ========== 7. Train Model ==========
trainer.train()

# ========== 8. Save Model ==========
# Write the model and tokenizer to one directory so the inference pipeline can
# load both from the same path. The tag maps are re-applied to the config
# before saving.
model_path = "./ner_medication_model_distilled"
model.config.id2label = id_to_label
model.config.label2id = label_to_id
trainer.save_model(model_path)
tokenizer.save_pretrained(model_path)

# ========== 9. Evaluate ==========
# Evaluates on the trainer's configured eval dataset and prints every returned
# metric to four decimal places.
eval_results = trainer.evaluate()
print("\nEvaluation Metrics:")
for metric_name, metric_value in eval_results.items():
    print(f"{metric_name}: {metric_value:.4f}")

# ========== 10. Inference Example ==========
# Reload the saved model and tokenizer from disk and tag one sample sentence.
# With "simple" aggregation the pipeline returns grouped spans, each carrying
# a "word" and an "entity_group".
text = "Take 2 tablets 3 times a day after food"
ner_pipe = pipeline(
    "ner",
    model=model_path,
    tokenizer=model_path,
    aggregation_strategy="simple",
)
results = ner_pipe(text)

print("\nInference Output:")
for entity in results:
    print(f"{entity['word']} → {entity['entity_group']}")

# ========== 11. Optional Detailed Report ==========
# trainer.predict returns a PredictionOutput named tuple with three fields
# (predictions, label_ids, metrics). Unpacking it into two names raises
# "ValueError: too many values to unpack", so the fields are read by name.
prediction_output = trainer.predict(tokenized_datasets["test"])
preds = prediction_output.predictions.argmax(axis=-1)
labels = prediction_output.label_ids
true_preds, true_labels = [], []

# Convert ids back to tag strings, skipping positions labelled -100
# (special tokens / padding), one list per test example.
for p, l in zip(preds, labels):
    temp_preds, temp_labels = [], []
    for p_i, l_i in zip(p, l):
        if l_i != -100:
            temp_preds.append(id_to_label[p_i])
            temp_labels.append(id_to_label[l_i])
    true_preds.append(temp_preds)
    true_labels.append(temp_labels)

print("\nClassification Report:")
print(classification_report(true_labels, true_preds))

# ========== 12. Plotting Graphs (Loss + Combined Metrics) ==========
logs = trainer.state.log_history
df_logs = pd.DataFrame(logs)


def _plot_logged_metric(ax, frame, column, **plot_kwargs):
    """Plot ``frame[column]`` against ``frame["epoch"]`` on ``ax``.

    Training-loss and eval-metric entries are logged as separate records, so
    in the combined frame each metric column is NaN on the other kind's rows.
    matplotlib breaks a line at every NaN, which would leave only isolated
    markers, so rows where the column is missing are dropped first. If the
    same epoch was logged more than once (e.g. by an extra
    ``trainer.evaluate()`` call after training), only the first record is
    kept, so the curve shows the in-training value for that epoch.
    """
    points = (
        frame[["epoch", column]]
        .dropna()
        .drop_duplicates(subset="epoch", keep="first")
    )
    ax.plot(points["epoch"], points[column], **plot_kwargs)


if "epoch" in df_logs.columns:
    fig, axs = plt.subplots(2, 1, figsize=(10, 10), sharex=True)

    # 1. Combined Loss Plot (Training + Eval)
    if "loss" in df_logs.columns:
        _plot_logged_metric(
            axs[0], df_logs, "loss",
            label="Training Loss",
            marker='o',
            color='blue',
            linestyle='dashed'
        )
    if "eval_loss" in df_logs.columns:
        _plot_logged_metric(
            axs[0], df_logs, "eval_loss",
            label="Eval Loss",
            marker='x',
            color='orange',
            linestyle='dashed'
        )
    axs[0].set_title("Training and Eval Loss over Epochs")
    axs[0].set_ylabel("Loss")
    axs[0].legend()
    axs[0].grid(True)

    # 2. Combined (F1, Precision, Recall)
    if all(m in df_logs.columns for m in ["eval_f1", "eval_precision", "eval_recall"]):
        _plot_logged_metric(axs[1], df_logs, "eval_f1", label="F1", marker='o', color='green', linestyle='dashed')
        _plot_logged_metric(axs[1], df_logs, "eval_precision", label="Precision", marker='^', color='purple', linestyle='dashed')
        _plot_logged_metric(axs[1], df_logs, "eval_recall", label="Recall", marker='v', color='brown', linestyle='dashed')
        axs[1].set_title("F1 vs Precision vs Recall")
        axs[1].set_ylabel("Score")
        axs[1].legend()
        axs[1].grid(True)

    axs[1].set_xlabel("Epoch")
    plt.tight_layout()
    plt.show()
else:
    print("No 'epoch' column found in logs to plot.")


In [1]:
import pandas as pd

src = r"C:\Users\prisc\Downloads\drugbank_vocabulary.csv"
dst = r"C:\Users\prisc\Downloads\drugbank_vocabulary_lower.csv"  # write to new file

df = pd.read_csv(src)


def _lower_if_str(value):
    """Lowercase real strings; return anything else (NaN, numbers) unchanged."""
    return value.lower() if isinstance(value, str) else value


# Lowercase every object/string column.
# The cast through .astype(str) is deliberately avoided: it turns missing
# values into the literal text "nan" (and stringifies non-string objects),
# which would then be written to the output file as if it were real data.
for col in df.select_dtypes(include=["object", "string"]).columns:
    df[col] = df[col].map(_lower_if_str)

df.to_csv(dst, index=False)
print(f"Saved: {dst}")


Saved: C:\Users\prisc\Downloads\drugbank_vocabulary_lower.csv


In [3]:
import pandas as pd

src = r"C:\Users\prisc\Downloads\drugbank_vocabulary_lower.csv"
dst = r"C:\Users\prisc\Downloads\drugbank_vocabulary_clean.csv"

# Cleaning pipeline, applied in order:
#   1. drop rows where every column is NaN
#      (rows with only some NaN values are intentionally kept)
#   2. drop rows that exactly duplicate an earlier row across all columns
df = (
    pd.read_csv(src)
    .dropna(how="all")
    .drop_duplicates()
)

df.to_csv(dst, index=False)
print(f"Saved: {dst}")


Saved: C:\Users\prisc\Downloads\drugbank_vocabulary_clean.csv
