Preprocessing function 

In [21]:
import re
import pandas as pd

def preprocess_ocr_text(text) -> str:
    if not isinstance(text, str):
        return ""
    # Replace known OCR noise characters
    text = text.replace("â€œ", "").replace("â€", "").replace("â€˜", "").replace("â€™", "'")

    # Remove unwanted symbols
    text = re.sub(r"(â|Â|¢|§|«|©|®|€|“|”|‘|’|™|…|_|=||•|—|–|@|%|<|>|\\|\||~|`)", "", text)

    # Fix common formatting issues
    text = re.sub(r"(\d)(tab/s|tablet[s]?|cap[s]?|capsule[s]?)", r"\1 tablet", text, flags=re.IGNORECASE)
    text = re.sub(r"(\d)(times)", r"\1 times", text, flags=re.IGNORECASE)
    text = re.sub(r"(\d)\s*x\s*(a|per)?\s*day", r"\1 times a day", text, flags=re.IGNORECASE)

    # Normalize known expressions
    replacements = {
        "twice a day": "2 times a day",
        "three times daily": "3 times a day",
        "when necessary": "when needed",
        "when required": "when needed",
    }

    for wrong, correct in replacements.items():
        text = re.sub(rf"\b{wrong}\b", correct, text, flags=re.IGNORECASE)

    #remove irrelevant data
    lines = text.splitlines()
    cleaned = []

    for line in lines:
        # Skip if line contains clinic/address info
        if any(x in line for x in ["clinic", "centre", "hospital", "#", "blk", "building", "road", "s "]):
            continue
        # Skip prices and quantities
        if re.search(r"\bqty\b|\bprice\b|\$\d+|\d+\.\d{2}", line):
            continue
        cleaned.append(line)
    

    # Remove extra whitespace
    text = re.sub(r"\s+", " ", text)

    return text.strip()

df = pd.read_csv("ocr2_output_cleaned.csv")
df["extracted_text"] = df["extracted_text"].apply(preprocess_ocr_text)
df.to_csv("cleaned_ocr_labels2.csv", index=False)

#do not change to lowercase as it will affect the keyword matching later for medication!

    


labeling of the text

In [29]:
import pandas as pd
import json
import re

# Define your CSV file and column
csv_file = "cleaned_ocr_labels2.csv"
column_name = "extracted_text"

# Define keyword groups for automatic tagging
keywords1 = {
    "DOSAGE": ["tablet", "tablets", "teblet", "tab", "tabs", "tab/s","cap/s","cap", "capsule", "capsules"],
    "FREQUENCY": ["times", "time", "hour", "hours","hourly","morning", "evening", "afternoon", "bedtime", "night"]
}

keywords2 = {
    "FREQUENCY": ["once", "twice"],
    "INSTRUCTION": ["when","needed", "after", "use","before", "after", "with", "without", "meal", "meals", "food", "swallow","chew"],
    "NOTE" : ["fever","pain", "cough", "cold", "flu", "runny", "allergy", "infection", "inflammation", "swelling", "sore throat", "headache", "nausea","gastric", "drowsiness", "vomiting", "diarrhea", "constipation", "rash", "itching", "fatigue", "dizziness"]
}


# split sentence into tokens using regex
def tokenize(text):
    return re.findall(r"\b\w+(?:/\w+)?\b", text.lower())

# Auto-label function (basic BIO tagging)
def auto_label(tokens):
    labels = ["O"] * len(tokens)

    quantity_words = {
        "1", "2", "3", "4", "5", "10", "one", "two", "three", "four", "five", "half", "quarter"
    }

    # Collect known keywords
    known_keywords = set()
    for group in [*keywords1.values(), *keywords2.values()]:
        known_keywords.update(word.lower() for word in group)

    # ========== 1. MEDICATION_NAME tagging (first unknown word) ==========
    for i, token in enumerate(tokens):
        token_lower = token.lower()
        if (
            labels[i] == "O"
            and token_lower not in known_keywords
            and token_lower.isalpha()
            and len(token) > 3
        ):
            labels[i] = "B-MEDICATION_NAME"
            break  # only tag the first one

    # ========== 2. DOSAGE tagging ==========
    dosage_candidates = [
        i for i in range(1, len(tokens))
        if tokens[i].lower() in keywords1["DOSAGE"] and tokens[i - 1].lower() in quantity_words
    ]

    if len(dosage_candidates) == 1:
        i = dosage_candidates[0]
        labels[i - 1] = "B-DOSAGE"
        labels[i] = "I-DOSAGE"
    elif len(dosage_candidates) >= 2:
        i = dosage_candidates[1]  # tag only the second
        labels[i - 1] = "B-DOSAGE"
        labels[i] = "I-DOSAGE"

    # ========== 3. FREQUENCY tagging ==========
    for i in range(1, len(tokens)):
        if (
            tokens[i].lower() in keywords1["FREQUENCY"] and 
            tokens[i - 1].lower()
        ):
            labels[i - 1] = "B-FREQUENCY"
            labels[i] = "I-FREQUENCY"

    # ========== 4. keywords2 tagging ==========
    for i, token in enumerate(tokens):
        token_lower = token.lower()
        for label, keyword_list in keywords2.items():
            if token_lower in keyword_list and labels[i] == "O":
                labels[i] = f"B-{label.upper()}"
                if (
                    i + 1 < len(tokens) and 
                    tokens[i + 1].lower() in keyword_list
                ):
                    labels[i + 1] = f"I-{label.upper()}"

    return labels



# Load CSV
df = pd.read_csv(csv_file)

# Clean nulls and whitespace
df = df[df[column_name].notnull()]
df[column_name] = df[column_name].astype(str).str.strip()

# Build token-label pairs
data = []
for text in df[column_name]:
    tokens = tokenize(text)
    labels = auto_label(tokens)
    data.append({"tokens": tokens, "labels": labels})

# Save to JSON
with open("auto_labeled_ner_data2.json", "w") as f:
    json.dump(data, f, indent=2)

print("✅ Auto-labeling complete! Saved to auto_labeled_ner_data.json")


✅ Auto-labeling complete! Saved to auto_labeled_ner_data.json


In [31]:
import json
import pandas as pd

# Load the JSON file
with open("auto_labeled_ner_data2.json", "r") as f:
    data = json.load(f)

# Flatten into token-label pairs
rows = []
for entry in data:
    tokens = entry["tokens"]
    labels = entry["labels"]
    rows.extend(zip(tokens, labels))

# Create DataFrame
df = pd.DataFrame(rows, columns=["Token", "Label"])

# Save to CSV
df.to_csv("auto_labeled_ner_data2_flat.csv", index=False)

print("Saved as auto_labeled_ner_data2_flat.csv")


Saved as auto_labeled_ner_data2_flat.csv


In [36]:
import pandas as pd
import json

# Load the flat CSV
df = pd.read_csv("auto_labeled_ner_data3_flat_cleaned.csv")

# Fill missing values to avoid NaNs
df.fillna("", inplace=True)

# Split into samples using blank lines as boundaries
data = []
current_tokens = []
current_labels = []

for _, row in df.iterrows():
    token = row["Token"].strip()
    label = row["Label"].strip()

    if token == "":  # new sample
        if current_tokens:
            data.append({"tokens": current_tokens, "labels": current_labels})
            current_tokens = []
            current_labels = []
    else:
        current_tokens.append(token)
        current_labels.append(label)

# Add the last one if not already added
if current_tokens:
    data.append({"tokens": current_tokens, "labels": current_labels})

# Save to JSON
with open("auto_labeled_ner_data3.json", "w") as f:
    json.dump(data, f, indent=2)

print(f"✅ Converted {len(data)} samples to JSON format.")


✅ Converted 1 samples to JSON format.


In [1]:
import json

# Path to your raw data
input_path = r"C:\Users\prisc\OneDrive\Desktop\Github\MediMind\ml\auto_labeled_ner_data3_flat.csv"
output_path = r"C:\Users\prisc\OneDrive\Desktop\Github\MediMind\ml\clean_clean.json"

# Parse and group tokens by B-MEDICATION_NAME start
data_blocks = []
current_tokens = []
current_labels = []

with open(input_path, "r", encoding="utf-8") as f:
    for line in f:
        line = line.strip()
        if not line or line.lower().startswith("token"):  # Skip empty lines/header
            continue
        try:
            token, label = line.split(",", 1)
            token = token.strip()
            label = label.strip().upper()

            if label == "B-MEDICATION_NAME":
                # Start new block if one already exists
                if current_tokens:
                    data_blocks.append({
                        "tokens": current_tokens,
                        "labels": current_labels
                    })
                    current_tokens = []
                    current_labels = []

            current_tokens.append(token)
            current_labels.append(label)
        except ValueError:
            continue

# Append final group
if current_tokens:
    data_blocks.append({
        "tokens": current_tokens,
        "labels": current_labels
    })

# Save as JSON
with open(output_path, "w", encoding="utf-8") as out:
    json.dump(data_blocks, out, indent=2)

print(f"✅ Converted {len(data_blocks)} medication blocks. Saved to: {output_path}")

✅ Converted 84 medication blocks. Saved to: C:\Users\prisc\OneDrive\Desktop\Github\MediMind\ml\clean_clean.json


In [19]:
!pip install transformers datasets seqeval scikit-learn


Defaulting to user installation because normal site-packages is not writeable



[notice] A new release of pip is available: 24.3.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [20]:
!pip install "accelerate>=0.26.0"


Defaulting to user installation because normal site-packages is not writeable



[notice] A new release of pip is available: 24.3.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [21]:
!pip install --upgrade transformers



Defaulting to user installation because normal site-packages is not writeable



[notice] A new release of pip is available: 24.3.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [22]:
!pip install torchvision
!pip install torch


Defaulting to user installation because normal site-packages is not writeable



[notice] A new release of pip is available: 24.3.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


Defaulting to user installation because normal site-packages is not writeable



[notice] A new release of pip is available: 24.3.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [23]:
pip uninstall torch torchvision torchaudio -y


Found existing installation: torch 2.7.1
Uninstalling torch-2.7.1:
  Successfully uninstalled torch-2.7.1
Found existing installation: torchvision 0.22.1
Uninstalling torchvision-0.22.1:
  Successfully uninstalled torchvision-0.22.1
Note: you may need to restart the kernel to use updated packages.




In [1]:
import torch
import torchvision

print(torch.__version__)
print(torchvision.__version__)


2.7.1+cpu
0.22.1+cpu


In [35]:
import pandas as pd

# Load CSV
df = pd.read_csv("auto_labeled_ner_data3_flat.csv")

# Clean 'Label' column
df["Label"] = df["Label"].astype(str).str.upper().replace("O", "O")

# Save cleaned CSV
df.to_csv("auto_labeled_ner_data3_flat_cleaned.csv", index=False)

print("✅ Labels cleaned and saved to auto_labeled_ner_data3_flat_cleaned.csv")


✅ Labels cleaned and saved to auto_labeled_ner_data3_flat_cleaned.csv


training BERT model 

In [3]:
import json
from datasets import Dataset
from transformers import BertTokenizerFast
from transformers import BertForTokenClassification
from transformers import TrainingArguments, Trainer
from transformers import pipeline

with open("clean_clean.json", "r") as f:
    data = json.load(f)

dataset = Dataset.from_list(data)
dataset

#create a label list
label_list = sorted({label for d in data for label in d["labels"]})
label_to_id = {label: i for i, label in enumerate(label_list)}
id_to_label = {i: label for label, i in label_to_id.items()}

print("Label list:", label_list)

#tokenise and align labels
tokenizer = BertTokenizerFast.from_pretrained("bert-base-cased")

def tokenize_and_align_labels(example):
    tokenized_inputs = tokenizer(example["tokens"], is_split_into_words=True, padding='max_length',  truncation=True)
    word_ids = tokenized_inputs.word_ids()
    
    labels = []
    previous_word_idx = None
    for word_idx in word_ids:
        if word_idx is None:
            labels.append(-100)
        elif word_idx != previous_word_idx:
            labels.append(label_to_id[example["labels"][word_idx]])
        else:
            labels.append(label_to_id[example["labels"][word_idx]])
        previous_word_idx = word_idx

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

tokenized_dataset = dataset.map(tokenize_and_align_labels)

#load the model with tokenized labels
model = BertForTokenClassification.from_pretrained("bert-base-cased", num_labels=11)

#train the model
training_args = TrainingArguments(
    output_dir="./ner_model",
    per_device_train_batch_size=8,
    num_train_epochs=3,
    logging_dir="./logs",
    logging_strategy="epoch"
    
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    tokenizer=tokenizer,

)

trainer.train()

#save the model
trainer.save_model("./ner_medication_model")
tokenizer.save_pretrained("./ner_medication_model")


#inference testing
ner_pipe = pipeline("ner", model="./ner_medication_model", tokenizer="./ner_medication_model", aggregation_strategy="simple")

text = "Take 2 tablets 3 times a day after food"
results = ner_pipe(text)

for entity in results:
    print(entity["word"], "→", entity["entity_group"])

Label list: ['B-DOSAGE', 'B-FREQUENCY', 'B-INSTRUCTION', 'B-MEDICATION_NAME', 'B-NOTE', 'I-DOSAGE', 'I-FREQUENCY', 'I-INSTRUCTION', 'I-MEDICATION_NAME', 'I-NOTE', 'O']


Map:   0%|          | 0/84 [00:00<?, ? examples/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Step,Training Loss
11,1.3922
22,0.6659
33,0.4225


SafetensorError: Error while serializing: IoError(Os { code: 1224, kind: Uncategorized, message: "The requested operation cannot be performed on a file with a user-mapped section open." })

In [5]:
import json
from datasets import Dataset, DatasetDict
from transformers import BertTokenizerFast, BertForTokenClassification
from transformers import TrainingArguments, Trainer
from transformers import pipeline
from sklearn.model_selection import train_test_split
from seqeval.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score
import matplotlib.pyplot as plt
import pandas as pd

# ===== 1. Load Data =====
with open("clean_clean.json", "r") as f:
    data = json.load(f)

# ===== 2. Split into Train and Eval =====
train_data, eval_data = train_test_split(data, test_size=0.2, random_state=42)
dataset = DatasetDict({
    "train": Dataset.from_list(train_data),
    "eval": Dataset.from_list(eval_data)
})

# ===== 3. Create Label Mapping =====
label_list = sorted({label for d in data for label in d["labels"]})
label_to_id = {label: i for i, label in enumerate(label_list)}
id_to_label = {i: label for label, i in label_to_id.items()}
num_labels = len(label_list)

# ===== 4. Tokenizer =====
tokenizer = BertTokenizerFast.from_pretrained("bert-base-cased")

# ===== 5. Tokenization & Label Alignment =====
def tokenize_and_align_labels(example):
    tokenized_inputs = tokenizer(example["tokens"], is_split_into_words=True, padding='max_length', truncation=True)
    word_ids = tokenized_inputs.word_ids()

    labels = []
    previous_word_idx = None
    for word_idx in word_ids:
        if word_idx is None:
            labels.append(-100)
        else:
            labels.append(label_to_id[example["labels"][word_idx]])
        previous_word_idx = word_idx

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

tokenized_datasets = dataset.map(tokenize_and_align_labels)

# ===== 6. Load Model =====
model = BertForTokenClassification.from_pretrained(
    "bert-base-cased",
    num_labels=num_labels,
    id2label=id_to_label,
    label2id=label_to_id
)

# ===== 7. Compute Metrics =====
def compute_metrics(p):
    predictions, labels = p
    predictions = predictions.argmax(axis=-1)

    true_labels = []
    true_predictions = []

    for pred, label in zip(predictions, labels):
        current_preds = []
        current_labels = []
        for p_i, l_i in zip(pred, label):
            if l_i != -100:
                current_preds.append(id_to_label[p_i])
                current_labels.append(id_to_label[l_i])
        true_predictions.append(current_preds)
        true_labels.append(current_labels)

    return {
        "accuracy": accuracy_score(true_labels, true_predictions),
        "precision": precision_score(true_labels, true_predictions),
        "recall": recall_score(true_labels, true_predictions),
        "f1": f1_score(true_labels, true_predictions),
    }

# ===== 8. Training Arguments =====
training_args = TrainingArguments(
    output_dir="./ner_model",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_dir="./logs",
    logging_strategy="epoch"
)

# ===== 9. Trainer Setup =====
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["eval"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

# ===== 10. Train Model =====
trainer.train()

# ===== 11. Save Model =====
model_path = "./ner_medication_model"
trainer.save_model(model_path)
tokenizer.save_pretrained(model_path)

# ===== 12. Evaluate on Eval Set =====
eval_results = trainer.evaluate()
print("\n📊 Evaluation Metrics:")
for k, v in eval_results.items():
    print(f"{k}: {v:.4f}")

# ===== 13. Inference Example =====
ner_pipe = pipeline("ner", model=model_path, tokenizer=model_path, aggregation_strategy="simple")
text = "Take 2 tablets 3 times a day after food"
results = ner_pipe(text)

print("\n📌 Inference Output:")
for entity in results:
    print(f"{entity['word']} → {entity['entity_group']}")

# ===== 14. Extract Trainer Logs =====
logs = trainer.state.log_history
df_logs = pd.DataFrame(logs)

# ===== 15. Plot Loss and F1 =====
plt.figure(figsize=(12, 6))

# Plot training loss
if "loss" in df_logs.columns:
    plt.plot(df_logs["step"], df_logs["loss"], label="Training Loss", marker='o')

# Plot evaluation loss
if "eval_loss" in df_logs.columns:
    plt.plot(df_logs["step"], df_logs["eval_loss"], label="Eval Loss", marker='x')

# Plot F1 score
if "eval_f1" in df_logs.columns:
    plt.plot(df_logs["step"], df_logs["eval_f1"], label="Eval F1 Score", marker='s')

# ===== 16. Final Plot Styling =====
plt.title("Training/Eval Loss and F1 Score Over Time")
plt.xlabel("Training Steps")
plt.ylabel("Value")
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()



Map:   0%|          | 0/67 [00:00<?, ? examples/s]

Map:   0%|          | 0/17 [00:00<?, ? examples/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


TypeError: TrainingArguments.__init__() got an unexpected keyword argument 'evaluation_strategy'

In [8]:
!which python




'which' is not recognized as an internal or external command,
operable program or batch file.


In [None]:
from transformers import pipeline
from collections import defaultdict

# ===== 1. Load BERT NER pipeline =====
ner_pipe = pipeline(
    "ner",
    model="./ner_medication_model",
    tokenizer="./ner_medication_model",
    aggregation_strategy="simple"  # ensures "2 tablets" is treated as one
)

# ===== 2. Define input text (replace with dynamic input later) =====
text = "Take 2 tablets 3 times a day after food loratadine may cause drowsiness"

# ===== 3. Run inference =====
results = ner_pipe(text)

# ===== 4. Group output by entity type =====
grouped_output = defaultdict(list)

for entity in results:
    label = entity["entity_group"]  # e.g. B-DOSAGE
    word = entity["word"]

    # Normalize label (strip B- or I- prefixes)
    base_label = label.split("-")[-1] if "-" in label else label
    grouped_output[base_label].append(word)

# ===== 5. Format results for display =====
print("📱 Output for Mobile App:\n")
final_output = {}

for field in ["MEDICATION_NAME", "DOSAGE", "FREQUENCY", "INSTRUCTION", "NOTE"]:
    value = " ".join(grouped_output.get(field, []))
    if value:
        final_output[field.lower()] = value
        print(f"{field.capitalize()}: {value}")

# ===== 6. (Optional) Use as JSON in your app =====
# print(final_output)
