Preprocessing function 

In [None]:
import re
import pandas as pd

def preprocess_ocr_text(text) -> str:
    if not isinstance(text, str):
        return ""
    # Replace known OCR noise characters
    text = text.replace("â€œ", "").replace("â€", "").replace("â€˜", "").replace("â€™", "'")

    # Remove unwanted symbols
    text = re.sub(r"(â|Â|¢|§|«|©|®|€|“|”|‘|’|™|…|_|=||•|—|–|@|%|<|>|\\|\||~|`)", "", text)

    # Fix common formatting issues
    text = re.sub(r"(\d)(tab/s|tablet[s]?|cap[s]?|capsule[s]?)", r"\1 tablet", text, flags=re.IGNORECASE)
    text = re.sub(r"(\d)(times)", r"\1 times", text, flags=re.IGNORECASE)
    text = re.sub(r"(\d)\s*x\s*(a|per)?\s*day", r"\1 times a day", text, flags=re.IGNORECASE)

    # Normalize known expressions
    replacements = {
        "twice a day": "2 times a day",
        "three times daily": "3 times a day",
        "when necessary": "when needed",
        "when required": "when needed",
    }

    for wrong, correct in replacements.items():
        text = re.sub(rf"\b{wrong}\b", correct, text, flags=re.IGNORECASE)

    #remove irrelevant data
    lines = text.lower().splitlines()
    cleaned = []

    for line in lines:
        # Skip if line contains clinic/address info
        if any(x in line for x in ["clinic", "centre", "hospital", "#", "blk", "building", "road", "s "]):
            continue
        # Skip prices and quantities
        if re.search(r"\bqty\b|\bprice\b|\$\d+|\d+\.\d{2}", line):
            continue
        cleaned.append(line)
    

    # Remove extra whitespace
    text = re.sub(r"\s+", " ", text)

    return text.strip().lower()

df = pd.read_csv("ocr_labels.csv")
df["extracted_text"] = df["extracted_text"].apply(preprocess_ocr_text)
df.to_csv("cleaned_ocr_labels.csv", index=False)    

    


labeling of the text

In [None]:
import pandas as pd
import json
import re

# Define your CSV file and column
csv_file = "50cleaned_ocr_labels.csv"
column_name = "extracted_text"

# Define keyword groups for automatic tagging
keywords = {
    "DOSAGE": ["tablet", "tablets", "tab", "tabs", "capsule", "capsules", "ml", "mg"],
    "FREQUENCY": ["once", "twice", "daily", "every", "hour", "hours", "day", "night", "morning", "evening"],
    "TIMING": ["before", "after", "with", "without", "meal", "meals", "food"],
    "MEDICATION_NAME": ["ibuprofen","montelukast","prednisolone","telfast","celebrax","augmentin", "paracetamol", "enhancin", "amoxicillin"]  # Add more
}

# split sentence into tokens using regex
def tokenize(text):
    return re.findall(r"\b\w+(?:/\w+)?\b", text.lower())

# Auto-label function (basic BIO tagging)
def auto_label(tokens):
    labels = ["O"] * len(tokens)
    for i, token in enumerate(tokens):
        for label, keyword_list in keywords.items():
            if token in keyword_list:
                labels[i] = f"B-{label}"
                # Check for multi-word terms
                if i+1 < len(tokens) and tokens[i+1] in keyword_list:
                    labels[i+1] = f"I-{label}"
    return labels

# Load CSV
df = pd.read_csv(csv_file)

# Clean nulls and whitespace
df = df[df[column_name].notnull()]
df[column_name] = df[column_name].astype(str).str.strip()

# Build token-label pairs
data = []
for text in df[column_name]:
    tokens = tokenize(text)
    labels = auto_label(tokens)
    data.append({"tokens": tokens, "labels": labels})

# Save to JSON
with open("auto_labeled_ner_data.json", "w") as f:
    json.dump(data, f, indent=2)

print("✅ Auto-labeling complete! Saved to auto_labeled_ner_data.json")


✅ Auto-labeling complete! Saved to auto_labeled_ner_data.json


In [3]:
!pip install transformers datasets seqeval scikit-learn




In [4]:
!pip install "accelerate>=0.26.0"




In [5]:
!pip install --upgrade transformers





training BERT model 

In [6]:
import json
from datasets import Dataset
from transformers import BertTokenizerFast
from transformers import BertForTokenClassification
from transformers import TrainingArguments, Trainer
from transformers import pipeline

with open("auto_labeled_ner_data.json", "r") as f:
    data = json.load(f)

dataset = Dataset.from_list(data)
dataset

#create a label list
label_list = sorted({label for d in data for label in d["labels"]})
label_to_id = {label: i for i, label in enumerate(label_list)}
id_to_label = {i: label for label, i in label_to_id.items()}

print("Label list:", label_list)

#tokenise and align labels
tokenizer = BertTokenizerFast.from_pretrained("bert-base-cased")

def tokenize_and_align_labels(example):
    tokenized_inputs = tokenizer(example["tokens"], is_split_into_words=True, padding='max_length',  truncation=True)
    word_ids = tokenized_inputs.word_ids()
    
    labels = []
    previous_word_idx = None
    for word_idx in word_ids:
        if word_idx is None:
            labels.append(-100)
        elif word_idx != previous_word_idx:
            labels.append(label_to_id[example["labels"][word_idx]])
        else:
            labels.append(label_to_id[example["labels"][word_idx]])
        previous_word_idx = word_idx

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

tokenized_dataset = dataset.map(tokenize_and_align_labels)

#load the model with tokenized labels
model = BertForTokenClassification.from_pretrained("bert-base-cased", num_labels=9)

#train the model
training_args = TrainingArguments(
    output_dir="./ner_model",
    per_device_train_batch_size=8,
    num_train_epochs=3,
    logging_dir="./logs",
    logging_strategy="epoch"
    
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    tokenizer=tokenizer,

)

trainer.train()

#save the model
trainer.save_model("./ner_medication_model")
tokenizer.save_pretrained("./ner_medication_model")


#inference testing
ner_pipe = pipeline("ner", model="./ner_medication_model", tokenizer="./ner_medication_model", aggregation_strategy="simple")

text = "Take 2 tablets 3 times a day after food"
results = ner_pipe(text)

for entity in results:
    print(entity["word"], "→", entity["entity_group"])


Label list: ['B-DOSAGE', 'B-FREQUENCY', 'B-MEDICATION_NAME', 'B-TIMING', 'O']


Map:   0%|          | 0/51 [00:00<?, ? examples/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Step,Training Loss
7,1.012
14,0.2277
21,0.1412


Device set to use cpu


Take 2 tablets 3 times a → LABEL_4
day → LABEL_1
after → LABEL_3
food → LABEL_4
