The medication label images were converted to text and cleaned using a process similar to the one below. The output is clean_clean.json.


Preprocessing function to clean the text of 50 manually keyed-in medication labels

In [1]:
import re
import pandas as pd

# Known OCR noise sequences and their replacements. The three-character
# sequences must be handled before the bare "â€" prefix: replacing the prefix
# first would destroy them, and the apostrophe replacement would never fire.
_OCR_NOISE_SEQUENCES = (
    ("â€œ", ""),
    ("â€˜", ""),
    ("â€™", "'"),
    ("â€", ""),
)

# Single characters that are stripped outright. The earlier alternation
# contained an empty alternative ("=||•"), which matches the empty string;
# a character class cannot have that problem.
_UNWANTED_SYMBOLS = re.compile(r"[âÂ¢§«©®€“”‘’™…_=•—–@%<>\\|~`]")

# A unit glued to a digit ("2tablets", "1capsule"). Longer spellings come
# first and the trailing \b stops "cap" from matching inside "capsule".
_UNIT_AFTER_DIGIT = re.compile(
    r"(\d)(?:tablets?|tab/s|tabs?|capsules?|cap/s|caps?)\b",
    re.IGNORECASE,
)

# Phrases rewritten to one canonical wording.
_EXPRESSION_REPLACEMENTS = {
    "twice a day": "2 times a day",
    "three times daily": "3 times a day",
    "when necessary": "when needed",
    "when required": "when needed",
}

# Lines carrying clinic/address details. Matching is case-insensitive because
# the text is deliberately not lowercased, and keywords are matched as whole
# words so that e.g. "road" does not hit "broad".
# NOTE(review): the earlier keyword list contained the bare substring "s ",
# which matches nearly every line (e.g. "tablets a day"). It was presumably
# meant to catch postal codes written like "S 123456" -- confirm.
_ADDRESS_LINE = re.compile(
    r"\b(?:clinic|centre|hospital|blk|building|road)\b|#|\bs\s?\(?\d{6}\)?",
    re.IGNORECASE,
)

# Lines carrying a quantity/price label, a dollar amount, or a number with
# exactly two decimals.
_PRICE_OR_QTY_LINE = re.compile(
    r"\bqty\b|\bprice\b|\$\d+|\d+\.\d{2}(?!\d)",
    re.IGNORECASE,
)


def _drop_irrelevant_lines(text):
    """Remove lines that look like clinic/address details or price/quantity rows."""
    kept = [
        line
        for line in text.splitlines()
        if not _ADDRESS_LINE.search(line) and not _PRICE_OR_QTY_LINE.search(line)
    ]
    # When every line is rejected (typical for single-line input), keep the
    # text as it was instead of erasing the whole record.
    return "\n".join(kept) if kept else text


def preprocess_ocr_text(text, drop_irrelevant_lines=True) -> str:
    """Clean one OCR'd / keyed-in medication label string.

    Steps, in order:
      1. strip known OCR noise sequences and unwanted symbols;
      2. insert the missing space in "<digit><unit>" and "<digit>times",
         and expand "<digit>x a day" to "<digit> times a day";
      3. rewrite a few phrases to a canonical wording;
      4. optionally drop lines that look like address or price rows;
      5. collapse all whitespace (including newlines) to single spaces.

    Letter case is left untouched.

    Args:
        text: The raw label text. Any non-string value (e.g. NaN) yields "".
        drop_irrelevant_lines: When True (default), address/price-looking
            lines are removed. Pass False to skip that step.

    Returns:
        The cleaned, single-line string with no leading/trailing whitespace.
    """
    if not isinstance(text, str):
        return ""

    # Replace known OCR noise characters
    for noise, replacement in _OCR_NOISE_SEQUENCES:
        text = text.replace(noise, replacement)

    # Remove unwanted symbols
    text = _UNWANTED_SYMBOLS.sub("", text)

    # Fix common formatting issues
    text = _UNIT_AFTER_DIGIT.sub(r"\1 tablet", text)
    text = re.sub(r"(\d)(times)", r"\1 times", text, flags=re.IGNORECASE)
    text = re.sub(r"(\d)\s*x\s*(a|per)?\s*day", r"\1 times a day", text, flags=re.IGNORECASE)

    # Normalize known expressions
    for wrong, correct in _EXPRESSION_REPLACEMENTS.items():
        text = re.sub(rf"\b{re.escape(wrong)}\b", correct, text, flags=re.IGNORECASE)

    # Remove irrelevant data; this must run before whitespace is collapsed,
    # while the line structure still exists.
    if drop_irrelevant_lines:
        text = _drop_irrelevant_lines(text)

    # Remove extra whitespace
    text = re.sub(r"\s+", " ", text)

    return text.strip()

# Clean the raw 50-label sample and write it out for the labeling step.
# Do NOT lowercase the text here: it would affect the later keyword
# matching for medication names.
df = pd.read_csv("50_new_sample.csv")
df = df.assign(extracted_text=df["extracted_text"].apply(preprocess_ocr_text))
df.to_csv("cleaned_50_new_sample.csv", index=False)

    


Auto-labeling of the cleaned text with BIO tags

In [2]:
import pandas as pd
import json
import re

# Input CSV and the column that holds the cleaned label text
csv_file = "cleaned_50_new_sample.csv"
column_name = "extracted_text"

# Keywords that are tagged together with the token in front of them
# (e.g. "2 tablets", "3 times").
keywords1 = {
    "DOSAGE": [
        "tablet", "tablets", "teblet", "tab", "tabs",
        "tab/s", "cap/s", "cap", "capsule", "capsules",
    ],
    "FREQUENCY": [
        "times", "time", "hour", "hours", "hourly",
        "morning", "evening", "afternoon", "bedtime", "night",
    ],
}

# Keywords that are tagged on their own.
# Note: "after" is listed twice under INSTRUCTION, and "sore throat" is a
# two-word entry while the tokenizer in this notebook emits single words.
keywords2 = {
    "FREQUENCY": ["once", "twice"],
    "INSTRUCTION": [
        "when", "needed", "after", "use", "before", "after", "with",
        "without", "meal", "meals", "food", "swallow", "chew",
    ],
    "NOTE": [
        "fever", "pain", "cough", "cold", "flu", "runny", "allergy",
        "infection", "inflammation", "swelling", "sore throat", "headache",
        "nausea", "gastric", "drowsiness", "vomiting", "diarrhea",
        "constipation", "rash", "itching", "fatigue", "dizziness",
    ],
}


# Break a sentence into lowercase word tokens with a regex
def tokenize(text):
    """Return the lowercase word tokens of ``text``.

    A token is a run of word characters, optionally followed by a single
    "/<word>" suffix so that forms like "tab/s" stay in one piece.
    Punctuation and whitespace are discarded.
    """
    lowered = text.lower()
    return [match.group(0) for match in re.finditer(r"\b\w+(?:/\w+)?\b", lowered)]

# Auto-label function (basic BIO tagging)
def auto_label(tokens):
    """Assign a heuristic BIO label to every token.

    The stages run in this order, using the module-level ``keywords1`` and
    ``keywords2`` tables:

      1. MEDICATION_NAME -- the first alphabetic token longer than three
         characters that is neither a known keyword nor a quantity word.
      2. DOSAGE -- a quantity word followed by a dosage keyword. With one
         such pair it is tagged; with two or more, only the second is.
      3. FREQUENCY -- each frequency keyword. It is paired with the token in
         front of it when that token is still unlabeled; it continues a
         FREQUENCY span that ends right before it; otherwise it starts a
         span by itself, so labels from earlier stages are never overwritten.
      4. ``keywords2`` groups -- each still-unlabeled keyword gets ``B-<group>``
         and an immediately following keyword of the same group ``I-<group>``.

    Args:
        tokens: List of token strings (``tokenize`` yields lowercase tokens;
            comparison is case-insensitive regardless).

    Returns:
        A list of label strings, one per token, "O" where nothing matched.
    """
    labels = ["O"] * len(tokens)
    lowered = [token.lower() for token in tokens]

    quantity_words = {
        "1", "2", "3", "4", "5", "10", "one", "two", "three", "four", "five", "half", "quarter"
    }
    dosage_words = set(keywords1["DOSAGE"])
    frequency_words = set(keywords1["FREQUENCY"])

    # Collect known keywords
    known_keywords = set()
    for group in [*keywords1.values(), *keywords2.values()]:
        known_keywords.update(word.lower() for word in group)

    # MEDICATION_NAME tagging: tag the first unknown word only.
    # Quantity words ("half", "three", ...) are excluded so they are not taken
    # as the name and then overwritten by the DOSAGE stage.
    for i, token_lower in enumerate(lowered):
        if (
            token_lower not in known_keywords
            and token_lower not in quantity_words
            and token_lower.isalpha()
            and len(tokens[i]) > 3
        ):
            labels[i] = "B-MEDICATION_NAME"
            break

    # DOSAGE tagging
    dosage_candidates = [
        i for i in range(1, len(tokens))
        if lowered[i] in dosage_words and lowered[i - 1] in quantity_words
    ]
    if dosage_candidates:
        # With several candidates only the second one is tagged.
        i = dosage_candidates[1] if len(dosage_candidates) >= 2 else dosage_candidates[0]
        labels[i - 1] = "B-DOSAGE"
        labels[i] = "I-DOSAGE"

    # FREQUENCY tagging
    for i, token_lower in enumerate(lowered):
        if token_lower not in frequency_words:
            continue
        previous = labels[i - 1] if i > 0 else None
        if previous == "O":
            # Usual case, e.g. "3 times", "every morning".
            labels[i - 1] = "B-FREQUENCY"
            labels[i] = "I-FREQUENCY"
        elif previous in ("B-FREQUENCY", "I-FREQUENCY"):
            # Consecutive frequency keywords extend the current span.
            labels[i] = "I-FREQUENCY"
        else:
            # First token, or the preceding token already belongs to another
            # entity (e.g. I-DOSAGE): start the span on the keyword itself.
            labels[i] = "B-FREQUENCY"

    # keywords2 tagging
    for i, token_lower in enumerate(lowered):
        for label, keyword_list in keywords2.items():
            if token_lower in keyword_list and labels[i] == "O":
                labels[i] = f"B-{label.upper()}"
                if i + 1 < len(tokens) and lowered[i + 1] in keyword_list:
                    labels[i + 1] = f"I-{label.upper()}"

    return labels



# Load the CSV, drop rows without text and trim surrounding whitespace.
# dropna + assign build new frames instead of assigning into a filtered
# slice, which avoids pandas' SettingWithCopyWarning.
df = pd.read_csv(csv_file).dropna(subset=[column_name])
df = df.assign(**{column_name: df[column_name].astype(str).str.strip()})

# Build token-label pairs, one record per row
data = []
for text in df[column_name]:
    tokens = tokenize(text)
    data.append({"tokens": tokens, "labels": auto_label(tokens)})

# Save to JSON
with open("auto_labeled_ner_data50.json", "w") as f:
    json.dump(data, f, indent=2)

print("Auto-labeling complete!")




Auto-labeling complete!


Augment the 50 newly labeled samples to increase the total to 300 samples

In [None]:
import json
import random
import nltk
from nltk.corpus import wordnet
import os

# Fetch the WordNet corpus used for the synonym lookups below
nltk.download('wordnet')

# Read the auto-labeled samples written by the labeling step
with open("auto_labeled_ner_data50.json", "r") as labeled_file:
    original_data = json.load(labeled_file)

# Get synonyms
def get_synonyms(word):
    """Return the single-word WordNet synonyms of ``word``, sorted.

    Lemma names from every synset of ``word`` are lowercased and have "_"
    replaced by a space. Candidates equal to the lowercased input or not
    purely alphabetic (which includes multi-word lemmas) are discarded.

    The result is sorted: iterating a set of strings directly gives an order
    that can change from one interpreter run to the next, which would make a
    seeded ``random.choice`` over the result non-reproducible.

    Args:
        word: The token to look up.

    Returns:
        A sorted list of distinct lowercase synonyms; empty when none exist.
    """
    target = word.lower()
    synonyms = set()
    for synset in wordnet.synsets(word):
        for lemma in synset.lemmas():
            candidate = lemma.name().replace("_", " ").lower()
            if candidate != target and candidate.isalpha():
                synonyms.add(candidate)
    return sorted(synonyms)

# Augment one sample
def augment_sample(sample, num_aug=5, replace_prob=0.3, rng=None):
    """Create augmented copies of a sample by swapping "O" tokens for synonyms.

    Only tokens labeled "O" are candidates, so entity tokens and the label
    sequence stay aligned. Each candidate is replaced with probability
    ``replace_prob`` by a random synonym from ``get_synonyms``; tokens
    without synonyms are kept as they are.

    Args:
        sample: Dict with "tokens" and "labels" sequences of equal length.
        num_aug: Number of augmented copies to produce.
        replace_prob: Probability of replacing each "O" token (default 0.3).
        rng: Optional ``random.Random`` instance to draw from. When omitted,
            the module-level ``random`` functions are used.

    Returns:
        A list of ``num_aug`` dicts, each with a new "tokens" list and an
        independent copy of the labels.
    """
    tokens, labels = sample['tokens'], sample['labels']
    source = rng if rng is not None else random
    synonym_cache = {}  # avoids repeating the same lookup across copies
    augmented = []

    for _ in range(num_aug):
        new_tokens = []
        for token, label in zip(tokens, labels):
            new_token = token
            if label == "O" and source.random() < replace_prob:
                if token not in synonym_cache:
                    synonym_cache[token] = get_synonyms(token)
                synonyms = synonym_cache[token]
                if synonyms:
                    new_token = source.choice(synonyms)
            new_tokens.append(new_token)

        augmented.append({
            "tokens": new_tokens,
            "labels": list(labels)
        })

    return augmented

#  Apply to all data
# Seed the global RNG so that re-running this cell draws the same sequence
# of random numbers. The chosen synonyms additionally depend on the order of
# the list returned by get_synonyms.
AUGMENTATION_SEED = 42
random.seed(AUGMENTATION_SEED)

augmented_data = []
for sample in original_data:
    augmented_data.append(sample)  # keep original
    augmented_data.extend(augment_sample(sample, num_aug=5))  # add 5 augmented

#  Save to file
output_file = "50augmented_data.json"
with open(output_file, "w") as f:
    json.dump(augmented_data, f, indent=2)

print(f"✅ Augmented data saved to {output_file} with {len(augmented_data)} samples.")


[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\prisc\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


✅ Augmented data saved to 50augmented_data.json with 300 samples.


Manually combine clean_clean.json (116 samples) and 50augmented_data.json (300 samples) into combined_for_bert_training.json

In [3]:
import json

# Misspelled tag found in the combined file, and the tag it should be
MISSPELLED_LABEL = "I-NSTRUCTION"
CORRECT_LABEL = "I-INSTRUCTION"

with open("combined_for_bert_training.json", "r") as f:
    data = json.load(f)

# Rewrite each example's label list with the misspelled tag corrected
for example in data:
    corrected = []
    for label in example["labels"]:
        if label == MISSPELLED_LABEL:
            corrected.append(CORRECT_LABEL)
        else:
            corrected.append(label)
    example["labels"] = corrected

# Write the corrected data set to its own file
with open("labels_for_bert_training.json", "w") as f:
    json.dump(data, f, indent=2)

print("labels_for_bert_training.json")


labels_for_bert_training.json
