In [7]:
#!/usr/bin/env python3
"""
Quick test of current model with improved preprocessing
No retraining required!
"""

from transformers import pipeline, BertTokenizerFast, BertForTokenClassification
import re

def test_current_model_with_improvements():
    """Test current model with improved preprocessing"""
    
    print("=== Testing Current Model with Improved Processing ===")
    
    # Load your current model directly to handle the size mismatch
    model_path = r"C:\Users\prisc\OneDrive\Desktop\Github\MediMind\ml\new_tesseract_ner\ner_model5"
    tokenizer = BertTokenizerFast.from_pretrained(model_path)
    
    # Create model with correct number of labels (9 based on the saved weights)
    model = BertForTokenClassification.from_pretrained(
        model_path, 
        num_labels=11,  # Use 9 labels to match the saved weights
        ignore_mismatched_sizes=True
    )
    ner_pipeline = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="none")
    
    def preprocess_text(text):
        """Clean and preprocess OCR text to reduce noise"""
        # Remove excessive whitespace and newlines
        text = re.sub(r'\n+', ' ', text)
        text = re.sub(r'\s+', ' ', text)
        
        # Remove common OCR artifacts
        text = re.sub(r'[^\w\s\-\.\,\:\;\(\)\/]', '', text)
        
        # Normalize common medication-related terms
        text = re.sub(r'\bTAB\b', 'TABLET', text, flags=re.IGNORECASE)
        text = re.sub(r'\bMG\b', 'MG', text)
        text = re.sub(r'\bML\b', 'ML', text)
        
        return text.strip()
    
    def enhanced_ner_processing(text, confidence_threshold=0.6):
        """Enhanced NER processing with better noise handling"""
        
        # Preprocess text
        cleaned_text = preprocess_text(text)
        print(f"Cleaned text: {cleaned_text}")
        
        # Get NER results
        ner_results = ner_pipeline(cleaned_text)
        
        # Filter by confidence and merge subwords
        filtered_results = []
        current_entity = None
        
        for result in ner_results:
            if result['score'] >= confidence_threshold:
                word = result['word']
                entity = result['entity']
                
                # Handle subword tokens
                if word.startswith('##'):
                    if current_entity:
                        current_entity['word'] += word[2:]
                else:
                    if current_entity:
                        filtered_results.append(current_entity)
                    current_entity = {
                        'word': word,
                        'entity': entity,
                        'score': result['score']
                    }
        
        if current_entity:
            filtered_results.append(current_entity)
        
        return filtered_results
    
    # Your noisy input
    noisy_text = """KEEP FluVOXAMine MALEATE SOMG TAB
STEP 3: TAKE HALF TABLET EVERY OTHER NIGHT
TAKE WITH OR AFTER FOOD. AVOID ALCOHOL.
TEL: 6930 2262"""
    
    print("Original noisy text:")
    print(noisy_text)
    print("\n" + "="*50)
    
    # Test with improved processing
    results = enhanced_ner_processing(noisy_text)
    
    print("\nImproved NER Results:")
    for result in results:
        print(f"{result['word']} -> {result['entity']} (confidence: {result['score']:.3f})")
    
    # Group by entity type
    grouped = {}
    for result in results:
        entity_type = result['entity'].replace('B-', '').replace('I-', '')
        if entity_type not in grouped:
            grouped[entity_type] = []
        grouped[entity_type].append({
            'word': result['word'],
            'score': result['score']
        })
    
    print("\nGrouped Results:")
    for entity_type, items in grouped.items():
        print(f"{entity_type}: {[item['word'] for item in items]}")
    
    # Extract structured information
    print("\nExtracted Information:")
    
    # Medication name
    med_names = grouped.get("MEDICATION_NAME", [])
    if med_names:
        best_med = max(med_names, key=lambda x: x['score'])
        print(f"Medication: {best_med['word']} (confidence: {best_med['score']:.3f})")
    
    # Dosage
    dosages = grouped.get("DOSAGE", [])
    if dosages:
        best_dosage = max(dosages, key=lambda x: x['score'])
        print(f"Dosage: {best_dosage['word']} (confidence: {best_dosage['score']:.3f})")
    
    # Instructions
    instructions = grouped.get("INSTRUCTION", [])
    if instructions:
        print(f"Instructions: {' '.join([inst['word'] for inst in instructions])}")
    
    # Notes
    notes = grouped.get("NOTE", [])
    if notes:
        print(f"Notes: {' '.join([note['word'] for note in notes])}")

if __name__ == "__main__":
    test_current_model_with_improvements()


=== Testing Current Model with Improved Processing ===
Original noisy text:
KEEP FluVOXAMine MALEATE SOMG TAB
STEP 3: TAKE HALF TABLET EVERY OTHER NIGHT
TAKE WITH OR AFTER FOOD. AVOID ALCOHOL.
TEL: 6930 2262

Cleaned text: KEEP FluVOXAMine MALEATE SOMG TABLET STEP 3: TAKE HALF TABLET EVERY OTHER NIGHT TAKE WITH OR AFTER FOOD. AVOID ALCOHOL. TEL: 6930 2262

Improved NER Results:

Grouped Results:

Extracted Information:


In [None]:
import os, json, torch, pprint

p = r"C:\Users\prisc\OneDrive\Desktop\Github\MediMind\ml\new_tesseract_ner\ner_model5"

# -- Read config.json safely
cfg_path = os.path.join(p, "config.json")
with open(cfg_path, "r") as f:
    cfg = json.load(f)

print("== CONFIG KEYS ==")
print(sorted(cfg.keys())[:20], "...")
print("\n_config.model_type:", cfg.get("model_type"))
print("_config.id2label present?:", "id2label" in cfg)
print("_config.num_labels present?:", "num_labels" in cfg)

# Derive label count from id2label if needed
derived_num_labels = len(cfg.get("id2label", {})) if "id2label" in cfg else None
print("Derived num_labels from id2label:", derived_num_labels)

# -- Load weights (bin or safetensors)
sd = None
binp = os.path.join(p, "pytorch_model.bin")
stp = os.path.join(p, "model.safetensors")

if os.path.exists(binp):
    sd = torch.load(binp, map_location="cpu")
    print("\nLoaded pytorch_model.bin")
elif os.path.exists(stp):
    from safetensors.torch import load_file
    sd = load_file(stp)
    print("\nLoaded model.safetensors")
else:
    print("\nNo weights file found in:", p)

# -- Print classifier head shapes if weights found
if sd is not None:
    w_shape = b_shape = None
    for k, v in sd.items():
        if k.endswith("classifier.weight"):
            w_shape = tuple(v.shape)
        elif k.endswith("classifier.bias"):
            b_shape = tuple(v.shape)
    print("classifier.weight shape:", w_shape)
    print("classifier.bias shape:", b_shape)


== CONFIG KEYS ==
['_name_or_path', 'architectures', 'attention_probs_dropout_prob', 'classifier_dropout', 'gradient_checkpointing', 'hidden_act', 'hidden_dropout_prob', 'hidden_size', 'id2label', 'initializer_range', 'intermediate_size', 'label2id', 'layer_norm_eps', 'max_position_embeddings', 'model_type', 'num_attention_heads', 'num_hidden_layers', 'pad_token_id', 'position_embedding_type', 'torch_dtype'] ...

_config.model_type: bert
_config.id2label present?: True
_config.num_labels present?: False
Derived num_labels from id2label: 11

Loaded pytorch_model.bin
classifier.weight shape: (11, 768)
classifier.bias shape: (11,)


In [12]:
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
from collections import defaultdict
import json

# Load NER model
model_path = r"C:\Users\prisc\OneDrive\Desktop\Github\MediMind\ml\new_tesseract_ner\ner_model5"
tokenizer = AutoTokenizer.from_pretrained(model_path, local_files_only=True)
model = AutoModelForTokenClassification.from_pretrained(model_path, local_files_only=True)
ner_pipeline = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="none")

# Converts word-like numbers to numeric
def word_to_number(word):
    word_map = {
        "one": 1, "1": 1,
        "two": 2, "2": 2,
        "three": 3, "3": 3,
        "four": 4, "4": 4,
        "half": 0.5, "quarter": 0.25
    }
    return word_map.get(word.lower())

# Merges BERT subword tokens (e.g., lo ##rata ##dine → loratadine)
def merge_subwords(entities):
    merged = []
    for ent in entities:
        if ent["word"].startswith("##") and merged:
            merged[-1]["word"] += ent["word"][2:]
        else:
            merged.append(ent.copy())
    return merged

# Groups entities by label (removes BIO prefix)
def group_entities_by_label(entities):
    grouped = defaultdict(list)
    current_label = None
    current_words = []

    for ent in entities:
        tag = ent["entity"]
        if "-" in tag:
            prefix, label = tag.split("-")
        else:
            prefix, label = "O", tag

        if prefix == "B":
            if current_label and current_words:
                grouped[current_label].append(" ".join(current_words))
            current_label = label
            current_words = [ent["word"]]
        elif prefix == "I" and label == current_label:
            current_words.append(ent["word"])
        else:
            if current_label and current_words:
                grouped[current_label].append(" ".join(current_words))
            current_label = None
            current_words = []

            if prefix == "B":
                current_label = label
                current_words = [ent["word"]]

    if current_label and current_words:
        grouped[current_label].append(" ".join(current_words))

    return grouped

# Cleans text like "##tablet" to "tablet"
def clean_text(text):
    return text.replace(" ##", "").replace("##", "").strip()

# Main function to process input text
def infer_and_format(text):
    raw_output = ner_pipeline(text)
    merged_output = merge_subwords(raw_output)
    grouped_output = group_entities_by_label(merged_output)

    final = {}

    # Medication Name
    meds = grouped_output.get("MEDICATION_NAME", [])
    if meds:
        final["medicationName"] = clean_text(meds[0])

    # Dosage
    dosages = grouped_output.get("DOSAGE", [])
    quantity = 0
    for d in dosages:
        for word in d.split():
            num = word_to_number(word)
            if num is not None:
                quantity = num
                break
        if quantity: break
    final["intakeQuantity"] = quantity

    # Frequency
    freqs = grouped_output.get("FREQUENCY", [])
    freq_number = 0
    for f in freqs:
        for word in f.split():
            num = word_to_number(word)
            if num is not None:
                freq_number = num
                break
        if freq_number: break
    final["frequency"] = freq_number

    # Instructions
    instr = grouped_output.get("INSTRUCTION", [])
    if instr:
        final["instructions"] = clean_text(" ".join(instr))

    # Notes
    notes = grouped_output.get("NOTE", [])
    if notes:
        final["notes"] = clean_text(" ".join(notes))

    # Save JSON
    with open("ner_output2.json", "w", encoding="utf-8") as f:
        json.dump(final, f, indent=4)

    # Print final JSON only
    print(json.dumps(final, indent=4))
    return final

# 🔍 Run test
infer_and_format("Paracetamol Take 1 tablet 2 times a day after food for fever and pain relief")


{
    "medicationName": "Paracetamol",
    "intakeQuantity": 1,
    "frequency": 2,
    "instructions": "after food",
    "notes": "fever pain"
}


{'medicationName': 'Paracetamol',
 'intakeQuantity': 1,
 'frequency': 2,
 'instructions': 'after food',
 'notes': 'fever pain'}

In [11]:
from transformers import pipeline, BertTokenizerFast, BertForTokenClassification
import re

MODEL_PATH = r"C:\Users\prisc\OneDrive\Desktop\Github\MediMind\ml\new_tesseract_ner\ner_model5"

def preprocess_text(text: str) -> str:
    text = re.sub(r'\s+', ' ', text).strip()
    # light OCR normalizations
    text = re.sub(r'\bTAB\b', 'TABLET', text, flags=re.IGNORECASE)
    text = re.sub(r'\bS0MG\b', '50MG', text, flags=re.IGNORECASE)   # common OCR slip (zero vs O)
    text = re.sub(r'\bSOMG\b', '50MG', text, flags=re.IGNORECASE)   # your sample had "SOMG"
    return text

# Load EXACTLY as trained (11 labels); no ignore_mismatched_sizes
tok = BertTokenizerFast.from_pretrained(MODEL_PATH, local_files_only=True)
model = BertForTokenClassification.from_pretrained(MODEL_PATH, local_files_only=True)

print("num_labels:", model.num_labels)
print("id2label:", model.config.id2label)

ner = pipeline("ner", model=model, tokenizer=tok, aggregation_strategy="simple")

noisy_text = """KEEP FluVOXAMine MALEATE SOMG TAB
STEP 3: TAKE HALF TABLET EVERY OTHER NIGHT
TAKE WITH OR AFTER FOOD. AVOID ALCOHOL.
TEL: 6930 2262"""

cleaned = preprocess_text(noisy_text)
print("Cleaned:", cleaned)

preds = ner(cleaned)  # [{'entity_group': 'BLAH', 'word': 'xxx', 'score': 0.87, ...}, ...]

# Optional confidence filter
preds = [p for p in preds if p['score'] >= 0.50]

print("\nPredictions:")
for p in preds:
    print(f"{p['word']} -> {p['entity_group']} ({p['score']:.3f})")

# Quick grouping
from collections import defaultdict
grouped = defaultdict(list)
for p in preds:
    grouped[p['entity_group']].append(p['word'])

print("\nGrouped:")
for k,v in grouped.items():
    print(k, ":", v)


num_labels: 11
id2label: {0: 'B-DOSAGE', 1: 'I-DOSAGE', 2: 'B-FREQUENCY', 3: 'I-FREQUENCY', 4: 'B-INSTRUCTION', 5: 'I-INSTRUCTION', 6: 'B-MEDICATION_NAME', 7: 'I-MEDICATION_NAME', 8: 'B-NOTE', 9: 'I-NOTE', 10: 'O'}
Cleaned: KEEP FluVOXAMine MALEATE 50MG TABLET STEP 3: TAKE HALF TABLET EVERY OTHER NIGHT TAKE WITH OR AFTER FOOD. AVOID ALCOHOL. TEL: 6930 2262

Predictions:
F -> MEDICATION_NAME (0.506)

Grouped:
MEDICATION_NAME : ['F']


In [17]:
#!/usr/bin/env python3
"""
Preprocess -> 11-label NER (no Tesseract)

- Uses your preprocess_ocr_text() exactly (with the tiny regex fix).
- Loads model from MODEL_DIR.
- Runs NER with aggregation and prints a structured summary.

If you want the raw TEST_TEXT lowercased before preprocessing,
set LOWERCASE_TEST_TEXT = True.
"""

import os
import re
from typing import List, Dict, Any
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline

# === Configuration ===
MODEL_DIR = r"C:\Users\prisc\OneDrive\Desktop\Github\MediMind\ml\new_tesseract_ner\ner_model5"
CONF_THRESHOLD = 0.35
LOWERCASE_TEST_TEXT = True  # <-- toggle to True if you want test text lowercased

TEST_TEXT = """KEEP AWAY FROM CHILDREN Total: 15
2 2C-1-112 145 TAB (1/1) eee
= FluVOXAMine MALEATE SOMG TAB

z
<
3
=
S
=
°
5

STEP 3: TAKE HALF TABLET EVERY OTHER NIGHT
TAKE WITH OR AFTER FOOD. AVOID ALCOHOL.

DIONIS WEE YUN RU °y-SKH 4010/2023
XXXXx982H
MDS / SOC1-SKH-23-1505924001 00003WEK

SENGKANG GENERAL

§ at
HOSPITAL, Outpatient Pharmacy
410 Sengkang East Way, Singapore 644886 TEL: 6930 2262"""
 

# === Your preprocessing function (kept as-is except for a tiny regex fix) ===
def preprocess_ocr_text(text) -> str:
    if not isinstance(text, str):
        return ""
    # Replace known OCR noise characters
    text = text.replace("â€œ", "").replace("â€", "").replace("â€˜", "").replace("â€™", "'")

    # Remove unwanted symbols
    # (Fix: removed a stray empty alternative '||' that could over-match)
    text = re.sub(r"(â|Â|¢|§|«|©|®|€|“|”|‘|’|™|…|_|=|•|—|–|@|%|<|>|\\|\||~|`)", "", text)

    # Fix common formatting issues
    text = re.sub(r"(\d)(tab/s|tablet[s]?|cap[s]?|capsule[s]?)", r"\1 tablet", text, flags=re.IGNORECASE)
    text = re.sub(r"(\d)(times)", r"\1 times", text, flags=re.IGNORECASE)
    text = re.sub(r"(\d)\s*x\s*(a|per)?\s*day", r"\1 times a day", text, flags=re.IGNORECASE)

    # Normalize known expressions
    replacements = {
        "twice a day": "2 times a day",
        "three times daily": "3 times a day",
        "when necessary": "when needed",
        "when required": "when needed",
    }
    for wrong, correct in replacements.items():
        text = re.sub(rf"\b{wrong}\b", correct, text, flags=re.IGNORECASE)

    # remove irrelevant data
    lines = text.splitlines()
    cleaned = []
    for line in lines:
        # Skip if line contains clinic/address info
        if any(x in line for x in ["clinic", "centre", "hospital", "#", "blk", "building", "road", "s "]):
            continue
        # Skip prices and quantities
        if re.search(r"\bqty\b|\bprice\b|\$\d+|\d+\.\d{2}", line):
            continue
        cleaned.append(line)

    # Remove extra whitespace
    text = re.sub(r"\s+", " ", text)
    return text.strip()

def group_entities(preds: List[Dict[str, Any]], min_score: float) -> Dict[str, List[str]]:
    grouped: Dict[str, List[str]] = {}
    for p in preds:
        if p.get("score", 0) < min_score:
            continue
        k = p.get("entity_group") or p.get("entity")
        if not k:
            continue
        grouped.setdefault(k, []).append(p["word"])
    return grouped

def main():
    raw_text = TEST_TEXT.lower() if LOWERCASE_TEST_TEXT else TEST_TEXT
    print("=== Raw text ===")
    print(raw_text)

    cleaned = preprocess_ocr_text(raw_text)
    print("\n=== Cleaned text ===")
    print(cleaned)

    # Load NER (11 labels)
    tok = AutoTokenizer.from_pretrained(MODEL_DIR, local_files_only=True)
    model = AutoModelForTokenClassification.from_pretrained(MODEL_DIR, local_files_only=True)
    print("\nModel num_labels:", model.num_labels)
    print("Model id2label:", model.config.id2label)

    ner = pipeline("ner", model=model, tokenizer=tok, aggregation_strategy="simple")

    preds = ner(cleaned)
    print("\n=== Aggregated predictions ===")
    for p in preds:
        print(f"{p['word']:20} -> {p['entity_group']:18} ({p['score']:.3f})")

    grouped = group_entities(preds, min_score=CONF_THRESHOLD)
    print("\n=== Grouped entities (threshold {:.2f}) ===".format(CONF_THRESHOLD))
    for k, v in grouped.items():
        print(f"{k:18}: {v}")

    # Structured summary
    join = lambda xs: " ".join(xs) if xs else ""
    summary = {
        "medication":  join(grouped.get("MEDICATION_NAME", [])),
        "dosage":      join(grouped.get("DOSAGE", [])),
        "frequency":   join(grouped.get("FREQUENCY", [])),
        "instruction": join(grouped.get("INSTRUCTION", [])),
        "note":        join(grouped.get("NOTE", [])),
        "text":        cleaned,
    }

    print("\n=== Structured ===")
    for k, v in summary.items():
        print(f"{k:12}: {v}")

if __name__ == "__main__":
    main()


=== Raw text ===
keep away from children total: 15
2 2c-1-112 145 tab (1/1) eee
= fluvoxamine maleate somg tab

z
<
3
=
s
=
°
5

step 3: take half tablet every other night
take with or after food. avoid alcohol.

dionis wee yun ru °y-skh 4010/2023
xxxxx982h
mds / soc1-skh-23-1505924001 00003wek

sengkang general

§ at
hospital, outpatient pharmacy
410 sengkang east way, singapore 644886 tel: 6930 2262

=== Cleaned text ===
keep away from children total: 15 2 2c-1-112 145 tab (1/1) eee fluvoxamine maleate somg tab z 3 s ° 5 step 3: take half tablet every other night take with or after food. avoid alcohol. dionis wee yun ru °y-skh 4010/2023 xxxxx982h mds / soc1-skh-23-1505924001 00003wek sengkang general at hospital, outpatient pharmacy 410 sengkang east way, singapore 644886 tel: 6930 2262

Model num_labels: 11
Model id2label: {0: 'B-DOSAGE', 1: 'I-DOSAGE', 2: 'B-FREQUENCY', 3: 'I-FREQUENCY', 4: 'B-INSTRUCTION', 5: 'I-INSTRUCTION', 6: 'B-MEDICATION_NAME', 7: 'I-MEDICATION_NAME', 8: 'B-N

In [25]:
#!/usr/bin/env python3
"""
Preprocess -> 11-label NER (no Tesseract)

- Uses your preprocess_ocr_text() exactly (with the tiny regex fix).
- Loads model from MODEL_DIR.
- Runs NER with aggregation and prints a structured summary.

If you want the raw TEST_TEXT lowercased before preprocessing,
set LOWERCASE_TEST_TEXT = True.
"""

import os
import re
from typing import List, Dict, Any
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline

# === Configuration ===
MODEL_DIR = r"C:\Users\prisc\OneDrive\Desktop\Github\MediMind\ml\new_tesseract_ner\ner_model5"
CONF_THRESHOLD = 0.35
LOWERCASE_TEST_TEXT = True  # <-- toggle to True if you want test text lowercased

TEST_TEXT = """KEEP FluVOXAMine MALEATE SOMG TAB
STEP 3: TAKE HALF TABLET EVERY OTHER NIGHT
TAKE WITH OR AFTER FOOD. AVOID ALCOHOL.
TEL: 6930 2262"""

# === Your preprocessing function (kept as-is except for a tiny regex fix) ===
def preprocess_ocr_text(text) -> str:
    if not isinstance(text, str):
        return ""
    # Replace known OCR noise characters
    text = text.replace("â€œ", "").replace("â€", "").replace("â€˜", "").replace("â€™", "'")

    # Remove unwanted symbols
    # (Fix: removed a stray empty alternative '||' that could over-match)
    text = re.sub(r"(â|Â|¢|§|«|©|®|€|“|”|‘|’|™|…|_|=|•|—|–|@|%|<|>|\\|\||~|`)", "", text)

    # Fix common formatting issues
    text = re.sub(r"(\d)(tab/s|tablet[s]?|cap[s]?|capsule[s]?)", r"\1 tablet", text, flags=re.IGNORECASE)
    text = re.sub(r"(\d)(times)", r"\1 times", text, flags=re.IGNORECASE)
    text = re.sub(r"(\d)\s*x\s*(a|per)?\s*day", r"\1 times a day", text, flags=re.IGNORECASE)

    # Normalize known expressions
    replacements = {
        "twice a day": "2 times a day",
        "three times daily": "3 times a day",
        "when necessary": "when needed",
        "when required": "when needed",
    }
    for wrong, correct in replacements.items():
        text = re.sub(rf"\b{wrong}\b", correct, text, flags=re.IGNORECASE)

    # remove irrelevant data
    lines = text.splitlines()
    cleaned = []
    for line in lines:
        # Skip if line contains clinic/address info
        if any(x in line for x in ["clinic", "centre", "hospital", "#", "blk", "building", "road", "s "]):
            continue
        # Skip prices and quantities
        if re.search(r"\bqty\b|\bprice\b|\$\d+|\d+\.\d{2}", line):
            continue
        cleaned.append(line)

    # Remove extra whitespace
    text = re.sub(r"\s+", " ", text)
    return text.strip()

def group_entities(preds: List[Dict[str, Any]], min_score: float) -> Dict[str, List[str]]:
    grouped: Dict[str, List[str]] = {}
    for p in preds:
        if p.get("score", 0) < min_score:
            continue
        k = p.get("entity_group") or p.get("entity")
        if not k:
            continue
        grouped.setdefault(k, []).append(p["word"])
    return grouped

def main():
    raw_text = TEST_TEXT.lower() if LOWERCASE_TEST_TEXT else TEST_TEXT
    print("=== Raw text ===")
    print(raw_text)

    cleaned = preprocess_ocr_text(raw_text)
    print("\n=== Cleaned text ===")
    print(cleaned)

    # Load NER (11 labels)
    tok = AutoTokenizer.from_pretrained(MODEL_DIR, local_files_only=True)
    model = AutoModelForTokenClassification.from_pretrained(MODEL_DIR, local_files_only=True)
    print("\nModel num_labels:", model.num_labels)
    print("Model id2label:", model.config.id2label)

    ner = pipeline("ner", model=model, tokenizer=tok, aggregation_strategy="simple")

    preds = ner(cleaned)
    print("\n=== Aggregated predictions ===")
    for p in preds:
        print(f"{p['word']:20} -> {p['entity_group']:18} ({p['score']:.3f})")

    grouped = group_entities(preds, min_score=CONF_THRESHOLD)
    print("\n=== Grouped entities (threshold {:.2f}) ===".format(CONF_THRESHOLD))
    for k, v in grouped.items():
        print(f"{k:18}: {v}")

    # Structured summary
    join = lambda xs: " ".join(xs) if xs else ""
    summary = {
        "medication":  join(grouped.get("MEDICATION_NAME", [])),
        "dosage":      join(grouped.get("DOSAGE", [])),
        "frequency":   join(grouped.get("FREQUENCY", [])),
        "instruction": join(grouped.get("INSTRUCTION", [])),
        "note":        join(grouped.get("NOTE", [])),
        "text":        cleaned,
    }

    print("\n=== Structured ===")
    for k, v in summary.items():
        print(f"{k:12}: {v}")

if __name__ == "__main__":
    main()


=== Raw text ===
keep fluvoxamine maleate somg tab
step 3: take half tablet every other night
take with or after food. avoid alcohol.
tel: 6930 2262

=== Cleaned text ===
keep fluvoxamine maleate somg tab step 3: take half tablet every other night take with or after food. avoid alcohol. tel: 6930 2262

Model num_labels: 11
Model id2label: {0: 'B-DOSAGE', 1: 'I-DOSAGE', 2: 'B-FREQUENCY', 3: 'I-FREQUENCY', 4: 'B-INSTRUCTION', 5: 'I-INSTRUCTION', 6: 'B-MEDICATION_NAME', 7: 'I-MEDICATION_NAME', 8: 'B-NOTE', 9: 'I-NOTE', 10: 'O'}

=== Aggregated predictions ===
keep                 -> MEDICATION_NAME    (0.973)
flu                  -> MEDICATION_NAME    (0.988)
##vo                 -> MEDICATION_NAME    (0.988)
##xa                 -> MEDICATION_NAME    (0.995)
##mine               -> MEDICATION_NAME    (0.987)
##ate                -> MEDICATION_NAME    (0.690)
take                 -> NOTE               (0.593)
half tablet          -> DOSAGE             (0.985)
every                -> FREQU

In [26]:
#!/usr/bin/env python3
"""
Image -> Tesseract OCR -> preprocess_ocr_text -> 11-label NER
"""

import os
import re
from typing import List, Dict, Any
from PIL import Image
import pytesseract
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline

# === Paths / Config ===
IMAGE_PATH = r"C:\Users\prisc\Downloads\DionisMed.jpeg"
MODEL_DIR  = r"C:\Users\prisc\OneDrive\Desktop\Github\MediMind\ml\new_tesseract_ner\ner_model5"

TESS_LANG = "eng"
TESS_CFG  = "--oem 3 --psm 6"   # block of text
CONF_THRESHOLD = 0.35
LOWERCASE_AFTER_OCR = True     # keep False (you said lowercase affects med-name matching)

# Try to auto-detect Tesseract on Windows if not on PATH
if os.name == "nt":
    default_tess = r"C:\Program Files\Tesseract-OCR\tesseract.exe"
    if os.path.exists(default_tess):
        pytesseract.pytesseract.tesseract_cmd = default_tess

# === Your preprocessing function (unchanged except for a tiny regex fix) ===
def preprocess_ocr_text(text) -> str:
    if not isinstance(text, str):
        return ""
    # Replace known OCR noise characters
    text = text.replace("â€œ", "").replace("â€", "").replace("â€˜", "").replace("â€™", "'")

    # Remove unwanted symbols
    # (Fix: removed a stray empty alternative '||' that could over-match)
    text = re.sub(r"(â|Â|¢|§|«|©|®|€|“|”|‘|’|™|…|_|=|•|—|–|@|%|<|>|\\|\||~|`)", "", text)

    # Fix common formatting issues
    text = re.sub(r"(\d)(tab/s|tablet[s]?|cap[s]?|capsule[s]?)", r"\1 tablet", text, flags=re.IGNORECASE)
    text = re.sub(r"(\d)(times)", r"\1 times", text, flags=re.IGNORECASE)
    text = re.sub(r"(\d)\s*x\s*(a|per)?\s*day", r"\1 times a day", text, flags=re.IGNORECASE)

    # Normalize known expressions
    replacements = {
        "twice a day": "2 times a day",
        "three times daily": "3 times a day",
        "when necessary": "when needed",
        "when required": "when needed",
    }
    for wrong, correct in replacements.items():
        text = re.sub(rf"\b{wrong}\b", correct, text, flags=re.IGNORECASE)

    # remove irrelevant data
    lines = text.splitlines()
    cleaned = []
    for line in lines:
        # Skip if line contains clinic/address info
        if any(x in line for x in ["clinic", "centre", "hospital", "#", "blk", "building", "road", "s "]):
            continue
        # Skip prices and quantities
        if re.search(r"\bqty\b|\bprice\b|\$\d+|\d+\.\d{2}", line):
            continue
        cleaned.append(line)

    # Remove extra whitespace
    text = re.sub(r"\s+", " ", text)
    return text.strip()

def group_entities(preds: List[Dict[str, Any]], min_score: float) -> Dict[str, List[str]]:
    grouped: Dict[str, List[str]] = {}
    for p in preds:
        if p.get("score", 0) < min_score:
            continue
        k = p.get("entity_group") or p.get("entity")
        if not k:
            continue
        grouped.setdefault(k, []).append(p["word"])
    return grouped

def main():
    # 1) OCR from image
    if not os.path.exists(IMAGE_PATH):
        raise FileNotFoundError(f"Image not found: {IMAGE_PATH}")
    img = Image.open(IMAGE_PATH).convert("RGB")
    ocr_text = pytesseract.image_to_string(img, lang=TESS_LANG, config=TESS_CFG)
    if LOWERCASE_AFTER_OCR:
        ocr_text = ocr_text.lower()

    print("=== OCR raw ===")
    print(ocr_text)

    # 2) Preprocess
    cleaned = preprocess_ocr_text(ocr_text)
    print("\n=== Cleaned text ===")
    print(cleaned)

    # 3) Load NER (11 labels) & run
    tok = AutoTokenizer.from_pretrained(MODEL_DIR, local_files_only=True)
    model = AutoModelForTokenClassification.from_pretrained(MODEL_DIR, local_files_only=True)
    print("\nModel num_labels:", model.num_labels)
    print("Model id2label:", model.config.id2label)

    ner = pipeline("ner", model=model, tokenizer=tok, aggregation_strategy="simple")
    preds = ner(cleaned)

    print("\n=== Aggregated predictions ===")
    for p in preds:
        print(f"{p['word']:20} -> {p['entity_group']:18} ({p['score']:.3f})")

    grouped = group_entities(preds, min_score=CONF_THRESHOLD)
    print("\n=== Grouped entities (threshold {:.2f}) ===".format(CONF_THRESHOLD))
    for k, v in grouped.items():
        print(f"{k:18}: {v}")

    # 4) Structured summary
    join = lambda xs: " ".join(xs) if xs else ""
    summary = {
        "medication":  join(grouped.get("MEDICATION_NAME", [])),
        "dosage":      join(grouped.get("DOSAGE", [])),
        "frequency":   join(grouped.get("FREQUENCY", [])),
        "instruction": join(grouped.get("INSTRUCTION", [])),
        "note":        join(grouped.get("NOTE", [])),
        "ocr_text":    cleaned,
    }

    print("\n=== Structured ===")
    for k, v in summary.items():
        print(f"{k:12}: {v}")

if __name__ == "__main__":
    main()


=== OCR raw ===
bebrg fluvvoxamine maleate somg tab
step 3: take half tablet every other night
take with or after food. avoid alcohol.


=== Cleaned text ===
bebrg fluvvoxamine maleate somg tab step 3: take half tablet every other night take with or after food. avoid alcohol.

Model num_labels: 11
Model id2label: {0: 'B-DOSAGE', 1: 'I-DOSAGE', 2: 'B-FREQUENCY', 3: 'I-FREQUENCY', 4: 'B-INSTRUCTION', 5: 'I-INSTRUCTION', 6: 'B-MEDICATION_NAME', 7: 'I-MEDICATION_NAME', 8: 'B-NOTE', 9: 'I-NOTE', 10: 'O'}

=== Aggregated predictions ===
be                   -> MEDICATION_NAME    (0.991)
##b                  -> MEDICATION_NAME    (0.967)
##rg                 -> MEDICATION_NAME    (0.961)
##mine maleate       -> MEDICATION_NAME    (0.694)
take                 -> NOTE               (0.766)
half tablet          -> DOSAGE             (0.951)
every                -> FREQUENCY          (0.987)
other night          -> FREQUENCY          (0.727)
with or after food   -> INSTRUCTION        (0.986)
avoi

In [9]:
#!/usr/bin/env python3
"""
Image -> Tesseract OCR -> preprocess_ocr_text (UNMODIFIED) -> NER (agg='none')
-> merge subwords -> group IOB -> DrugBank fuzzy-correct -> structured JSON
"""

import os
import re
import json
import difflib
import pandas as pd
from collections import defaultdict
from typing import List, Dict, Any
from PIL import Image
import pytesseract
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline

# ======= PATHS / CONFIG =======
IMAGE_PATH   = r"C:\Users\prisc\Downloads\DionisMed.jpeg"
MODEL_DIR    = r"C:\Users\prisc\OneDrive\Desktop\Github\MediMind\ml\new_tesseract_ner\ner_model5"
DRUGBANK_CSV = r"C:\Users\prisc\OneDrive\Desktop\Github\MediMind\ml\drugbank_vocabulary.csv"

TESS_LANG = "eng"
TESS_CFG  = "--oem 3 --psm 6"   # block of text
LOWERCASE_AFTER_OCR = True      # <-- set False if you don't want lowercasing

# Try to auto-detect Tesseract on Windows if not on PATH
if os.name == "nt":
    default_tess = r"C:\Program Files\Tesseract-OCR\tesseract.exe"
    if os.path.exists(default_tess):
        pytesseract.pytesseract.tesseract_cmd = default_tess

# ======= PREPROCESSING (YOUR ORIGINAL VERSION, UNCHANGED) =======
def preprocess_ocr_text(text) -> str:
    if not isinstance(text, str):
        return ""
    # Replace known OCR noise characters
    text = text.replace("â€œ", "").replace("â€", "").replace("â€˜", "").replace("â€™", "'")

    # Remove unwanted symbols (original pattern preserved)
    text = re.sub(r"(â|Â|¢|§|«|©|®|€|“|”|‘|’|™|…|_|=||•|—|–|@|%|<|>|\\|\||~|`)", "", text)

    # Fix common formatting issues
    text = re.sub(r"(\d)(tab/s|tablet[s]?|cap[s]?|capsule[s]?)", r"\1 tablet", text, flags=re.IGNORECASE)
    text = re.sub(r"(\d)(times)", r"\1 times", text, flags=re.IGNORECASE)
    text = re.sub(r"(\d)\s*x\s*(a|per)?\s*day", r"\1 times a day", text, flags=re.IGNORECASE)

    # Normalize known expressions
    replacements = {
        "twice a day": "2 times a day",
        "three times daily": "3 times a day",
        "when necessary": "when needed",
        "when required": "when needed",
    }

    for wrong, correct in replacements.items():
        text = re.sub(rf"\b{wrong}\b", correct, text, flags=re.IGNORECASE)

    # remove irrelevant data
    lines = text.splitlines()
    cleaned = []

    for line in lines:
        # Skip if line contains clinic/address info
        if any(x in line for x in ["clinic", "centre", "hospital", "#", "blk", "building", "road", "s "]):
            continue
        # Skip prices and quantities
        if re.search(r"\bqty\b|\bprice\b|\$\d+|\d+\.\d{2}", line):
            continue
        cleaned.append(line)

    # Remove extra whitespace
    text = re.sub(r"\s+", " ", text)

    return text.strip()

# ======= POSTPROCESSING (YOURS) =======
def load_drugbank_vocab(csv_path, column="name"):
    df = pd.read_csv(csv_path)
    return df[column].dropna().str.lower().unique().tolist()

def correct_drug_name(name, drugbank_vocab):
    matches = difflib.get_close_matches(name.lower(), drugbank_vocab, n=1, cutoff=0.7)
    return matches[0] if matches else name

def word_to_number(word):
    word_map = {
        "one": 1, "1": 1,
        "two": 2, "2": 2,
        "three": 3, "3": 3,
        "four": 4, "4": 4,
        "half": 0.5, "quarter": 0.25
    }
    return word_map.get(word.lower())

def merge_subwords(entities):
    merged = []
    for ent in entities:
        if ent["word"].startswith("##") and merged:
            merged[-1]["word"] += ent["word"][2:]
        else:
            merged.append(ent.copy())
    return merged

def group_entities_by_label(entities):
    grouped = defaultdict(list)
    current_label = None
    current_words = []

    for ent in entities:
        tag = ent["entity"]
        prefix, label = tag.split("-") if "-" in tag else ("O", tag)

        if prefix == "B":
            if current_label and current_words:
                grouped[current_label].append(" ".join(current_words))
            current_label = label
            current_words = [ent["word"]]
        elif prefix == "I" and label == current_label:
            current_words.append(ent["word"])
        else:
            if current_label and current_words:
                grouped[current_label].append(" ".join(current_words))
            current_label = None
            current_words = []

            if prefix == "B":
                current_label = label
                current_words = [ent["word"]]

    if current_label and current_words:
        grouped[current_label].append(" ".join(current_words))

    return grouped

def clean_text(text):
    return text.replace(" ##", "").replace("##", "").strip()

def infer_and_format(text, drugbank_vocab, ner_pipeline):
    raw_output = ner_pipeline(text)
    merged_output = merge_subwords(raw_output)
    grouped_output = group_entities_by_label(merged_output)

    final = {}

    # Medication Name
    meds = grouped_output.get("MEDICATION_NAME", [])
    if meds:
        med_name = clean_text(meds[0])
        corrected = correct_drug_name(med_name, drugbank_vocab)
        final["medicationName"] = corrected

    # Dosage or Quantity
    dosages = grouped_output.get("DOSAGE", [])
    quantity = 0
    for d in dosages:
        for word in d.split():
            num = word_to_number(word)
            if num is not None:
                quantity = num
                break
        if quantity: break
    final["intakeQuantity"] = quantity

    # Frequency
    freqs = grouped_output.get("FREQUENCY", [])
    freq_number = 0
    for f in freqs:
        for word in f.split():
            num = word_to_number(word)
            if num is not None:
                freq_number = num
                break
        if freq_number: break
    final["frequency"] = freq_number

    # Instructions
    instr = grouped_output.get("INSTRUCTION", [])
    if instr:
        final["instructions"] = clean_text(" ".join(instr))

    # Notes
    notes = grouped_output.get("NOTE", [])
    if notes:
        final["notes"] = clean_text(" ".join(notes))

    # Save to JSON
    with open("ner_v5_output1.json", "w", encoding="utf-8") as f:
        json.dump(final, f, indent=4, ensure_ascii=False)

    return final

# ======= MAIN =======
def main():
    # 1) OCR
    if not os.path.exists(IMAGE_PATH):
        raise FileNotFoundError(f"Image not found: {IMAGE_PATH}")
    img = Image.open(IMAGE_PATH).convert("RGB")
    ocr_text = pytesseract.image_to_string(img, lang=TESS_LANG, config=TESS_CFG)
    if LOWERCASE_AFTER_OCR:
        ocr_text = ocr_text.lower()

    print("=== OCR raw ===")
    print(ocr_text)

    # 2) Preprocess (your original function)
    cleaned_text = preprocess_ocr_text(ocr_text)
    print("\n=== Cleaned text ===")
    print(cleaned_text)

    # 3) NER (agg='none' so your merge_subwords + grouping work)
    tokenizer = AutoTokenizer.from_pretrained(MODEL_DIR, local_files_only=True)
    model = AutoModelForTokenClassification.from_pretrained(MODEL_DIR, local_files_only=True)
    print("\nModel num_labels:", model.num_labels)
    print("Model id2label:", model.config.id2label)
    ner_pipe = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="none")

    # 4) DrugBank vocab
    if not os.path.exists(DRUGBANK_CSV):
        raise FileNotFoundError(f"DrugBank CSV not found: {DRUGBANK_CSV}")
    vocab = load_drugbank_vocab(DRUGBANK_CSV, column="name")

    # 5) Inference + postprocessing
    final = infer_and_format(cleaned_text, vocab, ner_pipe)

    print("\n=== Final JSON ===")
    print(json.dumps(final, indent=4, ensure_ascii=False))

if __name__ == "__main__":
    main()


=== OCR raw ===
bebrg fluvvoxamine maleate somg tab
step 3: take half tablet every other night
take with or after food. avoid alcohol.


=== Cleaned text ===
bebrg fluvvoxamine maleate somg tab step 3: take half tablet every other night take with or after food. avoid alcohol.

Model num_labels: 11
Model id2label: {0: 'B-DOSAGE', 1: 'I-DOSAGE', 2: 'B-FREQUENCY', 3: 'I-FREQUENCY', 4: 'B-INSTRUCTION', 5: 'I-INSTRUCTION', 6: 'B-MEDICATION_NAME', 7: 'I-MEDICATION_NAME', 8: 'B-NOTE', 9: 'I-NOTE', 10: 'O'}

=== Final JSON ===
{
    "medicationName": "bebrgmine maleate",
    "intakeQuantity": 0.5,
    "frequency": 0,
    "instructions": "with or after food",
    "notes": "avoid alcohol"
}


In [None]:
#!/usr/bin/env python3
"""
Image -> Tesseract OCR -> preprocess_ocr_text (UNMODIFIED) -> NER (agg='none')
-> merge subwords -> group IOB -> DrugBank fuzzy-correct -> structured JSON
frequency work but medication name doesnt
"""

import os
import re
import json
import difflib
import pandas as pd
from collections import defaultdict
from typing import List, Dict, Any
from PIL import Image
import pytesseract
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline

# ======= PATHS / CONFIG =======
IMAGE_PATH   = r"C:\Users\prisc\Downloads\WhatsApp Image 2025-08-07 at 12.38.50 PM.jpeg"
MODEL_DIR    = r"C:\Users\prisc\OneDrive\Desktop\Github\MediMind\ml\new_tesseract_ner\ner_model5"
DRUGBANK_CSV = r"C:\Users\prisc\OneDrive\Desktop\Github\MediMind\ml\drugbank_vocabulary.csv"

TESS_LANG = "eng"
TESS_CFG  = "--oem 3 --psm 6"   # block of text
LOWERCASE_AFTER_OCR = False      # <-- set False if you don't want lowercasing

# Try to auto-detect Tesseract on Windows if not on PATH
if os.name == "nt":
    default_tess = r"C:\Program Files\Tesseract-OCR\tesseract.exe"
    if os.path.exists(default_tess):
        pytesseract.pytesseract.tesseract_cmd = default_tess

# ======= PREPROCESSING (YOUR ORIGINAL VERSION, UNCHANGED) =======
def preprocess_ocr_text(text) -> str:
    if not isinstance(text, str):
        return ""
    # Replace known OCR noise characters
    text = text.replace("â€œ", "").replace("â€", "").replace("â€˜", "").replace("â€™", "'")

    # Remove unwanted symbols (original pattern preserved)
    text = re.sub(r"(â|Â|¢|§|«|©|®|€|“|”|‘|’|™|…|_|=||•|—|–|@|%|<|>|\\|\||~|`)", "", text)

    # Fix common formatting issues
    text = re.sub(r"(\d)(tab/s|tablet[s]?|cap[s]?|capsule[s]?)", r"\1 tablet", text, flags=re.IGNORECASE)
    text = re.sub(r"(\d)(times)", r"\1 times", text, flags=re.IGNORECASE)
    text = re.sub(r"(\d)\s*x\s*(a|per)?\s*day", r"\1 times a day", text, flags=re.IGNORECASE)

    # Normalize known expressions
    replacements = {
        "twice a day": "2 times a day",
        "three times daily": "3 times a day",
        "when necessary": "when needed",
        "when required": "when needed",
    }

    for wrong, correct in replacements.items():
        text = re.sub(rf"\b{wrong}\b", correct, text, flags=re.IGNORECASE)

    # remove irrelevant data
    lines = text.splitlines()
    cleaned = []

    for line in lines:
        # Skip if line contains clinic/address info
        if any(x in line for x in ["clinic", "centre", "hospital", "#", "blk", "building", "road", "s "]):
            continue
        # Skip prices and quantities
        if re.search(r"\bqty\b|\bprice\b|\$\d+|\d+\.\d{2}", line):
            continue
        cleaned.append(line)

    # Remove extra whitespace
    text = re.sub(r"\s+", " ", text)

    return text.strip()

# ======= POSTPROCESSING (YOURS) =======
def load_drugbank_vocab(csv_path, column="name"):
    df = pd.read_csv(csv_path)
    return df[column].dropna().str.lower().unique().tolist()

def correct_drug_name(name, drugbank_vocab):
    matches = difflib.get_close_matches(name.lower(), drugbank_vocab, n=1, cutoff=0.7)
    return matches[0] if matches else name

def word_to_number(word):
    word_map = {
        "one": 1, "1": 1, "once": 1,
        "two": 2, "2": 2, "twice": 2,
        "three": 3, "3": 3, "thrice": 3,
        "four": 4, "4": 4,
        "half": 0.5, "quarter": 0.25
    }
    return word_map.get(word.lower())

def merge_subwords(entities):
    merged = []
    for ent in entities:
        if ent["word"].startswith("##") and merged:
            merged[-1]["word"] += ent["word"][2:]
        else:
            merged.append(ent.copy())
    return merged

def group_entities_by_label(entities):
    grouped = defaultdict(list)
    current_label = None
    current_words = []

    for ent in entities:
        tag = ent["entity"]
        prefix, label = tag.split("-") if "-" in tag else ("O", tag)

        if prefix == "B":
            if current_label and current_words:
                grouped[current_label].append(" ".join(current_words))
            current_label = label
            current_words = [ent["word"]]
        elif prefix == "I" and label == current_label:
            current_words.append(ent["word"])
        else:
            if current_label and current_words:
                grouped[current_label].append(" ".join(current_words))
            current_label = None
            current_words = []

            if prefix == "B":
                current_label = label
                current_words = [ent["word"]]

    if current_label and current_words:
        grouped[current_label].append(" ".join(current_words))

    return grouped

def clean_text(text):
    return text.replace(" ##", "").replace("##", "").strip()

def infer_and_format(text, drugbank_vocab, ner_pipeline):
    raw_output = ner_pipeline(text)
    merged_output = merge_subwords(raw_output)
    grouped_output = group_entities_by_label(merged_output)

    final = {}

    # Medication Name
    meds = grouped_output.get("MEDICATION_NAME", [])
    if meds:
        med_name = clean_text(meds[0])
        corrected = correct_drug_name(med_name, drugbank_vocab)
        final["medicationName"] = corrected

    # === Dosage / intakeQuantity (rule: first numeric else first word-number; default 0)
    dosages = grouped_output.get("DOSAGE", [])
    quantity = 0
    for d in dosages:
        for word in d.split():
            num = word_to_number(word)
            if num is not None:
                quantity = num
                break
        if quantity:
            break
    final["intakeQuantity"] = quantity

    # === Frequency (rule: first numeric/word-number; else 1 if leftover words like 'daily'; else 0)
    freq_phrases = grouped_output.get("FREQUENCY", [])
    freq_nums = []
    freq_words = []

    for phrase in freq_phrases:
        for w in phrase.split():
            lw = w.lower()
            if lw in {"times", "time"}:
                continue  # skip filler words
            if w.isnumeric():
                freq_nums.append(int(w))
            else:
                val = word_to_number(w)
                if val is not None:
                    freq_nums.append(val)
                else:
                    freq_words.append(w)

    if freq_nums:
        final["frequency"] = freq_nums[0]
    elif freq_words:
        final["frequency"] = 1
    else:
        final["frequency"] = 0

    # === Instructions (rule: frequency leftover words + INSTRUCTION tokens)
    instr_tokens = freq_words + grouped_output.get("INSTRUCTION", [])
    if instr_tokens:
        final["instructions"] = clean_text(" ".join(instr_tokens))

    # Notes
    notes = grouped_output.get("NOTE", [])
    if notes:
        final["notes"] = clean_text(" ".join(notes))

    # Save to JSON
    with open("ner_v5_output1.json", "w", encoding="utf-8") as f:
        json.dump(final, f, indent=4, ensure_ascii=False)

    return final


# ======= MAIN =======
def main():
    # 1) OCR
    if not os.path.exists(IMAGE_PATH):
        raise FileNotFoundError(f"Image not found: {IMAGE_PATH}")
    img = Image.open(IMAGE_PATH).convert("RGB")
    ocr_text = pytesseract.image_to_string(img, lang=TESS_LANG, config=TESS_CFG)
    if LOWERCASE_AFTER_OCR:
        ocr_text = ocr_text.lower()

    print("=== OCR raw ===")
    print(ocr_text)

    # 2) Preprocess (your original function)
    cleaned_text = preprocess_ocr_text(ocr_text)
    print("\n=== Cleaned text ===")
    print(cleaned_text)

    # 3) NER (agg='none' so your merge_subwords + grouping work)
    tokenizer = AutoTokenizer.from_pretrained(MODEL_DIR, local_files_only=True)
    model = AutoModelForTokenClassification.from_pretrained(MODEL_DIR, local_files_only=True)
    print("\nModel num_labels:", model.num_labels)
    print("Model id2label:", model.config.id2label)
    ner_pipe = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="none")

    # 4) DrugBank vocab
    if not os.path.exists(DRUGBANK_CSV):
        raise FileNotFoundError(f"DrugBank CSV not found: {DRUGBANK_CSV}")
    vocab = load_drugbank_vocab(DRUGBANK_CSV, column="name")

    # 5) Inference + postprocessing
    final = infer_and_format(cleaned_text, vocab, ner_pipe)

    print("\n=== Final JSON ===")
    print(json.dumps(final, indent=4, ensure_ascii=False))

if __name__ == "__main__":
    main()


=== OCR raw ===
¢
rad
a
&
Ms
Wy ¥ peer an oat
Me hak e
— - hes
z KEEP AWAY FROM CHILDREN Total: 157
SRA 2C-1-112 45 TAB (1/1) ci aa
- = FluVOXAMine MALEATE 50MG TAB
| STEP 3: TAKE HALF TABLET EVERY OTHER NIGHT i
f TAKE WITH OR AFTER FOOD. AVOID ALCOHOL. :
: 4 :
DIONIS WEE YUN RU Ap SKH Toi102023
XXXXx982H ~ We
MDS/ §OC1-SKH-23-1505924001 OO003WEK 3 NES
SENGKANG GENERAL HOSPITAL, Outpatient Pharmacy er
410 Sengkang East Way, Singapore 544886 TEL: 6930 2262


=== Cleaned text ===
rad a & Ms Wy ¥ peer an oat Me hak e - hes z KEEP AWAY FROM CHILDREN Total: 157 SRA 2C-1-112 45 TAB (1/1) ci aa - FluVOXAMine MALEATE 50MG TAB STEP 3: TAKE HALF TABLET EVERY OTHER NIGHT i f TAKE WITH OR AFTER FOOD. AVOID ALCOHOL. : : 4 : DIONIS WEE YUN RU Ap SKH Toi102023 XXXXx982H We MDS/ OC1-SKH-23-1505924001 OO003WEK 3 NES SENGKANG GENERAL HOSPITAL, Outpatient Pharmacy er 410 Sengkang East Way, Singapore 544886 TEL: 6930 2262

Model num_labels: 11
Model id2label: {0: 'B-DOSAGE', 1: 'I-DOSAGE', 2: 'B-FREQUENCY

In [41]:
#!/usr/bin/env python3
"""
Image -> Tesseract OCR -> preprocess_ocr_text (UNMODIFIED) -> NER (agg='none')
-> merge subwords -> group IOB -> DrugBank fuzzy-correct -> structured JSON
is in the current OCR pipeline v5 in final model. added preprocessing of Strict filter. added drug name extraction, fuzzy etc
"""

import os
import re
import json
import difflib
import pandas as pd
from collections import defaultdict
from typing import List, Dict, Any
from PIL import Image
import pytesseract
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline

# ======= PATHS / CONFIG =======
IMAGE_PATH   = r"C:\Users\prisc\Downloads\WhatsApp Image 2025-08-13 at 1.09.17 AM.jpeg"
MODEL_DIR    = r"C:\Users\prisc\OneDrive\Desktop\Github\MediMind\ml\new_tesseract_ner\ner_model5"
DRUGBANK_CSV = r"C:\Users\prisc\OneDrive\Desktop\Final_ML_model\drugbank_vocabulary.csv"

TESS_LANG = "eng"
TESS_CFG  = "--oem 3 --psm 6"   # block of text
LOWERCASE_AFTER_OCR = False      # <-- set False if you don't want lowercasing

# Try to auto-detect Tesseract on Windows if not on PATH
if os.name == "nt":
    default_tess = r"C:\Program Files\Tesseract-OCR\tesseract.exe"
    if os.path.exists(default_tess):
        pytesseract.pytesseract.tesseract_cmd = default_tess

# ======= PREPROCESSING (YOUR ORIGINAL VERSION, UNCHANGED) =======
def preprocess_ocr_text(text) -> str:
    if not isinstance(text, str):
        return ""
    # Replace known OCR noise characters
    text = text.replace("â€œ", "").replace("â€", "").replace("â€˜", "").replace("â€™", "'")

    # Remove unwanted symbols (original pattern preserved)
    text = re.sub(r"(â|Â|¢|§|«|©|®|€|“|”|‘|’|™|…|_|=||•|—|–|@|%|<|>|\\|\||~|`)", "", text)

    # Fix common formatting issues
    text = re.sub(r"(\d)(tab/s|tablet[s]?|cap[s]?|capsule[s]?)", r"\1 tablet", text, flags=re.IGNORECASE)
    text = re.sub(r"(\d)(times)", r"\1 times", text, flags=re.IGNORECASE)
    text = re.sub(r"(\d)\s*x\s*(a|per)?\s*day", r"\1 times a day", text, flags=re.IGNORECASE)

    # Normalize known expressions
    replacements = {
        "when necessary": "when needed",
        "when required": "when needed",
    }
    for wrong, correct in replacements.items():
        text = re.sub(rf"\b{wrong}\b", correct, text, flags=re.IGNORECASE)

    # remove irrelevant data
    lines = text.splitlines()
    cleaned = []
    for line in lines:
        # Skip if line contains clinic/address info
        if any(x in line for x in ["clinic", "centre", "hospital", "#", "blk", "building", "road", "s "]):
            continue
        # Skip prices and quantities
        if re.search(r"\bqty\b|\bprice\b|\$\d+|\d+\.\d{2}", line):
            continue
        cleaned.append(line)

    # Strict filter: keep only letters, numbers, and spaces
    text = re.sub(r"[^A-Za-z0-9\s]", " ", text)

    # Remove extra whitespace
    text = re.sub(r"\s+", " ", text)

    return text.strip()

# ======= POSTPROCESSING (YOURS) =======
def load_drugbank_vocab(csv_path, column="name"):
    df = pd.read_csv(csv_path)
    return df[column].dropna().str.lower().unique().tolist()

def correct_drug_name(name, drugbank_vocab):
    matches = difflib.get_close_matches(name.lower(), drugbank_vocab, n=1, cutoff=0.7)
    return matches[0] if matches else name

def is_all_upper_words(text: str) -> bool:
    """True if there is at least one alphabetic token and all such tokens are UPPERCASE."""
    tokens = re.findall(r"[A-Za-z]+", text)
    if not tokens:
        return False
    return all(t.isupper() for t in tokens)

def correct_drug_name_caseaware(name: str, drugbank_vocab, cutoff: float = 0.8):
    """
    Return (best_name, matched_bool). Prefer exact case-insensitive match to DrugBank,
    else fuzzy match at >= cutoff, else return original name.
    """
    # Build a map for exact case-insensitive lookup
    lower_to_cased = {v.lower(): v for v in drugbank_vocab}

    # Exact (case-insensitive)
    lower = name.lower()
    if lower in lower_to_cased:
        return lower_to_cased[lower], True

    # Fuzzy (>= cutoff)
    cand = difflib.get_close_matches(lower, list(lower_to_cased.keys()), n=1, cutoff=cutoff)
    if cand:
        return lower_to_cased[cand[0]], True

    # No good match: keep original
    return name, False

def word_to_number(word):
    word_map = {
        "one": 1, "1": 1, "once": 1,
        "two": 2, "2": 2, "twice": 2,
        "three": 3, "3": 3, "thrice": 3,
        "four": 4, "4": 4,
        "half": 0.5, "quarter": 0.25
    }
    return word_map.get(word.lower())

def merge_subwords(entities):
    merged = []
    for ent in entities:
        if ent["word"].startswith("##") and merged:
            merged[-1]["word"] += ent["word"][2:]
        else:
            merged.append(ent.copy())
    return merged

def group_entities_by_label(entities):
    grouped = defaultdict(list)
    current_label = None
    current_words = []

    for ent in entities:
        tag = ent["entity"]
        prefix, label = tag.split("-") if "-" in tag else ("O", tag)

        if prefix == "B":
            if current_label and current_words:
                grouped[current_label].append(" ".join(current_words))
            current_label = label
            current_words = [ent["word"]]
        elif prefix == "I" and label == current_label:
            current_words.append(ent["word"])
        else:
            if current_label and current_words:
                grouped[current_label].append(" ".join(current_words))
            current_label = None
            current_words = []

            if prefix == "B":
                current_label = label
                current_words = [ent["word"]]

    if current_label and current_words:
        grouped[current_label].append(" ".join(current_words))

    return grouped

def clean_text(text):
    return text.replace(" ##", "").replace("##", "").strip()

def find_medication_in_text(text: str, drugbank_vocab, cutoff: float = 0.8):
    """
    Try to find a medication name directly from text using DrugBank vocab.
    1) Exact (case-insensitive) substring match (prefer the longest).
    2) If none, fuzzy match on 1-3 word windows with >= cutoff.
    Return best_name or None.
    """
    txt = text.lower()

    # 1) Exact substring matches (prefer the longest)
    best_exact = None
    for name in drugbank_vocab:  # already lowercased list from load_drugbank_vocab
        if name in txt:
            if best_exact is None or len(name) > len(best_exact):
                best_exact = name
    if best_exact:
        return best_exact

    # 2) Fuzzy on 1-3 word windows
    tokens = re.findall(r"[a-z][a-z\-]+", txt)
    seen = set()
    for i in range(len(tokens)):
        for n in (3, 2, 1):  # try longer windows first
            if i + n <= len(tokens):
                cand = " ".join(tokens[i:i+n])
                if cand in seen:
                    continue
                seen.add(cand)
                match = difflib.get_close_matches(cand, drugbank_vocab, n=1, cutoff=cutoff)
                if match:
                    return match[0]
    return None

def canonical_drugbank_spelling(name_lc: str, drugbank_vocab):
    """Map a lowercased hit back to DrugBank’s cased spelling if available."""
    # If your CSV has canonical casing in a separate column, adapt this.
    # Here we just title-case as a simple presentation fix.
    return name_lc  # keep lowercased; or do: name_lc.title()


def infer_and_format(text, drugbank_vocab, ner_pipeline):
    raw_output = ner_pipeline(text)
    merged_output = merge_subwords(raw_output)
    grouped_output = group_entities_by_label(merged_output)

    final = {}

    # --- Medication Name ---
    meds = grouped_output.get("MEDICATION_NAME", [])
    if meds:
        # Model provided a name -> use your existing correction behavior
        med_name = clean_text(meds[0])
        corrected, matched = correct_drug_name_caseaware(med_name, drugbank_vocab, cutoff=0.8)
        final["medicationName"] = corrected if matched else med_name
    else:
        # Fallback: detect directly from the cleaned text (the same string you passed in)
        fallback = find_medication_in_text(text, drugbank_vocab, cutoff=0.8)
        if fallback:
            final["medicationName"] = canonical_drugbank_spelling(fallback, drugbank_vocab)


    # === Dosage / intakeQuantity (rule: first numeric else first word-number; default 0)
    dosages = grouped_output.get("DOSAGE", [])
    quantity = 0
    for d in dosages:
        for word in d.split():
            num = word_to_number(word)
            if num is not None:
                quantity = num
                break
        if quantity:
            break
    final["intakeQuantity"] = quantity

    # === Frequency (rule: first numeric/word-number; else 1 if leftover words like 'daily'; else 0)
    freq_phrases = grouped_output.get("FREQUENCY", [])
    freq_nums = []
    freq_words = []

    for phrase in freq_phrases:
        for w in phrase.split():
            lw = w.lower()
            if lw in {"times", "time"}:
                continue  # skip filler words
            if w.isnumeric():
                freq_nums.append(int(w))
            else:
                val = word_to_number(w)
                if val is not None:
                    freq_nums.append(val)
                else:
                    freq_words.append(w)

    if freq_nums:
        final["frequency"] = freq_nums[0]
    elif freq_words:
        final["frequency"] = 1
    else:
        final["frequency"] = 0

    # === Instructions (rule: frequency leftover words + INSTRUCTION tokens)
    instr_tokens = freq_words + grouped_output.get("INSTRUCTION", [])
    if instr_tokens:
        final["instructions"] = clean_text(" ".join(instr_tokens))

    # Notes
    notes = grouped_output.get("NOTE", [])
    if notes:
        final["notes"] = clean_text(" ".join(notes))

    # Save to JSON
    with open("ner_v5_output1.json", "w", encoding="utf-8") as f:
        json.dump(final, f, indent=4, ensure_ascii=False)

    return final


# ======= MAIN =======
def main():
    # 1) OCR
    if not os.path.exists(IMAGE_PATH):
        raise FileNotFoundError(f"Image not found: {IMAGE_PATH}")
    img = Image.open(IMAGE_PATH).convert("RGB")
    ocr_text = pytesseract.image_to_string(img, lang=TESS_LANG, config=TESS_CFG)

    # New rule: only lowercase if ALL words are uppercase
    if is_all_upper_words(ocr_text):
        ocr_text = ocr_text.lower()


    print("=== OCR raw ===")
    print(ocr_text)

    # 2) Preprocess (your original function)
    cleaned_text = preprocess_ocr_text(ocr_text)
    print("\n=== Cleaned text ===")
    print(cleaned_text)

    # 3) NER (agg='none' so your merge_subwords + grouping work)
    tokenizer = AutoTokenizer.from_pretrained(MODEL_DIR, local_files_only=True)
    model = AutoModelForTokenClassification.from_pretrained(MODEL_DIR, local_files_only=True)
    print("\nModel num_labels:", model.num_labels)
    print("Model id2label:", model.config.id2label)
    ner_pipe = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="none")

    # 4) DrugBank vocab
    if not os.path.exists(DRUGBANK_CSV):
        raise FileNotFoundError(f"DrugBank CSV not found: {DRUGBANK_CSV}")
    vocab = load_drugbank_vocab(DRUGBANK_CSV, column="name")

    # 5) Inference + postprocessing
    # LOWERCASE ONLY FOR THE NER MODEL INPUT
    text_for_ner = cleaned_text.lower()
    final = infer_and_format(text_for_ner, vocab, ner_pipe)


    print("\n=== Final JSON ===")
    print(json.dumps(final, indent=4, ensure_ascii=False))

if __name__ == "__main__":
    main()


=== OCR raw ===
20/20 TA mes 5 > .
loRATAdine 10MG TAB 4 | i}
TAKE 1 TABLET(S) EVERY MORNING WHEN NEEDED ay
alcoholic drinks c } |
Poy )¥

XxxXx548 28/07/2025 KAL-390030
LEWIS HUANG KAIZHEN Lia

x c LINIC P J Yi
 ERREE , <AuE porcine marae B07

a : y 4

4
;

") \
Lage Zs a | |e


=== Cleaned text ===

Model num_labels: 11
Model id2label: {0: 'B-DOSAGE', 1: 'I-DOSAGE', 2: 'B-FREQUENCY', 3: 'I-FREQUENCY', 4: 'B-INSTRUCTION', 5: 'I-INSTRUCTION', 6: 'B-MEDICATION_NAME', 7: 'I-MEDICATION_NAME', 8: 'B-NOTE', 9: 'I-NOTE', 10: 'O'}

=== Final JSON ===
{
    "medicationName": "loratadine",
    "intakeQuantity": 1,
    "frequency": 1,
    "instructions": "every morning when needed with or without food"
}


In [None]:
#code inside final_ner_model ocr_ner_pipeline which is the code cleaned from the top codes. This is the submitted one
import os
import re
import json
import difflib
import pandas as pd
from collections import defaultdict
from typing import List, Dict, Any
from PIL import Image
import pytesseract
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline

IMAGE_PATH   = r"C:\Users\prisc\Downloads\WhatsApp Image 2025-08-07 at 12.38.50 PM.jpeg"
MODEL_DIR    = r"C:\Users\prisc\OneDrive\Desktop\Github\MediMind\ml\new_tesseract_ner\ner_model5"
DRUGBANK_CSV = r"C:\Users\prisc\OneDrive\Desktop\Final_ML_model\drugbank_vocabulary.csv"

TESS_LANG = "eng"
TESS_CFG  = "--oem 3 --psm 6"   
LOWERCASE_AFTER_OCR = False   

# Try to auto-detect Tesseract on Windows if not on PATH
if os.name == "nt":
    default_tess = r"C:\Program Files\Tesseract-OCR\tesseract.exe"
    if os.path.exists(default_tess):
        pytesseract.pytesseract.tesseract_cmd = default_tess

# preprocessing
def preprocess_ocr_text(text) -> str:
    if not isinstance(text, str):
        return ""
    # Replace known OCR noise characters
    text = text.replace("â€œ", "").replace("â€", "").replace("â€˜", "").replace("â€™", "'")

    # Remove unwanted symbols (original pattern preserved)
    text = re.sub(r"(â|Â|¢|§|«|©|®|€|“|”|‘|’|™|…|_|=||•|—|–|@|%|<|>|\\|\||~|`)", "", text)

    # Fix common formatting issues
    text = re.sub(r"(\d)(tab/s|tablet[s]?|cap[s]?|capsule[s]?)", r"\1 tablet", text, flags=re.IGNORECASE)
    text = re.sub(r"(\d)(times)", r"\1 times", text, flags=re.IGNORECASE)
    text = re.sub(r"(\d)\s*x\s*(a|per)?\s*day", r"\1 times a day", text, flags=re.IGNORECASE)

    # Normalize known expressions
    replacements = {
        "when necessary": "when needed",
        "when required": "when needed",
    }
    for wrong, correct in replacements.items():
        text = re.sub(rf"\b{wrong}\b", correct, text, flags=re.IGNORECASE)

    # remove irrelevant data
    lines = text.splitlines()
    cleaned = []
    for line in lines:
        # Skip if line contains clinic/address info
        if any(x in line for x in ["clinic", "centre", "hospital", "#", "blk", "building", "road", "s "]):
            continue
        # Skip prices and quantities
        if re.search(r"\bqty\b|\bprice\b|\$\d+|\d+\.\d{2}", line):
            continue
        cleaned.append(line)

    # keep only letters, numbers, and spaces
    text = re.sub(r"[^A-Za-z0-9\s]", " ", text)

    # Remove extra whitespace
    text = re.sub(r"\s+", " ", text)

    return text.strip()

# postprocessing
def load_drugbank_vocab(csv_path, column="name"):
    df = pd.read_csv(csv_path)
    return df[column].dropna().str.lower().unique().tolist()

def is_all_upper_words(text: str) -> bool:
    """True if there is at least one alphabetic token and all such tokens are UPPERCASE."""
    tokens = re.findall(r"[A-Za-z]+", text)
    if not tokens:
        return False
    return all(t.isupper() for t in tokens)

# case-insensitive. match to DrugBank else match >=cutoff, else return original name
def correct_drug_name_caseaware(name: str, drugbank_vocab, cutoff: float = 0.8):
    lower_to_cased = {v.lower(): v for v in drugbank_vocab}

    lower = name.lower()
    if lower in lower_to_cased:
        return lower_to_cased[lower], True

    cand = difflib.get_close_matches(lower, list(lower_to_cased.keys()), n=1, cutoff=cutoff)
    if cand:
        return lower_to_cased[cand[0]], True

    return name, False

def word_to_number(word):
    word_map = {
        "one": 1, "1": 1, "once": 1,
        "two": 2, "2": 2, "twice": 2,
        "three": 3, "3": 3, "thrice": 3,
        "four": 4, "4": 4,
        "half": 0.5, "quarter": 0.25
    }
    return word_map.get(word.lower())

def merge_subwords(entities):
    merged = []
    for ent in entities:
        if ent["word"].startswith("##") and merged:
            merged[-1]["word"] += ent["word"][2:]
        else:
            merged.append(ent.copy())
    return merged

def group_entities_by_label(entities):
    grouped = defaultdict(list)
    current_label = None
    current_words = []

    for ent in entities:
        tag = ent["entity"]
        prefix, label = tag.split("-") if "-" in tag else ("O", tag)

        if prefix == "B":
            if current_label and current_words:
                grouped[current_label].append(" ".join(current_words))
            current_label = label
            current_words = [ent["word"]]
        elif prefix == "I" and label == current_label:
            current_words.append(ent["word"])
        else:
            if current_label and current_words:
                grouped[current_label].append(" ".join(current_words))
            current_label = None
            current_words = []

            if prefix == "B":
                current_label = label
                current_words = [ent["word"]]

    if current_label and current_words:
        grouped[current_label].append(" ".join(current_words))

    return grouped

def clean_text(text):
    return text.replace(" ##", "").replace("##", "").strip()

# To find medication name directly from text using DrugBank vocab.
def find_medication_in_text(text: str, drugbank_vocab, cutoff: float = 0.8):
   
    txt = text.lower()

    # 1) Exact substring matches (prefer the longest)
    best_exact = None
    for name in drugbank_vocab: 
        if name in txt:
            if best_exact is None or len(name) > len(best_exact):
                best_exact = name
    if best_exact:
        return best_exact

    # 2) Fuzzy on 1-3 word windows
    tokens = re.findall(r"[a-z][a-z\-]+", txt)
    seen = set()
    for i in range(len(tokens)):
        for n in (3, 2, 1):  # try longer windows first
            if i + n <= len(tokens):
                cand = " ".join(tokens[i:i+n])
                if cand in seen:
                    continue
                seen.add(cand)
                match = difflib.get_close_matches(cand, drugbank_vocab, n=1, cutoff=cutoff)
                if match:
                    return match[0]
    return None


def infer_and_format(text, drugbank_vocab, ner_pipeline):
    raw_output = ner_pipeline(text)
    merged_output = merge_subwords(raw_output)
    grouped_output = group_entities_by_label(merged_output)

    final = {}

    meds = grouped_output.get("MEDICATION_NAME", [])
    if meds:
        
        med_name = clean_text(meds[0])
        corrected, matched = correct_drug_name_caseaware(med_name, drugbank_vocab, cutoff=0.8)
        final["medicationName"] = corrected if matched else med_name
    else:
        fallback = find_medication_in_text(text, drugbank_vocab, cutoff=0.8)
        if fallback:
            final["medicationName"] = fallback


    # Dosage/intakeQuantity(first numeric else first word-number; default 0)
    dosages = grouped_output.get("DOSAGE", [])
    quantity = 0
    for d in dosages:
        for word in d.split():
            num = word_to_number(word)
            if num is not None:
                quantity = num
                break
        if quantity:
            break
    final["intakeQuantity"] = quantity

    # Frequency(first numeric/word-number; else 1 if leftover words like 'daily'; else 0)
    freq_phrases = grouped_output.get("FREQUENCY", [])
    freq_nums = []
    freq_words = []

    for phrase in freq_phrases:
        for w in phrase.split():
            lw = w.lower()
            if lw in {"times", "time"}:
                continue 
            if w.isnumeric():
                freq_nums.append(int(w))
            else:
                val = word_to_number(w)
                if val is not None:
                    freq_nums.append(val)
                else:
                    freq_words.append(w)

    if freq_nums:
        final["frequency"] = freq_nums[0]
    elif freq_words:
        final["frequency"] = 1
    else:
        final["frequency"] = 0

    # Instructions (frequency leftover words + INSTRUCTION tokens)
    instr_tokens = freq_words + grouped_output.get("INSTRUCTION", [])
    if instr_tokens:
        final["instructions"] = clean_text(" ".join(instr_tokens))

    # Notes
    notes = grouped_output.get("NOTE", [])
    if notes:
        final["notes"] = clean_text(" ".join(notes))

    # Save to JSON
    with open("ner_v5_output1.json", "w", encoding="utf-8") as f:
        json.dump(final, f, indent=4, ensure_ascii=False)

    return final


def main():
    # OCR
    if not os.path.exists(IMAGE_PATH):
        raise FileNotFoundError(f"Image not found: {IMAGE_PATH}")
    img = Image.open(IMAGE_PATH).convert("RGB")
    ocr_text = pytesseract.image_to_string(img, lang=TESS_LANG, config=TESS_CFG)

    # only lowercase if ALL words are uppercase
    if is_all_upper_words(ocr_text):
        ocr_text = ocr_text.lower()


    print("=== OCR raw ===")
    print(ocr_text)

    # Preprocess 
    cleaned_text = preprocess_ocr_text(ocr_text)
    print("\n=== Cleaned text ===")
    print(cleaned_text)

    # NER 
    tokenizer = AutoTokenizer.from_pretrained(MODEL_DIR, local_files_only=True)
    model = AutoModelForTokenClassification.from_pretrained(MODEL_DIR, local_files_only=True)
    print("\nModel num_labels:", model.num_labels)
    print("Model id2label:", model.config.id2label)
    ner_pipe = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="none")

    # DrugBank vocab
    if not os.path.exists(DRUGBANK_CSV):
        raise FileNotFoundError(f"DrugBank CSV not found: {DRUGBANK_CSV}")
    vocab = load_drugbank_vocab(DRUGBANK_CSV, column="name")

    # Inference + postprocessing
    # LOWERCASE ONLY FOR THE NER MODEL INPUT
    text_for_ner = cleaned_text.lower()
    final = infer_and_format(text_for_ner, vocab, ner_pipe)


    print("\n=== Final JSON ===")
    print(json.dumps(final, indent=4, ensure_ascii=False))

if __name__ == "__main__":
    main()


W0813 01:52:55.057000 31140 torch\distributed\elastic\multiprocessing\redirects.py:29] NOTE: Redirects are currently not supported in Windows or MacOs.
  torch.utils._pytree._register_pytree_node(
  torch.utils._pytree._register_pytree_node(


=== OCR raw ===
¢
rad
a
&
Ms
Wy ¥ peer an oat
Me hak e
— - hes
z KEEP AWAY FROM CHILDREN Total: 157
SRA 2C-1-112 45 TAB (1/1) ci aa
- = FluVOXAMine MALEATE 50MG TAB
| STEP 3: TAKE HALF TABLET EVERY OTHER NIGHT i
f TAKE WITH OR AFTER FOOD. AVOID ALCOHOL. :
: 4 :
DIONIS WEE YUN RU Ap SKH Toi102023
XXXXx982H ~ We
MDS/ §OC1-SKH-23-1505924001 OO003WEK 3 NES
SENGKANG GENERAL HOSPITAL, Outpatient Pharmacy er
410 Sengkang East Way, Singapore 544886 TEL: 6930 2262


=== Cleaned text ===
rad a Ms Wy peer an oat Me hak e hes z KEEP AWAY FROM CHILDREN Total 157 SRA 2C 1 112 45 TAB 1 1 ci aa FluVOXAMine MALEATE 50MG TAB STEP 3 TAKE HALF TABLET EVERY OTHER NIGHT i f TAKE WITH OR AFTER FOOD AVOID ALCOHOL 4 DIONIS WEE YUN RU Ap SKH Toi102023 XXXXx982H We MDS OC1 SKH 23 1505924001 OO003WEK 3 NES SENGKANG GENERAL HOSPITAL Outpatient Pharmacy er 410 Sengkang East Way Singapore 544886 TEL 6930 2262

Model num_labels: 11
Model id2label: {0: 'B-DOSAGE', 1: 'I-DOSAGE', 2: 'B-FREQUENCY', 3: 'I-FREQUENCY', 4: 

In [None]:
#!/usr/bin/env python3
"""
Image -> Tesseract OCR -> preprocess_ocr_text (UNMODIFIED) -> NER (agg='none')
-> merge subwords -> group IOB -> DrugBank fuzzy-correct -> structured JSON
frequency work but medication name doesnt

Added more medication name reading function but end up is slower so not using
"""

import os
import re
import json
import difflib
import pandas as pd
from collections import defaultdict
from typing import List, Dict, Any
from PIL import Image
import pytesseract
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline

# ======= PATHS / CONFIG =======
IMAGE_PATH   = r"C:\Users\prisc\Downloads\WhatsApp Image 2025-08-12 at 5.52.20 PM.jpeg"
MODEL_DIR    = r"C:\Users\prisc\OneDrive\Desktop\Github\MediMind\ml\new_tesseract_ner\ner_model5"
DRUGBANK_CSV = r"C:\Users\prisc\OneDrive\Desktop\Github\MediMind\ml\drugbank_vocabulary.csv"

TESS_LANG = "eng"
TESS_CFG  = "--oem 3 --psm 6"   # block of text
LOWERCASE_AFTER_OCR = False      # <-- set False if you don't want lowercasing

# Try to auto-detect Tesseract on Windows if not on PATH
if os.name == "nt":
    default_tess = r"C:\Program Files\Tesseract-OCR\tesseract.exe"
    if os.path.exists(default_tess):
        pytesseract.pytesseract.tesseract_cmd = default_tess

# ======= PREPROCESSING (YOUR ORIGINAL VERSION, UNCHANGED) =======
def preprocess_ocr_text(text) -> str:
    if not isinstance(text, str):
        return ""
    # Replace known OCR noise characters
    text = text.replace("â€œ", "").replace("â€", "").replace("â€˜", "").replace("â€™", "'")

    # Remove unwanted symbols (original pattern preserved)
    text = re.sub(r"(â|Â|¢|§|«|©|®|€|“|”|‘|’|™|…|_|=||•|—|–|@|%|<|>|\\|\||~|`)", "", text)

    # Fix common formatting issues
    text = re.sub(r"(\d)(tab/s|tablet[s]?|cap[s]?|capsule[s]?)", r"\1 tablet", text, flags=re.IGNORECASE)
    text = re.sub(r"(\d)(times)", r"\1 times", text, flags=re.IGNORECASE)
    text = re.sub(r"(\d)\s*x\s*(a|per)?\s*day", r"\1 times a day", text, flags=re.IGNORECASE)

    # Normalize known expressions
    replacements = {
        "twice a day": "2 times a day",
        "three times daily": "3 times a day",
        "when necessary": "when needed",
        "when required": "when needed",
    }
    for wrong, correct in replacements.items():
        text = re.sub(rf"\b{wrong}\b", correct, text, flags=re.IGNORECASE)

    # remove irrelevant data
    lines = text.splitlines()
    cleaned = []
    for line in lines:
        # Skip if line contains clinic/address info
        if any(x in line for x in ["clinic", "centre", "hospital", "#", "blk", "building", "road", "s "]):
            continue
        # Skip prices and quantities
        if re.search(r"\bqty\b|\bprice\b|\$\d+|\d+\.\d{2}", line):
            continue
        cleaned.append(line)

    # Strict filter: keep only letters, numbers, and spaces
    text = re.sub(r"[^A-Za-z0-9\s]", " ", text)

    # Remove extra whitespace
    text = re.sub(r"\s+", " ", text)

    return text.strip()

# ======= POSTPROCESSING (YOURS) =======
def load_drugbank_vocab(csv_path, column="name"):
    df = pd.read_csv(csv_path)
    return df[column].dropna().str.lower().unique().tolist()

def correct_drug_name(name, drugbank_vocab):
    matches = difflib.get_close_matches(name.lower(), drugbank_vocab, n=1, cutoff=0.7)
    return matches[0] if matches else name

def is_all_upper_words(text: str) -> bool:
    """True if there is at least one alphabetic token and all such tokens are UPPERCASE."""
    tokens = re.findall(r"[A-Za-z]+", text)
    if not tokens:
        return False
    return all(t.isupper() for t in tokens)

def correct_drug_name_caseaware(name: str, drugbank_vocab, cutoff: float = 0.8):
    """
    Return (best_name, matched_bool). Prefer exact case-insensitive match to DrugBank,
    else fuzzy match at >= cutoff, else return original name.
    """
    # Build a map for exact case-insensitive lookup
    lower_to_cased = {v.lower(): v for v in drugbank_vocab}

    # Exact (case-insensitive)
    lower = name.lower()
    if lower in lower_to_cased:
        return lower_to_cased[lower], True

    # Fuzzy (>= cutoff)
    cand = difflib.get_close_matches(lower, list(lower_to_cased.keys()), n=1, cutoff=cutoff)
    if cand:
        return lower_to_cased[cand[0]], True

    # No good match: keep original
    return name, False

def word_to_number(word):
    word_map = {
        "one": 1, "1": 1, "once": 1,
        "two": 2, "2": 2, "twice": 2,
        "three": 3, "3": 3, "thrice": 3,
        "four": 4, "4": 4,
        "half": 0.5, "quarter": 0.25
    }
    return word_map.get(word.lower())

def merge_subwords(entities):
    merged = []
    for ent in entities:
        if ent["word"].startswith("##") and merged:
            merged[-1]["word"] += ent["word"][2:]
        else:
            merged.append(ent.copy())
    return merged

def group_entities_by_label(entities):
    grouped = defaultdict(list)
    current_label = None
    current_words = []

    for ent in entities:
        tag = ent["entity"]
        prefix, label = tag.split("-") if "-" in tag else ("O", tag)

        if prefix == "B":
            if current_label and current_words:
                grouped[current_label].append(" ".join(current_words))
            current_label = label
            current_words = [ent["word"]]
        elif prefix == "I" and label == current_label:
            current_words.append(ent["word"])
        else:
            if current_label and current_words:
                grouped[current_label].append(" ".join(current_words))
            current_label = None
            current_words = []

            if prefix == "B":
                current_label = label
                current_words = [ent["word"]]

    if current_label and current_words:
        grouped[current_label].append(" ".join(current_words))

    return grouped

def clean_text(text):
    return text.replace(" ##", "").replace("##", "").strip()

# Precompile once at top of file (optional but nice)
_WORD_RE = re.compile(r"[a-z]+(?:-[a-z]+)*")

def find_medication_in_text(text: str, drugbank_vocab, cutoff: float = 0.8):
    """
    Find a medication name from free text using your DrugBank vocab (lowercased list).
    Strategy:
      1) Exact whole-phrase (case-insensitive) match using word boundaries; prefer the longest match.
      2) Fuzzy match (>= cutoff) over 1–3 word n-grams; try longer windows first.
    Returns: lowercased DrugBank name or None.
    """
    txt = text.lower()

    # 1) Exact whole-phrase match (prefer the longest)
    best_exact = None
    for name in drugbank_vocab:  # drugbank_vocab is already lowercased per your loader
        # word-boundary match to avoid partial hits inside other words
        if re.search(r"\b" + re.escape(name) + r"\b", txt):
            if best_exact is None or len(name) > len(best_exact):
                best_exact = name
    if best_exact:
        return best_exact

    # 2) Fuzzy on 1–3 word windows (longer first)
    tokens = _WORD_RE.findall(txt)
    for n in (3, 2, 1):
        # sliding window
        for i in range(0, max(0, len(tokens) - n + 1)):
            cand = " ".join(tokens[i:i + n])
            match = difflib.get_close_matches(cand, drugbank_vocab, n=1, cutoff=cutoff)
            if match:
                return match[0]

    return None

def canonical_drugbank_spelling(name_lc: str, drugbank_vocab, cased_map: dict | None = None) -> str:
    """
    Map a lowercased DrugBank hit back to canonical casing if you have it.
    - If you pass a dict like {"melatonin": "Melatonin"}, we’ll use that.
    - Otherwise we keep the lowercased form.
    """
    if cased_map and name_lc in cased_map:
        return cased_map[name_lc]
    return name_lc

def choose_medication_name(med_candidate: str, text: str, drugbank_vocab, cutoff: float = 0.8):
    """
    Ensure medicationName comes from DrugBank:
      1) Try model span → exact/fuzzy via correct_drug_name_caseaware.
      2) If not matched, scan full text (exact then fuzzy).
      3) If still nothing, keep model span only if it looks like a real drug word (>=3 letters).
    Returns a string (may be "").
    """
    med_candidate = clean_text((med_candidate or "")).strip()

    if med_candidate:
        corrected, matched = correct_drug_name_caseaware(med_candidate, drugbank_vocab, cutoff=cutoff)
        if matched:
            return corrected  # standardized to DrugBank

    # Fallback: pull from the whole text
    fb = find_medication_in_text(text, drugbank_vocab, cutoff=cutoff)
    if fb:
        return canonical_drugbank_spelling(fb, drugbank_vocab)

    # Last resort: accept only if it looks plausible (>=3 letters)
    return med_candidate if looks_like_valid_drug(med_candidate) else ""


def infer_and_format(text, drugbank_vocab, ner_pipeline):
    raw_output = ner_pipeline(text)
    merged_output = merge_subwords(raw_output)
    grouped_output = group_entities_by_label(merged_output)

    final = {}

    # --- Medication Name ---
    # Medication Name (exact or >=80% fuzzy to DrugBank; keep original if no good match)
    # Medication Name — must be from DrugBank, otherwise guess
    meds = grouped_output.get("MEDICATION_NAME", [])
    med_candidate = meds[0] if meds else ""
    final_name = choose_medication_name(med_candidate, text, drugbank_vocab, cutoff=0.8)
    if final_name:
        final["medicationName"] = final_name
    else:
        # Fallback: detect directly from the cleaned text (the same string you passed in)
        fallback = find_medication_in_text(text, drugbank_vocab, cutoff=0.8)
        if fallback:
            final["medicationName"] = canonical_drugbank_spelling(fallback, drugbank_vocab)


    # === Dosage / intakeQuantity (rule: first numeric else first word-number; default 0)
    dosages = grouped_output.get("DOSAGE", [])
    quantity = 0
    for d in dosages:
        for word in d.split():
            num = word_to_number(word)
            if num is not None:
                quantity = num
                break
        if quantity:
            break
    final["intakeQuantity"] = quantity

    # === Frequency (rule: first numeric/word-number; else 1 if leftover words like 'daily'; else 0)
    freq_phrases = grouped_output.get("FREQUENCY", [])
    freq_nums = []
    freq_words = []

    for phrase in freq_phrases:
        for w in phrase.split():
            lw = w.lower()
            if lw in {"times", "time"}:
                continue  # skip filler words
            if w.isnumeric():
                freq_nums.append(int(w))
            else:
                val = word_to_number(w)
                if val is not None:
                    freq_nums.append(val)
                else:
                    freq_words.append(w)

    if freq_nums:
        final["frequency"] = freq_nums[0]
    elif freq_words:
        final["frequency"] = 1
    else:
        final["frequency"] = 0

    # === Instructions (rule: frequency leftover words + INSTRUCTION tokens)
    instr_tokens = freq_words + grouped_output.get("INSTRUCTION", [])
    if instr_tokens:
        final["instructions"] = clean_text(" ".join(instr_tokens))

    # Notes
    notes = grouped_output.get("NOTE", [])
    if notes:
        final["notes"] = clean_text(" ".join(notes))

    # Save to JSON
    with open("ner_v5_output1.json", "w", encoding="utf-8") as f:
        json.dump(final, f, indent=4, ensure_ascii=False)

    return final


# ======= MAIN =======
def main():
    # 1) OCR
    if not os.path.exists(IMAGE_PATH):
        raise FileNotFoundError(f"Image not found: {IMAGE_PATH}")
    img = Image.open(IMAGE_PATH).convert("RGB")
    ocr_text = pytesseract.image_to_string(img, lang=TESS_LANG, config=TESS_CFG)

    # New rule: only lowercase if ALL words are uppercase
    if is_all_upper_words(ocr_text):
        ocr_text = ocr_text.lower()


    print("=== OCR raw ===")
    print(ocr_text)

    # 2) Preprocess (your original function)
    cleaned_text = preprocess_ocr_text(ocr_text)
    print("\n=== Cleaned text ===")
    print(cleaned_text)

    # 3) NER (agg='none' so your merge_subwords + grouping work)
    tokenizer = AutoTokenizer.from_pretrained(MODEL_DIR, local_files_only=True)
    model = AutoModelForTokenClassification.from_pretrained(MODEL_DIR, local_files_only=True)
    print("\nModel num_labels:", model.num_labels)
    print("Model id2label:", model.config.id2label)
    ner_pipe = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="none")

    # 4) DrugBank vocab
    if not os.path.exists(DRUGBANK_CSV):
        raise FileNotFoundError(f"DrugBank CSV not found: {DRUGBANK_CSV}")
    vocab = load_drugbank_vocab(DRUGBANK_CSV, column="name")

    # 5) Inference + postprocessing
    # LOWERCASE ONLY FOR THE NER MODEL INPUT
    text_for_ner = cleaned_text.lower()
    final = infer_and_format(text_for_ner, vocab, ner_pipe)


    print("\n=== Final JSON ===")
    print(json.dumps(final, indent=4, ensure_ascii=False))

if __name__ == "__main__":
    main()


=== OCR raw ===
. r r =, Ar nn " — ee meee f
Melatonin 2mg PR Tab (CIRCADIN) Mi

se! Take ONE tablet(s) at bedtime ra
Take 1 to 2 hours before sleep. Swallow whole. Do not cut, crush or chew. 17
ful May cause drowsiness or dizziness. Do not drive or operate machinery. 1 7
23 i | Wi
Avoid alcohol. — ae | yy)

am
|
. i)


=== Cleaned text ===
r r Ar nn ee meee f Melatonin 2mg PR Tab CIRCADIN Mi se Take ONE tablet s at bedtime ra Take 1 to 2 hours before sleep Swallow whole Do not cut crush or chew 17 ful May cause drowsiness or dizziness Do not drive or operate machinery 1 7 23 i Wi Avoid alcohol ae yy am i

Model num_labels: 11
Model id2label: {0: 'B-DOSAGE', 1: 'I-DOSAGE', 2: 'B-FREQUENCY', 3: 'I-FREQUENCY', 4: 'B-INSTRUCTION', 5: 'I-INSTRUCTION', 6: 'B-MEDICATION_NAME', 7: 'I-MEDICATION_NAME', 8: 'B-NOTE', 9: 'I-NOTE', 10: 'O'}

=== Final JSON ===
{
    "medicationName": "melatonin",
    "intakeQuantity": 1,
    "frequency": 1,
    "instructions": "at bedtime swallow whole do not cut 