In [1]:
import re
import json
import time
import boto3
import pandas as pd
from json import JSONDecoder, JSONDecodeError
from pydantic import BaseModel

# ----------------------------
# 1. Configuration
# ----------------------------
DATA_PATH = 'filtered_keys_edit11 1.json'
REGION    = 'us-east-1'
MODEL_ARN = "arn:aws:bedrock:us-east-1:597571589726:inference-profile/us.deepseek.r1-v1:0"

# Bedrock client & model
runtime  = boto3.client("bedrock-runtime", region_name=REGION)
model_id = MODEL_ARN

# The prompt to send to Bedrock
prompt_text = """
You are a document-reading assistant. 
I will give you a block of OCR’d text from a single document. 
Parse it and respond with a single JSON object with exactly these keys at the top level (no nesting):  
- name (full name as string)  
- address (string)   
- dateOfBirth (string in DD/MM/YYYY format; if not found, "")   
- dateOfIssue (string in DD/MM/YYYY format; if not found, "") 
- dateOfExpiry (string in DD/MM/YYYY format; if not found, "")  
- licenceNumber (string; if not found, "")  
- passportNumber (string; if not found, "")  
- cardNumber (string; if not found, "")   
- identityNumber (string; if not found, "")   
- non_pii_text (array of strings):        
For each non-empty line in the OCR text:       
• Remove any numeric substrings, dates, or identifiers after the first digit or slash.        
• Trim whitespace.      
• Keep only the label portion (static text before any digit or slash).        
• Do not include empty strings in the array. If a field isn’t present in the text, set its value to an empty string. 
Return only valid JSON: a flat object with exactly those keys, no extra nesting or commentary. 

Here is one example:
"0d4068c9-c6db-435d-9ed6-c0317ecddf62-enhanced-1_pt.json": {
      "pseudo": "Learner Driver Licence New South Wales, Australia\nChung Jenkins LAMBERT\n. Card Number 2 659 265 265\n95 QMXOYCPEVJZ RD IXTZQVT NSW 2282\nLicence No 57250351 Licence Class C LRN\nDate of Birth 06 JAN 1987\nExpiry Date 03 AUG 2027",
      "ground_truth": {
        "name": "Chung Jenkins LAMBERT",
        "address": "95 QMXOYCPEVJZ RD IXTZQVT NSW 2282",
        "dateOfBirth": "06/01/1987",
        "dateOfIssue": "",
        "dateOfExpiry": "03/08/2027",
        "licenceNumber": "57250351",
        "passportNumber": "",
        "cardNumber": "2659265265",
        "identityNumber": "",
        "non_pii_text": [
          "Learner Driver Licence New South Wales, Australia",
          "Licence No",
          "Licence Class C LRN",
          "Date of Birth",
          "Expiry Date",
          "Card Number"
        ]
      }
    }

""".strip()

# ----------------------------
# 2. Pydantic model for validation
# ----------------------------
class RawDocumentInfo(BaseModel):
    name: str
    address: str
    dateOfBirth: str
    dateOfIssue: str
    dateOfExpiry: str
    licenceNumber: str
    passportNumber: str
    cardNumber: str
    identityNumber: str
    non_pii_text: list[str]

    @classmethod
    def __get_validators__(cls):
        yield from super().__get_validators__()
        yield cls._ensure_list

    @staticmethod
    def _ensure_list(v):
        return v or []

# ----------------------------
# 3. Function to parse a single OCR text block
# ----------------------------
def parse_document_text(input_text: str) -> dict:
    """
    Sends the OCR’d text to Bedrock, cleans/parses the JSON response robustly (using raw_decode),
    unwraps common wrappers, validates via Pydantic, and returns a dict with the
    10 PII keys + non_pii_text. Includes debug prints for raw/cleaned payloads.
    """
    resp = runtime.converse(
        modelId=model_id,
        messages=[{"role": "user", "content": [{"text": prompt_text}, {"text": input_text}]}],
        inferenceConfig={"maxTokens": 512, "temperature": 0.0}
    )

    # 1) extract the raw model payload
    raw = ""
    out = resp.get("output", {}).get("message", {}).get("content", [])
    if out:
        first = out[0]
        if isinstance(first, dict):
            raw = first.get("text") or first.get("content") or json.dumps(first)
        elif isinstance(first, str):
            raw = first
    if not raw:
        raw = json.dumps(resp)

    # 2) strip control tokens/fences
    cleaned = re.sub(r"<\|.*?\|>", "", raw, flags=re.S)
    cleaned = re.sub(r"^```(?:json)?\s*", "", cleaned, flags=re.I)
    cleaned = re.sub(r"\s*```$", "", cleaned, flags=re.I)
    cleaned = cleaned.strip()

    # ===== DEBUG DUMP =====
    print("\n=== DEBUG: RAW MODEL OUTPUT ===\n")
    print(raw)
    print("\n=== DEBUG: CLEANED TEXT ===\n")
    print(cleaned)
    print("\n===============================\n")

    # 3) first JSON decode (ignore trailing)
    decoder = JSONDecoder()
    try:
        parsed_json, end_idx = decoder.raw_decode(cleaned)
    except JSONDecodeError as e:
        print("❗ JSONDecodeError on cleaned text:")
        print(cleaned)
        raise

    trailing = cleaned[end_idx:].strip()
    if trailing:
        print(f"Warning: ignored trailing data: {repr(trailing[:100])}")

    # 4) unwrap common wrappers...
    #    (same as before)

    # 5) validate & return...
    doc_info = RawDocumentInfo.parse_obj(parsed_json)
    return doc_info.dict()
# ----------------------------
# 4. Metric helper
# ----------------------------
def compute_metrics(tp: int, fp: int, fn: int):
    p = tp/(tp+fp) if tp+fp > 0 else 0.0
    r = tp/(tp+fn) if tp+fn > 0 else 0.0
    f = 2*p*r/(p+r) if p+r > 0 else 0.0
    return p, r, f

# ----------------------------
# 5. Main loop: run, compare & collect metrics
# ----------------------------
metrics_list = []

with open(DATA_PATH, 'r', encoding='utf-8') as f:
    data = json.load(f)
#i=0
#k=0
for doc_type, files in data.items():
    #i=i+1
    #if i==20:
        #break
    for file_id, record in files.items():
        #k=k+1
        #if k==20:
            #break
        ocr_text = record.get('pseudo', '').strip()
        if not ocr_text:
            continue

        start = time.time()
        predicted = parse_document_text(ocr_text)
        latency = time.time() - start

        # predicted sets
        pii_fields = {
            predicted[k].strip()
            for k in ['name','address','dateOfBirth','dateOfIssue',
                      'dateOfExpiry','licenceNumber','passportNumber',
                      'cardNumber','identityNumber']
            if predicted.get(k, "").strip()
        }
        non_pii_fields = set(predicted.get('non_pii_text', []))

        # ground-truth sets
        gt = record.get('ground_truth', {}) or {}
        gt_pii = {
            gt[k].strip()
            for k in ['name','address','dateOfBirth','dateOfIssue',
                      'dateOfExpiry','licenceNumber','passportNumber',
                      'cardNumber','identityNumber']
            if gt.get(k, "").strip()
        }
        gt_non_pii = { s.strip() for s in gt.get('non_pii_text', []) if s.strip() }

        # compute metrics
        tp_pii = len(pii_fields & gt_pii)
        fp_pii = len(pii_fields - gt_pii)
        fn_pii = len(gt_pii - pii_fields)
        p_pii, r_pii, f1_pii = compute_metrics(tp_pii, fp_pii, fn_pii)

        tp_non = len(non_pii_fields & gt_non_pii)
        fp_non = len(non_pii_fields - gt_non_pii)
        fn_non = len(gt_non_pii - non_pii_fields)
        p_non, r_non, f1_non = compute_metrics(tp_non, fp_non, fn_non)

        total_gt = len(gt_pii) + len(gt_non_pii)
        accuracy = (tp_pii + tp_non) / total_gt if total_gt > 0 else 0.0

        # collect metrics
        metrics_list.append({
            'file_id': file_id,
            'latency_s': round(latency, 2),
            'p_pii': round(p_pii, 4),
            'r_pii': round(r_pii, 4),
            'f1_pii': round(f1_pii, 4),
            'p_non': round(p_non, 4),
            'r_non': round(r_non, 4),
            'f1_non': round(f1_non, 4),
            'accuracy': round(accuracy, 4),
        })

        # print per-doc
        print(f"File ID: {file_id}")
        print(f"Latency: {latency:.2f}s")
        print(json.dumps(predicted, indent=2))
        print(json.dumps(gt, indent=2))
        print("-"*60)


  from pandas.core.computation.check import NUMEXPR_INSTALLED



=== DEBUG: RAW MODEL OUTPUT ===

{"reasoningContent": {"reasoningText": {"text": "Okay, let's tackle this OCR text. First, I need to parse all the required fields. The name is Fox Gonzalez WEAVER, that's straightforward. The address is the line starting with 48, so that's \"48 FHQ WCMEAPQJ LHQABHJ DAGZJBC NSW 4819\". \n\nDate of Birth is 07 MAR 1983, which converts to 07/03/1983. Expiry Date is 17 DEC 2020, so 17/12/2020. There's no date of issue mentioned, so that's empty.\n\nLicence Number: The line says \"Licence Na 40419631\", which I think is a typo for \"Licence No\", so the number is 40419631. Passport number isn't present. Card Number is \"5 610 827 648\", which should be concatenated to 5610827648. Identity Number isn't here either.\n\nFor non_pii_text, each line needs processing. The first line is \"Learner Driver Licence New South Wales, Australia\" \u2013 no numbers, so it stays. \"Fox Gonzalez WEAVER\" becomes just the name label, but since it's the name line, maybe it's 

ValidationError: 10 validation errors for RawDocumentInfo
name
  Field required [type=missing, input_value={'reasoningContent': {'re... Licence Na 40419631'}}}, input_type=dict]
    For further information visit https://errors.pydantic.dev/2.9/v/missing
address
  Field required [type=missing, input_value={'reasoningContent': {'re... Licence Na 40419631'}}}, input_type=dict]
    For further information visit https://errors.pydantic.dev/2.9/v/missing
dateOfBirth
  Field required [type=missing, input_value={'reasoningContent': {'re... Licence Na 40419631'}}}, input_type=dict]
    For further information visit https://errors.pydantic.dev/2.9/v/missing
dateOfIssue
  Field required [type=missing, input_value={'reasoningContent': {'re... Licence Na 40419631'}}}, input_type=dict]
    For further information visit https://errors.pydantic.dev/2.9/v/missing
dateOfExpiry
  Field required [type=missing, input_value={'reasoningContent': {'re... Licence Na 40419631'}}}, input_type=dict]
    For further information visit https://errors.pydantic.dev/2.9/v/missing
licenceNumber
  Field required [type=missing, input_value={'reasoningContent': {'re... Licence Na 40419631'}}}, input_type=dict]
    For further information visit https://errors.pydantic.dev/2.9/v/missing
passportNumber
  Field required [type=missing, input_value={'reasoningContent': {'re... Licence Na 40419631'}}}, input_type=dict]
    For further information visit https://errors.pydantic.dev/2.9/v/missing
cardNumber
  Field required [type=missing, input_value={'reasoningContent': {'re... Licence Na 40419631'}}}, input_type=dict]
    For further information visit https://errors.pydantic.dev/2.9/v/missing
identityNumber
  Field required [type=missing, input_value={'reasoningContent': {'re... Licence Na 40419631'}}}, input_type=dict]
    For further information visit https://errors.pydantic.dev/2.9/v/missing
non_pii_text
  Field required [type=missing, input_value={'reasoningContent': {'re... Licence Na 40419631'}}}, input_type=dict]
    For further information visit https://errors.pydantic.dev/2.9/v/missing