In [None]:
from datetime import datetime, timezone
from pathlib import Path

import pandas as pd

from azure.ai.formrecognizer import DocumentAnalysisClient
from azure.identity import DefaultAzureCredential

In [None]:
def _make_record(document_id, extracted_at, content_type, field_name, content,
                 confidence_score=None):
    """Build one flat output row; every row shares the same six keys."""
    return {
        "document_id": document_id,
        "content_type": content_type,
        "field_name": field_name,
        "content": content,
        "confidence_score": confidence_score,
        "extracted_at": extracted_at,
    }


def analyze_file(file_path: Path):
    """Analyze one document and flatten the result into a list of row dicts.

    The file is submitted to the ``prebuilt-document`` model through the
    module-level ``client`` (expected to be defined in an earlier cell).

    Rows are emitted in this order, one group per ``content_type``:

    * ``full_text``            - the whole document content (always one row)
    * ``paragraph``            - one row per paragraph; ``field_name`` is the
                                 paragraph role, or ``None`` when it has none
    * ``key_value``            - one row per key/value pair, with confidence
    * ``table_cell``           - one row per cell; ``field_name`` is
                                 ``table_<i>_r<row>_c<col>``
    * ``handwriting_detected`` - one row per style flagged as handwritten

    Args:
        file_path: Path of the document to analyze. Its stem becomes the
            ``document_id`` of every returned row.

    Returns:
        list[dict]: rows with the keys ``document_id``, ``content_type``,
        ``field_name``, ``content``, ``confidence_score``, ``extracted_at``.

    Raises:
        Whatever ``open``, ``client.begin_analyze_document`` or
        ``poller.result`` raise; nothing is caught here.
    """
    document_id = file_path.stem

    print(f"Processing: {file_path.name}")

    with open(file_path, "rb") as f:
        poller = client.begin_analyze_document(
            "prebuilt-document",
            document=f
        )

    result = poller.result()

    # One timestamp per document so all of its rows agree. tzinfo is stripped
    # to keep the naive-UTC ISO string that datetime.utcnow() used to produce
    # (utcnow() itself is deprecated since Python 3.12).
    extracted_at = datetime.now(timezone.utc).replace(tzinfo=None).isoformat()

    records = [
        _make_record(document_id, extracted_at, "full_text", None, result.content)
    ]

    for para in result.paragraphs or []:
        # str(None) would store the literal text "None"; keep a real null.
        role = str(para.role) if para.role is not None else None
        records.append(
            _make_record(document_id, extracted_at, "paragraph", role, para.content)
        )

    for kv in result.key_value_pairs or []:
        # Either side of a pair may be missing.
        key = kv.key.content if kv.key else None
        value = kv.value.content if kv.value else None
        records.append(
            _make_record(document_id, extracted_at, "key_value", key, value,
                         kv.confidence)
        )

    for table_index, table in enumerate(result.tables or []):
        for cell in table.cells:
            records.append(
                _make_record(
                    document_id,
                    extracted_at,
                    "table_cell",
                    f"table_{table_index}_r{cell.row_index}_c{cell.column_index}",
                    cell.content,
                )
            )

    for style in result.styles or []:
        if style.is_handwritten:
            records.append(
                _make_record(
                    document_id,
                    extracted_at,
                    "handwriting_detected",
                    None,
                    "Handwritten content present",
                    style.confidence,
                )
            )

    return records


In [None]:
# File extensions the batch loop accepts (compared case-insensitively).
SUPPORTED_SUFFIXES = {".png", ".jpg", ".jpeg", ".pdf"}

all_records = []

# iterdir() yields entries in arbitrary order; sorting makes the row order
# reproducible across runs.
for file in sorted(INPUT_FOLDER.iterdir()):

    # Skip directories and anything that is not a supported document type.
    if not file.is_file() or file.suffix.lower() not in SUPPORTED_SUFFIXES:
        continue

    try:
        file_records = analyze_file(file)
        all_records.extend(file_records)

    except Exception as e:
        # Best-effort batch: report the failure and carry on with the next file.
        print(f"FAILED: {file.name}")
        print(e)


In [None]:
# Explicit schema: fixes the column order and keeps the columns present even
# when no records were extracted (a bare DataFrame([]) has no columns at all).
RECORD_COLUMNS = [
    "document_id",
    "content_type",
    "field_name",
    "content",
    "confidence_score",
    "extracted_at",
]

df = pd.DataFrame(all_records, columns=RECORD_COLUMNS)

print("Total extracted rows:", len(df))
df.head()
