# Playground Notebook

This notebook is a scratchpad for experimenting with the project's document parsing and content extraction code.

In [None]:
import os
# Set DATABASE_URL for this session; presumably read by the project's DB layer
# (e.g. create_db_engine in the end-to-end cell) — confirm.
# NOTE(review): this unconditionally overwrites any DATABASE_URL already present in
# the environment and embeds credentials in the notebook — local experiments only.
os.environ['DATABASE_URL'] = "postgresql+psycopg2://postgres:postgres@localhost:5432/rfp_db"


In [None]:

import sys
import argparse
import logging
from io import BytesIO
import numpy as np
# Azure Imports
from azure.storage.blob import BlobClient
from azure.identity import DefaultAzureCredential

# Local Imports from your existing codebase
from config import config
from shared.services.doc_intelligence import DocumentParser, MarginFilter

# from shared.services.semantic import SemanticAnalyzer

# from shared.services.semantic_5 import AgenticSemanticAnalyzer, extract_rfp_questions
from shared.extractors import ExtractorFactory, QuestionExtractorConfig, QAExtractorConfig
from shared.extractors.base import SectionType
from shared.extractors import ExtractorFactory, QuestionExtractorConfig
from shared.extractors.base import SectionType

from shared.extractors.base.models import ExtractedQAPair

from shared.extractors import ExtractorFactory
import json
import fitz
import pymupdf4llm

# Single factory instance reused by the extraction cells below to build
# question, QA and structure extractors.
factory = ExtractorFactory()


In [None]:

# Configure Logging
# Root logger at INFO level; each record is rendered as "<timestamp> - <level> - <message>".
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

def get_blob_content(blob_url: str) -> bytes:
    """Fetch a blob from Azure Storage and return its whole payload as bytes.

    The client is built from ``blob_url`` together with a
    ``DefaultAzureCredential``. Any exception raised while connecting or
    downloading is logged and the process is terminated via ``sys.exit(1)``.
    """
    # Only the part of the URL before '?' is logged, so a SAS token in the
    # query string never reaches the log output.
    logging.info(f"Connecting to Blob: {blob_url.split('?')[0]}...")

    try:
        # DefaultAzureCredential resolves credentials from the environment
        # (env vars or an `az login` session).
        # NOTE(review): the original comment stated that the SDK ignores this
        # credential when the URL carries a SAS token; the azure-storage-blob
        # docs appear to describe the opposite precedence — confirm.
        credential = DefaultAzureCredential()
        client = BlobClient.from_blob_url(blob_url, credential=credential)

        return client.download_blob().readall()
    except Exception as e:
        logging.error(f"Failed to download blob: {e}")
        sys.exit(1)


def analyze_pdf_with_docint(blob_url: str):
    """Download a PDF blob and parse it with Azure Document Intelligence.

    Args:
        blob_url: URL of the blob; forwarded to ``get_blob_content``.

    Returns:
        Whatever ``DocumentParser.parse_stream`` returns for the downloaded
        bytes, or ``None`` when parsing raises. Callers that index the result
        (e.g. ``doc['content']``) must handle the ``None`` case.
    """
    # Azure Blob Storage
    pdf_bytes = get_blob_content(blob_url)
    logging.info(f"Downloaded {len(pdf_bytes)} bytes.")

    # Azure Document Intelligence
    parser = DocumentParser()
    logging.info("Sending to Azure Document Intelligence...")
    try:
        # The raw bytes are handed to parse_stream as-is.
        result = parser.parse_stream(pdf_bytes)
    except Exception as e:
        # Best-effort behaviour is kept (no re-raise), but the traceback is
        # now logged and the None return is explicit rather than implicit.
        logging.exception(f"Analysis failed: {e}")
        return None

    logging.info("Analysis Complete.")
    return result


def analyze_pdf_with_pymupdf4llm(blob_url: str):
    """Download a PDF blob and convert it to Markdown with pymupdf4llm.

    Args:
        blob_url: URL of the blob; forwarded to ``get_blob_content``.

    Returns:
        The value returned by ``pymupdf4llm.to_markdown`` for the document.
    """
    # Azure Blob Storage
    pdf_bytes = get_blob_content(blob_url)
    logging.info(f"Downloaded {len(pdf_bytes)} bytes.")

    # pymupdf4llm
    # Open the in-memory PDF inside a context manager so the document is
    # closed once the Markdown has been produced (it was previously leaked).
    with fitz.open("pdf", pdf_bytes) as doc:
        return pymupdf4llm.to_markdown(doc)


def analyze_pdf_free(pdf_filepath: str):
    """Convert a local PDF to Markdown with pymupdf4llm, without Azure services.

    Args:
        pdf_filepath: Path of the PDF file on disk.

    Returns:
        A dict with a single key, ``'content'``, holding the Markdown produced
        by ``pymupdf4llm.to_markdown``. The key matches how the extraction
        cells read parsed documents (``doc['content']``).
    """
    # The context manager closes the document when done. The previous unused
    # `doc.tobytes()` call (a full in-memory serialisation) has been removed.
    with fitz.open(pdf_filepath) as doc:
        markdown_text = pymupdf4llm.to_markdown(doc)
    return {'content': markdown_text}



In [None]:

def levenshtein_distance(s1: str, s2: str) -> int:
    """Return the case-insensitive edit distance between ``s1`` and ``s2``.

    Both strings are lowercased first. The distance counts single-character
    insertions, deletions and substitutions, computed row by row so only two
    rows of the dynamic-programming table are held at a time.
    """
    longer, shorter = s1.lower(), s2.lower()
    # Keep the shorter string on the column axis so rows stay as small as possible.
    if len(longer) < len(shorter):
        longer, shorter = shorter, longer

    # Row 0: distance from the empty prefix to each prefix of `shorter`.
    prev_row = list(range(len(shorter) + 1))

    for row_idx, ch_long in enumerate(longer, start=1):
        row = [row_idx]
        for col_idx, ch_short in enumerate(shorter, start=1):
            mismatch = 0 if ch_long == ch_short else 1
            row.append(
                min(
                    prev_row[col_idx] + 1,             # insertion
                    row[col_idx - 1] + 1,              # deletion
                    prev_row[col_idx - 1] + mismatch,  # substitution
                )
            )
        prev_row = row

    return prev_row[-1]


def levenshtein_similarity(s1: str, s2: str) -> float:
    """
    Normalize Levenshtein distance into a similarity score [0, 1].
    1.0 = exact match (case-insensitive); two empty strings also score 1.0.
    """
    distance = levenshtein_distance(s1, s2)
    # levenshtein_distance compares the lowercased strings, so normalise by
    # the lowercased lengths too: str.lower() can change a string's length
    # (e.g. "İ"), which previously let the score fall outside [0, 1].
    max_len = max(len(s1.lower()), len(s2.lower()))
    return 1 - distance / max_len if max_len > 0 else 1.0


def max_similarity_per_question(questions_dict, questions_ground_truth_2):
    """Find, for every extracted question, its closest ground-truth question.

    Both arguments are mappings of key -> question text. The result maps each
    key of ``questions_dict`` to a dict with the key of the best-scoring
    ground-truth entry (``None`` when no entry scores above 0) and that score
    rounded to four decimals.
    """

    def _closest(text):
        top_key, top_score = None, 0.0
        for candidate_key, candidate_text in questions_ground_truth_2.items():
            candidate_score = levenshtein_similarity(text, candidate_text)
            # Strictly greater: the first entry reaching the top score wins ties.
            if candidate_score > top_score:
                top_key, top_score = candidate_key, candidate_score
        return {
            "best_ground_truth_question": top_key,
            "max_similarity": round(top_score, 4),
        }

    return {q_key: _closest(q_text) for q_key, q_text in questions_dict.items()}


def count_accuracy(extracted_items, ground_truth_dict):
    """Score how close the number of extracted items is to the expected number.

    Args:
        extracted_items: Sized collection of extracted items.
        ground_truth_dict: Sized collection of ground-truth items.

    Returns:
        ``1 - |n_extracted - n_expected| / n_expected``. 1.0 means the counts
        match; the value drops as the counts diverge in either direction and
        becomes negative when more than twice the expected number of items is
        extracted. When the ground truth is empty, returns 1.0 if nothing was
        extracted and 0.0 otherwise.
    """
    y_hat = len(extracted_items)
    y = len(ground_truth_dict)
    if y == 0:
        # Relative error is undefined with nothing expected; avoid ZeroDivisionError.
        return 1.0 if y_hat == 0 else 0.0
    # abs() penalises over- and under-extraction alike. Without it,
    # extracting too few items produced an "accuracy" above 1.
    return 1 - abs(y_hat - y) / y


def content_similarity(extracted_items, ground_truth_items):
    """Return, for each extracted text, its best similarity against the ground truth.

    The result has one score per entry of ``extracted_items``, in order. Each
    score is the highest ``levenshtein_similarity`` between that entry and any
    entry of ``ground_truth_items``.
    """
    return [
        max(levenshtein_similarity(candidate, reference) for reference in ground_truth_items)
        for candidate in extracted_items
    ]


def display_metrics(extractions, ground_truth, structure=None):
    """Print count and similarity metrics for an extraction run.

    Args:
        extractions: Sequence of extracted items exposing ``question_text``;
            when the items are ``ExtractedQAPair`` instances their
            ``answer_text`` is evaluated as well.
        ground_truth: Dict with a ``'questions_answers'`` list (entries with
            ``'question'`` and ``'answer'`` keys) and, when ``structure`` is
            given, a ``'sections'`` list (entries with a ``'title'`` key).
        structure: Optional object whose ``sections`` items expose
            ``section_title``. Section metrics are skipped when ``None``.

    An empty ``extractions`` sequence is reported instead of raising.
    """

    def _print_similarity(label, scores):
        # np.min / np.max raise on an empty sequence, so report it explicitly.
        if not scores:
            print(f"{label}: no items to compare")
            return
        print(f"{label}: Average={np.mean(scores):.2f} Min={np.min(scores):.2f} Max={np.max(scores):.2f}")

    # Questions
    y_questions = [q['question'] for q in ground_truth['questions_answers']]
    yhat_questions = [q.question_text for q in extractions]
    q_similarity = content_similarity(yhat_questions, y_questions)
    print(f"Question (or pair) count accuracy: {count_accuracy(yhat_questions, y_questions):.2f}")
    _print_similarity("Questions similarity", q_similarity)

    # Answers
    # Guard against empty extractions before inspecting the first element.
    if extractions and isinstance(extractions[0], ExtractedQAPair):
        y_answers = [q['answer'] for q in ground_truth['questions_answers']]
        # A missing answer is compared as an empty string.
        yhat_answers = [
            q.answer_text if q.answer_text is not None else ""
            for q in extractions
        ]
        a_similarity = content_similarity(yhat_answers, y_answers)
        _print_similarity("Answers similarity", a_similarity)
    else:
        print("No answers to be evaluated")

    # Section
    if structure is not None:
        y_sections = [q['title'] for q in ground_truth['sections']]
        yhat_sections = [q.section_title for q in structure.sections]
        s_similarity = content_similarity(yhat_sections, y_sections)
        print(f"Section count accuracy: {count_accuracy(yhat_sections, y_sections):.2f}")
        _print_similarity("Section similarity", s_similarity)
    else:
        print("No document structure to be evaluated")



In [None]:
# Ground-truth annotations used by display_metrics.
# JSON files are read as UTF-8 explicitly, so non-ASCII characters decode the
# same way regardless of the machine's locale.
# NOTE(review): the paths are absolute and machine-specific.

# Ground truth for the "RFP - Email Service Provider" document.
with open("/home/dougsgrott-wsl/projects/local-doc-extractor/data/ground_truth/rfp_email.json", "r", encoding="utf-8") as f:
    questions_ground_truth_esp = json.load(f)

# Ground truth paired with the "Public Market" documents in the cells below.
with open("/home/dougsgrott-wsl/projects/local-doc-extractor/data/ground_truth/rfp_market.json", "r", encoding="utf-8") as f:
    questions_ground_truth_ip = json.load(f)


## Document loading
Ultimately, the PDF documents will be loaded from Azure Blob Storage, and the text can be parsed with either Azure Document Intelligence or an open-source Python package.

Pros of Azure Document Intelligence:
- Can be used for multiple file types (pdf, docx, etc)
- Every extracted text element comes with a bounding box describing its location in the document.
  - This can be useful to filter out headers and footers, diminishing OCR noise.
- The service tries to extract the role of each element (paragraph, header, section, etc.). This would be useful for programmatically reconstructing the document structure without AI. The bad news is that, in practice, the role detection works poorly.

Cons of Azure Document Intelligence:
- It's very, very costly.

Here, I propose and experiment with an alternative: an open-source Python package that handles PDFs only (`pymupdf4llm`).

In [None]:
# Compare different PDF loaders
# The same document is loaded two ways: from Blob Storage through Azure Document
# Intelligence, and from a local copy through pymupdf4llm.
blob_url_ = "https://stdocpipelinedev24o1.blob.core.windows.net/input-pdfs/RFP - Email Service Provider.pdf"
pdf_filepath_ = "/home/dougsgrott-wsl/projects/local-doc-extractor/data/RFP - Email Service Provider.pdf"

# text1: result of DocumentParser.parse_stream (None if the analysis failed).
text1 = analyze_pdf_with_docint(blob_url_)
# text2: dict holding the Markdown text under the 'content' key.
text2 = analyze_pdf_free(pdf_filepath_)


# Content Extraction

## RFP - Email Service Provider
### (without answers)

In [None]:
# Parse the questions-only Email Service Provider RFP; `doc` is consumed by the next cells.
url = "https://stdocpipelinedev24o1.blob.core.windows.net/input-pdfs/RFP - Email Service Provider.pdf"
doc = analyze_pdf_with_docint(url)
# NOTE(review): this parser is created after `doc` was already parsed and is not used by
# the cells visible here; analyze_pdf_with_docint builds its own DocumentParser() without
# a margin filter, so these margins do not affect `doc` — confirm intent.
parser = DocumentParser(margin_filter=MarginFilter(top=0.08, bottom=0.05))


In [None]:
# Question extraction with the "simple_llm" extractor.
# NOTE(review): `config` rebinds the name imported by `from config import config`, and it
# is not passed to the factory here (the argument is commented out).
config = QuestionExtractorConfig(allowed_section_types=[SectionType.QUESTIONNAIRE])
question_extractor = factory.create_question_extractor("simple_llm") #, config
# Extraction runs on the parsed text stored under doc['content'].
questions_esp_simple = question_extractor.extract(doc['content'])


In [None]:
# Score the simple extractor against the Email Service Provider ground truth.
# No structure is passed, so the section metrics are skipped.
display_metrics(questions_esp_simple, questions_ground_truth_esp)

In [None]:
# Question extraction with the "context_aware" extractor, restricted to QUESTIONNAIRE
# sections and backed by a "structure_aware" structure extractor.
# NOTE(review): `config` rebinds the name imported by `from config import config`.
config = QuestionExtractorConfig(allowed_section_types=[SectionType.QUESTIONNAIRE])
structure_extractor = factory.create_structure_extractor("structure_aware")
question_extractor = factory.create_question_extractor("context_aware", config)
question_extractor.set_structure_extractor(structure_extractor)

questions_esp_aware = question_extractor.extract(doc['content'])
# Read after extract(); presumably populated during extraction — confirm.
structure_aware = question_extractor.structure

In [None]:
# Score the context-aware extractor, including section metrics from the detected structure.
display_metrics(questions_esp_aware, questions_ground_truth_esp, structure_aware)

### (with answers)

In [None]:
# Parse the Email Service Provider RFP that already contains answers; replaces `doc`.
url = "https://stdocpipelinedev24o1.blob.core.windows.net/input-pdfs/RFP - Email Service Provider (with answers) v5.pdf"
doc = analyze_pdf_with_docint(url)
# NOTE(review): this parser is created after parsing and is not used by the cells visible
# here; analyze_pdf_with_docint builds its own DocumentParser() without a margin filter.
parser = DocumentParser(margin_filter=MarginFilter(top=0.08, bottom=0.05))


In [None]:
# Question/answer extraction with the "simple" QA extractor.
# NOTE(review): `config` rebinds the name imported by `from config import config`, and it
# is not passed to the factory here (the argument is commented out).
config = QAExtractorConfig(allowed_section_types=[SectionType.QUESTIONNAIRE])
question_extractor = factory.create_qa_extractor("simple") #, config
# Overwrites the questions-only result stored under the same name by an earlier cell.
questions_esp_simple = question_extractor.extract(doc['content'])


In [None]:
# Score the QA extraction; answer metrics are printed only when the extracted items are
# ExtractedQAPair instances (see display_metrics).
display_metrics(questions_esp_simple, questions_ground_truth_esp)


## RFP - Public Market, Template 1
 
### (without answers)

In [None]:

# Parse the questions-only Public Market template 1 document; replaces `doc`.
url = "https://stdocpipelinedev24o1.blob.core.windows.net/input-pdfs/Public Market - Q template 1.pdf"
doc = analyze_pdf_with_docint(url)
# NOTE(review): this parser is created after parsing and is not used by the cells visible
# here; analyze_pdf_with_docint builds its own DocumentParser() without a margin filter.
parser = DocumentParser(margin_filter=MarginFilter(top=0.08, bottom=0.05))


In [None]:
# Question extraction with the "simple_llm" extractor on the Public Market document.
# NOTE(review): `config` rebinds the imported `config` name and is not passed to the
# factory here (the argument is commented out).
config = QuestionExtractorConfig(allowed_section_types=[SectionType.QUESTIONNAIRE])
question_extractor = factory.create_question_extractor("simple_llm") #, config
# NOTE(review): the `esp` variable name is reused although this document is the
# Public Market RFP, not the Email Service Provider one.
questions_esp_simple = question_extractor.extract(doc['content'])


In [None]:
# Score against the ground truth loaded from rfp_market.json; section metrics are skipped.
display_metrics(questions_esp_simple, questions_ground_truth_ip)


In [None]:
# Question extraction with the "context_aware" extractor, restricted to QUESTIONNAIRE
# sections and backed by a "structure_aware" structure extractor.
# NOTE(review): `config` rebinds the name imported by `from config import config`.
config = QuestionExtractorConfig(allowed_section_types=[SectionType.QUESTIONNAIRE])
structure_extractor = factory.create_structure_extractor("structure_aware")
question_extractor = factory.create_question_extractor("context_aware", config)
question_extractor.set_structure_extractor(structure_extractor)

# NOTE(review): the `esp` variable name is reused for the Public Market document.
questions_esp_aware = question_extractor.extract(doc['content'])
# Read after extract(); presumably populated during extraction — confirm.
structure_aware = question_extractor.structure

In [None]:
# Score the context-aware extraction against rfp_market.json, including section metrics.
display_metrics(questions_esp_aware, questions_ground_truth_ip, structure_aware)

### (with answers)

In [None]:

# Parse the Public Market template 1 document that includes answers; replaces `doc`.
url = "https://stdocpipelinedev24o1.blob.core.windows.net/input-pdfs/Public Market - QA Template 1.pdf"
doc = analyze_pdf_with_docint(url)
# NOTE(review): this parser is created after parsing and is not used by the cells visible
# here; analyze_pdf_with_docint builds its own DocumentParser() without a margin filter.
parser = DocumentParser(margin_filter=MarginFilter(top=0.08, bottom=0.05))


In [None]:
# Question/answer extraction with the "simple" QA extractor on the Public Market document.
# NOTE(review): `config` rebinds the imported `config` name and is not passed to the
# factory here (the argument is commented out).
config = QAExtractorConfig(allowed_section_types=[SectionType.QUESTIONNAIRE])
question_extractor = factory.create_qa_extractor("simple") #, config
# NOTE(review): the `esp` variable name is reused for the Public Market document.
questions_esp_simple = question_extractor.extract(doc['content'])


In [None]:
# Score the QA extraction against rfp_market.json; answer metrics are printed only when
# the extracted items are ExtractedQAPair instances (see display_metrics).
display_metrics(questions_esp_simple, questions_ground_truth_ip)


In [None]:
# Question/answer extraction with the "context_aware" QA extractor, restricted to
# QUESTIONNAIRE sections and backed by a "structure_aware" structure extractor.
# NOTE(review): `config` rebinds the name imported by `from config import config`.
config = QAExtractorConfig(allowed_section_types=[SectionType.QUESTIONNAIRE])
structure_extractor = factory.create_structure_extractor("structure_aware")
question_extractor = factory.create_qa_extractor("context_aware", config)
question_extractor.set_structure_extractor(structure_extractor)

# NOTE(review): the `esp` variable name is reused for the Public Market document.
questions_esp_aware = question_extractor.extract(doc['content'])
# Read after extract(); presumably populated during extraction — confirm.
structure_aware = question_extractor.structure

In [None]:
# Score the context-aware QA extraction against rfp_market.json, including section metrics.
display_metrics(questions_esp_aware, questions_ground_truth_ip, structure_aware)


## RFP - Public Market, Template 2

### (without answers)

In [None]:

# Run four question-extractor variants on the same Public Market template 2 document:
# "context_aware" and "agentic", each without and with a structure extractor attached.
url = "https://stdocpipelinedev24o1.blob.core.windows.net/input-pdfs/Public Market - Q template 2.pdf"
doc = analyze_pdf_with_docint(url)


# NOTE(review): `config` rebinds the name imported by `from config import config`.
config = QuestionExtractorConfig(allowed_section_types=[SectionType.QUESTIONNAIRE])

# One structure extractor instance, shared by both structure-backed variants below.
structure_extractor = factory.create_structure_extractor("structure_aware")

# 1) context_aware, no structure extractor attached.
question_extractor_ca = factory.create_question_extractor("context_aware", config)
questions_ca = question_extractor_ca.extract(doc['content'])

# Blank lines visually separate the output of each run.
print('\n\n\n')
# 2) agentic, no structure extractor attached.
question_extractor_ag = factory.create_question_extractor("agentic", config)
questions_ag = question_extractor_ag.extract(doc['content'])

print('\n\n\n')
# 3) context_aware with the structure extractor.
question_extractor_st_ca = factory.create_question_extractor("context_aware", config)
question_extractor_st_ca.set_structure_extractor(structure_extractor)
questions_st_ca = question_extractor_st_ca.extract(doc['content'])

print('\n\n\n')
# 4) agentic with the structure extractor.
question_extractor_st_ag = factory.create_question_extractor("agentic", config)
question_extractor_st_ag.set_structure_extractor(structure_extractor)
questions_st_ag = question_extractor_st_ag.extract(doc['content'])



# End-to-End Pipeline

This section combines document loading, OCR/text parsing, content extraction, and database persistence.

The main idea is to use dependency injection so that the same components can be reused across different pipelines or Azure Functions, keeping the code reusable and modular.

In [None]:
from shared.core.processor import DocumentProcessor
from shared.models.db import create_db_engine
from sqlalchemy.orm import Session

# End-to-end run: download the PDF, then hand the raw bytes to DocumentProcessor.
url = "https://stdocpipelinedev24o1.blob.core.windows.net/input-pdfs/RFP - Email Service Provider.pdf"


pdf_bytes = get_blob_content(url)
# create_db_engine() takes no arguments here; presumably it reads DATABASE_URL set in
# the first cell — confirm.
engine = create_db_engine()
# parser/analyzer/extractor are the dependency-injection points; None is passed for all
# three — presumably DocumentProcessor falls back to defaults, confirm.
# NOTE(review): the Session created inline is never closed or committed in this cell.
# Closing it here could detach `result` if that is an ORM object, so verify before
# wrapping it in a `with` block.
processor = DocumentProcessor(
    Session(engine),
    parser=None,
    analyzer=None,
    extractor=None,
)
# Arguments: raw PDF bytes, then "file.pdf" and "Client Name" (placeholder-looking
# values; their meaning is defined by process_document — TODO confirm).
result = processor.process_document(pdf_bytes, "file.pdf", "Client Name")
