# Playground Notebook

This notebook is meant to be temporarily used to experiment with different parts of the code contained in the project regarding document parsing and extraction.

In [1]:
import os
import sys
import argparse
import logging
from io import BytesIO
import numpy as np
import json
import fitz
import pymupdf4llm
from azure.storage.blob import BlobClient
from azure.identity import DefaultAzureCredential

# Local Imports from your existing codebase
from shared.extractors import ExtractorFactory, QuestionExtractorConfig, QAExtractorConfig
from shared.extractors.base import SectionType
from shared.extractors.base.models import ExtractedQAPair
from shared.parsers.pymupdf_parser import PyMuPDFParser
from shared.parsers.azure_parser import AzureDocumentParser
from shared.llm import create_llm_client
from config import config


factory = ExtractorFactory()

# Configure Logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')


Consider using the pymupdf_layout package for a greatly improved page layout analysis.


In [2]:
# This utility function can be used to get the PDF bytes
# from a PDF on a blob storage given a blob URL.

# This can also be done with local PDFs and open source
# python packages, which was done throughout recent developments
# to minimize costs.

def get_blob_content(blob_url: str) -> bytes:
    """Downloads the blob content into memory."""
    logging.info(f"Connecting to Blob: {blob_url.split('?')[0]}...") # Log URL without SAS token
    
    try:
        # Try connecting with Default Credential (env vars or az login)
        # If the URL contains a SAS token, the credential is ignored automatically by the SDK
        blob_client = BlobClient.from_blob_url(
            blob_url, 
            credential=DefaultAzureCredential()
        )
        
        download_stream = blob_client.download_blob()
        return download_stream.readall()
    except Exception as e:
        logging.error(f"Failed to download blob: {e}")
        sys.exit(1)


In [3]:
# These are very crude functions to calculate and display metrics
# related to the extraction. Since they might not be included
# in a production environment, I'm leaving them here.


def levenshtein_distance(s1: str, s2: str) -> int:
    """Compute Levenshtein distance between two strings."""
    s1, s2 = s1.lower(), s2.lower()

    if len(s1) < len(s2):
        s1, s2 = s2, s1

    previous_row = list(range(len(s2) + 1))

    for i, c1 in enumerate(s1, start=1):
        current_row = [i]
        for j, c2 in enumerate(s2, start=1):
            insertions = previous_row[j] + 1
            deletions = current_row[j - 1] + 1
            substitutions = previous_row[j - 1] + (c1 != c2)
            current_row.append(min(insertions, deletions, substitutions))
        previous_row = current_row

    return previous_row[-1]


def levenshtein_similarity(s1: str, s2: str) -> float:
    """
    Normalize Levenshtein distance into a similarity score [0, 1].
    1.0 = exact match
    """
    distance = levenshtein_distance(s1, s2)
    max_len = max(len(s1), len(s2))
    return 1 - distance / max_len if max_len > 0 else 1.0


def max_similarity_per_question(questions_dict, questions_ground_truth_2):
    results = {}

    for q_key, q_text in questions_dict.items():
        best_match = None
        best_score = 0.0

        for gt_key, gt_text in questions_ground_truth_2.items():
            score = levenshtein_similarity(q_text, gt_text)

            if score > best_score:
                best_score = score
                best_match = gt_key

        results[q_key] = {
            "best_ground_truth_question": best_match,
            "max_similarity": round(best_score, 4)
        }

    return results


def count_accuracy(extracted_items, ground_truth_dict):
    y_hat = len(extracted_items)
    y = len(ground_truth_dict)
    # acc = 1 - (abs(y_hat - y)/y)
    acc = 1 - (y_hat - y)/y
    return acc


def content_similarity(extracted_items, ground_truth_items):
    similarity_list = []
    for y_hat in extracted_items:
        similarity = max([levenshtein_similarity(y_hat, y) for y in ground_truth_items])
        similarity_list.append(similarity)
    return similarity_list


def display_metrics(extractions, ground_truth, structure=None):

    # Questions
    y_questions = [q['question'] for q in ground_truth['questions_answers']]
    yhat_questions = [q.question_text for q in extractions]
    q_similarity = content_similarity(yhat_questions, y_questions)
    print(f"Question (or pair) count accuracy: {count_accuracy(yhat_questions, y_questions):.2f}")
    print(f"Questions similarity: Average={np.mean(q_similarity):.2f} Min={np.min(q_similarity):.2f} Max={np.max(q_similarity):.2f}")

    # Answers
    if type(extractions[0]) == ExtractedQAPair:
        y_answers = [q['answer'] for q in ground_truth['questions_answers']]
        yhat_answers = [q.answer_text for q in extractions]
        yhat_answers = [text if text != None else "" for text in yhat_answers]
        a_similarity = content_similarity(yhat_answers, y_answers)
        print(f"Answers similarity: Average={np.mean(a_similarity):.2f} Min={np.min(a_similarity):.2f} Max={np.max(a_similarity):.2f}")

    else:
        print("No answers to be evaluated")

    # Section
    if structure != None:
        y_sections = [q['title'] for q in ground_truth['sections']]
        yhat_sections = [q.section_title for q in structure.sections]
        s_similarity = content_similarity(yhat_sections, y_sections)
        print(f"Section count accuracy: {count_accuracy(yhat_sections, y_sections):.2f}")
        print(f"Section similarity: Average={np.mean(s_similarity):.2f} Min={np.min(s_similarity):.2f} Max={np.max(s_similarity):.2f}")
    else:
        print("No document structure to be evaluated")



In [4]:
# Load ground truths of different documents to quantitatively
# estimate the extraction performance

with open("/home/dougsgrott-wsl/projects/local-doc-extractor/data/ground_truth/rfp_email.json", "r") as f:
    questions_ground_truth_esp = json.load(f)

with open("/home/dougsgrott-wsl/projects/local-doc-extractor/data/ground_truth/rfp_market.json", "r") as f:
    questions_ground_truth_ip = json.load(f)


## Document Loading and Parsing
Ultimately, the PDF documents will be loaded using Azure Blob, and the text parsing can be done with either Azure Document Intelligence (DI) or some python open source package.

As an alternative to DI, an open source parser is provided using pymupdf. I suggest that the decision to use it instead of DI should be based on metrics of Q/Q&A extraction on real RFPs.

Pros of Azure Document Intelligence:
- Can be used for multiple file types (pdf, docx, etc)
- If you build the text from the paragraphs directly instead of accessing the content key, you can get the table content in markdown style.
- Every extracted text is accompanied with a bounding box representing in its location on the document.
  - This can be useful to filter out headers and footers, diminishing OCR noise.
- The service tries to extract role of each element (paragraph, header, section, etc).
  - This could be useful to programatically reconstruct the document structure without AI. The bad news is: it is pretty bad, and fixing broken structures programatically can be more complex than parsing it with LLM.

Cons of Azure Document Intelligence:
- It's very, very costly (in my experimentations, it represented 65~75% of the costs)

In [5]:
pdf_filepath_ = "/home/dougsgrott-wsl/projects/local-doc-extractor/data/RFP - Email Service Provider.pdf"
doc = fitz.open(pdf_filepath_)
pdf_bytes = doc.tobytes()

free_parser = PyMuPDFParser()
free_parsed_doc = free_parser.parse(file_content=pdf_bytes)

# Should require Doc Int. endpoints configured
# azure_parser = AzureDocumentParser()
# azure_parsed_doc = azure_parser.parse(file_content=pdf_bytes)


# Parser to be used in the content extraction section
parser = free_parser

# Content Extraction

## Question

In [6]:
N = 1
RFP_TYPE = "ESP" # "ESP" or "PM"

if RFP_TYPE == "ESP":
    # Email Service Provider (with answers)
    url = "/home/dougsgrott-wsl/projects/local-doc-extractor/data/RFP - Email Service Provider.pdf"
    questions_ground_truth = questions_ground_truth_esp
elif RFP_TYPE == "PM":
    # Public Market, template N (with answers)
    url = f"https://stdocpipelinedev24o1.blob.core.windows.net/input-pdfs/Public Market - Q template {N}.pdf"
    questions_ground_truth = questions_ground_truth_ip

doc = parser.parse(file_content=pdf_bytes)


### Simple extractor (baseline)

In [10]:
config = QuestionExtractorConfig(allowed_section_types=[SectionType.QUESTIONNAIRE])
question_extractor = factory.create_question_extractor("simple_llm")
questions_simple = question_extractor.extract(doc.content)


2025-12-29 13:11:02,673 - INFO - Starting simple LLM question extraction...
2025-12-29 13:11:02,675 - INFO - Processing 6 chunks...
2025-12-29 13:11:10,251 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-12-29 13:11:19,017 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-12-29 13:11:27,055 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-12-29 13:11:37,090 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-12-29 13:11:47,037 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-12-29 13:11:50,297 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-12-29 13:11:50,524 - INFO - Extracted 40 questions using simple LLM approach


In [11]:
display_metrics(questions_simple, questions_ground_truth)

Question (or pair) count accuracy: 0.75
Questions similarity: Average=0.59 Min=0.26 Max=1.00
No answers to be evaluated
No document structure to be evaluated


### Extractor with bells and whistles

In [12]:
config = QuestionExtractorConfig(allowed_section_types=[SectionType.QUESTIONNAIRE])
structure_extractor = factory.create_structure_extractor("structure_aware")
question_extractor = factory.create_question_extractor("context_aware", config)
question_extractor.set_structure_extractor(structure_extractor)

questions_aware = question_extractor.extract(doc.content)
structure = question_extractor.structure

2025-12-29 13:11:54,416 - INFO - Detecting document structure...
2025-12-29 13:11:54,419 - INFO - Starting AI-powered structure detection...
2025-12-29 13:11:54,422 - INFO - Created 5 strategic samples
2025-12-29 13:11:58,915 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-12-29 13:11:59,142 - INFO - Format detected: formal_numbered (confidence: 0.95)
2025-12-29 13:13:13,697 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-12-29 13:13:14,012 - INFO - Detected 32 sections
2025-12-29 13:13:14,018 - INFO - Structure detection complete. Sections: 30
2025-12-29 13:13:14,019 - INFO - Extracting with structure awareness (30 sections)
2025-12-29 13:13:39,442 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-12-29 13:14:00,050 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-12-29 13:14:00,062 - INFO - After deduplica

In [13]:
display_metrics(questions_aware, questions_ground_truth, structure)

Question (or pair) count accuracy: 0.94
Questions similarity: Average=0.81 Min=0.21 Max=1.00
No answers to be evaluated
Section count accuracy: -1.00
Section similarity: Average=0.52 Min=0.19 Max=1.00


## Question and Answers

In [6]:
N = 1
RFP_TYPE = "ESP" # "ESP" or "PM"

if RFP_TYPE == "ESP":
    # Email Service Provider (with answers)
    url = "https://stdocpipelinedev24o1.blob.core.windows.net/input-pdfs/RFP - Email Service Provider (with answers) v5.pdf"
    questions_ground_truth = questions_ground_truth_esp
elif RFP_TYPE == "PM":
    # Public Market, template N (with answers)
    url = f"https://stdocpipelinedev24o1.blob.core.windows.net/input-pdfs/Public Market - QA Template {N}.pdf"
    questions_ground_truth = questions_ground_truth_ip

doc = parser.parse(file_content=pdf_bytes)


### Simple extractor (baseline)

In [7]:
config = QAExtractorConfig(allowed_section_types=[SectionType.QUESTIONNAIRE])
question_extractor = factory.create_qa_extractor("simple")
questions_simple = question_extractor.extract(doc.content)


2025-12-29 13:16:04,701 - INFO - Split document into 5 chunks for extraction (chunk_size=8000, overlap=500)


2025-12-29 13:16:25,431 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-12-29 13:16:38,767 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-12-29 13:16:52,318 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-12-29 13:16:59,228 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-12-29 13:17:00,042 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-12-29 13:17:00,252 - INFO - Extracted 38 Q&A pairs (before deduplication)
2025-12-29 13:17:00,264 - INFO - After deduplication: 37 Q&A pairs


In [8]:
display_metrics(questions_simple, questions_ground_truth)


Question (or pair) count accuracy: 0.84
Questions similarity: Average=0.48 Min=0.26 Max=1.00
Answers similarity: Average=0.98 Min=0.25 Max=1.00
No document structure to be evaluated


### Extractor with bells and whistles

In [9]:
config = QuestionExtractorConfig(allowed_section_types=[SectionType.QUESTIONNAIRE])
structure_extractor = factory.create_structure_extractor("structure_aware")
question_extractor = factory.create_question_extractor("context_aware", config)
question_extractor.set_structure_extractor(structure_extractor)

questions_aware = question_extractor.extract(doc.content)
structure = question_extractor.structure

2025-12-29 13:17:04,568 - INFO - Detecting document structure...
2025-12-29 13:17:04,569 - INFO - Starting AI-powered structure detection...
2025-12-29 13:17:04,571 - INFO - Created 5 strategic samples
2025-12-29 13:17:08,874 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-12-29 13:17:09,092 - INFO - Format detected: formal_numbered (confidence: 0.95)
2025-12-29 13:18:23,328 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-12-29 13:18:23,544 - INFO - Detected 32 sections
2025-12-29 13:18:23,547 - INFO - Structure detection complete. Sections: 30
2025-12-29 13:18:23,547 - INFO - Extracting with structure awareness (30 sections)
2025-12-29 13:18:39,924 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-12-29 13:18:56,135 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-12-29 13:18:56,144 - INFO - After deduplica

In [10]:
display_metrics(questions_aware, questions_ground_truth_esp, structure)

Question (or pair) count accuracy: 0.94
Questions similarity: Average=0.81 Min=0.21 Max=1.00
No answers to be evaluated
Section count accuracy: -1.00
Section similarity: Average=0.52 Min=0.19 Max=1.00


# Triggerless End-to-End Pipeline

This combines document loading, text parsing, content extraction and persistence/database.

The code is/should be similar to the one inside the function_app:
 - Here, we can test loading the configuration and executing the code similarly as to how it will run on Azure, including text parsing (using Pymu of DI), Question/Q&A extraction (using OpenAI or Azure OpenAI) and saving to postgres.
 - However, instead of triggering the code when a PDF is uploaded, we simply run it whenever we want, loading the PDF bytes using Azure Blob or Pymupdf.



## Question

In [11]:
import azure.functions as func
import logging
import os
from sqlalchemy import create_engine
from sqlalchemy.orm import Session
import fitz

from config import config
from shared.extractors import ExtractorFactory
from shared.extractors.base.config import QuestionExtractorConfig
from shared.extractors.base.models import ExtractedQAPair
from shared.parsers import create_parser
from shared.models.db import RFP
from shared.models.enums import ProcessingStatus, RFPStatus, SourceType
from shared.models.converters import bulk_extracted_qa_to_answers
from shared.models.db.relationships import bulk_create_answers_with_rfp_update
from shared.extractors.base import QuestionToQAAdapter
from shared.core.processor import DocumentProcessor


# Database Connection Setup
engine = create_engine(config.database_url)

# Extraction Strategy Configuration
EXTRACTION_STRATEGY = os.getenv("EXTRACTION_STRATEGY", "context_aware")
# Options: simple_llm (fast, basic), context_aware (better quality, section-aware)


def load_config() -> QuestionExtractorConfig:
    """Load question extraction configuration from environment variables."""
    return QuestionExtractorConfig(
        chunk_size=int(os.getenv("CHUNK_SIZE", "6000")),
        overlap=int(os.getenv("CHUNK_OVERLAP", "500")),
        confidence_threshold=float(os.getenv("CONFIDENCE_THRESHOLD", "0.6")),
        deduplicate_questions=os.getenv("ENABLE_DEDUP", "true").lower() == "true",
        similarity_threshold=float(os.getenv("SIMILARITY_THRESHOLD", "0.9")),
        # LLM config auto-loaded from base ExtractorConfig
    )


def main(file_bytes):
    """
    1. Parses the PDF document (using Azure DI or pymupdf)
    2. Extracts questions only using LLM (no answer extraction)
    3. Saves questions to the database with answer_text=NULL
    """
    # logging.info(f"[Questions] Processing file: {myblob.name} ({myblob.length} bytes)")

    session = Session(engine)

    try:
        # Create question extractor with custom config
        factory = ExtractorFactory()
        config = load_config()
        question_extractor = factory.create_question_extractor(
            strategy=EXTRACTION_STRATEGY,
            config=config
        )


        logging.info(f"[Questions] Using extraction strategy: {EXTRACTION_STRATEGY}")

        # For context-aware strategy, inject structure extractor
        if EXTRACTION_STRATEGY == "context_aware":
            # Structure extractor needs its own config (not QuestionExtractorConfig)
            structure_extractor = factory.create_structure_extractor(
                strategy="structure_aware"
                # Uses default StructureExtractorConfig with LLM settings from env
            )
            question_extractor.set_structure_extractor(structure_extractor)
            logging.info("[Questions] Structure extractor injected for context-aware extraction")

        # Wrap question extractor in adapter to make it compatible with DocumentProcessor
        # The adapter converts ExtractedQuestion â†’ ExtractedQAPair with answer_text=None
        qa_extractor = QuestionToQAAdapter(question_extractor)

        # Create parser (defaults to env var DOCUMENT_PARSER)
        parser = create_parser()

        # Use DocumentProcessor for all orchestration and database logic
        processor = DocumentProcessor(
            session=session,
            parser=parser,
            extractor=qa_extractor
        )

        # Process document (handles parsing, extraction, and database persistence)
        result = processor.process_document(
            file_content=file_bytes,
            filename="myblob.name",
            client_name="Unknown Client",
            source_type="Blank RFP"
        )

        if result.success:
            logging.info(f"[Questions] Success: {result.message}")
        else:
            logging.warning(f"[Questions] Processing completed with issues: {result.message}")

    except Exception as e:
        session.rollback()
        logging.error(f"[Questions] Error processing {'myblob.name'}: {str(e)}", exc_info=True)
        raise
    finally:
        session.close()


# Loading PDF bytes with Pymu/Fitz
pdf_filepath_ = "/home/dougsgrott-wsl/projects/local-doc-extractor/data/RFP - Email Service Provider.pdf"
doc = fitz.open(pdf_filepath_)
pdf_bytes = doc.tobytes()

main(pdf_bytes)



2025-12-29 13:19:00,067 - INFO - [Questions] Using extraction strategy: context_aware
2025-12-29 13:19:00,072 - INFO - [Questions] Structure extractor injected for context-aware extraction
2025-12-29 13:19:00,073 - INFO - Processing file: myblob.name
2025-12-29 13:19:00,073 - INFO - Step 1: Extracting text from PDF...
2025-12-29 13:19:02,064 - INFO - Extracted 30023 characters of text (format: markdown)
2025-12-29 13:19:02,065 - INFO - Step 2: Identifying Questions and Answers with LLM...
2025-12-29 13:19:02,065 - INFO - Detecting document structure...
2025-12-29 13:19:02,066 - INFO - Starting AI-powered structure detection...
2025-12-29 13:19:02,066 - INFO - Created 5 strategic samples
2025-12-29 13:19:04,415 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-12-29 13:19:04,635 - INFO - Format detected: formal_numbered (confidence: 0.95)
2025-12-29 13:20:13,069 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 2

## Question and Answers

In [12]:

import azure.functions as func
import logging
import os
from sqlalchemy import create_engine
from sqlalchemy.orm import Session
import fitz

from config import config
from shared.core.processor import DocumentProcessor
from shared.extractors import ExtractorFactory
from shared.extractors.base.config import QAExtractorConfig
from shared.parsers import create_parser

app = func.FunctionApp()

# Database Connection Setup
engine = create_engine(config.database_url)

# Extraction Strategy Configuration
EXTRACTION_STRATEGY = os.getenv("EXTRACTION_STRATEGY", "simple")
# Options: simple (fast, basic), context_aware (better quality, structure-aware)


def load_config() -> QAExtractorConfig:
    """Load Q&A extraction configuration from environment variables."""
    return QAExtractorConfig(
        chunk_size=int(os.getenv("CHUNK_SIZE", "8000")),
        overlap=int(os.getenv("CHUNK_OVERLAP", "500")),
        confidence_threshold=float(os.getenv("CONFIDENCE_THRESHOLD", "0.5")),
        deduplicate_pairs=os.getenv("ENABLE_DEDUP", "true").lower() == "true",
        similarity_threshold=float(os.getenv("SIMILARITY_THRESHOLD", "0.9")),
        # LLM config auto-loaded from base ExtractorConfig
    )


def main(file_bytes):
    """
    1. Parses the PDF document (using Azure DI or pymupdf)
    2. Extracts question-answer pairs using LLM
    3. Saves both questions and answers to the database
    """
    logging.info(f"[Q&A] Processing file.")

    session = Session(engine)

    try:
        # Create extractor with custom config
        factory = ExtractorFactory()
        config = load_config()
        extractor = factory.create_qa_extractor(strategy=EXTRACTION_STRATEGY, config=config)

        logging.info(f"[Q&A] Using extraction strategy: {EXTRACTION_STRATEGY}")

        # For context-aware strategy, inject structure extractor
        if EXTRACTION_STRATEGY == "context_aware":
            # Structure extractor needs its own config (not QAExtractorConfig)
            structure_extractor = factory.create_structure_extractor(
                strategy="structure_aware"
                # Uses default StructureExtractorConfig with LLM settings from env
            )
            # Context-aware QA extractor uses structure extractor internally
            extractor.set_structure_extractor(structure_extractor)
            logging.info("[Q&A] Structure extractor injected for context-aware extraction")

        # Create parser (defaults to env var DOCUMENT_PARSER)
        parser = create_parser()

        # Process using core processor
        processor = DocumentProcessor(session, parser=parser, extractor=extractor)
        result = processor.process_document(
            file_content=file_bytes,
            filename="myblob.name",
            client_name="Unknown Client",
            source_type="Q&A Document"
        )

        # Log result
        if result.success:
            logging.info(f"[Q&A] Success: {result.message}")
        else:
            logging.error(f"[Q&A] Failed: {result.message}")
            if result.error:
                raise result.error

    except Exception as e:
        session.rollback()
        logging.error(f"[Q&A] Error processing {'myblob.name'}: {str(e)}")
        raise
    finally:
        session.close()


# Loading PDF bytes with Pymu/Fitz
pdf_filepath_ = "/home/dougsgrott-wsl/projects/local-doc-extractor/data/RFP - Email Service Provider.pdf"
doc = fitz.open(pdf_filepath_)
pdf_bytes = doc.tobytes()

main(pdf_bytes)

2025-12-29 13:20:40,770 - INFO - [Q&A] Processing file.
2025-12-29 13:20:40,777 - INFO - [Q&A] Using extraction strategy: context_aware
2025-12-29 13:20:40,782 - INFO - [Q&A] Structure extractor injected for context-aware extraction
2025-12-29 13:20:40,782 - INFO - Processing file: myblob.name
2025-12-29 13:20:40,783 - INFO - Step 1: Extracting text from PDF...
2025-12-29 13:20:42,519 - INFO - Extracted 30023 characters of text (format: markdown)
2025-12-29 13:20:42,519 - INFO - Step 2: Identifying Questions and Answers with LLM...
2025-12-29 13:20:42,520 - INFO - Starting context-aware Q+A extraction...
2025-12-29 13:20:42,520 - INFO - Detecting document structure for answer extraction...
2025-12-29 13:20:42,521 - INFO - Starting AI-powered structure detection...
2025-12-29 13:20:42,521 - INFO - Created 5 strategic samples
2025-12-29 13:20:46,147 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-12-29 13:20:46,366 - INFO - Format detected: f