In [8]:
import os
import re
import json
from azure.storage.blob import BlobServiceClient
from azure.cosmos import CosmosClient, PartitionKey, exceptions
from azure.ai.formrecognizer import DocumentAnalysisClient
from azure.core.credentials import AzureKeyCredential
from dotenv import load_dotenv
# Load environment variables
load_dotenv()

True

In [9]:
# Service configuration read from environment variables (load_dotenv() is
# called in the imports cell). os.getenv returns None for any unset variable.
# Azure Blob Storage connection string
STORAGE_CONNECTION_STRING = os.getenv("STORAGE_CONNECTION_STRING")
# Azure Cosmos DB endpoint and key
COSMOS_ENDPOINT = os.getenv("COSMOS_ENDPOINT")
COSMOS_KEY = os.getenv("COSMOS_KEY")
 
# Azure Document Intelligence endpoint and key
DOC_AI_ENDPOINT = os.getenv("DOC_AI_ENDPOINT")
DOC_AI_KEY = os.getenv("DOC_AI_KEY")

In [10]:
 
# Retrieve a PDF (e.g. houseloan.pdf) from Blob Storage
def get_pdf_from_blob(blob_name, container_name="data"):
    """Download a blob and return its complete content.

    Args:
        blob_name (str): Name (path) of the blob inside the container.
        container_name (str): Name of the blob container. Defaults to "data".

    Returns:
        The content returned by ``download_blob().readall()``.

    Raises:
        RuntimeError: If the client cannot be created or the download fails.
            The underlying exception is attached as ``__cause__``.
    """
    try:
        # Use the service client as a context manager so its underlying
        # transport is closed once the download has completed.
        with BlobServiceClient.from_connection_string(STORAGE_CONNECTION_STRING) as blob_service_client:
            container_client = blob_service_client.get_container_client(container_name)
            blob_client = container_client.get_blob_client(blob_name)
            return blob_client.download_blob().readall()
    except Exception as e:
        # Chain explicitly so the original failure stays visible in tracebacks.
        raise RuntimeError(f"Failed to retrieve blob '{blob_name}' from container '{container_name}': {e}") from e
 


In [11]:
# Analyze the PDF with Azure Document Intelligence
def analyze_pdf(pdf_content):
    """Run the "prebuilt-document" model on a PDF and return the analysis result.

    Args:
        pdf_content: The document to analyze, passed unchanged to
            ``begin_analyze_document`` (main() supplies the downloaded blob content).

    Returns:
        The object returned by the analysis poller's ``result()``.

    Raises:
        RuntimeError: If the client cannot be created or the analysis fails.
            The underlying exception is attached as ``__cause__``.
    """
    try:
        # Use the client as a context manager so its underlying transport is
        # closed once the long-running operation has completed.
        with DocumentAnalysisClient(
            endpoint=DOC_AI_ENDPOINT,
            credential=AzureKeyCredential(DOC_AI_KEY)
        ) as document_analysis_client:
            poller = document_analysis_client.begin_analyze_document("prebuilt-document", pdf_content)
            # result() blocks until the service has finished the analysis.
            return poller.result()
    except Exception as e:
        # Chain explicitly so the original failure stays visible in tracebacks.
        raise RuntimeError(f"Failed to analyze PDF: {e}") from e


In [12]:

# Process extracted content
# A line that consists solely of "<number>. <words>" (letters and whitespace
# only) is treated as a section heading, e.g. "2. Loan Amount and Purpose".
_SECTION_HEADING_RE = re.compile(r"\d+\.\s+[A-Za-z][A-Za-z\s]*")


def process_analyzed_content(analyzed_result):
    """Split the extracted text into numbered sections.

    Headings are detected per extracted line rather than in the flattened
    text: once all lines are joined with spaces there is no way to tell where
    a heading ends and its body begins, so a pattern over the joined text
    swallows body words into the title.

    Args:
        analyzed_result: Object exposing ``pages``; each page exposes ``lines``
            and each line exposes a string ``content``.

    Returns:
        dict: Maps each section heading (e.g. "1. Introduction") to the text of
        the lines that follow it, joined with single spaces. Text before the
        first heading is ignored. If a heading occurs more than once, its
        bodies are concatenated. Empty when no heading is found.

    Raises:
        RuntimeError: If no text at all was extracted.
    """
    # Tolerate a missing page list or a page without lines.
    lines = [
        line.content.strip()
        for page in (analyzed_result.pages or [])
        for line in (page.lines or [])
    ]
    full_text = " ".join(lines)

    # Debug: print the raw extracted text
    print("Extracted Full Text:", full_text)

    # Nothing was extracted at all
    if not full_text.strip():
        raise RuntimeError("No text content extracted from the PDF.")

    # Collect body lines under the most recently seen heading
    sections = {}
    current_title = None
    for text in lines:
        if not text:
            continue
        if _SECTION_HEADING_RE.fullmatch(text):
            current_title = text
            sections.setdefault(current_title, [])
        elif current_title is not None:
            sections[current_title].append(text)
        # Lines before the first heading (e.g. the document title) are skipped.

    structured_data = {title: " ".join(body) for title, body in sections.items()}

    # Debug: show the structured data
    print("Structured Data:", json.dumps(structured_data, indent=4))

    return structured_data
 
 
 

In [14]:

# Upload to Cosmos DB
def upload_to_cosmos_db(data, container_name, document_id):
    """
    Uploads data to the specified container in ContosoDB.

    The database "ContosoDB" and the container are created if they do not
    exist yet. The caller's ``data`` dict is left untouched: the uploaded item
    is a shallow copy with its "id" set to ``document_id`` (replacing any "id"
    key already present in ``data``).

    Args:
        data (dict): The structured data to upload.
        container_name (str): The name of the Cosmos DB container (e.g., "HouseLoanTerms").
        document_id (str): The unique identifier for the document.

    Returns:
        None. A message is printed on success and when an item with the same
        ID already exists; other Cosmos errors propagate to the caller.
    """
    client = CosmosClient(COSMOS_ENDPOINT, COSMOS_KEY)
    database = client.create_database_if_not_exists(id="ContosoDB")
    container = database.create_container_if_not_exists(
        id=container_name,
        # The item ID doubles as the partition key.
        partition_key=PartitionKey(path="/id"),
        offer_throughput=400
    )
    # Build the item from a copy so the caller's dict is not mutated
    item = {**data, "id": document_id}
    try:
        container.create_item(body=item)
        print(f"Data uploaded successfully to ContosoDB, container '{container_name}': {item['id']}")
    except exceptions.CosmosResourceExistsError:
        print(f"Item with ID {item['id']} already exists in ContosoDB, container '{container_name}'.")
 


In [15]:
 
# Main Function
def main(blob_name="house/houseloan.pdf", cosmos_container="HouseLoanTerms", document_id="houseloan_terms_001"):
    """Run the pipeline: download the PDF, analyze it, and upload the sections.

    Args:
        blob_name (str): Blob to download. Defaults to "house/houseloan.pdf".
        cosmos_container (str): Cosmos DB container that receives the item
            (the container for loan terms by default).
        document_id (str): ID of the uploaded item. Defaults to
            "houseloan_terms_001".

    Returns:
        None. Any exception raised by a step is caught and reported with a
        printed message instead of propagating.
    """
    try:
        # Step 1: Retrieve and analyze PDF
        pdf_content = get_pdf_from_blob(blob_name)
        analyzed_result = analyze_pdf(pdf_content)

        # Step 2: Process and clean extracted data
        structured_data = process_analyzed_content(analyzed_result)

        # Step 3: Upload structured data to Cosmos DB
        if structured_data:  # Only upload when something was extracted
            upload_to_cosmos_db(
                data=structured_data,
                container_name=cosmos_container,
                document_id=document_id
            )
        else:
            print("No structured data extracted. Skipping upload.")
    except Exception as e:
        print(f"An error occurred: {e}")

if __name__ == "__main__":
    main()

Extracted Full Text: Contoso Bank - House Loan Terms and Conditions 1. Introduction These terms and conditions govern the house loans provided by Contoso Bank (referred to as "the Bank") to customers (referred to as "Borrower"). By applying for and accepting a house loan, the Borrower agrees to the terms and conditions outlined herein. 2. Loan Amount and Purpose · The loan is granted exclusively for the purpose of purchasing a residential property, refinancing an existing mortgage, or for approved home improvement projects. . The maximum loan amount will be determined by the Bank based on the Borrower's financial profile, creditworthiness, and property value. 3. Interest Rates . Fixed Rate: The interest rate remains constant throughout the loan term. . Variable Rate: The interest rate may fluctuate based on market conditions and will be tied to a publicly available index. Changes in the interest rate will affect the Borrower's monthly payments. . Interest rates are disclosed at the tim