In [2]:
from pydantic import BaseModel
from openai import AzureOpenAI
import os
import json
import pandas as pd
from azure.storage.blob import BlobServiceClient
from dotenv import load_dotenv
import re
from azure.cosmos import CosmosClient, exceptions, PartitionKey
from dotenv import load_dotenv
import os

In [3]:
# Load environment variables from the .env file so that the os.getenv()
# lookups in this notebook (storage connection string, Azure OpenAI
# endpoint/key) can find their values.
load_dotenv()

def read_json_files_from_blob(folder_path, container_name="data"):
    """Download and parse the first ``.json`` blob found under a name prefix.

    Despite the plural name, the function stops at the first blob whose name
    ends in ``.json`` and returns its parsed content; later matches are not
    read. Existing callers rely on getting a single parsed document back, so
    that contract is kept.

    Args:
        folder_path: Blob-name prefix passed to ``list_blobs`` as
            ``name_starts_with``.
        container_name: Name of the blob container to search. Defaults to
            ``"data"``, the container this function originally hard-coded.

    Returns:
        The parsed JSON content of the first matching blob, or ``None`` when
        no blob under the prefix has a name ending in ``.json``.

    Raises:
        ValueError: If the ``STORAGE_CONNECTION_STRING`` environment variable
            is not set.
    """
    connection_string = os.getenv('STORAGE_CONNECTION_STRING')
    if connection_string is None:
        raise ValueError("The connection string environment variable is not set.")

    blob_service_client = BlobServiceClient.from_connection_string(connection_string)
    container_client = blob_service_client.get_container_client(container_name)

    for blob in container_client.list_blobs(name_starts_with=folder_path):
        if not blob.name.endswith('.json'):
            continue
        blob_data = container_client.get_blob_client(blob.name).download_blob().readall()
        # Return on the first hit: callers expect one parsed document.
        return json.loads(blob_data)

    # Nothing matched; make the "not found" result explicit rather than
    # falling off the end of the function.
    return None

In [4]:
# Fetch the house-loan document: the parsed JSON of the first .json blob under
# the "houseloans" prefix (None if no such blob is found).
houseloan = read_json_files_from_blob("houseloans")

In [9]:
import re

def clean_json_data(json_data):
    """Flatten an extracted-document JSON to plain text and pull out contact fields.

    Text fragments are taken first from ``json_data["paragraphs"][*]["text"]``
    and then from ``json_data["pages"][*]["lines"][*]["text"]``. Each fragment
    is stripped and the fragments are joined with single spaces. Missing or
    ``null`` containers and ``null`` text values are treated as empty.

    Args:
        json_data: Mapping with optional ``"paragraphs"`` and ``"pages"`` keys.

    Returns:
        A 4-tuple ``(plain_text_content, customerService, email, address)``:

        * ``plain_text_content`` - the joined text.
        * ``customerService`` - stripped text between ``Customer Service`` and
          the next ``Email``, or ``None``.
        * ``email`` - the e-mail address following ``Email:``, or ``None``.
        * ``address`` - the text following ``Address:`` up to the next
          ``Label:`` token or the end of the text, or ``None``.
    """
    fragments = []

    # `or []` / `or ""` guard against keys that are present but null.
    for paragraph in json_data.get("paragraphs") or []:
        fragments.append((paragraph.get("text") or "").strip())

    for page in json_data.get("pages") or []:
        for line in page.get("lines") or []:
            fragments.append((line.get("text") or "").strip())

    plain_text_content = " ".join(fragments)

    # Extract 1: everything between "Customer Service" and the next "Email".
    customerServiceMatch = re.search(r"Customer\s+Service(.*?)Email", plain_text_content)
    customerService = customerServiceMatch.group(1).strip() if customerServiceMatch else None

    # Extract 2: an actual e-mail address. The previous pattern captured only
    # digits (\d+), which can never match an address such as name@host.tld.
    emailMatch = re.search(
        r"Email:\s*([A-Za-z0-9._%+-]+@[A-Za-z0-9-]+(?:\.[A-Za-z0-9-]+)+)",
        plain_text_content,
    )
    email = emailMatch.group(1) if emailMatch else None

    # Extract 3: the full address text, not just its leading digits. The value
    # is assumed to run until the next single-word "Label:" or the end of the
    # text. This is a heuristic: a multi-word label following the address
    # would leave its leading words inside the captured value.
    addressMatch = re.search(
        r"Address:\s*(.+?)(?=\s+[A-Z][A-Za-z]*:|\s*$)",
        plain_text_content,
        re.DOTALL,
    )
    address = addressMatch.group(1) if addressMatch else None

    return plain_text_content, customerService, email, address

# Flatten the loan document to plain text and pull out the customer-service,
# email and address fields.
loanagreement_structured, customerService, email, address = clean_json_data(houseloan)

KeyboardInterrupt: 

In [None]:
from pydantic import BaseModel
from openai import AzureOpenAI

# Azure OpenAI client. The endpoint and key are read from the environment
# (AZURE_OPENAI_ENDPOINT / AZURE_OPENAI_KEY) rather than hard-coded; the API
# version is pinned to "2024-08-01-preview".
client = AzureOpenAI(
  azure_endpoint = os.getenv("AZURE_OPENAI_ENDPOINT"),
  api_key=os.getenv("AZURE_OPENAI_KEY"),
  api_version= "2024-08-01-preview"
)

# Pydantic model listing the loan-agreement details to extract; every field is
# typed as a plain string.
# NOTE(review): the class name looks like a typo for "HouseLoanModel" — left
# unchanged because other code may reference it.
# NOTE(review): some fields look like near-duplicates (monthly_repayments /
# monthly_payment, late_payments / late_payment_fee, loan_security /
# collateral) — confirm that both of each pair are intended.
class HouseLoadModel(BaseModel):
    loan_amount: str
    interest_rates: str
    loan_tenure: str
    monthly_repayments: str
    late_payments: str
    loan_security: str
    loan_processing_fees: str
    default_and_foreclosure: str
    early_repayment_and_prepayment_penalties: str
    monthly_payment: str
    late_payment_fee: str
    collateral: str

# Ask the model for a structured extraction of the flattened agreement text.
# response_format must be the pydantic model defined in this cell; the name
# used before (CalendarEvent) is not defined in the visible cells and raised
# NameError.
completion = client.beta.chat.completions.parse(
    model="gpt-4o", # replace with the model deployment name of your gpt-4o 2024-08-06 deployment
    messages=[
        {"role": "system", "content": "Extract the information about this loan agreement contract."},
        {"role": "user", "content": loanagreement_structured},
    ],
    response_format=HouseLoadModel,
)

# Serialise the whole completion object as pretty-printed JSON.
# NOTE(review): if only the extracted fields are wanted, they are presumably
# available on completion.choices[0].message.parsed — confirm before changing.
finaljsonstr = completion.model_dump_json(indent=2)