In [None]:
!pip install -r requirements.txt

In [12]:
import os
from dotenv import load_dotenv
# Load environment variables (python-dotenv) before reading them below
load_dotenv()

# Azure AI Search settings; each value is None when its variable is unset
AZURE_SEARCH_SERVICE = os.environ.get("AZURE_SEARCH_SERVICE")
AZURE_SEARCH_KEY = os.environ.get("AZURE_SEARCH_KEY")
AZURE_SEARCH_INDEX = os.environ.get("AZURE_SEARCH_INDEX")

# Azure OpenAI settings; each value is None when its variable is unset
AZURE_OPENAI_API_KEY = os.environ.get("AZURE_OPENAI_API_KEY")
AZURE_OPENAI_ENDPOINT = os.environ.get("AZURE_OPENAI_ENDPOINT")
AZURE_OPENAI_DEPLOYMENT = os.environ.get("AZURE_OPENAI_DEPLOYMENT")

In [None]:
# Executive Orders Processing with XML Parser Fix
# This script processes executive orders JSON data and adds the XML content to each record

import json
import requests
import os
import pandas as pd
from tqdm import tqdm
import time
from azure.search.documents.indexes.models import DocumentIntelligenceLayoutSkill
# Paths to the input and output JSON files. Both are relative, so they are
# resolved against the kernel's current working directory.
INPUT_FILE = "../../data/executive-orders/executive-orders.json"
OUTPUT_FILE = "../../data/executive-orders/executive-orders-processed.json"

# Create the output directory if it doesn't exist; exist_ok=True makes this
# statement safe to re-run.
os.makedirs(os.path.dirname(OUTPUT_FILE), exist_ok=True)


# Load the executive orders JSON data
def load_executive_orders():
    """Read the executive orders JSON file into a DataFrame.

    Reads ``INPUT_FILE``, takes the list stored under its top-level
    ``"results"`` key (an empty list when the key is absent) and builds one
    DataFrame row per entry.

    Returns:
        pandas.DataFrame: One row per executive order record.
    """
    # JSON is UTF-8 by specification; do not rely on the platform's default
    # text encoding, which is not UTF-8 everywhere.
    with open(INPUT_FILE, "r", encoding="utf-8") as file:
        data = json.load(file)

    # Extract results from the JSON structure
    orders = data.get("results", [])
    print(f"Loaded {len(orders)} executive orders")

    # A DataFrame makes the per-record processing that follows easier
    return pd.DataFrame(orders)


# Function to fetch XML content from URL
def fetch_xml_content(url, timeout=30):
    """Download the raw XML text found at ``url``.

    Args:
        url: Address of the XML document to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the notebook forever.

    Returns:
        The response body as text when the server answers with HTTP 200;
        ``None`` on any other status code or when the request raises (a
        message is printed in both failure cases).
    """
    try:
        response = requests.get(url, timeout=timeout)
        if response.status_code != 200:
            print(
                f"Failed to fetch XML content from {url}, status code: {response.status_code}"
            )
            return None

        # Return the raw XML as-is; no XML parser library is required.
        return response.text
    except Exception as e:
        # Deliberately broad: fetching is best-effort, and one bad URL must
        # not abort processing of the remaining records.
        print(f"Error fetching XML content from {url}: {str(e)}")
        return None


# Process the executive orders data
def process_executive_orders(df, delay=0.5):
    """Attach the full XML text of each executive order to the DataFrame.

    Args:
        df: DataFrame of executive order records. The URL is read from the
            ``full_text_xml_url`` column and ``title`` is used in messages.
        delay: Seconds to pause after each HTTP request so the server is not
            overwhelmed.

    Returns:
        A new DataFrame without the ``json_url`` / ``body_html_url`` columns
        and with an added ``content`` column holding the fetched XML text
        (``None`` where nothing could be fetched).
    """
    print(f"Processing {len(df)} executive orders...")

    # drop() returns a new frame, so the caller's DataFrame is not modified.
    df = df.drop(columns=["json_url", "body_html_url"], errors="ignore")

    # Create a new column for the XML content
    df["content"] = None

    for idx, row in tqdm(df.iterrows(), total=len(df)):
        xml_url = row.get("full_text_xml_url")

        # pandas fills absent values with NaN, which is a truthy float, so
        # test for a non-empty string instead of relying on truthiness.
        if not (isinstance(xml_url, str) and xml_url):
            print(f"No XML URL for order: {row.get('title')}")
            continue

        xml_content = fetch_xml_content(xml_url)
        if xml_content:
            df.at[idx, "content"] = xml_content
        else:
            print(f"Could not fetch content for order: {row.get('title')}")

        # Throttle only after a request was actually made.
        time.sleep(delay)

    return df


# Save the processed data
def save_processed_data(df):
    """Write the processed records to ``OUTPUT_FILE`` as a JSON array.

    Args:
        df: DataFrame of processed executive orders; each row becomes one
            JSON object.
    """
    # NaN is the only float that compares unequal to itself. json.dump would
    # emit it as a bare NaN token, which strict JSON parsers reject, so it is
    # written as null instead.
    processed_data = [
        {
            key: None if isinstance(value, float) and value != value else value
            for key, value in record.items()
        }
        for record in df.to_dict("records")
    ]

    # Explicit UTF-8 so the output does not depend on the platform default.
    with open(OUTPUT_FILE, "w", encoding="utf-8") as file:
        json.dump(processed_data, file, indent=2)
    print(f"Processed data saved to {OUTPUT_FILE}")

# Step 1: Load the executive orders data
print("Loading executive orders data...")
df = load_executive_orders()

# Inspect the first few rows. Only a cell's final expression is rendered
# automatically, so display() is called explicitly here.
display(df.head())

# Step 2: Process the executive orders to fetch XML content
processed_df = process_executive_orders(df)

# Show a sample of the processed data (same reason for display() as above)
display(processed_df.head())

# Step 3: Save the processed data to file
save_processed_data(processed_df)
print(f"Processing complete. {len(processed_df)} orders processed.")

Loading executive orders data...
Loaded 20 executive orders
Processing 20 executive orders...


100%|██████████| 20/20 [00:16<00:00,  1.19it/s]

Processed data saved to ../../data/executive-orders/executive-orders-processed.json
Processing complete. 20 orders processed.





In [27]:
from azure.search.documents import SearchClient
from azure.core.credentials import AzureKeyCredential

# os.environ[...] raises a KeyError naming the missing variable, instead of
# silently handing None to the SDK when the environment is not configured.
# NOTE(review): the configuration cell reads AZURE_SEARCH_SERVICE,
# AZURE_SEARCH_KEY and AZURE_SEARCH_INDEX, none of which are used here;
# confirm which variable names (and which index) are intended.
search_client = SearchClient(
    endpoint=os.environ["AZURE_SEARCH_SERVICE_ENDPOINT"],
    index_name="acc-guidelines-index",
    credential=AzureKeyCredential(os.environ["AZURE_SEARCH_ADMIN_KEY"]),
)

def fetch_all_documents(search_client, max_docs=None):
    """
    Fetch documents from the source index, one page at a time.

    Parameters:
    - search_client: The Azure Search client. Only its ``search(...)`` method
      is used; the returned object must be iterable and expose ``get_count()``.
    - max_docs: Maximum number of documents to retrieve (None for all
      documents). 0 retrieves nothing; negative values are rejected.

    Returns:
    - A list with the retrieved documents, in the order the service returned
      them.

    Raises:
    - ValueError: if max_docs is negative.

    Notes:
    - Paging uses ``skip`` with no ``order_by``, so results may shift between
      pages if the index changes while this runs.
    - The Azure AI Search REST API documents an upper bound on ``skip``
      (100,000); larger indexes need a different paging strategy.
    """
    batch_size = 1000  # Maximum allowed per request

    if max_docs is not None:
        if max_docs < 0:
            raise ValueError("max_docs must be None or a non-negative integer")
        if max_docs == 0:
            # An explicit limit of zero means "nothing", not "no limit".
            print("Total documents retrieved: 0")
            return []

    def _search(top, skip, include_total_count):
        # Single place that defines the query, so every page is requested
        # with identical search parameters.
        return search_client.search(
            search_text="*",
            select="*",
            top=top,
            skip=skip,
            include_total_count=include_total_count,
        )

    # The first query also asks the service for the total document count.
    first_top = batch_size if max_docs is None else min(batch_size, max_docs)
    results = _search(first_top, 0, True)

    total_count = results.get_count()
    print(f"Total documents in index: {total_count}")

    # If max_docs is specified, never aim for more than that.
    target_count = total_count if max_docs is None else min(total_count, max_docs)
    print(f"Will retrieve {target_count} documents")

    all_documents = []
    while True:
        batch_documents = list(results)

        # An empty page means the index has no more documents to offer.
        if not batch_documents:
            break

        all_documents.extend(batch_documents)
        print(f"Retrieved {len(all_documents)} of {target_count} documents")

        remaining = target_count - len(all_documents)
        if remaining <= 0:
            break

        # Skip exactly what has been received so far, so a short page cannot
        # cause documents to be jumped over.
        results = _search(min(batch_size, remaining), len(all_documents), False)

    print(f"Total documents retrieved: {len(all_documents)}")
    return all_documents

# Example usage:
# documents = fetch_all_documents(search_client)  # Retrieve all documents
documents = fetch_all_documents(search_client, 2500)  # Retrieve only 2500 documents

# Preview a few results rather than echoing every retrieved document into
# the cell output.
documents[:3]

Total documents in index: 4154
Will retrieve 2500 documents
Retrieved 1000 of 2500 documents
Retrieved 2000 of 2500 documents
Retrieved 2500 of 2500 documents
Total documents retrieved: 2500


[{'chunk': '# 2017 VA/SCD Guideline\n\n# OCTOBER 2, 2018: e91–220\n\n# TABLE 1\n\n# Applying Class of Recommendation and Level of Evidence to Clinical Strategies, Interventions, Treatments, or Diagnostic Testing in Patient Care* (Updated August 2015)\n\n|CLASS (STRENGTH) OF RECOMMENDATION|LEVEL (QUALITY) OF EVIDENCE|\n|---|---|\n|Suggested phrases for writing recommendations:|High-quality evidence from more than RCT|\n|Is recommended|Meta-analyses of high-quality RCTs|\n|Is indicated / useful/ effective/ beneficial|One or more RCTs corroborated by high-quality registry studies|\n|Should be performed / administered/ other|Moderate-quality evidence from 2 or more RCTs|\n|Comparative Effectiveness Phrases: Treatment/ strategy is recommended/ indicated in preference to treatment B|Meta-analyses of moderate-quality RCTs|\n|Treatment A should be chosen over treatment B|Moderate-quality evidence from 2 or more well-designed, well-executed nonrandomized studies, observational studies, or regis