In [4]:
import json
import pandas as pd
import re

# Input and output locations. Both file names are derived from `region`, so
# switching region only requires changing this one value.
region = 'nijmegen'
json_file = f'zorginstellingen-{region}.json'
updated_json_file = f'zorginstellingen-{region}-updated.json'

# Load the organisations for the region. A missing file or a file that does
# not contain valid JSON (including an empty file) is reported on stdout and
# the run stops with exit status 1.
try:
    with open(json_file, 'r', encoding='utf-8') as f:
        json_data = json.load(f)
except FileNotFoundError:
    print(f"File {json_file} not found. Please check the file path.")
    # Raise SystemExit directly instead of calling exit(): exit() is a helper
    # installed by the `site` module for interactive sessions and is not
    # guaranteed to exist (or to stop execution) in every environment.
    raise SystemExit(1) from None
except json.JSONDecodeError:
    print(f"File {json_file} is empty or not a valid JSON file.")
    raise SystemExit(1) from None

# Load the public Kwaliteitsbeeld dataset (XLSX) with pandas and flatten it
# into one dict per spreadsheet row.
excel_data = pd.read_excel('openbaar-databestand-kwaliteitsbeeld-generiek-kompas-verslagjaar-2024.xlsx')
csv_data = excel_data.to_dict('records')

# Indicator question whose answer is the organisation's Kwaliteitsbeeld URL.
KWALITEITSBEELD_URL_INDICATOR = (
    "Wat is de URL van de publieke website waar het Kwaliteitsbeeld "
    "van uw zorgorganisatie te vinden is?"
)

# Map of organisation name -> cleaned-up URL. When an organisation occurs on
# several matching rows, the last row wins.
org_to_url_map = {}

for row in csv_data:
    if row.get('IndicatorNaam') != KWALITEITSBEELD_URL_INDICATOR:
        continue

    raw_url = row.get('IndicatorWaarde')
    org_name = row.get('OrganisatieNaam')

    # Empty spreadsheet cells are read by pandas as float NaN. Skip rows whose
    # URL or organisation name is not a real, non-blank string: a NaN or blank
    # key in the map cannot be matched meaningfully against organisation names.
    if not isinstance(raw_url, str) or not isinstance(org_name, str):
        continue
    if not org_name.strip():
        continue

    url = raw_url.strip()

    # Clean up URLs that are not properly formatted
    if url.startswith('chrome-extension://'):
        # The real address is embedded after the browser-extension prefix;
        # if none can be found the value is kept unchanged.
        embedded = re.search(r'https?://\S+', url)
        if embedded:
            url = embedded.group(0)
    elif url.startswith('www'):
        # Scheme-less "www..." values get an explicit https:// prefix
        url = 'https://' + url

    # Store the organisation name and URL (whitespace-only values are dropped)
    if url:
        org_to_url_map[org_name] = url

# Create a lookup function
def find_best_match(json_org_name, csv_org_names):
    """Return the entry of ``csv_org_names`` that best matches ``json_org_name``.

    Names are compared case-insensitively, ignoring commas and differences
    in whitespace. The passes below are tried in order, strictest first, and
    within a pass the first candidate in iteration order wins:

    1. The full cleaned names are equal.
    2. The part of ``json_org_name`` before its first comma equals the
       cleaned candidate (handles "OrgName, location X").
    3. One name contains the other, tried first with the part before the
       comma and then with the full name.

    Args:
        json_org_name: Organisation name to look up. Anything that is not a
            non-blank string yields ``None``.
        csv_org_names: Iterable of candidate names. Candidates that are not
            non-blank strings are ignored.

    Returns:
        The matching candidate exactly as it appears in ``csv_org_names``,
        or ``None`` when nothing matches.
    """

    def clean_name(name):
        # Non-string values (e.g. None or NaN from empty cells) clean to ''.
        if not isinstance(name, str):
            return ''
        # Lower-case, drop commas and collapse every run of whitespace
        # (including non-breaking spaces) into a single space.
        return ' '.join(name.lower().replace(',', '').split())

    clean_json_name = clean_name(json_org_name)
    if not clean_json_name:
        # An empty name would be a substring of every candidate.
        return None

    # Clean each candidate once; this also allows one-shot iterables.
    candidates = []
    for csv_name in csv_org_names:
        clean_csv_name = clean_name(csv_name)
        if clean_csv_name:
            candidates.append((csv_name, clean_csv_name))

    # Pass 1: exact match on the full cleaned name
    for csv_name, clean_csv_name in candidates:
        if clean_csv_name == clean_json_name:
            return csv_name

    # Pass 2: exact match on the main organisation name. The split must be
    # done on the raw name, because cleaning removes the commas.
    main_org_name = clean_name(json_org_name.split(',')[0])
    if main_org_name and main_org_name != clean_json_name:
        for csv_name, clean_csv_name in candidates:
            if clean_csv_name == main_org_name:
                return csv_name

    # Pass 3: substring match in either direction, main name first and the
    # full name as a last resort.
    for needle in (main_org_name, clean_json_name):
        if not needle:
            continue
        for csv_name, clean_csv_name in candidates:
            if clean_csv_name in needle or needle in clean_csv_name:
                return csv_name

    # No match found
    return None

# Candidate organisation names from the dataset, in the map's insertion order
csv_org_names = list(org_to_url_map)

# Copy every organisation from the JSON input and attach its URL; entries
# without a match keep an empty 'KwaliteitsbeeldURL'.
updated_json_data = []
for item in json_data:
    # Work on a shallow copy so the loaded input is left untouched
    new_item = item.copy()
    new_item['KwaliteitsbeeldURL'] = ""

    # Try to find a match
    matched_org_name = find_best_match(item['Naam'], csv_org_names)
    if matched_org_name and matched_org_name in org_to_url_map:
        new_item['KwaliteitsbeeldURL'] = org_to_url_map[matched_org_name]

    updated_json_data.append(new_item)

# Count how many URLs we were able to add
total_orgs = len(updated_json_data)
urls_added = sum(1 for item in updated_json_data if item['KwaliteitsbeeldURL'])
# Guard against an empty input list, which would otherwise divide by zero
match_rate = (urls_added / total_orgs) * 100 if total_orgs else 0.0

print("Summary of URL Matching:")
print(f"- Total organizations: {total_orgs}")
print(f"- Organizations with URLs: {urls_added}")
print(f"- Match rate: {match_rate:.1f}%")

# Save the updated JSON
with open(updated_json_file, 'w', encoding='utf-8') as f:
    json.dump(updated_json_data, f, indent=2)

print("Updated JSON created successfully")

Summary of URL Matching:
- Total organizations: 166
- Organizations with URLs: 91
- Match rate: 54.8%
Updated JSON created successfully
