# Download Infofactures PDFs

This notebook downloads all infofacture PDFs from the Atlasanté website for each UDI code in the database.
(cf https://carto.atlasante.fr/1/ars_metropole_udi_infofactures.map)

For each UDI code in the `atlasante_udi_2024` table, it:

- Downloads the corresponding PDF from `https://carto.atlasante.fr/IHM/cartes/infofactures/AQUASISED/2024/`
- Saves it to `database/infos_factures/{code_udi}.pdf`
- Skips files that already exist
- Uses random delays (0.5-2s) between requests to avoid overloading the server


In [None]:
import duckdb
import requests
import time
import random
from pathlib import Path
from tqdm import tqdm


# Get all UDI codes.
# The database is opened read-only because this notebook only reads from it.
db_path = "../../database/data.duckdb"
query = "SELECT distinct code_udi FROM atlasante_udi_2024"
conn = duckdb.connect(db_path, read_only=True)
try:
    result = conn.execute(query).fetchall()
finally:
    # Always release the connection, even if the query fails (e.g. the table
    # is missing); otherwise the handle stays open for the kernel's lifetime.
    conn.close()

# Each row is a 1-tuple holding the code_udi value.
udi_codes = [row[0] for row in result]

print(f"Found {len(udi_codes)} UDI codes")

# Create output directory if it doesn't exist
output_dir = Path("../../database/infos_factures")
output_dir.mkdir(parents=True, exist_ok=True)

# URL template for the per-UDI PDF; {code_udi} is filled in by the download loop.
base_url = "https://carto.atlasante.fr/IHM/cartes/infofactures/AQUASISED/2024/INFOFACTURE-{code_udi}-2024.pdf"

# Running totals reported once the download loop finishes.
success_count = 0
error_count = 0


# Download one PDF per UDI code. Files already on disk are counted as
# successes and skipped, so the cell can be re-run to resume an interrupted job.
for code_udi in (pbar := tqdm(udi_codes, desc="Downloading PDFs")):
    # Update progress bar to show current UDI code
    pbar.set_postfix({"current": code_udi})

    output_path = output_dir / f"{code_udi}.pdf"

    # Skip if file already exists
    if output_path.exists():
        success_count += 1
        continue

    # Build URL
    url = base_url.format(code_udi=code_udi)

    try:
        # Download PDF
        response = requests.get(url, timeout=30)
        response.raise_for_status()

        # Write to a temporary sibling and rename it into place, so an
        # interrupted write never leaves a truncated "{code_udi}.pdf" that the
        # exists() check above would treat as already downloaded.
        tmp_path = output_path.with_name(output_path.name + ".part")
        tmp_path.write_bytes(response.content)
        tmp_path.replace(output_path)

        success_count += 1

    except requests.exceptions.RequestException as e:
        # NOTE: a requests.Response is falsy for any 4xx/5xx status, so the
        # response must be compared against None explicitly; a plain
        # truthiness test silences every HTTP error.
        status_code = e.response.status_code if e.response is not None else None
        # 404 is not reported (counted only); everything else, including
        # timeouts and connection errors that carry no response, is printed.
        if status_code != 404:
            print(f"Error downloading {code_udi}: {e}")
        error_count += 1
    except Exception as e:
        print(f"Unexpected error for {code_udi}: {e}")
        error_count += 1

    # Add random delay between 0.5 and 2 seconds to avoid overloading the server
    time.sleep(random.uniform(0.5, 2.0))

print("\nDownload complete!")
print(f"Success: {success_count}/{len(udi_codes)}")
print(f"Errors: {error_count}/{len(udi_codes)}")