This script generates a comprehensive **issue summary CSV** for datasets published under the **Open Digital Planning (ODP)** project. It works by querying two key sources from the [Datasette Planning instance](https://datasette.planning.data.gov.uk):

1. **Expected dataset provisions** (what each organisation should supply).
2. **Provision summary with issue counts** (including errors, warnings, and notices).

---

### Key Functionality:

- **Dataset Types**:
  - `SPATIAL_DATASETS` (e.g., `tree`, `conservation-area`)
  - `DOCUMENT_DATASETS` (e.g., `tree-preservation-order`)
  - These can be queried individually or combined (`ALL_DATASETS`).

- **Reliable Datasette Queries**:
  - A `requests` session with retry logic (up to 3 retries with a short backoff) makes HTTP queries against the Datasette API more resilient.

- **Provision Metadata**:
  - `get_provisions()` collects the organisations, together with their cohorts, that have an `expected` provision under the `open-digital-planning` project.
  
- **Issue Summaries**:
  - The script paginates through the `endpoint_dataset_issue_type_summary` table (1,000 rows per request), joined to `endpoint_dataset_summary`, fetching issue-level fields such as:
    - `issue_type`, `severity`, and `responsibility`
    - `count_issues`
    - `latest_status`, `latest_exception`, and the endpoint entry/end dates

- **Merging and Reshaping**:
  - The script merges expected provisions with issue statistics and restructures the data into a clean per-dataset format.

- **Output**:
  - A single CSV file, `odp-issue.csv`, is generated in the provided output directory, detailing the issue breakdown for each organisation and dataset.



In [2]:
import os
import pandas as pd
import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
import argparse

# Dataset names used when generate_detailed_issue_csv is called with
# dataset_type="spatial".
SPATIAL_DATASETS = [
    "article-4-direction-area", "conservation-area", "listed-building-outline",
    "tree-preservation-zone", "tree",
]

# Dataset names used when generate_detailed_issue_csv is called with
# dataset_type="document".
DOCUMENT_DATASETS = [
    "article-4-direction", "conservation-area-document", "tree-preservation-order",
]

# Every dataset the script knows about: spatial names first, then document names.
ALL_DATASETS = [*SPATIAL_DATASETS, *DOCUMENT_DATASETS]

def get_datasette_http():
    """Build a ``requests`` session that retries failed HTTPS requests.

    Returns:
        A ``requests.Session`` whose ``https://`` adapter retries a request up
        to three times, with a 0.2 backoff factor, when the server responds
        with HTTP status 400.
    """
    session = requests.Session()
    # Only the https:// prefix gets the retrying adapter; other schemes keep
    # the session's default adapter.
    session.mount(
        "https://",
        HTTPAdapter(
            max_retries=Retry(total=3, status_forcelist=[400], backoff_factor=0.2)
        ),
    )
    return session

def get_datasette_query(
    db: str,
    sql: str,
    url="https://datasette.planning.data.gov.uk",
    timeout: float = 60,
) -> pd.DataFrame:
    """Run a SQL query against a Datasette database and return the rows.

    Args:
        db: Name of the Datasette database; the request goes to
            ``{url}/{db}.json``.
        sql: SQL statement sent as the ``sql`` query parameter.
        url: Base URL of the Datasette instance.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` waits indefinitely on an unresponsive server.

    Returns:
        A DataFrame built from the JSON array in the response. On any failure
        (network error, timeout, HTTP error status, unparsable body) the error
        is printed and an EMPTY DataFrame is returned, so callers must treat
        an empty result as "no rows or query failed".
    """
    full_url = f"{url}/{db}.json"
    params = {"sql": sql, "_shape": "array", "_size": "max"}
    try:
        # The session is used as a context manager so its pooled connections
        # are released once the query is done instead of leaking per call.
        with get_datasette_http() as http:
            response = http.get(full_url, params=params, timeout=timeout)
            response.raise_for_status()
            payload = response.json()
        return pd.DataFrame(payload)
    except Exception as e:
        # Deliberately broad: this function is best-effort and reports
        # failure through an empty frame rather than by raising.
        print(f"[ERROR] Datasette query failed: {e}")
        return pd.DataFrame()

def get_provisions():
    """Fetch the expected Open Digital Planning provisions.

    Queries the ``digital-land`` database for provisions whose reason is
    ``expected`` and whose project is ``open-digital-planning``, grouped by
    organisation and cohort.

    Returns:
        The DataFrame returned by ``get_datasette_query`` with the columns
        ``cohort``, ``organisation``, ``cohort_start_date`` and
        ``organisation_name``.
    """
    expected_provisions_sql = """
        SELECT
            p.cohort,
            p.organisation,
            c.start_date AS cohort_start_date,
            o.name AS organisation_name
        FROM provision p
        INNER JOIN cohort c ON c.cohort = p.cohort
        INNER JOIN organisation o ON o.organisation = p.organisation
        WHERE p.provision_reason = 'expected'
          AND p.project = 'open-digital-planning'
        GROUP BY p.organisation, p.cohort
    """
    return get_datasette_query(db="digital-land", sql=expected_provisions_sql)

def get_issue_type_chunk(dataset_clause, offset):
    """Fetch one page (up to 1000 rows) of issue-type summary data.

    Selects every column of ``endpoint_dataset_issue_type_summary`` and
    left-joins the endpoint's end date, entry date, latest status and latest
    exception from ``endpoint_dataset_summary`` on ``endpoint``.

    Args:
        dataset_clause: A complete SQL ``WHERE ...`` clause (or an empty
            string) restricting ``edits`` rows; inserted verbatim into the
            query, so it must come from trusted code.
        offset: Number of rows to skip before this page.

    Returns:
        The DataFrame returned by ``get_datasette_query`` for the
        ``performance`` database (empty when there are no more rows or the
        query failed).
    """
    # NOTE(review): the join matches on endpoint only; if
    # endpoint_dataset_summary holds several rows per endpoint this
    # multiplies rows — confirm against the table schema.
    sql = f"""
        SELECT
            edits.*,
            eds.endpoint_end_date,
            eds.endpoint_entry_date,
            eds.latest_status,
            eds.latest_exception
        FROM endpoint_dataset_issue_type_summary edits
        LEFT JOIN (
            SELECT endpoint, end_date as endpoint_end_date,
                   entry_date as endpoint_entry_date,
                   latest_status, latest_exception
            FROM endpoint_dataset_summary
        ) eds ON edits.endpoint = eds.endpoint
        {dataset_clause}
        ORDER BY
            edits.organisation,
            edits.dataset,
            edits.endpoint,
            edits.resource,
            edits.issue_type,
            eds.endpoint_entry_date,
            eds.endpoint_end_date,
            eds.latest_status
        LIMIT 1000 OFFSET {int(offset)}
    """
    # The ORDER BY is what makes LIMIT/OFFSET paging reliable: without it the
    # row order is undefined, so consecutive pages could repeat or skip rows.
    return get_datasette_query("performance", sql)

def get_full_issue_type_summary(datasets):
    """Collect every issue-type summary row for the given datasets.

    Pages through ``get_issue_type_chunk`` 1000 rows at a time until a page
    comes back empty or short, then concatenates the pages.

    Args:
        datasets: Iterable of dataset names to include.

    Returns:
        A single DataFrame with all fetched rows and a fresh integer index.
        An empty DataFrame is returned when ``datasets`` is empty or when no
        rows were fetched. Because a failed query also yields an empty page,
        a failure part-way through silently truncates the result.
    """
    page_size = 1000  # must match the LIMIT used by get_issue_type_chunk

    dataset_names = [str(ds) for ds in datasets]
    if not dataset_names:
        # A bare "WHERE" with no predicate is invalid SQL; nothing was asked for.
        return pd.DataFrame()

    # Double any single quote so a name can never terminate the SQL literal.
    quoted_names = ", ".join(
        "'" + name.replace("'", "''") + "'" for name in dataset_names
    )
    dataset_clause = f"WHERE edits.dataset IN ({quoted_names})"

    df_list = []
    offset = 0
    while True:
        chunk = get_issue_type_chunk(dataset_clause, offset)
        if chunk.empty:
            break
        df_list.append(chunk)
        if len(chunk) < page_size:
            # A short page means the last rows have been read.
            break
        offset += page_size

    if not df_list:
        # pd.concat raises ValueError on an empty list.
        return pd.DataFrame()
    return pd.concat(df_list, ignore_index=True)

def generate_detailed_issue_csv(output_dir: str, dataset_type="all") -> str:
    """Build the ODP issue-level CSV and write it to ``output_dir``.

    Fetches the expected provisions and the issue-type summary rows for the
    chosen datasets, inner-joins them on ``organisation`` and ``cohort``, and
    writes a fixed set of columns to ``odp-issue.csv``.

    Args:
        output_dir: Directory for the CSV; created if it does not exist.
        dataset_type: ``"spatial"``, ``"document"`` or ``"all"``. Any other
            value falls back to all datasets (with a warning).

    Returns:
        Path of the written CSV file.

    Raises:
        RuntimeError: If either Datasette query returned no rows. The query
            helper reports failures as an empty frame, so an empty result is
            treated as an error instead of surfacing later as an opaque
            ``KeyError`` from the merge.
        KeyError: If the merged data lacks one of the expected output columns.
    """
    datasets_by_type = {
        "spatial": SPATIAL_DATASETS,
        "document": DOCUMENT_DATASETS,
        "all": ALL_DATASETS,
    }
    if dataset_type not in datasets_by_type:
        print(f"[WARN] Unknown dataset_type {dataset_type!r}; using all datasets")
    datasets = datasets_by_type.get(dataset_type, ALL_DATASETS)

    print("[INFO] Fetching provisions...")
    provisions = get_provisions()
    if provisions.empty:
        raise RuntimeError(
            "No provisions were returned from Datasette; "
            "the query failed or matched no rows."
        )

    print("[INFO] Fetching detailed issue-level data...")
    issues = get_full_issue_type_summary(datasets)
    if issues.empty:
        raise RuntimeError(
            "No issue-level rows were returned from Datasette; "
            "the query failed or matched no rows."
        )

    print("[INFO] Merging data...")
    # organisation_name is taken from the provisions frame, so any copy in
    # the issues frame is dropped to avoid suffixed duplicate columns.
    merged = provisions.merge(
        issues.drop(columns=["organisation_name"], errors="ignore"),
        on=["organisation", "cohort"],
        how="inner",
    )

    output_columns = [
        "organisation",
        "cohort",
        "organisation_name",
        "pipeline",
        "issue_type",
        "severity",
        "responsibility",
        "count_issues",
        "collection",
        "endpoint",
        "endpoint_url",
        "latest_status",
        "latest_exception",
        "resource",
        "latest_log_entry_date",
        "endpoint_entry_date",
        "endpoint_end_date",
        "resource_start_date",
        "resource_end_date",
    ]

    print("[INFO] Saving CSV...")
    os.makedirs(output_dir, exist_ok=True)
    output_path = os.path.join(output_dir, "odp-issue.csv")
    merged[output_columns].to_csv(output_path, index=False)

    print(f"[SUCCESS] CSV saved: {output_path} ({len(merged)} rows)")
    return output_path

def parse_args():
    """Parse the script's command-line options.

    Returns:
        An ``argparse.Namespace`` whose ``output_dir`` attribute holds the
        value of the required ``--output-dir`` option.
    """
    cli = argparse.ArgumentParser(
        description="Generate detailed ODP issue-level CSV"
    )
    cli.add_argument(
        "--output-dir",
        required=True,
        type=str,
        help="Directory to save the output CSV",
    )
    namespace = cli.parse_args()
    return namespace


if __name__ == "__main__":
    # NOTE(review): command-line parsing through parse_args() is currently
    # disabled and the output directory is hard-coded to a machine-specific
    # path — presumably so this cell can run inside a notebook; confirm
    # before reusing the script elsewhere.
    # To restore the CLI: args = parse_args(); output_dir = args.output_dir
    output_dir = "C:/Users/DanielGodden/Documents/MCHLG/collecting_and_managing_data"
    generate_detailed_issue_csv(output_dir, dataset_type="all")


[INFO] Fetching provisions...
[INFO] Fetching detailed issue-level data...
[INFO] Merging data...
[INFO] Saving CSV...
[SUCCESS] CSV saved: C:/Users/DanielGodden/Documents/MCHLG/collecting_and_managing_data\odp-issue.csv (575 rows)
