# Selecting Endpoints **with or without** Documentation URLs

**Author**:  Thiruni K <br>
**Date**:  24th April 2025 <br>
**Dataset Scope**: `dataset` <br>
**Report Type**: Ad-hoc analysis <br>
**Purpose**: This notebook loads all endpoint records and performs analysis to determine:
   - Total number of endpoints
   - Count and percentage of endpoints missing `documentation_url`
   - Datasets most affected by missing documentation URLs
   - Whether those endpoints are still active or have ended
   - The most recent `entry_date` among endpoints that are missing a `documentation_url`


In [None]:
import pandas as pd
import urllib.parse
import requests

In [None]:
DATASSETTE_URL = "https://datasette.planning.data.gov.uk/digital-land.json"

# Rows requested per page. Must stay equal to the LIMIT hard-coded in BASE_SQL.
PAGE_SIZE = 1000
# Seconds to wait for each HTTP response before giving up, so the cell
# cannot hang forever on a stalled connection.
REQUEST_TIMEOUT = 60

# --- Step 1: Load all endpoint-related data using SQL OFFSET pagination
# The ORDER BY carries tie-breaker columns after entry_date: without a total
# ordering, rows sharing an entry_date may be returned in a different order on
# each request, so consecutive LIMIT/OFFSET pages could repeat or skip rows.
BASE_SQL = """
SELECT
    o.name,
    s.organisation,
    sp.pipeline AS "pipeline/dataset",
    e.endpoint_url,
    s.documentation_url,
    s.entry_date,
    s.end_date,
    e.endpoint
FROM
    endpoint e
    INNER JOIN source s on e.endpoint = s.endpoint
    INNER JOIN source_pipeline sp on s.source = sp.source
    INNER JOIN organisation o on o.organisation = s.organisation
ORDER BY s.entry_date DESC, e.endpoint, s.source, sp.pipeline
LIMIT 1000 OFFSET {offset}
"""

all_rows = []
columns = []
offset = 0
fetch_complete = False  # set only when an empty page confirms the end of the data

while True:
    params = {
        "sql": BASE_SQL.format(offset=offset),
        "_size": PAGE_SIZE,
    }

    try:
        response = requests.get(DATASSETTE_URL, params=params, timeout=REQUEST_TIMEOUT)
    except requests.RequestException as exc:
        # Connection errors and timeouts are reported like a bad status code
        # instead of aborting the cell with a traceback.
        print("Failed to retrieve data from Datasette.")
        print(f"  Request error: {exc}")
        break

    if response.status_code != 200:
        print("Failed to retrieve data from Datasette.")
        print(f"  HTTP status: {response.status_code} at offset {offset}")
        break

    data = response.json()
    rows = data.get("rows", [])

    if not rows:
        fetch_complete = True
        break

    # Keep the column names from the first non-empty page so the final empty
    # page cannot overwrite them.
    columns = columns or data.get("columns", [])

    all_rows.extend(rows)
    # Advance by the rows actually received: if the server returns fewer rows
    # than requested, stepping by a fixed page size would skip records.
    offset += len(rows)

if all_rows and not fetch_complete:
    print(
        f"WARNING: pagination stopped early after {len(all_rows)} rows; "
        "the figures below are based on partial data."
    )

# Convert all rows into a DataFrame
if all_rows:
    df = pd.DataFrame(all_rows, columns=columns)
else:
    df = pd.DataFrame()

# --- Step 2: Analysis
# NOTE(review): every count below is a count of joined rows
# (endpoint x source x source_pipeline). If an endpoint can have several
# sources or pipelines it is counted more than once -- confirm whether
# de-duplicating on `endpoint` is wanted.

if not df.empty:
    total_records = len(df)
    # A documentation_url counts as missing when it is null or blank/whitespace.
    df['documentation_missing'] = df['documentation_url'].isnull() | (df['documentation_url'].astype(str).str.strip() == '')
    missing_count = df['documentation_missing'].sum()
    percent_missing = (missing_count / total_records) * 100

    print(f"Total endpoints: {total_records}")
    print(f"Missing documentation_url: {missing_count}")
    print(f"Percent missing: {percent_missing:.2f}%")

    # --- Step 3: Most affected datasets
    # Number of rows with a missing documentation_url per pipeline, largest first.
    affected_by_pipeline = df[df['documentation_missing']].groupby('pipeline/dataset').size().sort_values(ascending=False)
    print("\nTop affected pipelines (by missing documentation_url):")
    print(affected_by_pipeline.head(10).to_string())

    # --- Step 4: Are missing ones active or ended?
    # A row is treated as active when its end_date is null or blank.
    df['is_active'] = df['end_date'].isnull() | (df['end_date'].astype(str).str.strip() == '')
    active_missing = df[df['documentation_missing'] & df['is_active']].shape[0]
    ended_missing = df[df['documentation_missing'] & ~df['is_active']].shape[0]

    print(f"\nEndpoints with missing documentation_url that are still active: {active_missing}")
    print(f"Endpoints with missing documentation_url that are ended: {ended_missing}")

    # --- Step 5: Are we still adding missing ones?
    # Unparseable dates become NaT (errors='coerce') and are ignored by max().
    df['entry_date'] = pd.to_datetime(df['entry_date'], errors='coerce')
    most_recent_missing = df[df['documentation_missing']]['entry_date'].max()
    print(f"\nMost recent entry with missing documentation_url: {most_recent_missing.date() if pd.notnull(most_recent_missing) else 'N/A'}")
else:
    print("No data available to process.")

In [None]:
import matplotlib.pyplot as plt

# Horizontal bar chart of the pipelines with the most rows missing a
# documentation_url. `affected_by_pipeline` only exists when `df` is non-empty,
# so the `df.empty` check must come first to short-circuit.
if not df.empty and not affected_by_pipeline.empty:
    # Take 20 rows so the data matches the variable name and the chart title
    # (the previous head(10) drew only ten bars under a "Top 20" heading).
    top_20 = affected_by_pipeline.head(20)
    ax = top_20.plot(
        kind='barh',
        figsize=(10, 6),
        title='Top 20 Pipelines with Missing Documentation URLs'
    )
    plt.xlabel('Count of Missing documentation_url')
    plt.ylabel('Pipeline / Dataset')
    plt.gca().invert_yaxis()  # Highest count on top

    # Add value labels to the bars: bar i is drawn at y == i, and the label is
    # placed just past the end of the bar.
    for i, v in enumerate(top_20):
        ax.text(v + 0.5, i, str(v), va='center')

    plt.tight_layout()
    plt.show()