In [1]:
import urllib
import urllib.parse

import pandas as pd

In [4]:
def datasette_query(db, sql_string, base_url="https://datasette.planning.data.gov.uk"):
    """Run a SQL query against a Datasette database and return the rows as a DataFrame.

    The query is URL-encoded and sent to the database's ``.csv`` endpoint; the
    CSV response is parsed by ``pandas.read_csv``.

    Parameters
    ----------
    db : str
        Name of the Datasette database, used as the URL path segment
        (e.g. ``"digital-land"``).
    sql_string : str
        SQL statement to execute.
    base_url : str, optional
        Root URL of the Datasette instance. Defaults to the planning.data.gov.uk
        instance; a trailing slash is tolerated.

    Returns
    -------
    pandas.DataFrame
        One row per CSV record returned by Datasette.

    Raises
    ------
    Whatever ``pandas.read_csv`` raises when the URL cannot be fetched or the
    response cannot be parsed as CSV.
    """
    # NOTE(review): "_size=max" asks Datasette for its largest allowed page;
    # presumably rows beyond that server-side limit are silently dropped --
    # confirm against the instance's settings before relying on row counts.
    params = urllib.parse.urlencode({"sql": sql_string, "_size": "max"})
    url = f"{base_url.rstrip('/')}/{db}.csv?{params}"
    return pd.read_csv(url)




In [6]:
# Stale endpoints, "4xx" variant.
#   unique_endpoints : rows of reporting_historic_endpoints with no end date,
#                      an entry date more than a year old and a status starting
#                      with "4"; one row per (collection, endpoint, name) with
#                      the earliest entry date.
#   latest_log_entry : most recent log entry date with status '200' per endpoint.
#   final select     : keeps endpoints whose last 200 is more than 5 days old or
#                      missing, with source and provision details joined on.
# String literals use single quotes: double-quoted literals only work through
# SQLite's legacy fallback that treats an unknown identifier as a string.
q1 = """
    WITH unique_endpoints AS (
    SELECT
        collection,
        pipeline as dataset,
        endpoint,
        organisation,
        name,
        MIN(endpoint_entry_date) AS endpoint_entry_date
    FROM
        reporting_historic_endpoints
    WHERE
        (
        "endpoint_end_date" is null
        OR "endpoint_end_date" = ''
        )
        AND "endpoint_entry_date" < DATE('now', '-1 year')
        AND "status" LIKE '4%'
    GROUP BY
        collection, endpoint, name
    ),
    latest_log_entry AS (
    SELECT
        endpoint,
        MAX(latest_log_entry_date) AS latest_200_log_entry_date
    FROM
        reporting_historic_endpoints
    WHERE
        status = '200'
    GROUP BY
        endpoint
    )
    SELECT
    ue.dataset,
    ue.name,
    p.project,
    p.provision_reason,
    ue.endpoint,
    strftime('%d-%m-%Y', ue.endpoint_entry_date) as endpoint_entry_date,
    strftime('%d-%m-%Y', l.latest_200_log_entry_date) as latest_200_log_entry_date,
    CAST(julianday('now') - julianday(l.latest_200_log_entry_date) AS int64) as n_days_since_last_200,
    s.source
    FROM
    unique_endpoints ue
    LEFT JOIN source s ON ue.endpoint = s.endpoint
    LEFT JOIN latest_log_entry l ON ue.endpoint = l.endpoint
    LEFT JOIN provision p on ue.dataset = p.dataset and ue.organisation = p.organisation
    WHERE
    (
        l.latest_200_log_entry_date < DATE('now', '-5 day')
        OR l.latest_200_log_entry_date IS NULL
    )
    ORDER BY
    ue.dataset,
    julianday('now') - julianday(l.latest_200_log_entry_date) desc
  """

stale_like_4 = datasette_query("digital-land", q1)

In [7]:
# Stale endpoints, "not 2xx" variant: same structure as a 4xx-only filter, but
# unique_endpoints keeps every row whose status does not start with "2".
#   unique_endpoints : rows of reporting_historic_endpoints with no end date,
#                      an entry date more than a year old and a non-2xx status;
#                      one row per (collection, endpoint, name) with the
#                      earliest entry date.
#   latest_log_entry : most recent log entry date with status '200' per endpoint.
#   final select     : keeps endpoints whose last 200 is more than 5 days old or
#                      missing, with source and provision details joined on.
# String literals use single quotes: double-quoted literals only work through
# SQLite's legacy fallback that treats an unknown identifier as a string.
q2 = """
WITH unique_endpoints AS (
  SELECT
    collection,
    pipeline as dataset,
    endpoint,
    organisation,
    name,
    MIN(endpoint_entry_date) AS endpoint_entry_date
  FROM
    reporting_historic_endpoints
  WHERE
    (
      "endpoint_end_date" is null
      OR "endpoint_end_date" = ''
    )
    AND "endpoint_entry_date" < DATE('now', '-1 year')
    AND "status" not LIKE '2%'
  GROUP BY
    collection, endpoint, name
),
latest_log_entry AS (
  SELECT
    endpoint,
    MAX(latest_log_entry_date) AS latest_200_log_entry_date
  FROM
    reporting_historic_endpoints
  WHERE
    status = '200'
  GROUP BY
    endpoint
)
SELECT
  ue.dataset,
  ue.name,
  p.project,
  p.provision_reason,
  ue.endpoint,
  strftime('%d-%m-%Y', ue.endpoint_entry_date) as endpoint_entry_date,
  strftime('%d-%m-%Y', l.latest_200_log_entry_date) as latest_200_log_entry_date,
  CAST(julianday('now') - julianday(l.latest_200_log_entry_date) AS int64) as n_days_since_last_200,
  s.source
FROM
  unique_endpoints ue
  LEFT JOIN source s ON ue.endpoint = s.endpoint
  LEFT JOIN latest_log_entry l ON ue.endpoint = l.endpoint
  LEFT JOIN provision p on ue.dataset = p.dataset and ue.organisation = p.organisation
WHERE
  (
    l.latest_200_log_entry_date < DATE('now', '-5 day')
    OR l.latest_200_log_entry_date IS NULL
  )
ORDER BY
  ue.dataset,
  julianday('now') - julianday(l.latest_200_log_entry_date) desc
  """

stale_not_2 = datasette_query("digital-land", q2)

In [8]:
# Row counts of the two result sets, one per line: "not 2xx" first, then "4xx".
print(len(stale_not_2), len(stale_like_4), sep="\n")

59
38


In [10]:
# Rows of the "not 2xx" result whose endpoint is absent from the "4xx" result.
stale_not_2.loc[lambda frame: ~frame["endpoint"].isin(stale_like_4["endpoint"])].head()

Unnamed: 0,dataset,name,project,provision_reason,endpoint,endpoint_entry_date,latest_200_log_entry_date,n_days_since_last_200,source
7,brownfield-land,South Downs National Park Authority,,statutory,110d1feccabea662a02fc393b1e8156f4d4e04f3d65b13...,22-05-2018,13-08-2024,8,15d579380710cba22bc70cbee02f20ed
8,brownfield-land,South Downs National Park Authority,,statutory,224e2b298f076be0a10d5bf0dd70293a983846a6b73ab0...,05-07-2018,13-08-2024,8,fdf0ca858b9a001e29954ccbb69316dd
9,brownfield-land,New Forest National Park Authority,,statutory,2d537016a9765eaf9d7790b3be27c495ef9970ee42ae62...,16-12-2019,13-08-2024,8,e8338e8ca638b9a9bd6b8a4a294a84ec
10,brownfield-land,New Forest National Park Authority,,statutory,36e2aca78ebc25d31cf598e884b0be28039f19f4714b29...,30-12-2021,13-08-2024,8,e2fe67c417e66eab18be0d03ffcff1cb
11,brownfield-land,South Downs National Park Authority,,statutory,52fabab08a9b97eb22b91f59ef90b1b394792b0e06d257...,30-12-2021,13-08-2024,8,62176ed1b8a71bd00e6c4f297709cefb


In [13]:
# Endpoints whose most recent status is 404, and how long that status has lasted.
#   endpoint_status_instances : rows with an empty end date, ranked per
#                               (organisation, pipeline, endpoint) by
#                               latest_log_entry_date, newest first.
#   latest_status_404         : rank-1 rows whose status is 404.
#   previous_status           : rank-2 rows, i.e. the entry just before the latest.
#   status_duration_days      : days between the latest row's log date and the
#                               previous row's log date, falling back to the
#                               endpoint entry date when there is no previous row.
# previous_status is joined on the same (organisation, pipeline, endpoint) key
# used for ranking; joining on endpoint alone multiplies rows whenever one
# endpoint appears under several organisation/pipeline pairs.
# Ordering uses status_duration_days so rows without a previous status sort by
# the same value that is displayed.
q3 = """
WITH endpoint_status_instances as (
  select 
     organisation, name, pipeline, endpoint, status, endpoint_entry_date, latest_log_entry_date,
     row_number() over (partition by organisation, pipeline, endpoint order by latest_log_entry_date desc) as status_instance
  from reporting_historic_endpoints
  where endpoint_end_date = ''
--  AND endpoint_entry_date < DATE('now', '-1 year')
  order by pipeline
  ),
  
latest_status_404 as(
  select * from endpoint_status_instances
  WHERE status_instance = 1
  AND status = 404
  ),
  
previous_status as (
  select organisation, pipeline, endpoint, latest_log_entry_date as prev_status_entry_date
  FROM endpoint_status_instances
  where status_instance = 2
)

select 
  br.organisation, br.name, br.pipeline as dataset, br.endpoint, br.status as latest_status, 
  strftime('%d-%m-%Y', br.endpoint_entry_date) as endpoint_entry_date,
  strftime('%d-%m-%Y', br.latest_log_entry_date) as latest_log_entry_date,
  CAST(
    julianday(br.latest_log_entry_date) - julianday(
      case when pr.prev_status_entry_date is null then br.endpoint_entry_date else pr.prev_status_entry_date end
    ) AS int64) as status_duration_days
  
from latest_status_404 br
left join previous_status pr
  on br.organisation = pr.organisation
  and br.pipeline = pr.pipeline
  and br.endpoint = pr.endpoint
order by dataset, status_duration_days desc

  """

stale_404s = datasette_query("digital-land", q3)

In [14]:
# Number of result rows per endpoint value, most frequent first.
stale_404s.loc[:, "endpoint"].value_counts()

endpoint
41fc6aaaf2059002dfe1401841e88a56278f580c6c425683f5fba935b339bdcd    4
28c69df499d7aec5b8121a8d0e0c4f08ede0c85700028bef1ffbbac45a72c79f    1
597ba6f8b88f54f59c1a1ae3bfb6972a1cc92a0a63de33cfe08bf4af0237c53b    1
d1f07ed85cd989adf1fed61292c2944d5293214fd198be3ef115d07e0eeff26c    1
8066711da61959140b9587725af9d389998127272c97ca5b9e87119b2d971aba    1
f7fe6fbe3117c7a0847cc26dd0c708c5c4c749d0a178cc846450a3a3a4bff969    1
22ab6c0ec4df21603a64d93bfaec46b8a039fe2ebbcdfd52d390115759970574    1
50ce8bc714cd997dd04c95ee0d5ac513274d92e2de727a357080ed366a73bf78    1
53f414c26a47ac567d167155e2d55cc523337818cdc32e5001429aca9aa98cfa    1
998fe275c1cce43f0ba4186ea2b0014dc711ccbba2f64109edbe9ac4842497ff    1
9e8b39e2651101f6070aecc6ab8138a767a921a69ade81663c0734cdc3d5df87    1
b9b5ea6bcbca02fbf560859045d78b5ef21b5dffbaa7d578b01e810e0572d6c6    1
f89b6ad0d286c7180c637fdac62c341e851f1c21779b19f5851978db1af1ee4a    1
4d0480beaf5e2be7ad7e63528c311cd0753ef2c1c17d58eb4855e4ee32ae0ca5    1
6b5a5ff541c

In [None]:
# Variant of the latest-status-404 query that drops endpoints which are the
# only active endpoint for their organisation and dataset (pipeline).
#   solo_endpoints : (organisation, pipeline) pairs with exactly one distinct
#                    endpoint among rows with an empty end date.
#   final filter   : the left join plus "se.n_endpoints is null" keeps only
#                    rows whose (organisation, pipeline) pair is NOT in
#                    solo_endpoints.
# previous_status is joined on the same (organisation, pipeline, endpoint) key
# used for ranking, so an endpoint shared by several organisation/pipeline
# pairs does not multiply rows.
# The query is bound to a name so it can be passed to datasette_query; it is
# not executed in this cell.
q4 = """
WITH endpoint_status_instances as (
  select 
     organisation, name, pipeline, endpoint, status, endpoint_entry_date, latest_log_entry_date,
     row_number() over (partition by organisation, pipeline, endpoint order by latest_log_entry_date desc) as status_instance
  from reporting_historic_endpoints
  where endpoint_end_date = ''
--  AND endpoint_entry_date < DATE('now', '-1 year')
  order by pipeline
  ),
  
latest_status_404 as(
  select * from endpoint_status_instances
  WHERE status_instance = 1
  AND status = 404
  ),
  
previous_status as (
  select organisation, pipeline, endpoint, latest_log_entry_date as prev_status_entry_date
  FROM endpoint_status_instances
  where status_instance = 2
),

solo_endpoints as (
  select organisation, pipeline, count(distinct endpoint) as n_endpoints
  from reporting_historic_endpoints
  where endpoint_end_date = ''
  group by 1, 2
  having count(distinct endpoint) = 1
  )

select 
  br.organisation, br.name, br.pipeline as dataset, br.endpoint, br.status as latest_status, 
  strftime('%d-%m-%Y', br.endpoint_entry_date) as endpoint_entry_date,
  strftime('%d-%m-%Y', br.latest_log_entry_date) as latest_log_entry_date,
  CAST(
    julianday(br.latest_log_entry_date) - julianday(
      case when pr.prev_status_entry_date is null then br.endpoint_entry_date else pr.prev_status_entry_date end
    ) AS int64) as status_duration_days
  
from latest_status_404 br
left join previous_status pr
  on br.organisation = pr.organisation
  and br.pipeline = pr.pipeline
  and br.endpoint = pr.endpoint
left join solo_endpoints se on br.organisation = se.organisation and br.pipeline = se.pipeline
where se.n_endpoints is null
order by dataset, status_duration_days desc
"""