In [None]:
from os import sys, path
from datetime import datetime as dt
from IPython.display import display
import urllib
import numpy as np
import math
import pandas as pd
import ipywidgets as widgets
import requests
import urllib.parse

# All Endpoints Information

In [None]:
# Base URL of the Datasette instance queried by this notebook.
datasette_url = "https://datasette.planning.data.gov.uk/"

# Organisation identifiers kept when the "Primary LPA's" option is selected.
primary_lpas = [
    "local-authority-eng:BUC",
    "local-authority-eng:DAC",
    "local-authority-eng:DNC",
    "local-authority-eng:GLO",
    "local-authority-eng:CMD",
    "local-authority-eng:LBH",
    "local-authority-eng:SWK",
    "local-authority-eng:MDW",
    "local-authority-eng:NET",
    "local-authority-eng:BIR",
    "local-authority-eng:CAT",
    "local-authority-eng:EPS",
    "local-authority-eng:BNE",
    "local-authority-eng:GAT",
    "local-authority-eng:GRY",
    "local-authority-eng:KTT",
    "local-authority-eng:SAL",
    "local-authority-eng:TEW",
    "local-authority-eng:WBK",
    "local-authority-eng:DST",
    "local-authority-eng:DOV",
    "local-authority-eng:LIV",
    "local-authority-eng:RDB",
    "local-authority-eng:WFT",
    "local-authority-eng:NLN",
    "local-authority-eng:NSM",
    "local-authority-eng:SLF",
    "local-authority-eng:WRL",
]

# Most recent table produced by update_dataframe(); empty until it has run.
result_df = pd.DataFrame()

def update_dataframe(organisation):
    """Load endpoint log statuses from Datasette into the global ``result_df``.

    The query joins the log, source, organisation, endpoint and
    source_pipeline tables, excludes the ``brownfield-land`` collection and
    returns one row per endpoint and status.

    Args:
        organisation: Organisation identifier matched against
            ``s.organisation``. A falsy value, or the literal
            ``"Primary LPA's"``, queries every organisation; in the latter
            case the result is then narrowed to ``primary_lpas``.

    Returns:
        pandas.DataFrame: The query result, which is also stored in the
        global ``result_df``.
    """
    global result_df
    only_primary = organisation == "Primary LPA's"
    if organisation and not only_primary:
        query = f" s.organisation = '{organisation}'"
    else:
        # "Primary LPA's" and an empty selection both fetch all organisations.
        query = " s.organisation LIKE '%'"

    sql = f"""
        select
          e.endpoint_url,
          l.status,
          l.exception,
          s.collection,
          group_concat(DISTINCT sp.pipeline) as pipelines,
          s.organisation,
          o.name,
          max(l.entry_date) maxentrydate,
          max(e.entry_date) entrydate,
          e.end_date
        from
          log l
          inner join source s on l.endpoint = s.endpoint
          inner join organisation o on o.organisation = replace(s.organisation, '-eng', '')
          inner join endpoint e on l.endpoint = e.endpoint
          inner join source_pipeline sp on s.source = sp.source
        where
           {query} and not collection="brownfield-land"
        group by
          l.endpoint,
          l.status
        order by
          l.endpoint,
          s.collection,
          maxentrydate desc
        """
    params = urllib.parse.urlencode({"sql": sql, "_size": "max"})

    result_df = pd.read_csv(f"{datasette_url}digital-land.csv?{params}")
    if only_primary:
        # The SQL fetched everything; keep only the primary LPAs here.
        is_primary = result_df['organisation'].isin(primary_lpas)
        result_df = result_df[is_primary].reset_index(drop=True)
    return result_df

def get_provisions():
    """Load the expected provisions for the listed cohorts into ``provisions_df``.

    The query joins ``provision`` to ``organisation`` on the part of
    ``p.organisation`` after the colon, keeps rows whose cohort is one of
    the four listed values and whose ``provision_reason`` is ``expected``,
    and groups them by organisation.

    Returns:
        pandas.DataFrame: The query result, which is also stored in the
        global ``provisions_df``.
    """
    global provisions_df
    sql = """
        SELECT
            p.cohort,
            p.notes,
            p.organisation,
            p.project,
            p.provision_reason,
            o.reference,
            o.name
        FROM
            provision AS p
        JOIN
            organisation AS o ON SUBSTR(p.organisation, INSTR(p.organisation, ':') + 1) = o.reference
        WHERE
            p.cohort IN ("ODP-Track1", "ODP-Track3", "ODP-Track2", "RIPA-BOPS")
            AND p.provision_reason = "expected"
        GROUP BY
            p.organisation
        ORDER BY
            p.cohort
        """
    query_string = urllib.parse.urlencode({"sql": sql, "_size": "max"})
    provisions_df = pd.read_csv(f"{datasette_url}digital-land.csv?{query_string}")
    return provisions_df

def get_org_dict():
    """Build the dropdown options mapping organisation name to identifier.

    Names and identifiers come from ``get_provisions()``. Every ``:`` in an
    identifier is replaced with ``-eng:``, and an extra ``"All LPA's"``
    entry mapped to ``None`` is added last.

    Returns:
        dict: Name-to-identifier mapping, also stored in the global
        ``organisation_options``.
    """
    global organisation_options
    provisions = get_provisions()[["organisation", "name"]]
    # Derive the rewritten identifiers as a separate series rather than
    # writing them back into the column slice.
    org_ids = provisions["organisation"].str.replace(":", "-eng:")
    organisation_options = {name: org_id for name, org_id in zip(provisions["name"], org_ids)}
    organisation_options["All LPA's"] = None
    return organisation_options

# Shared organisation selector: the same dropdown object is passed to every
# widgets.interact(...) call below that takes an `organisation` argument.
# NOTE(review): `global` at module level has no effect.
global organisation_dropdown
organisation_dropdown = widgets.Dropdown(
    # get_org_dict() runs when this cell executes, so the options come from
    # a Datasette query made at that moment.
    options = get_org_dict(),
    description="Select LPA:",
)

# Hook update_dataframe up to the dropdown so it receives the selected value.
widgets.interact(update_dataframe, organisation=organisation_dropdown)
# NOTE(review): initial_organisation is not read anywhere in the visible notebook.
initial_organisation = organisation_dropdown.value

In [None]:
download = input("Do you want to download the table with all endpoints? (yes/no): ")

# Ignore surrounding whitespace and accept the short form "y", so that an
# answer such as "yes " or "Y" is not silently treated as "no".
if download.strip().lower() in ("yes", "y"):
    # result_df is the table last produced by update_dataframe().
    result_df.to_csv("endpoints_with_all_status.csv", index=False)
    print("Query result downloaded as 'endpoints_with_all_status.csv'")

# Endpoints with current/latest status

In [None]:
# Table written by update_dataframe_latest_status(); starts empty so that the
# download cell below has something to write before the function has run.
new_df = pd.DataFrame()

def update_dataframe_latest_status(organisation):
    """Reduce the endpoint/status table to one row per endpoint URL.

    Starts from ``update_dataframe(organisation)``, keeps the first row of
    each endpoint URL and adds three columns:

    * ``last_status`` / ``last_updated_date``: status and ``maxentrydate`` of
      the row directly below, filled only when the current status is not 200
      and that row belongs to the same endpoint URL;
    * ``date_last_status_200``: first 19 characters of the ``maxentrydate``
      of the endpoint's status-200 row, filled only when ``last_status`` is
      set and is not 200.

    Args:
        organisation: Passed through unchanged to ``update_dataframe``.

    Returns:
        pandas.DataFrame: The reduced table, also stored in the global
        ``new_df``.
    """
    global new_df
    all_endpoints = update_dataframe(organisation)
    new_df = all_endpoints.copy()
    new_df['maxentrydate'] = pd.to_datetime(new_df['maxentrydate'])
    for column in ('last_status', 'last_updated_date', 'date_last_status_200'):
        new_df[column] = None

    final_label = len(new_df) - 1
    for index, row in new_df.iterrows():
        # The last row has nothing below it; a 200 row needs no history.
        if index >= final_label or row['status'] == 200:
            continue
        below = index + 1
        if row['endpoint_url'] == new_df.at[below, 'endpoint_url']:
            new_df.at[index, 'last_status'] = new_df.at[below, 'status']
            new_df.at[index, 'last_updated_date'] = new_df.at[below, 'maxentrydate']

    new_df = new_df.drop_duplicates(subset='endpoint_url', keep='first').reset_index(drop=True)

    for index, row in new_df.iterrows():
        previous_status = row['last_status']
        if previous_status is None or previous_status == 200:
            continue
        # Look the 200 row up in the untouched query result, where
        # maxentrydate is still the original text.
        same_endpoint = all_endpoints['endpoint_url'] == row['endpoint_url']
        successes = all_endpoints[same_endpoint & (all_endpoints['status'] == 200)]
        if not successes.empty:
            new_df.at[index, 'date_last_status_200'] = successes['maxentrydate'].values[0][:19]
    return new_df

# Hook update_dataframe_latest_status up to the shared organisation dropdown.
widgets.interact(update_dataframe_latest_status, organisation=organisation_dropdown)
# NOTE(review): initial_organisation is not read anywhere in the visible notebook.
initial_organisation = organisation_dropdown.value

In [None]:
download = input("Do you want to download the table with latest endpoints? (yes/no): ")

# Ignore surrounding whitespace and accept the short form "y", so that an
# answer such as "yes " or "Y" is not silently treated as "no".
if download.strip().lower() in ("yes", "y"):
    # new_df is the table last produced by update_dataframe_latest_status().
    new_df.to_csv("endpoints_with_latest_status.csv", index=False)
    print("Query result downloaded as 'endpoints_with_latest_status.csv'")

# Endpoints with status NOT 200

In [None]:
# Table written by update_dataframe_erroring_endpoints(); starts empty so that
# the download cell below has something to write before the function has run.
filtered_df = pd.DataFrame()

def update_dataframe_erroring_endpoints(organisation):
    """Keep only the endpoints whose current status is not 200.

    Args:
        organisation: Passed through unchanged to
            ``update_dataframe_latest_status``.

    Returns:
        pandas.DataFrame: Rows of the latest-status table with
        ``status != 200``, re-indexed from zero and also stored in the global
        ``filtered_df``.
    """
    global filtered_df
    filtered_df = update_dataframe_latest_status(organisation)
    is_erroring = filtered_df['status'].ne(200)
    filtered_df = filtered_df.loc[is_erroring].reset_index(drop=True)
    return filtered_df

# Hook update_dataframe_erroring_endpoints up to the shared organisation dropdown.
widgets.interact(update_dataframe_erroring_endpoints, organisation=organisation_dropdown)
# NOTE(review): initial_organisation is not read anywhere in the visible notebook.
initial_organisation = organisation_dropdown.value

In [None]:
download = input("Do you want to download the table with erroring endpoints being collected till date? (yes/no): ")

# Ignore surrounding whitespace and accept the short form "y", so that an
# answer such as "yes " or "Y" is not silently treated as "no".
if download.strip().lower() in ("yes", "y"):
    # filtered_df is the table last produced by update_dataframe_erroring_endpoints().
    filtered_df.to_csv("endpoints_not_200.csv", index=False)
    print("Query result downloaded as 'endpoints_not_200.csv'")

# First 4 datasets - Endpoints with status NOT 200 - All LPA's

In [None]:
# Base URL of the Datasette instance queried by this section.
datasette_url = "https://datasette.planning.data.gov.uk/"

# Table written by update_dataframe1() and saved by the download cell below.
# It starts empty so the download cell works even if the query has not
# succeeded yet. This cell previously reset `result_df` instead, which this
# section never uses and which holds the table from the first section.
df1 = pd.DataFrame()

def update_dataframe1(collection):
    """Fetch the erroring endpoints of one collection (or the default set) into ``df1``.

    Queries the ``digital-land`` Datasette database for endpoint log rows
    grouped by endpoint and status, keeps the first row returned for each
    endpoint URL, annotates it with the status that preceded it, and returns
    only the endpoints whose kept status is not 200.

    Added columns:

    * ``last_status`` / ``last_updated_date``: status and ``maxentrydate`` of
      the next row of the same endpoint URL, filled only when the current
      status is not 200;
    * ``date_last_status_200``: first 19 characters of the ``maxentrydate``
      of the endpoint's status-200 row, filled only when ``last_status`` is
      set and is not 200.

    Args:
        collection: Value compared against ``s.collection``. A falsy value
            selects the hard-coded list of names in the ``IN (...)`` clause.

    Returns:
        pandas.DataFrame: The filtered table, re-indexed from zero and also
        stored in the global ``df1``.
    """
    global df1
    if collection:
        query = f" s.collection = '{collection}'"
    else:
        query = " s.collection IN ('article-4-direction', 'listed-building', 'tree-preservation-order', 'conservation-area','article-4-direction-area','article-4-direction-rule','listed-building-grade','listed-building-outline','listed-building-building','tree','tree-preservation-zone','tree-preservation-zone-type')"
    params = urllib.parse.urlencode({
        "sql": f"""
        select
          e.endpoint_url,
          l.status,
          l.exception,
          s.collection,
          group_concat(DISTINCT sp.pipeline) as pipelines,
          s.organisation,
          o.name,
          max(l.entry_date) maxentrydate,
          max(e.entry_date) entrydate,
          e.end_date
        from
          log l
          inner join source s on l.endpoint = s.endpoint
          inner join organisation o on o.organisation = replace(s.organisation, '-eng', '')
          inner join endpoint e on l.endpoint = e.endpoint
          inner join source_pipeline sp on s.source = sp.source
        where
           {query} 
        group by
          l.endpoint,
          l.status
        order by
          pipelines,
          o.name,
          maxentrydate desc
        """,
        "_size": "max"
    })

    url = f"{datasette_url}digital-land.csv?{params}"
    df1 = pd.read_csv(url)
    # Untouched copy: maxentrydate stays text here for the [:19] slice below.
    result_df1 = df1.copy()

    df1['maxentrydate'] = pd.to_datetime(df1['maxentrydate'])
    df1['last_status'] = None
    df1['last_updated_date'] = None
    df1['date_last_status_200'] = None

    # The query orders rows by pipelines, organisation name and date, so the
    # rows of one endpoint are not guaranteed to be adjacent. Find the next
    # row *of the same endpoint URL* instead of assuming it is the row
    # directly below; rows with no later row for their endpoint get NaN.
    row_labels = pd.Series(df1.index, index=df1.index)
    next_row_of_endpoint = row_labels.groupby(df1['endpoint_url'], sort=False).shift(-1)

    for index, row in df1.iterrows():
        following = next_row_of_endpoint.at[index]
        if row['status'] != 200 and pd.notna(following):
            following = int(following)  # shift() upcasts the labels to float
            df1.at[index, 'last_status'] = df1.at[following, 'status']
            df1.at[index, 'last_updated_date'] = df1.at[following, 'maxentrydate']

    df1 = df1.drop_duplicates(subset='endpoint_url', keep='first').reset_index(drop=True)

    for index, row in df1.iterrows():
        previous_status = row['last_status']
        if previous_status is None or previous_status == 200:
            continue
        same_endpoint = result_df1['endpoint_url'] == row['endpoint_url']
        successes = result_df1[same_endpoint & (result_df1['status'] == 200)]
        if not successes.empty:
            df1.at[index, 'date_last_status_200'] = successes['maxentrydate'].values[0][:19]

    df1 = df1[df1['status'] != 200].reset_index(drop=True)
    return df1


# Maps each dropdown label to the value passed to update_dataframe1 as
# `collection`; None makes update_dataframe1 use its built-in list of names.
collection_options = {
    "All 4 datasets": None,
    "article 4 direction": "article-4-direction",
    "conservation area": "conservation-area",
    "listed building": "listed-building",
    "tree preservation order": "tree-preservation-order",
}
collection_dropdown = widgets.Dropdown(
    options=collection_options,
    description="Select dataset:",
)

# Hook update_dataframe1 up to the dataset dropdown.
widgets.interact(update_dataframe1, collection=collection_dropdown)
# This cell used to read organisation_dropdown.value here, a leftover from
# the organisation cells; the selector built in this cell is the collection one.
initial_collection = collection_dropdown.value

In [None]:
download = input("Do you want to download the table with erroring endpoints being collected till date? (yes/no): ")

# Ignore surrounding whitespace and accept the short form "y", so that an
# answer such as "yes " or "Y" is not silently treated as "no".
if download.strip().lower() in ("yes", "y"):
    # df1 is the table last produced by update_dataframe1().
    df1.to_csv("endpoints_not_200_first_4_datasets.csv", index=False)
    print("Query result downloaded as 'endpoints_not_200_first_4_datasets.csv'")

# Counting the Number of Distinct Non-ended Endpoints

This SQL query will **count** the total number of **distinct** and **active** endpoint urls for each collection dataset, for each organisation, and will only return rows with counts greater than 1. Datasets with more than one endpoint url can then be further investigated, to see whether these additional endpoint urls are dysfunctional or no longer needed.

**N.B:**
- Whether the endpoint_url is active is determined by its end_date value: an endpoint with no end_date is counted as active (non-ended).
- Currently the LPA list isn't exhaustive and may need to be added to (defined above) but "all LPAs" can be selected.
- The BFL dataset is not included in this query because of how large the BFL dataset is.
- The number of non-ended endpoints where `status != 200` can also be counted here but it pushes the datasette query limit.


In [None]:
def get_endpoint_aggregate(organisation):
    """Count ended and non-ended endpoint URLs per organisation, collection and pipeline.

    An endpoint counts as non-ended when ``e.end_date`` is the empty string.
    The ``brownfield-land`` collection is excluded and only groups with more
    than one non-ended endpoint are returned.

    Args:
        organisation: Organisation identifier matched against
            ``s.organisation``. A falsy value, or the literal
            ``"Primary LPA's"``, queries every organisation; in the latter
            case the result is then narrowed to ``primary_lpas``.

    Returns:
        pandas.DataFrame: The aggregate table, also stored in the global
        ``endpoint_aggregate``.
    """
    global endpoint_aggregate
    only_primary = organisation == "Primary LPA's"
    if organisation and not only_primary:
        query = f" s.organisation = '{organisation}'"
    else:
        # "Primary LPA's" and an empty selection both fetch all organisations.
        query = " s.organisation LIKE '%'"

    sql = f"""
        SELECT
            s.organisation AS organisation,
            s.collection AS collection,
            sp.pipeline AS pipeline,
            COUNT ( 
                DISTINCT 
                    CASE 
                        WHEN e.end_date = "" THEN e.endpoint_url ELSE NULL END
                )
                AS non_ended_endpoints,
            COUNT ( 
                DISTINCT 
                    CASE 
                        WHEN e.end_date != "" THEN e.endpoint_url ELSE NULL END
                )
                AS ended_endpoints
        FROM
          log l
          INNER JOIN source s ON l.endpoint = s.endpoint
          INNER JOIN endpoint e ON l.endpoint = e.endpoint
          LEFT JOIN source_pipeline sp ON s.source = sp.source
        WHERE
           ({query})
           AND (NOT collection="brownfield-land")
        GROUP BY
            s.organisation,
            s.collection,
            sp.pipeline
        HAVING
            non_ended_endpoints > 1
        ORDER BY
            non_ended_endpoints DESC
        """
    params = urllib.parse.urlencode({"sql": sql, "_size": "max"})

    endpoint_aggregate = pd.read_csv(f"{datasette_url}digital-land.csv?{params}")
    if only_primary:
        is_primary = endpoint_aggregate['organisation'].isin(primary_lpas)
        endpoint_aggregate = endpoint_aggregate[is_primary].reset_index(drop=True)
    return endpoint_aggregate

# Hook get_endpoint_aggregate up to the shared organisation dropdown.
widgets.interact(get_endpoint_aggregate, organisation=organisation_dropdown)
# NOTE(review): initial_organisation is not read anywhere in the visible notebook.
initial_organisation = organisation_dropdown.value

In [None]:
download = input("Do you want to download the table? (yes/no): ")

# Ignore surrounding whitespace and accept the short form "y", so that an
# answer such as "yes " or "Y" is not silently treated as "no".
if download.strip().lower() in ("yes", "y"):
    # endpoint_aggregate is the table last produced by get_endpoint_aggregate().
    endpoint_aggregate.to_csv("endpoint_aggregate.csv", index=False)
    print("Query result downloaded as 'endpoint_aggregate.csv'")

# Status and end_date of **all** Distinct Endpoints

This cell contains functions which retrieve **all** distinct **non BFL** endpoints, grouped by collection and pipeline **(not just the endpoints counted above)**. The endpoint entry date, end date, most recent log date and most recent status for each endpoint are retrieved to help deduce whether the endpoint is no longer functional and if it has been manually listed as an old endpoint.

**N.B:** 
- This query pushes the datasette timeout limit and often gives an internal server (500) error; if that happens, rerun this cell and the following cells.


In [None]:
def get_most_recent_logs():
    """Load the latest log entry date of every endpoint into ``most_recent_logs``.

    The query groups the endpoint/log join by endpoint and selects
    ``MAX(l.entry_date)`` together with ``l.status``.

    Returns:
        pandas.DataFrame: Columns ``endpoint_hash``,
        ``most_recent_entry_date`` and ``most_recent_status``; also stored in
        the global ``most_recent_logs``.
    """
    global most_recent_logs
    sql = """
        SELECT e.endpoint AS endpoint_hash, MAX(l.entry_date) AS most_recent_entry_date, l.status AS most_recent_status
        FROM endpoint e
        JOIN log l ON e.endpoint = l.endpoint
        GROUP BY e.endpoint
        """
    query_string = urllib.parse.urlencode({"sql": sql, "_size": "max"})
    most_recent_logs = pd.read_csv(f"{datasette_url}digital-land.csv?{query_string}")
    return most_recent_logs

def get_distinct_endpoints_status_and_end_dates(organisation):
    """Load the distinct non-brownfield endpoints with their entry and end dates.

    The query groups by collection, pipeline and endpoint. The ``end_date``
    column it returns is ``s.end_date``, taken from the ``source`` table.

    Args:
        organisation: Organisation identifier matched against
            ``s.organisation``. A falsy value, or the literal
            ``"Primary LPA's"``, queries every organisation; in the latter
            case the result is then narrowed to ``primary_lpas``.

    Returns:
        pandas.DataFrame: The endpoint table, also stored in the global
        ``distinct_endpoints_status_and_end_dates_df``.
    """
    global distinct_endpoints_status_and_end_dates_df
    only_primary = organisation == "Primary LPA's"
    if organisation and not only_primary:
        query = f" s.organisation = '{organisation}'"
    else:
        # "Primary LPA's" and an empty selection both fetch all organisations.
        query = " s.organisation LIKE '%'"

    sql = f"""
        SELECT
            s.organisation AS organisation,
            s.collection AS collection,
            sp.pipeline AS pipeline,
            e.endpoint_url AS endpoint,
            e.endpoint AS endpoint_hash,
            e.entry_date,
            s.end_date AS end_date
        FROM
          log l
          INNER JOIN source s ON l.endpoint = s.endpoint
          INNER JOIN endpoint e ON l.endpoint = e.endpoint
          LEFT JOIN source_pipeline sp ON s.source = sp.source
        WHERE
           ({query})
           AND (NOT collection="brownfield-land")
        GROUP BY
            s.collection,
            sp.pipeline,
            e.endpoint
        ORDER BY
            s.collection,
            sp.pipeline
        """
    params = urllib.parse.urlencode({"sql": sql, "_size": "max"})

    distinct_endpoints_status_and_end_dates_df = pd.read_csv(f"{datasette_url}digital-land.csv?{params}")
    if only_primary:
        is_primary = distinct_endpoints_status_and_end_dates_df['organisation'].isin(primary_lpas)
        distinct_endpoints_status_and_end_dates_df = (
            distinct_endpoints_status_and_end_dates_df[is_primary].reset_index(drop=True)
        )
    return distinct_endpoints_status_and_end_dates_df

def combine_dataframes(organisation):
    """Attach each endpoint's most recent log entry to the distinct-endpoint table.

    Args:
        organisation: Passed through unchanged to
            ``get_distinct_endpoints_status_and_end_dates``.

    Returns:
        pandas.DataFrame: Merge of the two tables on ``endpoint_hash`` using
        pandas' default join type; also stored in the global
        ``distinct_endpoints_status_and_end_dates_with_status_df``.
    """
    global distinct_endpoints_status_and_end_dates_with_status_df
    endpoints = get_distinct_endpoints_status_and_end_dates(organisation)
    latest_logs = get_most_recent_logs()
    distinct_endpoints_status_and_end_dates_with_status_df = endpoints.merge(
        latest_logs,
        left_on=['endpoint_hash'],
        right_on=['endpoint_hash'],
    )
    return distinct_endpoints_status_and_end_dates_with_status_df

# Investigating Datasets with Multiple Endpoint Urls
The `endpoint_aggregate` dataframe counts the number of active endpoints for each organisation dataset, the function below merges this with the `distinct_endpoints_status_and_end_dates_with_status_df` to list all suspect endpoints in more detail, rather than their aggregates.

In [None]:
def get_suspect_endpoints(organisation):
    """List the endpoints that have no end date in groups flagged by the aggregate.

    Merges the detailed endpoint table from ``combine_dataframes`` with the
    counts from ``get_endpoint_aggregate`` on organisation, collection and
    pipeline, then keeps only rows whose ``end_date`` is null.

    Args:
        organisation: Passed through unchanged to both helper functions.

    Returns:
        pandas.DataFrame: The remaining rows without the count and
        ``end_date`` columns, sorted by organisation, collection and pipeline
        and re-indexed from zero; also stored in the global
        ``possible_duplicate_endpoints``.
    """
    global possible_duplicate_endpoints
    keys = ['organisation', 'collection', 'pipeline']

    detailed = combine_dataframes(organisation)
    aggregate = get_endpoint_aggregate(organisation)

    # One aggregate row matches many endpoint rows.
    merged = pd.merge(detailed, aggregate, left_on=keys, right_on=keys)
    # Discard rows without a count, then the count columns themselves.
    merged = merged.dropna(subset=["non_ended_endpoints"])
    merged = merged.drop(["non_ended_endpoints", "ended_endpoints"], axis=1)

    # Rows that carry an end_date are not active and are removed.
    ended_labels = merged[merged.end_date.notnull()].index
    df = (
        merged.drop(ended_labels, axis=0)
        .drop(["end_date"], axis=1)
        .sort_values(by=keys)
        .reset_index(drop=True)
    )
    possible_duplicate_endpoints = df
    return df

# Consider when these endpoints were last successfully accessed
The functions below add the field describing "the timestamp the endpoint was last successfully accessed" (gave a 200-like response) to the `possible_duplicate_endpoints` dataframe generated above; this dataframe is then rendered.


In [None]:
def get_when_successfully_accessed():
    """Load the latest 2xx log date per endpoint into ``last_successfully_accessed``.

    Only log rows whose status matches ``LIKE "2%"`` are considered, the
    ``brownfield-land`` collection is excluded, and rows are grouped by
    endpoint URL.

    Returns:
        pandas.DataFrame: Columns ``last_200_LIKE_response_timestamp`` and
        ``endpoint_hash``; also stored in the global
        ``last_successfully_accessed``.
    """
    global last_successfully_accessed
    sql = """
        SELECT
            MAX(l.entry_date) AS last_200_LIKE_response_timestamp,
            e.endpoint AS endpoint_hash
        FROM
          log l
          INNER JOIN endpoint e ON l.endpoint = e.endpoint
          INNER JOIN source s ON l.endpoint = s.endpoint
        WHERE
            (NOT collection="brownfield-land")
            AND l.status LIKE "2%"
        GROUP BY
            e.endpoint_url
        """
    query_string = urllib.parse.urlencode({"sql": sql, "_size": "max"})
    last_successfully_accessed = pd.read_csv(f"{datasette_url}digital-land.csv?{query_string}")
    return last_successfully_accessed

def append_last_successfully_accessed(organisation):
    """Add the last 2xx timestamp to the suspect-endpoint table.

    Args:
        organisation: Passed through unchanged to ``get_suspect_endpoints``.

    Returns:
        pandas.DataFrame: Merge of the suspect endpoints with the result of
        ``get_when_successfully_accessed`` on ``endpoint_hash`` using pandas'
        default join type; also stored in the global
        ``possible_duplicate_endpoints_last_200``.
    """
    global possible_duplicate_endpoints_last_200
    suspects = get_suspect_endpoints(organisation)
    last_success = get_when_successfully_accessed()
    possible_duplicate_endpoints_last_200 = suspects.merge(
        last_success,
        left_on=['endpoint_hash'],
        right_on=['endpoint_hash'],
    )
    return possible_duplicate_endpoints_last_200

# Hook append_last_successfully_accessed up to the shared organisation dropdown.
widgets.interact(append_last_successfully_accessed, organisation=organisation_dropdown)
# NOTE(review): initial_organisation is not read anywhere in the visible notebook.
initial_organisation = organisation_dropdown.value

In [None]:
download = input("Do you want to download the table? (yes/no): ")

# Ignore surrounding whitespace and accept the short form "y", so that an
# answer such as "yes " or "Y" is not silently treated as "no".
if download.strip().lower() in ("yes", "y"):
    # Table last produced by append_last_successfully_accessed().
    possible_duplicate_endpoints_last_200.to_csv("possible_duplicate_endpoints_last_200.csv", index=False)
    # Confirm the write, as the other download cells in this notebook do.
    print("Query result downloaded as 'possible_duplicate_endpoints_last_200.csv'")

## - Stale Endpoints
Endpoints which were last successfully accessed over **5 days** ago can be assumed faulty, or effectively ended and can be recommended for removal by the standard process https://docs.google.com/document/d/1Xm1frOBY-J4mLfigXuFdeq976cQGghhnt0gbmZleAyc/edit#heading=h.y6u78drjip12.


In [None]:
def get_stale_endpoints(organisation):
    """Return suspect endpoints whose last 2xx response is more than 5 days old.

    An endpoint is kept when its most recent status is not a 2xx status and
    its ``last_200_LIKE_response_timestamp`` is earlier than midnight
    (Europe/London) five days before today.

    Args:
        organisation: Passed through unchanged to
            ``append_last_successfully_accessed``.

    Returns:
        pandas.DataFrame: The stale rows, re-indexed from zero, with
        ``last_200_LIKE_response_timestamp`` parsed to datetimes and
        ``most_recent_status`` as float; also stored in the global
        ``stale_endpoints``.
    """
    global stale_endpoints
    # Midnight today in Europe/London, minus five days.
    five_days_ago_timestamp = pd.to_datetime('today').normalize().tz_localize("Europe/London", ambiguous=True) - pd.Timedelta(days=5)

    # Work on a copy: the frame returned here is also held in the global
    # possible_duplicate_endpoints_last_200 and must not be altered in place.
    candidates = append_last_successfully_accessed(organisation).copy()
    candidates["last_200_LIKE_response_timestamp"] = pd.to_datetime(candidates["last_200_LIKE_response_timestamp"])

    # A status is 2xx when its text starts with "2", matching the
    # `l.status LIKE "2%"` test used by the SQL in this notebook. Testing for
    # a "2" anywhere in the text wrongly treated 502, 429 or 302 as successes.
    status_text = candidates["most_recent_status"].astype(str)
    is_success = status_text.str.startswith("2")
    is_old = candidates["last_200_LIKE_response_timestamp"] < five_days_ago_timestamp

    df = candidates[~is_success & is_old].reset_index(drop=True)
    df["most_recent_status"] = df["most_recent_status"].astype(float)
    stale_endpoints = df
    return df

# Hook get_stale_endpoints up to the shared organisation dropdown.
widgets.interact(get_stale_endpoints, organisation=organisation_dropdown)
# NOTE(review): initial_organisation is not read anywhere in the visible notebook.
initial_organisation = organisation_dropdown.value

In [None]:
download = input("Do you want to download the stale_endpoints table? (yes/no): ")

# Ignore surrounding whitespace and accept the short form "y", so that an
# answer such as "yes " or "Y" is not silently treated as "no".
if download.strip().lower() in ("yes", "y"):
    # stale_endpoints is the table last produced by get_stale_endpoints().
    stale_endpoints.to_csv("stale_endpoints.csv", index=False)
    # Confirm the write, as the other download cells in this notebook do.
    print("Query result downloaded as 'stale_endpoints.csv'")

# Grouping Endpoints then Converting to Lists
The `possible_duplicate_endpoints` dataframe above is grouped by organisation, collection and pipeline, and all unique endpoint urls in each group are placed into a list so that they can be easily looped through. This allows the csv contents of each endpoint to be compared in future, in order to decide which endpoints to keep.

**N.B:** This currently includes the endpoints previously highlighted as stale.


In [None]:
def list_endpoints_per_org_dataset(organisation):
    """Collect the suspect endpoint URLs of each organisation dataset into a list.

    Args:
        organisation: Passed through unchanged to ``get_suspect_endpoints``.

    Returns:
        pandas.DataFrame: One row per organisation, collection and pipeline,
        with an ``endpoint`` column holding the list of endpoint URLs in that
        group; also stored in the global
        ``possible_duplicate_endpoints_aggregate``.
    """
    global possible_duplicate_endpoints_aggregate
    suspects = get_suspect_endpoints(organisation)
    # The status column is not needed for the grouped listing.
    trimmed = suspects.drop(columns=["most_recent_status"])
    urls_by_dataset = trimmed.groupby(["organisation", "collection", "pipeline"])["endpoint"]
    possible_duplicate_endpoints_aggregate = urls_by_dataset.apply(list).reset_index()
    return possible_duplicate_endpoints_aggregate

# Hook list_endpoints_per_org_dataset up to the shared organisation dropdown.
widgets.interact(list_endpoints_per_org_dataset, organisation=organisation_dropdown)
# NOTE(review): initial_organisation is not read anywhere in the visible notebook.
initial_organisation = organisation_dropdown.value

# This can be looped through programmatically to check the contents of the remaining endpoints to be eliminated

In [None]:
download = input("Do you want to download the possible_duplicate_endpoints_aggregate table? (yes/no): ")

# Ignore surrounding whitespace and accept the short form "y", so that an
# answer such as "yes " or "Y" is not silently treated as "no".
if download.strip().lower() in ("yes", "y"):
    # Table last produced by list_endpoints_per_org_dataset().
    possible_duplicate_endpoints_aggregate.to_csv("possible_duplicate_endpoints_aggregate.csv", index=False)
    # Confirm the write, as the other download cells in this notebook do.
    print("Query result downloaded as 'possible_duplicate_endpoints_aggregate.csv'")