This report provides status information on the latest endpoints for a hardcoded list of prioritised LPAs, or for the organisations listed in an input file.

The input file should be called 'organisation_input.csv' and contain one column, 'organisation', that holds the organisation codes for the LPAs to be included in the report.

In [None]:
import urllib
import pandas as pd
import numpy as np
import urllib.parse
import os
%pip install wget
import wget
import requests
import datetime as dt

Download helper utility files from GitHub:

In [None]:
# Fetch the helper module from GitHub only when it is not already next to the
# notebook, then import everything it defines.
util_file = "master_report_endpoint_utils.py"
if not os.path.isfile(util_file):
    url = "https://raw.githubusercontent.com/digital-land/jupyter-analysis/main/service_report/master_report/master_report_endpoint_utils.py"
    wget.download(url)
from master_report_endpoint_utils import *

The default prioritised LPAs are used unless a specific set of LPAs is detected using an 'organisation_input.csv' file in the same directory as this notebook.

In [None]:
# Use the organisations from the input .csv when it exists, otherwise fall
# back to the default prioritised LPAs
input_path = './organisation_input.csv'
if not os.path.isfile(input_path):
    organisation_list = [
        'local-authority-eng:BUC',
        'local-authority-eng:DAC',
        'local-authority-eng:DNC',
        'local-authority-eng:GLO',
        'local-authority-eng:CMD',
        'local-authority-eng:LBH',
        'local-authority-eng:SWK',
        'local-authority-eng:MDW',
        'local-authority-eng:NET',
        'local-authority-eng:BIR',
        'local-authority-eng:CAT',
        'local-authority-eng:EPS',
        'local-authority-eng:BNE',
        'local-authority-eng:GAT',
        'local-authority-eng:GRY',
        'local-authority-eng:KTT',
        'local-authority-eng:SAL',
        'local-authority-eng:TEW',
        'local-authority-eng:WBK',
        'local-authority-eng:DST',
        'local-authority-eng:DOV',
        'local-authority-eng:LIV',
        'local-authority-eng:RDB',
        'local-authority-eng:WFT',
        'local-authority-eng:NLN',
        'local-authority-eng:NSM',
        'local-authority-eng:SLF',
        'local-authority-eng:WRL',
    ]
    print('Input file not found. Using default list of organisations.')
else:
    # The input file must provide an 'organisation' column
    input_df = pd.read_csv(input_path)
    organisation_list = input_df['organisation'].tolist()
    print('Input file found. Using', len(organisation_list), 'organisations from input file.')

Get list of organisation names, to be displayed in the output table. This is gathered separately from the main data, to ensure that if an organisation has not provided any endpoints, it is still included in the output table.

In [None]:
# Get organisation names for output table.
# Organisations that cannot be matched in the reference data are left out of
# organisation_name_dict instead of aborting the report with an IndexError.
organisation_info_df = pd.read_csv('https://raw.githubusercontent.com/digital-land/organisation-collection/main/data/local-authority.csv')

# reference -> name lookup; when a reference is repeated the first row wins
name_by_reference = (
    organisation_info_df
    .drop_duplicates(subset='reference')
    .set_index('reference')['name']
    .to_dict()
)

organisation_name_dict = {}
for organisation in organisation_list:
    # Codes look like '<prefix>:<reference>'; tolerate a code with no prefix
    code_parts = organisation.split(':')
    organisation_code = code_parts[1] if len(code_parts) > 1 else organisation
    if organisation_code in name_by_reference:
        organisation_name_dict[organisation] = name_by_reference[organisation_code]

The latest endpoints are collected for each of the organisations, restricted to the pipelines listed in `pipelines_list` (which cover the datasets in `dataset_list`).

In [None]:
datasette_url = "https://datasette.planning.data.gov.uk/"

# Datasets shown as columns in the output table
dataset_list = [
    'article-4-direction',
    'article-4-direction-area',
    'conservation-area',
    'conservation-area-document',
    'listed-building-outline',
    'tree-preservation-order',
    'tree-preservation-zone',
    'tree',
]
# Values of the 'pipelines' column that are kept (single datasets and
# comma-separated combinations)
pipelines_list = [
    'article-4-direction',
    'article-4-direction-area',
    'article-4-direction,article-4-direction-area',
    'conservation-area',
    'conservation-area-document',
    'conservation-area,conservation-area-document',
    'listed-building-outline',
    'tree-preservation-order',
    'tree-preservation-zone',
    'tree',
    'tree,tree-preservation-order',
    'tree,tree-preservation-zone',
    'tree-preservation-order,tree-preservation-zone',
]

# Collect latest endpoints for each organisation, keyed by organisation code
all_orgs_latest_endpoints = {}
for organisation in organisation_list:
    latest_endpoints_df = get_latest_endpoints(organisation)
    is_tracked_pipeline = latest_endpoints_df['pipelines'].isin(pipelines_list)
    latest_endpoints_df = latest_endpoints_df[is_tracked_pipeline]
    all_orgs_latest_endpoints[organisation] = latest_endpoints_df

For each of these endpoints, the status returned the last time the endpoint was hit is collected. If there is no status (e.g. connection error), the exception is used instead.
If an endpoint contributes to two datasets, it will only be considered for a dataset if it is the newest endpoint for that dataset, calculated independently.

In [None]:
# Remove the column width limit so cell contents are not truncated when DataFrames are displayed
pd.set_option('display.max_colwidth', None)

def compute_cell_colour(status):
    """Return the CSS background rule for a status cell of the output table.

    "200" is shown in green, 'No endpoint' in orange and anything else in red.
    """
    if status == 'No endpoint':
        return 'background-color: orange'
    return 'background-color: green' if status == "200" else 'background-color: red'
    
def cut_zeros(row):
    """Strip a trailing '.0' from a value, returning it unchanged otherwise."""
    return row[:-2] if row[-2:] == '.0' else row

# Only display non 200 statuses if they have been non 200 for at least 5 days
def compute_displayed_status(row):
    """Return the status to display for one endpoint row.

    Args:
        row: a mapping / pandas Series providing 'last_status',
            'last_updated_date', 'date_last_status_200', 'maxentrydate',
            'status' and 'exception'.

    Returns:
        200 when 'last_status' is None or when fewer than 5 days separate
        'maxentrydate' from the reference date. Otherwise the row's own
        status: as an int when it is numeric, as-is when it is not (e.g.
        'Cannot find layer'). When the status is missing, the row's exception
        is returned, or "Unknown Error" when that is missing too.
    """
    # Pick the date against which the age of the current status is measured
    if row["last_status"] == 200:
        last_200_date = pd.to_datetime(row["last_updated_date"])
    elif row["last_status"] is not None:
        last_200_date = pd.to_datetime(row["date_last_status_200"])
    else:
        # No previous status recorded: report the endpoint as 200
        return 200

    days_since_200 = (pd.to_datetime(row["maxentrydate"]) - last_200_date).days
    # Written as ">= 5 else 200" on purpose so a non-comparable day count
    # still falls through to 200
    if days_since_200 >= 5:
        status = row['status']
        if pd.isna(status):
            # No status was recorded: use the exception of this same row
            exception = row.get('exception')
            return "Unknown Error" if pd.isna(exception) else exception
        try:
            return int(status)
        except (TypeError, ValueError):
            # Non-numeric status text is shown unchanged
            return status
    else:
        return 200


# Build one output row per organisation, holding the displayed status of each dataset.

# Seconds to wait for a WFS endpoint before giving up on the layer probe
wfs_probe_timeout = 30

rows_list = []
organisation_dataset_statuses_dict = {}
for organisation in organisation_list:
    latest_endpoints_df = all_orgs_latest_endpoints[organisation]
    # Keep only endpoints that have no end date
    latest_endpoints_df = latest_endpoints_df[pd.isna(latest_endpoints_df['end_date'])]
    # Fall back to the organisation code when no display name was collected
    name = organisation_name_dict.get(organisation, organisation)

    dataset_statuses_dict = {}
    for index, row in latest_endpoints_df.iterrows():
        if 'WFS' in row['endpoint_url']:
            # Probe the start of the response body for a "Cannot find layer"
            # message. A failed probe must not abort the report, so the
            # recorded status is kept in that case.
            try:
                with requests.get(row['endpoint_url'], stream=True, timeout=wfs_probe_timeout) as response:
                    first_chunk = next(response.iter_content(chunk_size=1024), b'')
                # The 1024-byte cut can split a multi-byte character
                content = first_chunk.decode('utf-8', errors='replace')
            except requests.exceptions.RequestException:
                content = ''
            if 'Cannot find layer' in content:
                row['status'] = 'Cannot find layer'
        # 'pipelines' is a comma-separated list of the datasets fed by this endpoint
        for dataset in row['pipelines'].split(','):
            # Consider cases where a dataset is contributed to by multiple endpoints
            same_datasets_df = latest_endpoints_df[
                latest_endpoints_df["pipelines"].apply(lambda x: dataset in x.split(','))
            ]
            if len(same_datasets_df) > 1:
                # handle_skip_dataset comes from the helper module;
                # NOTE(review): presumably it keeps only the newest endpoint per dataset - confirm
                skip_dataset = handle_skip_dataset(same_datasets_df, dataset, row)
            else:
                skip_dataset = False

            if not skip_dataset:
                dataset_statuses_dict[dataset] = compute_displayed_status(row)
    organisation_dataset_statuses_dict[organisation] = dataset_statuses_dict

    new_row = {'organisation': name}
    new_row.update(dataset_statuses_dict)
    rows_list.append(new_row)

# Assemble the report table: one row per organisation, one column per dataset.
# Datasets without a collected status are labelled "No endpoint", every value
# is rendered as text and a trailing '.0' is trimmed.
output_df = (
    pd.DataFrame(rows_list, columns=['organisation', *dataset_list])
    .replace(np.nan, "No endpoint")
    .astype(str)
    .applymap(cut_zeros)
)

output_df.to_csv('endpoint_status_master_report.csv', index=False)

# Last expression of the cell: display the table with colour-coded status cells
output_df.style.applymap(compute_cell_colour, subset=dataset_list)

An output .csv under the name 'endpoint_status_not_200.csv' is created, containing the latest endpoints that do not have a status of 200

In [None]:
# Write 'endpoint_status_not_200.csv' using the produce_output_csv helper,
# called with the "status" column and the value 200
not_200_output_columns = [
    'name',
    'pipelines',
    'endpoint_url',
    'organisation',
    'collection',
    'maxentrydate',
    'entrydate',
    'end_date',
    'last_status',
    'last_updated_date',
]

not_200_output_df = produce_output_csv(
    all_orgs_latest_endpoints,
    organisation_dataset_statuses_dict,
    "status",
    200,
    not_200_output_columns,
)
not_200_output_df.to_csv('endpoint_status_not_200.csv', index=False)

In [None]:
# Write 'all_status_output.csv' using the produce_output_csv helper, called
# with the "status" column and an empty value
all_status_output_columns = [
    'name',
    'pipelines',
    'endpoint_url',
    'organisation',
    'collection',
    'maxentrydate',
    'entrydate',
    'end_date',
    'last_status',
    'last_updated_date',
]

all_status_output_df = produce_output_csv(
    all_orgs_latest_endpoints,
    organisation_dataset_statuses_dict,
    "status",
    "",
    all_status_output_columns,
)
all_status_output_df.to_csv('all_status_output.csv', index=False)