This report provides issue/data quality information on the latest endpoints for a hardcoded list of prioritised LPAs, or for a list of organisations supplied in an input file.

The input file should be called 'organisation_input.csv' and contain one column, 'organisation', holding the organisation codes of the LPAs to be included in the report.

In [None]:
import numpy as np
import pandas as pd
import os
%pip install wget
import wget

Download helper utility files from GitHub:

In [None]:
# Make sure the shared helper module is available next to the notebook:
# download it from GitHub only when no local copy exists, then import
# every helper it defines into the notebook namespace.
util_file = "master_report_endpoint_utils.py"
if not os.path.isfile(util_file):
    url = "https://raw.githubusercontent.com/digital-land/jupyter-analysis/main/service_report/master_report/master_report_endpoint_utils.py"
    wget.download(url)
from master_report_endpoint_utils import *


The default prioritised LPAs are used unless a specific set of LPAs is detected using an 'organisation_input.csv' file in the same directory as this notebook.

In [None]:
# Choose the organisations to report on: an 'organisation' column read from
# ./organisation_input.csv when that file exists, otherwise the built-in
# list of prioritised LPAs.
input_path = './organisation_input.csv'
if not os.path.isfile(input_path):
    # Default prioritised LPAs, stored as bare codes and expanded to the
    # full 'local-authority-eng:<code>' identifiers.
    organisation_list = [
        f'local-authority-eng:{code}'
        for code in (
            'BUC', 'DAC', 'DNC', 'GLO', 'CMD', 'LBH', 'SWK',
            'MDW', 'NET', 'BIR', 'CAT', 'EPS', 'BNE', 'GAT',
            'GRY', 'KTT', 'SAL', 'TEW', 'WBK', 'DST', 'DOV',
            'LIV', 'RDB', 'WFT', 'NLN', 'NSM', 'SLF', 'WRL',
        )
    ]
    print('Input file not found. Using default list of organisations.')
else:
    input_df = pd.read_csv(input_path)
    organisation_list = input_df['organisation'].tolist()
    print('Input file found. Using', len(organisation_list), 'organisations from input file.')

Get list of organisation names, to be displayed in the output table. This is gathered separately from the main data, to ensure that if an organisation has not provided any endpoints, it is still included in the output table.

In [None]:
# Get organisation names for output table
organisation_info_df = pd.read_csv('https://raw.githubusercontent.com/digital-land/organisation-collection/main/data/local-authority.csv')

# Build the reference -> name lookup once instead of filtering the whole
# register for every organisation. Keeping the first row per reference
# matches the previous `.iloc[0]` choice when a reference is repeated.
reference_to_name = (
    organisation_info_df
    .drop_duplicates(subset='reference', keep='first')
    .set_index('reference')['name']
    .to_dict()
)

organisation_name_dict = {}
for organisation in organisation_list:
    # Organisation identifiers look like '<prefix>:<reference>'; tolerate a
    # bare reference with no prefix rather than raising IndexError.
    code_parts = organisation.split(':')
    organisation_code = code_parts[1] if len(code_parts) > 1 else code_parts[0]
    if organisation_code not in reference_to_name:
        # Not in the local-authority register: leave it out of the dict so
        # the report can fall back to showing the raw organisation code,
        # instead of aborting the whole notebook.
        print('No name found for organisation', organisation, '- using its code instead.')
        continue
    organisation_name_dict[organisation] = reference_to_name[organisation_code]

The latest endpoints are collected for each of the organisations, restricted to the datasets listed in the cell below.

In [None]:
datasette_url = "https://datasette.planning.data.gov.uk/"

# Datasets covered by the report.
dataset_list = [
    'article-4-direction',
    'article-4-direction-area',
    'conservation-area',
    'conservation-area-document',
    'listed-building-outline',
    'tree-preservation-order',
    'tree-preservation-zone',
    'tree',
]

# Accepted values of an endpoint's 'pipelines' field: a single dataset, or a
# comma-joined pair when one endpoint feeds two datasets.
pipelines_list = [
    'article-4-direction',
    'article-4-direction-area',
    'article-4-direction,article-4-direction-area',
    'conservation-area',
    'conservation-area-document',
    'conservation-area,conservation-area-document',
    'listed-building-outline',
    'tree-preservation-order',
    'tree-preservation-zone',
    'tree',
    'tree,tree-preservation-order',
    'tree,tree-preservation-zone',
    'tree-preservation-order,tree-preservation-zone',
]

# Collect latest endpoints for each organisation, keeping only the rows
# whose pipelines value is one of the accepted combinations above.
all_orgs_latest_endpoints = {}
for organisation in organisation_list:
    org_endpoints = get_latest_endpoints(organisation)
    latest_endpoints_df = org_endpoints[org_endpoints['pipelines'].isin(pipelines_list)]
    all_orgs_latest_endpoints[organisation] = latest_endpoints_df

For each of these endpoints, the associated issues are collected. Any issues with severity 'info' are ignored. 
If an endpoint contributes to two datasets, it will only be considered for a dataset if it is the newest endpoint for that dataset, calculated independently.

In [None]:
# Build, per organisation, a mapping of dataset -> issue summary, where the
# summary is either 'No issues' or a comma-separated list of issue types.
organisation_dataset_issues_dict = {}
info_issue_types = get_issue_types_with_severity_info()
# Materialise the issue types to ignore once.
# NOTE(review): assumes info_issue_types is a re-iterable collection - confirm.
ignored_issue_types = list(info_issue_types)
for organisation in organisation_list:
    latest_endpoints_df = all_orgs_latest_endpoints[organisation]
    dataset_issues_dict = {}
    for index, row in latest_endpoints_df.iterrows():
        resource = row['resource']
        # str.split gives a one-element list when there is no comma, so
        # single-dataset and multi-dataset endpoints share one code path.
        datasets = row['pipelines'].split(',')
        for dataset in datasets:
            # All of this organisation's endpoints that feed the same dataset.
            same_datasets_df = latest_endpoints_df[
                latest_endpoints_df["pipelines"].apply(
                    lambda pipelines: dataset in pipelines.split(',')
                )
            ]
            # When several endpoints feed this dataset, handle_skip_dataset
            # decides whether this row should be passed over.
            skip_dataset = len(same_datasets_df) > 1 and handle_skip_dataset(same_datasets_df, dataset, row)
            if skip_dataset:
                continue

            issues_df = get_issues_for_resource(resource, dataset)
            issues_df.drop_duplicates(subset='issue_type', keep='first', inplace=True)
            # Keep one entry per issue type, leaving out the ignored ones.
            issues = [
                issue_type
                for issue_type in issues_df['issue_type'].values.tolist()
                if issue_type not in ignored_issue_types
            ]
            dataset_issues_dict[dataset] = ', '.join(issues) if issues else 'No issues'
        # Registered inside the row loop, so an organisation with no endpoint
        # rows never gets an entry in organisation_dataset_issues_dict.
        organisation_dataset_issues_dict[organisation] = dataset_issues_dict
    


In [None]:
# Show full cell contents instead of truncating long issue lists.
pd.set_option('display.max_colwidth', None)


def compute_cell_colour(value):
    """Return the CSS background rule for one report cell.

    "No issues" maps to green, "No endpoint" maps to orange, and any other
    value maps to red.
    """
    known_colours = (("No issues", "green"), ("No endpoint", "orange"))
    colour = next(
        (shade for label, shade in known_colours if value == label),
        "red",
    )
    return 'background-color: ' + colour

# Build one table row per organisation: its display name followed by the
# issue summary recorded for each dataset.
rows_list = []
for organisation in organisation_list:
    # Fall back to the raw organisation code when no display name is known.
    name = organisation_name_dict.get(organisation, organisation)

    new_row = {'organisation': name}
    # Organisations without an entry contribute no dataset values; those
    # cells become NaN in the DataFrame and are relabelled below.
    new_row.update(organisation_dataset_issues_dict.get(organisation, {}))
    rows_list.append(new_row)

output_df = pd.DataFrame(rows_list, columns=['organisation', *dataset_list])
output_df = output_df.replace(np.nan, "No endpoint")
output_df.to_csv('endpoint_issues_master_report.csv', index=False)

# Styler.applymap was renamed to Styler.map in pandas 2.1; use the new name
# when it exists and keep working on older pandas versions.
table_styler = output_df.style
colour_cells = table_styler.map if hasattr(table_styler, 'map') else table_styler.applymap
output_df = colour_cells(compute_cell_colour, subset=dataset_list)
output_df

An output .csv named 'endpoint_issues_has_issues.csv' is created for the latest endpoints that have issues.

In [None]:
# Create output csv containing endpoints with issues
has_issues_output_columns = ['name', 'pipelines', 'endpoint_url', 'organisation', 'collection', 'maxentrydate', 'entrydate', 'end_date', 'last_status', 'last_updated_date']

has_issues_output_df = produce_output_csv(all_orgs_latest_endpoints, organisation_dataset_issues_dict, "issues", "No issues", has_issues_output_columns)
has_issues_output_df.to_csv('endpoint_issues_has_issues.csv', index=False)