This report shows how well the latest endpoints comply with the specification, either for a hardcoded list of prioritised LPAs or for a list of organisations supplied in an input file.

The column 'structure_score' tells us how much data an endpoint is giving us as a fraction of what we ask for. The column 'column_name_score' tells us how many columns are correctly named.

Example: a column whose name is incorrect (e.g. 'area' instead of 'geometry') but whose data has been detected as correct will count towards the 'structure_score' column but not the 'column_name_score' column.

The input file should be called 'organisation_input.csv' and contain one column, 'organisation', which holds the organisation codes for the LPAs to be included in the report.

In [None]:
%pip install wget
import wget
import pandas as pd
import os


Download helper utility files from GitHub:

In [None]:
util_file = "master_report_endpoint_utils.py"
# Fetch the helper module from GitHub only when it is not already in the
# working directory; either way it is imported afterwards.
if not os.path.isfile(util_file):
    url = "https://raw.githubusercontent.com/digital-land/jupyter-analysis/main/service_report/master_report/master_report_endpoint_utils.py"
    wget.download(url)
from master_report_endpoint_utils import *

The default prioritised LPAs are used unless a specific set of LPAs is detected using an 'organisation_input.csv' file in the same directory as this notebook.

In [None]:
# Use the organisations from the input .csv when it exists; otherwise fall
# back to the default prioritised LPAs.
input_path = './organisation_input.csv'
if not os.path.isfile(input_path):
    organisation_list = [
        'local-authority-eng:BUC',
        'local-authority-eng:DAC',
        'local-authority-eng:DNC',
        'local-authority-eng:GLO',
        'local-authority-eng:CMD',
        'local-authority-eng:LBH',
        'local-authority-eng:SWK',
        'local-authority-eng:MDW',
        'local-authority-eng:NET',
        'local-authority-eng:BIR',
        'local-authority-eng:CAT',
        'local-authority-eng:EPS',
        'local-authority-eng:BNE',
        'local-authority-eng:GAT',
        'local-authority-eng:GRY',
        'local-authority-eng:KTT',
        'local-authority-eng:SAL',
        'local-authority-eng:TEW',
        'local-authority-eng:WBK',
        'local-authority-eng:DST',
        'local-authority-eng:DOV',
        'local-authority-eng:LIV',
        'local-authority-eng:RDB',
        'local-authority-eng:WFT',
        'local-authority-eng:NLN',
        'local-authority-eng:NSM',
        'local-authority-eng:SLF',
        'local-authority-eng:WRL',
    ]
    print('Input file not found. Using default list of organisations.')
else:
    input_df = pd.read_csv(input_path)
    organisation_list = input_df['organisation'].to_list()
    print('Input file found. Using', len(organisation_list), 'organisations from input file.')

In [None]:
def check_columns_in_endpoint(fields, dataset_field_df, column_field_df, dataset):
    """Score an endpoint's fields against the specification for a dataset.

    Args:
        fields: Field names found in the endpoint's processed data.
        dataset_field_df: Specification rows for ``dataset``; its 'field'
            column lists the fields the specification asks for.
        column_field_df: Column mappings for the resource, with 'column'
            (name as supplied) and 'field' (name it was mapped to) columns.
        dataset: Dataset name, forwarded to ``remove_assigned_columns``.

    Returns:
        A tuple ``(structure_score, structure_percentage, column_score,
        column_percentage)``. The scores are "n/total" strings and the
        percentages are floats in the range 0-100. When no specification
        columns remain to be scored, ``("0/0", 0.0, "0/0", 0.0)`` is
        returned instead of dividing by zero.
    """
    dataset_columns = dataset_field_df['field'].tolist()
    # Remove automatically assigned columns by the pipeline from scoring
    dataset_columns = remove_assigned_columns(dataset, dataset_columns)

    expected_count = len(dataset_columns)
    if expected_count == 0:
        # Nothing to score against; avoid a ZeroDivisionError below.
        return "0/0", 0.0, "0/0", 0.0

    # Specification columns that are present in the endpoint
    present_columns = [column for column in dataset_columns if column in fields]
    present_count = len(present_columns)
    structure_score = f"{present_count}/{expected_count}"
    structure_percentage = present_count / expected_count * 100

    # The WKT column is removed from the column_field mapping as it is
    # autogenerated by the pipeline for some file formats (e.g geojson)
    filtered_columns = ["WKT"]
    column_field_df = column_field_df[~column_field_df['column'].isin(filtered_columns)]

    # Remember the first source column mapped to each field, so the loop
    # below does not have to re-filter the dataframe for every field.
    first_column_for_field = {}
    for column, field in zip(column_field_df['column'], column_field_df['field']):
        first_column_for_field.setdefault(field, column)

    # A field is correctly named when it needed no mapping at all, or when
    # the column it was mapped from already carried the field's name.
    correct_column_names = sum(
        1
        for field in present_columns
        if field not in first_column_for_field or first_column_for_field[field] == field
    )

    column_score = f"{correct_column_names}/{expected_count}"
    column_percentage = correct_column_names / expected_count * 100

    return structure_score, structure_percentage, column_score, column_percentage


def get_fields_for_resource(resource, dataset):
    """Return the distinct fact fields recorded for a resource.

    Queries the ``dataset`` database on the planning data datasette
    instance (as CSV) and returns the values of the 'field' column as a
    list.
    """
    query = f"""
        select f.field 
        from 
            fact_resource fr
            inner join fact f on fr.fact = f.fact
        where 
            resource = '{resource}'
        group by
            f.field
        """
    query_string = urllib.parse.urlencode({"sql": query, "_size": "max"})
    datasette_url = "https://datasette.planning.data.gov.uk/"
    facts_df = pd.read_csv(f"{datasette_url}{dataset}.csv?{query_string}")
    return facts_df['field'].tolist()

def get_column_mappings_for_resource(resource, dataset):
    """Return the column-to-field mappings recorded for a resource.

    Queries the ``column_field`` table of the ``dataset`` database on the
    planning data datasette instance (as CSV) and returns the result as a
    dataframe with 'column' and 'field' columns.
    """
    query = f"""
        select column, field
        from 
          column_field  
        where 
            resource = '{resource}'
        """
    query_string = urllib.parse.urlencode({"sql": query, "_size": "max"})
    datasette_url = "https://datasette.planning.data.gov.uk/"
    return pd.read_csv(f"{datasette_url}{dataset}.csv?{query_string}")

def remove_assigned_columns(dataset, dataset_columns):
    """Drop pipeline-generated columns from a list of specification columns.

    'entity', 'organisation' and 'prefix' are always dropped; 'point' is
    dropped for every dataset except "tree". Columns that are not in the
    list are ignored rather than raising ``ValueError``.

    Args:
        dataset: Name of the dataset the columns belong to.
        dataset_columns: List of column names; it is modified in place.

    Returns:
        The same ``dataset_columns`` list, with the assigned columns removed.
    """
    # These columns are auto generated by the pipeline therefore not used in the scoring
    assigned_columns = ['entity', 'organisation', 'prefix']
    if dataset != "tree":
        assigned_columns.append('point')
    for column in assigned_columns:
        if column in dataset_columns:
            dataset_columns.remove(column)
    return dataset_columns

Get list of organisation names, to be displayed in the output table. This is gathered separately from the main data, to ensure that if an organisation has not provided any endpoints, it is still included in the output table.

In [None]:
# Get organisation names for output table
organisation_info_df = pd.read_csv('https://raw.githubusercontent.com/digital-land/organisation-collection/main/data/local-authority.csv')

# Index the register by reference once (keeping the first row for each
# reference) instead of filtering the dataframe for every organisation.
name_by_reference = {}
for reference, name in zip(organisation_info_df['reference'], organisation_info_df['name']):
    name_by_reference.setdefault(reference, name)

organisation_name_dict = {}
for organisation in organisation_list:
    # Organisation identifiers look like '<prefix>:<reference>'
    organisation_parts = organisation.split(':')
    organisation_code = organisation_parts[1] if len(organisation_parts) > 1 else organisation
    if organisation_code not in name_by_reference:
        # An input file may contain organisations missing from the
        # local-authority register; keep them in the report under their
        # identifier rather than stopping with an IndexError.
        print('Organisation name not found for', organisation)
    organisation_name = name_by_reference.get(organisation_code, organisation)
    organisation_name_dict[organisation] = organisation_name

The latest endpoints are collected for each of the organisations, restricted to the datasets covered by this report (see `dataset_list` and `pipelines_list` in the cell below).

In [None]:
datasette_url = "https://datasette.planning.data.gov.uk/"

# Datasets covered by the report
dataset_list = [
    'article-4-direction',
    'article-4-direction-area',
    'conservation-area',
    'conservation-area-document',
    'listed-building-outline',
    'tree-preservation-order',
    'tree-preservation-zone',
    'tree',
]
# Pipeline values to keep: every single dataset, plus the combined
# pipelines of endpoints that feed two datasets at once
pipelines_list = [
    *dataset_list,
    'tree,tree-preservation-order',
    'tree-preservation-order,tree-preservation-zone',
]

# Collect latest endpoints for each organisation, keeping only in-scope pipelines
all_orgs_latest_endpoints = {}
for organisation in organisation_list:
    latest_endpoints_df = get_latest_endpoints(organisation)
    in_scope = latest_endpoints_df['pipelines'].isin(pipelines_list)
    latest_endpoints_df = latest_endpoints_df[in_scope]
    all_orgs_latest_endpoints[organisation] = latest_endpoints_df

For each of these endpoints, the relevant schema for the dataset is downloaded to compare the endpoint columns against.

'Structure score' is the number of columns in the processed data that match the schema, divided by the number of columns in the schema. Note that if there is no data at all in a field, it cannot be detected as a structure match.

'Column name score' is the number of columns in the processed data that had matching column names to the schema before any processing happened (ie no column mapping had to take place). Note that if there is no data at all in a field, it cannot be detected as a column name match.

If an endpoint contributes to two datasets, it will only be considered for a dataset if it is the newest endpoint for that dataset, calculated independently.

In [None]:
def compute_cell_colour(value):
    """Return the CSS background colour for a percentage cell.

    ``value`` is a string such as "80%". 75 and above is green, 50-74 is
    orange, 0-49 is red and anything below 0 is brown. Values that do not
    contain "%" get no style (``None``).
    """
    if "%" not in value:
        return None

    percentage = int(value.replace("%", ""))
    # Thresholds are checked from highest to lowest; the first one met wins
    colour_bands = (
        (75, 'background-color: green'),
        (50, 'background-color: orange'),
        (0, 'background-color: red'),
    )
    for lower_bound, style in colour_bands:
        if percentage >= lower_bound:
            return style
    return 'background-color: brown'

# Download the specification once: it is identical for every endpoint, so
# fetching it inside the loops only repeats the same network request.
specification_df = pd.read_csv('https://raw.githubusercontent.com/digital-land/specification/main/specification/dataset-field.csv')

organisation_dataset_compliance_dict = {}
rows_list = []      # rows shown in the styled notebook table
csv_rows_list = []  # same rows plus endpoint_url and resource, written to compliance.csv
for organisation in organisation_list:
    latest_endpoints_df = all_orgs_latest_endpoints[organisation]
    dataset_compliance_dict = {}
    for _, row in latest_endpoints_df.iterrows():
        resource = row['resource']
        # An endpoint can feed several datasets, listed comma separated;
        # split() gives a one-item list when there is no comma.
        datasets = row['pipelines'].split(',')
        for dataset in datasets:
            # All of this organisation's endpoints that contribute to the dataset
            same_datasets_df = latest_endpoints_df[latest_endpoints_df["pipelines"].apply(lambda x: dataset in x.split(','))]
            # When several endpoints contribute, the utils helper decides
            # whether this row should be left out for the dataset.
            if len(same_datasets_df) > 1 and handle_skip_dataset(same_datasets_df, dataset, row):
                continue

            dataset_field_df = specification_df[specification_df['dataset'] == dataset]
            column_field_df = get_column_mappings_for_resource(resource, dataset)
            fields = get_fields_for_resource(resource, dataset)
            structure_score, structure_percentage, column_score, column_percentage = check_columns_in_endpoint(fields, dataset_field_df, column_field_df, dataset)
            overall_percentage = (structure_percentage + column_percentage) / 2
            dataset_compliance_dict[dataset] = {
                "structure_score": structure_score,
                "structure_percentage": structure_percentage,
                "column_score": column_score,
                "column_name_percentage": column_percentage,
            }
            new_row = {
                'organisation': organisation_name_dict[organisation],
                'dataset': dataset,
                'structure_score': structure_score,
                'structure_percentage': f"{int(structure_percentage)}%",
                'column_name_score': column_score,
                'column_name_percentage': f"{int(column_percentage)}%",
                'overall_percentage': f"{int(overall_percentage)}%",
            }
            rows_list.append(new_row)
            csv_row = new_row.copy()
            csv_row['endpoint_url'] = row['endpoint_url']
            csv_row['resource'] = row['resource']
            csv_rows_list.append(csv_row)

    organisation_dataset_compliance_dict[organisation] = dataset_compliance_dict


compliance_df = pd.DataFrame(rows_list)
output_df = pd.DataFrame(csv_rows_list)
output_df.to_csv('compliance.csv', index=False)
# Styler.applymap was renamed to Styler.map in pandas 2.1; use the new name
# when it exists and fall back to the old one on earlier versions.
compliance_styler = compliance_df.style
colour_cells = getattr(compliance_styler, "map", None) or compliance_styler.applymap
colour_cells(compute_cell_colour, subset=["structure_percentage", "column_name_percentage", "overall_percentage"])