This report provides compliance to specification information on the most latest endpoints for a hardcoded list of prioritised list of LPAs, or organisations from an input.

The column 'structure_score' tells us how much data an endpoint is giving us as a fraction of what we ask for. The column 'column_name_score' tells us how many columns are correctly named.

Example: a column name that is incorrect (e.g 'area' instead of 'geometry') but the data in it has been detected as correct data will score in the 'structure_score' column but not the 'column_name' column

The input should be called 'organisation_input.csv' and contain one column, 'organisation' that has the organisation codes for the LPAs to be included in the report.

In [5]:
# %pip install wget
import wget
import pandas as pd
import os
import numpy as np
import urllib


Download helper utility files from GitHub:

In [8]:
util_file = "master_report_endpoint_utils.py"
if os.path.isfile(util_file):
    from master_report_endpoint_utils import *
else:
    url = "https://raw.githubusercontent.com/digital-land/jupyter-analysis/main/service_report/master_report/master_report_endpoint_utils.py"
    wget.download(url)
    from master_report_endpoint_utils import *

The default prioritised LPAs are used unless a specific set of LPAs is detected using an 'organisation_input.csv' file in the same directory as this notebook.

In [9]:
# Get input from .csv or use default prioritised LPAs
input_path = './organisation_input.csv'
if os.path.isfile(input_path):
    input_df = pd.read_csv(input_path)
    organisation_list = input_df['organisation'].tolist()
    print('Input file found. Using', len(organisation_list), 'organisations from input file.')
else:
    provision_df = get_provisions()
    organisation_list = provision_df["organisation"].str.replace(":","-eng:")
    print('Input file not found. Using default list of organisations.')

Input file not found. Using default list of organisations.


In [1]:
def get_endpoint_resource_data():
    datasette_url = "https://datasette.planning.data.gov.uk/"
  
    params = urllib.parse.urlencode({
        "sql": f"""
        select
            e.endpoint_url,
            l.endpoint,
            l.status,
            l.exception,
            s.collection,
            l.resource,
            sp.pipeline,
            s.organisation,
            o.name,
            l.entry_date as log_entry_date,
            e.entry_date as endpoint_entry_date,
            e.end_date as endpoint_end_date,
            r.start_date as resource_start_date,
            r.end_date as resource_end_date
        from
            most_recent_log l
            inner join source s on l.endpoint = s.endpoint
            inner join endpoint e on l.endpoint = e.endpoint
            inner join organisation o on o.organisation = replace(s.organisation, '-eng', '')
            inner join source_pipeline sp on s.source = sp.source
            left join resource r on l.resource = r.resource
        where
            sp.pipeline IN ('article-4-direction', 'article-4-direction-area', 'conservation-area', 'conservation-area-document', 'listed-building-outline', 'tree-preservation-order', 'tree-preservation-zone', 'tree')

        order by s.organisation, sp.pipeline, log_entry_date desc
        """,
        "_size": "max"
    })
    
    url = f"{datasette_url}digital-land.csv?{params}"
    df = pd.read_csv(url)
    return df

def get_fields_for_resource(resource, dataset):
    datasette_url = "https://datasette.planning.data.gov.uk/"
    params = urllib.parse.urlencode({
        "sql": f"""
        select f.field, fr.resource
        from 
            fact_resource fr
            inner join fact f on fr.fact = f.fact
        where 
            resource = '{resource}'
        group by
            f.field
        """,
        "_size": "max"
    })
    url = f"{datasette_url}{dataset}.csv?{params}"
    facts_df = pd.read_csv(url)
    # facts_list = facts_df['field'].tolist()
    return facts_df

def get_column_mappings_for_resource(resource, dataset):
    datasette_url = "https://datasette.planning.data.gov.uk/"
    params = urllib.parse.urlencode({
        "sql": f"""
        select column, field
        from 
          column_field  
        where 
            resource = '{resource}'
        """,
        "_size": "max"
    })
    url = f"{datasette_url}{dataset}.csv?{params}"
    column_field_df = pd.read_csv(url)
    return column_field_df



## Get endpoint data

In [31]:
# get data from datasette
# endpoint_resource_df = get_endpoint_resource_data()

# filter to org_list, valid, active endpoints and resources
endpoint_resource_filtered_df = endpoint_resource_df[
    (endpoint_resource_df["organisation"].isin(organisation_list)) &
    (endpoint_resource_df["status"] == 200) &
    (endpoint_resource_df["endpoint_end_date"].isnull()) &
    (endpoint_resource_df["resource_end_date"].isnull())
].copy()

print(len(endpoint_resource_df))
print(len(endpoint_resource_filtered_df))

print(len(endpoint_resource_filtered_df[["endpoint", "pipeline"]].drop_duplicates()))
print(len(endpoint_resource_filtered_df[["resource"]].drop_duplicates()))
print(len(endpoint_resource_filtered_df[["endpoint"]].drop_duplicates()))

204
80
80
73
75


## Get field and col mapping data

In [None]:
# table of unique resources and pipelines
resource_df = endpoint_resource_filtered_df[["pipeline", "resource"]].drop_duplicates().dropna(axis = 0)
print(len(resource_df))

78


In [15]:
# generic function to try the resource datasette queries 
# will return a df with resource and dataset fields as keys, and query results as other fields
def try_results(function, resource, dataset):

    # try grabbing results
    try:
        df = function(resource, dataset)

        # if empty response give NaNs
        if len(df) == 0:
            df = pd.DataFrame({"field" : [np.nan]
            })

        df["resource"] = resource
        df["dataset"] = dataset

    # if error record resource and dataset
    except:
        df = pd.DataFrame({"resource" : [resource],
                           "dataset" : [dataset]
        })

    return df


# get results for col mappings and fields in arrays
results_col_map = [try_results(get_column_mappings_for_resource, r["resource"], r["pipeline"]) for index, r in resource_df.iterrows()]
results_field_resource = [try_results(get_fields_for_resource, r["resource"], r["pipeline"]) for index, r in resource_df.iterrows()]

# concat the results, resources which errored with have NaNs in query results fields
results_col_map_df = pd.concat(results_col_map)
results_field_resource_df = pd.concat(results_field_resource)

# no. of resources in each query response array
print(len(results_col_map))
print(len(results_field_resource))

# no of records in each results df
print(len(results_col_map_df))
print(len(results_field_resource_df))


78
78
571
541


In [35]:
# add in match field for column mappings 
results_col_map_df["field_matched"] = np.where(
        (results_col_map_df["field"].isin(["geometry", "point"])) |
        (results_col_map_df["field"] == results_col_map_df["column"]),
        1, 
        0
)

# add in flag for fields supplied (i.e. they're in the mapping table)
results_col_map_df["field_supplied"] = 1

# add in flag for fields present
results_field_resource_df["field_loaded"] = 1

## Calculating match rates

In [16]:
dataset_field_df = pd.read_csv('https://raw.githubusercontent.com/digital-land/specification/main/specification/dataset-field.csv')

# remove the pipeline-created fields from the spec field table
# ("entity", "organisation", "prefix", "point" for all but tree, and "entity", "organisation", "prefix" for tree)
dataset_field_subset_df = dataset_field_df[
    ((dataset_field_df["dataset"] != "tree") & (~dataset_field_df["field"].isin(["entity", "organisation", "prefix", "point"])) |
     (dataset_field_df["dataset"] == "tree") & (~dataset_field_df["field"].isin(["entity", "organisation", "prefix"])))
]

dataset_field_subset_df.head()

Unnamed: 0,dataset,field,field-dataset,guidance,hint
0,address,address,,,
1,address,address-text,,,
2,address,end-date,,,
4,address,entry-date,,,
5,address,latitude,,,


In [33]:
# rename pipeline to dataset in endpoint_resource table
endpoint_resource_filtered_df.rename(columns={"pipeline":"dataset"}, inplace=True)

# left join from endpoint resource table to all the fields that each dataset should have
resource_spec_fields_df = endpoint_resource_filtered_df[
    ["organisation", "name", "dataset", "endpoint", "status", "log_entry_date", "endpoint_entry_date", "resource"]
    ].merge(
        dataset_field_subset_df[["dataset", "field"]],
        on = "dataset"
)

print(len(resource_spec_fields_df))
resource_spec_fields_df.head()

1011


Unnamed: 0,organisation,name,dataset,endpoint,status,log_entry_date,endpoint_entry_date,resource,field
0,local-authority-eng:BIR,Birmingham City Council,article-4-direction-area,2d9575d771afff89f6d731be59a1ff8cedfd99efcd8bb2...,200.0,2024-03-05T00:16:13Z,2023-11-14T00:00:00Z,7a937605655b895bf9ebfbe29f8e35af8d3f606fd811b4...,address-text
1,local-authority-eng:BIR,Birmingham City Council,article-4-direction-area,2d9575d771afff89f6d731be59a1ff8cedfd99efcd8bb2...,200.0,2024-03-05T00:16:13Z,2023-11-14T00:00:00Z,7a937605655b895bf9ebfbe29f8e35af8d3f606fd811b4...,article-4-direction
2,local-authority-eng:BIR,Birmingham City Council,article-4-direction-area,2d9575d771afff89f6d731be59a1ff8cedfd99efcd8bb2...,200.0,2024-03-05T00:16:13Z,2023-11-14T00:00:00Z,7a937605655b895bf9ebfbe29f8e35af8d3f606fd811b4...,description
3,local-authority-eng:BIR,Birmingham City Council,article-4-direction-area,2d9575d771afff89f6d731be59a1ff8cedfd99efcd8bb2...,200.0,2024-03-05T00:16:13Z,2023-11-14T00:00:00Z,7a937605655b895bf9ebfbe29f8e35af8d3f606fd811b4...,end-date
4,local-authority-eng:BIR,Birmingham City Council,article-4-direction-area,2d9575d771afff89f6d731be59a1ff8cedfd99efcd8bb2...,200.0,2024-03-05T00:16:13Z,2023-11-14T00:00:00Z,7a937605655b895bf9ebfbe29f8e35af8d3f606fd811b4...,entry-date


In [36]:
# join on field present flag for each resource
resource_fields_match = resource_spec_fields_df.merge(
    results_field_resource_df[["dataset", "resource", "field", "field_loaded"]],
    how = "left",
    on = ["dataset", "resource", "field"]
)

print(len(resource_fields_match))
resource_fields_match.head()



1011


Unnamed: 0,organisation,name,dataset,endpoint,status,log_entry_date,endpoint_entry_date,resource,field,field_loaded
0,local-authority-eng:BIR,Birmingham City Council,article-4-direction-area,2d9575d771afff89f6d731be59a1ff8cedfd99efcd8bb2...,200.0,2024-03-05T00:16:13Z,2023-11-14T00:00:00Z,7a937605655b895bf9ebfbe29f8e35af8d3f606fd811b4...,address-text,
1,local-authority-eng:BIR,Birmingham City Council,article-4-direction-area,2d9575d771afff89f6d731be59a1ff8cedfd99efcd8bb2...,200.0,2024-03-05T00:16:13Z,2023-11-14T00:00:00Z,7a937605655b895bf9ebfbe29f8e35af8d3f606fd811b4...,article-4-direction,
2,local-authority-eng:BIR,Birmingham City Council,article-4-direction-area,2d9575d771afff89f6d731be59a1ff8cedfd99efcd8bb2...,200.0,2024-03-05T00:16:13Z,2023-11-14T00:00:00Z,7a937605655b895bf9ebfbe29f8e35af8d3f606fd811b4...,description,
3,local-authority-eng:BIR,Birmingham City Council,article-4-direction-area,2d9575d771afff89f6d731be59a1ff8cedfd99efcd8bb2...,200.0,2024-03-05T00:16:13Z,2023-11-14T00:00:00Z,7a937605655b895bf9ebfbe29f8e35af8d3f606fd811b4...,end-date,
4,local-authority-eng:BIR,Birmingham City Council,article-4-direction-area,2d9575d771afff89f6d731be59a1ff8cedfd99efcd8bb2...,200.0,2024-03-05T00:16:13Z,2023-11-14T00:00:00Z,7a937605655b895bf9ebfbe29f8e35af8d3f606fd811b4...,entry-date,1.0


In [37]:
# join on field present flag for each resource
resource_fields_map_match = resource_fields_match.merge(
    results_col_map_df[["dataset", "resource", "field", "field_supplied", "field_matched"]],
    how = "left",
    on = ["dataset", "resource", "field"]
)

print(len(resource_fields_map_match))
resource_fields_map_match.head()

1012


Unnamed: 0,organisation,name,dataset,endpoint,status,log_entry_date,endpoint_entry_date,resource,field,field_loaded,field_supplied,field_matched
0,local-authority-eng:BIR,Birmingham City Council,article-4-direction-area,2d9575d771afff89f6d731be59a1ff8cedfd99efcd8bb2...,200.0,2024-03-05T00:16:13Z,2023-11-14T00:00:00Z,7a937605655b895bf9ebfbe29f8e35af8d3f606fd811b4...,address-text,,1.0,0.0
1,local-authority-eng:BIR,Birmingham City Council,article-4-direction-area,2d9575d771afff89f6d731be59a1ff8cedfd99efcd8bb2...,200.0,2024-03-05T00:16:13Z,2023-11-14T00:00:00Z,7a937605655b895bf9ebfbe29f8e35af8d3f606fd811b4...,article-4-direction,,1.0,0.0
2,local-authority-eng:BIR,Birmingham City Council,article-4-direction-area,2d9575d771afff89f6d731be59a1ff8cedfd99efcd8bb2...,200.0,2024-03-05T00:16:13Z,2023-11-14T00:00:00Z,7a937605655b895bf9ebfbe29f8e35af8d3f606fd811b4...,description,,,
3,local-authority-eng:BIR,Birmingham City Council,article-4-direction-area,2d9575d771afff89f6d731be59a1ff8cedfd99efcd8bb2...,200.0,2024-03-05T00:16:13Z,2023-11-14T00:00:00Z,7a937605655b895bf9ebfbe29f8e35af8d3f606fd811b4...,end-date,,1.0,0.0
4,local-authority-eng:BIR,Birmingham City Council,article-4-direction-area,2d9575d771afff89f6d731be59a1ff8cedfd99efcd8bb2...,200.0,2024-03-05T00:16:13Z,2023-11-14T00:00:00Z,7a937605655b895bf9ebfbe29f8e35af8d3f606fd811b4...,entry-date,1.0,1.0,0.0


In [86]:
resource_fields_map_match.replace(np.nan, 0, inplace=True)

final_count = resource_fields_map_match.groupby(
    ["organisation", "name", "dataset", "endpoint", "resource", "status", "log_entry_date", "endpoint_entry_date"]
    ).agg(
        {"field":"count",
         "field_supplied" : "sum",
         "field_matched" : "sum",
         "field_loaded" : "sum"}
         ).reset_index(
         ).sort_values(["name"])

# add a field for the endpoint number (so that orgs and datasets with multiple endpoints are split out and in index)
final_count["endpoint_number"] = final_count.groupby(["organisation", "name", "dataset"]).cumcount() + 1
# create % columns
final_count["field_supplied_pct"] = final_count["field_supplied"] / final_count["field"] 
final_count["field_matched_pct"] = final_count["field_matched"] / final_count["field"] 
final_count["field_loaded_pct"] = final_count["field_loaded"] / final_count["field"] 

# final_count.reset_index(drop=True, inplace=True)

final_count.head()

Unnamed: 0,organisation,name,dataset,endpoint,resource,status,log_entry_date,endpoint_entry_date,field,field_supplied,field_matched,field_loaded,endpoint_number,field_supplied_pct,field_matched_pct,field_loaded_pct
0,local-authority-eng:BIR,Birmingham City Council,article-4-direction-area,2d9575d771afff89f6d731be59a1ff8cedfd99efcd8bb2...,7a937605655b895bf9ebfbe29f8e35af8d3f606fd811b4...,200.0,2024-03-05T00:16:13Z,2023-11-14T00:00:00Z,12,11.0,1.0,6.0,1,0.916667,0.083333,0.5
1,local-authority-eng:BIR,Birmingham City Council,conservation-area,a09608d26986c205de7ab8dc54b5d76c776ca236a9ecf9...,acb88aac41434c4cfccb9ee77f6471f5c682616617604c...,200.0,2024-03-05T00:04:14Z,2023-11-14T00:00:00Z,11,8.0,1.0,6.0,1,0.727273,0.090909,0.545455
9,local-authority-eng:BOS,Bolsover District Council,conservation-area,79c8d68e7c08230c990410038451d84f11566963510e8c...,cc806852c1ca82b8a9b17302d00d46c784521c3e737baf...,200.0,2024-03-05T00:04:14Z,2020-09-06T12:11:51Z,11,1.0,1.0,0.0,1,0.090909,0.090909,0.0
18,local-authority-eng:DNC,Doncaster Metropolitan Borough Council,tree-preservation-zone,de1eb90a8b037292ef8ae14bfabd1184847ef99b7c6296...,4e4f04d49e528ec5d53363b3fec31d54bdb04afaba44a2...,200.0,2024-03-05T00:14:57Z,2022-10-26T10:14:52Z,13,7.0,2.0,8.0,1,0.538462,0.153846,0.615385
17,local-authority-eng:DNC,Doncaster Metropolitan Borough Council,tree,a05e76e962d438545a5a48c967cd2b1229ebc951a9a4a6...,62f9710fdb2f4c58af20ab11e91fc13e668e851257ee5a...,200.0,2024-03-05T00:14:57Z,2022-10-26T10:15:59Z,15,5.0,1.0,0.0,1,0.333333,0.066667,0.0


Changes to make to this report:

* Make sure list of orgs and datasets is exhaustive in report table
* Sense-check metric results
* Sort index


In [104]:
def make_pretty(styler):
    styler.relabel_index(["Fields Supplied", "Fields Loaded", "Field Names Matched"], axis=1)
    styler.format("{:.0%}")
    styler.background_gradient(axis=None, vmin=0, vmax=1, cmap="PiYG")
    return styler

final_count_out = final_count[
    ["name", "dataset", "endpoint_number", "field", "field_supplied_pct", "field_loaded_pct", "field_matched_pct"]
].copy()

final_count_out.sort_values(["name", "dataset", "endpoint_number"])
final_count_out.set_index(["name", "dataset", "field", "endpoint_number"], inplace=True)
final_count_out.style.pipe(make_pretty)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Fields Supplied,Fields Loaded,Field Names Matched
name,dataset,field,endpoint_number,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Birmingham City Council,article-4-direction-area,12,1,92%,50%,8%
Birmingham City Council,conservation-area,11,1,73%,55%,9%
Bolsover District Council,conservation-area,11,1,9%,0%,9%
Doncaster Metropolitan Borough Council,tree-preservation-zone,13,1,54%,62%,15%
Doncaster Metropolitan Borough Council,tree,15,1,33%,0%,7%
Doncaster Metropolitan Borough Council,conservation-area,11,1,36%,45%,9%
Doncaster Metropolitan Borough Council,article-4-direction-area,12,1,50%,50%,8%
Doncaster Metropolitan Borough Council,article-4-direction-area,12,2,50%,50%,8%
Doncaster Metropolitan Borough Council,listed-building-outline,16,1,6%,0%,6%
Dover District Council,conservation-area,11,1,82%,55%,45%


In [59]:
final_count_out.index

MultiIndex([(               'Birmingham City Council', ...),
            (               'Birmingham City Council', ...),
            (             'Bolsover District Council', ...),
            ('Doncaster Metropolitan Borough Council', ...),
            ('Doncaster Metropolitan Borough Council', ...),
            ('Doncaster Metropolitan Borough Council', ...),
            ('Doncaster Metropolitan Borough Council', ...),
            ('Doncaster Metropolitan Borough Council', ...),
            ('Doncaster Metropolitan Borough Council', ...),
            (                'Dover District Council', ...),
            (                'Dover District Council', ...),
            (                'Dover District Council', ...),
            (                'Dover District Council', ...),
            (                'Dover District Council', ...),
            (                'Dover District Council', ...),
            (                'Dover District Council', ...),
            (       'Eps

In [40]:
final_count[["dataset", "field"]].drop_duplicates().sort_values("dataset")

Unnamed: 0,dataset,field
19,article-4-direction,9
0,article-4-direction-area,12
1,conservation-area,11
69,conservation-area,12
67,conservation-area-document,11
16,listed-building-outline,16
17,tree,15
25,tree-preservation-order,12
18,tree-preservation-zone,13


The latest endpoints are collected for each of the organisations, for the first 4 datasets.

In [None]:
datasette_url = "https://datasette.planning.data.gov.uk/"

# Collect latest endpoints for each organisation
dataset_list = ['article-4-direction', 'article-4-direction-area', 'conservation-area', 'conservation-area-document', 'listed-building-outline', 'tree-preservation-order', 'tree-preservation-zone', 'tree']
pipelines_list = ['article-4-direction', 'article-4-direction-area', 'conservation-area', 'conservation-area-document', 'listed-building-outline', 'tree-preservation-order', 'tree-preservation-zone', 'tree', 'tree,tree-preservation-order', 'tree-preservation-order,tree-preservation-zone']
all_orgs_latest_endpoints={}
for organisation in organisation_list:
    latest_endpoints_df = get_latest_endpoints(organisation)
    latest_endpoints_df = latest_endpoints_df[latest_endpoints_df['pipelines'].isin(pipelines_list)]
    all_orgs_latest_endpoints[organisation] = latest_endpoints_df

For each of these endpoints, the relevant schema for the dataset is downloaded to compare the endpoint columns against.

'Structure score' is the number of columns in the processed data that match the schema, divided by the number of columns in the schema. Note that if there is no data at all in a field, it cannot be detected as a structure match.

'Column name score' is the number of columns in the processed data that had matching column names to the schema before any processing happened (ie no column mapping had to take place). Note that if there is no data at all in a field, it cannot be detected as a column name match.

If an endpoint contributes to two datasets, it will only be considered for a dataset if it is the newest endpoint for that dataset, calculated independently.

In [None]:
def compute_cell_colour(value):
    if "%" in value:
        value = int(value.replace("%", ""))
        if value >= 75:
            return 'background-color: green'
        elif value < 75 and value >= 50:
            return 'background-color: orange'
        elif 0 <= value < 50:
            return 'background-color: red'
        else:
            return 'background-color: brown'

organisation_dataset_compliance_dict={}
rows_list = []
csv_rows_list = []
for organisation in organisation_list:
    latest_endpoints_df = all_orgs_latest_endpoints[organisation]
    dataset_compliance_dict = {}
    for index, row in latest_endpoints_df.iterrows():
        resource = row['resource']
        if ',' in row['pipelines']:
            datasets = row['pipelines'].split(',')
        else:
            datasets = [row['pipelines']]
        for dataset in datasets:
            same_datasets_df = latest_endpoints_df[latest_endpoints_df["pipelines"].apply(lambda x: dataset in x.split(','))]
            if len(same_datasets_df) > 1:
                skip_dataset = handle_skip_dataset(same_datasets_df, dataset, row)
            else:
                skip_dataset = False
            # print(organisation, dataset, resource)
            

            dataset_field_df = pd.read_csv('https://raw.githubusercontent.com/digital-land/specification/main/specification/dataset-field.csv')
            dataset_field_df = dataset_field_df[dataset_field_df['dataset'] == dataset]

            if not skip_dataset:
                column_field_df = get_column_mappings_for_resource(resource, dataset)
                fields = get_fields_for_resource(resource, dataset)
                structure_score, structure_percentage, column_score, column_percentage = check_columns_in_endpoint(fields, dataset_field_df, column_field_df, dataset)
                overall_percentage = (structure_percentage + column_percentage) / 2
                dataset_compliance_dict[dataset] = {"structure_score": structure_score, "structure_percentage": structure_percentage, "column_score": column_score, "column_name_percentage": column_percentage}
                new_row = {'organisation': organisation_name_dict[organisation], 'dataset': dataset, 'structure_score': structure_score, 'structure_percentage': f"{int(structure_percentage)}%" , 'column_name_score': column_score, 'column_name_percentage': f"{int(column_percentage)}%", 'overall_percentage': f"{int(overall_percentage)}%"}
                rows_list.append(new_row)
                csv_row = new_row.copy()
                csv_row['endpoint_url'] = row['endpoint_url']
                csv_row['resource'] = row['resource']
                csv_rows_list.append(csv_row)
    
    organisation_dataset_compliance_dict[organisation] = dataset_compliance_dict


compliance_df = pd.DataFrame(rows_list)
output_df = pd.DataFrame(csv_rows_list)
output_df.to_csv('compliance.csv', index=False)
compliance_df.style.applymap(compute_cell_colour, subset=["structure_percentage", "column_name_percentage", "overall_percentage"])