This report provides compliance to specification information on the most latest endpoints for a hardcoded list of prioritised list of LPAs, or organisations from an input.

The column 'structure_score' tells us how much data an endpoint is giving us as a fraction of what we ask for. The column 'column_name_score' tells us how many columns are correctly named.

Example: a column name that is incorrect (e.g 'area' instead of 'geometry') but the data in it has been detected as correct data will score in the 'structure_score' column but not the 'column_name' column

The input should be called 'organisation_input.csv' and contain one column, 'organisation' that has the organisation codes for the LPAs to be included in the report.

In [1]:
# %pip install wget
import wget
import pandas as pd
import os
import numpy as np
import urllib


Download helper utility files from GitHub:

In [2]:
util_file = "master_report_endpoint_utils.py"
if os.path.isfile(util_file):
    from master_report_endpoint_utils import *
else:
    url = "https://raw.githubusercontent.com/digital-land/jupyter-analysis/main/service_report/master_report/master_report_endpoint_utils.py"
    wget.download(url)
    from master_report_endpoint_utils import *

The default prioritised LPAs are used unless a specific set of LPAs is detected using an 'organisation_input.csv' file in the same directory as this notebook.

In [3]:
# Get input from .csv or use default prioritised LPAs
input_path = './organisation_input.csv'
if os.path.isfile(input_path):
    input_df = pd.read_csv(input_path)
    organisation_list = input_df['organisation'].tolist()
    print('Input file found. Using', len(organisation_list), 'organisations from input file.')
else:
    provision_df = get_provisions()
    organisation_list = provision_df["organisation"].str.replace(":","-eng:")
    print('Input file not found. Using default list of organisations.')

Input file not found. Using default list of organisations.


In [139]:
def get_endpoint_resource_data():
    datasette_url = "https://datasette.planning.data.gov.uk/"
  
    params = urllib.parse.urlencode({
    "sql": f"""
    select *
    from reporting_latest_endpoints
    """,
    "_size": "max"
    })
    
    url = f"{datasette_url}digital-land.csv?{params}"
    df = pd.read_csv(url)
    return df

def get_fields_for_resource(resource, dataset):
    datasette_url = "https://datasette.planning.data.gov.uk/"
    params = urllib.parse.urlencode({
        "sql": f"""
        select f.field, fr.resource
        from 
            fact_resource fr
            inner join fact f on fr.fact = f.fact
        where 
            resource = '{resource}'
        group by
            f.field
        """,
        "_size": "max"
    })
    url = f"{datasette_url}{dataset}.csv?{params}"
    facts_df = pd.read_csv(url)
    # facts_list = facts_df['field'].tolist()
    return facts_df

def get_column_mappings_for_resource(resource, dataset):
    datasette_url = "https://datasette.planning.data.gov.uk/"
    params = urllib.parse.urlencode({
        "sql": f"""
        select column, field
        from 
          column_field  
        where 
            resource = '{resource}'
        """,
        "_size": "max"
    })
    url = f"{datasette_url}{dataset}.csv?{params}"
    column_field_df = pd.read_csv(url)
    return column_field_df

In [137]:
def get_endpoint_resource_data_new():
    params = urllib.parse.urlencode({
    "sql": f"""
    select *
    from reporting_latest_endpoints
    """,
    "_size": "max"
    })

    url = f"{datasette_url}digital-land.csv?{params}"
    df = pd.read_csv(url)
    return df

## Get endpoint data

In [140]:
# get data from datasette
endpoint_resource_df = get_endpoint_resource_data()

# filter to org_list, valid, active endpoints and resources
endpoint_resource_filtered_df = endpoint_resource_df[
    (endpoint_resource_df["organisation"].isin(organisation_list)) &
    (endpoint_resource_df["status"] == 200) &
    (endpoint_resource_df["endpoint_end_date"].isnull()) &
    (endpoint_resource_df["resource_end_date"].isnull())
].copy()

print(len(endpoint_resource_df))
print(len(endpoint_resource_filtered_df))

print(len(endpoint_resource_filtered_df[["endpoint", "pipeline"]].drop_duplicates()))
print(len(endpoint_resource_filtered_df[["resource"]].drop_duplicates()))
print(len(endpoint_resource_filtered_df[["endpoint"]].drop_duplicates()))

155
79
79
71
74


## Get field and col mapping data

In [141]:
# table of unique resources and pipelines
resource_df = endpoint_resource_filtered_df[["pipeline", "resource"]].drop_duplicates().dropna(axis = 0)
print(len(resource_df))

79


In [142]:
# generic function to try the resource datasette queries 
# will return a df with resource and dataset fields as keys, and query results as other fields
def try_results(function, resource, dataset):

    # try grabbing results
    try:
        df = function(resource, dataset)

        # if empty response give NaNs
        if len(df) == 0:
            df = pd.DataFrame({"field" : [np.nan]
            })

        df["resource"] = resource
        df["dataset"] = dataset

    # if error record resource and dataset
    except:
        df = pd.DataFrame({"resource" : [resource],
                           "dataset" : [dataset]
        })

    return df


# get results for col mappings and fields in arrays
results_col_map = [try_results(get_column_mappings_for_resource, r["resource"], r["pipeline"]) for index, r in resource_df.iterrows()]
results_field_resource = [try_results(get_fields_for_resource, r["resource"], r["pipeline"]) for index, r in resource_df.iterrows()]

# concat the results, resources which errored with have NaNs in query results fields
results_col_map_df = pd.concat(results_col_map)
results_field_resource_df = pd.concat(results_field_resource)

# no. of resources in each query response array
print(len(results_col_map))
print(len(results_field_resource))

# no of records in each results df
print(len(results_col_map_df))
print(len(results_field_resource_df))


79
79
534
476


In [143]:
# add in match field for column mappings 
results_col_map_df["field_matched"] = np.where(
        (results_col_map_df["field"].isin(["geometry", "point"])) |
        (results_col_map_df["field"] == results_col_map_df["column"]),
        1, 
        0
)

# add in flag for fields supplied (i.e. they're in the mapping table)
results_col_map_df["field_supplied"] = 1

# add in flag for fields present
results_field_resource_df["field_loaded"] = 1

## Calculating match rates

In [144]:
dataset_field_df = pd.read_csv('https://raw.githubusercontent.com/digital-land/specification/main/specification/dataset-field.csv')

dataset_field_df.head()

Unnamed: 0,dataset,field,field-dataset,guidance,hint
0,address,address,,,
1,address,address-text,,,
2,address,end-date,,,
3,address,entity,,,
4,address,entry-date,,,


In [146]:
# rename pipeline to dataset in endpoint_resource table
endpoint_resource_filtered_df.rename(columns={"pipeline":"dataset"}, inplace=True)

# left join from endpoint resource table to all the fields that each dataset should have
resource_spec_fields_df = endpoint_resource_filtered_df[
    ["organisation", "name", "dataset", "endpoint", "status", "latest_log_entry_date", "endpoint_entry_date", "resource"]
    ].merge(
        dataset_field_df[["dataset", "field"]],
        on = "dataset"
)

print(len(resource_spec_fields_df))
resource_spec_fields_df.head()

1291


Unnamed: 0,organisation,name,dataset,endpoint,status,latest_log_entry_date,endpoint_entry_date,resource,field
0,local-authority-eng:BIR,Birmingham City Council,article-4-direction-area,2d9575d771afff89f6d731be59a1ff8cedfd99efcd8bb2...,200.0,2024-03-12T00:15:11Z,2023-11-14T00:00:00Z,7a937605655b895bf9ebfbe29f8e35af8d3f606fd811b4...,address-text
1,local-authority-eng:BIR,Birmingham City Council,article-4-direction-area,2d9575d771afff89f6d731be59a1ff8cedfd99efcd8bb2...,200.0,2024-03-12T00:15:11Z,2023-11-14T00:00:00Z,7a937605655b895bf9ebfbe29f8e35af8d3f606fd811b4...,article-4-direction
2,local-authority-eng:BIR,Birmingham City Council,article-4-direction-area,2d9575d771afff89f6d731be59a1ff8cedfd99efcd8bb2...,200.0,2024-03-12T00:15:11Z,2023-11-14T00:00:00Z,7a937605655b895bf9ebfbe29f8e35af8d3f606fd811b4...,description
3,local-authority-eng:BIR,Birmingham City Council,article-4-direction-area,2d9575d771afff89f6d731be59a1ff8cedfd99efcd8bb2...,200.0,2024-03-12T00:15:11Z,2023-11-14T00:00:00Z,7a937605655b895bf9ebfbe29f8e35af8d3f606fd811b4...,end-date
4,local-authority-eng:BIR,Birmingham City Council,article-4-direction-area,2d9575d771afff89f6d731be59a1ff8cedfd99efcd8bb2...,200.0,2024-03-12T00:15:11Z,2023-11-14T00:00:00Z,7a937605655b895bf9ebfbe29f8e35af8d3f606fd811b4...,entity


In [147]:
# join on field present flag for each resource
resource_fields_match = resource_spec_fields_df.merge(
    results_field_resource_df[["dataset", "resource", "field", "field_loaded"]],
    how = "left",
    on = ["dataset", "resource", "field"]
)

print(len(resource_fields_match))
resource_fields_match.head()

1291


Unnamed: 0,organisation,name,dataset,endpoint,status,latest_log_entry_date,endpoint_entry_date,resource,field,field_loaded
0,local-authority-eng:BIR,Birmingham City Council,article-4-direction-area,2d9575d771afff89f6d731be59a1ff8cedfd99efcd8bb2...,200.0,2024-03-12T00:15:11Z,2023-11-14T00:00:00Z,7a937605655b895bf9ebfbe29f8e35af8d3f606fd811b4...,address-text,
1,local-authority-eng:BIR,Birmingham City Council,article-4-direction-area,2d9575d771afff89f6d731be59a1ff8cedfd99efcd8bb2...,200.0,2024-03-12T00:15:11Z,2023-11-14T00:00:00Z,7a937605655b895bf9ebfbe29f8e35af8d3f606fd811b4...,article-4-direction,
2,local-authority-eng:BIR,Birmingham City Council,article-4-direction-area,2d9575d771afff89f6d731be59a1ff8cedfd99efcd8bb2...,200.0,2024-03-12T00:15:11Z,2023-11-14T00:00:00Z,7a937605655b895bf9ebfbe29f8e35af8d3f606fd811b4...,description,
3,local-authority-eng:BIR,Birmingham City Council,article-4-direction-area,2d9575d771afff89f6d731be59a1ff8cedfd99efcd8bb2...,200.0,2024-03-12T00:15:11Z,2023-11-14T00:00:00Z,7a937605655b895bf9ebfbe29f8e35af8d3f606fd811b4...,end-date,
4,local-authority-eng:BIR,Birmingham City Council,article-4-direction-area,2d9575d771afff89f6d731be59a1ff8cedfd99efcd8bb2...,200.0,2024-03-12T00:15:11Z,2023-11-14T00:00:00Z,7a937605655b895bf9ebfbe29f8e35af8d3f606fd811b4...,entity,


In [148]:
# join on field present flag for each resource
resource_fields_map_match = resource_fields_match.merge(
    results_col_map_df[["dataset", "resource", "field", "field_supplied", "field_matched"]],
    how = "left",
    on = ["dataset", "resource", "field"]
)

print(len(resource_fields_map_match))
resource_fields_map_match.head()

1292


Unnamed: 0,organisation,name,dataset,endpoint,status,latest_log_entry_date,endpoint_entry_date,resource,field,field_loaded,field_supplied,field_matched
0,local-authority-eng:BIR,Birmingham City Council,article-4-direction-area,2d9575d771afff89f6d731be59a1ff8cedfd99efcd8bb2...,200.0,2024-03-12T00:15:11Z,2023-11-14T00:00:00Z,7a937605655b895bf9ebfbe29f8e35af8d3f606fd811b4...,address-text,,1.0,0.0
1,local-authority-eng:BIR,Birmingham City Council,article-4-direction-area,2d9575d771afff89f6d731be59a1ff8cedfd99efcd8bb2...,200.0,2024-03-12T00:15:11Z,2023-11-14T00:00:00Z,7a937605655b895bf9ebfbe29f8e35af8d3f606fd811b4...,article-4-direction,,1.0,0.0
2,local-authority-eng:BIR,Birmingham City Council,article-4-direction-area,2d9575d771afff89f6d731be59a1ff8cedfd99efcd8bb2...,200.0,2024-03-12T00:15:11Z,2023-11-14T00:00:00Z,7a937605655b895bf9ebfbe29f8e35af8d3f606fd811b4...,description,,,
3,local-authority-eng:BIR,Birmingham City Council,article-4-direction-area,2d9575d771afff89f6d731be59a1ff8cedfd99efcd8bb2...,200.0,2024-03-12T00:15:11Z,2023-11-14T00:00:00Z,7a937605655b895bf9ebfbe29f8e35af8d3f606fd811b4...,end-date,,1.0,0.0
4,local-authority-eng:BIR,Birmingham City Council,article-4-direction-area,2d9575d771afff89f6d731be59a1ff8cedfd99efcd8bb2...,200.0,2024-03-12T00:15:11Z,2023-11-14T00:00:00Z,7a937605655b895bf9ebfbe29f8e35af8d3f606fd811b4...,entity,,,


In [149]:
# remove fields that are auto-created in the pipeline from final table to avoid mis-counting

# remove the pipeline-created fields from the spec field table
# ("entity", "organisation", "prefix", "point" for all but tree, and "entity", "organisation", "prefix" for tree)
resource_fields_scored = resource_fields_map_match[
    ((resource_fields_map_match["dataset"] != "tree") & (~resource_fields_map_match["field"].isin(["entity", "organisation", "prefix", "point"])) |
     (resource_fields_map_match["dataset"] == "tree") & (~resource_fields_map_match["field"].isin(["entity", "organisation", "prefix"])))
]

# where entry-date hasn't been supplied it is auto-created - change field_loaded to NaN in these instances so we don't count it as a loaded field
entry_date_mask = ((resource_fields_scored["field"] == "entry-date") &
    (resource_fields_scored["field_supplied"].isnull()) &
    (resource_fields_scored["field_loaded"] == 1))

resource_fields_scored.loc[entry_date_mask, "field_loaded"] = np.nan

In [151]:
# group by and aggregate for final summaries
final_count = resource_fields_scored.groupby(
    ["organisation", "name", "dataset", "endpoint", "resource", "status", "latest_log_entry_date", "endpoint_entry_date"]
    ).agg(
        {"field":"count",
         "field_supplied" : "sum",
         "field_matched" : "sum",
         "field_loaded" : "sum"}
         ).reset_index(
         ).sort_values(["name"])

# add a field for the endpoint number (so that orgs and datasets with multiple endpoints are split out and in index)
final_count["endpoint_number"] = final_count.groupby(["organisation", "name", "dataset"]).cumcount() + 1
# create % columns
final_count["field_supplied_pct"] = final_count["field_supplied"] / final_count["field"] 
final_count["field_matched_pct"] = final_count["field_matched"] / final_count["field"] 
final_count["field_loaded_pct"] = final_count["field_loaded"] / final_count["field"] 

# add string fields for [n fields]/[total fields] style counts
final_count["field_supplied_count"] = final_count["field_supplied"].astype(int).map(str) + "/" + final_count["field"].map(str)
final_count["field_matched_count"] = final_count["field_matched"].astype(int).map(str) + "/" + final_count["field"].map(str)
final_count["field_loaded_count"] = final_count["field_loaded"].astype(int).map(str) + "/" + final_count["field"].map(str)

# final_count.reset_index(drop=True, inplace=True)

final_count.head()

Unnamed: 0,organisation,name,dataset,endpoint,resource,status,latest_log_entry_date,endpoint_entry_date,field,field_supplied,field_matched,field_loaded,endpoint_number,field_supplied_pct,field_matched_pct,field_loaded_pct,field_supplied_count,field_matched_count,field_loaded_count
0,local-authority-eng:BIR,Birmingham City Council,article-4-direction-area,2d9575d771afff89f6d731be59a1ff8cedfd99efcd8bb2...,7a937605655b895bf9ebfbe29f8e35af8d3f606fd811b4...,200.0,2024-03-12T00:15:11Z,2023-11-14T00:00:00Z,12,11.0,1.0,6.0,1,0.916667,0.083333,0.5,11/12,1/12,6/12
1,local-authority-eng:BIR,Birmingham City Council,conservation-area,a09608d26986c205de7ab8dc54b5d76c776ca236a9ecf9...,acb88aac41434c4cfccb9ee77f6471f5c682616617604c...,200.0,2024-03-11T00:04:33Z,2023-11-14T00:00:00Z,11,8.0,1.0,6.0,1,0.727273,0.090909,0.545455,8/11,1/11,6/11
9,local-authority-eng:BOS,Bolsover District Council,conservation-area,79c8d68e7c08230c990410038451d84f11566963510e8c...,cc806852c1ca82b8a9b17302d00d46c784521c3e737baf...,200.0,2024-03-11T00:04:33Z,2020-09-06T12:11:51Z,11,1.0,1.0,0.0,1,0.090909,0.090909,0.0,1/11,1/11,0/11
13,local-authority-eng:CAT,Canterbury City Council,locally-listed-building,6c03bc498c117edddbd56138be9d27e63d36ec7b0334c7...,fd73848f969dd4793016fcc23215a9e88b7805dd5ef130...,200.0,2024-03-12T00:15:24Z,2022-05-06T13:13:26Z,12,4.0,1.0,4.0,1,0.333333,0.083333,0.333333,4/12,1/12,4/12
12,local-authority-eng:CAT,Canterbury City Council,listed-building-outline,fad9233216dbd89f5e1c5707d7ecc8a5b2e336b6270bd9...,30aae44a35de3654e6223f15ac377d265531db40e4a829...,200.0,2024-03-12T00:15:24Z,2022-05-05T22:22:18Z,16,2.0,1.0,0.0,1,0.125,0.0625,0.0,2/16,1/16,0/16


Note: It appears that for resources which have an "unknown entity" issue, the results for the fields loaded is always 0. So be aware this is the case in the results table where the fields supplied is > 0 but the fields loaded is 0.

In [152]:
final_count_out = final_count[
    ["organisation", "name", "dataset", "endpoint_number", "field_supplied_count", "field_supplied_pct", 
     "field_loaded_count", "field_loaded_pct", "field_matched_count", "field_matched_pct"]
].copy()

final_count_out.sort_values(["name", "dataset", "endpoint_number"], inplace=True)

slice_ = ["field_supplied_pct", "field_loaded_pct", "field_matched_pct"]

final_count_out.style \
    .relabel_index(["Organisation", "Org Name", "Dataset", "Endpoint no.", "Fields Supplied", "Fields Supplied (%)", 
                    "Fields Loaded", "Fields Loaded (%)", "Field Names Matched", "Field Names Matched (%)"], axis=1) \
    .format("{:.0%}", subset = slice_) \
    .background_gradient(axis=None, vmin=0, vmax=1, cmap="YlGn", subset = slice_)

Unnamed: 0,Organisation,Org Name,Dataset,Endpoint no.,Fields Supplied,Fields Supplied (%),Fields Loaded,Fields Loaded (%),Field Names Matched,Field Names Matched (%)
0,local-authority-eng:BIR,Birmingham City Council,article-4-direction-area,1,11/12,92%,6/12,50%,1/12,8%
1,local-authority-eng:BIR,Birmingham City Council,conservation-area,1,8/11,73%,6/11,55%,1/11,9%
9,local-authority-eng:BOS,Bolsover District Council,conservation-area,1,1/11,9%,0/11,0%,1/11,9%
10,local-authority-eng:CAT,Canterbury City Council,article-4-direction-area,1,0/12,0%,0/12,0%,0/12,0%
11,local-authority-eng:CAT,Canterbury City Council,conservation-area,1,4/11,36%,4/11,36%,1/11,9%
12,local-authority-eng:CAT,Canterbury City Council,listed-building-outline,1,2/16,12%,0/16,0%,1/16,6%
13,local-authority-eng:CAT,Canterbury City Council,locally-listed-building,1,4/12,33%,4/12,33%,1/12,8%
18,local-authority-eng:DNC,Doncaster Metropolitan Borough Council,article-4-direction-area,1,6/12,50%,6/12,50%,1/12,8%
19,local-authority-eng:DNC,Doncaster Metropolitan Borough Council,conservation-area,1,4/11,36%,4/11,36%,1/11,9%
20,local-authority-eng:DNC,Doncaster Metropolitan Borough Council,listed-building-outline,1,1/16,6%,0/16,0%,1/16,6%
