This report provides compliance to specification information on the most latest endpoints for a hardcoded list of prioritised list of LPAs, or organisations from an input.

The column 'structure_score' tells us how much data an endpoint is giving us as a fraction of what we ask for. The column 'column_name_score' tells us how many columns are correctly named.

Example: a column name that is incorrect (e.g 'area' instead of 'geometry') but the data in it has been detected as correct data will score in the 'structure_score' column but not the 'column_name' column

The input should be called 'organisation_input.csv' and contain one column, 'organisation' that has the organisation codes for the LPAs to be included in the report.

In [224]:
# %pip install wget
# import wget
import pandas as pd
import os
import numpy as np

pd.set_option("display.max_rows", 100)


Download helper utility files from GitHub:

In [3]:
util_file = "master_report_endpoint_utils.py"
if os.path.isfile(util_file):
    from master_report_endpoint_utils import *
else:
    url = "https://raw.githubusercontent.com/digital-land/jupyter-analysis/main/service_report/master_report/master_report_endpoint_utils.py"
    wget.download(url)
    from master_report_endpoint_utils import *

The default prioritised LPAs are used unless a specific set of LPAs is detected using an 'organisation_input.csv' file in the same directory as this notebook.

In [51]:
# Get input from .csv or use default prioritised LPAs
input_path = './organisation_input.csv'
if os.path.isfile(input_path):
    input_df = pd.read_csv(input_path)
    organisation_list = input_df['organisation'].tolist()
    print('Input file found. Using', len(organisation_list), 'organisations from input file.')
else:
    organisation_list = [
    'local-authority-eng:BUC', 
    'local-authority-eng:DAC', 'local-authority-eng:DNC',
    'local-authority-eng:GLO', 'local-authority-eng:CMD', 'local-authority-eng:LBH', 'local-authority-eng:SWK',
    'local-authority-eng:MDW', 'local-authority-eng:NET', 'local-authority-eng:BIR', 'local-authority-eng:CAT',
    'local-authority-eng:EPS', 'local-authority-eng:BNE', 'local-authority-eng:GAT', 'local-authority-eng:GRY',
    'local-authority-eng:KTT', 'local-authority-eng:SAL', 'local-authority-eng:TEW', 'local-authority-eng:WBK',
    'local-authority-eng:DST', 'local-authority-eng:DOV', 'local-authority-eng:LIV', 'local-authority-eng:RDB',
    'local-authority-eng:WFT', 'local-authority-eng:NLN', 'local-authority-eng:NSM', 'local-authority-eng:SLF',
    'local-authority-eng:WRL' ]
    print('Input file not found. Using default list of organisations.')

Input file not found. Using default list of organisations.


In [22]:
# organisation_list = [
#     'local-authority:BUC', 
#     'local-authority:DAC', 'local-authority:DNC',
#     'local-authority:GLO', 'local-authority:CMD', 'local-authority:LBH', 'local-authority:SWK',
#     'local-authority:MDW', 'local-authority:NET', 'local-authority:BIR', 'local-authority:CAT',
#     'local-authority:EPS', 'local-authority:BNE', 'local-authority:GAT', 'local-authority:GRY',
#     'local-authority:KTT', 'local-authority:SAL', 'local-authority:TEW', 'local-authority:WBK',
#     'local-authority:DST', 'local-authority:DOV', 'local-authority:LIV', 'local-authority:RDB',
#     'local-authority:WFT', 'local-authority:NLN', 'local-authority:NSM', 'local-authority:SLF',
#     'local-authority:WRL' ]

In [19]:
def get_funded_organisations():
    params = urllib.parse.urlencode({
        "sql": f"""
        select organisation, name, statistical_geography
        from organisation 
        where organisation in (
            select distinct organisation 
            from provision 
            where provision_reason = "expected")
        order by organisation
        """,
        "_size": "max"
        })
    url = f"https://datasette.planning.data.gov.uk/digital-land.csv?{params}"
    df = pd.read_csv(url)
    return df

funded_orgs_df = get_funded_organisations()

In [25]:
print(len(funded_orgs_df))
print(len(funded_orgs_df[funded_orgs_df["organisation"].isin(organisation_list)]))
print(len(organisation_list))

31
27
28


In [26]:
funded_orgs_df[~funded_orgs_df["organisation"].isin(organisation_list)]

Unnamed: 0,organisation,name,statistical_geography
2,local-authority:BOS,Bolsover District Council,E07000033
18,local-authority:NBL,Northumberland County Council,E06000057
19,local-authority:NED,North East Derbyshire District Council,E07000038
23,local-authority:RED,Redditch Borough Council,E07000236


In [35]:
set(organisation_list).difference(funded_orgs_df["organisation"].to_list())

{'local-authority:RDB'}

In [77]:
organisation_list = funded_orgs_df["organisation_old"].to_list()

In [5]:
def check_columns_in_endpoint(fields, dataset_field_df, column_field_df, dataset):
    dataset_columns = dataset_field_df['field'].tolist()
    # Remove automatically assigned columns by the pipeline from scoring
    dataset_columns = remove_assigned_columns(dataset, dataset_columns)
    
    missing_columns = []
    present_columns = []
    # Count whether columns in the specification are present in the endpoint
    for column in dataset_columns:
        if column not in fields:
            missing_columns.append(column)
        else:
            present_columns.append(column)
    structure_score = f"{len(dataset_columns) - len(missing_columns)}/{len(dataset_columns)}"
    structure_percentage = (len(dataset_columns) - len(missing_columns)) / len(dataset_columns) * 100

    # The WKT column is removed from the column_field mapping as it is autogenerated by the pipeline for some file formats (e.g geojson)
    filtered_columns = ["WKT"]
    column_field_df = column_field_df[-column_field_df['column'].isin(filtered_columns)]

    mapped_fields = column_field_df['field'].tolist()
    # print("present columns: ", present_columns)
    # print("missing columns: ", missing_columns)
    # print("column field mapping: \n", column_field_df)
    correct_column_names = 0
    for field in present_columns:
        # If a field isn't present in the mapped fields it is correctly named
        # Or if the column name is the same as the field name it is correctly named
        if field not in mapped_fields or column_field_df[column_field_df['field'] == field]['column'].tolist()[0] == field:
            correct_column_names += 1
    
    column_score = f"{correct_column_names}/{len(dataset_columns)}"
    column_percentage = (correct_column_names)/ len(dataset_columns)*100
   
    return structure_score, structure_percentage, column_score, column_percentage


def get_fields_for_resource(resource, dataset):
    datasette_url = "https://datasette.planning.data.gov.uk/"
    params = urllib.parse.urlencode({
        "sql": f"""
        select f.field 
        from 
            fact_resource fr
            inner join fact f on fr.fact = f.fact
        where 
            resource = '{resource}'
        group by
            f.field
        """,
        "_size": "max"
    })
    url = f"{datasette_url}{dataset}.csv?{params}"
    facts_df = pd.read_csv(url)
    facts_list = facts_df['field'].tolist()
    return facts_list

def get_column_mappings_for_resource(resource, dataset):
    datasette_url = "https://datasette.planning.data.gov.uk/"
    params = urllib.parse.urlencode({
        "sql": f"""
        select column, field
        from 
          column_field  
        where 
            resource = '{resource}'
        """,
        "_size": "max"
    })
    url = f"{datasette_url}{dataset}.csv?{params}"
    column_field_df = pd.read_csv(url)
    return column_field_df

def remove_assigned_columns(dataset, dataset_columns):
    # These columns are auto generated by the pipeline therefore not used in the scoring
    dataset_columns.remove('entity')
    dataset_columns.remove('organisation')
    dataset_columns.remove('prefix')
    if dataset != "tree" and "point" in dataset_columns:
        dataset_columns.remove('point')
    return dataset_columns

Get list of organisation names, to be displayed in the output table. This is gathered separately from the main data, to ensure that if an organisation has not provided any endpoints, it is still included in the output table.

In [79]:
# Get organisation names for output table
organisation_info_df = pd.read_csv('https://raw.githubusercontent.com/digital-land/organisation-collection/main/data/local-authority.csv')
organisation_info_df.head()
organisation_name_dict = {}
for organisation in organisation_list:
    organisation_code = organisation.split(':')[1]
    organisation_name = organisation_info_df.loc[organisation_info_df['reference'] == organisation_code].iloc[0]['name']
    organisation_name_dict[organisation] = organisation_name

In [378]:
organisation_info_df.head()

Unnamed: 0,addressbase-custodian,billing-authority,combined-authority,end-date,entity,entry-date,esd-inventory,local-authority-district,local-authority-type,local-enterprise-partnership,...,parliament-thesaurus,prefix,reference,region,start-date,statistical-geography,twitter,website,wikidata,wikipedia
0,3805.0,E3831,,,26,2023-11-19,,E07000223,NMD,,...,453.0,local-authority,ADU,south-east,,E07000223,,https://www.adur-worthing.gov.uk,Q72980889,
1,905.0,E0931,,2023-03-31,27,2023-11-19,,E07000026,NMD,,...,1131.0,local-authority,ALL,north-west,,E07000026,allerdale,https://www.allerdale.gov.uk,Q72980920,
2,1005.0,E1031,,,28,2023-11-19,,E07000032,NMD,,...,1257.0,local-authority,AMB,east-midlands,,E07000032,AmberValleyBC,https://www.ambervalley.gov.uk,Q72980961,
3,3810.0,E3832,,,29,2023-11-19,,E07000224,NMD,,...,1925.0,local-authority,ARU,south-east,,E07000224,ArunDistrict,https://www.arun.gov.uk,Q72980967,
4,2205.0,E2231,,,30,2023-11-19,,E07000105,NMD,,...,1962.0,local-authority,ASF,south-east,,E07000105,AshfordCouncil,https://www.ashford.gov.uk,Q55098926,Ashford_Borough_Council


In [380]:
# organisation_name_dict

In [60]:
def get_funded_organisations():
    params = urllib.parse.urlencode({
        "sql": f"""
        select organisation, name, statistical_geography
        from organisation   
        where organisation in (
            select distinct organisation 
            from provision 
            where cohort IN (
                "ODP-Track1",
                "RIPA-BOPS",
                "ODP-Track3",
                "ODP-Track2"
            )
            and provision_reason = "expected")
        order by organisation
        """,
        "_size": "max"
        })
    url = f"https://datasette.planning.data.gov.uk/digital-land.csv?{params}"
    df = pd.read_csv(url)
    return df


funded_orgs_df = get_funded_organisations()
# add in old-style "-eng" names
funded_orgs_df["organisation_old"] = funded_orgs_df["organisation"].apply(lambda x: "-eng:".join(x.split(":")))

funded_orgs_df.head()

Unnamed: 0,organisation,name,statistical_geography,organisation_old
0,local-authority:BIR,Birmingham City Council,E08000025,local-authority-eng:BIR
1,local-authority:BNE,London Borough of Barnet,E09000003,local-authority-eng:BNE
2,local-authority:BOS,Bolsover District Council,E07000033,local-authority-eng:BOS
3,local-authority:CAT,Canterbury City Council,E07000106,local-authority-eng:CAT
4,local-authority:CMD,London Borough of Camden,E09000007,local-authority-eng:CMD


In [47]:
# check the difference between the hard-coded list and provision table

print(len(funded_orgs_df))
print(len(organisation_list))

set(organisation_list).difference(funded_orgs_df["organisation"].to_list())

27
28


{'local-authority:BUC',
 'local-authority:LBH',
 'local-authority:RDB',
 'local-authority:SWK'}

In [63]:
funded_orgs_df["organisation_old"].to_list()

['local-authority-eng:BIR',
 'local-authority-eng:BNE',
 'local-authority-eng:BOS',
 'local-authority-eng:CAT',
 'local-authority-eng:CMD',
 'local-authority-eng:DAC',
 'local-authority-eng:DNC',
 'local-authority-eng:DOV',
 'local-authority-eng:DST',
 'local-authority-eng:EPS',
 'local-authority-eng:GAT',
 'local-authority-eng:GLO',
 'local-authority-eng:GRY',
 'local-authority-eng:KTT',
 'local-authority-eng:LIV',
 'local-authority-eng:MDW',
 'local-authority-eng:NED',
 'local-authority-eng:NET',
 'local-authority-eng:NLN',
 'local-authority-eng:NSM',
 'local-authority-eng:RED',
 'local-authority-eng:SAL',
 'local-authority-eng:SLF',
 'local-authority-eng:TEW',
 'local-authority-eng:WBK',
 'local-authority-eng:WFT',
 'local-authority-eng:WRL']

## Latest endpoints table

The latest endpoints are collected for each of the organisations, for the first 4 datasets.

In [65]:
datasette_url = "https://datasette.planning.data.gov.uk/"

# Collect latest endpoints for each organisation
dataset_list = ['article-4-direction', 'article-4-direction-area', 'conservation-area', 'conservation-area-document', 'listed-building-outline', 'tree-preservation-order', 'tree-preservation-zone', 'tree']
pipelines_list = ['article-4-direction', 'article-4-direction-area', 'conservation-area', 'conservation-area-document', 'listed-building-outline', 'tree-preservation-order', 'tree-preservation-zone', 'tree', 'tree,tree-preservation-order', 'tree-preservation-order,tree-preservation-zone']
all_orgs_latest_endpoints={}
for organisation in organisation_list:

    try:
        latest_endpoints_df = get_latest_endpoints(organisation)
        latest_endpoints_df = latest_endpoints_df[latest_endpoints_df['pipelines'].isin(pipelines_list)]
        all_orgs_latest_endpoints[organisation] = latest_endpoints_df

    except:
        all_orgs_latest_endpoints[organisation] = None

In [None]:
# stick dictionary in df
endpoint_latest_df = pd.concat([all_orgs_latest_endpoints[v] for v in all_orgs_latest_endpoints if len(all_orgs_latest_endpoints[v]) > 0])

print(len(endpoint_latest_df))
endpoint_latest_df.head()

73


Unnamed: 0,endpoint_url,status,exception,collection,pipelines,organisation,name,resource,maxentrydate,entrydate,end_date,last_status,last_updated_date,date_last_status_200
0,https://maps.birmingham.gov.uk/server/rest/ser...,200.0,,article-4-direction,article-4-direction-area,local-authority-eng:BIR,Birmingham City Council,7a937605655b895bf9ebfbe29f8e35af8d3f606fd811b4...,2024-02-26 00:15:44+00:00,2023-11-14 00:00:00+00:00,,,,
1,https://maps.birmingham.gov.uk/server/rest/ser...,200.0,,conservation-area,conservation-area,local-authority-eng:BIR,Birmingham City Council,81ed286e34b43d1f9f3053e463a6151224b182538ce98f...,2024-02-26 00:05:43+00:00,2023-11-14 00:00:00+00:00,,,,
0,https://open.barnet.gov.uk/download/20yo8/c6n/...,200.0,,conservation-area,conservation-area,local-authority-eng:BNE,London Borough of Barnet,a480fe98c49bf04ebaee5f3c970dc131b784bdd6a7b81c...,2024-02-26 00:05:43+00:00,2023-11-06 00:00:00+00:00,,,,
3,https://open.barnet.gov.uk/download/e5nge/ktw/...,200.0,,tree-preservation-order,tree-preservation-order,local-authority-eng:BNE,London Borough of Barnet,77fe8c4978ab17814a30f0d6fac7444026f17a14f84c12...,2024-02-27 00:14:39+00:00,2023-11-07 11:11:48+00:00,,,,
2,https://open.barnet.gov.uk/download/2ylny/z7y/...,200.0,,article-4-direction,article-4-direction,local-authority-eng:BNE,London Borough of Barnet,19a54c9320411dc63de45a29f9f947a006c1ddd9076da0...,2024-02-26 00:15:44+00:00,2023-12-18 00:00:00+00:00,,,,


In [383]:
endpoint_latest_df.groupby("status").size()

status
200.0    70
400.0     1
403.0     1
404.0     1
dtype: int64

In [384]:
endpoint_latest_df[endpoint_latest_df["status"] == 400]

Unnamed: 0,endpoint_url,status,exception,collection,pipelines,organisation,name,resource,maxentrydate,entrydate,end_date,last_status,last_updated_date,date_last_status_200,pipelines_list,dataset
1,https://gcty.dynamicmaps.co.uk:8443/geoserver/...,400.0,,listed-building,listed-building-outline,local-authority-eng:GLO,Gloucester City Council,0fe950b55f7ff4425fc051fe9dc5eaa6d7dd18cea1e16b...,2024-02-26 00:15:59+00:00,2022-07-28 13:21:40+00:00,,200,2023-12-18 00:18:21+00:00,,[listed-building-outline],[listed-building-outline]


In [386]:
endpoint_latest_df[endpoint_latest_df["organisation"] == "local-authority-eng:GLO"].values

array([['https://gcty.dynamicmaps.co.uk:8443/geoserver/Digital_Land/ows?service=wfs&request=GetFeature&TypeNames=Digital_Land%3Aarticle_4_DL&OutputFormat=GML2',
        200.0, nan, 'article-4-direction', 'article-4-direction-area',
        'local-authority-eng:GLO', 'Gloucester City Council',
        '2b544d4d84578f6e104ce8802d12c1f85549ff5215826422b2a88bdbc5f7c7c0',
        Timestamp('2024-02-26 00:15:44+0000', tz='UTC'),
        Timestamp('2022-06-30 09:09:45+0000', tz='UTC'), nan, None, None,
        None, list(['article-4-direction-area']),
        list(['article-4-direction-area'])],
       ['https://gcty.dynamicmaps.co.uk:8443/geoserver/Digital_Land/ows?service=wfs&request=GetFeature&TypeNames=Digital_Land%3Alisted_buildings_DL&OutputFormat=GML2',
        400.0, nan, 'listed-building', 'listed-building-outline',
        'local-authority-eng:GLO', 'Gloucester City Council',
        '0fe950b55f7ff4425fc051fe9dc5eaa6d7dd18cea1e16bbb0b996f059b94524b',
        Timestamp('2024-02-26 00

In [382]:
endpoint_latest_df[endpoint_latest_df["organisation"] == "local-authority-eng:BNE"]

Unnamed: 0,endpoint_url,status,exception,collection,pipelines,organisation,name,resource,maxentrydate,entrydate,end_date,last_status,last_updated_date,date_last_status_200,pipelines_list,dataset
0,https://open.barnet.gov.uk/download/20yo8/c6n/...,200.0,,conservation-area,conservation-area,local-authority-eng:BNE,London Borough of Barnet,a480fe98c49bf04ebaee5f3c970dc131b784bdd6a7b81c...,2024-02-26 00:05:43+00:00,2023-11-06 00:00:00+00:00,,,,,[conservation-area],[conservation-area]
3,https://open.barnet.gov.uk/download/e5nge/ktw/...,200.0,,tree-preservation-order,tree-preservation-order,local-authority-eng:BNE,London Borough of Barnet,77fe8c4978ab17814a30f0d6fac7444026f17a14f84c12...,2024-02-27 00:14:39+00:00,2023-11-07 11:11:48+00:00,,,,,[tree-preservation-order],[tree-preservation-order]
2,https://open.barnet.gov.uk/download/2ylny/z7y/...,200.0,,article-4-direction,article-4-direction,local-authority-eng:BNE,London Borough of Barnet,19a54c9320411dc63de45a29f9f947a006c1ddd9076da0...,2024-02-26 00:15:44+00:00,2023-12-18 00:00:00+00:00,,,,,[article-4-direction],[article-4-direction]
4,https://open.barnet.gov.uk/download/2w6jz/ztc/...,200.0,,listed-building,listed-building-outline,local-authority-eng:BNE,London Borough of Barnet,b7f316d572822aed57dd631bed064b178a3aaeed3151c1...,2024-02-26 00:15:59+00:00,2023-12-18 00:00:00+00:00,,,,,[listed-building-outline],[listed-building-outline]
5,https://open.barnet.gov.uk/download/e5l77/dhv/...,200.0,,article-4-direction,article-4-direction-area,local-authority-eng:BNE,London Borough of Barnet,8370346f35a81b8b3509f4e3645bb98e43951d09c5cf61...,2024-02-26 00:15:44+00:00,2023-12-18 00:00:00+00:00,,,,,[article-4-direction-area],[article-4-direction-area]


In [210]:
# explode out the 
endpoint_latest_df["dataset"] = endpoint_latest_df["pipelines"].str.split(",")
endpoint_latest_long_df = endpoint_latest_df.explode("dataset", ignore_index=True)

print(len(endpoint_latest_df))
print(len(endpoint_latest_long_df))
endpoint_latest_long_df.head()

73
77


Unnamed: 0,endpoint_url,status,exception,collection,pipelines,organisation,name,resource,maxentrydate,entrydate,end_date,last_status,last_updated_date,date_last_status_200,pipelines_list,dataset
0,https://maps.birmingham.gov.uk/server/rest/ser...,200.0,,article-4-direction,article-4-direction-area,local-authority-eng:BIR,Birmingham City Council,7a937605655b895bf9ebfbe29f8e35af8d3f606fd811b4...,2024-02-26 00:15:44+00:00,2023-11-14 00:00:00+00:00,,,,,[article-4-direction-area],article-4-direction-area
1,https://maps.birmingham.gov.uk/server/rest/ser...,200.0,,conservation-area,conservation-area,local-authority-eng:BIR,Birmingham City Council,81ed286e34b43d1f9f3053e463a6151224b182538ce98f...,2024-02-26 00:05:43+00:00,2023-11-14 00:00:00+00:00,,,,,[conservation-area],conservation-area
2,https://open.barnet.gov.uk/download/20yo8/c6n/...,200.0,,conservation-area,conservation-area,local-authority-eng:BNE,London Borough of Barnet,a480fe98c49bf04ebaee5f3c970dc131b784bdd6a7b81c...,2024-02-26 00:05:43+00:00,2023-11-06 00:00:00+00:00,,,,,[conservation-area],conservation-area
3,https://open.barnet.gov.uk/download/e5nge/ktw/...,200.0,,tree-preservation-order,tree-preservation-order,local-authority-eng:BNE,London Borough of Barnet,77fe8c4978ab17814a30f0d6fac7444026f17a14f84c12...,2024-02-27 00:14:39+00:00,2023-11-07 11:11:48+00:00,,,,,[tree-preservation-order],tree-preservation-order
4,https://open.barnet.gov.uk/download/2ylny/z7y/...,200.0,,article-4-direction,article-4-direction,local-authority-eng:BNE,London Borough of Barnet,19a54c9320411dc63de45a29f9f947a006c1ddd9076da0...,2024-02-26 00:15:44+00:00,2023-12-18 00:00:00+00:00,,,,,[article-4-direction],article-4-direction


In [212]:
org_dataset_count = endpoint_latest_long_df.groupby(["organisation", "dataset"]).size().reset_index(name = "count")

org_dataset_count[org_dataset_count["count"] > 1]

Unnamed: 0,organisation,dataset,count
55,local-authority-eng:NET,tree-preservation-order,2


In [216]:
resource_count = endpoint_latest_long_df.groupby(["resource"]).size().reset_index(name = "count")

resource_dupes = org_dataset_count[org_dataset_count["count"] > 1]

# look at records which have resource dupes
endpoint_latest_long_df[endpoint_latest_long_df["resource"].isin(resource_dupes["resource"])][
    ["status", "collection", "dataset", "name", "resource", "entrydate", "maxentrydate"]
]

Unnamed: 0,status,collection,dataset,name,resource,entrydate,maxentrydate
9,404.0,tree-preservation-order,tree-preservation-order,Canterbury City Council,00607d7d6a79999730f9f00755d021dbbeeafa74e7091c...,2021-12-01 18:18:46+00:00,2024-02-27 00:14:39+00:00
10,404.0,tree-preservation-order,tree-preservation-zone,Canterbury City Council,00607d7d6a79999730f9f00755d021dbbeeafa74e7091c...,2021-12-01 18:18:46+00:00,2024-02-27 00:14:39+00:00
45,200.0,tree-preservation-order,tree-preservation-order,Medway Council,06137759c0934894ce7661bbdecdc815cf8532ea9d1cd8...,2022-07-01 07:07:56+00:00,2024-02-27 00:14:39+00:00
46,200.0,tree-preservation-order,tree-preservation-zone,Medway Council,06137759c0934894ce7661bbdecdc815cf8532ea9d1cd8...,2022-07-01 07:07:56+00:00,2024-02-27 00:14:39+00:00
50,200.0,tree-preservation-order,tree,Newcastle City Council,f74669bf0c4f1f2687cb1ef8c272f108cb0cc0fbb81c19...,2022-07-01 04:04:17+00:00,2024-02-27 00:14:39+00:00
51,200.0,tree-preservation-order,tree-preservation-order,Newcastle City Council,f74669bf0c4f1f2687cb1ef8c272f108cb0cc0fbb81c19...,2022-07-01 04:04:17+00:00,2024-02-27 00:14:39+00:00
52,200.0,tree-preservation-order,tree-preservation-order,Newcastle City Council,0c035f369a58fefe56046a87edf4f9429ae7c95aefc6a8...,2022-07-01 05:05:46+00:00,2024-02-27 00:14:39+00:00
53,200.0,tree-preservation-order,tree-preservation-zone,Newcastle City Council,0c035f369a58fefe56046a87edf4f9429ae7c95aefc6a8...,2022-07-01 05:05:46+00:00,2024-02-27 00:14:39+00:00


## Resource fields and mapping tables

In [322]:
# generic function to try the resource datasette queries 
# will return a df with resource and dataset fields as keys, and query results as other fields
# def try_results(function, resource, dataset):

#     try:
#         df = function(resource, dataset)
#         df["resource"] = resource
#         df["dataset"] = dataset

#     except:
#         df = pd.DataFrame({"resource" : [resource],
#                            "dataset" : [dataset]
#         })

#     return df



# results_col_map = [try_results(get_column_mappings_for_resource, r["resource"], r["dataset"]) for index, r in endpoint_latest_long_df.iterrows()]
# results_field_resource = [try_results(get_fields_for_resource, r["resource"], r["dataset"]) for index, r in endpoint_latest_long_df.iterrows()]

# concat the results, resources which errored with have NaNs in query results fields
results_col_map_df = pd.concat(results_col_map)
results_field_resource_df = pd.concat(results_field_resource)

# no. of resources in each query response array
print(len(results_col_map))
print(len(results_field_resource))

# no of records in each results df
print(len(results_col_map_df))
print(len(results_field_resource_df))


77
77
532
525


In [301]:
# number of distinct resources in each table
print(len(results_col_map_df[["resource"]].drop_duplicates()))
print(len(results_field_resource_df[["resource"]].drop_duplicates()))

71
62


In [307]:
# resources which are in the column mapping df but not in the fields one
results_col_map_df[~results_col_map_df["resource"].isin(results_field_resource_df["resource"].drop_duplicates())].sort_values("resource")

Unnamed: 0,column,field,resource,dataset
0,Description,description,00607d7d6a79999730f9f00755d021dbbeeafa74e7091c...,tree-preservation-order
0,WKT,geometry,00607d7d6a79999730f9f00755d021dbbeeafa74e7091c...,tree-preservation-zone
1,Description,description,00607d7d6a79999730f9f00755d021dbbeeafa74e7091c...,tree-preservation-zone
2,tree-species-list,tree-species-list,00607d7d6a79999730f9f00755d021dbbeeafa74e7091c...,tree-preservation-zone
0,WKT,geometry,00636f358a59a99853e97e0e3dd059cbddb36b8b6da290...,listed-building-outline
1,NAME,name,00636f358a59a99853e97e0e3dd059cbddb36b8b6da290...,listed-building-outline
0,ADDRESS,address-text,05182443ad8ea72ec17fd2f46dd6e19126e86ddbc2d5f3...,tree
1,Comment,notes,05182443ad8ea72ec17fd2f46dd6e19126e86ddbc2d5f3...,tree
2,DESCRIPT,name,05182443ad8ea72ec17fd2f46dd6e19126e86ddbc2d5f3...,tree
3,Species,tree-species,05182443ad8ea72ec17fd2f46dd6e19126e86ddbc2d5f3...,tree


In [345]:
# there aren't any resources in the field table which aren't in the col map table
results_field_resource_df[~results_field_resource_df["resource"].isin(results_col_map_df["resource"].drop_duplicates())].sort_values("resource")

Unnamed: 0,field,resource,dataset


In [304]:
results_field_resource_df[results_field_resource_df["resource"] == "17934a9db4021f3877b28ae80f03414d78fb56352f38c1ab2afeed8006ecaae6"]

Unnamed: 0,field,resource,dataset


In [308]:
get_fields_for_resource("17934a9db4021f3877b28ae80f03414d78fb56352f38c1ab2afeed8006ecaae6", "tree-preservation-order")

Unnamed: 0,field,resource


Question for tomorrow - why would a resource be in the field mapping table but not in the fields table..??

In [310]:
# add in match field for column mappings 
results_col_map_df["match"] = np.where(
        (results_col_map_df["field"].isin(["geometry", "point"])) |
        (results_col_map_df["field"] == results_col_map_df["column"]),
        1, 
        0
)

# add in flag for fields present
results_field_resource_df["field_present"] = 1

In [316]:
results_col_map_df.head()

Unnamed: 0,column,field,resource,dataset,match
0,NAME,name,7a937605655b895bf9ebfbe29f8e35af8d3f606fd811b4...,article-4-direction-area,0
1,START_DATE,start-date,7a937605655b895bf9ebfbe29f8e35af8d3f606fd811b4...,article-4-direction-area,0
2,WKT,geometry,7a937605655b895bf9ebfbe29f8e35af8d3f606fd811b4...,article-4-direction-area,1
3,ADDRESS_TEXT,address-text,7a937605655b895bf9ebfbe29f8e35af8d3f606fd811b4...,article-4-direction-area,0
4,ARTICLE_4_DIRECTION,article-4-direction,7a937605655b895bf9ebfbe29f8e35af8d3f606fd811b4...,article-4-direction-area,0


### Checking data in fields vs mapping tables

In [336]:
# Taking an example of a single resource - 7a937605655b895bf9ebfbe29f8e35af8d3f606fd811b42867251d61ff15b693
# the column mapping table contains 11 fields

results_col_map_df[results_col_map_df["resource"]== "7a937605655b895bf9ebfbe29f8e35af8d3f606fd811b42867251d61ff15b693"].sort_values("field")

Unnamed: 0,column,field,resource,dataset
3,ADDRESS_TEXT,address-text,7a937605655b895bf9ebfbe29f8e35af8d3f606fd811b4...,article-4-direction-area
4,ARTICLE_4_DIRECTION,article-4-direction,7a937605655b895bf9ebfbe29f8e35af8d3f606fd811b4...,article-4-direction-area
5,END_DATE,end-date,7a937605655b895bf9ebfbe29f8e35af8d3f606fd811b4...,article-4-direction-area
6,ENTRY_DATE,entry-date,7a937605655b895bf9ebfbe29f8e35af8d3f606fd811b4...,article-4-direction-area
2,WKT,geometry,7a937605655b895bf9ebfbe29f8e35af8d3f606fd811b4...,article-4-direction-area
0,NAME,name,7a937605655b895bf9ebfbe29f8e35af8d3f606fd811b4...,article-4-direction-area
7,NOTES,notes,7a937605655b895bf9ebfbe29f8e35af8d3f606fd811b4...,article-4-direction-area
8,PERMITTED_DEVELOPMENT_RIGHTS,permitted-development-rights,7a937605655b895bf9ebfbe29f8e35af8d3f606fd811b4...,article-4-direction-area
9,REFERENCE,reference,7a937605655b895bf9ebfbe29f8e35af8d3f606fd811b4...,article-4-direction-area
1,START_DATE,start-date,7a937605655b895bf9ebfbe29f8e35af8d3f606fd811b4...,article-4-direction-area


In [337]:
# the field table only contains 8 fields
results_field_resource_df[results_field_resource_df["resource"] == "7a937605655b895bf9ebfbe29f8e35af8d3f606fd811b42867251d61ff15b693"].sort_values("field")

Unnamed: 0,field,resource,dataset
0,entry-date,7a937605655b895bf9ebfbe29f8e35af8d3f606fd811b4...,article-4-direction-area
1,geometry,7a937605655b895bf9ebfbe29f8e35af8d3f606fd811b4...,article-4-direction-area
2,name,7a937605655b895bf9ebfbe29f8e35af8d3f606fd811b4...,article-4-direction-area
3,organisation,7a937605655b895bf9ebfbe29f8e35af8d3f606fd811b4...,article-4-direction-area
4,permitted-development-rights,7a937605655b895bf9ebfbe29f8e35af8d3f606fd811b4...,article-4-direction-area
5,prefix,7a937605655b895bf9ebfbe29f8e35af8d3f606fd811b4...,article-4-direction-area
6,reference,7a937605655b895bf9ebfbe29f8e35af8d3f606fd811b4...,article-4-direction-area
7,start-date,7a937605655b895bf9ebfbe29f8e35af8d3f606fd811b4...,article-4-direction-area


In [338]:
# checking the endpoint itself we can see that the 8 fields above are the 6 matched fields with values, plus organisation and prefix

import geopandas as gpd

bm_af_df = gpd.read_file("https://maps.birmingham.gov.uk/server/rest/services/planx/PlanX/FeatureServer/0/query?where=1=1&outfields=*&f=geojson")

bm_af_df.head()

Unnamed: 0,PRIMARYINDEX,REFERENCE,NAME,NOTES,START_DATE,END_DATE,ENTRY_DATE,UPRN,ADDRESS_TEXT,ARTICLE_4_DIRECTION,ARTICLE_4_DIRECTION_RULES,PERMITTED_DEVELOPMENT_RIGHTS,SE_ANNO_CAD_DATA,geometry
0,1,5,"OLD YARDLEY CONSERVATION AREA, ARTICLE 4 (2)",,1969-07-17,,2023-09-01,,,,,1A;1D;1E;1F;2A;2B;2C,,"POLYGON ((-1.80216 52.47610, -1.80230 52.47611..."
1,2,8,"BOURNVILLE VILLAGE CONSERVATION AREA, ARTICLE...",,1996-05-09,,2023-09-01,,,,,1A;1C;1D;1E;1F;1G;1H;2A;2C,,"MULTIPOLYGON (((-1.93209 52.43199, -1.93203 52..."
2,3,13,"HIGH STREET, SUTTON COLDFIELD CONSERVATION ARE...",,1999-10-14,,2023-09-01,,,,,1A;1C;1D;1F;1H;2C;31B,,"POLYGON ((-1.82973 52.56513, -1.82972 52.56512..."
3,4,30,"AUSTIN VILLAGE CONSERVATION AREA, ARTICLE 4 (2)",,1997-07-17,,2023-09-01,,,,,1D;1C;1B;1F;1H;1A;2C,,"POLYGON ((-1.97335 52.40225, -1.97342 52.40220..."
4,5,15,"EDGBASTON CONSERVATION AREA, ARTICLE 4 (2)",,1975-09-04,,2023-09-01,,,,,1A;1B;1C;1D;1E;1F;1G;2A;2C,,"POLYGON ((-1.91295 52.46975, -1.91296 52.46973..."


In [365]:
results_col_count = results_col_map_df.groupby(["resource"]).size().reset_index(name = "col_map_count")
results_field_count = results_field_resource_df.groupby(["resource"]).size().reset_index(name = "field_count")

col_field_comp_df = results_col_count.merge(
    results_field_count,
    how = "left", 
    on = "resource"
)

# col_field_comp_df.replace(np.nan, 0, inplace=True)

col_field_comp_df["difference"] = col_field_comp_df["col_map_count"] - col_field_comp_df["field_count"]

col_field_comp_df.head(10)

Unnamed: 0,resource,col_map_count,field_count,difference
0,0006c9022c2b832138e71ebdbd972e73e5afdcb4252efc...,4,6.0,-2.0
1,004e273e15af7f9c5ffe43cda70764da076e53c090c128...,10,9.0,1.0
2,00607d7d6a79999730f9f00755d021dbbeeafa74e7091c...,4,,
3,00636f358a59a99853e97e0e3dd059cbddb36b8b6da290...,2,,
4,021f8ddc881f7bb014e0633b818931fe407ee1fef9a68c...,9,10.0,-1.0
5,034a72fde5341f1f5806b4d5b3138911271993495ecdd3...,8,8.0,0.0
6,037fe7b3bce838a2831ba7eda4d9de9ec5492d58a66a7f...,7,10.0,-3.0
7,0384db1ec63fefcb3c64275b6c942296b4220a0c956611...,7,9.0,-2.0
8,05182443ad8ea72ec17fd2f46dd6e19126e86ddbc2d5f3...,5,,
9,0519df49c2ecc3c53948b4283704bfd5b905ac4db6e4b5...,12,,


In [373]:
col_field_comp_df.iloc[1, 0]

'004e273e15af7f9c5ffe43cda70764da076e53c090c128f937031e63c7ce7a8d'

In [377]:
endpoint_latest_long_df[endpoint_latest_long_df["resource"] == "0519df49c2ecc3c53948b4283704bfd5b905ac4db6e4b5a0ae709c1fc495bc81"].values

array([['https://services-eu1.arcgis.com/xk4RA36G57mVH7Aw/ArcGIS/rest/services/TPO_National_Planning_Map_view/FeatureServer/0/query?where=1%3D1&f=geojson&outFields=*',
        200.0, nan, 'tree-preservation-order', 'tree-preservation-zone',
        'local-authority-eng:DOV', 'Dover District Council',
        '0519df49c2ecc3c53948b4283704bfd5b905ac4db6e4b5a0ae709c1fc495bc81',
        Timestamp('2024-02-27 00:14:39+0000', tz='UTC'),
        Timestamp('2023-10-11 11:11:16+0000', tz='UTC'), nan, None, None,
        None, list(['tree-preservation-zone']), 'tree-preservation-zone']],
      dtype=object)

In [366]:
endpoint_latest_long_df[endpoint_latest_long_df["organisation"] == "local-authority-eng:DOV"]

Unnamed: 0,endpoint_url,status,exception,collection,pipelines,organisation,name,resource,maxentrydate,entrydate,end_date,last_status,last_updated_date,date_last_status_200,pipelines_list,dataset
22,https://services-eu1.arcgis.com/xk4RA36G57mVH7...,200.0,,conservation-area,conservation-area,local-authority-eng:DOV,Dover District Council,8a3f5d86f38efaa0f6e5d24d6b73f7f5a9f76bc84f4874...,2024-02-26 00:05:43+00:00,2023-10-10 10:10:10+00:00,,,,,[conservation-area],conservation-area
23,https://services-eu1.arcgis.com/xk4RA36G57mVH7...,200.0,,tree-preservation-order,tree-preservation-zone,local-authority-eng:DOV,Dover District Council,0519df49c2ecc3c53948b4283704bfd5b905ac4db6e4b5...,2024-02-27 00:14:39+00:00,2023-10-11 11:11:16+00:00,,,,,[tree-preservation-zone],tree-preservation-zone
24,https://services-eu1.arcgis.com/xk4RA36G57mVH7...,200.0,,listed-building,listed-building-outline,local-authority-eng:DOV,Dover District Council,a06aa3323c58f5c61626f83c0c72e7fb86310908769e66...,2024-02-26 00:15:59+00:00,2023-10-13 14:14:45+00:00,,,,,[listed-building-outline],listed-building-outline
25,https://services-eu1.arcgis.com/xk4RA36G57mVH7...,200.0,,tree-preservation-order,tree,local-authority-eng:DOV,Dover District Council,12d72e771b966bc0d9234fc76bf8adcd454240600376ce...,2024-02-27 00:14:39+00:00,2023-10-26 12:12:45+00:00,,,,,[tree],tree
26,https://services-eu1.arcgis.com/xk4RA36G57mVH7...,200.0,,article-4-direction,article-4-direction-area,local-authority-eng:DOV,Dover District Council,089b6c5603e250d4e3c0d4270baa8d3ce6a2be705c67c4...,2024-02-26 00:15:44+00:00,2023-12-21 11:11:26+00:00,,,,,[article-4-direction-area],article-4-direction-area
27,https://services-eu1.arcgis.com/xk4RA36G57mVH7...,200.0,,article-4-direction,article-4-direction,local-authority-eng:DOV,Dover District Council,021f8ddc881f7bb014e0633b818931fe407ee1fef9a68c...,2024-02-26 00:15:44+00:00,2023-12-21 11:11:47+00:00,,,,,[article-4-direction],article-4-direction
28,https://services-eu1.arcgis.com/xk4RA36G57mVH7...,200.0,,tree-preservation-order,tree-preservation-order,local-authority-eng:DOV,Dover District Council,2405352ba05c212e9734c05d03ca1bf9500a346b600878...,2024-02-27 00:14:39+00:00,2023-12-22 10:10:13+00:00,,,,,[tree-preservation-order],tree-preservation-order


In [370]:
dov_endpoints_df = get_endpoints("local-authority-eng:DOV")

print(len(dov_endpoints_df))
dov_endpoints_df[dov_endpoints_df["collection"] == "tree-preservation-order"]

24


Unnamed: 0,endpoint_url,status,exception,collection,pipelines,organisation,name,resource,maxentrydate,entrydate,end_date
1,https://services-eu1.arcgis.com/xk4RA36G57mVH7...,200.0,,tree-preservation-order,tree,local-authority-eng:DOV,Dover District Council,12d72e771b966bc0d9234fc76bf8adcd454240600376ce...,2024-02-29T00:14:46Z,2023-10-26T12:12:45Z,
5,https://services-eu1.arcgis.com/xk4RA36G57mVH7...,200.0,,tree-preservation-order,tree-preservation-order,local-authority-eng:DOV,Dover District Council,f0082ba711ef431ccd5cc0c23c8c643fcbd6aec1c37161...,2023-12-22T00:14:17Z,2023-12-21T09:09:57Z,2023-12-22
18,https://services-eu1.arcgis.com/xk4RA36G57mVH7...,200.0,,tree-preservation-order,tree-preservation-zone,local-authority-eng:DOV,Dover District Council,0519df49c2ecc3c53948b4283704bfd5b905ac4db6e4b5...,2024-02-29T00:14:46Z,2023-10-11T11:11:16Z,
23,https://services-eu1.arcgis.com/xk4RA36G57mVH7...,200.0,,tree-preservation-order,tree-preservation-order,local-authority-eng:DOV,Dover District Council,2405352ba05c212e9734c05d03ca1bf9500a346b600878...,2024-02-29T00:14:46Z,2023-12-22T10:10:13Z,


In [372]:
get_fields_for_resource("0519df49c2ecc3c53948b4283704bfd5b905ac4db6e4b5a0ae709c1fc495bc81", "tree-preservation-zone")

Unnamed: 0,field,resource


In [None]:
get_fields_for_resource("3327be0c6d46fb1ebb8c77a9b3344dd12be4bd59e9abf546966fcdc552c4282c", "tree-preservation-zone")

Unnamed: 0,field,resource
0,address-text,3327be0c6d46fb1ebb8c77a9b3344dd12be4bd59e9abf5...
1,description,3327be0c6d46fb1ebb8c77a9b3344dd12be4bd59e9abf5...
2,entry-date,3327be0c6d46fb1ebb8c77a9b3344dd12be4bd59e9abf5...
3,geometry,3327be0c6d46fb1ebb8c77a9b3344dd12be4bd59e9abf5...
4,name,3327be0c6d46fb1ebb8c77a9b3344dd12be4bd59e9abf5...
5,organisation,3327be0c6d46fb1ebb8c77a9b3344dd12be4bd59e9abf5...
6,prefix,3327be0c6d46fb1ebb8c77a9b3344dd12be4bd59e9abf5...
7,reference,3327be0c6d46fb1ebb8c77a9b3344dd12be4bd59e9abf5...
8,start-date,3327be0c6d46fb1ebb8c77a9b3344dd12be4bd59e9abf5...
9,tree-preservation-order,3327be0c6d46fb1ebb8c77a9b3344dd12be4bd59e9abf5...


In [376]:
get_fields_for_resource("004e273e15af7f9c5ffe43cda70764da076e53c090c128f937031e63c7ce7a8d", "article-4-direction-area")

Unnamed: 0,field,resource
0,article-4-direction,004e273e15af7f9c5ffe43cda70764da076e53c090c128...
1,entry-date,004e273e15af7f9c5ffe43cda70764da076e53c090c128...
2,geometry,004e273e15af7f9c5ffe43cda70764da076e53c090c128...
3,name,004e273e15af7f9c5ffe43cda70764da076e53c090c128...
4,organisation,004e273e15af7f9c5ffe43cda70764da076e53c090c128...
5,prefix,004e273e15af7f9c5ffe43cda70764da076e53c090c128...
6,reference,004e273e15af7f9c5ffe43cda70764da076e53c090c128...
7,start-date,004e273e15af7f9c5ffe43cda70764da076e53c090c128...
8,uprn,004e273e15af7f9c5ffe43cda70764da076e53c090c128...


In [237]:
results_col_map_df[["column", "field", "match"]].drop_duplicates()

Unnamed: 0,column,field,match
0,NAME,name,0
1,START_DATE,start-date,0
2,WKT,geometry,1
3,ADDRESS_TEXT,address-text,0
4,ARTICLE_4_DIRECTION,article-4-direction,0
...,...,...,...
1,document-type,document-type,1
2,documentation_URL,documentation-url,0
0,REF,reference,0
1,S_NAME,name,0


In [235]:
# check how geometry fields are mapped
results_col_map_df[results_col_map_df["field"] == "geometry"][["column", "field", "match"]].drop_duplicates()

Unnamed: 0,column,field,match
2,WKT,geometry,1
2,geometry,geometry,1
2,Geometry,geometry,1


In [217]:
results_field_resource_df.head()

Unnamed: 0,field,resource,dataset,field_present
0,entry-date,7a937605655b895bf9ebfbe29f8e35af8d3f606fd811b4...,article-4-direction-area,1
1,geometry,7a937605655b895bf9ebfbe29f8e35af8d3f606fd811b4...,article-4-direction-area,1
2,name,7a937605655b895bf9ebfbe29f8e35af8d3f606fd811b4...,article-4-direction-area,1
3,organisation,7a937605655b895bf9ebfbe29f8e35af8d3f606fd811b4...,article-4-direction-area,1
4,permitted-development-rights,7a937605655b895bf9ebfbe29f8e35af8d3f606fd811b4...,article-4-direction-area,1


In [209]:
endpoint_latest_long_df.head()

Unnamed: 0,endpoint_url,status,exception,collection,pipelines,organisation,name,resource,maxentrydate,entrydate,end_date,last_status,last_updated_date,date_last_status_200,pipelines_list,dataset
0,https://maps.birmingham.gov.uk/server/rest/ser...,200.0,,article-4-direction,article-4-direction-area,local-authority-eng:BIR,Birmingham City Council,7a937605655b895bf9ebfbe29f8e35af8d3f606fd811b4...,2024-02-26 00:15:44+00:00,2023-11-14 00:00:00+00:00,,,,,[article-4-direction-area],article-4-direction-area
1,https://maps.birmingham.gov.uk/server/rest/ser...,200.0,,conservation-area,conservation-area,local-authority-eng:BIR,Birmingham City Council,81ed286e34b43d1f9f3053e463a6151224b182538ce98f...,2024-02-26 00:05:43+00:00,2023-11-14 00:00:00+00:00,,,,,[conservation-area],conservation-area
2,https://open.barnet.gov.uk/download/20yo8/c6n/...,200.0,,conservation-area,conservation-area,local-authority-eng:BNE,London Borough of Barnet,a480fe98c49bf04ebaee5f3c970dc131b784bdd6a7b81c...,2024-02-26 00:05:43+00:00,2023-11-06 00:00:00+00:00,,,,,[conservation-area],conservation-area
3,https://open.barnet.gov.uk/download/e5nge/ktw/...,200.0,,tree-preservation-order,tree-preservation-order,local-authority-eng:BNE,London Borough of Barnet,77fe8c4978ab17814a30f0d6fac7444026f17a14f84c12...,2024-02-27 00:14:39+00:00,2023-11-07 11:11:48+00:00,,,,,[tree-preservation-order],tree-preservation-order
4,https://open.barnet.gov.uk/download/2ylny/z7y/...,200.0,,article-4-direction,article-4-direction,local-authority-eng:BNE,London Borough of Barnet,19a54c9320411dc63de45a29f9f947a006c1ddd9076da0...,2024-02-26 00:15:44+00:00,2023-12-18 00:00:00+00:00,,,,,[article-4-direction],article-4-direction


## Calculating match rates

In [170]:
dataset_field_df = pd.read_csv('https://raw.githubusercontent.com/digital-land/specification/main/specification/dataset-field.csv')

# remove the pipeline-created fields from the spec field table
# ("entity", "organisation", "prefix", "point" for all but tree, and
#  "entity", "organisation", "prefix" for tree)
dataset_field_subset_df = dataset_field_df[((dataset_field_df["dataset"] != "tree") & (~dataset_field_df["field"].isin(["entity", "organisation", "prefix", "point"])) |
                  (dataset_field_df["dataset"] == "tree") & (~dataset_field_df["field"].isin(["entity", "organisation", "prefix"])))]

dataset_field_df.head()

Unnamed: 0,dataset,field,field-dataset,guidance,hint
0,address,address,,,
1,address,address-text,,,
2,address,end-date,,,
3,address,entity,,,
4,address,entry-date,,,


In [253]:
# left join on all fields that each dataset should have
resource_spec_fields_df = endpoint_latest_long_df[["organisation", "name", "dataset", "resource"]].merge(
    dataset_field_subset_df[["dataset", "field"]],
    how = "left",
    on = "dataset"
)

print(len(resource_spec_fields_df))
resource_spec_fields_df.head()

973


Unnamed: 0,organisation,name,dataset,resource,field
0,local-authority-eng:BIR,Birmingham City Council,article-4-direction-area,7a937605655b895bf9ebfbe29f8e35af8d3f606fd811b4...,address-text
1,local-authority-eng:BIR,Birmingham City Council,article-4-direction-area,7a937605655b895bf9ebfbe29f8e35af8d3f606fd811b4...,article-4-direction
2,local-authority-eng:BIR,Birmingham City Council,article-4-direction-area,7a937605655b895bf9ebfbe29f8e35af8d3f606fd811b4...,description
3,local-authority-eng:BIR,Birmingham City Council,article-4-direction-area,7a937605655b895bf9ebfbe29f8e35af8d3f606fd811b4...,end-date
4,local-authority-eng:BIR,Birmingham City Council,article-4-direction-area,7a937605655b895bf9ebfbe29f8e35af8d3f606fd811b4...,entry-date


In [256]:
# join on field present flag for each resource
resource_fields_match = resource_spec_fields_df.merge(
    results_field_resource_df,
    how = "left",
    on = ["dataset", "resource", "field"]
)

print(len(resource_fields_match))
resource_fields_match.head()



973


Unnamed: 0,organisation,name,dataset,resource,field,field_present
0,local-authority-eng:BIR,Birmingham City Council,article-4-direction-area,7a937605655b895bf9ebfbe29f8e35af8d3f606fd811b4...,address-text,
1,local-authority-eng:BIR,Birmingham City Council,article-4-direction-area,7a937605655b895bf9ebfbe29f8e35af8d3f606fd811b4...,article-4-direction,
2,local-authority-eng:BIR,Birmingham City Council,article-4-direction-area,7a937605655b895bf9ebfbe29f8e35af8d3f606fd811b4...,description,
3,local-authority-eng:BIR,Birmingham City Council,article-4-direction-area,7a937605655b895bf9ebfbe29f8e35af8d3f606fd811b4...,end-date,
4,local-authority-eng:BIR,Birmingham City Council,article-4-direction-area,7a937605655b895bf9ebfbe29f8e35af8d3f606fd811b4...,entry-date,1.0


In [257]:
# join on field present flag for each resource
resource_fields_map_match = resource_fields_match.merge(
    results_col_map_df,
    how = "left",
    on = ["dataset", "resource", "field"]
)

print(len(resource_fields_map_match))
resource_fields_map_match.head()

974


Unnamed: 0,organisation,name,dataset,resource,field,field_present,column,match
0,local-authority-eng:BIR,Birmingham City Council,article-4-direction-area,7a937605655b895bf9ebfbe29f8e35af8d3f606fd811b4...,address-text,,ADDRESS_TEXT,0.0
1,local-authority-eng:BIR,Birmingham City Council,article-4-direction-area,7a937605655b895bf9ebfbe29f8e35af8d3f606fd811b4...,article-4-direction,,ARTICLE_4_DIRECTION,0.0
2,local-authority-eng:BIR,Birmingham City Council,article-4-direction-area,7a937605655b895bf9ebfbe29f8e35af8d3f606fd811b4...,description,,,
3,local-authority-eng:BIR,Birmingham City Council,article-4-direction-area,7a937605655b895bf9ebfbe29f8e35af8d3f606fd811b4...,end-date,,END_DATE,0.0
4,local-authority-eng:BIR,Birmingham City Council,article-4-direction-area,7a937605655b895bf9ebfbe29f8e35af8d3f606fd811b4...,entry-date,1.0,ENTRY_DATE,0.0


In [262]:
dupe_test = resource_fields_map_match.groupby(["dataset", "resource", "field"]).size().reset_index(name = "count")

dupe_test[dupe_test["count"] > 1].values

array([['conservation-area',
        '804c40e898b06dd3e7211583810f9de552f2ffd0d4edc9980cfa0357a9cc362c',
        'reference', 2]], dtype=object)

In [263]:
resource_spec_fields_df[resource_spec_fields_df["resource"] == "804c40e898b06dd3e7211583810f9de552f2ffd0d4edc9980cfa0357a9cc362c"]

Unnamed: 0,organisation,name,dataset,resource,field
832,local-authority-eng:SLF,Salford City Council,conservation-area,804c40e898b06dd3e7211583810f9de552f2ffd0d4edc9...,categories
833,local-authority-eng:SLF,Salford City Council,conservation-area,804c40e898b06dd3e7211583810f9de552f2ffd0d4edc9...,conservation-area
834,local-authority-eng:SLF,Salford City Council,conservation-area,804c40e898b06dd3e7211583810f9de552f2ffd0d4edc9...,documentation-url
835,local-authority-eng:SLF,Salford City Council,conservation-area,804c40e898b06dd3e7211583810f9de552f2ffd0d4edc9...,end-date
836,local-authority-eng:SLF,Salford City Council,conservation-area,804c40e898b06dd3e7211583810f9de552f2ffd0d4edc9...,entry-date
837,local-authority-eng:SLF,Salford City Council,conservation-area,804c40e898b06dd3e7211583810f9de552f2ffd0d4edc9...,geometry
838,local-authority-eng:SLF,Salford City Council,conservation-area,804c40e898b06dd3e7211583810f9de552f2ffd0d4edc9...,legislation
839,local-authority-eng:SLF,Salford City Council,conservation-area,804c40e898b06dd3e7211583810f9de552f2ffd0d4edc9...,name
840,local-authority-eng:SLF,Salford City Council,conservation-area,804c40e898b06dd3e7211583810f9de552f2ffd0d4edc9...,notes
841,local-authority-eng:SLF,Salford City Council,conservation-area,804c40e898b06dd3e7211583810f9de552f2ffd0d4edc9...,reference


In [264]:
results_col_map_df[results_col_map_df["resource"] == "804c40e898b06dd3e7211583810f9de552f2ffd0d4edc9980cfa0357a9cc362c"]

Unnamed: 0,column,field,resource,dataset,match
0,REF,reference,804c40e898b06dd3e7211583810f9de552f2ffd0d4edc9...,conservation-area,0
1,S_NAME,name,804c40e898b06dd3e7211583810f9de552f2ffd0d4edc9...,conservation-area,0
2,WKT,geometry,804c40e898b06dd3e7211583810f9de552f2ffd0d4edc9...,conservation-area,1
3,id,reference,804c40e898b06dd3e7211583810f9de552f2ffd0d4edc9...,conservation-area,0


In [266]:
results_col_map_df[results_col_map_df["match"] == 1].groupby("field").size()

field
address-text              2
article-4-direction       2
conservation-area         1
description              12
document-type             1
document-url              6
documentation-url         6
end-date                 10
entry-date               10
geometry                 54
legislation               1
listed-building           3
listed-building-grade     3
name                     45
notes                    36
point                     6
reference                38
start-date               10
tree-species-list         9
uprn                      8
dtype: int64

In [268]:
results_col_map_df[results_col_map_df["field"] == "end-date"]

Unnamed: 0,column,field,resource,dataset,match
5,END_DATE,end-date,7a937605655b895bf9ebfbe29f8e35af8d3f606fd811b4...,article-4-direction-area,0
3,END_DATE,end-date,81ed286e34b43d1f9f3053e463a6151224b182538ce98f...,conservation-area,0
3,end_date,end-date,a480fe98c49bf04ebaee5f3c970dc131b784bdd6a7b81c...,conservation-area,0
3,end_date,end-date,77fe8c4978ab17814a30f0d6fac7444026f17a14f84c12...,tree-preservation-order,0
2,end-date,end-date,19a54c9320411dc63de45a29f9f947a006c1ddd9076da0...,article-4-direction,1
0,end-date,end-date,b7f316d572822aed57dd631bed064b178a3aaeed3151c1...,listed-building-outline,1
4,end_date,end-date,8370346f35a81b8b3509f4e3645bb98e43951d09c5cf61...,article-4-direction-area,0
4,end-date,end-date,24b9210088eea5c8312dc370ecaac82b83863e0d7a8c26...,article-4-direction-area,1
3,end-date,end-date,ea2ceb249528bc63b73cf5f63984a08a5d76d9e46fd2e6...,article-4-direction,1
0,end-date,end-date,222aa207c5ad94d02272ac94ac4d0ea93f37a733dd217a...,listed-building-outline,1


In [198]:
resource_fields_match.groupby(["organisation", "name", "dataset"]).agg({"field":"count", "field_present" : "sum"}).reset_index()

Unnamed: 0,organisation,name,dataset,field,field_present
0,local-authority-eng:BIR,Birmingham City Council,article-4-direction-area,16,8.0
1,local-authority-eng:BIR,Birmingham City Council,conservation-area,15,8.0
2,local-authority-eng:BNE,London Borough of Barnet,article-4-direction,12,7.0
3,local-authority-eng:BNE,London Borough of Barnet,article-4-direction-area,16,9.0
4,local-authority-eng:BNE,London Borough of Barnet,conservation-area,15,8.0
...,...,...,...,...,...
71,local-authority-eng:TEW,Tewkesbury Borough Council,tree-preservation-zone,17,7.0
72,local-authority-eng:WBK,West Berkshire Council,article-4-direction-area,16,7.0
73,local-authority-eng:WBK,West Berkshire Council,conservation-area,15,7.0
74,local-authority-eng:WBK,West Berkshire Council,tree,18,6.0


In [128]:
# look at records which have failed the get fields from resource query
endpoint_latest_long_df["result"] = r_test

endpoint_latest_long_df[endpoint_latest_long_df["result"] == "FAIL"]

Unnamed: 0,endpoint_url,status,exception,collection,pipelines,organisation,name,resource,maxentrydate,entrydate,end_date,last_status,last_updated_date,date_last_status_200,pipelines_list,result


In [None]:
endpoint_latest_long_df[endpoint_latest_long_df["result"] == "FAIL"][["pipelines_list", "resource"]].values

In [114]:
endpoint_latest_long_df.groupby("pipelines_list").size()

pipelines_list
article-4-direction            5
article-4-direction-area      14
conservation-area             18
conservation-area-document     1
listed-building-outline       13
tree                           7
tree-preservation-order        9
tree-preservation-zone        10
dtype: int64

In [140]:
# get_fields_for_resource("0384db1ec63fefcb3c64275b6c942296b4220a0c9566114789f8bada92827a41", "tree")

For each of these endpoints, the relevant schema for the dataset is downloaded to compare the endpoint columns against.

'Structure score' is the number of columns in the processed data that match the schema, divided by the number of columns in the schema. Note that if there is no data at all in a field, it cannot be detected as a structure match.

'Column name score' is the number of columns in the processed data that had matching column names to the schema before any processing happened (ie no column mapping had to take place). Note that if there is no data at all in a field, it cannot be detected as a column name match.

If an endpoint contributes to two datasets, it will only be considered for a dataset if it is the newest endpoint for that dataset, calculated independently.

In [142]:
dataset_field_df

Unnamed: 0,dataset,field,field-dataset,guidance,hint
2739,tree-preservation-zone,address-text,,,
2740,tree-preservation-zone,description,,,
2741,tree-preservation-zone,end-date,,,
2742,tree-preservation-zone,entity,,,
2743,tree-preservation-zone,entry-date,,,
2744,tree-preservation-zone,geometry,,,
2745,tree-preservation-zone,name,,,
2746,tree-preservation-zone,notes,,,
2747,tree-preservation-zone,organisation,,,
2748,tree-preservation-zone,point,,,


In [129]:
def compute_cell_colour(value):
    if "%" in value:
        value = int(value.replace("%", ""))
        if value >= 75:
            return 'background-color: green'
        elif value < 75 and value >= 50:
            return 'background-color: orange'
        elif 0 <= value < 50:
            return 'background-color: #ffaeb1'
        else:
            return 'background-color: brown'

organisation_dataset_compliance_dict={}
rows_list = []
csv_rows_list = []
for organisation in organisation_list:
    latest_endpoints_df = all_orgs_latest_endpoints[organisation]
    dataset_compliance_dict = {}
    for index, row in latest_endpoints_df.iterrows():
        resource = row['resource']
        if ',' in row['pipelines']:
            datasets = row['pipelines'].split(',')
        else:
            datasets = [row['pipelines']]
        for dataset in datasets:
            same_datasets_df = latest_endpoints_df[latest_endpoints_df["pipelines"].apply(lambda x: dataset in x.split(','))]
            if len(same_datasets_df) > 1:
                skip_dataset = handle_skip_dataset(same_datasets_df, dataset, row)
            else:
                skip_dataset = False
            # print(organisation, dataset, resource)
            

            dataset_field_df = pd.read_csv('https://raw.githubusercontent.com/digital-land/specification/main/specification/dataset-field.csv')
            dataset_field_df = dataset_field_df[dataset_field_df['dataset'] == dataset]

            if not skip_dataset:
                column_field_df = get_column_mappings_for_resource(resource, dataset)
                fields = get_fields_for_resource(resource, dataset)
                structure_score, structure_percentage, column_score, column_percentage = check_columns_in_endpoint(fields, dataset_field_df, column_field_df, dataset)
                overall_percentage = (structure_percentage + column_percentage) / 2
                dataset_compliance_dict[dataset] = {"structure_score": structure_score, "structure_percentage": structure_percentage, "column_score": column_score, "column_name_percentage": column_percentage}
                new_row = {'organisation': organisation_name_dict[organisation], 'dataset': dataset, 'structure_score': structure_score, 'structure_percentage': f"{int(structure_percentage)}%" , 'column_name_score': column_score, 'column_name_percentage': f"{int(column_percentage)}%", 'overall_percentage': f"{int(overall_percentage)}%"}
                rows_list.append(new_row)
                csv_row = new_row.copy()
                csv_row['endpoint_url'] = row['endpoint_url']
                csv_row['resource'] = row['resource']
                csv_rows_list.append(csv_row)
    
    organisation_dataset_compliance_dict[organisation] = dataset_compliance_dict


compliance_df = pd.DataFrame(rows_list)
output_df = pd.DataFrame(csv_rows_list)
output_df.to_csv('compliance.csv', index=False)
compliance_df.style.applymap(compute_cell_colour, subset=["structure_percentage", "column_name_percentage", "overall_percentage"])

  compliance_df.style.applymap(compute_cell_colour, subset=["structure_percentage", "column_name_percentage", "overall_percentage"])


Unnamed: 0,organisation,dataset,structure_score,structure_percentage,column_name_score,column_name_percentage,overall_percentage
0,Birmingham City Council,article-4-direction-area,6/12,50%,1/12,8%,29%
1,Birmingham City Council,conservation-area,6/11,54%,1/11,9%,31%
2,London Borough of Barnet,conservation-area,6/11,54%,3/11,27%,40%
3,London Borough of Barnet,tree-preservation-order,7/12,58%,2/12,16%,37%
4,London Borough of Barnet,article-4-direction,5/9,55%,5/9,55%,55%
5,London Borough of Barnet,listed-building-outline,7/16,43%,7/16,43%,43%
6,London Borough of Barnet,article-4-direction-area,7/12,58%,3/12,25%,41%
7,Bolsover District Council,conservation-area,0/11,0%,0/11,0%,0%
8,Canterbury City Council,conservation-area,5/11,45%,2/11,18%,31%
9,Canterbury City Council,tree-preservation-order,0/12,0%,0/12,0%,0%


## Scrap

### Test different query type

In [None]:
def get_column_mappings_for_resource(resource, dataset):
    datasette_url = "https://datasette.planning.data.gov.uk/"
    params = urllib.parse.urlencode({
        "sql": f"""
        select column, field
        from 
          column_field  
        where 
            resource = '{resource}'
        """,
        "_size": "max"
    })
    url = f"{datasette_url}{dataset}.csv?{params}"
    column_field_df = pd.read_csv(url)
    return column_field_df

get_column_mappings_for_resource("81ed286e34b43d1f9f3053e463a6151224b182538ce98f9064f43ebd30dc2973", "conservation-area")

Unnamed: 0,column,field
0,REFERENCE,reference
1,WKT,geometry
2,DOCUMENTATION_URL,documentation-url
3,END_DATE,end-date
4,ENTRY_DATE,entry-date
5,NAME,name
6,NOTES,notes
7,START_DATE,start-date


In [None]:
results_col_map = []

for index, r in endpoint_latest_long_df.iterrows():
    try:
        df = get_column_mappings_for_resource(r["resource"], r["dataset"])
        df["resource"] = r["resource"]
        df["dataset"] = r["dataset"]

    except:
        df = pd.DataFrame({"resource" : [r["resource"]],
                           "dataset" : [r["dataset"]]
        })

    results_col_map.append(df)

results_col_map_df = pd.concat(results_col_map)

print(len(results_col_map_df))
results_col_map_df.head()

532


Unnamed: 0,column,field,resource,dataset
0,NAME,name,7a937605655b895bf9ebfbe29f8e35af8d3f606fd811b4...,article-4-direction-area
1,START_DATE,start-date,7a937605655b895bf9ebfbe29f8e35af8d3f606fd811b4...,article-4-direction-area
2,WKT,geometry,7a937605655b895bf9ebfbe29f8e35af8d3f606fd811b4...,article-4-direction-area
3,ADDRESS_TEXT,address-text,7a937605655b895bf9ebfbe29f8e35af8d3f606fd811b4...,article-4-direction-area
4,ARTICLE_4_DIRECTION,article-4-direction,7a937605655b895bf9ebfbe29f8e35af8d3f606fd811b4...,article-4-direction-area


In [None]:
def get_fields_for_resource(resource, dataset):
    datasette_url = "https://datasette.planning.data.gov.uk/"
    params = urllib.parse.urlencode({
        "sql": f"""
        select f.field, fr.resource
        from 
            fact_resource fr
            inner join fact f on fr.fact = f.fact
        where 
            resource = '{resource}'
        group by
            f.field
        """,
        "_size": "max"
    })
    url = f"{datasette_url}{dataset}.csv?{params}"
    facts_df = pd.read_csv(url)
    # facts_list = facts_df['field'].tolist()
    return facts_df

# get_fields_for_resource("81ed286e34b43d1f9f3053e463a6151224b182538ce98f9064f43ebd30dc2973", "conservation-area")

In [None]:
results_field_resource = []

for index, r in endpoint_latest_long_df.iterrows():
    try:
        df = get_fields_for_resource(r["resource"], r["dataset"])
        df["dataset"] = r["dataset"]

    except:
        df = pd.DataFrame({"resource" : [r["resource"]],
                           "dataset" : [r["dataset"]],
                           "field" : [np.nan]
        })

    results_field_resource.append(df)

results_field_resource_df = pd.concat(results_field_resource)

print(len(results_field_resource_df))

results_field_resource_df["field_present"] = 1
results_field_resource_df.head()

525


Unnamed: 0,field,resource,dataset,field_present
0,entry-date,7a937605655b895bf9ebfbe29f8e35af8d3f606fd811b4...,article-4-direction-area,1
1,geometry,7a937605655b895bf9ebfbe29f8e35af8d3f606fd811b4...,article-4-direction-area,1
2,name,7a937605655b895bf9ebfbe29f8e35af8d3f606fd811b4...,article-4-direction-area,1
3,organisation,7a937605655b895bf9ebfbe29f8e35af8d3f606fd811b4...,article-4-direction-area,1
4,permitted-development-rights,7a937605655b895bf9ebfbe29f8e35af8d3f606fd811b4...,article-4-direction-area,1


In [None]:
results_field_resource_df[results_field_resource_df["field"].isnull()]

Unnamed: 0,field,resource
