This report provides information on compliance with the specification for the latest endpoints of either a hardcoded list of prioritised LPAs or a list of organisations supplied as input.

The column 'structure_score' tells us how much data an endpoint is giving us as a fraction of what we ask for. The column 'column_name_score' tells us how many columns are correctly named.

Example: a column whose name is incorrect (e.g. 'area' instead of 'geometry') but whose data has been detected as correct will count towards the 'structure_score' column but not the 'column_name_score' column.

The input should be called 'organisation_input.csv' and contain one column, 'organisation' that has the organisation codes for the LPAs to be included in the report.

In [None]:
# %pip install wget
# import wget
import pandas as pd
import os
import numpy as np

pd.set_option("display.max_rows", 100)


Download helper utility files from GitHub:

In [None]:
# Make sure the shared helper module sits next to this notebook, fetching it
# from GitHub when it is absent, then pull its helpers into the notebook namespace.
util_file = "master_report_endpoint_utils.py"
if not os.path.isfile(util_file):
    # `wget` is never imported in this notebook (its import is commented out),
    # so download with the standard library instead.
    import urllib.request

    url = "https://raw.githubusercontent.com/digital-land/jupyter-analysis/main/service_report/master_report/master_report_endpoint_utils.py"
    urllib.request.urlretrieve(url, util_file)
from master_report_endpoint_utils import *

The default prioritised LPAs are used unless a specific set of LPAs is detected using an 'organisation_input.csv' file in the same directory as this notebook.

In [None]:
# Get input from .csv or use default prioritised LPAs.
# Either way `organisation_list` ends up as a plain Python list of organisation codes.
input_path = './organisation_input.csv'
if os.path.isfile(input_path):
    input_df = pd.read_csv(input_path)
    organisation_list = input_df['organisation'].tolist()
    print('Input file found. Using', len(organisation_list), 'organisations from input file.')
else:
    provision_df = get_provisions()
    # Insert "-eng" before the colon (literal, non-regex replacement) and convert
    # to a list so both branches produce the same type.
    organisation_list = provision_df["organisation"].str.replace(":", "-eng:", regex=False).tolist()
    print('Input file not found. Using default list of organisations.')

In [None]:
datasette_url = "https://datasette.planning.data.gov.uk/"

def get_datasette_results(sql):
    """Run a SQL query against the digital-land Datasette database.

    The query is URL-encoded together with `_size=max`, requested from the
    CSV endpoint under the module-level `datasette_url`, and the response is
    returned as a DataFrame.
    """
    query_string = urllib.parse.urlencode({"sql": f"{sql}", "_size": "max"})
    return pd.read_csv(f"{datasette_url}digital-land.csv?{query_string}")

In [None]:
def check_columns_in_endpoint(fields, dataset_field_df, column_field_df, dataset):
    """Score the fields found for an endpoint against the dataset specification.

    Args:
        fields: field names found for the endpoint's resource. Either a list
            of names, a Series of names, or a DataFrame with a 'field' column
            (the shape returned by get_fields_for_resource).
        dataset_field_df: specification rows for the dataset, with a 'field' column.
        column_field_df: column-to-field mappings for the resource, with
            'column' and 'field' columns.
        dataset: dataset name, passed on to remove_assigned_columns.

    Returns:
        A tuple (structure_score, structure_percentage, column_score,
        column_percentage). The scores are "n/total" strings; the percentages
        are numbers from 0 to 100 (0.0 when no specification fields are left
        to score).
    """
    # `x in DataFrame` tests the column labels and `x in Series` tests the index,
    # so pull the actual field names out before doing membership tests.
    if isinstance(fields, pd.DataFrame):
        fields = fields['field'].tolist()
    elif isinstance(fields, pd.Series):
        fields = fields.tolist()
    supplied_fields = set(fields)

    dataset_columns = dataset_field_df['field'].tolist()
    # Remove automatically assigned columns by the pipeline from scoring
    dataset_columns = remove_assigned_columns(dataset, dataset_columns)
    total_columns = len(dataset_columns)

    # Specification fields that are present in the endpoint
    present_columns = [column for column in dataset_columns if column in supplied_fields]
    structure_score = f"{len(present_columns)}/{total_columns}"
    structure_percentage = len(present_columns) / total_columns * 100 if total_columns else 0.0

    # The WKT column is removed from the column_field mapping as it is autogenerated by the pipeline for some file formats (e.g geojson)
    filtered_columns = ["WKT"]
    column_field_df = column_field_df[~column_field_df['column'].isin(filtered_columns)]

    # For each mapped field keep the first source column that maps to it
    first_column_for_field = {}
    for column, field in zip(column_field_df['column'], column_field_df['field']):
        first_column_for_field.setdefault(field, column)

    # A field is correctly named when it needed no mapping at all, or when the
    # (first) column mapped to it already has the field's name.
    correct_column_names = sum(
        1 for field in present_columns if first_column_for_field.get(field, field) == field
    )

    column_score = f"{correct_column_names}/{total_columns}"
    column_percentage = correct_column_names / total_columns * 100 if total_columns else 0.0

    return structure_score, structure_percentage, column_score, column_percentage


# def get_fields_for_resource(resource, dataset):
#     datasette_url = "https://datasette.planning.data.gov.uk/"
#     params = urllib.parse.urlencode({
#         "sql": f"""
#         select f.field 
#         from 
#             fact_resource fr
#             inner join fact f on fr.fact = f.fact
#         where 
#             resource = '{resource}'
#         group by
#             f.field
#         """,
#         "_size": "max"
#     })
#     url = f"{datasette_url}{dataset}.csv?{params}"
#     facts_df = pd.read_csv(url)
#     facts_list = facts_df['field'].tolist()
#     return facts_list

# alternate version which returns df
def get_fields_for_resource(resource, dataset):
    """Fetch the fields that have facts recorded against a resource.

    Queries the `fact_resource` and `fact` tables of the given dataset's
    Datasette database and returns the CSV response as a DataFrame with
    `field` and `resource` columns, grouped to one row per field.
    """
    base_url = "https://datasette.planning.data.gov.uk/"
    sql = f"""
        select f.field, fr.resource
        from 
            fact_resource fr
            inner join fact f on fr.fact = f.fact
        where 
            resource = '{resource}'
        group by
            f.field
        """
    query_string = urllib.parse.urlencode({"sql": sql, "_size": "max"})
    return pd.read_csv(f"{base_url}{dataset}.csv?{query_string}")

def get_column_mappings_for_resource(resource, dataset):
    """Fetch the column-to-field mappings recorded for a resource.

    Queries the `column_field` table of the given dataset's Datasette
    database and returns the CSV response as a DataFrame with `column` and
    `field` columns.
    """
    base_url = "https://datasette.planning.data.gov.uk/"
    sql = f"""
        select column, field
        from 
          column_field  
        where 
            resource = '{resource}'
        """
    query_string = urllib.parse.urlencode({"sql": sql, "_size": "max"})
    return pd.read_csv(f"{base_url}{dataset}.csv?{query_string}")

def remove_assigned_columns(dataset, dataset_columns):
    """Strip pipeline-generated fields from a list of specification fields.

    These columns are auto generated by the pipeline and therefore not used in
    the scoring: 'entity', 'organisation' and 'prefix' are always removed, and
    'point' is also removed unless the dataset is "tree".

    The list is modified in place and returned. Fields that are not in the
    list are ignored rather than raising.
    """
    assigned_columns = ['entity', 'organisation', 'prefix']
    if dataset != "tree":
        assigned_columns.append('point')

    for column in assigned_columns:
        # list.remove raises ValueError for a missing value, so check first
        if column in dataset_columns:
            dataset_columns.remove(column)
    return dataset_columns

Get list of organisation names, to be displayed in the output table. This is gathered separately from the main data, to ensure that if an organisation has not provided any endpoints, it is still included in the output table.

In [None]:
# Get organisation names for output table
organisation_info_df = pd.read_csv('https://raw.githubusercontent.com/digital-land/organisation-collection/main/data/local-authority.csv')

# Build the reference -> name lookup once, keeping the first row for each reference
organisation_names_by_reference = (
    organisation_info_df.drop_duplicates("reference").set_index("reference")["name"].to_dict()
)

organisation_name_dict = {}
for organisation in organisation_list:
    # The reference is the part between the first and second colon of the organisation code
    code_parts = organisation.split(':')
    organisation_code = code_parts[1] if len(code_parts) > 1 else organisation
    organisation_name = organisation_names_by_reference.get(organisation_code)
    if organisation_name is None:
        # Keep the organisation in the output table even when it is not in the
        # local-authority list, using its code as the display name.
        print(f"No name found for {organisation}; using the organisation code instead.")
        organisation_name = organisation
    organisation_name_dict[organisation] = organisation_name

In [None]:
organisation_info_df.head()

In [None]:
# organisation_name_dict

## Latest endpoints table

### Original approach

The latest endpoints are collected for each of the organisations, restricted to the pipelines listed in `pipelines_list` below.

In [None]:
datasette_url = "https://datasette.planning.data.gov.uk/"

# Collect latest endpoints for each organisation
dataset_list = ['article-4-direction', 'article-4-direction-area', 'conservation-area', 'conservation-area-document', 'listed-building-outline', 'tree-preservation-order', 'tree-preservation-zone', 'tree']
pipelines_list = ['article-4-direction', 'article-4-direction-area', 'conservation-area', 'conservation-area-document', 'listed-building-outline', 'tree-preservation-order', 'tree-preservation-zone', 'tree', 'tree,tree-preservation-order', 'tree-preservation-order,tree-preservation-zone']
all_orgs_latest_endpoints = {}
for organisation in organisation_list:
    try:
        latest_endpoints_df = get_latest_endpoints(organisation)
        # keep only the endpoints feeding the pipelines of interest
        latest_endpoints_df = latest_endpoints_df[latest_endpoints_df['pipelines'].isin(pipelines_list)]
    except Exception as error:
        # Best effort: report the failure and store None for this organisation so
        # the loop carries on. `Exception` (not a bare except) lets a keyboard
        # interrupt still stop the cell.
        print(f"Could not get latest endpoints for {organisation}: {error!r}")
        latest_endpoints_df = None
    all_orgs_latest_endpoints[organisation] = latest_endpoints_df

In [None]:
# stick dictionary in df
endpoint_latest_df = pd.concat([all_orgs_latest_endpoints[v] for v in all_orgs_latest_endpoints])

print(len(endpoint_latest_df))
endpoint_latest_df.head()

In [None]:
endpoint_latest_df.groupby("status").size()

In [None]:
endpoint_latest_df[endpoint_latest_df["status"] == 400]

In [None]:
endpoint_latest_df[endpoint_latest_df["organisation"] == "local-authority-eng:GLO"].values

In [None]:
endpoint_latest_df[endpoint_latest_df["organisation"] == "local-authority-eng:BNE"]

In [None]:
# split the comma-separated pipelines value and explode it out so there is one row per dataset
endpoint_latest_df["dataset"] = endpoint_latest_df["pipelines"].str.split(",")
endpoint_latest_long_df = endpoint_latest_df.explode("dataset", ignore_index=True)

# row counts before and after the explode
print(len(endpoint_latest_df))
print(len(endpoint_latest_long_df))
endpoint_latest_long_df.head()

In [None]:
# count datasets which have multiple endpoints
org_dataset_count = endpoint_latest_long_df.groupby(["organisation", "name", "dataset"]).size().reset_index(name = "count")

org_dataset_count[org_dataset_count["count"] > 1]

In [None]:
# find datasets which have multiple resources, or resources used in multiple datasets (basically, any instances of resource duplication)
resource_count = endpoint_latest_long_df.groupby(["resource"]).size().reset_index(name = "count")

resource_dupes = resource_count[resource_count["count"] > 1]

# look at records which have resource dupes
endpoint_latest_long_df[endpoint_latest_long_df["resource"].isin(resource_dupes["resource"])][
    ["status", "collection", "dataset", "name", "resource", "entrydate", "maxentrydate"]
]

#### Checking original last updated handling

This is an example of some data which has the `date_last_status_200` populated because the most recent endpoint log doesn't have a 200 status.

In [None]:
endpoint_latest_long_df[endpoint_latest_long_df["date_last_status_200"].notnull()]

In [None]:
# checking how the data looks in the original table before last updated logic applied - endpoint 404 is kept and just date field added to capture when the last 200 status record was
temp_cat = get_endpoints("local-authority-eng:CAT")

temp_cat[temp_cat["pipelines"] == "tree-preservation-order,tree-preservation-zone"]

In [None]:
# however, the resource is still incorrect for this record
get_latest_resource_for_endpoint("https://opendata.arcgis.com/datasets/a4ddbb5114274ba89e33a33545c407c8_0.geojson")

Rather than grabbing all logs and doing this extra logic, it may be easier just to use the latest logs (the `most_recent_log` table) as the base, then for all non-200 records just grab the last resource which had a 200 status.


[Example datasette query](https://datasette.planning.data.gov.uk/digital-land?sql=select%0D%0A++++l.endpoint%2C%0D%0A++++l.status%2C%0D%0A++++l.exception%2C%0D%0A++++s.collection%2C%0D%0A++++l.resource%2C%0D%0A++++l.entry_date+as+log_entry_date%2C%0D%0A++++e.entry_date+as+endpoint_entry_date%0D%0Afrom%0D%0A++++log+l%0D%0A++++inner+join+source+s+on+l.endpoint+%3D+s.endpoint%0D%0A%0D%0A++++inner+join+endpoint+e+on+l.endpoint+%3D+e.endpoint%0D%0Awhere%0D%0A++++s.organisation+%3D+%22local-authority-eng%3ACAT%22+and+collection%3D%22tree-preservation-order%22+%0D%0A++++%0D%0Aorder+by+log_entry_date+desc) to get this.

### New approach

In [None]:
# recreating the get_endpoints function, but using latest_log table instead

def get_endpoints_new(organisation):
    """Fetch the most recent log entry for each endpoint of an organisation.

    Queries the `most_recent_log` table of the digital-land Datasette
    database, joined to source, endpoint, organisation and source_pipeline,
    excluding the "brownfield-land" collection.

    Args:
        organisation: organisation code to filter on. A falsy value applies
            a `LIKE '%'` filter instead of an exact match.

    Returns:
        DataFrame of the query results, ordered by log entry date descending.
        If the request or parsing fails, a one-row DataFrame holding only the
        `organisation` value is returned instead.
    """
    # NOTE(review): the organisation value is interpolated straight into the SQL
    # text; fine for trusted codes, but confirm where the input list comes from.
    if organisation:
        query = f" s.organisation = '{organisation}'"
    else:
        query = " s.organisation LIKE '%'"
    params = urllib.parse.urlencode({
        "sql": f"""
select
    e.endpoint_url,
    l.endpoint,
    l.status,
    l.exception,
    s.collection,
    l.resource,
    sp.pipeline,
    s.organisation,
    o.name,
    l.entry_date as log_entry_date,
    e.entry_date as endpoint_entry_date
from
    most_recent_log l
    inner join source s on l.endpoint = s.endpoint
    inner join endpoint e on l.endpoint = e.endpoint
    inner join organisation o on o.organisation = replace(s.organisation, '-eng', '')
    inner join source_pipeline sp on s.source = sp.source
where
    {query} and not s.collection="brownfield-land" 

order by log_entry_date desc
        """,
        "_size": "max"
    })

    url = f"{datasette_url}digital-land.csv?{params}"

    try:
        endpoints_df = pd.read_csv(url)
    except Exception:
        # Best effort: keep a placeholder row for the organisation. `Exception`
        # (not a bare except) lets a keyboard interrupt still stop a long run.
        endpoints_df = pd.DataFrame({"organisation": [organisation]})

    return endpoints_df

In [None]:
# pipelines (datasets) to keep in the report
dataset_list = ['article-4-direction', 'article-4-direction-area', 'conservation-area', 'conservation-area-document', 'listed-building-outline', 'tree-preservation-order', 'tree-preservation-zone', 'tree']

# dictionary of results, keyed by each entry of organisation_list
results_dict = {org_name : get_endpoints_new(org_name) for org_name in organisation_list}

# record orgs which didn't return any results
# NOTE(review): get_endpoints_new returns a one-row frame when the request fails,
# so only queries that succeed with zero rows are recorded here
no_result_orgs = [v for v in organisation_list if len(results_dict[v]) == 0]
# concat results into df
endpoint_resource_df = pd.concat([results_dict[v] for v in organisation_list if len(results_dict[v]) > 0])

# filter to only records in pipelines we want
endpoint_resource_df = endpoint_resource_df[endpoint_resource_df["pipeline"].isin(dataset_list)].reset_index(drop=True)

print(len(endpoint_resource_df))
endpoint_resource_df.head()

#### Comparing no. of results between approaches

In [None]:
# no. of endpoints across tables
print(len(endpoint_resource_df))
print(len(endpoint_latest_long_df))

print(len(endpoint_resource_df[endpoint_resource_df["endpoint_url"].isin(endpoint_latest_long_df["endpoint_url"])]))
print(len(endpoint_resource_df[~endpoint_resource_df["endpoint_url"].isin(endpoint_latest_long_df["endpoint_url"])]))

print(len(endpoint_latest_long_df[endpoint_latest_long_df["endpoint_url"].isin(endpoint_resource_df["endpoint_url"])]))

All of the old approach endpoints are captured in the new table, but the new table also has some endpoints which aren't in the old table.

Looking into this below:

In [None]:
# endpoints in new that aren't in old approach
endpoint_resource_df[~endpoint_resource_df["endpoint_url"].isin(endpoint_latest_long_df["endpoint_url"])].head()

In [None]:
# new approach captures 2 endpoints for the Barnet TPO collection, across three pipelines
endpoint_resource_df[(endpoint_resource_df["organisation"] == "local-authority-eng:BNE") & (endpoint_resource_df["collection"] == "tree-preservation-order")]

In [None]:
# but old approach only has one of them, for TPO pipeline
endpoint_latest_long_df[(endpoint_latest_long_df["organisation"] == "local-authority-eng:BNE") & (endpoint_latest_long_df["collection"] == "tree-preservation-order")]

In [None]:
# checking how the data looks in the original get_endpoints table - both endpoints were captured, but grouped into pipeline groups. Not sure why one is lost 

temp_bne = get_endpoints("local-authority-eng:BNE")
temp_bne[temp_bne["collection"] == "tree-preservation-order"][["endpoint_url", "status", "pipelines", "resource", "maxentrydate", "entrydate"]]

In [None]:
# find datasets which have multiple resources, or resources used in multiple datasets (basically, any instances of resource duplication)
resource_count = endpoint_resource_df.groupby(["resource"]).size().reset_index(name = "count")

resource_dupes = resource_count[resource_count["count"] > 1]

# look at records which have resource dupes
endpoint_resource_df[endpoint_resource_df["resource"].isin(resource_dupes["resource"])][
    ["status", "pipeline", "organisation", "name", "endpoint", "resource", "log_entry_date", "endpoint_entry_date"]
]

In [None]:
# find duplicate endpoints (endpoints which appear on more than one row)
ep_count = endpoint_resource_df.groupby(["endpoint"]).size().reset_index(name = "count")

endpoint_dupes = ep_count[ep_count["count"] > 1]

# look at records which have endpoint dupes
endpoint_resource_df[endpoint_resource_df["endpoint"].isin(endpoint_dupes["endpoint"])][
    ["status", "pipeline", "name", "endpoint", "resource", "log_entry_date", "endpoint_entry_date"]
]

In [None]:
# find org datasets which are getting data from multiple endpoints
# (more than one row per organisation / name / pipeline combination)
ep_count = endpoint_resource_df.groupby(["organisation", "name", "pipeline"]).size().reset_index(name = "count")

endpoint_dupes = ep_count[ep_count["count"] > 1]
endpoint_dupes
# look at the records behind these counts (disabled: after this grouping endpoint_dupes has no "endpoint" column)
# endpoint_resource_df[endpoint_resource_df["endpoint"].isin(endpoint_dupes["endpoint"])][
#     ["status", "pipeline", "name", "endpoint", "resource", "log_entry_date", "endpoint_entry_date"]
# ]

#### Questions to answer
* what happens when the same resource is duplicated across different endpoints and processed with the same pipeline? As is the case for Yarmouth and Newcastle

In [None]:
# find instances where the same resource is being used multiple times in the same dataset
resource_count = endpoint_resource_df.groupby(["resource", "pipeline"]).size().reset_index(name = "count")

resource_dupes = resource_count[resource_count["count"] > 1]

# look at records which have resource dupes
endpoint_resource_df[endpoint_resource_df["resource"].isin(resource_dupes["resource"])][
    ["status", "pipeline", "name", "endpoint", "resource", "log_entry_date", "endpoint_entry_date"]
]

#### checking status

In [None]:
endpoint_resource_df.groupby("status", dropna=False).size()

In [None]:
# how many total records with bad status
bad_status = endpoint_resource_df[endpoint_resource_df["status"] != 200]

print(len(bad_status))

# join to main table to check whether there are any more recent resources in there - length is the same so no
print(len(bad_status[["endpoint", "pipeline"]].merge(
    endpoint_resource_df,
    how = "inner",
    on = ["endpoint", "pipeline"]
)))

# If we want to get the latest active resources for the bad endpoints will have to process that separately

#### Comparing latest resource between methods

In [None]:
def get_latest_resource_for_endpoint_new(endpoint_url):
    """Fetch the resource with the latest start date for an endpoint URL.

    Joins endpoint, resource_endpoint and resource in the digital-land
    Datasette database, orders by resource start date descending and keeps
    the first row. Returns the CSV response as a DataFrame with a single
    `resource` column.
    """
    sql = f"""
        select
          r.resource
        from
          endpoint e
          inner join resource_endpoint re on e.endpoint = re.endpoint
          inner join resource r on re.resource = r.resource
        where
          e.endpoint_url='{endpoint_url}'
        order by
          r.start_date desc
        limit 1
        """
    query_string = urllib.parse.urlencode({"sql": sql, "_size": "max"})
    return pd.read_csv(f"{datasette_url}digital-land.csv?{query_string}")

In [None]:
get_latest_resource_for_endpoint_new("https://maps.birmingham.gov.uk/server/rest/services/planx/PlanX/FeatureServer/4/query?where=1=1&outfields=*&f=geojson")

In [None]:
endpoint_resource_distinct = endpoint_resource_df[["endpoint", "status", "endpoint_url", "resource"]].drop_duplicates()

# endpoint_resource_distinct = endpoint_resource_distinct[endpoint_resource_distinct["status"] == 200]

endpoint_resource_distinct["resource_orig"] = [get_latest_resource_for_endpoint(e)["resource"].values[0] for e in endpoint_resource_distinct["endpoint_url"].values]
endpoint_resource_distinct["resource_new"] = [get_latest_resource_for_endpoint_new(e)["resource"].values[0] for e in endpoint_resource_distinct["endpoint_url"].values]

endpoint_resource_distinct.head()

In [None]:
print(len(endpoint_resource_distinct))

print(len(endpoint_resource_distinct[endpoint_resource_distinct["resource"] == endpoint_resource_distinct["resource_orig"]]))

print(len(endpoint_resource_distinct[endpoint_resource_distinct["resource"] == endpoint_resource_distinct["resource_new"]]))

In [None]:
endpoint_resource_distinct[endpoint_resource_distinct["resource"] != endpoint_resource_distinct["resource_new"]].reset_index(drop=True).iloc[[5]]

In [None]:
# checking whether just one big sql query for all orgs will give same results, and also checking resource end dates

ep_all_df = get_datasette_results(
    """
select
    e.endpoint_url,
    l.endpoint,
    l.status,
    l.exception,
    s.collection,
    l.resource,
    sp.pipeline,
    s.organisation,
    o.name,
    l.entry_date as log_entry_date,
    e.entry_date as endpoint_entry_date,
    e.end_date as endpoint_end_date,
    r.start_date as resource_start_date,
    r.end_date as resource_end_date
from
    most_recent_log l
    inner join source s on l.endpoint = s.endpoint
    inner join endpoint e on l.endpoint = e.endpoint
    inner join organisation o on o.organisation = replace(s.organisation, '-eng', '')
    inner join source_pipeline sp on s.source = sp.source
    left join resource r on l.resource = r.resource
where
    sp.pipeline IN ('article-4-direction', 'article-4-direction-area', 'conservation-area', 'conservation-area-document', 'listed-building-outline', 'tree-preservation-order', 'tree-preservation-zone', 'tree')

order by s.organisation, sp.pipeline, log_entry_date desc
""")

print(len(ep_all_df))
ep_all_df.head()

In [None]:
print(len(endpoint_resource_df))

match_all_check = endpoint_resource_df[["endpoint", "status", "pipeline", "collection"]].merge(
    ep_all_df,
    how = "inner",
    on = ["endpoint", "status", "pipeline", "collection"]
)

# check all records matched
print(len(match_all_check))

In [None]:
# checking records which have an endpoint or resource end date
match_all_check[match_all_check["resource_end_date"].notnull()]

In [None]:
# checking those orgs and collections in main table - in all instances those with a non-active resource have a more recent active resource
# so I think we can just filter to endpoints and resources without an end date in the main SQL query
match_all_check[(match_all_check["organisation"] == "local-authority-eng:DNC") & (match_all_check["collection"] == "listed-building")]

In [None]:
endpoint_resource_df[(endpoint_resource_df["organisation"] == "local-authority-eng:DOV") & (endpoint_resource_df["collection"] == "tree-preservation-order")]

In [None]:
print(len(endpoint_resource_df))
print(len(endpoint_resource_df[["endpoint", "pipeline"]].drop_duplicates()))
print(len(endpoint_resource_df[["resource"]].drop_duplicates()))
print(len(endpoint_resource_df[["endpoint"]].drop_duplicates()))

In [None]:
ep_all_filtered_df = ep_all_df[
    (ep_all_df["organisation"].isin(organisation_list)) &
    (ep_all_df["status"] == 200) &
    (ep_all_df["endpoint_end_date"].isnull()) &
    (ep_all_df["resource_end_date"].isnull())
]

print(len(ep_all_df))
print(len(ep_all_filtered_df))

print(len(ep_all_filtered_df[["endpoint", "pipeline"]].drop_duplicates()))
print(len(ep_all_filtered_df[["resource"]].drop_duplicates()))
print(len(ep_all_filtered_df[["endpoint"]].drop_duplicates()))

In [None]:
match_all_check[match_all_check["resource_end_date"].notnull()]["endpoint"].values

In [None]:
ep_all_df[ep_all_df["endpoint"] == "351fdbd179616dcf25ce0c4498cbd7fd5a917c5bbedcbc6af9f3f23d546b484d"]

In [None]:
endpoint_resource_df[endpoint_resource_df["endpoint"].isin(
    match_all_check[match_all_check["organisation"].isnull()]["endpoint"].values
)]

## Resource fields and mapping tables

In [None]:
resource_df = endpoint_resource_df[["pipeline", "resource"]].drop_duplicates().dropna(axis = 0)
print(len(resource_df))

In [None]:
# generic function to try the resource datasette queries 
# will return a df with resource and dataset fields as keys, and query results as other fields
def try_results(function, resource, dataset):
    """Run a resource query function and tag its result with the resource and dataset.

    Args:
        function: callable taking (resource, dataset) and returning a DataFrame.
        resource: resource identifier passed to `function`.
        dataset: dataset name passed to `function`.

    Returns:
        The DataFrame returned by `function` with `resource` and `dataset`
        columns added. An empty result is replaced by a single row with NaN
        `column` and `field` values. If anything in the attempt raises, a
        single row holding just `resource` and `dataset` is returned.
    """
    # try grabbing results
    try:
        df = function(resource, dataset)

        # if empty response give NaNs
        if len(df) == 0:
            df = pd.DataFrame({"column": [np.nan], "field": [np.nan]})

        df["resource"] = resource
        df["dataset"] = dataset

    # if error record resource and dataset; `Exception` rather than a bare
    # except so a keyboard interrupt can still stop a long batch of queries
    except Exception:
        df = pd.DataFrame({"resource": [resource], "dataset": [dataset]})

    return df



results_col_map = [try_results(get_column_mappings_for_resource, r["resource"], r["pipeline"]) for index, r in resource_df.iterrows()]
results_field_resource = [try_results(get_fields_for_resource, r["resource"], r["pipeline"]) for index, r in resource_df.iterrows()]

# concat the results; resources which errored will have NaNs in the query results fields
results_col_map_df = pd.concat(results_col_map)
results_field_resource_df = pd.concat(results_field_resource)

# no. of resources in each query response array
print(len(results_col_map))
print(len(results_field_resource))

# no of records in each results df
print(len(results_col_map_df))
print(len(results_field_resource_df))


In [None]:
# number of distinct resources in each table
print(len(results_col_map_df[["resource"]].drop_duplicates()))
print(len(results_field_resource_df[["resource"]].drop_duplicates()))

In [None]:
results_field_resource_df.head()

In [None]:
results_col_map_df.head()

In [None]:
# resources which are in the column mapping df but not in the fields one

results_col_valid = results_col_map_df[results_col_map_df["field"].notnull()]
results_field_valid = results_field_resource_df[results_field_resource_df["field"].notnull()]

results_col_valid[~results_col_valid["resource"].isin(results_field_valid["resource"].drop_duplicates())].sort_values("resource")

Question for tomorrow: why would a resource be in the column-field mapping table but not in the fields table?

In [None]:
# add in match field for column mappings 
results_col_map_df["field_matched"] = np.where(
        (results_col_map_df["field"].isin(["geometry", "point"])) |
        (results_col_map_df["field"] == results_col_map_df["column"]),
        1, 
        0
)

# add in flag for fields supplied (i.e. they're in the mapping table)
results_col_map_df["field_supplied"] = 1

# add in flag for fields present
results_field_resource_df["field_loaded"] = 1

In [None]:
results_col_map_df.head()

In [None]:
# check how geometry fields are mapped
# (the match flag on results_col_map_df is called "field_matched"; there is no "match" column)
results_col_map_df[results_col_map_df["field"] == "geometry"][["column", "field", "field_matched"]].drop_duplicates()

### Checking data in fields vs mapping tables

In [None]:
# check for differences in the number of fields in the col_map response vs. field response
results_col_count = results_col_map_df.groupby(["resource"]).size().reset_index(name = "col_map_count")
results_field_count = results_field_resource_df.groupby(["resource"]).size().reset_index(name = "field_count")

col_field_comp_df = results_col_count.merge(
    results_field_count,
    how = "left", 
    on = "resource"
)

# col_field_comp_df.replace(np.nan, 0, inplace=True)

col_field_comp_df["difference"] = col_field_comp_df["col_map_count"] - col_field_comp_df["field_count"]

col_field_comp_df.head(10)

In [None]:
# Taking an example of a single resource - 7a937605655b895bf9ebfbe29f8e35af8d3f606fd811b42867251d61ff15b693
# the column mapping table contains 11 fields

results_col_map_df[results_col_map_df["resource"]== "7a937605655b895bf9ebfbe29f8e35af8d3f606fd811b42867251d61ff15b693"].sort_values("field")

In [None]:
# the field table only contains 8 fields
results_field_resource_df[results_field_resource_df["resource"] == "7a937605655b895bf9ebfbe29f8e35af8d3f606fd811b42867251d61ff15b693"].sort_values("field")

In [None]:
# checking the endpoint itself we can see that the 8 fields above are the 6 mapped fields which are populated, plus organisation and prefix

import geopandas as gpd

bm_af_df = gpd.read_file("https://maps.birmingham.gov.uk/server/rest/services/planx/PlanX/FeatureServer/0/query?where=1=1&outfields=*&f=geojson")

bm_af_df.head()

In [None]:
endpoint_latest_long_df[endpoint_latest_long_df["resource"] == "0519df49c2ecc3c53948b4283704bfd5b905ac4db6e4b5a0ae709c1fc495bc81"].values

In [None]:
endpoint_latest_long_df[endpoint_latest_long_df["organisation"] == "local-authority-eng:DOV"]

In [None]:
dov_endpoints_df = get_endpoints("local-authority-eng:DOV")

print(len(dov_endpoints_df))
dov_endpoints_df[dov_endpoints_df["collection"] == "tree-preservation-order"]

In [None]:
get_fields_for_resource("0519df49c2ecc3c53948b4283704bfd5b905ac4db6e4b5a0ae709c1fc495bc81", "tree-preservation-zone")

In [None]:
get_fields_for_resource("3327be0c6d46fb1ebb8c77a9b3344dd12be4bd59e9abf546966fcdc552c4282c", "tree-preservation-zone")

In [None]:
get_fields_for_resource("004e273e15af7f9c5ffe43cda70764da076e53c090c128f937031e63c7ce7a8d", "article-4-direction-area")

## Calculating match rates

In [None]:
dataset_field_df = pd.read_csv('https://raw.githubusercontent.com/digital-land/specification/main/specification/dataset-field.csv')

# Drop the pipeline-created fields from the spec field table:
# "entity", "organisation" and "prefix" for every dataset, plus "point"
# for every dataset except tree.
is_tree_dataset = dataset_field_df["dataset"] == "tree"
is_pipeline_field = dataset_field_df["field"].isin(["entity", "organisation", "prefix"])
is_point_field = dataset_field_df["field"] == "point"
dataset_field_subset_df = dataset_field_df[~is_pipeline_field & (is_tree_dataset | ~is_point_field)]

dataset_field_df.head()

In [None]:
endpoint_resource_df.head()

In [None]:
# left join on all fields that each dataset should have

endpoint_resource_df["dataset"] = endpoint_resource_df["pipeline"]

resource_spec_fields_df = endpoint_resource_df[["organisation", "name", "dataset", "endpoint", "status", "log_entry_date", "endpoint_entry_date", "resource"]].merge(
    dataset_field_subset_df[["dataset", "field"]],
    how = "left",
    on = "dataset"
)

print(len(resource_spec_fields_df))
resource_spec_fields_df.head()

In [None]:
# join on field present flag for each resource
resource_fields_match = resource_spec_fields_df.merge(
    results_field_resource_df[["dataset", "resource", "field", "field_loaded"]],
    how = "left",
    on = ["dataset", "resource", "field"]
)

print(len(resource_fields_match))
resource_fields_match.head()



In [None]:
# join on field present flag for each resource
resource_fields_map_match = resource_fields_match.merge(
    results_col_map_df[["dataset", "resource", "field", "field_supplied", "field_matched"]],
    how = "left",
    on = ["dataset", "resource", "field"]
)

print(len(resource_fields_map_match))
resource_fields_map_match.head()

In [None]:
resource_fields_map_match.head()

In [None]:
resource_fields_map_match.replace(np.nan, 0, inplace=True)

final_count = resource_fields_map_match.groupby(["organisation", "name", "endpoint", "resource", "dataset", "status", "log_entry_date", "endpoint_entry_date"]).agg(
    {"field":"count", 
     "field_supplied" : "sum",
     "field_matched" : "sum",
     "field_loaded" : "sum"}).reset_index().sort_values(["name"])

final_count.head(10)

In [None]:
final_count[final_count["name"] == "Birmingham City Council"]

In [None]:
endpoint_resource_df[endpoint_resource_df["resource"] == "acb88aac41434c4cfccb9ee77f6471f5c682616617604cd0db502893e9c08579"].values

In [None]:
resource_fields_map_match[resource_fields_map_match["resource"] == "acb88aac41434c4cfccb9ee77f6471f5c682616617604cd0db502893e9c08579"]

In [None]:
results_field_resource_df[results_field_resource_df["resource"] == "9fea9a08d5717b319698f2871b7e5d9e635cb6381a3da08fbec731277c23dd26"]

In [None]:
results_col_map_df[results_col_map_df["resource"] == "9fea9a08d5717b319698f2871b7e5d9e635cb6381a3da08fbec731277c23dd26"]

In [None]:
get_issues_for_resource("acb88aac41434c4cfccb9ee77f6471f5c682616617604cd0db502893e9c08579", "conservation-area")

For each of these endpoints, the relevant schema for the dataset is downloaded to compare the endpoint columns against.

'Structure score' is the number of columns in the processed data that match the schema, divided by the number of columns in the schema. Note that if there is no data at all in a field, it cannot be detected as a structure match.

'Column name score' is the number of columns in the processed data that had matching column names to the schema before any processing happened (ie no column mapping had to take place). Note that if there is no data at all in a field, it cannot be detected as a column name match.

If an endpoint contributes to two datasets, it will only be considered for a dataset if it is the newest endpoint for that dataset, calculated independently.

In [None]:
dataset_field_df

In [None]:
def compute_cell_colour(value):
    """Return the CSS background colour for a percentage cell.

    Args:
        value: Cell text such as "82%".

    Returns:
        A 'background-color: ...' CSS string chosen by threshold
        (>= 75 green, 50-74 orange, 0-49 pink, negative brown), or None
        when the text contains no "%" sign.
    """
    if "%" not in value:
        return None

    percentage = int(value.replace("%", ""))
    if percentage >= 75:
        return 'background-color: green'
    if percentage >= 50:
        return 'background-color: orange'
    if percentage >= 0:
        return 'background-color: #ffaeb1'
    return 'background-color: brown'

# The specification's dataset-field table does not change between iterations, so it is
# downloaded once here rather than once per (endpoint, dataset) pair inside the loops.
all_dataset_field_df = pd.read_csv('https://raw.githubusercontent.com/digital-land/specification/main/specification/dataset-field.csv')

organisation_dataset_compliance_dict = {}  # organisation code -> {dataset -> scores}
rows_list = []      # rows for the styled table displayed at the end of this cell
csv_rows_list = []  # the same rows plus endpoint_url and resource, written to compliance.csv
for organisation in organisation_list:
    latest_endpoints_df = all_orgs_latest_endpoints[organisation]
    # Fall back to the organisation code when it has no display name, as the
    # endpoint status cell does, instead of aborting the report with a KeyError.
    organisation_name = organisation_name_dict.get(organisation, organisation)
    dataset_compliance_dict = {}
    for index, row in latest_endpoints_df.iterrows():
        resource = row['resource']
        # 'pipelines' holds one dataset name or several separated by commas;
        # str.split returns a one-element list when there is no comma.
        datasets = row['pipelines'].split(',')
        for dataset in datasets:
            # All of this organisation's latest endpoints that feed the same dataset
            same_datasets_df = latest_endpoints_df[latest_endpoints_df["pipelines"].apply(lambda x: dataset in x.split(','))]
            if len(same_datasets_df) > 1:
                skip_dataset = handle_skip_dataset(same_datasets_df, dataset, row)
            else:
                skip_dataset = False

            # Schema rows for this dataset only
            dataset_field_df = all_dataset_field_df[all_dataset_field_df['dataset'] == dataset]

            if not skip_dataset:
                column_field_df = get_column_mappings_for_resource(resource, dataset)
                fields = get_fields_for_resource(resource, dataset)
                structure_score, structure_percentage, column_score, column_percentage = check_columns_in_endpoint(fields, dataset_field_df, column_field_df, dataset)
                # Overall score is the plain mean of the two percentages
                overall_percentage = (structure_percentage + column_percentage) / 2
                dataset_compliance_dict[dataset] = {
                    "structure_score": structure_score,
                    "structure_percentage": structure_percentage,
                    "column_score": column_score,
                    "column_name_percentage": column_percentage,
                }
                new_row = {
                    'organisation': organisation_name,
                    'dataset': dataset,
                    'structure_score': structure_score,
                    'structure_percentage': f"{int(structure_percentage)}%",
                    'column_name_score': column_score,
                    'column_name_percentage': f"{int(column_percentage)}%",
                    'overall_percentage': f"{int(overall_percentage)}%",
                }
                rows_list.append(new_row)
                csv_row = new_row.copy()
                csv_row['endpoint_url'] = row['endpoint_url']
                csv_row['resource'] = row['resource']
                csv_rows_list.append(csv_row)

    organisation_dataset_compliance_dict[organisation] = dataset_compliance_dict


compliance_df = pd.DataFrame(rows_list)
output_df = pd.DataFrame(csv_rows_list)
output_df.to_csv('compliance.csv', index=False)
# Colour the percentage columns; this styled table is the cell's displayed output
compliance_df.style.applymap(compute_cell_colour, subset=["structure_percentage", "column_name_percentage", "overall_percentage"])

In [None]:
# Remove the column-width limit so long cell values are not truncated when displayed.
pd.set_option('display.max_colwidth', None)

def compute_cell_colour(status):
    """Return the CSS background colour for an endpoint status cell.

    Args:
        status: Cell value; the string "200" and the string 'No endpoint'
            are the two recognised values.

    Returns:
        Green for "200", orange for 'No endpoint', red for anything else.
    """
    if status == 'No endpoint':
        return 'background-color: orange'
    return 'background-color: green' if status == "200" else 'background-color: red'
    
def cut_zeros(row):
    """Return the text with a trailing '.0' removed, or unchanged if it has none."""
    has_float_suffix = row[-2:] == '.0'
    return row[:-2] if has_float_suffix else row

# Only display non-200 statuses once the endpoint has gone 5 or more days without a 200
def compute_displayed_status(row):
    """Return the status to show in the report for one endpoint row.

    Args:
        row: A mapping (pandas Series or dict) with the keys "last_status",
            "last_updated_date", "date_last_status_200", "maxentrydate",
            "status" and, when "status" is missing, "exception".

    Returns:
        200 when there is no last status or when fewer than 5 days separate
        "maxentrydate" from the reference date. Otherwise the row's status as
        an int, the raw status if it is not numeric (e.g. 'Cannot find layer'),
        the row's exception text when there is no status, or "Unknown Error"
        when there is neither.
    """
    if row["last_status"] == 200:
        last_200_date = pd.to_datetime(row["last_updated_date"])
    elif row["last_status"] is not None:
        # Most recent status isn't 200: measure from the last date a 200 was seen
        last_200_date = pd.to_datetime(row["date_last_status_200"])
    else:
        # No last status recorded, so there is nothing to flag
        return 200

    days_since_200 = (row["maxentrydate"] - last_200_date).days
    if days_since_200 >= 5:
        status = row["status"]
        if pd.isna(status):
            # No status was logged: report this row's own exception rather than
            # the exception of whichever row happens to come first in a global frame.
            exception = row.get("exception")
            if exception is None or pd.isna(exception):
                return "Unknown Error"
            return exception
        try:
            return int(status)
        except (TypeError, ValueError):
            # Non-numeric status text is reported as-is
            return status
    else:
        return 200


rows_list = []
organisation_dataset_statuses_dict = {}  # organisation code -> {dataset -> displayed status}
for organisation in organisation_list:
    latest_endpoints_df = all_orgs_latest_endpoints[organisation]
    # Keep only endpoints with no end date
    latest_endpoints_df = latest_endpoints_df[pd.isna(latest_endpoints_df['end_date'])]
    try:
        name = organisation_name_dict[organisation]
    except KeyError:
        # No display name known: show the organisation code instead
        name = organisation

    dataset_statuses_dict = {}
    for index, row in latest_endpoints_df.iterrows():
        if 'WFS' in row['endpoint_url']:
            # Probe the endpoint and inspect only the first chunk of the body for the
            # 'Cannot find layer' message. This is best effort: a failed probe leaves
            # the logged status untouched rather than aborting the report.
            content = ''
            try:
                # The timeout stops one unresponsive server from hanging the notebook;
                # the context manager releases the streamed connection.
                with requests.get(row['endpoint_url'], stream=True, timeout=30) as response:
                    # Default of b'' covers an empty body (no chunks to iterate)
                    first_chunk = next(response.iter_content(chunk_size=1024), b'')
                # errors='replace' tolerates a multi-byte character cut at the chunk boundary
                content = first_chunk.decode('utf-8', errors='replace')
            except requests.exceptions.RequestException as e:
                print(f"Could not probe WFS endpoint {row['endpoint_url']}: {e}")
            if 'Cannot find layer' in content:
                row['status'] = 'Cannot find layer'
        resource = row['resource']
        # 'pipelines' holds one dataset name or several separated by commas;
        # str.split returns a one-element list when there is no comma.
        datasets = row['pipelines'].split(',')
        for dataset in datasets:
            # Consider cases where a dataset is contributed to by multiple endpoints
            same_datasets_df = latest_endpoints_df[latest_endpoints_df["pipelines"].apply(lambda x: dataset in x.split(','))]
            if len(same_datasets_df) > 1:
                skip_dataset = handle_skip_dataset(same_datasets_df, dataset, row)
            else:
                skip_dataset = False

            if not skip_dataset:
                dataset_statuses_dict[dataset] = compute_displayed_status(row)
    organisation_dataset_statuses_dict[organisation] = dataset_statuses_dict

    new_row = {'organisation': name}
    new_row.update(dataset_statuses_dict)
    rows_list.append(new_row)

# One column per dataset in dataset_list; datasets without a status become "No endpoint"
output_df = pd.DataFrame(rows_list, columns=['organisation', *dataset_list])
output_df = output_df.replace(np.nan, "No endpoint")

# Convert everything to text and strip the '.0' left by float-typed status codes
output_df = output_df.astype(str)
output_df = output_df.applymap(cut_zeros)

output_df.to_csv('endpoint_status_master_report.csv', index=False)
output_df.style.applymap(compute_cell_colour, subset=dataset_list)

## Scrap

### Test different query type

In [None]:
def get_column_mappings_for_resource(resource, dataset):
    """Fetch the column/field rows recorded for a resource.

    Runs a SQL query against the `column_field` table of the given dataset's
    Datasette database and reads the CSV response.

    Args:
        resource: Resource hash to filter on.
        dataset: Dataset name, used as the Datasette database name in the URL.

    Returns:
        A DataFrame with the columns `column` and `field`.
    """
    datasette_url = "https://datasette.planning.data.gov.uk/"
    sql = f"""
        select column, field
        from 
          column_field  
        where 
            resource = '{resource}'
        """
    # "_size": "max" asks Datasette for the largest page of rows it allows
    query_string = urllib.parse.urlencode({"sql": sql, "_size": "max"})
    return pd.read_csv(f"{datasette_url}{dataset}.csv?{query_string}")

# Try the query on one hard-coded resource hash from the "conservation-area" dataset.
get_column_mappings_for_resource("81ed286e34b43d1f9f3053e463a6151224b182538ce98f9064f43ebd30dc2973", "conservation-area")

In [None]:
# Collect the column mappings of every (resource, dataset) pair in endpoint_latest_long_df
results_col_map = []

for index, r in endpoint_latest_long_df.iterrows():
    try:
        df = get_column_mappings_for_resource(r["resource"], r["dataset"])
        df["resource"] = r["resource"]
        df["dataset"] = r["dataset"]
    except Exception:
        # Best effort: when the query fails, keep a placeholder row so the resource
        # still appears in the result. `except Exception` (not a bare `except`) lets
        # KeyboardInterrupt stop this long-running network loop.
        df = pd.DataFrame({"resource": [r["resource"]],
                           "dataset": [r["dataset"]]
        })

    results_col_map.append(df)

results_col_map_df = pd.concat(results_col_map)

print(len(results_col_map_df))
results_col_map_df.head()

In [None]:
def get_fields_for_resource(resource, dataset):
    """Fetch the distinct fields that have facts for a resource.

    Runs a SQL query joining `fact_resource` to `fact` in the given dataset's
    Datasette database, grouped by field, and reads the CSV response.

    Args:
        resource: Resource hash to filter on.
        dataset: Dataset name, used as the Datasette database name in the URL.

    Returns:
        A DataFrame with the columns `field` and `resource`, one row per field.
    """
    datasette_url = "https://datasette.planning.data.gov.uk/"
    sql = f"""
        select f.field, fr.resource
        from 
            fact_resource fr
            inner join fact f on fr.fact = f.fact
        where 
            resource = '{resource}'
        group by
            f.field
        """
    # "_size": "max" asks Datasette for the largest page of rows it allows
    query_string = urllib.parse.urlencode({"sql": sql, "_size": "max"})
    facts_df = pd.read_csv(f"{datasette_url}{dataset}.csv?{query_string}")
    return facts_df

# get_fields_for_resource("81ed286e34b43d1f9f3053e463a6151224b182538ce98f9064f43ebd30dc2973", "conservation-area")

In [None]:
# Collect the fields present for every (resource, dataset) pair in endpoint_latest_long_df
results_field_resource = []

for index, r in endpoint_latest_long_df.iterrows():
    try:
        df = get_fields_for_resource(r["resource"], r["dataset"])
        df["dataset"] = r["dataset"]
    except Exception:
        # Best effort: when the query fails, keep a placeholder row with a missing
        # field so the resource still appears in the result. `except Exception`
        # (not a bare `except`) lets KeyboardInterrupt stop this long network loop.
        df = pd.DataFrame({"resource": [r["resource"]],
                           "dataset": [r["dataset"]],
                           "field": [np.nan]
        })

    results_field_resource.append(df)

results_field_resource_df = pd.concat(results_field_resource)

print(len(results_field_resource_df))

# Constant marker column added to every row, including the placeholder rows
results_field_resource_df["field_present"] = 1
results_field_resource_df.head()

In [None]:
# Show the rows with no field value, which includes the placeholder rows added when a query failed.
results_field_resource_df[results_field_resource_df["field"].isnull()]