In [1]:
%pip install wget
import wget
import pandas as pd
import os


Note: you may need to restart the kernel to use updated packages.


In [2]:
# Ensure the shared helper module sits next to the notebook, downloading it
# from GitHub when it is missing, then star-import it.
# NOTE(review): names used further down that are not defined in this notebook
# (e.g. get_latest_endpoints, handle_skip_dataset, urllib) presumably come
# from this star import - confirm against the helper module.
util_file = "master_report_endpoint_utils.py"
if not os.path.isfile(util_file):
    url = "https://raw.githubusercontent.com/digital-land/jupyter-analysis/main/service_report/master_report/master_report_endpoint_utils.py"
    wget.download(url)
from master_report_endpoint_utils import *

In [57]:
def check_columns_in_endpoint(fields, dataset_field_df, column_field_df):
    """Score how closely a resource's fields match the dataset specification.

    Two scores are produced, each as an ``"n/total"`` string plus a percentage
    of the expected specification fields:

    * structure - expected fields that are present in ``fields``.
    * column - present fields that do not appear in the ``field`` column of
      ``column_field_df`` (presumably fields supplied under the expected name
      without needing a column mapping - confirm against pipeline semantics).

    The ``entity``, ``organisation`` and ``prefix`` fields are never counted
    as expected fields, and mappings whose source column is ``WKT`` are
    ignored.

    Args:
        fields: Collection of field names found for the resource.
        dataset_field_df: DataFrame with a ``field`` column listing the
            fields expected for the dataset.
        column_field_df: DataFrame with ``column`` and ``field`` columns
            holding the column mappings recorded for the resource.

    Returns:
        Tuple ``(structure_score, structure_percentage, column_score,
        column_percentage)``. The column score is ``"0/0"`` and ``0`` when no
        mappings remain after filtering. All four values are zero scores when
        the specification lists no expected fields.
    """
    # Filter instead of list.remove() so a specification slice that lacks one
    # of these fields does not raise ValueError.
    excluded_fields = {'entity', 'organisation', 'prefix'}
    dataset_columns = [
        column
        for column in dataset_field_df['field'].tolist()
        if column not in excluded_fields
    ]

    expected_count = len(dataset_columns)
    if expected_count == 0:
        # Nothing to score against; avoids dividing by zero below.
        return "0/0", 0, "0/0", 0

    present_columns = [column for column in dataset_columns if column in fields]
    present_count = len(present_columns)
    structure_score = f"{present_count}/{expected_count}"
    structure_percentage = present_count / expected_count * 100

    # Mappings from a WKT source column are not counted as renamed columns.
    filtered_columns = ["WKT"]
    column_field_df = column_field_df[~column_field_df['column'].isin(filtered_columns)]

    if len(column_field_df.index) == 0:
        # No mappings left for this resource: report an empty column score.
        return structure_score, structure_percentage, "0/0", 0

    mapped_fields = set(column_field_df['field'].tolist())
    correct_column_names = sum(
        1 for field in present_columns if field not in mapped_fields
    )
    column_score = f"{correct_column_names}/{expected_count}"
    column_percentage = correct_column_names / expected_count * 100
    return structure_score, structure_percentage, column_score, column_percentage


def get_fields_for_resource(resource, dataset):
    """Return the distinct fact fields recorded for a resource.

    Queries the ``{dataset}`` database on the planning.data.gov.uk Datasette
    instance, joining ``fact_resource`` to ``fact`` and grouping by field, and
    reads the CSV response with pandas.

    Args:
        resource: Resource identifier interpolated into the SQL ``where``
            clause.
        dataset: Dataset name, used as the Datasette database name in the URL.

    Returns:
        List of values from the ``field`` column of the query result.
    """
    query = f"""
        select f.field 
        from 
            fact_resource fr
            inner join fact f on fr.fact = f.fact
        where 
            resource = '{resource}'
        group by
            f.field
        """
    params = urllib.parse.urlencode({"sql": query, "_size": "max"})
    datasette_url = "https://datasette.planning.data.gov.uk/"
    url = f"{datasette_url}{dataset}.csv?{params}"
    return pd.read_csv(url)['field'].tolist()

def get_column_mappings_for_resource(resource, dataset):
    """Fetch the column-to-field mappings recorded for a resource.

    Queries the ``column_field`` table of the ``{dataset}`` database on the
    planning.data.gov.uk Datasette instance and reads the CSV response with
    pandas.

    Args:
        resource: Resource identifier interpolated into the SQL ``where``
            clause.
        dataset: Dataset name, used as the Datasette database name in the URL.

    Returns:
        DataFrame with the ``column`` and ``field`` columns selected by the
        query.
    """
    query = f"""
        select column, field
        from 
          column_field  
        where 
            resource = '{resource}'
        """
    params = urllib.parse.urlencode({"sql": query, "_size": "max"})
    datasette_url = "https://datasette.planning.data.gov.uk/"
    url = f"{datasette_url}{dataset}.csv?{params}"
    return pd.read_csv(url)
   


In [4]:
# Organisations to report on: read from ./organisation_input.csv (its
# 'organisation' column) when that file exists, otherwise fall back to the
# default prioritised LPAs listed here.
input_path = './organisation_input.csv'
if not os.path.isfile(input_path):
    organisation_list = [
        'local-authority-eng:BUC',
        'local-authority-eng:DAC',
        'local-authority-eng:DNC',
        'local-authority-eng:GLO',
        'local-authority-eng:CMD',
        'local-authority-eng:LBH',
        'local-authority-eng:SWK',
        'local-authority-eng:MDW',
        'local-authority-eng:NET',
        'local-authority-eng:BIR',
        'local-authority-eng:CAT',
        'local-authority-eng:EPS',
        'local-authority-eng:BNE',
        'local-authority-eng:GAT',
        'local-authority-eng:GRY',
        'local-authority-eng:KTT',
        'local-authority-eng:SAL',
        'local-authority-eng:TEW',
        'local-authority-eng:WBK',
        'local-authority-eng:DST',
        'local-authority-eng:DOV',
        'local-authority-eng:LIV',
        'local-authority-eng:RDB',
        'local-authority-eng:WFT',
        'local-authority-eng:NLN',
        'local-authority-eng:NSM',
        'local-authority-eng:SLF',
        'local-authority-eng:WRL',
    ]
    print('Input file not found. Using default list of organisations.')
else:
    input_df = pd.read_csv(input_path)
    organisation_list = input_df['organisation'].tolist()
    print('Input file found. Using', len(organisation_list), 'organisations from input file.')

Input file not found. Using default list of organisations.


In [5]:
# Get organisation names for output table.
organisation_info_df = pd.read_csv('https://raw.githubusercontent.com/digital-land/organisation-collection/main/data/local-authority.csv')

# Build a reference -> name lookup once instead of re-filtering the DataFrame
# for every organisation. drop_duplicates keeps the first row per reference,
# matching the previous `.iloc[0]` choice.
organisation_names_by_reference = (
    organisation_info_df
    .drop_duplicates(subset='reference')
    .set_index('reference')['name']
    .to_dict()
)

organisation_name_dict = {}
for organisation in organisation_list:
    # Ids are expected to look like '<prefix>:<reference>'; an id without a
    # colon is looked up as-is rather than raising IndexError.
    id_parts = organisation.split(':')
    organisation_code = id_parts[1] if len(id_parts) > 1 else organisation
    # Organisations missing from the CSV (e.g. ones supplied through the input
    # file) fall back to their id so the report can still be produced.
    organisation_name = organisation_names_by_reference.get(organisation_code, organisation)
    organisation_name_dict[organisation] = organisation_name

In [7]:
datasette_url = "https://datasette.planning.data.gov.uk/"

# Datasets covered by the report, plus the combined pipeline values that are
# also accepted when filtering endpoints.
collection_list = [
    'article-4-direction',
    'article-4-direction-area',
    'conservation-area',
    'conservation-area-document',
    'listed-building-outline',
    'tree-preservation-order',
    'tree-preservation-zone',
    'tree',
]
pipelines_list = [
    *collection_list,
    'tree,tree-preservation-order',
    'tree-preservation-order,tree-preservation-zone',
]

# Collect latest endpoints for each organisation, keeping only the rows whose
# 'pipelines' value is one of the values in pipelines_list.
all_orgs_latest_endpoints = {}
for organisation in organisation_list:
    latest_endpoints_df = get_latest_endpoints(organisation)
    in_scope_mask = latest_endpoints_df['pipelines'].isin(pipelines_list)
    latest_endpoints_df = latest_endpoints_df[in_scope_mask]
    all_orgs_latest_endpoints[organisation] = latest_endpoints_df

In [56]:
def compute_cell_colour(value):
    """Map a percentage string such as ``"42%"`` to a CSS background colour.

    Args:
        value: Cell text. Only values containing ``%`` are styled; the text
            left after removing ``%`` must parse as an integer.

    Returns:
        ``'background-color: green'`` for 75 and above, ``orange`` for 50-74,
        ``red`` for 0-49 and ``brown`` for negative values. ``None`` when the
        value contains no ``%``.
    """
    if "%" not in value:
        return None

    percentage = int(value.replace("%", ""))
    if percentage < 0:
        return 'background-color: brown'
    if percentage < 50:
        return 'background-color: red'
    if percentage < 75:
        return 'background-color: orange'
    return 'background-color: green'

# Column order of the compliance table. Passing it to the DataFrame
# constructor keeps the columns present even when no rows were collected, so
# the CSV export and the styling step below still work on an empty result.
compliance_columns = [
    'organisation', 'dataset',
    'structure_score', 'structure_percentage',
    'column_score', 'column_percentage',
    'overall_percentage',
]

# The specification is the same for every organisation and dataset, so it is
# downloaded once here rather than inside the innermost loop.
specification_field_df = pd.read_csv('https://raw.githubusercontent.com/digital-land/specification/main/specification/dataset-field.csv')

organisation_dataset_compliance_dict = {}
rows_list = []
for organisation in organisation_list:
    latest_endpoints_df = all_orgs_latest_endpoints[organisation]
    dataset_compliance_dict = {}
    for index, row in latest_endpoints_df.iterrows():
        resource = row['resource']
        # A pipelines value may name several comma-separated datasets; split
        # also handles the single-dataset case (one-element list).
        datasets = row['pipelines'].split(',')
        for dataset in datasets:
            # When several endpoints feed the same dataset, the helper decides
            # whether this row should be skipped.
            same_datasets_df = latest_endpoints_df[latest_endpoints_df["pipelines"].apply(lambda x: dataset in x.split(','))]
            if len(same_datasets_df) > 1:
                skip_dataset = handle_skip_dataset(same_datasets_df, dataset, row)
            else:
                skip_dataset = False
            print(organisation, dataset, resource)

            if skip_dataset:
                continue

            dataset_field_df = specification_field_df[specification_field_df['dataset'] == dataset]
            column_field_df = get_column_mappings_for_resource(resource, dataset)
            fields = get_fields_for_resource(resource, dataset)
            structure_score, structure_percentage, column_score, column_percentage = check_columns_in_endpoint(fields, dataset_field_df, column_field_df)
            # Overall score is the plain mean of the two percentages.
            overall_percentage = (structure_percentage + column_percentage) / 2
            dataset_compliance_dict[dataset] = {"structure_score": structure_score, "structure_percentage": structure_percentage, "column_score": column_score, "column_percentage": column_percentage}
            new_row = {'organisation': organisation_name_dict[organisation], 'dataset': dataset, 'structure_score': structure_score, 'structure_percentage': f"{int(structure_percentage)}%" , 'column_score': column_score, 'column_percentage': f"{int(column_percentage)}%", 'overall_percentage': f"{int(overall_percentage)}%"}
            rows_list.append(new_row)

    organisation_dataset_compliance_dict[organisation] = dataset_compliance_dict
    print(dataset_compliance_dict)

compliance_df = pd.DataFrame(rows_list, columns=compliance_columns)
compliance_df.to_csv('compliance.csv', index=False)

# Styler.applymap was renamed to Styler.map in newer pandas releases; use
# whichever this pandas version provides. The styled table is the cell output.
compliance_styler = compliance_df.style
colour_cells = compliance_styler.map if hasattr(compliance_styler, 'map') else compliance_styler.applymap
colour_cells(compute_cell_colour, subset=["structure_percentage", "column_percentage", "overall_percentage"])
        

local-authority-eng:BUC listed-building-outline 1ad3fa87e013320e83b627aaf04ef7b57f88229f7bd7d7f4488b7e688a090ecd
local-authority-eng:BUC tree 1f4e3fa08eec1db0885fb3d03503a8a7d632077a79e74a2d85790e71328396cb
local-authority-eng:BUC tree-preservation-zone 4c7492e7450ff7053a178ed9442597f2552d8132709d5443b1c4474888881d4b
local-authority-eng:BUC conservation-area 51665daeaeece9136f674e024bd5095a1f8a69cf7a3c879ff016006884bbbb98
local-authority-eng:BUC article-4-direction-area 26f858d2b92f412b2d94d23520c0776fbc435cb1fd069a8e6ea4c77a865e08da
{'listed-building-outline': {'structure_score': '0/17', 'structure_percentage': 0.0, 'column_score': '0/17', 'column_percentage': 0.0}, 'tree': {'structure_score': '0/15', 'structure_percentage': 0.0, 'column_score': '0/15', 'column_percentage': 0.0}, 'tree-preservation-zone': {'structure_score': '0/14', 'structure_percentage': 0.0, 'column_score': '0/0', 'column_percentage': 0}, 'conservation-area': {'structure_score': '4/12', 'structure_percentage': 33.3

Unnamed: 0,organisation,dataset,structure_score,structure_percentage,column_score,column_percentage,overall_percentage
0,Buckinghamshire Council,listed-building-outline,0/17,0%,0/17,0%,0%
1,Buckinghamshire Council,tree,0/15,0%,0/15,0%,0%
2,Buckinghamshire Council,tree-preservation-zone,0/14,0%,0/0,0%,0%
3,Buckinghamshire Council,conservation-area,4/12,33%,2/12,16%,24%
4,Buckinghamshire Council,article-4-direction-area,6/13,46%,2/13,15%,30%
5,Doncaster Metropolitan Borough Council,tree-preservation-zone,8/14,57%,2/14,14%,35%
6,Doncaster Metropolitan Borough Council,tree,0/15,0%,0/15,0%,0%
7,Doncaster Metropolitan Borough Council,conservation-area,4/12,33%,2/12,16%,24%
8,Doncaster Metropolitan Borough Council,listed-building-outline,8/17,47%,2/17,11%,29%
9,Doncaster Metropolitan Borough Council,article-4-direction-area,4/13,30%,3/13,23%,26%
