# Invalid category value issue check
**Author**:  Greg Slater <br>
**Date**:  8th October 2024 <br>
**Dataset Scope**: all <br>
**Report Type**: Ad-hoc analysis <br>

## Purpose
Check the new [categorical value data quality issue](https://datasette.planning.data.gov.uk/digital-land/issue_type?_sort=issue_type&issue_type__exact=invalid+category+value) is working as expected. 

We want to:  

* Check the issues being raised are valid
* Identify any invalid category values which we think should be added to the [category datasets](https://www.planning.data.gov.uk/dataset/#category) by the Data Design team
* Identify any invalid category values for statutory datasets which should be patched (i.e. they are close to a valid value)




In [21]:
import os
import urllib
import urllib.parse
import urllib.request
from contextlib import closing
from datetime import datetime

import pandas as pd
import spatialite

# Allow up to 100 rows when a DataFrame is displayed
pd.options.display.max_rows = 100

# Folder (relative to this notebook) used for downloaded databases;
# created up front if it does not exist yet
data_dir = "../../data/db_downloads/"
os.makedirs(data_dir, exist_ok=True)


In [8]:
def datasette_query(db, sql_string):
    """Run a SQL query against a planning.data.gov.uk datasette database.

    Args:
        db: name of the datasette database, used as the path segment in
            the request URL (e.g. "digital-land").
        sql_string: SQL text, sent URL-encoded as the ``sql`` parameter.

    Returns:
        A DataFrame read from the CSV response.
    """
    # "_size": "max" is passed through to datasette as the page-size setting
    query = urllib.parse.urlencode({"sql": sql_string, "_size": "max"})
    return pd.read_csv(f"https://datasette.planning.data.gov.uk/{db}.csv?{query}")

def query_sqlite(db_path, query_string):
    """Run a query against a local SQLite/SpatiaLite database file.

    Args:
        db_path: path of the database file to open.
        query_string: SQL text to execute.

    Returns:
        A DataFrame with one column per result column (named from the
        cursor description) and one row per fetched record.
    """
    # A sqlite3-style connection used as a context manager only commits or
    # rolls back the transaction - it does not close the connection
    # (assumes spatialite connections behave like sqlite3 ones - TODO confirm).
    # closing() releases the connection; the inner `con` keeps the original
    # commit/rollback behaviour.
    with closing(spatialite.connect(db_path)) as con, con:
        cursor = con.execute(query_string)
        cols = [column[0] for column in cursor.description]
        results_df = pd.DataFrame.from_records(data=cursor.fetchall(), columns=cols)

    return results_df

FILES_URL = 'https://datasette.planning.data.gov.uk/'


def download_dataset(dataset, output_dir_path, overwrite=False):
    """Download ``<dataset>.db`` from FILES_URL into ``output_dir_path``.

    Args:
        dataset: dataset name; the downloaded file is ``f'{dataset}.db'``.
        output_dir_path: directory to save into (created if missing).
        overwrite: when falsy and the target file already exists, nothing is
            downloaded and the existing file is kept.

    Returns:
        None.

    Raises:
        Whatever ``urllib.request.urlretrieve`` raises when the download
        fails; in that case no file is left at the target path and any
        previously downloaded copy is kept.
    """
    dataset_file_name = f'{dataset}.db'

    os.makedirs(output_dir_path, exist_ok=True)
    output_file_path = os.path.join(output_dir_path, dataset_file_name)

    if not overwrite and os.path.exists(output_file_path):
        return

    # Build the URL with '/' explicitly: os.path.join is for filesystem paths
    final_url = f"{FILES_URL.rstrip('/')}/{dataset_file_name}"
    print(f'downloading data from {final_url}')
    print(f'to: {output_file_path}')

    # Download to a temporary name and rename once complete, so an interrupted
    # download can never be mistaken for a valid cached file on a later run.
    partial_path = output_file_path + '.part'
    try:
        urllib.request.urlretrieve(final_url, partial_path)
        os.replace(partial_path, output_file_path)
    finally:
        # Only present here if the download or rename failed
        if os.path.exists(partial_path):
            os.remove(partial_path)

    print('download complete')

## Data Import

In [71]:
# get a list of all the invalid category value issues happening,
# counted per dataset / resource / field / value

# NOTE: SQL string literals take single quotes; double quotes denote
# identifiers and only work as strings through a SQLite legacy fallback
q = """
    SELECT dataset, resource, field, value, count(*) as n_issues
    FROM issue 
    WHERE issue_type = 'invalid category value'
    GROUP BY dataset, resource, field, value 
    ORDER BY dataset
"""

cat_issue_sum = datasette_query("digital-land", q)

print(len(cat_issue_sum))
cat_issue_sum.head()

425


Unnamed: 0,dataset,resource,field,value,n_issues
0,ancient-woodland,1d5336e3a650cb037328bf9c2911309328cdc743935842...,ancient-woodland-status,ASNW,39225
1,ancient-woodland,1d5336e3a650cb037328bf9c2911309328cdc743935842...,ancient-woodland-status,AWP,64
2,ancient-woodland,1d5336e3a650cb037328bf9c2911309328cdc743935842...,ancient-woodland-status,PAWS,14343
3,ancient-woodland,80709f042768e421a82f4aaa523f34b837e77af71b4c8a...,ancient-woodland-status,ASNW,39217
4,ancient-woodland,80709f042768e421a82f4aaa523f34b837e77af71b4c8a...,ancient-woodland-status,AWP,64


In [72]:
# get endpoint to resource lookup - we want the endpoint hash for creating new patches where we need them

# overwrite=True: always fetch a fresh copy of the performance database
download_dataset("performance", data_dir, overwrite=True)
perf_path = os.path.join(data_dir, "performance.db")

# NOTE: the empty string is written with single quotes; double quotes denote
# identifiers and only work as strings through a SQLite legacy fallback
q = """
    SELECT distinct endpoint, resource
    FROM reporting_historic_endpoints
    WHERE resource != ''
"""

ep_res = query_sqlite(perf_path, q)

print(len(ep_res))
ep_res.head()

downloading data from https://datasette.planning.data.gov.uk/performance.db
to: ../../data/db_downloads/performance.db
download complete
7169


Unnamed: 0,endpoint,resource
0,a16e45dbefe2d67a6d27c086768b6c3610d4e057bb1962...,f1e218c96f99e378fdbaed9a426c6b44d0e7d3b5fec63e...
1,5d30aee8c82e775dd4be67dd417bf782b33de8522edc1e...,2c8470a8cf8f80693ba26bc208c754c2c5c6f02c27ec8b...
2,5d30aee8c82e775dd4be67dd417bf782b33de8522edc1e...,cd5ea36ba52b11b51f6b9bab380710646daf8607bd2107...
3,4c238528f325bcca9a03697583d9f39a91ecf1ec1ecb66...,40ba241c5f5da1de68d3c025e68210d8707413a4d7c50c...
4,58a42ab25e8f24f7dd80b3f0da920be961950bb45442f1...,a6f17c3bc674d60391fc9a1ec361a45627435357916b3d...


## Analysis

In [77]:
# cat_issue_sum["dataset"].value_counts()

In [76]:
# remove ODP datasets from list
ODP_datasets = [
    "article-4-direction",
    "article-4-direction-area",
    "conservation-area",
    "conservation-area-document",
    "listed-building-outline",
    "tree",
    "tree-preservation-order",
    "tree-preservation-zone",
]

is_odp_dataset = cat_issue_sum["dataset"].isin(ODP_datasets)
issues_to_check = cat_issue_sum[~is_odp_dataset]

# row counts before and after dropping the ODP datasets
print(len(cat_issue_sum))
print(len(issues_to_check))

# join on endpoint (left join keeps issues whose resource has no endpoint match)
issues_to_check = pd.merge(issues_to_check, ep_res, how="left", on="resource")

print(len(issues_to_check))

# export - going to endpoint level rather than resource, for patching purposes
td = datetime.today().strftime('%Y-%m-%d')
export_cols = ["endpoint", "dataset", "field", "value", "n_issues"]
export_df = issues_to_check[export_cols].drop_duplicates()
export_df.to_csv(f"cat_issues_to_check_{td}.csv", index=False)

425
302
310


The next step is to manually review the issues exported to `cat_issues_to_check_[yyyy-mm-dd].csv`. Add a `needs_patching` field, set to "yes" for any issue that requires a patch, and put the corrected value in a `patch_value` field. Save the annotated file as `cat_issues_to_check_[yyyy-mm-dd]_noted.csv`, then read it back in below.

In [65]:
# read back in annotated results to create new required patches
# (file name is fixed to the manually annotated export from 2024-10-08)
annotated_csv_path = "cat_issues_to_check_2024-10-08_noted.csv"
issues_validated = pd.read_csv(annotated_csv_path)

In [66]:
# keep only the rows flagged "yes" in needs_patching, one row per distinct patch
patch_cols = ["endpoint", "dataset", "field", "value", "patch_value"]
flagged_for_patch = issues_validated["needs_patching"] == "yes"
to_patch = issues_validated.loc[flagged_for_patch, patch_cols].drop_duplicates()

to_patch.head()

Unnamed: 0,endpoint,dataset,field,value,patch_value
15,e08f600e90eb62a72fe50b79d49681b2d64a966df97fc7...,developer-agreement-contribution,contribution-purpose,Amended -Affordable- Housing- plan- locations,affordable-housing
17,1ded2d38b58e6288e3272c42945cdf69626125c02c5691...,developer-agreement-contribution,contribution-purpose,community-facilties,community-facilities
20,e80e128827994e3997e8bffb1550b1cbeefac08f7e598f...,developer-agreement-contribution,contribution-purpose,digtial-infrastructure,digital-infrastructure
24,a54f2b079838979c3fc00056be124dfa0a34fb3ac6a0e2...,developer-agreement-contribution,contribution-purpose,Economic Developent (Local Employment),economic-development
25,a54f2b079838979c3fc00056be124dfa0a34fb3ac6a0e2...,developer-agreement-contribution,contribution-purpose,Education - Secondary,education


In [70]:
# print output to copy into patch.csv

import csv
import io

# Rows go through csv.writer so values containing commas or quotes are
# escaped correctly instead of breaking the column layout.
patch_buffer = io.StringIO()
patch_writer = csv.writer(patch_buffer, lineterminator="\n")

# fillna(""): a missing endpoint or patch value is written as an empty
# field rather than raising on str + float concatenation
for r in to_patch.fillna("").itertuples(index=False):
    # column order matches the header printed below; the source value is the
    # `pattern` to match and the corrected value is the new `value`
    patch_writer.writerow(
        [r.dataset, "", r.field, r.value, r.patch_value, "", "", "", "", r.endpoint]
    )

print("dataset,resource,field,pattern,value,entry-number,start-date,end-date,entry-date,endpoint")
print("-------------------------------------")
print(patch_buffer.getvalue(), end="")

dataset,resource,field,pattern,value,entry-number,start-date,end-date,entry-date,endpoint
-------------------------------------
developer-agreement-contribution,,contribution-purpose,Amended -Affordable- Housing- plan- locations,affordable-housing,,,,,e08f600e90eb62a72fe50b79d49681b2d64a966df97fc77dc6c20a8a8a09f43e
developer-agreement-contribution,,contribution-purpose,community-facilties,community-facilities,,,,,1ded2d38b58e6288e3272c42945cdf69626125c02c5691dc24a486b3934210ff
developer-agreement-contribution,,contribution-purpose,digtial-infrastructure,digital-infrastructure,,,,,e80e128827994e3997e8bffb1550b1cbeefac08f7e598ffc082d7ab86527948d
developer-agreement-contribution,,contribution-purpose,Economic Developent (Local Employment),economic-development,,,,,a54f2b079838979c3fc00056be124dfa0a34fb3ac6a0e213bb24965996b03a27
developer-agreement-contribution,,contribution-purpose,Education - Secondary,education,,,,,a54f2b079838979c3fc00056be124dfa0a34fb3ac6a0e213bb24965996b03a27
develope

## Misc

In [None]:
# import category dataset to get full list of valid values

import json

cat_dataset = "contribution-purpose"
entity_url = f"https://www.planning.data.gov.uk/entity.json?dataset={cat_dataset}&limit=100"

# the response body is JSON; its "entities" list holds one record per value
with urllib.request.urlopen(entity_url) as response:
    data = json.load(response)

cont_purpose = pd.DataFrame.from_records(data["entities"])
cont_purpose.head()

Unnamed: 0,entry-date,start-date,end-date,entity,name,dataset,typology,reference,prefix,organisation-entity,geometry,point
0,2023-09-28,2018-12-04,,16200000,Affordable housing,contribution-purpose,category,affordable-housing,contribution-purpose,,,
1,2023-09-28,2019-05-07,,16200001,Bonds,contribution-purpose,category,bonds,contribution-purpose,,,
2,2023-09-28,2019-03-27,,16200002,CIL administration costs,contribution-purpose,category,cil-administration-costs,contribution-purpose,,,
3,2023-09-28,2018-12-04,,16200003,Community facilities,contribution-purpose,category,community-facilities,contribution-purpose,,,
4,2023-09-28,2018-12-04,,16200004,Community infrastructure levy,contribution-purpose,category,community-infrastructure-levy,contribution-purpose,,,
