# Invalid category value issue check
**Author**:  Greg Slater <br>
**Date**:  8th October 2024 <br>
**Dataset Scope**: all <br>
**Report Type**: Ad-hoc analysis <br>

## Purpose
Check the new [categorical value data quality issue](https://datasette.planning.data.gov.uk/digital-land/issue_type?_sort=issue_type&issue_type__exact=invalid+category+value) is working as expected. 

We want to:  

* Check the issues being raised are valid
* Identify any invalid category values which we think should be added to the [category datasets](https://www.planning.data.gov.uk/dataset/#category) by the Data Design team
* Identify any invalid category values for statutory datasets which should be patched (i.e. they are close to a valid value)




In [1]:
import pandas as pd
import os
import urllib
import spatialite
from datetime import datetime

# Show up to 100 rows when a DataFrame is displayed.
pd.set_option("display.max_rows", 100)

# Directory that downloaded database files are written to; created here if
# it does not exist yet.
data_dir = "../../data/db_downloads/"
os.makedirs(data_dir, exist_ok=True)


In [2]:
def datasette_query(db, sql_string):
    """Run a SQL query against the planning.data.gov.uk Datasette instance.

    The query is sent to the ``{db}.csv`` endpoint with ``_size=max`` and the
    CSV response is parsed by pandas.

    Args:
        db: Name of the Datasette database, e.g. ``"digital-land"``.
        sql_string: SQL statement to run.

    Returns:
        A DataFrame holding the rows of the CSV response.
    """
    query_string = urllib.parse.urlencode({"sql": sql_string, "_size": "max"})
    return pd.read_csv(
        f"https://datasette.planning.data.gov.uk/{db}.csv?{query_string}"
    )

def query_sqlite(db_path, query_string):
    """Run a query against a local SQLite database and return a DataFrame.

    Args:
        db_path: Path of the database file, passed to ``spatialite.connect``.
        query_string: SQL statement to execute.

    Returns:
        A DataFrame with one row per result row; column names are taken from
        the cursor description.
    """
    con = spatialite.connect(db_path)
    try:
        # Assuming the connection behaves like sqlite3.Connection: its context
        # manager only commits / rolls back the transaction and does NOT close
        # the connection, so close it explicitly below.
        with con:
            cursor = con.execute(query_string)
            cols = [column[0] for column in cursor.description]
            results_df = pd.DataFrame.from_records(data=cursor.fetchall(), columns=cols)
    finally:
        con.close()

    return results_df

FILES_URL = 'https://datasette.planning.data.gov.uk/'


def download_dataset(dataset, output_dir_path, overwrite=False):
    """Download ``{dataset}.db`` from ``FILES_URL`` into ``output_dir_path``.

    Args:
        dataset: Dataset name; the file fetched is ``{dataset}.db``.
        output_dir_path: Directory to save the file in (created if missing).
        overwrite: When falsy and the file already exists, nothing is
            downloaded. When true the file is always fetched again.

    Returns:
        None.
    """
    # `import urllib` on its own does not guarantee that the `request`
    # submodule has been loaded, so import it explicitly here.
    import urllib.request

    dataset_file_name = f'{dataset}.db'

    os.makedirs(output_dir_path, exist_ok=True)
    output_file_path = os.path.join(output_dir_path, dataset_file_name)

    if not overwrite and os.path.exists(output_file_path):
        return

    # Build the URL with "/" explicitly; os.path.join is for filesystem paths.
    final_url = FILES_URL.rstrip('/') + '/' + dataset_file_name
    print(f'downloading data from {final_url}')
    print(f'to: {output_file_path}')

    # Download to a temporary name and move it into place afterwards, so an
    # interrupted download never leaves a truncated .db file that a later
    # call with overwrite=False would mistake for a complete one.
    partial_path = output_file_path + '.part'
    try:
        urllib.request.urlretrieve(final_url, partial_path)
        os.replace(partial_path, output_file_path)
    finally:
        if os.path.exists(partial_path):
            os.remove(partial_path)

    print('download complete')

## Data Import

In [None]:
# get a list of all the invalid category value issues happening
# (one row per dataset / resource / field / value, with the number of issues)

# The issue type is a single-quoted SQL string literal: double quotes are
# identifier syntax, which SQLite only falls back to reading as a string.
# NOTE(review): datasette_query requests `_size=max`, so the result may be
# capped by the server - confirm the row count printed below is not truncated.
q = """
    SELECT dataset, resource, field, value, count(*) as n_issues
    FROM issue 
    WHERE issue_type = 'invalid category value'
    GROUP BY dataset, resource, field, value 
    ORDER BY dataset
"""

cat_issue_sum = datasette_query("digital-land", q)

print(len(cat_issue_sum))
cat_issue_sum.head()

In [None]:
# fetch the patch table (all columns) from the digital-land database
# NOTE(review): `patches` is not referenced again in the cells visible here -
# confirm whether it is still needed.
q = """select * from patch"""

patches = datasette_query("digital-land", q)

# number of patch rows returned
print(len(patches))
# patches.head()

In [None]:
# get endpoint to resource lookup - we want the endpoint hash for creating new patches where we need them

# overwrite=True forces a fresh download even if performance.db already exists locally
download_dataset("performance", data_dir, overwrite=True)
perf_path = os.path.join(data_dir, "performance.db")

# '' (single quotes) is the SQL empty-string literal; "" is identifier syntax
# that SQLite only falls back to reading as a string.
q = """
    SELECT distinct endpoint, resource
    FROM reporting_historic_endpoints
    WHERE resource != ''
"""

ep_res = query_sqlite(perf_path, q)

print(len(ep_res))
ep_res.head()

## Analysis

In [5]:
# cat_issue_sum["dataset"].value_counts()

In [None]:
# ODP datasets are left out of this review
ODP_datasets = [
    "article-4-direction",
    "article-4-direction-area",
    "conservation-area",
    "conservation-area-document",
    "listed-building-outline",
    "tree",
    "tree-preservation-order",
    "tree-preservation-zone",
]

issues_to_check = cat_issue_sum.loc[~cat_issue_sum["dataset"].isin(ODP_datasets)]

# row counts before and after dropping the ODP datasets
print(len(cat_issue_sum))
print(len(issues_to_check))

# attach the endpoint for each resource (left join, so issues whose resource
# has no match in ep_res are kept with a missing endpoint)
issues_to_check = pd.merge(issues_to_check, ep_res, how="left", on="resource")

# row count after the join
print(len(issues_to_check))

# export for manual review - resource is dropped so each distinct
# dataset / field / value / n_issues combination appears once
td = f"{datetime.today():%Y-%m-%d}"
(
    issues_to_check[["dataset", "field", "value", "n_issues"]]
    .drop_duplicates()
    .to_csv(f"cat_issues_to_check_{td}.csv", index=False)
)

The next step is to manually review the issues exported to `cat_issues_to_check_[yyyy-mm-dd].csv`. Add a `needs_patching` field, set to "yes" for every row that requires a patch, and put the corrected value in a `patch_value` field. Save the annotated file with a `_noted` suffix (e.g. `cat_issues_to_check_2024-10-14_noted.csv`), then read it back in below.

In [17]:
# read back in annotated results to create new required patches
# NOTE(review): the file name is hard-coded to a specific export date with a
# `_noted` suffix - update it when re-running against a newer export.
issues_validated = pd.read_csv("cat_issues_to_check_2024-10-14_noted.csv")

In [None]:
# rows flagged for patching in the manual review, one per dataset / field / value
to_patch = issues_validated.loc[
    issues_validated["needs_patching"] == "yes",
    ["dataset", "field", "value", "patch_value"],
].drop_duplicates()

# join back on endpoint hash
# issues_to_check holds one row per resource, so the same endpoint can repeat
# for a given dataset / field / value; de-duplicate the lookup so each patch
# is listed once per endpoint rather than once per resource.
to_patch = to_patch.merge(
    issues_to_check[["endpoint", "dataset", "field", "value"]].drop_duplicates(),
    how="left",
    on=["dataset", "field", "value"],
)

to_patch

In [None]:
# print output to copy into patch.csv
import csv
import sys

print("dataset,resource,field,pattern,value,entry-number,start-date,end-date,entry-date,endpoint")
print("-------------------------------------")

# csv.writer quotes any cell that contains a comma or a quote character, so
# such values stay in a single column when pasted into patch.csv.
patch_writer = csv.writer(sys.stdout, lineterminator="\n")

required = ["dataset", "field", "value", "patch_value", "endpoint"]
incomplete_rows = 0

for _, r in to_patch.iterrows():
    # The left join leaves endpoint missing when nothing matched, and the
    # annotated file may have blank cells; skip such rows instead of failing
    # part-way through the output.
    if r[required].isna().any():
        incomplete_rows += 1
        continue

    patch_writer.writerow([
        r["dataset"],
        "",  # resource
        r["field"],
        r["value"],  # pattern
        r["patch_value"],  # value
        "", "", "", "",  # entry-number, start-date, end-date, entry-date
        r["endpoint"],
    ])

if incomplete_rows:
    print(
        f"skipped {incomplete_rows} row(s) with a missing value in one of {required}",
        file=sys.stderr,
    )

## Misc

In [None]:
# fetch the entities of a category dataset from the platform API to see its
# valid values (the request asks for at most 100 entities via limit=100)
import json

cat_dataset = "contribution-purpose"

with urllib.request.urlopen(
    f"https://www.planning.data.gov.uk/entity.json?dataset={cat_dataset}&limit=100"
) as url:
    # `url` is the open HTTP response; parse its body as JSON
    data = json.loads(url.read())

# one row per entity in the response
cont_purpose = pd.DataFrame.from_records(data=data["entities"])
cont_purpose