# Provision data quality report
**Author**:  Greg Slater <br>
**Date**:  November 2024 <br>
**Dataset Scope**: all datasets <br>
**Report Type**: Ad-hoc <br>

## Purpose
The purpose of this report is to measure the quality of the data that makes up each data provision on the platform. It does this by applying a data quality framework that sets out the criteria a provision must meet in order to reach one of four quality levels. These levels are based on the quality requirements of the ODP software, which uses platform data.

In [32]:
import os
import sqlite3
import urllib
import urllib.parse
import urllib.request
from contextlib import closing
from datetime import datetime

import ipywidgets as widgets
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# today's date as YYYY-MM-DD, used as a suffix on the report file names
td = datetime.today().date().isoformat()

In [33]:
FILES_URL = 'https://datasette.planning.data.gov.uk/'


def download_dataset(dataset, output_dir_path, overwrite=False):
    """Download ``<dataset>.db`` from FILES_URL into ``output_dir_path``.

    Args:
        dataset: name of the database, without the ``.db`` extension.
        output_dir_path: directory to save into; created if it does not exist.
        overwrite: when falsy, an existing local copy is kept and nothing is
            downloaded.

    Returns:
        None. The file is written to ``<output_dir_path>/<dataset>.db``.
    """
    dataset_file_name = f'{dataset}.db'

    os.makedirs(output_dir_path, exist_ok=True)
    output_file_path = os.path.join(output_dir_path, dataset_file_name)

    if not overwrite and os.path.exists(output_file_path):
        return

    # build the URL with "/" explicitly: os.path.join uses the OS path
    # separator, which would produce a backslash on Windows
    final_url = f"{FILES_URL.rstrip('/')}/{dataset_file_name}"
    print(f'downloading data from {final_url}')
    print(f'to: {output_file_path}')
    urllib.request.urlretrieve(final_url, output_file_path)
    print('download complete')

In [34]:
def query_sqlite(db_path, query_string):
    """Run ``query_string`` against the SQLite file at ``db_path``.

    Args:
        db_path: path to the SQLite database file.
        query_string: SQL statement that returns rows (e.g. a SELECT).

    Returns:
        pandas DataFrame with one column per result column, named from the
        cursor description, and one row per result row.
    """
    # sqlite3's own context manager only commits / rolls back the transaction;
    # closing() is what actually releases the connection when the block exits
    with closing(sqlite3.connect(db_path)) as con, con:
        cursor = con.execute(query_string)
        cols = [column[0] for column in cursor.description]
        results_df = pd.DataFrame.from_records(data=cursor.fetchall(), columns=cols)

    return results_df

In [35]:

def datasette_query(db, sql_string):
    """Run a SQL query against a hosted datasette database and return a DataFrame.

    Args:
        db: name of the datasette database (used in the URL path).
        sql_string: SQL to run; sent URL-encoded as the ``sql`` parameter.

    Returns:
        pandas DataFrame read from the CSV response.
    """
    # NOTE(review): "_size=max" presumably still leaves a server-side row cap - confirm
    query_params = {"sql": sql_string, "_size": "max"}
    encoded_params = urllib.parse.urlencode(query_params)
    return pd.read_csv(f"https://datasette.planning.data.gov.uk/{db}.csv?{encoded_params}")

def get_issue_lookup():
    """Return issue_type, severity and responsibility for every row of the
    ``issue_type`` table in the "digital-land" datasette database."""
    issue_type_sql = """
    select issue_type, severity, responsibility
    from issue_type
"""
    issue_lookup = datasette_query("digital-land", issue_type_sql)
    return issue_lookup

In [36]:
# local folders: downloaded sqlite databases, and the report files written at the end
db_dir = "../data/db_downloads/"
output_dir = "../data/quality_report/"

# create both folders up front; nothing happens if they already exist
for _dir_path in (db_dir, output_dir):
    os.makedirs(_dir_path, exist_ok=True)

## Get data

In [60]:
# download_dataset("performance", db_dir, overwrite=True)

# lookup from issue_type to quality_category and quality_level (joined on to the issues table below)
lookup_issue_qual = pd.read_csv(
    "https://raw.githubusercontent.com/digital-land/jupyter-analysis/refs/heads/main/service_report/input/issue_type_quality.csv"
)

# named groups of datasets (pipelines) that the overview chart can be filtered to
dataset_subset_dict = {
    "ODP": [
        "conservation-area",
        "conservation-area-document",
        "article-4-direction-area",
        "article-4-direction",
        "listed-building-outline",
        "tree",
        "tree-preservation-zone",
        "tree-preservation-order",
    ],
    "BFL": ["brownfield-land"],
    "Developers": [
        "developer-agreement",
        "developer-agreement-contribution",
        "developer-agreement-transaction",
    ],
}

In [None]:
# BASE TABLE

# get table of active endpoints and resources, with issue summaries per resource joined on.
# Because of the LEFT JOIN, a resource with several issue types appears on several rows
# and a resource with no issues appears once with NULL issue columns.
# NOTE(review): the join is on resource only - confirm a resource cannot appear under
# more than one dataset in endpoint_dataset_issue_type_summary.
# Empty strings are written with single quotes: in SQLite double quotes denote identifiers
# and are only treated as string literals by a legacy fallback.
q = """
    SELECT 
        rhe.organisation, rhe.name as organisation_name, 
        rhe.collection, rhe.pipeline, rhe.endpoint, rhe.resource, rhe.latest_status, rhe.endpoint_entry_date, rhe.resource_start_date, 
        CAST(JULIANDAY('now') - JULIANDAY(rhe.resource_start_date) AS int) as resource_age_days,
        its.issue_type, its.count_issues, its.severity, its.responsibility
    FROM reporting_historic_endpoints rhe
    LEFT JOIN endpoint_dataset_issue_type_summary its on rhe.resource = its.resource
    WHERE 1=1
        AND rhe.endpoint_end_date = ''
        AND rhe.resource_end_date = ''
        AND rhe.latest_status = 200
"""

ep_res_issues = query_sqlite(os.path.join(db_dir, "performance.db"), q)

print(len(ep_res_issues))
ep_res_issues.head()

In [39]:
# Provision lookups

# distinct organisation / pipeline / cohort combinations from the performance database
q = """
    SELECT 
        distinct organisation, pipeline, cohort
    FROM endpoint_dataset_resource_summary
"""

lookup_provision = query_sqlite(
    db_path=os.path.join(db_dir, "performance.db"),
    query_string=q,
)


## Setup base tables

In [40]:
# FRESHNESS TABLE - flagging when provisions haven't been updated in last year

# create table of old resources (more than 365 days old) and flag quality level as 1.
# .copy() makes this an independent frame, so the columns added below are not
# written onto a slice of ep_res_issues (avoids SettingWithCopyWarning).
ep_res_fresh_qual = ep_res_issues.loc[
    ep_res_issues["resource_age_days"] > 365,
    ["collection", "pipeline", "organisation", "organisation_name"],
].copy()

ep_res_fresh_qual["issue_type"] = "not_fresh"
ep_res_fresh_qual["quality_category"] = "1 - endpoint updated in last year"
ep_res_fresh_qual["quality_level"] = 1

In [None]:
# ISSUES TABLE - flagging when provisions have data quality issues

# attach the quality category and level for each issue type, then keep only
# the provision and quality fields
ep_res_issues_qual = pd.merge(
    ep_res_issues,
    lookup_issue_qual[["issue_type", "quality_category", "quality_level"]],
    on="issue_type",
    how="left",
)[["collection", "pipeline", "organisation", "organisation_name", "issue_type", "quality_category", "quality_level"]]

# row counts before and after the join: a left join only adds rows if the
# lookup holds the same issue_type more than once
print(len(ep_res_issues))
print(len(ep_res_issues_qual))

ep_res_issues_qual.head()

In [42]:
# ALL QUALITY CATEGORIES TABLE - one long table holding every quality category record
# (data quality issues first, then freshness flags), stacked row-wise
ep_res_qual_all = pd.concat(objs=[ep_res_issues_qual, ep_res_fresh_qual], axis=0)


In [None]:
# SCORING - using quality framework levels to assign a quality level to each provision

# summarise by provision, taking the min (i.e. worst) quality level for each.
# dropna=False keeps provisions whose grouping fields contain NaN.
qual_summary = ep_res_qual_all.groupby(
    ["collection", "pipeline", "organisation", "organisation_name"],
    as_index=False,
    dropna=False,
).agg(
    quality_level=("quality_level", "min")
)

# provisions with no quality level at all (no issue mapped to a level and no
# freshness flag) get the top level, 4.
# Only the quality_level column is filled: a frame-wide replace of NaN would also
# overwrite any NaN grouping keys kept by dropna=False with the number 4.
qual_summary["quality_level"] = qual_summary["quality_level"].fillna(4)

level_map = {
    4: "4. excellent",
    3: "3. good for ODP",
    2: "2. improve",
    1: "1. update",
}

qual_summary["quality_level_label"] = qual_summary["quality_level"].map(level_map)
print(len(qual_summary))

In [None]:
# display the provision-level quality summary (one row per provision, with its quality level and label)
qual_summary

## LPA and dataset table summaries

### ODP LPA x Dataset quality table

In [44]:
# qual_summary

# organisation x pipeline pairs whose cohort mentions "ODP".
# na=False: a missing cohort counts as "not ODP"; without it str.contains returns NaN
# for those rows and the boolean filter raises.
lookup_provision_odp = lookup_provision[
    lookup_provision["cohort"].str.contains("ODP", na=False)
    ][["organisation", "pipeline"]].drop_duplicates()

# subset to ODP provisions and pivot so each pipeline becomes a column of quality labels
odp_lpa_summary = qual_summary.merge(
    lookup_provision_odp,
    how = "inner",
    on = ["organisation", "pipeline"]
).pivot(
    columns = "pipeline",
    values = "quality_level_label",
    index = ["organisation", "organisation_name"]
).reset_index()

# organisation / pipeline combinations with no row in qual_summary are shown as "no data"
odp_lpa_summary = odp_lpa_summary.fillna("no data")
# odp_lpa_summary

In [45]:
# background colour for each quality level label
level_colours = {
    "4. excellent": "background-color: #1a6837",
    "3. good for ODP": "background-color: #87cb67",
    "2. improve": "background-color: #fefebf",
    "1. update": "background-color: #f78c51",
}


def make_color_mask_odp_lpa(df):
    """Build the CSS mask used to style the ODP LPA x dataset table.

    Args:
        df: table whose first two columns are identifiers and whose remaining
            columns hold quality level labels.

    Returns:
        DataFrame with the same index and columns as ``df``. Cells from the
        third column onwards hold the background colour for their label;
        every other cell, and any label not in ``level_colours`` (e.g.
        "no data"), is an empty string.
    """
    # start from a mask of empty strings (= no styling)
    df_color_map = pd.DataFrame("", index=df.index, columns=df.columns)

    for col in df.columns[2:]:
        # fall back to "" so unmapped cells never become NaN in the mask
        df_color_map[col] = df[col].map(lambda label: level_colours.get(label, ""))

    return df_color_map

# odp_lpa_summary.style.apply(make_color_mask_odp_lpa, axis=None)

### Dataset x quality categories table

In [46]:
# number of records with a quality level, per provision and quality category
qual_cat_count = ep_res_qual_all.groupby(
    by=["pipeline", "organisation", "organisation_name", "quality_category"],
    as_index=False,
).agg(
    n_issues=pd.NamedAgg(column="quality_level", aggfunc="count"),
)

In [None]:
# build a base table with one row per provision x quality category,
# by joining both tables on a constant "key" column
prov = (
    ep_res_qual_all[["pipeline", "organisation", "organisation_name"]]
    .drop_duplicates()
    .assign(key=1)
)

qual_cat = (
    ep_res_qual_all.loc[ep_res_qual_all["quality_category"].notnull(), ["quality_category"]]
    .drop_duplicates()
    .assign(key=1)
)

qual_cat_summary = pd.merge(prov, qual_cat, on="key", how="left")
print(len(qual_cat_summary))

# bring the counts on to the base table; combinations with no records get NaN
qual_cat_summary = pd.merge(
    qual_cat_summary,
    qual_cat_count,
    on=['pipeline', 'organisation', 'organisation_name', 'quality_category'],
    how="left",
)

# boolean flag per category: True when the provision has NO records in that
# category (count is 0 or missing), False when it has at least one
qual_cat_summary["issue_flag"] = ~(qual_cat_summary["n_issues"] > 0)
print(len(qual_cat_summary))
# qual_cat_summary.head()

In [48]:
# wide version of the quality category summary: one row per provision, one boolean
# column per quality category, with the overall quality level label joined on as the last column
qual_cat_summary_wide = pd.merge(
    qual_cat_summary.pivot(
        index=["pipeline", "organisation", "organisation_name"],
        columns="quality_category",
        values="issue_flag",
    ).reset_index(),
    qual_summary[["pipeline", "organisation", "quality_level_label"]],
    on=["pipeline", "organisation"],
    how="left",
)

def get_dataset_qual_detail(dataset):
    """Return the styled rows of the wide quality detail table for one dataset.

    Only rows whose "pipeline" equals ``dataset`` are kept; colours come from
    make_color_mask_dataset_lpa.
    """
    is_dataset = qual_cat_summary_wide["pipeline"] == dataset
    detail_rows = qual_cat_summary_wide.loc[is_dataset].copy()

    styled_detail = detail_rows.style.apply(make_color_mask_dataset_lpa, axis=None)
    return styled_detail

# table styling
# background colour for each overall quality level label
level_colours = {
    "4. excellent": "background-color: #1a6837",
    "3. good for ODP": "background-color: #87cb67",
    "2. improve": "background-color: #fefebf",
    "1. update": "background-color: #f78c51",
}

# text colour for the boolean quality category flags
flag_colours = {
    True: "color:green",
    False: "color:red",
}


def make_color_mask_dataset_lpa(df):
    """Build the CSS mask used to style the dataset quality detail table.

    Args:
        df: table with three leading identifier columns, then boolean flag
            columns, and "quality_level_label" as the last column.

    Returns:
        DataFrame with the same index and columns as ``df``: label colours in
        "quality_level_label", flag colours in the columns between the first
        three and the last, and an empty string everywhere else (including
        any value that has no colour defined).
    """
    # start from a mask of empty strings (= no styling)
    df_color_map = pd.DataFrame("", index=df.index, columns=df.columns)

    # turn label column into colours; "" fallback keeps NaN out of the mask
    df_color_map["quality_level_label"] = df["quality_level_label"].map(
        lambda label: level_colours.get(label, "")
    )

    # flag columns sit between the three identifier columns and the final label column
    for col in df.columns[3:-1]:
        df_color_map[col] = df[col].map(lambda flag: flag_colours.get(flag, ""))

    return df_color_map


# make widget: dropdown of every dataset (pipeline) in the quality category summary, sorted
dataset_list = qual_cat_summary["pipeline"].drop_duplicates().sort_values().values

dataset_dropdown = widgets.Dropdown(
    description="Select Dataset: ",
    options=dataset_list,
    value="conservation-area",
)


## Chart

In [49]:
# VISUALISE

# qual_summary_subset = qual_summary[qual_summary["pipeline"].isin(subset_bfl)]
# qual_summary_subset = qual_summary[qual_summary["pipeline"].isin(subset_dvl)]

# four colours sampled from the RdYlGn colour map at 1/4, 2/4, 3/4 and 4/4, for use in the chart
cmap = plt.get_cmap('RdYlGn')
colors = [cmap(step / 4) for step in range(1, 5)]

def make_quality_overview_chart(subset):
    """
    Uses the qual summary table to display a stacked horizontal bar chart of
    the number of providers at each quality level, per dataset.

    subset: a key of dataset_subset_dict; only the datasets listed under that
    key are charted, and the key is used in the chart title.

    Returns the result of plt.show().
    """

    qual_summary_subset = qual_summary[qual_summary["pipeline"].isin(dataset_subset_dict[subset])]

    # count providers by dataset & quality level
    qual_chart = qual_summary_subset.groupby(["pipeline", "quality_level", "quality_level_label"], as_index=False).agg(
        n_providers = ("quality_level", "count")
    )

    qual_chart_wide = qual_chart.pivot(columns = "quality_level_label", values = "n_providers", index = "pipeline")

    # always chart every level, in level order: the colours are assigned to columns
    # by position, so a level missing from this subset would otherwise shift the
    # colours of the levels after it
    level_labels = [level_map[level] for level in sorted(level_map)]
    qual_chart_wide = qual_chart_wide.reindex(columns = level_labels, fill_value = 0)

    qual_chart_wide.plot.barh(
        stacked = True,
        color = colors,
        figsize = (9, 6))

    # Add labels and title (title names the subset actually selected)
    plt.xlabel('Count of providers')
    plt.ylabel('Dataset')
    plt.title(f'Quality levels for {subset} datasets')
    plt.legend(title='Quality level')

    return plt.show()


# dropdown listing the dataset subset names (the keys of dataset_subset_dict)
subset_dropdown = widgets.Dropdown(
    description="Select Dataset subset: ",
    options=tuple(dataset_subset_dict),
    # value = dataset_list[0],
)

# widgets.interact(make_quality_overview_chart, subset = subset_dropdown)

## Reports

### Data quality overview chart - by dataset groups

In [None]:
# redraw the quality overview chart whenever a different dataset subset is selected
widgets.interact(make_quality_overview_chart, subset = subset_dropdown)

### ODP LPA overview table by dataset & quality

In [None]:
# display the ODP LPA x dataset table, with each quality level cell coloured by its label
odp_lpa_summary.style.apply(make_color_mask_odp_lpa, axis=None)

### Dataset quality scoring detail table

In [None]:
# show the styled quality detail table for whichever dataset is selected in the dropdown
widgets.interact(get_dataset_qual_detail, dataset = dataset_dropdown)

### Output
Save report files

In [53]:
# save the ODP LPA x dataset table as a dated Excel file, keeping the cell colours
fn = os.path.join(output_dir, f"quality_ODP-dataset-scores-by-LPA_{td}.xlsx")
odp_lpa_summary.style.apply(
    func=make_color_mask_odp_lpa,
    axis=None,
).to_excel(excel_writer=fn, index=False)

In [54]:
# save the wide quality detail table as a dated CSV file.
# index=False: the index is just a row counter left over from reset_index / merge,
# so it is left out of the file (as it already is for the Excel export).
fn = os.path.join(output_dir, f"quality_dataset-quality-detail_{td}.csv")
qual_cat_summary_wide.to_csv(fn, index=False)