# Provision data quality report
**Author**:  Greg Slater <br>
**Date**:  November 2024 <br>
**Dataset Scope**: all datasets <br>
**Report Type**: Ad-hoc <br>

**Purpose**: This report measures the quality of the data that makes up each data provision on the platform. It applies a data quality framework that defines the criteria a provision must meet to reach each of 4 quality levels. The levels are based on the quality requirements of the ODP software, which uses platform data.

In [1]:
import urllib
import os
import sqlite3
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import ipywidgets as widgets
from datetime import datetime
import wget

# Run date as YYYY-MM-DD; used as a suffix in the names of the saved report files
td = datetime.today().strftime('%Y-%m-%d')

In [20]:
def save_util_file(file_name):
    """Make sure a helper file is available locally, downloading it if needed.

    If ``file_name`` does not exist as a file, it is fetched with ``wget``
    from the ``gs/qual-report-v2`` branch of the
    digital-land/jupyter-analysis GitHub repository. A one-line status
    message is printed in either case.

    Args:
        file_name: Path checked locally and appended to the branch's raw
            GitHub URL to build the download address.
    """
    # Nothing to do when the file is already on disk
    if os.path.isfile(file_name):
        print("file available locally")
        return

    url = f"https://raw.githubusercontent.com/digital-land/jupyter-analysis/refs/heads/gs/qual-report-v2/{file_name}"
    wget.download(url)
    print(f"downloaded {file_name} from github")

# Fetch any helper module that is not already on disk
for f in ["functions_core.py", "functions_import.py", "functions_transform.py"]:
    save_util_file(f)

# Imported after the loop above, once the helper files have been checked/downloaded
import functions_core as fc
import functions_import as fi
import functions_transform as ft

file available locally
file available locally
file available locally


In [2]:
# Directory holding downloaded databases (performance.db is read from here)
db_dir = "../../data/db_downloads/"
os.makedirs(db_dir, exist_ok=True)

# Directory the report files are written to
output_dir = "../../data/quality_report/"
os.makedirs(output_dir, exist_ok=True)

## 1. Import

In [3]:
# performance db
# NOTE(review): overwrite=False presumably reuses an existing download - confirm in functions_core
fc.download_dataset("performance", db_dir, overwrite=False)
path_perf_db = os.path.join(db_dir, "performance.db")

# Issue quality criteria lookup
lookup_issue_qual = pd.read_csv("https://raw.githubusercontent.com/digital-land/jupyter-analysis/refs/heads/main/service_report/input/issue_type_quality.csv")
# NOTE(review): the URL below was left in the cell as a bare line, which is a syntax
# error in Python; kept here as a comment until its purpose is confirmed.
# https://raw.githubusercontent.com/digital-land/jupyter-analysis/refs/heads/gs/point-dupe-analysis/README.md

# Provision lookups
lookup_provision = fi.get_provision_lookup(path_perf_db)

# Dataset subset dict for chart: subset name -> datasets (pipelines) shown in that chart
dataset_subset_dict = {
    "ODP": ["conservation-area", "conservation-area-document", "article-4-direction-area", "article-4-direction", "listed-building-outline", "tree", "tree-preservation-zone", "tree-preservation-order"],
    "BFL": ["brownfield-land"],
    "Developers": ["developer-agreement", "developer-agreement-contribution", "developer-agreement-transaction"],
}

# Base table
ep_res_issues = fi.get_endpoint_res_issues(path_perf_db)

## 2. Transform

In [6]:
# FRESHNESS TABLE - flagging when provisions haven't been updated in the last year

# Build the table of old resources (age threshold: 365 days).
# NOTE(review): the original comment said these are flagged as quality level 5, but
# level_map only defines levels 1-4 - confirm against ft.make_freshness_input_table.
ep_res_fresh_qual = ft.make_freshness_input_table(ep_res_issues, age_days = 365)


# ISSUES TABLE - flagging when provisions have data quality issues

# join the issue quality lookup onto the base table (and, per the original note, restrict fields)
ep_res_issues_qual = ft.make_issues_input_table(ep_res_issues, lookup_issue_qual)

# print(len(ep_res_issues))
# print(len(ep_res_issues_qual))


# ALL QUALITY CATEGORIES TABLE - all quality category records (freshness & DQ issues) in one long table
# stack the issues table on top of the freshness table
ep_res_qual_all = pd.concat([ep_res_issues_qual, ep_res_fresh_qual])
# ep_res_issues_qual.head()

In [9]:
# Display label for each numeric quality level; passed to the scoring helper below
level_map = {
    4: "4. excellent",
    3: "3. good for ODP",
    2: "2. improve",
    1: "1. update"}

# Summary of quality scores; later cells read its pipeline, organisation,
# quality_level and quality_level_label columns
qual_summary = ft.make_score_summary_table(ep_res_qual_all, level_map)
print(len(qual_summary))

## 3. Summarise

### ODP LPA x Dataset quality table

In [11]:
# ODP provisions only: unique (organisation, pipeline) pairs whose cohort mentions "ODP".
# na=False treats a missing cohort as "not ODP", so the boolean mask never contains NaN
# (pandas raises when asked to index with a mask that has missing values).
lookup_provision_odp = lookup_provision.loc[
    lookup_provision["cohort"].str.contains("ODP", na=False),
    ["organisation", "pipeline"],
].drop_duplicates()

# keep only the ODP provisions, then pivot to one row per organisation
# and one column per dataset holding the quality level label
odp_lpa_summary = qual_summary.merge(
    lookup_provision_odp,
    how="inner",
    on=["organisation", "pipeline"],
).pivot(
    columns="pipeline",
    values="quality_level_label",
    index=["organisation", "organisation_name"],
).reset_index()

# organisation/dataset combinations without a row are NaN after the pivot
odp_lpa_summary.replace(np.nan, "no data", inplace=True)
# odp_lpa_summary

In [12]:
# Background colour for each quality level label
level_colours = {
    "4. excellent": "background-color: #1a6837",
    "3. good for ODP": "background-color: #87cb67",
    "2. improve": "background-color: #fefebf",
    "1. update": "background-color: #f78c51",
}


def make_color_mask_odp_lpa(df):
    """Build the table of cell styles for the ODP LPA summary.

    Written for ``Styler.apply(..., axis=None)``: the result has the same
    index and columns as ``df`` and every cell holds a CSS string.

    Args:
        df: Table whose first two columns are left unstyled and whose
            remaining columns hold quality level labels.

    Returns:
        DataFrame of CSS strings. Cells in the first two columns, and cells
        whose value has no entry in ``level_colours`` (e.g. "no data"),
        are empty strings.
    """
    # Start from "no style" everywhere
    df_color_map = pd.DataFrame("", index=df.index, columns=df.columns)

    for col in df.columns[2:]:
        # map() gives NaN for values missing from level_colours; use "" so the
        # result contains CSS strings only
        df_color_map[col] = df[col].map(level_colours).fillna("")

    return df_color_map

# odp_lpa_summary.style.apply(make_color_mask_odp_lpa, axis=None)

### Dataset x quality categories table

In [13]:
# Number of records per provision (pipeline + organisation) and quality category
qual_cat_count = (
    ep_res_qual_all
    .groupby(
        ["pipeline", "organisation", "organisation_name", "quality_category"],
        as_index=False,
    )
    .agg(n_issues=("quality_level", "count"))
)

In [14]:
# Base table: every provision paired with every quality category.
# Both sides get a constant "key" column so that merging on it pairs all rows with all rows.
prov = (
    ep_res_qual_all[["pipeline", "organisation", "organisation_name"]]
    .drop_duplicates()
    .assign(key=1)
)

qual_cat = (
    ep_res_qual_all.loc[ep_res_qual_all["quality_category"].notnull(), ["quality_category"]]
    .drop_duplicates()
    .assign(key=1)
)

qual_cat_summary = prov.merge(qual_cat, how="left", on="key")
print(len(qual_cat_summary))

# Attach the counts; provision/category pairs with no records get NaN
qual_cat_summary = qual_cat_summary.merge(
    qual_cat_count,
    how="left",
    on=["pipeline", "organisation", "organisation_name", "quality_category"],
)

# issue_flag is True when the provision has NO records in the category, False otherwise
qual_cat_summary["issue_flag"] = ~qual_cat_summary["n_issues"].gt(0)
print(len(qual_cat_summary))
# qual_cat_summary.head()

3460
3460


In [15]:
# Reshape to one row per provision with one column per quality category
# (cell value = issue_flag), then attach the provision's overall quality level label.
# The left merge keeps provisions with no match in qual_summary; their label is NaN.
qual_cat_summary_wide = qual_cat_summary.pivot(
        columns = "quality_category",
        values = "issue_flag",
        index = ["pipeline", "organisation", "organisation_name"]
    ).reset_index(
    ).merge(
        qual_summary[["pipeline", "organisation", "quality_level_label"]],
        how = "left",
        on = ["pipeline", "organisation"]
    )

def get_dataset_qual_detail(dataset):
    """Return the styled quality detail rows for a single dataset.

    Filters the wide quality detail table to rows whose ``pipeline`` equals
    ``dataset`` and applies the dataset/LPA colour mask to the result.
    """
    is_selected = qual_cat_summary_wide["pipeline"] == dataset
    selected_rows = qual_cat_summary_wide[is_selected].copy()

    styler = selected_rows.style
    return styler.apply(make_color_mask_dataset_lpa, axis=None)

# table styling
# Background colour for each overall quality level label
level_colours = {
    "4. excellent": "background-color: #1a6837",
    "3. good for ODP": "background-color: #87cb67",
    "2. improve": "background-color: #fefebf",
    "1. update": "background-color: #f78c51",
}

# Text colour for the per-category boolean flags
flag_colours = {
    True: "color:green",
    False: "color:red",
}


def make_color_mask_dataset_lpa(df):
    """Build the table of cell styles for the dataset quality detail table.

    Written for ``Styler.apply(..., axis=None)``: the result has the same
    index and columns as ``df`` and every cell holds a CSS string.

    Args:
        df: Table with three leading columns that are left unstyled, then
            boolean flag columns, and a ``quality_level_label`` column last.

    Returns:
        DataFrame of CSS strings. Cells with nothing to style, including
        missing labels or flags, are empty strings.
    """
    # Start from "no style" everywhere
    df_color_map = pd.DataFrame("", index=df.index, columns=df.columns)

    # Overall level label -> background colour; a missing label gets no style
    df_color_map["quality_level_label"] = df["quality_level_label"].map(level_colours).fillna("")

    # Everything between the three leading columns and the trailing label column is a flag
    for col in df.columns[3:-1]:
        df_color_map[col] = df[col].map(flag_colours).fillna("")

    return df_color_map


# make widget
# Sorted, de-duplicated dataset (pipeline) names offered in the dropdown
dataset_list = qual_cat_summary["pipeline"].sort_values().drop_duplicates().values

# NOTE(review): the default selection assumes "conservation-area" is present in
# dataset_list - confirm, or fall back to the first option
dataset_dropdown = widgets.Dropdown(
    options = dataset_list,
    value = "conservation-area",
    description = "Select Dataset: ",
)


### Chart

In [16]:
# VISUALISE

# Colour map to use in chart; `colors` holds the colour for levels 1 to 4 in that order
cmap = plt.get_cmap('RdYlGn')
colors = [cmap(i / 4) for i in np.arange(1, 5)]


def make_quality_overview_chart(subset):
    """
    Show a stacked horizontal bar chart of provider counts by dataset and quality level.

    Args:
        subset: Key of ``dataset_subset_dict`` selecting the datasets (pipelines)
            to chart. Also used in the chart title.

    Returns:
        The result of ``plt.show()``.
    """

    qual_summary_subset = qual_summary[qual_summary["pipeline"].isin(dataset_subset_dict[subset])]

    # count providers by dataset & quality level
    qual_chart = qual_summary_subset.groupby(["pipeline", "quality_level", "quality_level_label"], as_index=False).agg(
        n_providers = ("quality_level", "count")
    )

    qual_chart.sort_values(["pipeline", "quality_level_label"], inplace=True)
    qual_chart_wide = qual_chart.pivot(columns = "quality_level_label", values = "n_providers", index = "pipeline")

    # Choose each bar colour from the level behind its label rather than from the
    # column position, so colours stay correct when a subset has no provisions at
    # one of the levels.
    # NOTE(review): assumes quality_level is the numeric level (1-4) - confirm.
    level_by_label = dict(zip(qual_chart["quality_level_label"], qual_chart["quality_level"]))
    bar_colors = [cmap(level_by_label[label] / 4) for label in qual_chart_wide.columns]

    qual_chart_wide.plot.barh(
        stacked = True,
        color = bar_colors,
        figsize = (9, 6))

    # Add labels and title (title names the selected subset)
    plt.xlabel('Count of providers')
    plt.ylabel('Dataset')
    plt.title(f'Quality levels for {subset} datasets')
    plt.legend(title='Quality level')

    return plt.show()


# Dropdown offering the subset names defined in dataset_subset_dict
subset_dropdown = widgets.Dropdown(
    options = dataset_subset_dict.keys(),
    # value = dataset_list[0],
    description = "Select Dataset subset: ",
)

# widgets.interact(make_quality_overview_chart, subset = subset_dropdown)

## 4. Present

### Data quality overview chart - by dataset groups

In [17]:
# Bind the subset dropdown to the chart function
widgets.interact(make_quality_overview_chart, subset = subset_dropdown)

interactive(children=(Dropdown(description='Select Dataset subset: ', options=('ODP', 'BFL', 'Developers'), va…

<function __main__.make_quality_overview_chart(subset)>

### ODP LPA overview table by dataset & quality

In [18]:
# Display the ODP LPA table with each quality level cell coloured by its label
odp_lpa_summary.style.apply(make_color_mask_odp_lpa, axis=None)

pipeline,organisation,organisation_name,article-4-direction,article-4-direction-area,conservation-area,conservation-area-document,listed-building-outline,tree,tree-preservation-order,tree-preservation-zone
0,local-authority:ASF,Ashford Borough Council,no data,no data,no data,no data,4. excellent,no data,4. excellent,no data
1,local-authority:BDG,London Borough of Barking and Dagenham,no data,2. improve,4. excellent,no data,4. excellent,no data,no data,no data
2,local-authority:BIR,Birmingham City Council,4. excellent,4. excellent,4. excellent,4. excellent,2. improve,no data,no data,2. improve
3,local-authority:BNE,London Borough of Barnet,4. excellent,4. excellent,4. excellent,3. good for ODP,4. excellent,4. excellent,3. good for ODP,4. excellent
4,local-authority:BOL,Bolton Metropolitan Borough Council,no data,no data,1. update,no data,no data,no data,no data,no data
5,local-authority:CAS,Castle Point Borough Council,4. excellent,4. excellent,4. excellent,4. excellent,4. excellent,no data,no data,no data
6,local-authority:CAT,Canterbury City Council,no data,3. good for ODP,4. excellent,no data,3. good for ODP,no data,no data,no data
7,local-authority:DOV,Dover District Council,4. excellent,3. good for ODP,4. excellent,no data,4. excellent,4. excellent,3. good for ODP,2. improve
8,local-authority:ECA,East Cambridgeshire District Council,4. excellent,4. excellent,4. excellent,no data,no data,no data,no data,no data
9,local-authority:ENF,London Borough of Enfield,no data,4. excellent,4. excellent,no data,no data,no data,no data,no data


### Dataset quality scoring detail table

In [19]:
# Bind the dataset dropdown to the styled quality detail table
widgets.interact(get_dataset_qual_detail, dataset = dataset_dropdown)

interactive(children=(Dropdown(description='Select Dataset: ', index=15, options=('agricultural-land-classific…

<function __main__.get_dataset_qual_detail(dataset)>

### Output
Save report files

In [108]:
# Save the colour-coded ODP LPA table as a dated Excel file in output_dir (index not written)
fn = os.path.join(output_dir, f"quality_ODP-dataset-scores-by-LPA_{td}.xlsx")
odp_lpa_summary.style.apply(make_color_mask_odp_lpa, axis=None).to_excel(fn, index = False)

In [109]:
# Save the wide quality detail table as a dated CSV file in output_dir.
# index=False: the index is just the row counter left by reset_index()/merge(), so it is
# not written - consistent with the Excel export of the ODP LPA table.
fn = os.path.join(output_dir, f"quality_dataset-quality-detail_{td}.csv")
qual_cat_summary_wide.to_csv(fn, index=False)