# Provision data quality report
**Author**:  Greg Slater <br>
**Date Created**:  November 2024 <br>
**Dataset Scope**: ODP datasets <br>
**Report Type**: Ad-hoc <br>

**Purpose**: The purpose of this report is to measure the quality of the data that makes up each data provision on the platform, by applying a data quality framework that sets out criteria that must be met in order to reach one of 4 different quality levels. These levels have been created for a proof-of-concept, and are based around our current best understanding of user needs for the data.


Future improvements:
* Error handling. Queries not working may break bits of the report. Not very high priority while report is more of a POC.
* Base tables. Expand summaries to the full ODP provision, including provisions with no data at all. This could be done by switching the `qual_cat_summary` table to be constructed from a base of the provision table, rather than `qual_all` (which only includes provisions with quality issues).
* Adding more quality checks. This depends on more checks going live in the issues or expectations tables, but once they are, it should be easy to add extra criteria checks through the `qual_` table structure.
* Include data from old endpoints. This will need re-working of the base table query (from `fi.get_endpoint_res_issues()`) to include old endpoints and resources. Though this will add complexity to work out which are the "latest" endpoints and resources to include, especially for provisions with multiple endpoints. May be low priority.


### Data quality framework  
The table below visualises the framework that is used to assign a quality level to each data provision. 

The criteria marked as "true" at each level must be met by a data provision in order for it to be scored at that level. The levels are cumulative, so all criteria must be met in order for a provision to be scored as *data that is trustworthy*. Where we have data from alternative providers (e.g. Historic England conservation-area data) the first criterion cannot be met, so the provision is scored at the first quality level, *some data*.

![quality framework table](quality-framwork-table_all-datasets.png)

In [10]:
import os
import pandas as pd
import geopandas as gpd
import numpy as np
import matplotlib.pyplot as plt
import ipywidgets as widgets
from datetime import datetime
import folium
import matplotlib

try:
  import mapclassify

except:
  print("installing mapclassify")
  !pip install mapclassify
  import mapclassify

td = datetime.today().strftime('%Y-%m-%d')

In [None]:
def save_util_file(file_name):
    """
    Make sure a helper module is available locally, downloading it from the
    digital-land/jupyter-analysis GitHub repo when it is not already on disk.

    Args:
        file_name: name of the file as stored under
            reports/measure_data_quality in the repo; also the local path
            the file is saved to.

    Raises:
        urllib.error.URLError: if the download fails (includes HTTPError for
            e.g. a 404), so a failed download is never reported as a success.
    """
    # imported locally so this cell does not rely on the notebook's import cell
    from urllib.request import urlretrieve

    if os.path.isfile(file_name):
        print("file available locally")
        return

    url = f"https://raw.githubusercontent.com/digital-land/jupyter-analysis/refs/heads/main/reports/measure_data_quality/{file_name}"
    # urlretrieve raises on failure, unlike a shell call to wget whose exit status was ignored
    urlretrieve(url, file_name)
    print(f"downloaded {file_name} from github")

# Fetch each helper module that the imports below rely on
for f in ("functions_core.py", "functions_import.py", "functions_transform.py"):
    save_util_file(f)

import functions_core as fc
import functions_import as fi
import functions_transform as ft

In [12]:
# Folder the downloaded databases are saved to
db_dir = "../../data/db_downloads/"
# Folder the report output files are written to
output_dir = "../../data/quality_report/"

# Create both folders up front; exist_ok makes re-running the cell harmless
os.makedirs(db_dir, exist_ok=True)
os.makedirs(output_dir, exist_ok=True)

## 1. Import

In [None]:
# Performance db: always download a fresh copy (overwrite=True) into db_dir
fc.download_dataset("performance", db_dir, overwrite=True)
path_perf_db = os.path.join(db_dir, "performance.db")

# Lookup of issue quality criteria, joined to the base table in the transform step
lookup_issue_qual = fi.get_issue_quality_lookup()

# ODP provision lookup, with its "dataset" column renamed to "pipeline"
lookup_provision_odp = fi.get_odp_provision_lookup().rename(
    columns={"dataset": "pipeline"}
)

# Named groups of pipelines offered in the subset dropdown for the chart and provider table
dataset_subset_dict = {
    "BFL": ["brownfield-land"],
    "Developers": [
        "developer-agreement",
        "developer-agreement-contribution",
        "developer-agreement-transaction",
    ],
}

# Base table, read from the performance db, that the quality tables are built from
ep_res_issues = fi.get_endpoint_res_issues(path_perf_db)


## 2. Transform

In [14]:
# ISSUES TABLE - records of provisions that have data quality issues,
# built from the base table and the issue quality criteria lookup
qual_issues = ft.make_issues_input_table(ep_res_issues, lookup_issue_qual)

# FRESHNESS TABLE - records of provisions not updated within the last year (age_days=365).
# Original author's notes: not included in the quality framework for now; old resources
# are flagged as quality level 5 (confirm against functions_transform).
qual_fresh = ft.make_freshness_input_table(ep_res_issues, age_days=365)

# ALL QUALITY CATEGORIES TABLE - one long table stacking the DQ issue records
# on top of the freshness records
qual_all = pd.concat([qual_issues, qual_fresh])

In [None]:
# Display label for each numeric quality level, highest level first
level_map = {
    4: "4. trustworthy",
    3: "3. good",
    2: "2. improve",
    1: "1. update",
}

# Summary table of quality scores, built from the long quality table and the level labels
qual_summary = ft.make_score_summary_table(qual_all, level_map)

# quick size check on the summary table
print(len(qual_summary))

## 3. Summarise

### ODP LPA x Dataset quality table

In [16]:
def make_provider_summary_table(subset):
    """
    Build a styled provider x dataset table of quality level labels.

    Filters the global qual_summary table to the pipelines listed for `subset`
    in dataset_subset_dict, pivots it to one row per organisation and one
    column per pipeline, and colours the cells with make_color_mask_odp_lpa.

    Args:
        subset: key of dataset_subset_dict naming the group of pipelines to show.

    Returns:
        A pandas Styler wrapping the wide table, sorted by organisation_name,
        with missing organisation/pipeline combinations shown as "0. no data".

    Raises:
        KeyError: if `subset` is not a key of dataset_subset_dict.
    """

    subset_pipelines = dataset_subset_dict[subset]
    qual_summary_subset = qual_summary[qual_summary["pipeline"].isin(subset_pipelines)]

    qual_summary_wide = (
        qual_summary_subset.pivot(
            columns = "pipeline",
            values = "quality_level_label",
            index = ["organisation", "organisation_name"]
        )
        # drop the "pipeline" name from the column axis so it is not shown as a header
        .rename_axis(None, axis = 1)
        .reset_index()
        .sort_values(["organisation_name"])
        # organisations with no row for a pipeline come out of the pivot as NaN
        .fillna("0. no data")
    )

    return qual_summary_wide.style.apply(make_color_mask_odp_lpa, axis=None)

# make_provider_summary_table("Developers")

In [17]:
# CSS background colour for each quality level label
level_background_colours = {
    "4. trustworthy" : "background-color: #1a6837",
    "3. good" : "background-color: #87cb67",
    "2. improve" : "background-color: #fefebf",
    "1. update" : "background-color: #f78c51"
    }

ready_flag_colours = {
        "yes" : "color:green"
    }

def make_color_mask_odp_lpa(df):
    """
    Build the CSS mask used to style the provider x dataset quality table.

    Args:
        df: wide table whose first two columns identify the provider and whose
            remaining columns each hold quality level labels.

    Returns:
        DataFrame with the same index and columns as `df`, holding a
        background-colour CSS string for every label found in
        level_background_colours and an empty string everywhere else.
    """
    # DataFrame with same index and column names as the original, filled with empty strings
    df_color_map = pd.DataFrame("", index=df.index, columns=df.columns)

    # the first two columns are left unstyled
    flag_slice = df.columns[2:]
    for s in flag_slice:
        # labels without a colour (e.g. "0. no data") map to NaN; keep the mask as
        # pure CSS strings by turning those into "no style"
        df_color_map[s] = df[s].map(level_background_colours).fillna("")

    return df_color_map

# make_color_mask_odp_lpa(odp_lpa_summary)
# odp_lpa_summary_wide.style.apply(make_color_mask_odp_lpa, axis=None)

### Dataset x quality categories table

In [None]:
# count issues by the quality category 
# Number of quality records per provision and quality criteria
qual_cat_count = (
    qual_all
    .groupby(
        by=["pipeline", "organisation", "organisation_name", "quality_criteria"],
        as_index=False,
    )
    .agg(n_issues=("quality_level", "count"))
)

In [None]:
# Base table holding every quality criteria for every provision, so that the later
# pivot has a column for each criteria on every row.

# distinct provisions, tagged with a constant join key
prov = (
    qual_all[["pipeline", "organisation", "organisation_name"]]
    .drop_duplicates()
    .assign(key=1)
)

# distinct non-null quality criteria, tagged with the same constant join key
qual_cat = (
    qual_all.loc[qual_all["quality_criteria"].notnull(), ["quality_criteria"]]
    .drop_duplicates()
    .assign(key=1)
)

# joining on the constant key pairs every provision with every criteria
qual_cat_summary = prov.merge(qual_cat, on="key", how="left")
print(len(qual_cat_summary))

# attach the counts; provision/criteria pairs with no records get a missing n_issues
qual_cat_summary = qual_cat_summary.merge(
    qual_cat_count,
    on=["pipeline", "organisation", "organisation_name", "quality_criteria"],
    how="left",
)

# boolean flag per provision/criteria pair: False where at least one record was
# counted, True otherwise (including where n_issues is missing)
qual_cat_summary["issue_flag"] = ~(qual_cat_summary["n_issues"] > 0)
print(len(qual_cat_summary))

In [20]:
# Wide quality detail table: one row per provision, one boolean column per quality
# criteria, plus the provision's overall quality level label from qual_summary
qual_cat_summary_wide = (
    qual_cat_summary
    .pivot(
        index=["pipeline", "organisation", "organisation_name"],
        columns="quality_criteria",
        values="issue_flag",
    )
    .reset_index()
    .merge(
        qual_summary[["pipeline", "organisation", "quality_level_label"]],
        on=["pipeline", "organisation"],
        how="left",
    )
)

def get_dataset_qual_detail(dataset):
    """
    Return the styled quality detail rows for a single dataset.

    Takes the rows of the wide quality detail table whose pipeline equals
    `dataset` and applies the make_color_mask_dataset_lpa colouring.
    """
    is_selected = qual_cat_summary_wide["pipeline"] == dataset
    selected_rows = qual_cat_summary_wide.loc[is_selected].copy()

    return selected_rows.style.apply(make_color_mask_dataset_lpa, axis=None)


# CSS text colour for the boolean quality criteria flags
flag_colours = {
        True : "color:green",
        False : "color:red"
    }

def make_color_mask_dataset_lpa(df):
    """
    Build the CSS mask used to style the dataset quality detail table.

    Args:
        df: wide detail table whose first three columns identify the provision,
            whose last column is quality_level_label, and whose columns in
            between hold boolean criteria flags.

    Returns:
        DataFrame with the same index and columns as `df`, holding a CSS string
        for every value with a colour defined and an empty string everywhere else.
    """
    # DataFrame with same index and column names as the original, filled with empty strings
    df_color_map = pd.DataFrame("", index=df.index, columns=df.columns)

    # turn label column into colours; labels without a colour (or missing labels)
    # become "no style" rather than NaN so the mask only contains CSS strings
    df_color_map["quality_level_label"] = (
        df["quality_level_label"].map(level_background_colours).fillna("")
    )

    # everything between the three identifying columns and the final label column
    flag_slice = df.columns[3:-1]
    for s in flag_slice:
        df_color_map[s] = df[s].map(flag_colours).fillna("")

    return df_color_map


# make widget

# Dropdown listing each distinct pipeline in the quality summary table
dataset_dropdown = widgets.Dropdown(
    description="Select Dataset: ",
    options=qual_summary["pipeline"].drop_duplicates().values,
)


### Chart

In [21]:
# VISUALISE

# color map to use in chart
cmap = plt.get_cmap('RdYlGn')
# one colour per position 1-4 on the colour map; kept for any other cell that uses it
colors = [cmap(i / 4) for i in np.arange(1, 5)]

def make_quality_overview_chart(subset):
    """
    Uses the qual summary table to display a horizontal stacked bar chart of the
    number of providers at each quality level, with one bar per dataset.

    Args:
        subset: key of dataset_subset_dict naming the group of pipelines to chart.

    Returns:
        The result of plt.show().
    """

    qual_summary_subset = qual_summary[qual_summary["pipeline"].isin(dataset_subset_dict[subset])]

    # count providers by dataset & quality level
    qual_chart = qual_summary_subset.groupby(["pipeline", "quality_level", "quality_level_label"], as_index=False).agg(
        n_providers = ("quality_level", "count")
    )

    qual_chart.sort_values(["pipeline", "quality_level_label"], inplace=True)
    qual_chart_wide = qual_chart.pivot(columns = "quality_level_label", values = "n_providers", index = "pipeline")

    # Colour each column by its own quality level instead of by column position:
    # a subset with no providers at some level has fewer columns, and positional
    # colours would then shift the remaining levels onto the wrong colours.
    # assumes quality_level is numeric on the 1-4 scale used by `colors` - TODO confirm
    level_by_label = qual_chart.drop_duplicates("quality_level_label").set_index(
        "quality_level_label"
    )["quality_level"]
    bar_colors = [cmap(float(level_by_label[label]) / 4) for label in qual_chart_wide.columns]

    qual_chart_wide.plot.barh(
        stacked = True,
        color = bar_colors,
        figsize = (9, 6))

    # Add labels and title
    plt.xlabel('Count of providers')
    plt.ylabel('Dataset')
    plt.title('Quality levels for ODP datasets')
    plt.legend(title='Quality level')

    return plt.show()


# Dropdown listing the named dataset subsets (the keys of dataset_subset_dict)
subset_dropdown = widgets.Dropdown(
    description="Select Dataset subset: ",
    options=list(dataset_subset_dict),
)

# widgets.interact(make_quality_overview_chart, subset = subset_dropdown)

## 4. Present

### Data quality overview chart - by dataset groups

In [None]:
# Interactive chart: make_quality_overview_chart is called with the subset chosen in subset_dropdown
widgets.interact(make_quality_overview_chart, subset = subset_dropdown)

### ODP LPA overview table by dataset & quality

In [None]:
# Interactive table: make_provider_summary_table is called with the subset chosen in subset_dropdown
widgets.interact(make_provider_summary_table, subset = subset_dropdown)

### Dataset quality scoring detail table

In [None]:
# Interactive table: get_dataset_qual_detail is called with the dataset chosen in dataset_dropdown
widgets.interact(get_dataset_qual_detail, dataset = dataset_dropdown)

### Output
Save report files

In [25]:
# One Excel file per dataset subset, with the td date stamp in the file name
fn_bfl = os.path.join(output_dir, f"quality-dataset-scores-by-provider_BFL_{td}.xlsx")
fn_dev = os.path.join(output_dir, f"quality-dataset-scores-by-provider_Developers_{td}.xlsx")

# Write each styled provider summary table to its file, leaving out the index
for subset_name, excel_path in (("BFL", fn_bfl), ("Developers", fn_dev)):
    make_provider_summary_table(subset_name).to_excel(excel_path, index=False)

In [26]:
# Save the full wide quality detail table (all datasets) as a date-stamped CSV
fn = os.path.join(output_dir, f"quality_dataset-quality-detail_{td}.csv")
# index=False: the table only carries a default integer index, which would otherwise be
# written as an unnamed first column; this also matches the Excel outputs of this report
qual_cat_summary_wide.to_csv(fn, index=False)