# Title
**Author**:  Greg Slater <br>
**Date**:  24th September 2024 <br>
**Dataset Scope**: `dataset` <br>
**Report Type**: Ad-hoc analysis <br>

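## Purpose

For each organisation's provision of a dataset, compare the number of entities with the unique `reference` values found in that organisation's active resources, and list the entities whose reference does not appear in any active resource.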

In [86]:
import os
import urllib
import urllib.parse
import urllib.request

import pandas as pd
import spatialite

# show up to 100 rows when a DataFrame is displayed
pd.options.display.max_rows = 100

# folder for downloaded database files, created here if it does not exist yet
data_dir = "../../data/db_downloads/"
os.makedirs(data_dir, exist_ok=True)


In [82]:
def datasette_query(db, sql_string):
    """Run a SQL query against a Datasette database and return the result.

    The query is sent to ``https://datasette.planning.data.gov.uk/{db}.csv``
    as the ``sql`` query parameter, together with ``_size=max``, and the CSV
    response is read into a DataFrame.

    Args:
        db: database name used in the URL path.
        sql_string: SQL text to execute.

    Returns:
        pandas.DataFrame parsed from the CSV response.
    """
    query_string = urllib.parse.urlencode({"sql": sql_string, "_size": "max"})
    return pd.read_csv(f"https://datasette.planning.data.gov.uk/{db}.csv?{query_string}")

def get_all_organisations():
    """Fetch the organisation lookup from the ``digital-land`` database.

    Returns:
        Whatever ``datasette_query`` returns for the query: the columns
        ``organisation``, ``name`` and ``organisation_entity`` (the
        ``entity`` column renamed) from the ``organisation`` table.
    """
    organisation_sql = """
        select organisation, name, entity as organisation_entity
        from organisation
        """
    organisations = datasette_query("digital-land", organisation_sql)
    return organisations

FILES_URL = 'https://datasette.planning.data.gov.uk/'

def download_dataset(dataset, output_dir_path, overwrite=False):
    """Download ``{dataset}.db`` from ``FILES_URL`` into ``output_dir_path``.

    Args:
        dataset: dataset name; the file fetched and written is ``{dataset}.db``.
        output_dir_path: destination directory, created if it does not exist.
        overwrite: when falsy (default) an existing destination file is kept
            and nothing is downloaded; when truthy the file is re-downloaded.

    Returns:
        None.
    """
    dataset_file_name = f'{dataset}.db'

    os.makedirs(output_dir_path, exist_ok=True)
    output_file_path = os.path.join(output_dir_path, dataset_file_name)

    if not overwrite and os.path.exists(output_file_path):
        return

    # build the URL with "/" explicitly: os.path.join would use the OS path
    # separator, which is not "/" on every platform
    final_url = f"{FILES_URL.rstrip('/')}/{dataset_file_name}"
    print(f'downloading data from {final_url}')
    print(f'to: {output_file_path}')

    # download to a temporary name and move it into place only once complete,
    # so an interrupted download is never mistaken for a finished file by the
    # "already exists" check above
    partial_file_path = output_file_path + '.part'
    try:
        urllib.request.urlretrieve(final_url, partial_file_path)
        os.replace(partial_file_path, output_file_path)
    finally:
        if os.path.exists(partial_file_path):
            os.remove(partial_file_path)
    print('download complete')

def query_sqlite(db_path, query_string):
    """Run a query against a local SQLite file and return the rows as a DataFrame.

    Args:
        db_path: path passed to ``spatialite.connect``.
        query_string: SQL text to execute.

    Returns:
        pandas.DataFrame with one column per entry in the cursor description
        and one row per fetched record.
    """
    con = spatialite.connect(db_path)
    try:
        with con:
            cursor = con.execute(query_string)
            cols = [column[0] for column in cursor.description]
            results_df = pd.DataFrame.from_records(data=cursor.fetchall(), columns=cols)
    finally:
        # close explicitly so cleanup does not depend on the connection's
        # context manager; the stdlib sqlite3 one only ends the transaction
        con.close()

    return results_df

In [127]:
def count_entity_vs_resource(dataset, organisation_entity, resources):
    """Count an organisation's entities against reference values in its resources.

    Args:
        dataset: Datasette database to query.
        organisation_entity: value matched against ``entity.organisation_entity``.
        resources: iterable of resource identifiers (the active resources
            for that organisation).

    Returns:
        DataFrame from ``datasette_query`` with the columns ``pipeline``,
        ``count_entities``, ``count_entity_unique_refs`` and
        ``count_active_res_unique_refs``. If the query fails, the error is
        printed and an empty DataFrame is returned so a loop over many
        provisions can continue.
    """
    # quote each resource as a SQL string literal, doubling embedded single quotes
    resource_list = ', '.join("'{}'".format(str(r).replace("'", "''")) for r in resources)

    # 'reference' is single-quoted: SQLite treats double-quoted text as an
    # identifier first and only falls back to a string literal as a quirk
    q = """ 

    with latest_res as (
        select distinct f.value 
        from fact_resource fr
        inner join fact f on fr.fact = f.fact
        where 1=1
            and f.field = 'reference'
            and resource in ({})
    )

    select 
        e.dataset as pipeline, 
        count(*) as count_entities, 
        count(distinct e.reference) as count_entity_unique_refs, 
        count(distinct lr.value) as count_active_res_unique_refs
    from entity e
    full outer join latest_res lr on e.reference = lr.value
    where e.organisation_entity = {}
    """.format(resource_list, organisation_entity)

    try:
        df = datasette_query(dataset, q)
    except Exception as exc:
        # best-effort: report the failure instead of hiding it, and let
        # KeyboardInterrupt / SystemExit propagate
        print(
            f"count_entity_vs_resource failed for dataset={dataset!r}, "
            f"organisation_entity={organisation_entity!r}: {exc!r}"
        )
        df = pd.DataFrame()

    return df


In [115]:
def get_deleted_entities(dataset, organisation_entity, resources):
    """List an organisation's entities whose reference is not in its resources.

    Args:
        dataset: Datasette database to query.
        organisation_entity: value matched against ``entity.organisation_entity``.
        resources: iterable of resource identifiers (the active resources
            for that organisation).

    Returns:
        DataFrame from ``datasette_query`` with the columns ``pipeline``,
        ``entity``, ``organisation_entity`` and ``reference``: the entities
        with no matching ``reference`` value among the facts of the given
        resources.
    """
    # quote each resource as a SQL string literal, doubling embedded single quotes
    resource_list = ', '.join("'{}'".format(str(r).replace("'", "''")) for r in resources)

    # 'reference' is single-quoted: SQLite treats double-quoted text as an
    # identifier first and only falls back to a string literal as a quirk
    q = """ 

    with latest_res as (
        select distinct f.value 
        from fact_resource fr
        inner join fact f on fr.fact = f.fact
        where 1=1
            and f.field = 'reference'
            and resource in ({})
    )

    select e.dataset as pipeline, e.entity, e.organisation_entity, e.reference
    from entity e
    full outer join latest_res lr on e.reference = lr.value
    where e.organisation_entity = {}
    and lr.value is null
    """.format(resource_list, organisation_entity)

    return datasette_query(dataset, q)


## Data Import

In [83]:
# make sure a local copy of the performance db exists (an existing file is reused)
perf_path = os.path.join(data_dir, "performance.db")
download_dataset("performance", data_dir, overwrite=False)

In [120]:
# per-resource count of "unknown entity" issues, read from the local performance db
q = """
    SELECT distinct resource, count_issues as count_unknown_entities
    FROM endpoint_dataset_issue_type_summary
    WHERE issue_type = "unknown entity"
"""
uk_ents = query_sqlite(perf_path, q)

In [None]:
# organisation lookup, used below to attach organisation_entity by name
org_lookup = get_all_organisations()

dataset = "article-4-direction-area"

# endpoints for the dataset with latest_status 200 and an empty resource_end_date
q = f"""
    SELECT * 
    FROM reporting_historic_endpoints
    WHERE pipeline = '{dataset}'
    AND latest_status = 200
    AND resource_end_date = ""
"""
historic_ep_df = query_sqlite(perf_path, q)

# row count before the joins
print(len(historic_ep_df))

# attach organisation_entity (by name) and unknown entity counts (by resource)
historic_ep_df = (
    historic_ep_df
    .merge(org_lookup[["name", "organisation_entity"]], how="left", on="name")
    .merge(uk_ents, how="left", on="resource")
)

# row count after the joins, for comparison with the count printed above
print(len(historic_ep_df))
historic_ep_df.head()

## Analysis

In [None]:
# group to provision level - get count and list of active resources for each, plus unknown entities
grouped = historic_ep_df.groupby(
    ["pipeline", "organisation", "name", "organisation_entity"],
    as_index=False ).agg(
        resources = ("resource", list),
        n_active_resources = ("resource", "count"),
        count_unknown_entities = ("count_unknown_entities", "sum")
        )

grouped

In [128]:
# run comparison query for each provision and store results
count_comp = []

for i, r in grouped.iterrows():
    
    results = count_entity_vs_resource(r["pipeline"] ,r["organisation_entity"], r["resources"])
    results["organisation_entity"] = r["organisation_entity"]

    count_comp.append(results)

count_comp_all = pd.concat(count_comp)

In [None]:
# attach the query counts to each provision
comp_results = grouped.merge(count_comp_all, how="left", on=["pipeline", "organisation_entity"])

# keep provisions where the entity count differs from either unique-reference
# count (entity references or active-resource references)
mis_matches = comp_results[
    comp_results["count_entities"].ne(comp_results["count_entity_unique_refs"])
    | comp_results["count_entities"].ne(comp_results["count_active_res_unique_refs"])
]

mis_matches

In [None]:
# for each provision, fetch the entities whose reference is absent from its
# active resources, and stack the results
deleted_ents = []

for i, r in grouped.iterrows():

    # the per-provision resource list lives in the "resources" column of
    # `grouped`; there is no "resource" column, so that key raised KeyError
    results = get_deleted_entities(r["pipeline"], r["organisation_entity"], r["resources"])
    results["name"] = r["name"]

    deleted_ents.append(results)

deleted_ents_all = pd.concat(deleted_ents)
deleted_ents_all

In [None]:
# NOTE: this cell reassigns `org_lookup` and `historic_ep_df`, replacing the
# values built in the Data Import section
org_lookup = get_all_organisations()

# endpoints for three organisations, queried from the hosted performance db
q = """
    SELECT * 
    FROM reporting_historic_endpoints
    WHERE pipeline = "article-4-direction-area"
    AND organisation in ("local-authority-eng:BUC", "local-authority-eng:BIR", "local-authority-eng:HOR")
    AND latest_status = 200
    and resource_end_date = ""
"""

# attach organisation_entity by organisation name
historic_ep_df = datasette_query("performance", q).merge(
    org_lookup[["name", "organisation_entity"]], how="left", on="name"
)

historic_ep_df.head()