# Identifying brownfield land entities to end-date
**Author**:  Greg Slater <br>
**Date**:  13 Jan 2025 <br>
**Dataset Scope**: `brownfield-land` <br>
**Report Type**: Ad-hoc analysis <br>

## Purpose
[Jira ticket](https://mhclgdigital.atlassian.net/browse/DATA-1199)   
[Mural board](https://app.mural.co/t/mhclg2837/m/mhclg2837/1706786112750/e94eeff744863fe859b2d22a3ccb2dac1e442270?wid=0-1736957406208) mapping out problem and approach.   

Analysis to identify brownfield land (BFL) entities which can be given an end-date, on the basis that their references no longer appear on the latest resource for a provision.

In [149]:
import pandas as pd
import os
import urllib
import sqlite3
from datetime import datetime
import numpy as np

# Today's date as YYYY-MM-DD; used to stamp the name of the output CSV.
td = datetime.today().strftime('%Y-%m-%d')

# Show up to 100 rows when a dataframe is displayed.
pd.set_option("display.max_rows", 100)

# Directory the dataset sqlite file is downloaded into (created if missing).
data_dir = "../../data/db_downloads/"
os.makedirs(data_dir, exist_ok=True)
# Directory the results CSV is written to (created if missing).
out_dir = "../../data/deleted_entities/"
os.makedirs(out_dir, exist_ok=True)

## Problem organisations to exclude
# Organisations known to supply multiple endpoints covering multiple areas. They are
# excluded because this process only works for provisions where a single dataset is
# updated over successive endpoints and resources, rather than multiple datasets.

orgs_to_exclude = ["local-authority:BUC"]


In [76]:
def datasette_query(db, sql_string):
    """Run a SQL query against the planning.data.gov.uk datasette and return the result.

    Args:
        db: name of the datasette database to query (e.g. "digital-land").
        sql_string: the SQL statement to run.

    Returns:
        pandas.DataFrame parsed from the CSV response. Column dtypes are
        inferred by pandas from the CSV text.
    """
    # `import urllib` on its own does not guarantee the `parse` submodule is loaded;
    # import it explicitly instead of relying on another library having done so.
    from urllib.parse import urlencode

    # NOTE(review): "_size=max" presumably asks datasette for its largest page size, so
    # very large results may still be truncated - confirm against the datasette docs.
    params = urlencode({
        "sql": sql_string,
        "_size": "max",
    })
    url = f"https://datasette.planning.data.gov.uk/{db}.csv?{params}"
    return pd.read_csv(url)

def get_all_organisations():
    """Fetch the organisations that have no end date.

    Returns:
        DataFrame from the "digital-land" database with columns
        organisation, name and organisation_entity (the organisation's entity).
    """
    current_orgs_sql = """
        select organisation, name, entity as organisation_entity
        from organisation
        where end_date = ""
        """
    organisations = datasette_query("digital-land", current_orgs_sql)
    return organisations

# Base URL the dataset sqlite files are downloaded from.
FILES_URL = 'https://datasette.planning.data.gov.uk/'

def download_dataset(dataset, output_dir_path, overwrite=False):
    """Download a dataset's sqlite file into a local directory.

    Args:
        dataset: dataset name; the file fetched and written is `<dataset>.db`.
        output_dir_path: directory to save into (created if it does not exist).
        overwrite: when falsy (the default) an existing local file is kept and
            nothing is downloaded.

    Returns:
        None.
    """
    # `import urllib` alone does not load the `request` submodule.
    import urllib.request

    dataset_file_name = f'{dataset}.db'
    os.makedirs(output_dir_path, exist_ok=True)
    output_file_path = os.path.join(output_dir_path, dataset_file_name)

    if not overwrite and os.path.exists(output_file_path):
        return

    # Join with "/" explicitly: os.path.join uses the OS path separator and is
    # not meant for assembling URLs.
    final_url = f"{FILES_URL.rstrip('/')}/{dataset_file_name}"
    print(f'downloading data from {final_url}')
    print(f'to: {output_file_path}')
    urllib.request.urlretrieve(final_url, output_file_path)
    print('download complete')

def query_sqlite(db_path, query_string, params=()):
    """Run a query against a local sqlite file and return the rows as a dataframe.

    Args:
        db_path: path to the sqlite database file.
        query_string: SQL to execute.
        params: optional values bound to `?` / `:name` placeholders in
            query_string (defaults to no parameters).

    Returns:
        pandas.DataFrame with one column per result column, named from the cursor.
    """
    from contextlib import closing

    # sqlite3's own connection context manager only commits / rolls back the
    # transaction - it does not close the connection - so wrap it in closing()
    # to release the file handle once the rows have been read.
    with closing(sqlite3.connect(db_path)) as con:
        cursor = con.execute(query_string, params)
        cols = [column[0] for column in cursor.description]
        results_df = pd.DataFrame.from_records(data=cursor.fetchall(), columns=cols)

    return results_df

In [77]:
# function to get historic endpoints and resources for a provision, along with incrementing resource key
def get_historic_resources(dataset, organisation):
    """Get the historic endpoints and resources for one provision, with an ordering key.

    Queries `reporting_historic_endpoints` in the "performance" database for rows
    where pipeline = dataset, organisation = organisation and latest_status is 200
    or blank.

    Args:
        dataset: pipeline / dataset name, e.g. "brownfield-land".
        organisation: organisation code, e.g. "local-authority:BST".
        Both values are interpolated straight into the SQL, so only pass trusted values.

    Returns:
        DataFrame with the endpoint and resource columns plus:
        endpoint_no - dense rank of the endpoint by (endpoint_entry_date, endpoint).
        endpoint_resource_no - zero-padded "<endpoint_no>-<resource_no>" text key,
            where resource_no is the dense rank within the endpoint by
            (resource_start_date, latest_status).
        entity_end_date - resource_end_date, or when that is blank the
            endpoint_entry_date of the next row in endpoint_resource_no order.
    """

    # The key is zero-padded and uses "-" rather than "." on purpose: a key such as
    # "1.10" is read back by pd.read_csv as the float 1.1 (colliding with "1.1" and
    # sorting below "1.9"), and un-padded text also mis-orders "1.10" vs "1.2" in the
    # LEAD window below. The padded text key sorts correctly in both SQL and pandas
    # for up to 9999 endpoints / resources.
    q = f"""

        WITH endpoint_count as (
            SELECT 
                distinct organisation,
                latest_status, 
                endpoint, 
                endpoint_entry_date, 
                endpoint_end_date, 
                resource, 
                resource_start_date, 
                resource_end_date,
                dense_rank() over (order by endpoint_entry_date, endpoint) as endpoint_no

            FROM reporting_historic_endpoints
            WHERE pipeline = '{dataset}'
            AND organisation = '{organisation}'
            AND (latest_status = 200 OR latest_status = "")
            ORDER BY endpoint_entry_date, resource_start_date
        ),

        endpoint_resource_count as (
            SELECT 
                *,
                printf('%04d-%04d',
                    endpoint_no,
                    dense_rank() over (partition by endpoint_no order by resource_start_date, latest_status)
                ) as endpoint_resource_no
            FROM endpoint_count
        )

        SELECT 
            *,
            CASE WHEN resource_end_date = "" then 
                LEAD(endpoint_entry_date) over (order by endpoint_resource_no) 
                ELSE resource_end_date end
                as entity_end_date

        FROM endpoint_resource_count

"""
    return datasette_query("performance", q)

# get_historic_resources("brownfield-land", "local-authority:BST")

In [78]:
def get_resource_references(resources, db_path):
    """Return the distinct reference values recorded against each given resource.

    Args:
        resources: iterable of resource identifiers to look up.
        db_path: path to the dataset sqlite file to query.

    Returns:
        DataFrame with columns resource and value, one row per distinct
        (resource, reference value) pair.
    """
    # quote each resource so the list can be dropped into the IN (...) clause
    quoted_resources = ', '.join(f"'{r}'" for r in resources)

    sql_template = """ 

        SELECT distinct fr.resource, f.value 
        FROM fact_resource fr
        INNER JOIN fact f on fr.fact = f.fact
        WHERE 1=1
            AND f.field = "reference"
            AND fr.resource in ({})
    """

    return query_sqlite(db_path, sql_template.format(quoted_resources))

## Data Import

In [None]:
# download the brownfield-land sqlite db (overwrite=True, so it is re-downloaded on every run)
download_dataset("brownfield-land", data_dir, overwrite=True)
# path to the downloaded brownfield-land db
bfl_path = os.path.join(data_dir, "brownfield-land.db")

In [80]:
# organisations with no end date: code, name and entity number
org_lookup = get_all_organisations()

# lookup from organisation code to organisation entity number
org_dict = {
    org_code: org_entity
    for org_code, org_entity in zip(org_lookup["organisation"], org_lookup["organisation_entity"])
}

In [None]:
# organisations with a brownfield-land provision that has no end date
active_bfl_provision_sql = """
    SELECT organisation
    FROM provision
    WHERE dataset = "brownfield-land"
    AND end_date = ""
    """
prov_bfl_all = datasette_query("digital-land", active_bfl_provision_sql)

# remove organisations to exclude
is_excluded = prov_bfl_all["organisation"].isin(orgs_to_exclude)
prov_bfl = prov_bfl_all[~is_excluded].copy()

# number of provisions before and after the exclusion
print(len(prov_bfl_all))
print(len(prov_bfl))


## Analysis steps

Expected steps:
* For each organisation, get table of: organisation, resource, start-date, reference
* Take the latest valid resource for each organisation (i.e. one with a 200 status and more than 0 entities)
* For any references which only appear on resources other than the latest, use the end-date of the last resource they appeared on as the end-date for entities with that reference


NOTE - we should only run this for organisations supplying authoritative data, as otherwise it will get confusing factoring in orgs like GLA who can supply data for multiple organisations. This method of working out latest entities relies on a model of one organisation supplying successive data about the same things through one or multiple endpoints.

NOTE - some organisations are using end-date (e.g. Bristol), we should make sure to only apply this calculated end date to entities which don't already have one from the org.

Questions:
* Is taking resource with latest start date robust enough to identify latest snapshot of data? What if an endpoint for 2022 data has a mistake corrected which generates a newer resource than an endpoint with 2024 data has? Should we use the latest resource from latest endpoint?

Suggest here we actually begin giving old endpoints for orgs an end date, it will make the logic of working out what end-date to give old entities SO much easier.

* What about orgs like Bucks who have multiple active endpoints to supply data for multiple regions? Can't reliably identify latest data between this model and single dataset model. May have to manually exclude orgs like Bucks, or maybe restrict method to only orgs with single active endpoint?

* how to deal with entities with data from old resources that don't appear in reporting or collection tables? e.g. Bristol entity [1725409](https://www.planning.data.gov.uk/entity/1725409) has reference facts from resource `01307bca0ecc5c950ef9b35c02ee7b3378cd9b7d29078229e4984a88d4f04ed1`. This appears in the `resource.csv` with an end-date, but isn't in `old-resource.csv`, or in the BFL `dataset_resource` or `fact_resource` tables. So not easy to work out an end-date for it from just the sqlite file. Will probably need to separately check for any more of these and then they can just be retired.


Risks:
* Using endpoint_entry_date as a proxy for the data lineage will be wrong in some cases, e.g. [Brighton and Hove](https://datasette.planning.data.gov.uk/performance?sql=SELECT+*%2C%0D%0A++++CAST%28endpoint_no+as+string%29+%7C%7C+%22.%22+%7C%7C+dense_rank%28%29+over+%28partition+by+endpoint_no+order+by+resource_start_date%29+as+endpoint_resource_no%0D%0A%0D%0AFROM+%28%0D%0ASELECT+distinct+organisation%2C+organisation_name%2C+dataset%2C+latest_status%2C+endpoint_url%2C+endpoint%2C+endpoint_entry_date%2C+endpoint_end_date%2C+resource%2C+resource_start_date%2C+resource_end_date%2C%0D%0A+++dense_rank%28%29+over+%28order+by+endpoint_entry_date%2C+endpoint%29+as+endpoint_no%0D%0AFROM+reporting_historic_endpoints%0D%0AWHERE+pipeline+%3D+%22brownfield-land%22%0D%0AAND+organisation+%3D+%22local-authority%3ABNH%22%0D%0AAND+latest_status+%3D+200%0D%0AORDER+BY+endpoint_entry_date%2C+resource_start_date%0D%0A++%29) got 2017 and 2018 data added on the same day in 2019. So references which appeared in one and not the other may get an unusual end-date.

### Test

In [None]:
# proof of concept: test the steps with a single organisation - Bristol
bs_test = get_historic_resources("brownfield-land", "local-authority:BST")

# preview Bristol's historic resources and their endpoint_resource_no ordering key
bs_test.head()


In [None]:
# get all reference values for each historic resource
# (bfl_path is the brownfield-land db downloaded earlier, so the path is defined in one place)
res_refs = get_resource_references(bs_test["resource"].drop_duplicates(), bfl_path)

res_refs.head()

In [None]:
# attach every reference value to the Bristol resource(s) it appeared on
org_res_refs = pd.merge(bs_test, res_refs, how="left", on="resource")

# for each reference value, the highest endpoint_resource_no it appeared under,
# i.e. the most recent resource it was seen on
ref_latest_res = (
    org_res_refs
    .groupby(["value"], as_index=False)
    .agg(endpoint_resource_no=("endpoint_resource_no", "max"))
)

# the endpoint_resource_no of the latest Bristol resource overall
max_ep_res = org_res_refs["endpoint_resource_no"].max()

# a reference is "old" when the last resource it appeared on is not the latest resource
last_seen_before_latest = ref_latest_res["endpoint_resource_no"] != max_ep_res
old_refs = ref_latest_res[last_seen_before_latest]

# pick up the entity_end_date of the resource each old reference last appeared on -
# this becomes the end-date for the entity
old_refs_dated = (
    org_res_refs
    .merge(old_refs, how="inner", on=["value", "endpoint_resource_no"])
    [["organisation", "value", "entity_end_date"]]
    .drop_duplicates()
    .rename(columns={"value": "reference"})
)

old_refs_dated

In [None]:
# all Bristol entities in the downloaded brownfield-land db; the organisation entity is
# looked up from org_dict rather than hard-coded, and the db path comes from bfl_path
bst_entity = query_sqlite(
    bfl_path,
    f"""
        SELECT * 
        FROM entity
        WHERE organisation_entity = {org_dict["local-authority:BST"]}
    """,
)

# find entities whose reference last appeared on a resource which is not the latest one,
# along with the end-date calculated for that reference
old_ents = bst_entity[["entity", "entry_date", "reference"]].merge(
    old_refs_dated,
    how="inner",
    on="reference",
)

print(f"No of bristol entities: {len(bst_entity)}")
print(f"No of old bristol entities: {len(old_ents)}")
old_ents.head()

In [None]:
# mystery entities:
# find entities that have a reference which doesn't appear in any valid resources at all
mystery_ents = bst_entity[~bst_entity["reference"].isin(res_refs["value"].drop_duplicates())]
print(f"No of bristol mystery entities: {len(mystery_ents)}")
mystery_ents.head()

In [None]:
# Bristol entities which do not yet have an end-date
# (db path taken from bfl_path so it is defined in one place)
query_sqlite(
    bfl_path,
    f"""
        SELECT * 
        FROM entity
        WHERE organisation_entity = {org_dict["local-authority:BST"]}
        and end_date = ""
    """,
)

### Scale: Apply to all BFL provisions

In [None]:

# one dataframe of flagged live entities per organisation
all_out = []

for org in prov_bfl["organisation"]:

    # get all valid resources
    res_hist = get_historic_resources("brownfield-land", org)

    print("---------------------------------------")
    print(org)
    print(f"Number of endpoint & resource records found: {len(res_hist)}")

    # only organisations with more than one endpoint/resource record are processed
    if len(res_hist) <= 1:
        continue

    # org_lookup only holds organisations without an end date, so an organisation with an
    # active provision can be missing from org_dict; report and skip it rather than
    # aborting the whole run with a KeyError part-way through
    org_entity = org_dict.get(org)
    if org_entity is None:
        print(f"No organisation entity found for {org} - skipping")
        continue

    # get live entities (no end-date) for the organisation
    org_entities = query_sqlite(
        bfl_path,
        f"""
            SELECT * 
            FROM entity
            WHERE organisation_entity = {org_entity}
            and end_date = ""
        """,
    )

    print(f"Number of live entities: {len(org_entities)}")

    # get all reference values for resources
    res_refs = get_resource_references(res_hist["resource"].drop_duplicates(), bfl_path)

    # join resources to list of reference values per resource
    org_res_refs = res_hist.merge(res_refs, how="left", on="resource")

    # get the endpoint-resource count code for the most recent resource each reference has appeared in
    ref_latest_res = org_res_refs.groupby(["value"], as_index=False).agg(
        endpoint_resource_no=("endpoint_resource_no", "max")
    )

    print(len(ref_latest_res))

    # join back to the full resource table using the endpoint_resource_no to get the
    # end-date of the most recent resource each reference appeared on
    ref_latest_res_dated = ref_latest_res.merge(
        org_res_refs[["resource", "endpoint_resource_no", "value", "entity_end_date"]],
        how="left",
        on=["value", "endpoint_resource_no"],
    )

    # join from entity table to our reference and resource calcs
    entity_ref_latest_res_dated = org_entities[["entity", "organisation_entity", "reference", "end_date"]].merge(
        ref_latest_res_dated,
        how="left",
        left_on="reference",
        right_on="value",
    ).drop(columns="value")

    # add flags for old and mystery entities
    # NOTE(review): an entity with no matching resource (null endpoint_resource_no) is
    # flagged as mystery AND old, because null != max_ep_res is True - confirm intended
    max_ep_res = org_res_refs["endpoint_resource_no"].max()
    latest_seen = entity_ref_latest_res_dated["endpoint_resource_no"]
    entity_ref_latest_res_dated["old_entity"] = latest_seen != max_ep_res
    entity_ref_latest_res_dated["mystery_entity"] = latest_seen.isnull()

    all_out.append(entity_ref_latest_res_dated)




In [None]:
# stack the per-organisation results into one table
all_out_df = pd.concat(all_out)
print(len(all_out_df))

# add the organisation code by joining on the (integer) organisation entity
all_out_df["organisation_entity"] = all_out_df["organisation_entity"].astype(int)
org_codes = org_lookup[["organisation_entity", "organisation"]]
all_out_df = all_out_df.merge(org_codes, on="organisation_entity")

# the table should have exactly one row per entity
unique_ent_test = not all_out_df["entity"].duplicated().any()
print(f"Is the output table unique by entity: {unique_ent_test}")

# the old_entity and mystery_entity flags are already set per organisation in the loop

all_out_df = all_out_df.sort_values(
    ["organisation", "endpoint_resource_no"],
    ascending=[True, False],
)

out_path = os.path.join(out_dir, f"test - old_entity_results_all_{td}.csv")
all_out_df.to_csv(out_path, index=False)

# row count after the join, to compare with the count printed before it
print(len(all_out_df))
all_out_df.head()

In [None]:
# number of entities flagged as old, then number flagged as mystery
print(len(all_out_df[all_out_df["old_entity"]]))
print(len(all_out_df[all_out_df["mystery_entity"]]))

In [None]:
# check org_lookup is unique by organisation_entity (the two counts should match),
# otherwise the join onto all_out_df would duplicate rows
print(len(org_lookup))
print(len(org_lookup["organisation_entity"].drop_duplicates()))

In [None]:
# local-authority:BUC is in orgs_to_exclude, so it is not looped over when building the output
all_out_df[all_out_df["organisation"] == "local-authority:BUC"]

### Check entity ranges

This is to make sure that all of the entities we're adding end-dates for have an entry in `entity-organisation.csv`; this ensures that the organisation won't be updated to MHCLG when the newer facts come through from our github endpoints.

In [None]:
# entity ranges per organisation from the brownfield-land pipeline config (entity-organisation.csv)
ent_org = pd.read_csv(f"https://raw.githubusercontent.com/digital-land/config/refs/heads/main/pipeline/brownfield-land/entity-organisation.csv")


In [None]:
# entities to check against the entity-organisation ranges
e_range = all_out_df["entity"]

print(f"checking ranges for {len(e_range)} entities")

# check how many ranges in range table each entity has
# (a range covers an entity when entity-minimum <= entity <= entity-maximum)
range_checks = [len(ent_org[(ent_org["entity-minimum"] <= e) & (ent_org["entity-maximum"] >= e)]) for e in e_range]

# df for results
check_df = pd.DataFrame(
    {
        "entity" : e_range,
        "n_ranges" : range_checks
    }
)

# entities which are not covered by any range at all
entities_no_range = check_df[check_df["n_ranges"] == 0]
entities_no_range

In [None]:
# distribution of how many ranges each entity falls within
check_df["n_ranges"].value_counts()

In [None]:
# rows whose organisation is not in `ent_orgs`
# NOTE(review): `ent_orgs` is not defined anywhere in the visible notebook (only `ent_org`
# is), so this cell raises NameError on a fresh run. Presumably it should hold the
# organisations listed in entity-organisation.csv - confirm and define it before use.
all_out_df[~all_out_df["organisation"].isin(ent_orgs)]