# Redirecting and retiring brownfield-land entities of retired organisations
**Author**:  Greg Slater <br>
**Date**:  24th September 2024 <br>
**Dataset Scope**: `brownfield-land` <br>
**Report Type**: Ad-hoc analysis <br>

**Purpose**: Identify how to redirect and retire brownfield-land entities belonging to organisations which have been retired and replaced by new organisations (usually new Unitary Authorities).


In [31]:
import pandas as pd
import geopandas as gpd
import numpy as np
import os
import urllib
# from sqlite_query_functions import DatasetSqlite
from datetime import datetime

# Show up to 100 rows when a DataFrame is displayed.
pd.options.display.max_rows = 100

# Today's date as YYYY-MM-DD (see the commented-out filename templates that use it).
td = f"{datetime.today():%Y-%m-%d}"

# Folder for this analysis' data files; created on first run.
data_dir = "../../data/BFL_old_orgs/"
os.makedirs(data_dir, exist_ok=True)
# fn = os.path.join(data_dir, f"FILENAME_{td}.csv")


## Data Import

In [32]:
# Organisation table from datasette.
# NOTE: get_all_organisations() is defined in a later cell of this notebook,
# so that cell has to be run before this one.
org_df = get_all_organisations()

# Lookup from organisation entity number to the `organisation` value.
org_dict = org_df.set_index("organisation_entity")["organisation"].to_dict()


In [None]:
# Download the brownfield-land dataset, reading every column as a string.
bfl = pd.read_csv("https://files.planning.data.gov.uk/dataset/brownfield-land.csv", dtype=str)
print(len(bfl))

# Column names use hyphens in the file; switch them to underscores.
bfl = bfl.rename(columns=lambda col: col.replace("-", "_"))
# bfl["organisation_entity"] = bfl["organisation_entity"].astype(str)
# bfl.to_csv(f"brownfield-land_{td}.csv", index = False)

In [None]:
# Current old-entity.csv for the brownfield-land pipeline, from the config repository.
old_entity_url = "https://raw.githubusercontent.com/digital-land/config/refs/heads/main/pipeline/brownfield-land/old-entity.csv"
old_ent_df = pd.read_csv(old_entity_url)
print(len(old_ent_df))

In [None]:
# Spot check: the row(s) for entity 1700485 (entity is a string column here).
bfl.loc[bfl["entity"] == "1700485"]

In [36]:
def get_all_organisations():
    """Fetch the organisation table from the planning.data.gov.uk datasette.

    Queries the ``organisation`` table of the ``digital-land`` database and
    reads the CSV response with every column as a string.

    Returns:
        pandas.DataFrame with columns ``organisation_entity``, ``org_name``,
        ``organisation``, ``org_type`` and ``end_date``.
    """
    # Imported explicitly: a bare `import urllib` does not load the `parse`
    # submodule, so `urllib.parse` only resolves if something else imported it.
    from urllib.parse import urlencode

    sql = """
        select entity as organisation_entity, name as org_name, organisation, dataset as org_type, end_date
        from organisation
        """
    params = urlencode({"sql": sql, "_size": "max"})
    url = f"https://datasette.planning.data.gov.uk/digital-land.csv?{params}"
    return pd.read_csv(url, dtype=str)


def datasette_query(db, sql_string, dtype=None):
    """Run a SQL query against a planning.data.gov.uk datasette database.

    Args:
        db: Name of the datasette database; used as the URL path
            (``https://datasette.planning.data.gov.uk/{db}.csv``).
        sql_string: SQL statement to run.
        dtype: Optional dtype passed straight to ``pandas.read_csv``. The
            default (``None``) lets pandas infer column types, as before;
            pass ``str`` to keep identifiers as strings.

    Returns:
        pandas.DataFrame parsed from the CSV response.
    """
    # Imported explicitly: a bare `import urllib` does not load the `parse`
    # submodule, so `urllib.parse` only resolves if something else imported it.
    from urllib.parse import urlencode

    params = urlencode({"sql": sql_string, "_size": "max"})
    url = f"https://datasette.planning.data.gov.uk/{db}.csv?{params}"
    return pd.read_csv(url, dtype=dtype)

In [None]:
def check_matches(org_x, org_y, match_fields):
    """Cross-reference the brownfield-land entities of two organisations.

    Outer-joins the `bfl` rows of `org_x` (left) and `org_y` (right) on
    `match_fields` and prints how many entities each side has.

    Args:
        org_x: organisation entity of the left table (compared as-is against
            `bfl["organisation_entity"]`).
        org_y: organisation entity of the right table.
        match_fields: list of column names to join on.

    Returns:
        DataFrame with the match fields, `entity_x`, `entity_y`, the `_merge`
        indicator (both / left_only / right_only) and the `organisation_x` /
        `organisation_y` values looked up in `org_dict`.
    """
    org_col = bfl["organisation_entity"]
    left = bfl.loc[org_col == org_x]
    right = bfl.loc[org_col == org_y]

    left_org, right_org = org_dict[org_x], org_dict[org_y]
    print(f"no. of entities in left table ({left_org}) = {len(left)}")
    print(f"no. of entities in right table  ({right_org})= {len(right)}")

    keep_cols = match_fields + ["entity_x", "entity_y"] + ["_merge"]
    joined = left.merge(right, how="outer", on=match_fields, indicator=True)[keep_cols]

    # Label every row with the two organisations being compared.
    return joined.assign(organisation_x=left_org, organisation_y=right_org)

# Trial run of check_matches on organisation entities "47" and "54", joining on `reference` only.
cr_test = check_matches("47", "54", ["reference"])
# cr_test_matches = cr_test[cr_test["_merge"] == "both"]["reference"]

# Number of rows per merge outcome (both / left_only / right_only).
cr_test.value_counts("_merge")

In [38]:
def compare_match_fields(org_x, org_y, ref_matches, fields, nulls_equal=False):
    """Measure how often fields agree between two organisations' matched entities.

    For the entities of `org_x` and `org_y` in `bfl` whose `reference` is in
    `ref_matches`, the rows are lined up by reference and each field in
    `fields` is compared value by value.

    Args:
        org_x: organisation entity of the first table.
        org_y: organisation entity of the second table.
        ref_matches: references present for both organisations (e.g. the
            `reference` values of the "both" rows from `check_matches`).
        fields: list of column names to compare.
        nulls_equal: when False (default, the original behaviour) a value
            missing on both sides counts as a mismatch, because NaN never
            equals NaN. Set True to count such pairs as a match.

    Returns:
        One-row DataFrame (row label `x_y_pct_match`) with one column per
        field holding the fraction of matched entities whose values agree.

    Raises:
        ValueError: if the two organisations' matched rows do not line up
            one-to-one on `reference` (for example a reference that is
            duplicated for only one of the organisations).
        KeyError: if a name in `fields` is not a column of `bfl`.
    """

    def _matched_rows(org):
        # Rows of one organisation restricted to the matched references,
        # indexed and sorted by reference so both tables align row for row.
        keep = (bfl["organisation_entity"] == org) & bfl["reference"].isin(ref_matches)
        return bfl[keep].set_index("reference").sort_index()

    left = _matched_rows(org_x)
    right = _matched_rows(org_y)

    # An element-wise DataFrame comparison needs identical labels; fail with
    # a message that says what is wrong rather than the generic pandas one.
    if not left.index.equals(right.index):
        raise ValueError(
            f"references for organisations {org_x} and {org_y} do not line up "
            f"one-to-one ({len(left)} vs {len(right)} rows); check for duplicated references"
        )

    full_comp = left[fields] == right[fields]
    if nulls_equal:
        full_comp = full_comp | (left[fields].isna() & right[fields].isna())

    # calculate match % for each field and make into neat wide table
    results_df_nar = (full_comp.sum(axis=0) / len(full_comp)).to_frame(name="x_y_pct_match")
    return results_df_nar.reset_index().pivot_table(columns="index", values="x_y_pct_match")

# t = compare_match_fields("47", "54", cr_test_matches, ["name", "point", "site", "site_address"])
# t

## Analysis

### Check all old orgs

In [None]:
# Table of all existing brownfield-land provisions: one row per organisation with a
# count of its distinct endpoints whose latest_status is 200 and whose resource is not empty.
# String values are single-quoted: double quotes denote identifiers in SQL, and
# SQLite only treats them as strings through a compatibility quirk that can be disabled.
bfl_prov = datasette_query(
    "performance",
    """
    SELECT
        organisation, name,
        count(distinct endpoint) as count_endpoint
        FROM
        reporting_historic_endpoints
        WHERE
        latest_status = 200
--        AND endpoint_end_date = ''
        AND pipeline = 'brownfield-land'
        AND resource != ''
        GROUP BY organisation, name
    """)

print(len(bfl_prov))
bfl_prov.head()

In [40]:
# Number of brownfield-land entities held by each organisation entity.
bfl_ent_count = (
    bfl.groupby("organisation_entity")["entity"]
    .count()
    .reset_index(name="entity_count")
)

# bfl_ent_count

In [41]:
# Organisations that have an end date.
old_orgs = org_df.loc[org_df["end_date"].notna()]

# Ended organisations that have a brownfield-land provision (inner join),
# with their entity count attached where they have any entities (left join).
old_org_ent_count = pd.merge(
    pd.merge(
        bfl_prov[["organisation", "name"]],
        old_orgs[["organisation", "organisation_entity", "end_date"]],
        on="organisation",
        how="inner",
    ),
    bfl_ent_count,
    on="organisation_entity",
    how="left",
)

# export to manually add in new organisations
# old_org_ent_count.to_csv("old_org_ent_count.csv", index=False)

In [None]:
# Display the ended organisations that have a brownfield-land provision, with their entity counts.
old_org_ent_count

In [None]:
# Spot check: all brownfield-land rows belonging to organisation entity 32.
bfl.loc[bfl["organisation_entity"] == "32"]

In [None]:
# read in appended version with new organisations
# NOTE(review): presumably the old_org_ent_count export with `name_new` and `checked`
# columns added by hand — confirm against the CSV, since the rename below is positional.
old_appended = pd.read_csv("old_org_ent_count_appended.csv", dtype=str)

# join org lookup and then count of entities for new orgs
# First merge: resolve each `name_new` to an organisation via its `org_name`.
# Second merge: attach the new organisation's entity count; `organisation_entity_y`
# is the suffixed column produced by the first merge (assumes old_appended also has
# an `organisation_entity` column — TODO confirm).
old_new_cross_ref = old_appended.merge(
    org_df[["org_name", "organisation", "organisation_entity"]],
    how = "left",
    left_on = "name_new",
    right_on = "org_name"
).merge(
    bfl_ent_count,
    how = "left",
    left_on = "organisation_entity_y",
    right_on = "organisation_entity"
).sort_values(["name_new", "name"])

# Drop the join helper columns, then rename the remaining columns by position:
# this only works if exactly these ten columns remain, in this order.
old_new_cross_ref.drop(["organisation_entity", "org_name"], axis=1, inplace=True)
old_new_cross_ref.columns = [
    "organisation_old", "name_old", "organisation_entity_old", "end_date", "entity_count_old",
    "name_new", "checked", "organisation_new", "organisation_entity_new", "entity_count_new"
]

# save
# old_new_cross_ref.to_csv("old_new_cross_ref.csv", index=False)

old_new_cross_ref.head()

In [None]:
# For every old org whose new org already has entities, cross-reference the two orgs'
# entities on reference + site_address. Matches will be redirected to the new org's
# entity; old org entities without a match will be retired.
org_redirects = old_new_cross_ref[old_new_cross_ref["entity_count_new"].notna()]

org_redirects_df = pd.concat([
    check_matches(old_ent, new_ent, ["reference", "site_address"])
    for old_ent, new_ent in zip(
        org_redirects["organisation_entity_old"],
        org_redirects["organisation_entity_new"],
    )
])

print(len(org_redirects_df))
org_redirects_df.head()

In [None]:
# Number of rows per merge outcome (both / left_only / right_only) across all organisation pairs.
org_redirects_df.value_counts("_merge")

In [121]:
# Build the lines for brownfield-land old-entity.csv:
#   matched in both orgs  -> 301, old entity redirected to the new entity
#   only in the old org   -> 410, old entity retired (entity field left empty)
merge_state = org_redirects_df["_merge"]
orgs_suffix = org_redirects_df["organisation_x"] + " to " + org_redirects_df["organisation_y"] + ",,,"

redirect_entry = (
    org_redirects_df["entity_x"] + ",301," + org_redirects_df["entity_y"]
    + ",redirect due to organisation merge " + orgs_suffix
)
retire_entry = (
    org_redirects_df["entity_x"] + ",410," + ",retire due to organisation merge " + orgs_suffix
)

org_redirects_df["old_entity_entry"] = np.select(
    [merge_state == "both", merge_state == "left_only"],
    [redirect_entry, retire_entry],
    default = "",
)

# Keep redirects and retires only (drop right_only rows), sort for readability and save.
entries_out = org_redirects_df[org_redirects_df["_merge"] != "right_only"]
entries_out = entries_out.sort_values(["organisation_y", "organisation_x", "_merge"])
entries_out.to_csv("old_entity_entries.csv", index = False)

In [None]:
# Number of redirects (both) and retires (left_only) for each old org -> new org pair.
redirect_breakdown = (
    org_redirects_df.loc[org_redirects_df["_merge"] != "right_only"]
    .groupby(
        ["organisation_x", "organisation_y", "_merge"],
        as_index=False,
        observed=True,
    )
    .size()
    .sort_values(["organisation_y", "organisation_x", "_merge"])
)

# Total number of redirect + retire entries.
print(redirect_breakdown["size"].sum())
redirect_breakdown

In [None]:
# Total old-org entity count over the rows whose new organisation already has entities.
old_new_cross_ref.loc[
    old_new_cross_ref["entity_count_new"].notna(), "entity_count_old"
].astype(float).sum()

### Check retires and redirects made to already redirected entities

In [None]:
# old-entity.csv rows whose `old-entity` also appears in the `entity` column of the file.
old_ent_df.loc[old_ent_df["old-entity"].isin(old_ent_df["entity"])]

## Per org replacement, check count of entities and match %s across different fields.
### Bournemouth, Christchurch and Poole Council 

In [None]:
# Compare organisation entity 47 (left) with 54 (right): match their entities on
# `reference`, then measure how often other fields agree for the matched pairs.
ox, oy = "47", "54"

cross_ref = check_matches(ox, oy, ["reference"])
cr_matches = cross_ref.loc[cross_ref["_merge"] == "both", "reference"]

print()
print("Checking entity match on `reference` field")
print(cross_ref.value_counts("_merge"))

print()
print("comparing field match % for entities matched on `reference` field")
compare_match_fields(ox, oy, cr_matches, ["name", "point", "site", "site_address"])

In [None]:
# Compare organisation entity 78 (left) with 54 (right): match their entities on
# `reference`, then measure how often other fields agree for the matched pairs.
ox, oy = "78", "54"

cross_ref = check_matches(ox, oy, ["reference"])
cr_matches = cross_ref.loc[cross_ref["_merge"] == "both", "reference"]

print()
print("Checking entity match on `reference` field")
print(cross_ref.value_counts("_merge"))

print()
print("comparing field match % for entities matched on `reference` field")
compare_match_fields(ox, oy, cr_matches, ["name", "point", "site", "site_address"])

In [None]:
# Compare organisation entity 254 (left) with 54 (right): match their entities on
# `reference`, then measure how often other fields agree for the matched pairs.
ox, oy = "254", "54"

cross_ref = check_matches(ox, oy, ["reference"])
cr_matches = cross_ref.loc[cross_ref["_merge"] == "both", "reference"]

print()
print("Checking entity match on `reference` field")
print(cross_ref.value_counts("_merge"))

print()
print("comparing field match % for entities matched on `reference` field")
compare_match_fields(ox, oy, cr_matches, ["name", "point", "site", "site_address"])

### Buckinghamshire Council

In [None]:
# Compare organisation entity 32 (left) with 67 (right): match their entities on
# `reference`, then measure how often other fields agree for the matched pairs.
ox, oy = "32", "67"

cross_ref = check_matches(ox, oy, ["reference"])
cr_matches = cross_ref.loc[cross_ref["_merge"] == "both", "reference"]

print()
print("Checking entity match on `reference` field")
print(cross_ref.value_counts("_merge"))

print()
print("comparing field match % for entities matched on `reference` field")
compare_match_fields(ox, oy, cr_matches, ["name", "point", "site", "site_address"])

In [None]:
# Compare organisation entity 82 (left) with 67 (right): match their entities on
# `reference`, then measure how often other fields agree for the matched pairs.
ox, oy = "82", "67"

cross_ref = check_matches(ox, oy, ["reference"])
cr_matches = cross_ref.loc[cross_ref["_merge"] == "both", "reference"]

print()
print("Checking entity match on `reference` field")
print(cross_ref.value_counts("_merge"))

print()
print("comparing field match % for entities matched on `reference` field")
compare_match_fields(ox, oy, cr_matches, ["name", "point", "site", "site_address"])

In [None]:
# Compare organisation entity 280 (left) with 67 (right): match their entities on
# `reference`, then measure how often other fields agree for the matched pairs.
ox, oy = "280", "67"

cross_ref = check_matches(ox, oy, ["reference"])
cr_matches = cross_ref.loc[cross_ref["_merge"] == "both", "reference"]

print()
print("Checking entity match on `reference` field")
print(cross_ref.value_counts("_merge"))

print()
print("comparing field match % for entities matched on `reference` field")
compare_match_fields(ox, oy, cr_matches, ["name", "point", "site", "site_address"])

In [None]:
# Compare organisation entity 393 (left) with 67 (right): match their entities on
# `reference`, then measure how often other fields agree for the matched pairs.
ox, oy = "393", "67"

cross_ref = check_matches(ox, oy, ["reference"])
cr_matches = cross_ref.loc[cross_ref["_merge"] == "both", "reference"]

print()
print("Checking entity match on `reference` field")
print(cross_ref.value_counts("_merge"))

print()
print("comparing field match % for entities matched on `reference` field")
compare_match_fields(ox, oy, cr_matches, ["name", "point", "site", "site_address"])

### Dorset

In [None]:
# Compare organisation entity 121 (left) with 112 (right): match their entities on
# `reference`, then measure how often other fields agree for the matched pairs.
ox, oy = "121", "112"

cross_ref = check_matches(ox, oy, ["reference"])
cr_matches = cross_ref.loc[cross_ref["_merge"] == "both", "reference"]

print()
print("Checking entity match on `reference` field")
print(cross_ref.value_counts("_merge"))

print()
print("comparing field match % for entities matched on `reference` field")
compare_match_fields(ox, oy, cr_matches, ["name", "point", "site", "site_address"])

In [None]:
# Compare organisation entity 222 (left) with 112 (right): match their entities on
# `reference`, then measure how often other fields agree for the matched pairs.
ox, oy = "222", "112"

cross_ref = check_matches(ox, oy, ["reference"])
cr_matches = cross_ref.loc[cross_ref["_merge"] == "both", "reference"]

print()
print("Checking entity match on `reference` field")
print(cross_ref.value_counts("_merge"))

print()
print("comparing field match % for entities matched on `reference` field")
compare_match_fields(ox, oy, cr_matches, ["name", "point", "site", "site_address"])

In [None]:
# Compare organisation entity 258 (left) with 112 (right): match their entities on
# `reference`, then measure how often other fields agree for the matched pairs.
ox = "258"
oy = "112"

cross_ref = check_matches(ox, oy, ["reference"])
cr_matches = cross_ref[cross_ref["_merge"] == "both"]["reference"]

print()
print("Checking entity match on `reference` field")
print(cross_ref.value_counts("_merge"))

print()
print("comparing field match % for entities matched on `reference` field")
# The bfl columns were renamed to use underscores on load, so the field is
# `site_address`; the hyphenated name is not a column and raises KeyError.
compare_match_fields(ox, oy, cr_matches, ["name", "point", "site", "site_address"])

In [None]:
# Compare organisation entity 360 (left) with 112 (right): match their entities on
# `reference`, then measure how often other fields agree for the matched pairs.
ox, oy = "360", "112"

cross_ref = check_matches(ox, oy, ["reference"])
cr_matches = cross_ref.loc[cross_ref["_merge"] == "both", "reference"]

print()
print("Checking entity match on `reference` field")
print(cross_ref.value_counts("_merge"))

print()
print("comparing field match % for entities matched on `reference` field")
compare_match_fields(ox, oy, cr_matches, ["name", "point", "site", "site_address"])

In [None]:
# Compare organisation entity 365 (left) with 112 (right): match their entities on
# `reference`, then measure how often other fields agree for the matched pairs.
ox = "365"
oy = "112"

cross_ref = check_matches(ox, oy, ["reference"])
cr_matches = cross_ref[cross_ref["_merge"] == "both"]["reference"]

print()
print("Checking entity match on `reference` field")
print(cross_ref.value_counts("_merge"))

print()
print("comparing field match % for entities matched on `reference` field")
# The bfl columns were renamed to use underscores on load, so the field is
# `site_address`; the hyphenated name is not a column and raises KeyError.
compare_match_fields(ox, oy, cr_matches, ["name", "point", "site", "site_address"])

### East Suffolk

In [None]:
# Compare organisation entity 326 (left) with 132 (right): match their entities on
# `reference`, then measure how often other fields agree for the matched pairs.
ox, oy = "326", "132"

cross_ref = check_matches(ox, oy, ["reference"])
cr_matches = cross_ref.loc[cross_ref["_merge"] == "both", "reference"]

print()
print("Checking entity match on `reference` field")
print(cross_ref.value_counts("_merge"))

print()
print("comparing field match % for entities matched on `reference` field")
compare_match_fields(ox, oy, cr_matches, ["name", "point", "site", "site_address"])

In [None]:
# Compare organisation entity 356 (left) with 132 (right): match their entities on
# `reference`, then measure how often other fields agree for the matched pairs.
ox = "356"
oy = "132"

cross_ref = check_matches(ox, oy, ["reference"])
cr_matches = cross_ref[cross_ref["_merge"] == "both"]["reference"]

print()
print("Checking entity match on `reference` field")
print(cross_ref.value_counts("_merge"))

print()
print("comparing field match % for entities matched on `reference` field")
# The bfl columns were renamed to use underscores on load, so the field is
# `site_address`; the hyphenated name is not a column and raises KeyError.
compare_match_fields(ox, oy, cr_matches, ["name", "point", "site", "site_address"])

In [None]:
# Compare organisation entity 140 (left) with 386 (right): match their entities on
# `reference`, then measure how often other fields agree for the matched pairs.
ox, oy = "140", "386"

cross_ref = check_matches(ox, oy, ["reference"])
cr_matches = cross_ref.loc[cross_ref["_merge"] == "both", "reference"]

print()
print("Checking entity match on `reference` field")
print(cross_ref.value_counts("_merge"))

print()
print("comparing field match % for entities matched on `reference` field")
compare_match_fields(ox, oy, cr_matches, ["name", "point", "site", "site_address"])

In [None]:
# Display the matched references left over from whichever comparison cell was run last.
cr_matches