# Redirecting and retiring brownfield-land entities of retired organisations
**Author**:  Greg Slater <br>
**Date**:  24th September 2024 <br>
**Dataset Scope**: `brownfield-land` <br>
**Report Type**: Ad-hoc analysis <br>

**Purpose**: Identify how to redirect and retire brownfield-land entities belonging to organisations which have been retired and replaced by new organisations (usually new Unitary Authorities).


In [2]:
import os
import urllib
import urllib.parse
from datetime import datetime

import geopandas as gpd
import numpy as np
import pandas as pd
# from sqlite_query_functions import DatasetSqlite

# Let pandas render up to 100 rows before truncating a displayed frame.
pd.options.display.max_rows = 100

# Run date (YYYY-MM-DD), used to stamp output filenames.
td = datetime.now().strftime("%Y-%m-%d")

# Working folder for this analysis; created if it does not exist yet.
data_dir = "../../data/BFL_old_orgs/"
os.makedirs(data_dir, exist_ok=True)
# fn = os.path.join(data_dir, f"FILENAME_{td}.csv")


In [None]:
def get_all_organisations():
    """Fetch the `organisation` table from the planning.data.gov.uk Datasette.

    Sends a SQL query to the `digital-land` database over HTTP and parses the
    CSV response.

    Returns:
        pandas.DataFrame with columns `organisation_entity`, `org_name`,
        `organisation`, `org_type` and `end_date`, read with `dtype=str`.
    """
    # Plain string literal: the query has no placeholders, so no f-string is needed.
    params = urllib.parse.urlencode({
        "sql": """
        select entity as organisation_entity, name as org_name, organisation, dataset as org_type, end_date
        from organisation
        """,
        "_size": "max"
        })
    url = f"https://datasette.planning.data.gov.uk/digital-land.csv?{params}"
    return pd.read_csv(url, dtype=str)

org_df = get_all_organisations()
# lookup: organisation entity id -> organisation code
org_dict = org_df.set_index("organisation_entity")["organisation"].to_dict()


def datasette_query(db, sql_string):
    """Run a SQL query against a planning.data.gov.uk Datasette database.

    Args:
        db: database name, used as the `<db>.csv` segment of the URL.
        sql_string: SQL text sent as the `sql` query parameter.

    Returns:
        pandas.DataFrame parsed from the CSV response.
    """
    query = urllib.parse.urlencode({"sql": sql_string, "_size": "max"})
    return pd.read_csv(f"https://datasette.planning.data.gov.uk/{db}.csv?{query}")

## Data Import

In [171]:
# Load the published brownfield-land dataset as strings and switch the
# hyphenated column names to underscores.
bfl = pd.read_csv(
    "https://files.planning.data.gov.uk/dataset/brownfield-land.csv",
    dtype=str,
).rename(columns=lambda col: col.replace("-", "_"))
print(len(bfl))

# keep a dated snapshot of the data this run was based on
bfl.to_csv(f"brownfield-land_{td}.csv", index=False)

36327


In [82]:
def check_matches(org_x, org_y, match_fields, data=None, org_lookup=None):
    """Cross-reference the entities of two organisations on one or more fields.

    Outer-joins the rows for `org_x` (left) and `org_y` (right) on
    `match_fields` and prints the number of entities on each side.

    Args:
        org_x: organisation entity id of the left-hand organisation.
        org_y: organisation entity id of the right-hand organisation.
        match_fields: column names to join on (any iterable of strings).
        data: frame holding the entities, with `organisation_entity` and
            `entity` columns; defaults to the notebook-level `bfl`.
        org_lookup: mapping of organisation entity id to organisation code;
            defaults to the notebook-level `org_dict`.

    Returns:
        DataFrame with the match fields, `entity_x`, `entity_y`, the pandas
        merge indicator `_merge` (`both` / `left_only` / `right_only`) and the
        two organisation labels as `organisation_x` / `organisation_y`.
    """
    source = bfl if data is None else data
    lookup = org_dict if org_lookup is None else org_lookup
    # the column selection below concatenates lists, so normalise tuples etc.
    fields = list(match_fields)

    dfx = source[source["organisation_entity"] == org_x]
    dfy = source[source["organisation_entity"] == org_y]

    # fall back to the raw id so an organisation missing from the lookup
    # is still reported instead of aborting with a KeyError
    orgx = lookup.get(org_x, org_x)
    orgy = lookup.get(org_y, org_y)
    print(f"no. of entities in left table ({orgx}) = {len(dfx)}")
    print(f"no. of entities in right table  ({orgy})= {len(dfy)}")

    merged = pd.merge(dfx, dfy, how="outer", on=fields, indicator=True)

    # assign() returns a new frame, so the caller never receives a view of `source`
    return merged[fields + ["entity_x", "entity_y", "_merge"]].assign(
        organisation_x=orgx,
        organisation_y=orgy,
    )

cr_test = check_matches("47", "54", ["reference"])
# references found for both organisations; needed by the compare_match_fields demo call
# further down, which otherwise fails with a NameError on a fresh kernel
cr_test_matches = cr_test[cr_test["_merge"] == "both"]["reference"]

cr_test.value_counts("_merge")

no. of entities in left table (local-authority:BMH) = 50
no. of entities in right table  (local-authority:BPC)= 344


_merge
right_only    296
both           48
left_only       2
Name: count, dtype: int64

In [180]:
def compare_match_fields(org_x, org_y, ref_matches, fields, data=None):
    """Share of reference-matched entities whose values agree, per field.

    Takes the rows of each organisation whose `reference` is in `ref_matches`,
    lines the two sets up by `reference`, and compares them cell by cell.

    Args:
        org_x: organisation entity id of the left-hand organisation.
        org_y: organisation entity id of the right-hand organisation.
        ref_matches: references present for both organisations.
        fields: column names to report a match rate for.
        data: frame holding the entities; defaults to the notebook-level `bfl`.

    Returns:
        One-row DataFrame (row label `x_y_pct_match`) with one column per
        field, holding the fraction of matched references whose values are
        equal. Fields whose rate is NaN (e.g. no matched references at all)
        are dropped by `pivot_table`.

    Notes:
        The comparison uses `==`, under which NaN is never equal to NaN, so a
        field that is missing on both sides counts as a mismatch. Both
        organisations must hold exactly the same set of references once
        filtered, otherwise pandas refuses to compare the frames.
    """
    source = bfl if data is None else data
    in_scope = source["reference"].isin(ref_matches)

    def _by_reference(org):
        # One organisation's in-scope rows, indexed and sorted by reference.
        rows = source[(source["organisation_entity"] == org) & in_scope]
        return rows.set_index("reference").sort_index()

    # full comparison
    full_comp = _by_reference(org_x) == _by_reference(org_y)

    # calculate match % for each field and make into neat wide table
    results_df_nar = (full_comp[fields].sum(axis=0) / len(full_comp)).to_frame(name="x_y_pct_match")
    results_df_wide = results_df_nar.reset_index().pivot_table(columns="index", values="x_y_pct_match")
    return results_df_wide

# match rate per field for the references shared by both organisations
t = compare_match_fields(
    org_x="47",
    org_y="54",
    ref_matches=cr_test_matches,
    fields=["name", "point", "site", "site_address"],
)
t

index,name,point,site,site_address
x_y_pct_match,1.0,0.0,0.0,0.708333


## Analysis

### Check all old orgs

In [None]:
# Every organisation with a brownfield-land endpoint whose latest status is 200,
# together with the number of distinct endpoints it has.
bfl_prov = datasette_query(
    db="performance",
    sql_string="""
    SELECT
        organisation, name,
        count(distinct endpoint) as count_endpoint
        FROM
        reporting_historic_endpoints
        WHERE
        latest_status = 200
--        AND endpoint_end_date = ""
        AND pipeline = "brownfield-land"
        AND resource != ""
        GROUP BY organisation, name
    """,
)

print(len(bfl_prov))
bfl_prov.head()

356


Unnamed: 0,organisation,name,count_endpoint
0,development-corporation:Q20648596,Old Oak and Park Royal Development Corporation,2
1,development-corporation:Q6670544,London Legacy Development Corporation,3
2,local-authority:ADU,Adur District Council,3
3,local-authority:ALL,Allerdale Borough Council,2
4,local-authority:AMB,Amber Valley Borough Council,5


In [172]:
# number of brownfield-land entities held by each organisation
bfl_ent_count = (
    bfl.groupby("organisation_entity", as_index=False)["entity"]
    .count()
    .rename(columns={"entity": "entity_count"})
)

In [37]:
# organisations with an end date are treated as the old (retired) ones
old_orgs = org_df.loc[org_df["end_date"].notna()]

# old organisations that have a brownfield-land provision, with their entity counts
old_org_ent_count = pd.merge(
    pd.merge(
        bfl_prov[["organisation", "name"]],
        old_orgs[["organisation", "organisation_entity", "end_date"]],
        how="inner",
        on="organisation",
    ),
    bfl_ent_count,
    how="left",
    on="organisation_entity",
)

# export to manually add in new organisations
# old_org_ent_count.to_csv("old_org_ent_count.csv", index=False)

In [105]:
# manually extended copy of the old-organisation table, with the replacing organisation added
old_appended = pd.read_csv("old_org_ent_count_appended.csv", dtype=str)

# look up the new organisation by name, then attach its entity count
old_new_cross_ref = (
    old_appended
    .merge(
        org_df[["org_name", "organisation", "organisation_entity"]],
        how="left",
        left_on="name_new",
        right_on="org_name",
    )
    .merge(
        bfl_ent_count,
        how="left",
        left_on="organisation_entity_y",
        right_on="organisation_entity",
    )
    .sort_values(["name_new", "name"])
    .drop(columns=["organisation_entity", "org_name"])
)

# NOTE: columns are relabelled by position, so this relies on the column order of the CSV
old_new_cross_ref.columns = [
    "organisation_old", "name_old", "organisation_entity_old", "end_date", "entity_count_old",
    "name_new", "checked", "organisation_new", "organisation_entity_new", "entity_count_new"
]

# save
# old_new_cross_ref.to_csv("old_new_cross_ref.csv", index=False)

old_new_cross_ref.head()

Unnamed: 0,organisation_old,name_old,organisation_entity_old,end_date,entity_count_old,name_new,checked,organisation_new,organisation_entity_new,entity_count_new
21,local-authority:POL,Borough of Poole,254,31/03/2019,122,"Bournemouth, Christchurch and Poole Council",yes,local-authority:BPC,54,344.0
3,local-authority:BMH,Bournemouth Borough Council,47,31/03/2019,50,"Bournemouth, Christchurch and Poole Council",yes,local-authority:BPC,54,344.0
5,local-authority:CHC,Christchurch Borough Council,78,31/03/2019,41,"Bournemouth, Christchurch and Poole Council",yes,local-authority:BPC,54,344.0
1,local-authority:AYL,Aylesbury Vale District Council,32,31/03/2020,16,Buckinghamshire Council,yes,local-authority:BUC,67,313.0
6,local-authority:CHN,Chiltern District Council,82,31/03/2020,51,Buckinghamshire Council,,local-authority:BUC,67,313.0


In [106]:
# Only pairs where the new organisation already has entities can be cross-referenced.
# Old and new entities that match on reference and site_address will be redirected to the
# new organisation's entity; old-organisation entities with no match will be retired.
org_redirects = old_new_cross_ref.loc[old_new_cross_ref["entity_count_new"].notna()]

org_redirects_df = pd.concat(
    [
        check_matches(old_entity, new_entity, ["reference", "site_address"])
        for old_entity, new_entity in zip(
            org_redirects["organisation_entity_old"],
            org_redirects["organisation_entity_new"],
        )
    ]
)

print(len(org_redirects_df))
org_redirects_df.head()

no. of entities in left table (local-authority:POL) = 122
no. of entities in right table  (local-authority:BPC)= 344
no. of entities in left table (local-authority:BMH) = 50
no. of entities in right table  (local-authority:BPC)= 344
no. of entities in left table (local-authority:CHC) = 41
no. of entities in right table  (local-authority:BPC)= 344
no. of entities in left table (local-authority:AYL) = 16
no. of entities in right table  (local-authority:BUC)= 313
no. of entities in left table (local-authority:CHN) = 51
no. of entities in right table  (local-authority:BUC)= 313
no. of entities in left table (local-authority:SBU) = 37
no. of entities in right table  (local-authority:BUC)= 313
no. of entities in left table (local-authority:WYO) = 47
no. of entities in right table  (local-authority:BUC)= 313
no. of entities in left table (local-authority:EDO) = 29
no. of entities in right table  (local-authority:DST)= 83
no. of entities in left table (local-authority:NDO) = 16
no. of entities

Unnamed: 0,reference,site_address,entity_x,entity_y,_merge,organisation_x,organisation_y
0,P/BLR/0001,"The Grain Silo, The Quay, Poole, BH15 1HA",1710299,1708995,both,local-authority:POL,local-authority:BPC
1,P/BLR/0002,"Broadstone Hotel, Station Approach, Poole, BH1...",1710300,1709006,both,local-authority:POL,local-authority:BPC
2,P/BLR/0004,"5 Highmoor Road, Poole, BH14 8SZ",1710301,1708876,both,local-authority:POL,local-authority:BPC
3,P/BLR/0005,"10 Crichel Mount Road, Poole, BH14 8LT",1710302,1708852,both,local-authority:POL,local-authority:BPC
4,P/BLR/0006,"Former James Brothers Site, Blandford Road, Po...",1710303,1708854,both,local-authority:POL,local-authority:BPC


In [107]:
# tally the cross-referenced rows by merge outcome (both / left_only / right_only)
org_redirects_df.value_counts("_merge")

_merge
right_only    3384
left_only      714
both           166
Name: count, dtype: int64

In [121]:
# Build the rows for brownfield-land old-entity.csv:
#   matched on both sides -> 301, redirect the old entity to the new one
#   left_only             -> 410, retire the old entity
# right_only rows get an empty string and are filtered out before saving.
is_redirect = org_redirects_df["_merge"] == "both"
is_retire = org_redirects_df["_merge"] == "left_only"

org_redirects_df["old_entity_entry"] = np.select(
    [is_redirect, is_retire],
    [
        org_redirects_df["entity_x"] + ",301," + org_redirects_df["entity_y"] + ",redirect due to organisation merge " + org_redirects_df["organisation_x"] + " to " + org_redirects_df["organisation_y"] + ",,,",
        org_redirects_df["entity_x"] + ",410," + ",retire due to organisation merge " + org_redirects_df["organisation_x"] + " to " + org_redirects_df["organisation_y"] + ",,,",
    ],
    default="",
)

# keep redirects and retires only, order them for readability, and save
(
    org_redirects_df.loc[org_redirects_df["_merge"] != "right_only"]
    .sort_values(["organisation_y", "organisation_x", "_merge"])
    .to_csv("old_entity_entries.csv", index=False)
)

In [136]:
# number of redirects (both) and retires (left_only) for each old -> new organisation pair
redirect_breakdown = (
    org_redirects_df.loc[org_redirects_df["_merge"] != "right_only"]
    .groupby(
        ["organisation_x", "organisation_y", "_merge"],
        as_index=False,
        observed=True,
    )
    .size()
    .sort_values(["organisation_y", "organisation_x", "_merge"])
)

print(redirect_breakdown["size"].sum())
redirect_breakdown

880


Unnamed: 0,organisation_x,organisation_y,_merge,size
3,local-authority:BMH,local-authority:BPC,left_only,16
4,local-authority:BMH,local-authority:BPC,both,34
5,local-authority:CHC,local-authority:BPC,left_only,2
6,local-authority:CHC,local-authority:BPC,both,39
21,local-authority:POL,local-authority:BPC,left_only,56
22,local-authority:POL,local-authority:BPC,both,66
0,local-authority:AYL,local-authority:BUC,left_only,15
1,local-authority:AYL,local-authority:BUC,both,1
7,local-authority:CHN,local-authority:BUC,left_only,48
8,local-authority:CHN,local-authority:BUC,both,3


In [140]:
# total old-organisation entities for pairs where the new organisation has entities
old_new_cross_ref.loc[
    old_new_cross_ref["entity_count_new"].notna(), "entity_count_old"
].astype(float).sum()

945.0

## Per-organisation replacement: entity counts and field match percentages
### Bournemouth, Christchurch and Poole Council 

In [185]:
ox, oy = "47", "54"

# outer-join the two organisations' entities on `reference`; keep the references found on both sides
cross_ref = check_matches(ox, oy, match_fields=["reference"])
cr_matches = cross_ref.loc[cross_ref["_merge"] == "both", "reference"]

print(
    "",
    "Checking entity match on `reference` field",
    cross_ref.value_counts("_merge"),
    sep="\n",
)
print("", "comparing field match % for entities matched on `reference` field", sep="\n")

# per-field match rate across the reference-matched entities
compare_match_fields(ox, oy, cr_matches, fields=["name", "point", "site", "site_address"])

no. of entities in left table (local-authority:BMH) = 50
no. of entities in right table  (local-authority:BPC)= 344

Checking entity match on `reference` field
_merge
right_only    296
both           48
left_only       2
Name: count, dtype: int64

comparing field match % for entities matched on `reference` field


index,name,point,site,site_address
x_y_pct_match,1.0,0.0,0.0,0.708333


In [186]:
ox, oy = "78", "54"

# outer-join the two organisations' entities on `reference`; keep the references found on both sides
cross_ref = check_matches(ox, oy, match_fields=["reference"])
cr_matches = cross_ref.loc[cross_ref["_merge"] == "both", "reference"]

print(
    "",
    "Checking entity match on `reference` field",
    cross_ref.value_counts("_merge"),
    sep="\n",
)
print("", "comparing field match % for entities matched on `reference` field", sep="\n")

# per-field match rate across the reference-matched entities
compare_match_fields(ox, oy, cr_matches, fields=["name", "point", "site", "site_address"])

no. of entities in left table (local-authority:CHC) = 41
no. of entities in right table  (local-authority:BPC)= 344

Checking entity match on `reference` field
_merge
right_only    305
both           39
left_only       2
Name: count, dtype: int64

comparing field match % for entities matched on `reference` field


index,name,point,site,site_address
x_y_pct_match,1.0,0.0,0.0,1.0


In [187]:
ox, oy = "254", "54"

# outer-join the two organisations' entities on `reference`; keep the references found on both sides
cross_ref = check_matches(ox, oy, match_fields=["reference"])
cr_matches = cross_ref.loc[cross_ref["_merge"] == "both", "reference"]

print(
    "",
    "Checking entity match on `reference` field",
    cross_ref.value_counts("_merge"),
    sep="\n",
)
print("", "comparing field match % for entities matched on `reference` field", sep="\n")

# per-field match rate across the reference-matched entities
compare_match_fields(ox, oy, cr_matches, fields=["name", "point", "site", "site_address"])

no. of entities in left table (local-authority:POL) = 122
no. of entities in right table  (local-authority:BPC)= 344

Checking entity match on `reference` field
_merge
right_only    255
both           89
left_only      33
Name: count, dtype: int64

comparing field match % for entities matched on `reference` field


index,name,point,site,site_address
x_y_pct_match,1.0,0.0,0.0,0.741573


### Buckinghamshire Council

In [188]:
ox, oy = "32", "67"

# outer-join the two organisations' entities on `reference`; keep the references found on both sides
cross_ref = check_matches(ox, oy, match_fields=["reference"])
cr_matches = cross_ref.loc[cross_ref["_merge"] == "both", "reference"]

print(
    "",
    "Checking entity match on `reference` field",
    cross_ref.value_counts("_merge"),
    sep="\n",
)
print("", "comparing field match % for entities matched on `reference` field", sep="\n")

# per-field match rate across the reference-matched entities
compare_match_fields(ox, oy, cr_matches, fields=["name", "point", "site", "site_address"])

no. of entities in left table (local-authority:AYL) = 16
no. of entities in right table  (local-authority:BUC)= 313

Checking entity match on `reference` field
_merge
right_only    312
left_only      15
both            1
Name: count, dtype: int64

comparing field match % for entities matched on `reference` field


index,name,point,site,site_address
x_y_pct_match,1.0,1.0,0.0,1.0


In [189]:
ox, oy = "82", "67"

# outer-join the two organisations' entities on `reference`; keep the references found on both sides
cross_ref = check_matches(ox, oy, match_fields=["reference"])
cr_matches = cross_ref.loc[cross_ref["_merge"] == "both", "reference"]

print(
    "",
    "Checking entity match on `reference` field",
    cross_ref.value_counts("_merge"),
    sep="\n",
)
print("", "comparing field match % for entities matched on `reference` field", sep="\n")

# per-field match rate across the reference-matched entities
compare_match_fields(ox, oy, cr_matches, fields=["name", "point", "site", "site_address"])

no. of entities in left table (local-authority:CHN) = 51
no. of entities in right table  (local-authority:BUC)= 313

Checking entity match on `reference` field
_merge
right_only    289
left_only      27
both           24
Name: count, dtype: int64

comparing field match % for entities matched on `reference` field


index,name,point,site,site_address
x_y_pct_match,1.0,0.0,0.0,0.125


In [190]:
ox, oy = "280", "67"

# outer-join the two organisations' entities on `reference`; keep the references found on both sides
cross_ref = check_matches(ox, oy, match_fields=["reference"])
cr_matches = cross_ref.loc[cross_ref["_merge"] == "both", "reference"]

print(
    "",
    "Checking entity match on `reference` field",
    cross_ref.value_counts("_merge"),
    sep="\n",
)
print("", "comparing field match % for entities matched on `reference` field", sep="\n")

# per-field match rate across the reference-matched entities
compare_match_fields(ox, oy, cr_matches, fields=["name", "point", "site", "site_address"])

no. of entities in left table (local-authority:SBU) = 37
no. of entities in right table  (local-authority:BUC)= 313

Checking entity match on `reference` field
_merge
right_only    294
both           19
left_only      18
Name: count, dtype: int64

comparing field match % for entities matched on `reference` field


index,name,point,site,site_address
x_y_pct_match,1.0,0.0,0.0,0.263158


In [191]:
ox, oy = "393", "67"

# outer-join the two organisations' entities on `reference`; keep the references found on both sides
cross_ref = check_matches(ox, oy, match_fields=["reference"])
cr_matches = cross_ref.loc[cross_ref["_merge"] == "both", "reference"]

print(
    "",
    "Checking entity match on `reference` field",
    cross_ref.value_counts("_merge"),
    sep="\n",
)
print("", "comparing field match % for entities matched on `reference` field", sep="\n")

# per-field match rate across the reference-matched entities
compare_match_fields(ox, oy, cr_matches, fields=["name", "point", "site", "site_address"])

no. of entities in left table (local-authority:WYO) = 47
no. of entities in right table  (local-authority:BUC)= 313

Checking entity match on `reference` field
_merge
right_only    313
left_only      47
both            0
Name: count, dtype: int64

comparing field match % for entities matched on `reference` field


index
x_y_pct_match


### Dorset

In [193]:
ox, oy = "121", "112"

# outer-join the two organisations' entities on `reference`; keep the references found on both sides
cross_ref = check_matches(ox, oy, match_fields=["reference"])
cr_matches = cross_ref.loc[cross_ref["_merge"] == "both", "reference"]

print(
    "",
    "Checking entity match on `reference` field",
    cross_ref.value_counts("_merge"),
    sep="\n",
)
print("", "comparing field match % for entities matched on `reference` field", sep="\n")

# per-field match rate across the reference-matched entities
compare_match_fields(ox, oy, cr_matches, fields=["name", "point", "site", "site_address"])

no. of entities in left table (local-authority:EDO) = 29
no. of entities in right table  (local-authority:DST)= 83

Checking entity match on `reference` field
_merge
right_only    83
left_only     29
both           0
Name: count, dtype: int64

comparing field match % for entities matched on `reference` field


index
x_y_pct_match


In [179]:
ox, oy = "222", "112"

# outer-join the two organisations' entities on `reference`; keep the references found on both sides
cross_ref = check_matches(ox, oy, match_fields=["reference"])
cr_matches = cross_ref.loc[cross_ref["_merge"] == "both", "reference"]

print(
    "",
    "Checking entity match on `reference` field",
    cross_ref.value_counts("_merge"),
    sep="\n",
)
print("", "comparing field match % for entities matched on `reference` field", sep="\n")

# per-field match rate across the reference-matched entities
compare_match_fields(ox, oy, cr_matches, fields=["name", "point", "site", "site_address"])

no. of entities in left table (North Dorset District Council) = 16
no. of entities in right table  (Dorset Council)= 83

Checking entity match on `reference` field
_merge
right_only    83
left_only     16
both           0
Name: count, dtype: int64

comparing field match % for entities matched on `reference` field


index
x_y_pct_match


In [180]:
ox = "258"
oy = "112"

# outer-join the two organisations' entities on `reference`; keep the references found on both sides
cross_ref = check_matches(ox, oy, ["reference"])
cr_matches = cross_ref[cross_ref["_merge"] == "both"]["reference"]

print()
print("Checking entity match on `reference` field")
print(cross_ref.value_counts("_merge"))

print()
print("comparing field match % for entities matched on `reference` field")
# `bfl` column names use underscores (hyphens are replaced on load), so the field is
# `site_address`; the hyphenated spelling raises a KeyError
compare_match_fields(ox, oy, cr_matches, ["name", "point", "site", "site_address"])

no. of entities in left table (Purbeck District Council) = 12
no. of entities in right table  (Dorset Council)= 83

Checking entity match on `reference` field
_merge
right_only    83
left_only     12
both           0
Name: count, dtype: int64

comparing field match % for entities matched on `reference` field


index
x_y_pct_match


In [181]:
ox, oy = "360", "112"

# outer-join the two organisations' entities on `reference`; keep the references found on both sides
cross_ref = check_matches(ox, oy, match_fields=["reference"])
cr_matches = cross_ref.loc[cross_ref["_merge"] == "both", "reference"]

print(
    "",
    "Checking entity match on `reference` field",
    cross_ref.value_counts("_merge"),
    sep="\n",
)
print("", "comparing field match % for entities matched on `reference` field", sep="\n")

# per-field match rate across the reference-matched entities
compare_match_fields(ox, oy, cr_matches, fields=["name", "point", "site", "site_address"])

no. of entities in left table (West Dorset District Council) = 47
no. of entities in right table  (Dorset Council)= 83

Checking entity match on `reference` field
_merge
right_only    83
left_only     47
both           0
Name: count, dtype: int64

comparing field match % for entities matched on `reference` field


index
x_y_pct_match


In [183]:
ox = "365"
oy = "112"

# outer-join the two organisations' entities on `reference`; keep the references found on both sides
cross_ref = check_matches(ox, oy, ["reference"])
cr_matches = cross_ref[cross_ref["_merge"] == "both"]["reference"]

print()
print("Checking entity match on `reference` field")
print(cross_ref.value_counts("_merge"))

print()
print("comparing field match % for entities matched on `reference` field")
# `bfl` column names use underscores (hyphens are replaced on load), so the field is
# `site_address`; the hyphenated spelling raises a KeyError
compare_match_fields(ox, oy, cr_matches, ["name", "point", "site", "site_address"])

no. of entities in left table (Weymouth and Portland Borough Council) = 50
no. of entities in right table  (Dorset Council)= 83

Checking entity match on `reference` field
_merge
right_only    83
left_only     50
both           0
Name: count, dtype: int64

comparing field match % for entities matched on `reference` field


index
x_y_pct_match


### East Suffolk

In [184]:
ox, oy = "326", "132"

# outer-join the two organisations' entities on `reference`; keep the references found on both sides
cross_ref = check_matches(ox, oy, match_fields=["reference"])
cr_matches = cross_ref.loc[cross_ref["_merge"] == "both", "reference"]

print(
    "",
    "Checking entity match on `reference` field",
    cross_ref.value_counts("_merge"),
    sep="\n",
)
print("", "comparing field match % for entities matched on `reference` field", sep="\n")

# per-field match rate across the reference-matched entities
compare_match_fields(ox, oy, cr_matches, fields=["name", "point", "site", "site_address"])

no. of entities in left table (Suffolk Coastal District Council) = 18
no. of entities in right table  (East Suffolk Council)= 141

Checking entity match on `reference` field
_merge
right_only    141
left_only      18
both            0
Name: count, dtype: int64

comparing field match % for entities matched on `reference` field


index
x_y_pct_match


In [186]:
ox = "356"
oy = "132"

# outer-join the two organisations' entities on `reference`; keep the references found on both sides
cross_ref = check_matches(ox, oy, ["reference"])
cr_matches = cross_ref[cross_ref["_merge"] == "both"]["reference"]

print()
print("Checking entity match on `reference` field")
print(cross_ref.value_counts("_merge"))

print()
print("comparing field match % for entities matched on `reference` field")
# `bfl` column names use underscores (hyphens are replaced on load), so the field is
# `site_address`; the hyphenated spelling raises a KeyError
compare_match_fields(ox, oy, cr_matches, ["name", "point", "site", "site_address"])

no. of entities in left table (Waveney District Council) = 33
no. of entities in right table  (East Suffolk Council)= 141

Checking entity match on `reference` field
_merge
right_only    141
left_only      33
both            0
Name: count, dtype: int64

comparing field match % for entities matched on `reference` field


index
x_y_pct_match


In [None]:
ox, oy = "140", "386"

# outer-join the two organisations' entities on `reference`; keep the references found on both sides
cross_ref = check_matches(ox, oy, match_fields=["reference"])
cr_matches = cross_ref.loc[cross_ref["_merge"] == "both", "reference"]

print(
    "",
    "Checking entity match on `reference` field",
    cross_ref.value_counts("_merge"),
    sep="\n",
)
print("", "comparing field match % for entities matched on `reference` field", sep="\n")

# per-field match rate across the reference-matched entities
compare_match_fields(ox, oy, cr_matches, fields=["name", "point", "site", "site_address"])

no. of entities in left table (local-authority:FOR) = 3
no. of entities in right table  (local-authority:WSK)= 28

Checking entity match on `reference` field
_merge
right_only    28
left_only      3
both           0
Name: count, dtype: int64

comparing field match % for entities matched on `reference` field


index
x_y_pct_match


In [177]:
# show the matched references left in `cr_matches` by whichever comparison cell ran last
cr_matches

Series([], Name: reference, dtype: object)