In [None]:
import os
import urllib
import urllib.parse

import geopandas as gpd
import numpy as np
import pandas as pd
import shapely.wkt

In [None]:
# Directory that receives the CSV exports written by this notebook; created if missing.
output_dir = "output/"
os.makedirs(name=output_dir, exist_ok=True)

In [None]:
def get_all_organisations():
    """Download rows of the `organisation` table from the planning.data.gov.uk datasette.

    Returns:
        pandas.DataFrame with the columns organisation_entity (the table's
        `entity` field, aliased), name, organisation, dataset and
        local_planning_authority.
    """
    sql = """
        select entity as organisation_entity, name, organisation, dataset, local_planning_authority
        from organisation
        """
    # "_size": "max" is passed through to datasette as-is
    # NOTE(review): presumably this requests the largest page datasette allows - confirm it covers every row
    query_string = urllib.parse.urlencode({"sql": sql, "_size": "max"})
    return pd.read_csv(
        f"https://datasette.planning.data.gov.uk/digital-land.csv?{query_string}"
    )

def get_pdp_geo_dataset(dataset, underscore_cols=True, crs_out=27700):
    """Load a planning.data.gov.uk dataset GeoJSON as a reprojected GeoDataFrame.

    Args:
        dataset: dataset name used to build the download URL
            (``https://files.planning.data.gov.uk/dataset/<dataset>.geojson``).
        underscore_cols: when true, hyphens in column names are replaced by underscores.
        crs_out: EPSG code the geometries are transformed to (default 27700).

    Returns:
        geopandas.GeoDataFrame whose geometries are in the ``crs_out`` CRS.
    """
    source_url = f"https://files.planning.data.gov.uk/dataset/{dataset}.geojson"
    features = gpd.read_file(source_url)

    if underscore_cols:
        features.columns = [col.replace("-", "_") for col in features.columns]

    # Declare the data as EPSG:4326 first, then project to the requested CRS
    return features.set_crs(epsg=4326).to_crs(epsg=crs_out)

def get_provisions():
    """Download the "expected" open-digital-planning provisions from the datasette.

    The query keeps rows of the `provision` table where provision_reason is
    'expected' and project is 'open-digital-planning', grouped by organisation
    and ordered by cohort.

    Side effect:
        The result is also written to the module-level name `provisions_df`.
        This is kept only so existing callers that rely on it keep working;
        prefer using the return value.

    Returns:
        pandas.DataFrame with the columns cohort, notes, organisation, project,
        provision_reason and start_date.
    """
    global provisions_df
    # String literals use single quotes: in SQLite a double-quoted token is an
    # identifier and is only treated as a string through a legacy fallback.
    # NOTE(review): GROUP BY with non-aggregated columns relies on SQLite picking
    # one row per organisation - confirm that which row is picked does not matter.
    params = urllib.parse.urlencode({
        "sql": """
            SELECT
                cohort, notes, organisation, project, provision_reason, start_date
            FROM
                provision   
            WHERE 
                provision_reason = 'expected'
                AND project = 'open-digital-planning'
            GROUP BY organisation
            ORDER BY cohort
        """,
        "_size": "max"
    })
    url = f"https://datasette.planning.data.gov.uk/digital-land.csv?{params}"
    provisions_df = pd.read_csv(url)
    return provisions_df


## Data in

In [None]:
# provisions returned by get_provisions()
provisions_df = get_provisions()

# all organisations, flagged True where the organisation also appears in the provisions table
org_df = get_all_organisations()
org_df["odp_flag"] = org_df["organisation"].isin(provisions_df["organisation"])

print(len(org_df))

In [None]:
# manual count sheet (published Google Sheet, CSV export), with hyphens in column names replaced by underscores
con_count_df = pd.read_csv(
    "https://docs.google.com/spreadsheets/d/e/2PACX-1vSGZIudsGx0ez4cU-4wSvymvXIFfpDb_qfbS3uW5RiuBkJrJQ9D8k0HBUPtgncRXA/pub?gid=485605871&single=true&output=csv"
).rename(columns=lambda col: col.replace("-", "_"))

# attach organisation name, organisation code, LPA code and ODP flag to every manual count row
org_join_cols = ["organisation_entity", "name", "organisation", "local_planning_authority", "odp_flag"]
con_count_lpa_df = con_count_df.merge(
    org_df[org_join_cols],
    on="organisation_entity",
    how="left",
)

print(len(con_count_lpa_df))
# con_count_lpa_df.head()

In [None]:
# Conservation areas (CA) published on the platform (PDP)
ca_df = pd.read_csv(
    "https://files.planning.data.gov.uk/dataset/conservation-area.csv",
    usecols=["entity", "name", "organisation-entity", "reference", "entry-date", "point", "geometry"],
)
ca_df.columns = [col.replace("-", "_") for col in ca_df.columns]

# parse the WKT text in `point` into shapely geometries and use that column as the geometry
ca_df["point"] = ca_df["point"].map(shapely.wkt.loads)

# declare the points as EPSG:4326, then transform to EPSG:27700 for more interpretable units
ca_gdf = gpd.GeoDataFrame(ca_df, geometry="point", crs="EPSG:4326").to_crs(epsg=27700)

In [None]:
# Latest ONS LPA file, for flagging whether pdp LPAs are 2023 or not
# ons_lpa_gpd = gpd.read_file("https://services1.arcgis.com/ESMARspQHYMw9BZ9/arcgis/rest/services/Local_Planning_Authorities_April_2023_Boundaries_UK_BGC/FeatureServer/0/query?outFields=*&where=1%3D1&f=geojson",)

# print(len(ons_lpa_gpd))
# ons_lpa_gpd.head()

In [None]:
# LPA boundaries from PDP site
# lpa_gdf = get_pdp_geo_dataset("local-planning-authority")

lpa_gdf["lpa_2023"] = np.where(lpa_gdf["reference"].isin(ons_lpa_gpd["LPA23CD"]), True, False)
lpa_gdf.rename(columns={'name':'lpa_name'}, inplace=True)

print(len(lpa_gdf))
print(len(lpa_gdf[lpa_gdf["lpa_2023"]]))
# lpa_gdf.head()

## Analysis

### Spatial joining - LPA boundaries to conservation area points

In [None]:
# spatial left join: every LPA boundary paired with each conservation-area point it intersects
lpa_with_ca_points = gpd.sjoin(
    lpa_gdf[["reference", "lpa_name", "lpa_2023", "geometry"]],
    ca_gdf[["entity", "organisation_entity", "point"]],
    how="left",
    predicate="intersects",
)

# then add the name of the organisation that supplied each matched conservation area
lpa_ca_join = lpa_with_ca_points.merge(
    org_df[["organisation_entity", "name"]],
    on="organisation_entity",
    how="left",
)

# force name to string type (missing names become the text "nan")
lpa_ca_join["name"] = lpa_ca_join["name"].astype(str)

# rank the providing org type so a per-LPA minimum picks LPA over Historic England:
#   1 = any organisation other than entity 16, 2 = entity 16 (Historic England), 3 = no provider
supplied_by_he = lpa_ca_join["organisation_entity"] == 16
supplied_by_other = lpa_ca_join["organisation_entity"].notnull() & ~supplied_by_he
lpa_ca_join["org_type_rank"] = np.select(
    [supplied_by_other, supplied_by_he],
    [1, 2],
    default=3,
)

print(len(lpa_ca_join))
# lpa_ca_join.head()


In [None]:
# count no. of conservation areas per LPA then join on the manual counts
lpa_ca_join_count = lpa_ca_join.groupby(
        ["reference", "lpa_name", "lpa_2023"]
    ).agg(
        {"entity" : "count",
         # sorted() gives a stable provider list: iteration order of a set of strings
         # can differ between Python sessions, which made this column unreproducible
         "name" : lambda x: ', '.join(sorted(set(x))),
         "organisation_entity" : "nunique",
         "org_type_rank" : "min"}
    ).reset_index(
    ).merge(
        con_count_lpa_df[["local_planning_authority", "name", "conservation_area_count", "odp_flag"]],
        how = "left",
        left_on = "reference",
        right_on = "local_planning_authority"
    )

# rename cols (name_x / name_y are the suffixes pandas adds to the two clashing "name" columns)
lpa_ca_join_count = lpa_ca_join_count.rename(columns=
                         {"entity":"count_platform",
                          "name_x":"platform_data_providers",
                          "organisation_entity" : "n_platform_data_providers",
                          "conservation_area_count":"count_manual",
                          "name_y":"lpa_name_manual"})

# calculate count comparison delta: relative difference of platform count vs manual count
lpa_ca_join_count["count_delta"] = (lpa_ca_join_count["count_platform"] - lpa_ca_join_count["count_manual"]) / lpa_ca_join_count["count_manual"]
lpa_ca_join_count["count_delta_abs"] = lpa_ca_join_count["count_delta"].abs()
# use org type rank to flag the best provider for an area
lpa_ca_join_count["provider_org_type"] = lpa_ca_join_count["org_type_rank"].map({1:"LPA", 2:"Historic England", 3:"None"})
# more than one distinct provider is labelled as the HE + LPA combination
lpa_ca_join_count["provider_org_type_s"] = np.where(lpa_ca_join_count["n_platform_data_providers"] > 1, "Historic England & LPA", lpa_ca_join_count["provider_org_type"])

# lpa_ca_join_count.to_csv(os.path.join(output_dir, "LPA_conservation_area_count_comparison.csv"), index = False)

# lpa_ca_join_count.head()

#### Get single LPA layers

Where we have manual CA counts from organisations which are now technically "retired" LPAs (i.e. replaced by a newer LPA), it indicates that the data is still divided and provided by these historic orgs. In these cases we don't want to show the new 2023 LPA on the map at the same time: it overlaps the retired boundaries, and showing it would be confusing because we haven't technically collected data from the new org.

So we want to find the new 2023 LPAs which sit over retired LPAs that have supplied us with data, so we can remove them from the map and get a single contiguous layer which is a mix of historic and current LPA boundaries.

In [None]:
# geodataframe of the match counts for all LPAs: boundary geometry plus the per-LPA count columns
lpa_boundaries = lpa_gdf[["reference", "geometry"]]
lpa_ca_join_count_gdf = lpa_boundaries.merge(lpa_ca_join_count, on="reference", how="left")

In [None]:
is_2023_boundary = lpa_ca_join_count_gdf["lpa_2023"] == True
is_not_2023_boundary = lpa_ca_join_count_gdf["lpa_2023"] == False
has_manual_count = lpa_ca_join_count_gdf["count_manual"].notnull()

# old lpas = those which are not a 2023 boundary and which have a manual count
old_lpas = lpa_ca_join_count_gdf[is_not_2023_boundary & has_manual_count]

# buffer the 2023 boundaries by 100 CRS units, so old LPAs lying inside them are found
# even where the edges do not line up exactly
# NOTE(review): 100 is presumably metres, given the EPSG:27700 default used when loading - confirm
buffered_new_lpas = lpa_ca_join_count_gdf.loc[
    is_2023_boundary, ["reference", "lpa_name", "geometry"]
].copy()
buffered_new_lpas["geometry"] = buffered_new_lpas["geometry"].buffer(100)

# new 2023 lpas to flag are those which have an "old" lpa within them
# (columns shared by both sides come back suffixed _left / _right)
new_lpas = gpd.sjoin(
    old_lpas[["reference", "lpa_name", "lpa_2023", "geometry"]],
    buffered_new_lpas,
    how="inner",
    predicate="within",
)

In [None]:
# references of old LPAs to show, and of the 2023 LPAs that sit over them (to hide)
old_lpas_incl_list = old_lpas["reference"].drop_duplicates().values
new_lpas_excl_list = new_lpas["reference_right"].drop_duplicates().values

# show an LPA when it is in the old include list ...
show_as_old = lpa_ca_join_count_gdf["reference"].isin(old_lpas_incl_list)
# ... or when it is a 2023 boundary that is not in the new exclude list
show_as_new = (lpa_ca_join_count_gdf["lpa_2023"] == True) & ~lpa_ca_join_count_gdf["reference"].isin(new_lpas_excl_list)

lpa_ca_join_count_gdf["old_lpa_combo_display"] = (show_as_old | show_as_new).to_numpy()


In [None]:
# test whether new flag gives consistent single layer

# lpa_ca_join_count_gdf[lpa_ca_join_count_gdf["old_lpa_combo_display"] == True].explore(
#     color = "blue",
#     tooltip = False,
#     # popup = ["name", "LPACD"],
#         style_kwds = {
#         "fillOpacity" : "0.1"
#         }
# )

In [None]:
# show both on a map

# old_lpas_list = old_lpas["reference"].drop_duplicates().values
# new_lpas_list = new_lpas["reference_right"].drop_duplicates().values


# map_entities = lpa_ca_join_count_gdf[lpa_ca_join_count_gdf["reference"].isin(old_lpas_list)].explore(
#     color = "red",
#     tooltip = False,
#     # popup = ["organisation_name", "entity", "name", "reference"],
#     tiles = "CartoDB positron",  # use "CartoDB positron" tiles
#     # highlight = False,
#     style_kwds = {
#     "fillOpacity" : "0.1"
#     }
# )

# buffered_new_lpas[buffered_new_lpas["reference"].isin(new_lpas_list)].explore(
#     m = map_entities,
#     color = "blue",
#     tooltip = False,
#     # popup = ["name", "LPACD"],
#         style_kwds = {
#         "fillOpacity" : "0"
#         }
# )

#### Summary matching figures

In [None]:
def _ratio(numerator, denominator):
    """Return numerator / denominator, or NaN when the denominator is zero."""
    return numerator / denominator if denominator else float("nan")


# LPAs in the combined old/new display layer
count_mixed = lpa_ca_join_count_gdf[lpa_ca_join_count_gdf["old_lpa_combo_display"] == True].copy()

has_platform_data = count_mixed["count_platform"] > 0
no_platform_data = count_mixed["count_platform"] == 0

n_lpas_total = len(count_mixed)
n_lpas_on_pdp = int(has_platform_data.sum())
n_lpas_not_on_pdp = int(no_platform_data.sum())
n_perfect_matches = int((count_mixed["count_delta"] == 0).sum())
n_within_10_pct = int((count_mixed["count_delta"].abs() <= .1).sum())

# _ratio() keeps the cell from raising ZeroDivisionError when a denominator is 0
print(f"Total LPAs (Old/New LPA combo): {n_lpas_total}")
print(f"n LPAs with CA data on the site: {n_lpas_on_pdp} ({_ratio(n_lpas_on_pdp, n_lpas_total):.0%} pct of total LPAs)")
print(f"n LPAs without CA data on the site: {n_lpas_not_on_pdp} ({_ratio(n_lpas_not_on_pdp, n_lpas_total):.0%} pct of total LPAs)")

print("")

print(f"n LPAs where count of CAs for site and manual check matches exactly: {n_perfect_matches} ({_ratio(n_perfect_matches, n_lpas_on_pdp):.0%} pct of LPAs with data on the site)")
print(f"n LPAs where count of CAs for site and manual check is within +/- 10%: {n_within_10_pct} ({_ratio(n_within_10_pct, n_lpas_on_pdp):.0%}  pct of LPAs with data on the site)")

print("")
print("")

# Series.sum() skips missing values; the builtin sum() would turn the total into NaN
n_cas = count_mixed["count_platform"].sum()
mean_ca_per_lpa = _ratio(n_cas, n_lpas_on_pdp)
# projection assumes LPAs without data hold the same mean number of CAs as those with data
projected_missing = mean_ca_per_lpa * n_lpas_not_on_pdp
counted_missing = count_mixed.loc[no_platform_data, "count_manual"].sum()

print(f"Total CAs on site (within Old/New LPA combo LPA boundaries): {n_cas:,}")
print(f"mean no. of CAs per LPA on site: {mean_ca_per_lpa:.3g}")
print(f"projected CAs still to add: ~{projected_missing:,.0f}")
print(f"counted CAs still to add: {counted_missing:,.0f}")


In [None]:
# number and share of displayed LPAs per provider type, smallest share first
source_count = (
    count_mixed.groupby(["provider_org_type_s"])
    .size()
    .reset_index(name="count")
    .assign(pct=lambda counts: counts["count"] / counts["count"].sum())
    .sort_values(by="pct")
)
source_count

In [None]:
# horizontal bar per provider type, sized by its share of LPAs
bar_colours = ["#A285D1", "#F46A25", "grey", "#28A197"]
ax = source_count.plot.barh(
    x="provider_org_type_s",
    y="pct",
    color=bar_colours,
    title="% of English LPAs by conservation-area data provider type",
    ylabel="",
    legend=False,
    xlim=[0, 0.8],
)

# write the percentage next to each bar
pct_bars = ax.containers[0]
ax.bar_label(pct_bars, fmt='{:,.0%}', padding=4)

# hide x-axis ticks (major and minor, top and bottom edges) and their labels
ax.tick_params(axis="x", which="both", bottom=False, top=False, labelbottom=False)

In [None]:
# inspect the first container of the chart axes (the same object passed to ax.bar_label above)
ax.containers[0]

In [None]:
# number of displayed LPAs per best provider type
count_mixed.groupby("provider_org_type").size()

### Entity merging and history
The aim here is to flag entities that have been merged with data from another organisation, or that have had another entity redirected to them.

In [None]:
def _with_eng_suffix(org_code):
    """Insert "-eng" before the colon of an organisation code ("prefix:ref" -> "prefix-eng:ref")."""
    parts = org_code.split(":")
    return parts[0] + "-eng:" + parts[1]


# old-entity and lookup csvs from the conservation-area pipeline config, with underscore column names
ca_old_ent = pd.read_csv(
    "https://raw.githubusercontent.com/digital-land/config/main/pipeline/conservation-area/old-entity.csv"
).rename(columns=lambda col: col.replace("-", "_"))
ca_lookup = pd.read_csv(
    "https://raw.githubusercontent.com/digital-land/config/main/pipeline/conservation-area/lookup.csv"
).rename(columns=lambda col: col.replace("-", "_"))

# copy of the local-authority organisations with "-eng" added to the organisation field
org_eng_sub = org_df[org_df["dataset"] == "local-authority"].copy()
org_eng_sub["organisation"] = org_eng_sub["organisation"].map(_with_eng_suffix)

# org lookup: the CA lookup.csv contains both the normal and the "-eng" organisation codes,
# so both versions are stacked into one lookup table
lookup_cols = ["organisation", "organisation_entity", "dataset"]
org_eng_lookup = pd.concat([org_df[lookup_cols], org_eng_sub[lookup_cols]])

ca_lookup = ca_lookup.merge(
    org_eng_lookup[["organisation_entity", "organisation", "dataset"]],
    on="organisation",
    how="left",
)

print(len(ca_old_ent))
print(len(ca_lookup))

In [None]:
# flag LPA or Historic England supplier
# The default is the *string* "nan": string choices cannot be combined with a float
# np.nan default on NumPy 2 (TypeError: no common dtype), while older NumPy silently
# produced this same "nan" text. Keeping it a string also means the flag can be
# joined as text when suppliers are listed per entity.
ca_lookup["supplier_flag"] = np.select([
    ca_lookup["organisation_entity"] == 16,
    ca_lookup["dataset"] == "local-authority"
    ],
    ["HE", "LPA"],
    default = "nan")

# ca_lookup.head()

In [None]:
# breaking down multiple entries in the lookup file with the same entity number, and where the supplier for duplicates is from HE or LPA
entity_count = ca_lookup.groupby(
        ["entity"]
    ).agg(
        {"reference" : "count",
         "organisation_entity" : "nunique",
         # sorted() makes the list order stable; iterating a set of strings can give
         # a different order in every Python session
         "supplier_flag" : lambda x: ', '.join(sorted(set(x)))}
    ).reset_index()

entity_count = entity_count.rename(columns={
    "reference" : "n_references",
    "organisation_entity" : "n_orgs_distinct",
    "supplier_flag" : "supplier_list"})

# list of all entities which have an entry for both an LPA and HE
# (the list is sorted alphabetically above, so the pair always reads "HE, LPA")
he_lpa_ent_dupes = entity_count[entity_count["supplier_list"] == "HE, LPA"]

# show breakdown of entities with more than one lookup entry
entity_count[entity_count["n_references"] > 1].groupby(["n_references", "n_orgs_distinct", "supplier_list"]).size().reset_index(name = "n_instances")


In [None]:
# quick check of total no. of entities with multiple lookups in the CA file from PDP
# ca_df["he_lpa_dupe"] =np.where(ca_df["entity"].isin(he_lpa_ent_dupes["entity"].values), True, False)
# ca_df.groupby("he_lpa_dupe").size()

In [None]:
# quick check of how many are redirects
# print(len(ca_df))
# print(len(ca_df[ca_df["entity"].isin(ca_old_ent["entity"])]))

In [None]:
# create count of conservation areas and supplying orgs per LPA (granular), with dupe and redirect flags in
# NOTE: the two flag columns are added to lpa_ca_join itself; later cells read them from there
lpa_ca_join["he_lpa_merged"] = np.where(lpa_ca_join["entity"].isin(he_lpa_ent_dupes["entity"].values), 1, 0)
lpa_ca_join["is_redirect"] = np.where(lpa_ca_join["entity"].isin(ca_old_ent["entity"].dropna()), 1, 0)

# one row per LPA and supplying organisation; dropna=False keeps rows whose organisation_entity is missing
lpa_ca_join_count_granular = lpa_ca_join.groupby(
        ["reference", "lpa_name", "lpa_2023", "organisation_entity", "name"],
        dropna=False
    ).agg(
        {"entity" : "count",
         "he_lpa_merged" : "sum",
         "is_redirect" : "sum"}
    ).reset_index(
    ).merge(
        con_count_lpa_df[["local_planning_authority", "name", "organisation", "conservation_area_count", "odp_flag"]],
        how = "left",
        left_on = "reference",
        right_on = "local_planning_authority"
    )

# rename cols (name_x / name_y are the suffixes pandas adds to the two clashing "name" columns)
lpa_ca_join_count_granular = lpa_ca_join_count_granular.rename(columns=
                         {"entity":"count_platform",
                          "he_lpa_merged":"n_he_lpa_merged",
                          "is_redirect":"n_ent_redirects",
                          "name_x":"platform_data_providers",
                          "conservation_area_count":"count_manual",
                          "name_y":"lpa_name_manual"})

# add old/new combo layer LPA flag
lpa_ca_join_count_granular["old_lpa_combo_display"] = np.select(
    [
        # is in old include list - show
        lpa_ca_join_count_granular["reference"].isin(old_lpas_incl_list),
        # is new and not in the new exclude list - show
        (lpa_ca_join_count_granular["lpa_2023"] == True) & (~lpa_ca_join_count_granular["reference"].isin(new_lpas_excl_list))
    ],
    [True, True],
    default = False
)

lpa_ca_join_count_granular.to_csv(os.path.join(output_dir, "LPA_conservation_area_count_comparison - granular.csv"), index = False)

# preview goes last: only the final expression of a cell is rendered
lpa_ca_join_count_granular.head()

In [None]:
# re-do LPA count to include the merge/redirect info, plus the single LPA layer flag
lpa_ca_join_count2 = lpa_ca_join.groupby(
        ["reference", "lpa_name", "lpa_2023"]
    ).agg(
        {"entity" : "count",
         "he_lpa_merged" : "sum",
         "is_redirect" : "sum",
         # sorted() keeps the exported provider list identical between runs; iterating
         # a set of strings can give a different order in every Python session
         "name" : lambda x: ', '.join(sorted(set(x))),
         "organisation_entity" : "nunique",
         "org_type_rank" : "min"}
    ).reset_index(
    ).merge(
        con_count_lpa_df[["local_planning_authority", "name", "conservation_area_count", "odp_flag"]],
        how = "left",
        left_on = "reference",
        right_on = "local_planning_authority"
    )

# rename cols (name_x / name_y are the suffixes pandas adds to the two clashing "name" columns)
lpa_ca_join_count2 = lpa_ca_join_count2.rename(columns=
                         {"entity":"count_platform",
                          "he_lpa_merged":"n_he_lpa_merged",
                          "is_redirect":"n_ent_redirects",
                          "name_x":"platform_data_providers",
                          "organisation_entity" : "n_platform_data_providers",
                          "conservation_area_count":"count_manual",
                          "name_y":"lpa_name_manual"})

lpa_ca_join_count2["old_lpa_combo_display"] = np.select(
    [
        # is in old include list - show
        lpa_ca_join_count2["reference"].isin(old_lpas_incl_list),
        # is new and not in the new exclude list - show
        (lpa_ca_join_count2["lpa_2023"] == True) & (~lpa_ca_join_count2["reference"].isin(new_lpas_excl_list))
    ],
    [True, True],
    default = False
)

# calculate count comparison delta: relative difference of platform count vs manual count
lpa_ca_join_count2["count_delta"] = (lpa_ca_join_count2["count_platform"] - lpa_ca_join_count2["count_manual"]) / lpa_ca_join_count2["count_manual"]
lpa_ca_join_count2["count_delta_abs"] = lpa_ca_join_count2["count_delta"].abs()
# use org type rank to flag the best provider for an area
lpa_ca_join_count2["provider_org_type"] = lpa_ca_join_count2["org_type_rank"].map({1:"LPA", 2:"Historic England", 3:"None"})

lpa_ca_join_count2.to_csv(os.path.join(output_dir, "LPA_conservation_area_count_comparison.csv"), index = False)

lpa_ca_join_count2.head()

### Merging problem summary stats

In [None]:
# LPAs shown in the combined old/new layer
in_combo_layer = lpa_ca_join_count_gdf["old_lpa_combo_display"] == True
combo_lpas_count = lpa_ca_join_count_gdf[in_combo_layer]

n_total_lpas = len(combo_lpas_count)

# how many of those LPAs have more than one distinct data provider on the platform
has_multiple_providers = combo_lpas_count["n_platform_data_providers"] > 1
n_lpas_with_multiple_provs = len(combo_lpas_count[has_multiple_providers])

n_lpas_with_multiple_provs

In [None]:
# preview the first five rows of the per-LPA count comparison
lpa_ca_join_count.head(n=5)

In [None]:
# ODP Summary stats

# granular (LPA x provider) rows for organisations flagged as ODP
odp_lpas = lpa_ca_join_count_granular[lpa_ca_join_count_granular["odp_flag"] == True]

# flag lpas which have supplied data at some point:
#   - rows supplied by an organisation other than entity 16 (HE) with at least one CA on the platform, or
#   - rows showing as HE that carry merge or redirect flags, indicating entities for the LPA
#     were also created at some point
supplied_by_lpa = (odp_lpas["organisation_entity"] != 16) & (odp_lpas["count_platform"] > 0)
has_merge_or_redirect = (odp_lpas["n_he_lpa_merged"] > 0) | (odp_lpas["n_ent_redirects"] > 0)
he_with_lpa_history = (odp_lpas["organisation_entity"] == 16) & has_merge_or_redirect
odp_providers = odp_lpas[supplied_by_lpa | he_with_lpa_history]

# per LPA: number of provider rows plus totals of CAs, merges and redirects
odp_providers_prov_count = (
    odp_providers.groupby(["reference", "lpa_name"])
    .agg(
        {
            "platform_data_providers": "count",
            "count_platform": "sum",
            "n_he_lpa_merged": "sum",
            "n_ent_redirects": "sum",
        }
    )
    .reset_index()
)

odp_dist = odp_lpas["lpa_name"].drop_duplicates()
odp_provs_dist = odp_providers["lpa_name"].drop_duplicates()

n_providers = odp_providers_prov_count["platform_data_providers"]
n_merges_and_redirects = odp_providers_prov_count["n_he_lpa_merged"] + odp_providers_prov_count["n_ent_redirects"]

odp_provs_w_multi = odp_providers_prov_count[n_providers > 1]
odp_provs_w_merges = odp_providers_prov_count[(n_providers == 1) & (n_merges_and_redirects > 0)]
odp_provs_clean = odp_providers_prov_count[(n_providers == 1) & (n_merges_and_redirects == 0)]

print(f"n ODP LPAs : {len(odp_dist)}")
print(f"n ODP LPAs who've supplied data ever: {len(odp_provs_dist)}")
print(f"n ODP LPAs (supplying data) with multiple providers: {len(odp_provs_w_multi)}")
print(f"n ODP LPAs (supplying data) with single provider but merges: {len(odp_provs_w_merges)}")
print(f"n ODP LPAs (supplying data) with single provider and no merges: {len(odp_provs_clean)}")

# show summary table
# odp_providers_prov_count[odp_providers_prov_count["platform_data_providers"] > 1]

In [None]:
# check single example (reference E60000279) in granular summary
lpa_ca_join_count_granular.loc[lpa_ca_join_count_granular["reference"].eq("E60000279")]
# lpa_ca_join[lpa_ca_join["reference"] == "E60000279"].sort_values(by = ["name", "he_lpa_merged", "is_redirect"])

In [None]:
# Exporting some data to map for Waverley example
ex_wav = lpa_ca_join.loc[lpa_ca_join["reference"] == "E60000279"].copy()

# label suffixes built from the 0/1 merge and redirect flags
merge_suffix = {1: "_merged", 0: ""}
redirect_suffix = {1: "_redirect", 0: ""}
ex_wav["merge_lab"] = ex_wav["he_lpa_merged"].map(merge_suffix)
ex_wav["redirect_lab"] = ex_wav["is_redirect"].map(redirect_suffix)

# provider name followed by whichever suffixes apply
ex_wav["provider_flag"] = ex_wav["name"] + ex_wav["merge_lab"] + ex_wav["redirect_lab"]

# ex_wav
# ca_df[["entity", "geometry"]].merge(
#     ex_wav[["entity", "name", "provider_flag"]]
# ).to_csv(os.path.join(output_dir, "entity_merging_example_waverly.csv"), index = False)

In [None]:
# Checking entity-level data for some LPAs - Doncaster (first two rows)
lpa_ca_join.loc[lpa_ca_join["reference"] == "E60000065"].head(2)

In [None]:
# Checking entity-level data for some LPAs - Somerset
# lpa_ca_join[lpa_ca_join["reference"] == "E60000337"]