In [None]:
import os
import urllib
import urllib.parse

import geopandas as gpd
import numpy as np
import pandas as pd
import shapely.wkt

In [None]:
# Folder that receives the files this notebook exports. It is created up front
# (no error if it already exists) so later writes do not fail on a fresh checkout.
# Relies on `os` being imported in the notebook's import cell.
output_dir = "output/"
os.makedirs(output_dir, exist_ok=True)

In [None]:
def get_all_organisations():
    """Download the organisation table from the planning.data.gov.uk datasette.

    Returns:
        pandas.DataFrame: the rows of the `organisation` table with the columns
        `organisation_entity` (the table's `entity`), `name`, `organisation`,
        `dataset`, `local_planning_authority`, `local_authority_district` and
        `statistical_geography`. `statistical_geography` holds
        `local_authority_district` for rows whose dataset is `local-authority`
        and `local_planning_authority` for every other row.
    """
    # 'local-authority' is single-quoted so it is unambiguously a SQL string
    # literal; a double-quoted token is an identifier first and only falls back
    # to a string when no such column exists.
    sql = """
        select entity as organisation_entity, name, organisation, dataset, local_planning_authority, local_authority_district,
        case when dataset = 'local-authority' then local_authority_district else local_planning_authority end as statistical_geography
        from organisation
        """
    # NOTE(review): `_size=max` is passed straight through to datasette; confirm
    # the CSV response is not truncated by a server-side row limit.
    params = urllib.parse.urlencode({"sql": sql, "_size": "max"})
    url = f"https://datasette.planning.data.gov.uk/digital-land.csv?{params}"
    return pd.read_csv(url)

In [None]:
def get_pdp_geo_dataset(dataset, underscore_cols=True, crs_out=27700):
    """Read a dataset's GeoJSON export from files.planning.data.gov.uk.

    Args:
        dataset: dataset name, used as the file stem of the `.geojson` URL.
        underscore_cols: when true, hyphens in column names become underscores.
        crs_out: EPSG code the returned frame is reprojected to.

    Returns:
        geopandas.GeoDataFrame: the dataset, declared as EPSG:4326 and then
        reprojected to `crs_out`.
    """
    source = f"https://files.planning.data.gov.uk/dataset/{dataset}.geojson"
    gdf = gpd.read_file(source)

    if underscore_cols:
        gdf.columns = gdf.columns.str.replace("-", "_", regex=False)

    # Declare the source CRS first, then reproject to the requested one.
    return gdf.set_crs(epsg=4326).to_crs(epsg=crs_out)

## Data in

In [None]:
# Organisation lookup used to attach names and LPA codes to entity ids
org_df = get_all_organisations()
print(org_df.shape[0])

In [None]:
# Manual count sheet, with hyphenated column names switched to underscores
con_count_df = pd.read_csv("data/conservation_area_count.csv").rename(
    columns=lambda col: col.replace("-", "_")
)

# Attach each organisation's name and LPA code; the left join keeps every
# counted row even when the organisation is missing from the lookup
org_lookup = org_df[["organisation_entity", "name", "local_planning_authority"]]
con_count_lpa_df = con_count_df.merge(org_lookup, on="organisation_entity", how="left")

print(con_count_lpa_df.shape[0])
con_count_lpa_df.head()

In [None]:
# Conservation areas (CA) as published on the platform; only the fields used
# below are read
ca_columns = ["entity", "name", "organisation-entity", "reference", "entry-date", "point"]
ca_df = pd.read_csv(
    "https://files.planning.data.gov.uk/dataset/conservation-area.csv",
    usecols=ca_columns,
)
ca_df.columns = ca_df.columns.str.replace("-", "_", regex=False)

# Parse the WKT text in `point` into shapely geometries and use that column as
# the frame's geometry
ca_df["point"] = ca_df["point"].map(shapely.wkt.loads)

# Declare the points as EPSG:4326, then transform to EPSG:27700 for more
# interpretable area units
ca_gdf = (
    gpd.GeoDataFrame(ca_df, geometry="point")
    .set_crs(epsg=4326)
    .to_crs(epsg=27700)
)

In [None]:
# ONS April 2023 LPA boundaries; the LPA23CD codes are what mark a platform
# LPA as a current (2023) authority further down
ons_lpa_url = "https://services1.arcgis.com/ESMARspQHYMw9BZ9/arcgis/rest/services/Local_Planning_Authorities_April_2023_Boundaries_UK_BGC/FeatureServer/0/query?outFields=*&where=1%3D1&f=geojson"
ons_lpa_gpd = gpd.read_file(ons_lpa_url)

print(ons_lpa_gpd.shape[0])
ons_lpa_gpd.head()

In [None]:
# LPA boundaries as held on the platform (PDP)
lpa_gdf = get_pdp_geo_dataset("local-planning-authority")

# True when the platform reference appears among the ONS 2023 LPA codes
lpa_gdf["lpa_2023"] = lpa_gdf["reference"].isin(ons_lpa_gpd["LPA23CD"])
# `name` is renamed so it cannot clash with organisation names joined on later
lpa_gdf = lpa_gdf.rename(columns={"name": "lpa_name"})

print(lpa_gdf.shape[0])
lpa_gdf.head()

## Analysis

#### Check old LPAs

In [None]:
# How many of the organisations supplying CA data are old vs. current LPAs?

# Distinct supplying organisations, leaving out entity 16 (Historic England),
# with the organisation table joined on to get each one's LPA code
not_historic_england = ca_df["organisation_entity"] != 16
supplier_ids = ca_df.loc[not_historic_england, ["organisation_entity"]].drop_duplicates()
ca_suppliers = supplier_ids.merge(
    org_df[["organisation_entity", "name", "local_planning_authority"]],
    on="organisation_entity",
    how="left",
)

# 1 when the supplier's LPA code is in the ONS 2023 list, otherwise 0
ca_suppliers["lpa_2023"] = (
    ca_suppliers["local_planning_authority"].isin(ons_lpa_gpd["LPA23CD"]).astype(int)
)

# Suppliers outside the 2023 code list - these may be areas with mis-matches
ca_suppliers[ca_suppliers["lpa_2023"] == 0]

In [None]:
# Counted areas whose LPA code is missing from the 2023 code list. In most cases
# this is because the data is still supplied split up by the now-retired orgs.
counted_in_2023_codes = con_count_lpa_df["local_planning_authority"].isin(ons_lpa_gpd["LPA23CD"])
con_count_lpa_df[~counted_in_2023_codes].head()

### Spatial joining

In [None]:
# Spatially match every LPA boundary to the conservation-area points that
# intersect it. The left join keeps LPAs with no points at all.
lpa_layer = lpa_gdf[["reference", "lpa_name", "lpa_2023", "geometry"]]
ca_points = ca_gdf[["entity", "organisation_entity", "point"]]
lpa_with_cas = gpd.sjoin(lpa_layer, ca_points, how="left", predicate="intersects")

# Bring in the name of the organisation that supplied each matched area
lpa_ca_join = lpa_with_cas.merge(
    org_df[["organisation_entity", "name"]],
    on="organisation_entity",
    how="left",
)

# Cast to str so every value (including missing ones) can be string-joined later
lpa_ca_join["name"] = lpa_ca_join["name"].astype(str)

print(lpa_ca_join.shape[0])
lpa_ca_join.head()


In [None]:
# Rank the supplier of every matched row: 1 = any organisation other than
# Historic England, 2 = Historic England (entity 16), 3 = no supplier at all.
# The minimum rank per LPA is taken later, so an area supplied by both an LPA
# and Historic England is counted as LPA-provided.
supplier_id = lpa_ca_join["organisation_entity"]
from_historic_england = supplier_id == 16
from_other_org = supplier_id.notnull() & ~from_historic_england

lpa_ca_join["org_type_rank"] = np.where(
    from_other_org, 1, np.where(from_historic_england, 2, 3)
)

lpa_ca_join.head()

In [None]:
# Count conservation areas per LPA, then join on the manual counts.
# Per LPA: number of matched entities, the distinct supplier names and the best
# (lowest) supplier rank. Supplier names are sorted before joining so the text
# is identical on every run; iterating a bare set gives an arbitrary order.
lpa_ca_join_count = (
    lpa_ca_join
    .groupby(["reference", "lpa_name", "lpa_2023"])
    .agg(
        {
            "entity": "count",
            "name": lambda names: ", ".join(sorted(set(names))),
            "org_type_rank": "min",
        }
    )
    .reset_index()
    .merge(
        con_count_lpa_df[["local_planning_authority", "name", "conservation_area_count"]],
        how="left",
        left_on="reference",
        right_on="local_planning_authority",
    )
    # both sides carry a `name` column, so the merge suffixes them _x / _y
    .rename(
        columns={
            "entity": "count_platform",
            "name_x": "platform_data_providers",
            "conservation_area_count": "count_manual",
            "name_y": "lpa_name_manual",
        }
    )
)

# Relative difference of the platform count from the manual count
# NOTE(review): a manual count of 0 makes this inf (or NaN for 0/0) - confirm
# whether such rows can occur.
lpa_ca_join_count["count_delta"] = (
    lpa_ca_join_count["count_platform"] - lpa_ca_join_count["count_manual"]
) / lpa_ca_join_count["count_manual"]
lpa_ca_join_count["count_delta_abs"] = lpa_ca_join_count["count_delta"].abs()

# Use the org type rank to flag the best provider for an area
lpa_ca_join_count["provider_org_type"] = lpa_ca_join_count["org_type_rank"].map(
    {1: "LPA", 2: "Historic England", 3: "None"}
)

lpa_ca_join_count.to_csv(
    os.path.join(output_dir, "LPA_conservation_area_count_comparison.csv"),
    index=False,
)

lpa_ca_join_count.head()

#### Summary figures

In [None]:
# Headline figures, restricted to LPAs that exist in the 2023 code list
count_23 = lpa_ca_join_count.loc[lpa_ca_join_count["lpa_2023"].eq(True)]

platform_counts_23 = count_23["count_platform"]
count_delta_23 = count_23["count_delta"]

n_lpas_total = len(count_23)
n_lpas_on_pdp = int((platform_counts_23 > 0).sum())
n_lpas_not_on_pdp = int((platform_counts_23 == 0).sum())
n_perfect_matches = int((count_delta_23 == 0).sum())
n_within_10_pct = int((count_delta_23.abs() <= .1).sum())

print(f"Total LPAs (2023 boundaries): {n_lpas_total}")
print(f"n LPAs with CA data on the site: {n_lpas_on_pdp} ({n_lpas_on_pdp/n_lpas_total:.0%} pct of total LPAs)")
print(f"n LPAs without CA data on the site: {n_lpas_not_on_pdp} ({n_lpas_not_on_pdp/n_lpas_total:.0%} pct of total LPAs)")

print()

# Match rates are expressed against LPAs that have data on the site
print(f"n LPAs where count of CAs for site and manual check matches exactly: {n_perfect_matches} ({n_perfect_matches/n_lpas_on_pdp:.0%} pct of LPAs with data on the site)")
print(f"n LPAs where count of CAs for site and manual check is within +/- 10%: {n_within_10_pct} ({n_within_10_pct/n_lpas_on_pdp:.0%}  pct of LPAs with data on the site)")

print()
print()

# Projection: assume LPAs with no data hold the same mean number of CAs as
# the LPAs that already have data
n_cas = platform_counts_23.sum()
mean_ca_per_lpa = n_cas / n_lpas_on_pdp
projected_missing = mean_ca_per_lpa * n_lpas_not_on_pdp

print(f"Total CAs on site (within 2023 LPA boundaries): {n_cas:,}")
print(f"mean no. of CAs per LPA on site: {mean_ca_per_lpa:.3g}")
print(f"projected CAs still to add: ~{projected_missing:,.0f}")





#### Get single LPA layers

Where we have manual CA counts from organisations which are now technically "retired" LPAs (i.e. replaced by a newer LPA), it indicates that the data is still divided up and provided by these historic orgs. In these cases we don't want to show the new 2023 LPA on the map at the same time: it overlaps the retired boundaries, which is confusing, and we haven't technically collected data from the new org.

So we want to find the new 2023 LPAs which sit over retired LPAs that have supplied us with data, so we can remove them from the map and get a single contiguous layer which is a mix of historic and current LPA boundaries.

In [None]:
# Attach the per-LPA counts to the boundary geometries so every LPA can be mapped
lpa_boundaries = lpa_gdf[["reference", "geometry"]]
lpa_ca_join_count_gdf = lpa_boundaries.merge(lpa_ca_join_count, on="reference", how="left")


In [None]:
# lpa_ca_join_count_gdf[(lpa_ca_join_count_gdf["lpa_2023"] == False) & (lpa_ca_join_count_gdf["count_manual"].notnull())]

In [None]:
# old lpas = boundaries that are not in the 2023 list but do have a manual count
is_retired_lpa = lpa_ca_join_count_gdf["lpa_2023"] == False
is_current_lpa = lpa_ca_join_count_gdf["lpa_2023"] == True
has_manual_count = lpa_ca_join_count_gdf["count_manual"].notnull()

old_lpas = lpa_ca_join_count_gdf[is_retired_lpa & has_manual_count]

# Grow the 2023 boundaries by 100 CRS units (metres in EPSG:27700) so an old
# boundary still counts as "within" its successor when the edges differ slightly
buffered_new_lpas = lpa_ca_join_count_gdf.loc[
    is_current_lpa, ["reference", "lpa_name", "geometry"]
].copy()
buffered_new_lpas["geometry"] = buffered_new_lpas["geometry"].buffer(100)

# One row per (old LPA, containing 2023 LPA) pair; the 2023 LPA's columns get
# the `_right` suffix
new_lpas = gpd.sjoin(
    old_lpas[["reference", "lpa_name", "lpa_2023", "geometry"]],
    buffered_new_lpas,
    how="inner",
    predicate="within",
)

In [None]:
# References to show (retired LPAs with manual counts) and to hide (the 2023
# LPAs that sit over them)
old_lpas_incl_list = old_lpas["reference"].unique()
new_lpas_excl_list = new_lpas["reference_right"].unique()

lpa_reference = lpa_ca_join_count_gdf["reference"]

# is in the old include list - show
shown_as_old = lpa_reference.isin(old_lpas_incl_list)
# is a 2023 LPA and not in the new exclude list - show
shown_as_new = (lpa_ca_join_count_gdf["lpa_2023"] == True) & ~lpa_reference.isin(new_lpas_excl_list)

# everything else stays hidden
lpa_ca_join_count_gdf["old_lpa_combo_display"] = (shown_as_old | shown_as_new).to_numpy()


In [None]:
# test whether new flag gives consistent single layer

# lpa_ca_join_count_gdf[lpa_ca_join_count_gdf["old_lpa_combo_display"] == True].explore(
#     color = "blue",
#     tooltip = False,
#     # popup = ["name", "LPACD"],
#         style_kwds = {
#         "fillOpacity" : "0.1"
#         }
# )

In [None]:
# show both on a map

# old_lpas_list = old_lpas["reference"].drop_duplicates().values
# new_lpas_list = new_lpas["reference_right"].drop_duplicates().values


# map_entities = lpa_ca_join_count_gdf[lpa_ca_join_count_gdf["reference"].isin(old_lpas_list)].explore(
#     color = "red",
#     tooltip = False,
#     # popup = ["organisation_name", "entity", "name", "reference"],
#     tiles = "CartoDB positron",  # use "CartoDB positron" tiles
#     # highlight = False,
#     style_kwds = {
#     "fillOpacity" : "0.1"
#     }
# )

# buffered_new_lpas[buffered_new_lpas["reference"].isin(new_lpas_list)].explore(
#     m = map_entities,
#     color = "blue",
#     tooltip = False,
#     # popup = ["name", "LPACD"],
#         style_kwds = {
#         "fillOpacity" : "0"
#         }
# )

## Map

In [None]:
def org_type_colormap(value):  # scalar value defined in 'column'
    """Return the hex colour for a provider type, or "grey" for anything else."""
    known_colours = (
        ("LPA", "#28A197"),
        ("Historic England", "#12436D"),
    )
    for provider_type, colour in known_colours:
        if value == provider_type:
            return colour
    return "grey"

# Map of the combined (current + retired) LPA layer, coloured by who supplied
# the conservation-area data.
# The tooltip must name existing columns: the provider list column is
# "platform_data_providers".
# NOTE(review): the colours come from the `cmap` list below, not from
# org_type_colormap - confirm which palette is intended.
lpa_ca_join_count_gdf[lpa_ca_join_count_gdf["old_lpa_combo_display"] == True].explore(
    tiles = "CartoDB positron",
    column = "provider_org_type", 
    tooltip = ["reference", "lpa_name", "platform_data_providers"],
    cmap = ["#1d70b8", "#28A197", "white"],
    style_kwds=dict(color="black", weight = 1))

In [None]:
# Scratch preview of the count_delta bin edges: 0.1-wide steps from -0.5 to
# 0.5, open-ended at both extremes. The value is only displayed, not stored.
[-np.inf, -0.5, -0.4, -0.3, -0.2, -0.1, 0, 0.1, 0.2, 0.3, 0.4, 0.5, np.inf]

In [None]:
# Bands for the relative count difference: 10-point bands between -50% and
# +50%, plus an open-ended band at either extreme. (An earlier scheme with
# 20%-wide bands from -100% upwards was replaced by this one.)
inner_edges = [-0.5, -0.4, -0.3, -0.2, -0.1, 0, 0.1, 0.2, 0.3, 0.4, 0.5]
bins = [-np.inf, *inner_edges, np.inf]
labels = [
    "< -50%",
    "-50% : -40%",
    "-40% : -30%",
    "-30% : -20%",
    "-20% : -10%",
    "-10% : 0",
    "0 : +10%",
    "+10% : +20%",
    "+20% : +30%",
    "+30% : +40%",
    "+40% : +50%",
    "> +50% ",
]

# Sanity check: there must be exactly one label per interval, i.e. one fewer
# label than bin edges
print(len(bins))
print(len(labels))
print(bins)

lpa_ca_join_count_gdf["count_delta_bins"] = pd.cut(
    lpa_ca_join_count_gdf["count_delta"],
    bins=bins,
    labels=labels,
)

lpa_ca_join_count_gdf.head()

In [None]:
# Number of LPAs in the combined display layer
int(lpa_ca_join_count_gdf["old_lpa_combo_display"].eq(True).sum())

In [None]:
# Displayed LPAs that have platform data whose count differs from the manual one
# NOTE(review): rows with no manual count also satisfy `!=` and are therefore
# kept here - confirm that is intended.
in_display_layer = lpa_ca_join_count_gdf["old_lpa_combo_display"] == True
has_platform_data = lpa_ca_join_count_gdf["count_platform"] > 0
counts_differ = lpa_ca_join_count_gdf["count_platform"] != lpa_ca_join_count_gdf["count_manual"]

count_comp_gdf = lpa_ca_join_count_gdf[in_display_layer & has_platform_data & counts_differ].copy()

print(count_comp_gdf.shape[0])

In [None]:
# Displayed LPAs where the platform count equals the manual count, labelled
# "match" for use in the map tooltip
in_display_layer = lpa_ca_join_count_gdf["old_lpa_combo_display"] == True
counts_equal = lpa_ca_join_count_gdf["count_platform"] == lpa_ca_join_count_gdf["count_manual"]

count_comp_equal_gdf = lpa_ca_join_count_gdf[in_display_layer & counts_equal].assign(
    count_comparison="match"
)

print(count_comp_equal_gdf.shape[0])

In [None]:
# Peek at the first rows of the exact-match layer
count_comp_equal_gdf.head(5)

In [None]:
# Map of the areas where the site count and the manual count agree

match_tooltip = [
    "reference",
    "lpa_name",
    "platform_data_providers",
    "count_platform",
    "count_manual",
    "count_comparison",
]

count_comp_equal_gdf.explore(
    tiles="CartoDB positron",
    color="green",
    tooltip=match_tooltip,
    style_kwds={"color": "black", "weight": 1, "fillOpacity": 0.3},
)


In [None]:
# Map of the areas where the site count and the manual CA count differ,
# shaded by the band the relative difference falls in

mismatch_tooltip = [
    "reference",
    "lpa_name",
    "platform_data_providers",
    "count_platform",
    "count_manual",
    "count_delta",
    "count_delta_bins",
]

count_comp_gdf.explore(
    tiles="CartoDB positron",
    column="count_delta_bins",
    tooltip=mismatch_tooltip,
    cmap="coolwarm",
    style_kwds={"color": "black", "weight": 1},
)

In [None]:
# Headline figures for the combined display layer (2023 LPAs plus the retired
# LPAs that replace them on the map). The labels name this layer explicitly,
# because these rows are not limited to 2023 boundaries.
count_mixed = lpa_ca_join_count_gdf[lpa_ca_join_count_gdf["old_lpa_combo_display"] == True]

platform_counts_mixed = count_mixed["count_platform"]
count_delta_mixed = count_mixed["count_delta"]

n_lpas_total = len(count_mixed)
n_lpas_on_pdp = int((platform_counts_mixed > 0).sum())
n_lpas_not_on_pdp = int((platform_counts_mixed == 0).sum())
n_perfect_matches = int((count_delta_mixed == 0).sum())
n_within_10_pct = int((count_delta_mixed.abs() <= .1).sum())

print(f"Total LPAs (combined current/retired boundary layer): {n_lpas_total}")
print(f"n LPAs with CA data on the site: {n_lpas_on_pdp} ({n_lpas_on_pdp/n_lpas_total:.0%} pct of total LPAs)")
print(f"n LPAs without CA data on the site: {n_lpas_not_on_pdp} ({n_lpas_not_on_pdp/n_lpas_total:.0%} pct of total LPAs)")

print("")

# Match rates are expressed against LPAs that have data on the site
print(f"n LPAs where count of CAs for site and manual check matches exactly: {n_perfect_matches} ({n_perfect_matches/n_lpas_on_pdp:.0%} pct of LPAs with data on the site)")
print(f"n LPAs where count of CAs for site and manual check is within +/- 10%: {n_within_10_pct} ({n_within_10_pct/n_lpas_on_pdp:.0%}  pct of LPAs with data on the site)")

print("")
print("")

# Projection: assume LPAs with no data hold the same mean number of CAs as
# the LPAs that already have data
n_cas = platform_counts_mixed.sum()
mean_ca_per_lpa = n_cas / n_lpas_on_pdp
projected_missing = mean_ca_per_lpa * n_lpas_not_on_pdp

print(f"Total CAs on site (within the combined current/retired LPA layer): {n_cas:,}")
print(f"mean no. of CAs per LPA on site: {mean_ca_per_lpa:.3g}")
print(f"projected CAs still to add: ~{projected_missing:,.0f}")


In [None]:
# Spot check: the row(s) for a single LPA reference
lpa_ca_join_count_gdf.loc[lpa_ca_join_count_gdf["reference"].eq("E60000328")]