# Conservation area overlaps report
**Author**:  Greg Slater <br>
**Date**:  6th August 2024 <br>
**Dataset Scope**: conservation-area <br>
**Report Type**: Ad-hoc <br>

## Purpose
This report helps investigate issues where LPA conservation area entities are overlapping with Historic England conservation area entities, by showing issues on a map.

Run the whole notebook and use the drop-down in the bottom cell to select an LPA. By default all conservation area entities within the LPA will be displayed, but there is a toggle button which can be used to show only those with overlap issues.

Overlap issues are defined as where there is some overlap between one or more LPA and Historic England entities. They are classified using the following definitions (note that a single entity can have multiple overlap issues if it overlaps with more than one other entity):

* **> 90% combined match** (high priority): 90% or more of each entity's area overlaps with the other - this suggests the boundaries of each almost perfectly match. Entities with a combined match can usually be merged.
* **> 90% single match** (medium priority): 90% or more of one entity's area overlaps with the other - this suggests the entities overlap but the boundaries don't closely match (one may be much larger than the other, for instance). These may need further investigation or escalation with the LPA.
* **unclassified** (low priority): the two entities overlap somewhat, but the overlapped area makes up less than 90% of each entity's area. These may need further investigation or escalation with the LPA.
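* **edge intersection** (low priority): the overlapped area makes up between 1% and 10% of the smaller entity's area (so no more than 10% of either entity). These may not be a serious issue, but may still be raised with the LPA. Overlaps covering less than 1% of the smaller entity's area are treated as tiny edges and ignored.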

In [None]:
# if running on Colab, uncomment and run this line below too:
# !pip install mapclassify

In [199]:
import pandas as pd
import geopandas as gpd
import shapely.wkt
import urllib
import numpy as np
import os
import folium
import ipywidgets as widgets

# Folder for notebook outputs, created if it does not already exist.
# NOTE(review): no cell shown in this notebook writes to it - confirm it is still needed.
output_dir = "output/"
os.makedirs(output_dir, exist_ok=True)

In [214]:
def get_all_organisations():
    """Fetch the organisation lookup from the planning.data.gov.uk datasette.

    Queries the ``organisation`` table of the ``digital-land`` database through
    the CSV endpoint, requesting the maximum page size (``_size=max``).

    Returns:
        pandas.DataFrame with the columns ``organisation_entity``,
        ``organisation_name``, ``organisation``, ``dataset`` and
        ``local_planning_authority``.
    """
    sql = """
        select entity as organisation_entity, name as organisation_name, organisation, dataset, local_planning_authority
        from organisation
        """
    query_string = urllib.parse.urlencode({"sql": sql, "_size": "max"})
    return pd.read_csv(f"https://datasette.planning.data.gov.uk/digital-land.csv?{query_string}")

def get_pdp_geo_dataset(dataset, underscore_cols=True, crs_out=27700):
    """Download a planning.data.gov.uk dataset as GeoJSON and reproject it.

    Args:
        dataset: dataset name used to build the ``files.planning.data.gov.uk`` URL.
        underscore_cols: when true, hyphens in column names are replaced by underscores.
        crs_out: EPSG code the returned frame is converted to.

    Returns:
        geopandas.GeoDataFrame of the dataset in the ``crs_out`` coordinate system.
    """
    gdf = gpd.read_file(f"https://files.planning.data.gov.uk/dataset/{dataset}.geojson")

    if underscore_cols:
        gdf.columns = [col.replace("-", "_") for col in gdf.columns]

    # declare the downloaded coordinates as EPSG:4326, then convert to the requested CRS
    gdf = gdf.set_crs(epsg=4326)
    gdf = gdf.to_crs(epsg=crs_out)
    return gdf

def get_provisions():
    """Fetch the "expected" open-digital-planning provisions from the datasette.

    Queries the ``provision`` table of the ``digital-land`` database through the
    CSV endpoint, requesting the maximum page size (``_size=max``).

    Returns:
        pandas.DataFrame with the columns ``cohort``, ``notes``, ``organisation``,
        ``project``, ``provision_reason`` and ``start_date``, grouped to one row
        per organisation and ordered by cohort.

    Side effects:
        The result is also stored in the notebook-level name ``provisions_df``.
    """
    # Kept so existing callers that rely on the global still work; new code should
    # use the return value instead.
    global provisions_df

    # String values are single-quoted: SQL reads double-quoted tokens as identifiers,
    # and SQLite only falls back to treating them as strings as a legacy quirk.
    # GROUP BY leaves the non-grouped columns to whichever row SQLite picks per
    # organisation.
    sql = """
            SELECT
                cohort, notes, organisation, project, provision_reason, start_date
            FROM
                provision
            WHERE
                provision_reason = 'expected'
                AND project = 'open-digital-planning'
            GROUP BY organisation
            ORDER BY cohort
        """
    params = urllib.parse.urlencode({"sql": sql, "_size": "max"})
    url = f"https://datasette.planning.data.gov.uk/digital-land.csv?{params}"
    provisions_df = pd.read_csv(url)
    return provisions_df

def get_old_entity(collection_name):
    """Fetch the whole ``old_entity`` table of one datasette database.

    Args:
        collection_name: name of the datasette database to query; it is placed
            in the CSV endpoint URL.

    Returns:
        pandas.DataFrame with every column of ``old_entity``, requested with the
        maximum page size (``_size=max``).
    """
    sql = """
        select *
        from old_entity
        """
    query_string = urllib.parse.urlencode({"sql": sql, "_size": "max"})
    return pd.read_csv(f"https://datasette.planning.data.gov.uk/{collection_name}.csv?{query_string}")

In [201]:
def display_lpa_issues(lpa_name, show_only_issues = False):
    """Print an issue summary for one LPA and build a map of its conservation areas.

    Reads the notebook-level ``lpa_ca_all`` table.

    Args:
        lpa_name: value of the ``lpa_name`` column to select.
        show_only_issues: when true, only rows flagged ``has_overlap_issues`` are kept.

    Returns:
        The map returned by ``explore`` holding an "LPA entities" layer and a
        "Historic England entities" layer, or None when either group has no rows.
    """
    row_filter = lpa_ca_all["lpa_name"] == lpa_name
    if show_only_issues:
        row_filter = row_filter & lpa_ca_all["has_overlap_issues"]
    map_gdf = lpa_ca_all[row_filter].copy()

    # number of selected rows per overlap issue label (rows without a label are kept)
    issue_summary = map_gdf.groupby(["lpa_name", "overlap_issue_types"], dropna=False).size()
    print(issue_summary.reset_index(name = "n_instances"))

    # organisation entity 16 is Historic England; every other row goes in the LPA layer
    ents_he = map_gdf[map_gdf["organisation_entity"] == 16]
    ents_lpa = map_gdf[map_gdf["organisation_entity"] != 16]

    # both layers are needed for the map
    if len(ents_lpa) == 0 or len(ents_he) == 0:
        print("")
        print("No entities for one of the organisations, not possible to display map")
        return None

    tooltip_fields = (
        "organisation_name", "entity", "reference", "name",
        "has_overlap_issues", "n_overlap_issues", "overlap_issue_types",
    )

    # the LPA layer creates the map ...
    m = ents_lpa.explore(
        column = "organisation_name",
        cmap = "Accent",
        tiles = "CartoDB positron",
        highlight = True,
        tooltip = list(tooltip_fields),
        style_kwds = {"fillOpacity" : "0.1"},
        name = "LPA entities")

    # ... and the Historic England layer is drawn onto it in a single fixed colour
    ents_he.explore(
        m = m,
        column = "organisation_name",
        cmap = ["#bf5b16"],
        highlight = True,
        tooltip = list(tooltip_fields),
        style_kwds = {"fillOpacity" : "0.1"},
        name = "Historic England entities")

    # layer control lets the two layers be toggled on the map
    folium.LayerControl(show = True).add_to(m)

    return m


## Data in

In [202]:
# organisations, flagged when they appear in the expected ODP provisions
provisions_df = get_provisions()

org_df = get_all_organisations()
org_df = org_df.assign(odp_flag=org_df["organisation"].isin(provisions_df["organisation"]))
print(len(org_df))

452


In [203]:
# Conservation areas (CA) from the planning data platform CSV, limited to the columns listed in usecols
ca_df = pd.read_csv("https://files.planning.data.gov.uk/dataset/conservation-area.csv",
                            usecols = ["entity", "name", "organisation-entity", "reference", "entry-date", "point", "geometry"])

# hyphenated column names -> underscores
ca_df.columns = [x.replace("-", "_") for x in ca_df.columns]

# join organisation name from the organisation lookup
ca_df = ca_df.merge(
    org_df[["organisation_entity", "organisation_name"]], 
    how = "left",
    on = "organisation_entity")

# load to gdf, both point and poly versions (WKT text is parsed into shapely geometries first)
# NOTE(review): both GeoDataFrames are built from the same ca_df while it is being modified,
# so the statement order matters here - confirm before reordering.
ca_df["point"] = ca_df["point"].apply(shapely.wkt.loads)
ca_point_gdf = gpd.GeoDataFrame(ca_df, geometry="point")

ca_df["geometry"] = ca_df["geometry"].apply(shapely.wkt.loads)
ca_poly_gdf = gpd.GeoDataFrame(ca_df, geometry="geometry")

# Transform from EPSG:4326 to EPSG:27700 for more interpretable area units
ca_point_gdf.set_crs(epsg=4326, inplace=True)
ca_point_gdf.to_crs(epsg=27700, inplace=True)

ca_poly_gdf.set_crs(epsg=4326, inplace=True)
ca_poly_gdf.to_crs(epsg=27700, inplace=True)

print(len(ca_poly_gdf))

8407


In [204]:
# LPA boundaries from the PDP site, with name / reference renamed to lpa_name / lpa_reference
lpa_gdf = get_pdp_geo_dataset("local-planning-authority").rename(
    columns={"name": "lpa_name", "reference": "lpa_reference"}
)

# an LPA is flagged when its reference is the local_planning_authority of an ODP-flagged organisation
lpa_gdf["ODP_flag"] = lpa_gdf["lpa_reference"].isin(
    org_df.loc[org_df["odp_flag"], "local_planning_authority"]
)
print(len(lpa_gdf))

337


In [205]:
# NOTE(review): disabled cell - old_entity_df is not referenced by any other cell shown in this notebook.
# old_entity_df = get_old_entity("conservation-area")
# print(len(old_entity_df))

In [206]:
# NOTE(review): disabled cell - ca_lookup is not referenced by any other cell shown in this notebook.
# ca_lookup = pd.read_csv("https://raw.githubusercontent.com/digital-land/config/main/pipeline/conservation-area/lookup.csv")
# ca_lookup.columns = [x.replace("-", "_") for x in ca_lookup.columns]
# print(len(ca_lookup))

## Calculations
### Spatial joining - LPA boundaries to conservation area points

In [207]:
# Spatially join each LPA boundary to the conservation area points that intersect it.
# The entity and supplying-organisation columns of each point come along with the join;
# how="left" keeps every LPA row.
lpa_ca_join = lpa_gdf[["lpa_reference", "lpa_name", "ODP_flag", "geometry"]].sjoin(
    ca_point_gdf[["entity", "organisation_entity", "organisation_name", "point"]],
    how="left",
    predicate="intersects",
)

print(len(lpa_ca_join))


9275


### Geographical duplicate identification

In [208]:
# Thresholds on the share of an entity's area covered by an overlap
MATCH_LOWER_THRESH = 0.9   # at or above this share, the overlap counts as a match
EDGE_UPPER_THRESH = 0.1    # at or below this share, the overlap counts as an edge intersection
EDGE_LOWER_THRESH = 0.01   # below this share, the overlap is a tiny edge and gets ignored

# area of every conservation area polygon
ca_poly_gdf["area"] = ca_poly_gdf["geometry"].area

# Intersect the polygon layer with itself (columns of the two sides get _1 / _2 suffixes)
# and drop the rows where an entity is paired with itself. Mirror pairs (A-B and B-A)
# are not removed here.
entity_join_all = gpd.overlay(
    ca_poly_gdf, ca_poly_gdf, how="intersection", keep_geom_type=False
).loc[lambda pairs: pairs["entity_1"] != pairs["entity_2"]]

# Organisation-level flags for each pair (organisation entity 16 is Historic England)
entity_join_all = entity_join_all.assign(
    # do both entities come from the same organisation?
    int_org_match=lambda pairs: pairs["organisation_entity_1"] == pairs["organisation_entity_2"],
    # which kinds of organisation are paired
    int_org_types=lambda pairs: np.select(
        [
            (pairs["organisation_entity_1"] == 16) & (pairs["organisation_entity_2"] == 16),
            (pairs["organisation_entity_1"] != 16) & (pairs["organisation_entity_2"] != 16),
            (pairs["organisation_entity_1"] == 16) ^ (pairs["organisation_entity_2"] == 16),
        ],
        ["HE - HE", "LPA - LPA", "HE - other"],
        default="-",
    ),
    # do the entity entry dates match?
    date_match=lambda pairs: pairs["entry_date_1"] == pairs["entry_date_2"],
)

# Overlap shares: of entity 1 (p_), of entity 2 (s_), of the union of both,
# and of whichever of the two entities is smaller
entity_join_all = entity_join_all.assign(
    area_intersection=lambda pairs: pairs["geometry"].area,
    p_pct_intersect=lambda pairs: pairs["area_intersection"] / pairs["area_1"],
    pct_intersection=lambda pairs: pairs["area_intersection"]
    / (pairs["area_1"] + pairs["area_2"] - pairs["area_intersection"]),
    s_pct_intersect=lambda pairs: pairs["area_intersection"] / pairs["area_2"],
    pct_min_intersection=lambda pairs: pairs["area_intersection"]
    / pairs[["area_1", "area_2"]].min(axis=1),
)

# Classify each pair; the first condition that holds wins, anything left is "unclassified"
entity_join_all = entity_join_all.assign(
    intersection_type=lambda pairs: np.select(
        [
            (pairs["p_pct_intersect"] >= MATCH_LOWER_THRESH) & (pairs["s_pct_intersect"] >= MATCH_LOWER_THRESH),
            (pairs["pct_min_intersection"] <= EDGE_UPPER_THRESH) & (pairs["pct_min_intersection"] >= EDGE_LOWER_THRESH),
            pairs["pct_min_intersection"] < EDGE_LOWER_THRESH,
            (pairs["p_pct_intersect"] >= MATCH_LOWER_THRESH) | (pairs["s_pct_intersect"] >= MATCH_LOWER_THRESH),
        ],
        ["> 90% combined match", "edge intersection", "tiny edge - ignore", "> 90% single match"],
        default="unclassified",
    )
)

print(len(entity_join_all))

4110


In [209]:
# SUMMARISE OVERLAP ISSUES

# keep only the overlaps between a Historic England entity and another organisation's
# entity, leaving out the "tiny edge - ignore" class
ca_issues_he_lpa = entity_join_all[
    (entity_join_all["int_org_types"] == "HE - other") &
    (entity_join_all["intersection_type"] != "tiny edge - ignore")
]

# One row per entity: how many entities it overlaps and which distinct issue types.
# The types are sorted before joining so that the same combination always produces the
# same label; joining an unsorted set gives an arbitrary order, which can split one
# combination into several labels when they are grouped on later.
ca_issues_he_lpa_count = ca_issues_he_lpa.groupby(
        ["entity_1"]
    ).agg(
        {"entity_2" : "count",
         "intersection_type" : lambda x: ', '.join(sorted(set(x)))}
    ).reset_index()

# rename cols to the names used in the combined LPA table
ca_issues_he_lpa_count.rename(columns=
    {"entity_1":"entity", 
     "entity_2":"n_overlap_issues", 
     "intersection_type" : "overlap_issue_types"}, inplace = True)

In [210]:
# One row per overlapping pair: collapse the two mirrored rows of each issue
overlap_issues_dist = ca_issues_he_lpa.copy()

# order-independent key for the pair: the two entity values, sorted, joined with "-"
overlap_issues_dist["entity_join"] = overlap_issues_dist.apply(
    lambda row: "-".join(str(ent) for ent in sorted(row[["entity_1", "entity_2"]])),
    axis=1,
)

# rows whose first organisation is Historic England sort last ("Z"), so the row kept
# for each pair shows Historic England as organisation 2
overlap_issues_dist["name_for_sort"] = np.where(overlap_issues_dist["organisation_entity_1"] == 16, "Z", "A")
overlap_issues_dist = (
    overlap_issues_dist
    .sort_values(["entity_join", "name_for_sort"], ascending=True)
    .drop_duplicates(subset="entity_join")
)

# entities that take part in more than one of the remaining pairs
all_ents = pd.concat([overlap_issues_dist["entity_1"], overlap_issues_dist["entity_2"]], ignore_index = True)
multi_issue_ents = all_ents[all_ents.duplicated()]

overlap_issues_dist["multi_issue_entities"] = (
    overlap_issues_dist["entity_1"].isin(multi_issue_ents)
    | overlap_issues_dist["entity_2"].isin(multi_issue_ents)
)

# action: a combined match where neither entity has other issues can be remapped,
# everything else needs investigating
overlap_issues_dist["action"] = np.where(
    (overlap_issues_dist["intersection_type"] == "> 90% combined match")
    & ~overlap_issues_dist["multi_issue_entities"],
    "remap",
    "investigate",
)

print(len(overlap_issues_dist))

309


### Combine

In [211]:
# LPA / conservation area pairs -> conservation area polygon and attributes -> overlap issue summary

print(len(lpa_ca_join))

# attribute columns of the LPA / CA point join (its geometry is left behind)
lpa_ca_all = lpa_ca_join[["lpa_reference", "lpa_name", "ODP_flag", "entity", "organisation_entity", "organisation_name"]]

# polygon, reference and name of each conservation area
lpa_ca_all = lpa_ca_all.merge(
    ca_poly_gdf[["entity", "reference", "name", "geometry"]],
    on = "entity",
    how = "left",
)

# per-entity issue count and issue types; entities without issues get empty values
lpa_ca_all = lpa_ca_all.merge(ca_issues_he_lpa_count, on = "entity", how = "left")

lpa_ca_all["has_overlap_issues"] = lpa_ca_all["n_overlap_issues"].notna()
lpa_ca_all = gpd.GeoDataFrame(lpa_ca_all, geometry="geometry")

# the row count should match the one printed above
print(len(lpa_ca_all))
lpa_ca_all.head()

9275
9275


Unnamed: 0,lpa_reference,lpa_name,ODP_flag,entity,organisation_entity,organisation_name,reference,name,geometry,n_overlap_issues,overlap_issue_types,has_overlap_issues
0,E60000001,County Durham LPA,False,44006437.0,16.0,Historic England,5591,Gainford,"MULTIPOLYGON (((417192.759 517310.462, 417146....",,,False
1,E60000001,County Durham LPA,False,44000619.0,16.0,Historic England,5598,Little Newsham,"MULTIPOLYGON (((412216.036 517667.103, 412227....",,,False
2,E60000001,County Durham LPA,False,44000618.0,16.0,Historic England,5586,Cleatlam,"MULTIPOLYGON (((411693.930 518512.004, 411721....",,,False
3,E60000001,County Durham LPA,False,44006434.0,16.0,Historic England,5596,Langton,"MULTIPOLYGON (((417352.476 519332.851, 417352....",,,False
4,E60000001,County Durham LPA,False,44000615.0,16.0,Historic England,5595,Ingleton,"MULTIPOLYGON (((417377.325 520414.003, 417376....",,,False


In [212]:
# rows with overlap issues in ODP-flagged LPAs, counted per LPA, largest count first
ODP_overlap_issues_count = (
    lpa_ca_all
    .loc[lpa_ca_all["has_overlap_issues"] & lpa_ca_all["ODP_flag"]]
    .groupby("lpa_name")
    .size()
    .reset_index(name = "count")
    .sort_values("count", ascending=False)
)

print("ODP LPAs by number of overlapping issues")
ODP_overlap_issues_count

ODP LPAs by number of overlapping issues


Unnamed: 0,lpa_name,count
9,Southwark LPA,96
5,North Somerset LPA,70
2,East Cambridgeshire LPA,56
10,St Albans LPA,27
6,Rossendale LPA,18
7,Sandwell LPA,18
3,Great Yarmouth LPA,17
12,Waverley LPA,10
1,Dorset LPA,5
0,Buckinghamshire LPA,4


## Map

In [213]:
# drop-down of the LPAs in the count table above; label and value are both the LPA name
dataset_options = {name: name for name in ODP_overlap_issues_count["lpa_name"]}

dataset_dropdown = widgets.Dropdown(description="Select LPA:", options=dataset_options)

# redraw the summary and map whenever the selection changes
widgets.interact(display_lpa_issues, lpa_name=dataset_dropdown)
initial_dataset = dataset_dropdown.value

interactive(children=(Dropdown(description='Select LPA:', options={'Southwark LPA': 'Southwark LPA', 'North So…