In [1]:
# from download_data import download_dataset
# from data import get_entity_dataset, nrow
# from plot import plot_map, plot_issues_map
import spatialite
import pandas as pd
import geopandas as gpd
import os
import itertools
import shapely.wkt
import logging

import matplotlib.pyplot as plt
import time
import urllib

import numpy as np

pd.set_option("display.max_rows", 100)


In [2]:
# if running on Colab, uncomment and run this line below too:
# !pip install mapclassify

In [3]:
# global variables
data_dir = "../../data/reports/conservation-area-duplicates/"
os.makedirs(data_dir, exist_ok=True)

### Functions

In [11]:
def nrow(df):
    return print(f"No. of records in df: {len(df):,}")


def plot_issues_map(gdf:gpd.GeoDataFrame, entity_list, chloro_var, palette):

    if type(gdf) != gpd.GeoDataFrame:
        logging.error('input is not a GeodataFrame')
    
    base = gdf[gdf["entity"].isin(entity_list)].explore(
        column = chloro_var,  # make choropleth based on "BoroName" column
        cmap = palette,
        tooltip = False,
        popup = ["organisation_name", "entity", "name", "entry_date", "reference"],
        tiles = "CartoDB positron",  # use "CartoDB positron" tiles
        highlight = True,
        style_kwds = {
        "fillOpacity" : "0.1"
        }
    )
    
    return base

def get_all_organisations():
    params = urllib.parse.urlencode({
        "sql": f"""
        select organisation, name, entity as organisation_entity, statistical_geography
        from organisation
        """,
        "_size": "max"
        })
    url = f"https://datasette.planning.data.gov.uk/digital-land.csv?{params}"
    df = pd.read_csv(url, dtype="str")
    return df


def get_old_entity(collection_name):
    params = urllib.parse.urlencode({
        "sql": f"""
        select *
        from old_entity
        """,
        "_size": "max"
        })
    url = f"https://datasette.planning.data.gov.uk/{collection_name}.csv?{params}"
    df = pd.read_csv(url)
    return df

def get_issue_entities(issues_df):

    return pd.concat([issues_df["entity_1"], issues_df["entity_2"]]).drop_duplicates().to_list()

def get_pdp_dataset(dataset, geometry_field = "geometry", crs_out=4326, underscore_cols=True):

    df = pd.read_csv(f"https://files.planning.data.gov.uk/dataset/{dataset}.csv", dtype = "str")
    df.columns = [x.replace("-", "_") for x in df.columns]

    df_valid_geom = df[df[geometry_field].notnull()].copy()

    # load geometry and create GDF
    df_valid_geom[geometry_field] = df_valid_geom[geometry_field].apply(shapely.wkt.loads)
    gdf = gpd.GeoDataFrame(df_valid_geom, geometry = geometry_field)

    # Transform to ESPG:27700 for more interpretable area units
    gdf.set_crs(epsg=4326, inplace=True)
    gdf.to_crs(epsg=crs_out, inplace=True)

    return gdf

### Data import

In [13]:
# get LAD to LPA lookup from github
lookup_lad_lpa = pd.read_csv("https://github.com/digital-land/organisation-collection/raw/main/data/local-authority.csv",
                             dtype="str",
                             usecols = ["entity", "local-authority-district", "local-planning-authority"])

lookup_lad_lpa.columns = ["organisation_entity", "LADCD", "LPACD"]

nrow(lookup_lad_lpa)
lookup_lad_lpa.head()

No. of records in df: 380


Unnamed: 0,organisation_entity,LADCD,LPACD
0,26,E07000223,E60000281
1,27,E07000026,E60000019
2,28,E07000032,E60000077
3,29,E07000224,E60000282
4,30,E07000105,E60000253


**Note on LAD to LPA mapping**   
Currently this [lookup file from github](https://github.com/digital-land/organisation-collection/raw/main/data/local-authority.csv) just records a 1:1 link between LADs and LPAs, but according to the ONS this relationship is actually 1:many. 
See [2020 lookup file](https://geoportal.statistics.gov.uk/datasets/ons::local-planning-authority-to-local-authority-district-april-2020-in-the-united-kingdom-lookup-1/about) and the example of Ryedale [`E07000167`], which is mapped to the following two LPAs:

* Ryedale LPA [`E60000061`]
* North York Moors National Park LPA [`E60000322`]

We need to agree some validation rules around this, i.e. can we expect Ryedale to submit data that might sit within either of these LPA areas, or for any London Boroughs to submit within the "London Legacy Development Corporation LPA" area?
But for simplicity's sake at the moment to get things up and running (as per Owen's advice), will test with existing 1:1 mapping and aim to develop logic once there is more clarity about multiple area handling.

The git lookup file also seems to be missing some areas, e.g. "Peak District National Park Authority" entity 405.

In [16]:
# get org data from datasette
lookup_org = get_all_organisations()

# lookup_org["organisation_entity"] = lookup_org["organisation_entity"].astype(str)
lookup_org.columns = ["organisation", "organisation_name", "organisation_entity", "statistical_geography"]

# split out org type and join on LPA codes from LAD to LPA lookup
lookup_org["organisation_type"] = lookup_org["organisation"].apply(lambda x: x.split(":")[0])
lookup_org = lookup_org.merge(lookup_lad_lpa, how = "left", on = "organisation_entity")

nrow(lookup_org)
lookup_org.head()

No. of records in df: 456


Unnamed: 0,organisation,organisation_name,organisation_entity,statistical_geography,organisation_type,LADCD,LPACD
0,development-corporation:Q20648596,Old Oak and Park Royal Development Corporation,1,E51000002,development-corporation,,
1,development-corporation:Q4916714,Birmingham Heartlands Development Corporation,2,,development-corporation,,
2,development-corporation:Q6670544,London Legacy Development Corporation,3,E51000001,development-corporation,,
3,development-corporation:Q6670837,London Thames Gateway Development Corporation,4,,development-corporation,,
4,development-corporation:Q72456968,South Tees Development Corporation,5,E51000004,development-corporation,,


In [17]:
# check what types of org are missing the LPA code
nrow(lookup_org[lookup_org["LPACD"].isnull()])
lookup_org[lookup_org["LPACD"].isnull()].groupby("organisation_type").size()

No. of records in df: 137


organisation_type
development-corporation          26
government-organisation          23
local-authority                  61
national-park-authority          10
nonprofit                         1
passenger-transport-executive     9
public-authority                  1
regional-park-authority           1
waste-authority                   5
dtype: int64

In [8]:
# LPA boundary data from planning.data.gov

LPA_boundary_df = pd.read_csv("https://files.planning.data.gov.uk/dataset/local-planning-authority.csv", 
                                  usecols = ["reference", "name", "geometry"])

LPA_boundary_df.columns = ["geometry", "name", "LPACD"]


# load geometry and create GDF
LPA_boundary_df['geometry'] = LPA_boundary_df['geometry'].apply(shapely.wkt.loads)
LPA_boundary_gdf = gpd.GeoDataFrame(LPA_boundary_df, geometry='geometry')

# Transform to ESPG:27700 for more interpretable area units
LPA_boundary_gdf.set_crs(epsg=4326, inplace=True)
LPA_boundary_gdf.to_crs(epsg=27700, inplace=True)

nrow(LPA_boundary_gdf)
LPA_boundary_gdf.head()


No. of records in df: 326


Unnamed: 0,geometry,name,LPACD
0,"MULTIPOLYGON (((428366.003 554230.393, 428288....",County Durham LPA,E60000001
1,"MULTIPOLYGON (((436388.046 522354.244, 436372....",Darlington LPA,E60000002
2,"MULTIPOLYGON (((449073.036 536806.421, 448888....",Hartlepool LPA,E60000003
3,"MULTIPOLYGON (((451894.321 521145.352, 451858....",Middlesbrough LPA,E60000004
4,"MULTIPOLYGON (((429247.025 604972.344, 429241....",Northumberland LPA,E60000005


In [23]:
entity_gdf.crs

<Geographic 2D CRS: EPSG:4326>
Name: WGS 84
Axis Info [ellipsoidal]:
- Lat[north]: Geodetic latitude (degree)
- Lon[east]: Geodetic longitude (degree)
Area of Use:
- name: World.
- bounds: (-180.0, -90.0, 180.0, 90.0)
Datum: World Geodetic System 1984 ensemble
- Ellipsoid: WGS 84
- Prime Meridian: Greenwich

In [25]:
# load conservation area entity dataset from planning.data.gov into geopandas and transform CRS to EPSG:27700

entity_gdf = get_pdp_dataset("conservation-area", crs_out=27700)
            
# join organisation name and LPA codes from lookup
entity_gdf = entity_gdf.merge(
    lookup_org[["organisation_name", "organisation_type", "organisation_entity", "LPACD"]], 
    how = "left",
    on = "organisation_entity")


# calculate area
entity_gdf["area"] = entity_gdf["geometry"].area

# flag for whether org is HE or LPA
entity_gdf["org_HE_LPA"] = np.where(entity_gdf["organisation_entity"] == '16', "Historic England", "Local Planning Authority")

nrow(entity_gdf)
entity_gdf.head()

No. of records in df: 8,611


Unnamed: 0,dataset,end_date,entity,entry_date,geojson,geometry,name,organisation_entity,point,prefix,...,document_url,documentation_url,legislation,notes,organisation,organisation_name,organisation_type,LPACD,area,org_HE_LPA
0,conservation-area,,44000001,2004-06-01,,"MULTIPOLYGON (((516981.159 204270.242, 516973....",Napsbury,278,POINT(-0.312224 51.722268),conservation-area,...,,https://stalbans.gov.uk/sites/default/files/do...,,,,St Albans City and District Council,local-authority,E60000171,495087.300218,Local Planning Authority
1,conservation-area,,44000002,2004-06-01,,"MULTIPOLYGON (((512390.333 209659.962, 512382....",Shafford Mill,278,POINT(-0.370182 51.770914),conservation-area,...,,https://stalbans.gov.uk/sites/default/files/do...,,,,St Albans City and District Council,local-authority,E60000171,136187.979619,Local Planning Authority
2,conservation-area,,44000003,2004-06-01,,"MULTIPOLYGON (((511610.510 205098.079, 511611....",Potters Crouch,278,POINT(-0.383726 51.734475),conservation-area,...,,https://stalbans.gov.uk/sites/default/files/do...,,,,St Albans City and District Council,local-authority,E60000171,34603.675292,Local Planning Authority
3,conservation-area,,44000004,2004-06-01,,"MULTIPOLYGON (((512515.275 200300.431, 512520....",Old Bricket Wood,278,POINT(-0.371260 51.690943),conservation-area,...,,https://stalbans.gov.uk/sites/default/files/do...,,,,St Albans City and District Council,local-authority,E60000171,55128.469061,Local Planning Authority
4,conservation-area,,44000005,2004-06-01,,"MULTIPOLYGON (((520248.830 206717.191, 520410....",Sleapshyde,278,POINT(-0.259167 51.747633),conservation-area,...,,https://www.stalbans.gov.uk/sites/default/file...,,,,St Albans City and District Council,local-authority,E60000171,44167.433073,Local Planning Authority


In [26]:
old_entity_df = get_old_entity("conservation-area")
old_entity_df["entity"] = old_entity_df["entity"].astype('str')
old_entity_df["old_entity"] = old_entity_df["old_entity"].astype('str')

nrow(old_entity_df)
old_entity_df.head()

No. of records in df: 947


Unnamed: 0,end_date,entity,entry_date,notes,old_entity,start_date,status
0,,44009617.0,,,44008389,,301
1,,44009617.0,,,44008390,,301
2,,44009621.0,,,44008391,,301
3,,44009621.0,,,44008392,,301
4,,44009621.0,,,44008393,,301


In [11]:
# pd.concat([old_entity_df["entity"], old_entity_df["old_entity"]], ignore_index=True).drop_duplicates()

# Checking expected bounds of data  
## Method

### Data sources
Currently, this process is using 3 data sources to carry out this check
1. the [local authority organisation table](https://raw.githubusercontent.com/digital-land/organisation-collection/main/data/local-authority.csv) on digital-land github, which contains the fields `local-authority-district`, `local-planning-authority` for each organisation.
2. the [organisation table on Datasette](https://datasette.planning.data.gov.uk/digital-land?sql=select%0D%0A++entity%2C%0D%0A++name%2C%0D%0A++organisation%2C%0D%0A++statistical_geography%0D%0Afrom%0D%0A++organisation%0D%0Aorder+by%0D%0A++organisation%0D%0Alimit%0D%0A++1000), which contains a `statistical-geography` field for each organisation.
3. the [LPA boundaries on planning.data.gov.uk](https://www.planning.data.gov.uk/dataset/local-planning-authority), which appear to be copies of the ONS boundaries.

The LPA code for organisations is picked up by joining from #2 to #1 on the `organisation-entity` field, and then joining on the geometry from #3 by joining on the `LPACD` field. Note - this method means we're not checking boundaries for non-LPA organisation. See below for which orgs aren't included.

### Geometry checks
There are two options to run this check:

1. Find entities which are **completely outside** of the expected LPA boundary, i.e. the boundary of the entity does not cross the boundary of the LPA at all.
2. Find entities which are either **completely or partially outside** of the expected LPA boundary, in which the distance outside to check for can be set. This is a more stringent check which you could use to find entities which are mostly within the LPA boundary but part of them is more than 50 metres beyond.

Make sure the version you want to use is the un-commented one in the cell three below, if using #2 set the distance using the `BOUNDARY_BUFFER_DISTANCE` variable.

In [12]:
# check of the organisations that we don't have an LPA code for
entity_df[entity_df["LPACD"].isnull()].groupby(["organisation_type", "organisation_name"]).size()

organisation_type        organisation_name                    
development-corporation  London Legacy Development Corporation       2
government-organisation  Historic England                         7032
local-authority          North Dorset District Council              37
                         Purbeck District Council                  126
national-park-authority  Peak District National Park Authority      21
dtype: int64

In [13]:
# List LPA codes from entity df and check they're all in the LPA gdf
lpa_list = entity_df["LPACD"][entity_df["LPACD"].notnull()].drop_duplicates().to_list()

# check every one of our entity LPAs is in the LPA gdf
print(len(lpa_list))
nrow(LPA_boundary_gdf[LPA_boundary_gdf["LPACD"].isin(lpa_list)])

72
No. of records in df: 72


In [14]:
geogs_out_entities = []         # to store list of entities outside boundary 
BOUNDARY_BUFFER_DISTANCE = 50   # to set distance outside of expected boundary entity must exceed to be flagged

# loop through LPA codes and for each check whether any conservation areas with that code don't intersect at all with the LPA boundary
for lpa_code in lpa_list:

    cons_areas = entity_gdf.loc[entity_gdf["LPACD"] == lpa_code]
    lpa_boundary = LPA_boundary_gdf.loc[LPA_boundary_gdf["LPACD"] == lpa_code]

    # 1. Completely outside version
    # cons_areas_intersect = cons_areas.geometry.intersects(lpa_boundary.iloc[0].geometry)

    # 2. Exceeds by x metres version
    lpa_boundary.loc[:, 'geometry'] = lpa_boundary.buffer(BOUNDARY_BUFFER_DISTANCE)
    cons_areas_intersect = cons_areas.geometry.within(lpa_boundary.reset_index().loc[0, 'geometry'])

    # add areas which don't intersect to the list
    geogs_out_entities.extend(cons_areas.loc[~cons_areas_intersect]["entity"].to_list())


entity_outside_LPA_df = entity_df[entity_df["entity"].isin(geogs_out_entities)]

# list of LPAs with entities outside them
LPAs_with_bads = entity_outside_LPA_df["LPACD"].drop_duplicates().to_list()

print(f"No. of entities not contained by their expected boundary: {len(entity_outside_LPA_df):,}")
entity_outside_LPA_df.groupby(["LPACD", "organisation_entity", "organisation_name"]).size().sort_values(ascending=False)


No. of entities not contained by their expected boundary: 24


LPACD      organisation_entity  organisation_name                     
E60000183  33                   Babergh District Council                  12
E60000184  132                  East Suffolk Council                       3
E60000067  294                  Sheffield City Council                     1
E60000163  268                  Rochford District Council                  1
E60000178  152                  Great Yarmouth Borough Council             1
E60000191  169                  London Borough of Hammersmith & Fulham     1
E60000198  329                  London Borough of Southwark                1
E60000208  115                  London Borough of Ealing                   1
E60000213  167                  London Borough of Hillingdon               1
E60000268  352                  Vale of White Horse District Council       1
E60000283  80                   Chichester District Council                1
dtype: int64

In [15]:
# entity_outside_LPA_df[entity_outside_LPA_df["organisation_entity"] == 33].to_csv(os.path.join(data_dir, "LPA_out_of_bounds-Babergh.csv"))

In [16]:
# Plot all entities outside of expected boundaries, alongside LPA boundaries

ents = entity_gdf[entity_gdf["entity"].isin(entity_outside_LPA_df["entity"])]


map_entities = ents.explore(
    color = "red",
    tooltip = False,
    popup = ["organisation_name", "entity", "name", "reference"],
    tiles = "CartoDB positron",  # use "CartoDB positron" tiles
    # highlight = False,
    style_kwds = {
    "fillOpacity" : "0.1"
    }
)

LPA_boundary_gdf[LPA_boundary_gdf["LPACD"].isin(ents["LPACD"])].explore(
    m = map_entities,
    color = "blue",
    tooltip = False,
    popup = ["name", "LPACD"],
        style_kwds = {
        "fillOpacity" : "0"
        }
)

In [17]:
# Plot all entities for a single LPA outside of expected boundaries, alongside the LPA boundary

# LPA_code = LPAs_with_bads[3]
LPA_code = "E60000191"
bad_ents = entity_outside_LPA_df["entity"][entity_outside_LPA_df["LPACD"] == LPA_code]


map_entities = entity_gdf[entity_gdf["entity"].isin(bad_ents)].explore(
        # column = chloro_var,  # make choropleth based on "BoroName" column
        # cmap = palette,
    color = "red",
        # tooltip = False,
        # popup = ["organisation_name", "entity", "name", "reference"],
        tiles = "CartoDB positron",  # use "CartoDB positron" tiles
        highlight = False,
        style_kwds = {
        "fillOpacity" : "0.1"
        }
)

LPA_boundary_gdf[LPA_boundary_gdf["LPACD"] == LPA_code].explore(
    m = map_entities,
    color = "blue",
        style_kwds = {
        "fillOpacity" : "0"
        }
)

# Identifying geographical duplicates  
## Report

Aim of this is to quickly categorise the overlaps based on whether they fall into the following groups:

Entity overlaps with another: 

1. within the same organisation
    
2. from a different organisation   

    a. LPA entity overlaps with entity from another LPA
        
    b. LPA entity overlaps with entity from Historic England


<br>

As well as classifying by how much the overlaps are happening in order inform possible resolutions. Each entity-entity overlap is put in one of the following groups, which are given a corresponding priority to address:


* **> 90% combined match** (high priority): 90% or more of each entity's area overlaps with the other - this suggests the boundaries of each almost perfectly match  

* **> 90% single match** (medium priority): 90% of more of one entity's area overlaps with the other - this suggests the entities overlap but the boundaries don't closely match (one may be much larger than the other, for instance)
* **> edge intersection** (low priority): between 1 - 10% of each entity's area overlaps with the other
* **> unclassified** (low priority): the two entities overlap somewhat, but the overlapped area makes up less than 90% of each entity's area
* **> tiny edge** (ignore): less than 1% of each entity's area overlaps with the other - there are a large number of these and are relatively normal when combining data from many sources.

<br>

Some other useful bits of information are flagged in order to make some suggestions about why the problem has happened. These should be checked when using the output csvs:
* does the entry date for each entity match? When it doesn't it suggests that the issue may have arisen from data being combined from successive endpoints.
* do either of the entities exist as an old entity in the `old-entity.csv` for the collection?
* do either of the entities have other issues associated with them (excluding edge intersections, as it's common to have many of these). This can be useful to know if an issue is that one much larger polygon covers many smaller ones.

<br>

In [27]:
MATCH_LOWER_THRESH = 0.9  # defines the lower limit of the shared overlap between two entities to be called a match
EDGE_UPPER_THRESH = 0.1   # defines the upper limit of the shared overlap between two entities to be called an edge intersection
EDGE_LOWER_THRESH = 0.01   # defines the lower limit of the shared overlap between two entities to be called an edge intersection


# full join of all geometries
entity_join_all = gpd.overlay(
    entity_gdf, 
    entity_gdf,
    how = "intersection", keep_geom_type=False 
)


# remove self-intersections and duplicates of the same intersections
entity_join_all = entity_join_all[entity_join_all["entity_1"] != entity_join_all["entity_2"]]

entity_join_all["entity_join"] = entity_join_all.apply(lambda x: '-'.join(sorted(x[["entity_1", "entity_2"]])), axis=1)

# extra sort to make sure matches to Historic England always show as Historic England as org 2 
entity_join_all["name_for_sort"] = np.where(entity_join_all["organisation_entity_1"] == "16", "Z", "A")
entity_join_all.sort_values(["entity_join", "name_for_sort"], ascending=True, inplace=True)

entity_join_all.drop_duplicates(subset="entity_join", inplace = True)  #Drop them by name

# nrow(entity_join_all)

# flag the types of intersections between organisations
# is org the same
entity_join_all["int_org_match"] = np.where(entity_join_all["organisation_entity_1"] == entity_join_all["organisation_entity_2"], True, False)

# the types of org-org matches
entity_join_all["int_org_types"] = np.select(
    [
        (entity_join_all["organisation_entity_1"] == "16") & (entity_join_all["organisation_entity_2"] == "16"),
        (entity_join_all["organisation_entity_1"] != "16") & (entity_join_all["organisation_entity_2"] != "16"),
        ((entity_join_all["organisation_entity_1"] != "16") & (entity_join_all["organisation_entity_2"] == "16")) |
        ((entity_join_all["organisation_entity_1"] == "16") & (entity_join_all["organisation_entity_2"] != "16"))
    ],
    ["HE - HE", "LPA - LPA", "HE - other"],
    default = "-"
)

# does the entity entry date match?
entity_join_all["date_match"] = np.where(entity_join_all["entry_date_1"] == entity_join_all["entry_date_2"], True, False)

# has one of the intersected entities already been re-mapped?
entity_join_all["entity_old"] = np.where(entity_join_all["entity_1"].isin(old_entity_df["old_entity"]) |
                                         entity_join_all["entity_2"].isin(old_entity_df["old_entity"]), True, False)


# calculate overlap %'s

entity_join_all["area_intersection"] = entity_join_all["geometry"].area

entity_join_all["p_pct_intersect"] = entity_join_all["area_intersection"] / entity_join_all["area_1"]
entity_join_all["pct_intersection"] = entity_join_all["area_intersection"] / (entity_join_all["area_1"] + entity_join_all["area_2"] - entity_join_all["area_intersection"])
entity_join_all["s_pct_intersect"] = entity_join_all["area_intersection"] / entity_join_all["area_2"]

# intersection area as % of smallest primary or secondary area
entity_join_all["pct_min_intersection"] = entity_join_all["area_intersection"] / entity_join_all[["area_1", "area_2"]].min(axis = 1)


entity_join_all["intersection_type"] = np.select(
    [
        (entity_join_all["p_pct_intersect"] >= MATCH_LOWER_THRESH) & (entity_join_all["s_pct_intersect"] >= MATCH_LOWER_THRESH),
        (entity_join_all["pct_min_intersection"] <= EDGE_UPPER_THRESH) & (entity_join_all["pct_min_intersection"] >= EDGE_LOWER_THRESH),
        (entity_join_all["pct_min_intersection"] < EDGE_LOWER_THRESH),
        ((entity_join_all["p_pct_intersect"] >= MATCH_LOWER_THRESH) | (entity_join_all["s_pct_intersect"] >= MATCH_LOWER_THRESH)),
        
    ],
    [
        "> 90% combined match", "edge intersection", "tiny edge - ignore", "> 90% single match"
    ],
    default = "unclassified"
)

nrow(entity_join_all)
entity_join_all.head()

No. of records in df: 2,347


Unnamed: 0,dataset_1,end_date_1,entity_1,entry_date_1,geojson_1,name_1,organisation_entity_1,point_1,prefix_1,reference_1,...,int_org_match,int_org_types,date_match,entity_old,area_intersection,p_pct_intersect,pct_intersection,s_pct_intersect,pct_min_intersection,intersection_type
7,conservation-area,,44000009,2004-06-01,,Childwickbury,278,POINT(-0.349369 51.784599),conservation-area,CA3,...,True,LPA - LPA,True,False,2.170036,1e-06,4.225115e-07,6.675948e-07,1e-06,tiny edge - ignore
19,conservation-area,,44000770,2023-07-22,,Leominster Town,16,POINT(-2.737060 52.227795),conservation-area,2499,...,True,HE - HE,True,False,2.033437,8e-06,4.665495e-06,1.128278e-05,1.1e-05,tiny edge - ignore
47,conservation-area,,44000043,2023-07-22,,Butterworth Hall,16,POINT(-2.105575 53.607415),conservation-area,7716,...,True,HE - HE,True,False,0.0,0.0,0.0,0.0,0.0,tiny edge - ignore
57,conservation-area,,44003132,2023-07-22,,Worcester and Birmingham Canal,16,POINT(-2.121045 52.248916),conservation-area,449,...,True,HE - HE,True,False,4.129991,1.6e-05,7.315925e-06,1.329709e-05,1.6e-05,tiny edge - ignore
100,conservation-area,,44005296,2023-07-22,,Lewes,16,POINT(0.007409 50.872619),conservation-area,3207,...,True,HE - HE,True,False,125.147014,9.2e-05,8.53257e-05,0.001178229,0.001178,tiny edge - ignore


In [28]:
# Flag entities which have multiple issues (this is discounting where the issue type is tiny edge intersections)
no_tinies = entity_join_all[entity_join_all["intersection_type"] != "tiny edge - ignore"]

all_ents = pd.concat([no_tinies["entity_1"], no_tinies["entity_2"]], ignore_index = True)
multi_issue_ents = all_ents.loc[all_ents.duplicated()]


entity_join_all["multiple_issues"] = np.where(((entity_join_all["intersection_type"] != "tiny edge - ignore") & (entity_join_all["entity_2"].isin(multi_issue_ents))) |
                                              ((entity_join_all["intersection_type"] != "tiny edge - ignore") & (entity_join_all["entity_1"].isin(multi_issue_ents))),
                                                 True, False)

entity_join_all.groupby("multiple_issues").size()


multiple_issues
False    2063
True      284
dtype: int64

In [29]:
# FLAGGING ISSUE DETAILS

# Org overlap types
entity_join_all["issue_type"] = np.select(
    [
        (entity_join_all["int_org_match"] == False) & (entity_join_all["int_org_types"] == "HE - other"),
        (entity_join_all["int_org_match"] == False) & (entity_join_all["int_org_types"] == "LPA - LPA"),
        (entity_join_all["int_org_match"] == True) & (entity_join_all["int_org_types"] == "HE - HE"),
        (entity_join_all["int_org_match"] == True) & (entity_join_all["int_org_types"] == "LPA - LPA")
    ],
    ["Between organisations - Historic England to LPA",
     "Between organisations - LPA to a different LPA",
     "Within organisation - Historic England",
     "Within organisation - LPA"],

    default = "-"
)

# Action
entity_join_all["action"] = np.select(
    [
        (entity_join_all["intersection_type"] == "tiny edge - ignore"),
        (entity_join_all["int_org_match"] == True) & (entity_join_all["int_org_types"] == "HE - HE"),
        (entity_join_all["int_org_match"] == False) & 
            (entity_join_all["int_org_types"] == "HE - other") & 
            (entity_join_all["intersection_type"] == "> 90% combined match") &
            (entity_join_all["multiple_issues"] == False) 

    ],
    ["ignore", "ignore", "remap"],
    default = "investigate"
)

# Priority and hint
issue_priority_mapping = {
    "> 90% combined match" : "high",
    "> 90% single match" : "medium",
    "edge intersection" : "low",
    "unclassified" : "low",
    "tiny edge - ignore" : ""
}

issue_hint_mapping = {
    True : "raise with LPA",
    False : "check endpoints"
}

entity_join_all["priority"] = [issue_priority_mapping[data["intersection_type"]] if data["action"] != "ignore" else "" for (index, data) in entity_join_all.iterrows()]
entity_join_all["hint"] = [issue_hint_mapping[data["date_match"]] if 
                           (data["int_org_match"] == True) & (data["int_org_types"] == "LPA - LPA") & (data["action"] != "ignore") 
                           else "" for (index, data) in entity_join_all.iterrows()]

In [30]:
# write full report table to csv

nicecols = [
    'entity_join', 'entity_1', 'entry_date_1', 'name_1', 'organisation_entity_1',
    'reference_1', 'organisation_name_1', 
    'entity_2', 'entry_date_2', 'name_2', 'organisation_entity_2',
    'reference_2', 'organisation_name_2', 
    'pct_min_intersection', 
    'date_match', 'entity_old',
    'intersection_type', 'issue_type', 'action',
    'priority', 'hint', 'multiple_issues'
    ]

entity_join_all[nicecols].to_csv(os.path.join(data_dir, "issues_all.csv"), index=False)

In [31]:
addressable_issues = entity_join_all[entity_join_all["action"] != "ignore"]

print(f"there are {len(addressable_issues)} addressable issues in total")
print("\n")
addressable_issues.groupby(['issue_type', 'intersection_type', 'action', 'priority']).size()

there are 524 addressable issues in total




issue_type                                       intersection_type     action       priority
Between organisations - Historic England to LPA  > 90% combined match  investigate  high          5
                                                                       remap        high         43
                                                 > 90% single match    investigate  medium      201
                                                 edge intersection     investigate  low          10
                                                 unclassified          investigate  low          10
Between organisations - LPA to a different LPA   > 90% combined match  investigate  high          2
                                                 > 90% single match    investigate  medium        1
                                                 edge intersection     investigate  low           2
Within organisation - LPA                        > 90% combined match  investigate  high        194
       

In [23]:
# count of issues by type breakdown

print(f"there are {len(entity_join_all)} issues in total")
print("\n")
entity_join_all.groupby(['issue_type', 'intersection_type', 'action', 'priority', 'hint']).size().head(40)


there are 2860 issues in total




issue_type                                       intersection_type     action       priority  hint           
Between organisations - Historic England to LPA  > 90% combined match  investigate  high                          17
                                                                       remap        high                         285
                                                 > 90% single match    investigate  medium                       260
                                                 edge intersection     investigate  low                           17
                                                 tiny edge - ignore    ignore                                    467
                                                 unclassified          investigate  low                           19
Between organisations - LPA to a different LPA   edge intersection     investigate  low                            5
                                                 tiny edge - ignore    

### Testing grouping issues

In [None]:
nar = no_tinies[["entity_join", "entity_1", "entity_2"]].melt(["entity_join"]).sort_values("entity_join")

multi_issue_ents = nar.loc[nar.duplicated(subset = "value", keep = False)].copy()

multi_issue_ents["issue_group_id"] = multi_issue_ents.groupby("value").ngroup() + 1
multi_issue_ents

In [None]:
issue_group_ids = multi_issue_ents[["entity_join", "issue_group_id"]].drop_duplicates()


entity_join_multi_issues = entity_join_all[nicecols].merge(
    issue_group_ids,
    on = "entity_join",
    how = "left"
)

entity_join_multi_issues.to_csv(os.path.join(data_dir, "issues_all_multis_flagged.csv"), index=False)

nrow(entity_join_all)
nrow(entity_join_multi_issues)
entity_join_multi_issues.head()

In [None]:
nar[nar["value"].isin(["44000170", "44000126"])]

In [None]:
multi_issue_ents[multi_issue_ents["value"].isin(["44000602", "44009051"])]

In [None]:
plot_issues_map(
    entity_gdf,
    get_issue_entities(entity_join_multi_issues[entity_join_multi_issues["issue_group_id"] == 2]),
    "org_HE_LPA",
    "Accent")

In [None]:
plot_issues_map(
    entity_gdf,
    get_issue_entities(entity_join_multi_issues[entity_join_multi_issues["organisation_entity_1"] == 261]),
    "org_HE_LPA",
    "Accent")

In [None]:
grp_test = entity_join_all[entity_join_all["entity_join"].isin(["44001139-44009684", "44001139-44009685"])][["entity_1", "entity_2"]]
grp_test["issue_id"] = np.arange(1, len(grp_test) + 1).astype(str)
grp_test.head()

In [None]:
# pivot to issue and entities narrow table
grp_test_long = grp_test.melt(["issue_id"]).sort_values("issue_id")

grp_test_long

In [None]:
grp_test_long["test_rank"] = grp_test_long.groupby('issue_id').ngroup() + 1

grp_test_long

In [None]:
counted = grp_test_long.groupby("value").size().reset_index(name = "count")
links = grp_test_long[grp_test_long["value"].isin(counted[counted["count"] > 1]["value"])].copy()

links["issue_group_id"] = links.groupby('value')['value'].rank(method='first')

links

In [None]:
links.groupby('value').rank(method = "first")

In [None]:
grp_test_long[["value", "issue_id"]].drop_duplicates()

In [None]:


# join on entity to flag entities in one issue which also have another issue
grp_test_long_joined = grp_test_long.merge(
    grp_test_long[["value", "issue_id"]], 
    on = "value")

# de-dupe to just keep the first instance of each self join (e.g. keep issue1-issue2 and bin issue2-issue1)
grp_test_long_joined["issue_join"] = grp_test_long_joined.apply(lambda x: '-'.join(sorted(x[["issue_id_x", "issue_id_y"]])), axis=1)

# grp_test_long_joined.drop_duplicates(subset="issue_join", inplace = True)  #Drop them by name

grp_test_long_joined

In [None]:
grp_test_long_joined.drop_duplicates(subset="issue_join")

In [None]:
# check the flagging or intersections between different org types is correct
# entity_join_all.groupby(["int_org_match", "int_org_types", "organisation_entity_1", "organisation_entity_2"]).size()

## 1 - Intersection within organisation
Here we're only interested in issues where the intersections are within LPA datasets, the Historic England dataset contains overlapping polygons.

In [24]:
# filter df and write to csv
issues_1_df = entity_join_all[(entity_join_all["issue_type"] == "Within organisation - LPA") & (entity_join_all["action"] != "ignore")]
issues_1_df[nicecols].to_csv(os.path.join(data_dir, "issues_type_1-within_org-LPA.csv"), index = False)

# summarise
issues_1_df.groupby(["issue_type", "intersection_type", "action", "priority"]).size()

issue_type                 intersection_type     action       priority
Within organisation - LPA  > 90% combined match  investigate  high        10
                           > 90% single match    investigate  medium      36
                           edge intersection     investigate  low          4
                           unclassified          investigate  low          3
dtype: int64

In [25]:
# org count
issues_1_df.groupby(["organisation_entity_1", "organisation_name_1", "action", "hint"]).size().sort_values(ascending = False)

organisation_entity_1  organisation_name_1                     action       hint           
206                    Maldon District Council                 investigate  raise with LPA     21
192                    London Borough of Lambeth               investigate  check endpoints     8
129                    Epsom and Ewell Borough Council         investigate  check endpoints     7
169                    London Borough of Hammersmith & Fulham  investigate  check endpoints     7
329                    London Borough of Southwark             investigate  check endpoints     4
294                    Sheffield City Council                  investigate  check endpoints     3
113                    Dudley Metropolitan Borough Council     investigate  raise with LPA      2
67                     Buckinghamshire Council                 investigate  raise with LPA      1
dtype: int64

In [34]:
# inspect example
plot_issues_map(entity_gdf, 
                # get_issue_entities(issues_1_df[issues_1_df["organisation_entity_1"] == 206]), 
                get_issue_entities(issues_1_df[issues_1_df["organisation_name_1"] == "London Borough of Lambeth"].iloc[[0]]),
                "reference", 
                "Accent")

## 2 - Intersection across organisations
   
### 2.a LPA entity overlaps with entity from another LPA

In [36]:
# filter df and write to csv
issues_2a_df = entity_join_all[(entity_join_all["issue_type"] == "Between organisations - LPA to a different LPA") & (entity_join_all["action"] != "ignore")]
# issues_2a_df[nicecols].to_csv(os.path.join(data_dir, "issues_type_2a-between_org-LPA_to_LPA.csv"), index = False)

# summarise

issues_2a_df.groupby(["issue_type", "intersection_type", "action", "priority"]).size()

issue_type                                      intersection_type  action       priority
Between organisations - LPA to a different LPA  edge intersection  investigate  low         5
                                                unclassified       investigate  low         1
dtype: int64

In [37]:
# org count
issues_2a_df.groupby(["action", "organisation_entity_1", "organisation_name_1", "intersection_type"]).size().sort_values(ascending = False)

action       organisation_entity_1  organisation_name_1                      intersection_type
investigate  182                    Royal Borough of Kensington and Chelsea  edge intersection    2
             75                     Canterbury City Council                  edge intersection    1
             198                    London Borough of Lewisham               edge intersection    1
             309                    South Oxfordshire District Council       edge intersection    1
             329                    London Borough of Southwark              unclassified         1
dtype: int64

In [40]:
get_issue_entities(issues_2a_df[issues_2a_df["organisation_entity_1"] == 329])

['44009059', '44008830']

In [45]:
# inspect example
plot_issues_map(entity_gdf, 
                get_issue_entities(issues_2a_df[issues_2a_df["organisation_entity_1"] == 329]), 
                "organisation_name", 
                "Accent")

### 2.b LPA entity overlaps with entity from Historic England 

In [54]:
# summarise and output all non-ignore issues
issues_2b_df = entity_join_all.loc[(entity_join_all["issue_type"] == "Between organisations - Historic England to LPA") & (entity_join_all["action"] != "ignore")].copy()

# add in extra LPA - HE overlap types field for this table
issues_2b_df.loc[:, "org_overlap_type"] = np.select(
    [
        (issues_2b_df["p_pct_intersect"] >= MATCH_LOWER_THRESH) & (issues_2b_df["s_pct_intersect"] >= MATCH_LOWER_THRESH),
        (issues_2b_df["p_pct_intersect"] <= EDGE_LOWER_THRESH) & (issues_2b_df["s_pct_intersect"] <= EDGE_LOWER_THRESH),
        (issues_2b_df["p_pct_intersect"] >= MATCH_LOWER_THRESH),
        (issues_2b_df["s_pct_intersect"] >= MATCH_LOWER_THRESH)
    ],
    [
        "LPA and HE mostly cover each other", "LPA and HE edges overlap", "LPA mostly covered by HE", "LPA mostly covers HE"
    ],
    default = "-"
)

# issues_2b_df[nicecols + ["org_overlap_type"]].to_csv(os.path.join(data_dir, "issues_type_2b-between_org-HE_to_LPA.csv"), index = False)

# summarise
issues_2b_df.groupby(["issue_type", "intersection_type", "action", "priority", "multiple_issues", "org_overlap_type"]).size()

issue_type                                       intersection_type     action       priority  multiple_issues  org_overlap_type                  
Between organisations - Historic England to LPA  > 90% combined match  investigate  high      True             LPA and HE mostly cover each other     17
                                                                       remap        high      False            LPA and HE mostly cover each other    285
                                                 > 90% single match    investigate  medium    False            LPA mostly covered by HE               48
                                                                                                               LPA mostly covers HE                   19
                                                                                              True             LPA mostly covered by HE              183
                                                                                         

In [None]:
# org count
issues_2b_df.groupby("organisation_name_1").size().sort_values(ascending = False)

In [48]:
# OPTIONAL - save individual files for remap and investigate issues
# note - here remaps are only those entity-entity matches where neither entity also has another issue associated with it

# save HE to LPA issues which need merging/remapping
remaps = issues_2b_df.loc[(issues_2b_df["action"] == "remap")].copy()
# remaps[nicecols + ["org_overlap_type"]].to_csv(os.path.join(data_dir, "issues_type_2b-between_org-HE_to_LPA-remaps.csv"), index = False)

# save HE to LPA issues which need further investigation
investigates = issues_2b_df.loc[issues_2b_df["action"] == "investigate"].copy()

# investigates[nicecols + ["org_overlap_type"]].to_csv(os.path.join(data_dir, "issues_type_2b-between_org-HE_to_LPA-investigates.csv"), index = False)



In [49]:
nrow(remaps)

No. of records in df: 285


#### Validation map - entity merging

In [50]:
# Map all >90% close match issues where entities are to be merged to check no obvious misfires
remap_ents = pd.concat([remaps["entity_1"], remaps["entity_2"]]).drop_duplicates()
remap_gdf = entity_gdf[entity_gdf["entity"].isin(remap_ents)].copy()

remap_gdf["org_HE_LPA"] = np.where(remap_gdf["organisation_entity"] == 16, "Historic England", "Local Planning Authority")

remap_gdf.explore(
    column = "org_HE_LPA",  
    cmap = "Accent",
    tooltip = False,
    popup = ["organisation_name", "entity", "name", "reference"],
    tiles = "CartoDB positron",  # use "CartoDB positron" tiles
    highlight = False,
    style_kwds = {
    "fillOpacity" : "0.3"
    }
)


In [None]:
# check specific issue:
plot_issues_map(entity_gdf, get_issue_entities(issues_2b_df.iloc[[0]]), "organisation_name", "Accent")

#### Validation map - further investigation

In [51]:
# Map all >90% close match issues where entities are to be merged to check no obvious misfires
inv_ents = pd.concat([investigates["entity_1"], investigates["entity_2"]]).drop_duplicates()
inv_gdf = entity_gdf[entity_gdf["entity"].isin(inv_ents)].copy()

inv_gdf["org_HE_LPA"] = np.where(inv_gdf["organisation_entity"] == 16, "Historic England", "Local Planning Authority")

inv_gdf.explore(
    column = "org_HE_LPA",  
    cmap = "Accent",
    tooltip = False,
    popup = ["organisation_name", "entity", "name", "reference"],
    tiles = "CartoDB positron",  # use "CartoDB positron" tiles
    highlight = False,
    style_kwds = {
    "fillOpacity" : "0.3"
    }
)


In [56]:
# Map all unclassified

unclass = issues_2b_df[issues_2b_df["intersection_type"] == "unclassified"]

unclass_ents = pd.concat([unclass["entity_1"], unclass["entity_2"]]).drop_duplicates()
unclass_gdf = entity_gdf[entity_gdf["entity"].isin(unclass_ents)].copy()

unclass_gdf["org_HE_LPA"] = np.where(unclass_gdf["organisation_entity"] == 16, "Historic England", "Local Planning Authority")

unclass_gdf.explore(
    column = "org_HE_LPA",  
    cmap = "Accent",
    tooltip = False,
    popup = ["organisation_name", "entity", "name", "reference"],
    tiles = "CartoDB positron",  # use "CartoDB positron" tiles
    highlight = False,
    style_kwds = {
    "fillOpacity" : "0.3"
    }
)
