# Entities out of LPA bounds demo
**Author**:  Greg Slater <br>
**Date**:  20th August 2024 <br>
**Dataset Scope**: all <br>
**Report Type**: Ad-hoc analysis <br>

## Purpose
This notebook hands over to the infrastructure team the proposed method for identifying entities that fall outside their expected Local Planning Authority (LPA) boundary. Related to [Jira ticket #1335](https://trello.com/c/Xsd2eIAe).

In [1]:
import pandas as pd
import geopandas as gpd
import shapely.wkt
import urllib
import numpy as np

# Notebook-wide display setting: raise pandas' "display.max_rows" option to 100
# so the grouped summaries shown further down are not truncated.
pd.set_option("display.max_rows", 100)


### Functions

In [2]:
def get_all_organisations():
    """Download the organisation lookup table from the planning data datasette.

    Runs a SQL query against the ``digital-land`` CSV endpoint and loads the
    response with pandas.

    Returns:
        pandas.DataFrame: columns ``org``, ``org_entity``, ``org_name``,
        ``org_dataset`` and ``LPACD`` (aliased from the ``organisation`` table's
        ``organisation``, ``entity``, ``name``, ``dataset`` and
        ``local_planning_authority`` columns).
    """
    query = """
        select organisation as org, entity as org_entity, name as org_name, dataset as org_dataset, local_planning_authority as LPACD
        from organisation
        """
    # NOTE(review): "_size": "max" presumably asks datasette for its largest
    # page size - confirm the endpoint's row cap if the table grows.
    query_string = urllib.parse.urlencode({"sql": query, "_size": "max"})
    return pd.read_csv(
        "https://datasette.planning.data.gov.uk/digital-land.csv?" + query_string
    )


def get_pdp_geo_dataset(dataset, underscore_cols=True, crs_out=27700):
    """Load a dataset's GeoJSON export from files.planning.data.gov.uk and reproject it.

    Args:
        dataset: dataset name, used to build the ``{dataset}.geojson`` URL.
        underscore_cols: when true, hyphens in column names are replaced
            with underscores.
        crs_out: EPSG code the returned frame is converted to.

    Returns:
        The frame read by geopandas, declared as EPSG:4326 and then
        transformed to ``crs_out``.
    """
    source_url = f"https://files.planning.data.gov.uk/dataset/{dataset}.geojson"
    frame = gpd.read_file(source_url)

    if underscore_cols:
        frame.columns = [name.replace("-", "_") for name in frame.columns]

    # Declare the source coordinates as EPSG:4326 first, then transform them
    # to the requested output CRS.
    return frame.set_crs(epsg=4326).to_crs(epsg=crs_out)


def get_provisions():
    """Fetch the "expected" provisions for the open-digital-planning project.

    Queries the ``provision`` table through the ``digital-land`` datasette CSV
    endpoint, keeping rows whose ``provision_reason`` is ``expected`` and whose
    ``project`` is ``open-digital-planning``, grouped by organisation and
    ordered by cohort.

    Side effects:
        Also binds the result to the module-level name ``provisions_df``.
        This is kept only for backward compatibility; prefer the return value.

    Returns:
        pandas.DataFrame: columns ``cohort``, ``notes``, ``organisation``,
        ``project``, ``provision_reason`` and ``start_date``.
    """
    global provisions_df

    # String literals are single-quoted: in SQLite double quotes denote
    # identifiers and are only treated as strings through a legacy fallback
    # that can be switched off, which would break the original query.
    #
    # NOTE(review): GROUP BY organisation leaves the other selected columns
    # un-aggregated, so which source row supplies cohort/notes/start_date for
    # an organisation with several provisions is up to SQLite - confirm that
    # this is acceptable for the ODP flag.
    sql = """
            SELECT
                cohort, notes, organisation, project, provision_reason, start_date
            FROM
                provision
            WHERE
                provision_reason = 'expected'
                AND project = 'open-digital-planning'
            GROUP BY organisation
            ORDER BY cohort
        """
    params = urllib.parse.urlencode({"sql": sql, "_size": "max"})
    url = f"https://datasette.planning.data.gov.uk/digital-land.csv?{params}"
    provisions_df = pd.read_csv(url)
    return provisions_df

### Data import

In [3]:
# Expected ODP provisions (the query groups them by organisation)
provisions_df = get_provisions()

# Organisation lookup, plus a boolean column that is True for every
# organisation that also appears in the provisions table
org_df = get_all_organisations()
org_df["odp_flag"] = org_df["org"].isin(provisions_df["organisation"])

print(len(org_df))

452


In [4]:
# Conservation areas (CA) from the planning data platform CSV export:
# read, normalise the column names, and attach organisation details.
ca_df = (
    pd.read_csv(
        "https://files.planning.data.gov.uk/dataset/conservation-area.csv",
        usecols=["entity", "name", "organisation-entity", "reference", "entry-date", "point", "geometry"],
    )
    .rename(columns=lambda col: col.replace("-", "_"))
    .rename(columns={"organisation_entity": "org_entity"})
    # organisation name, dataset and LPA code come from the organisation lookup
    .merge(
        org_df[["org_entity", "org_name", "org_dataset", "LPACD"]],
        how="left",
        on="org_entity",
    )
)

# Point version: parse the WKT points and build the frame while the "point"
# column still exists. At this stage "geometry" is still unparsed WKT text,
# so the order of the next few statements matters.
ca_df["point"] = ca_df["point"].apply(shapely.wkt.loads)
ca_point_gdf = gpd.GeoDataFrame(ca_df, geometry="point")

# Polygon version: drop the point column, parse the WKT geometry, build the frame.
ca_df = ca_df.drop(columns="point")
ca_df["geometry"] = ca_df["geometry"].apply(shapely.wkt.loads)
ca_poly_gdf = gpd.GeoDataFrame(ca_df, geometry="geometry")

# Declare both frames as EPSG:4326, then transform to EPSG:27700 for more
# interpretable area units
ca_point_gdf = ca_point_gdf.set_crs(epsg=4326).to_crs(epsg=27700)
ca_poly_gdf = ca_poly_gdf.set_crs(epsg=4326).to_crs(epsg=27700)

# Area of each polygon, measured in the projected CRS
ca_poly_gdf["area"] = ca_poly_gdf.geometry.area

print(len(ca_poly_gdf))
print(len(ca_point_gdf))

8310
8310


In [5]:
# Local planning authority boundaries from the PDP site. "name" and
# "reference" are renamed so the LPA code column lines up with the
# LPACD column on the entity data.
lpa_gdf = get_pdp_geo_dataset("local-planning-authority").rename(
    columns={"name": "lpa_name", "reference": "LPACD"}
)

print(len(lpa_gdf))

337


## Geo

In [6]:
# Which organisations have conservation areas with no LPA code, and how many
# entities each of them accounts for
(
    ca_df.loc[ca_df["LPACD"].isna()]
    .groupby(["org_dataset", "org_name"])
    .size()
)

org_dataset              org_name                     
government-organisation  Historic England                 6916
local-authority          North Dorset District Council      37
                         Purbeck District Council          126
dtype: int64

In [7]:
# Distinct, non-null LPA codes found on the conservation-area entities
lpa_list = ca_df["LPACD"].dropna().drop_duplicates().to_list()

# Sanity check: the number of codes on the entities, followed by the number
# of LPA boundary rows whose code is in that list
print(len(lpa_list))
print(len(lpa_gdf.loc[lpa_gdf["LPACD"].isin(lpa_list)]))

46
46


In [8]:
entities_beyond = []            # entity ids found outside their buffered LPA boundary
BOUNDARY_BUFFER_DISTANCE = 50   # how far (in CRS units) beyond the LPA boundary an entity may extend before it is flagged

# For each LPA code, take the conservation areas carrying that code and flag
# those that are not wholly within the LPA boundary once it has been buffered
# outwards by BOUNDARY_BUFFER_DISTANCE.
for lpa_code in lpa_list:

    cons_areas = ca_poly_gdf.loc[ca_poly_gdf["LPACD"] == lpa_code]

    # first boundary geometry recorded for this LPA code, grown by the buffer
    lpa_boundary = lpa_gdf.loc[lpa_gdf["LPACD"] == lpa_code].reset_index(drop=True).geometry[0]
    lpa_boundary_buff = lpa_boundary.buffer(BOUNDARY_BUFFER_DISTANCE)

    # True where the conservation area lies entirely inside the buffered boundary
    within_buffer = cons_areas.geometry.within(lpa_boundary_buff)

    # keep the entity ids of the areas that are NOT within the buffered boundary
    entities_beyond.extend(cons_areas.loc[~within_buffer, "entity"].to_list())


# Only entities with an LPA code are tested above; rows with a null LPACD
# never enter the loop and so are never flagged.
entity_outside_LPA_df = ca_df.loc[ca_df["entity"].isin(entities_beyond)]

# LPA codes that have at least one flagged entity
LPAs_with_bads = entity_outside_LPA_df["LPACD"].drop_duplicates().to_list()

print(f"No. of entities not contained by their expected (and buffered) boundary: {len(entity_outside_LPA_df):,}")
(
    entity_outside_LPA_df
    .groupby(["LPACD", "org_entity", "org_name"])
    .size()
    .sort_values(ascending=False)
)


No. of entities not contained by their expected (and buffered) boundary: 47


LPACD      org_entity  org_name                   
E60000291  92          Cornwall Council               41
E60000184  132         East Suffolk Council            4
E60000163  268         Rochford District Council       1
E60000283  80          Chichester District Council     1
dtype: int64