# LPA bounds exceeded report
**Author**:  Greg Slater <br>
**Date**:  12 April 2024 <br>
**Data Scope**: article-4-direction, listed-building, conservation-area, tree-preservation-order, and brownfield-land collections <br>
**Report Type**: Recurring weekly <br>

## Purpose
This report identifies data submitted by a Local Planning Authority (LPA) that lies outside that LPA's boundary.

In [None]:
import pandas as pd
import geopandas as gpd
import os
import urllib

# Raise the pandas "display.max_rows" option to 100 for tables shown in this notebook.
pd.set_option("display.max_rows", 100)


In [None]:
# if running on Colab, uncomment and run this line below too:
# !pip install mapclassify

In [None]:
# global variables
# Output directory for the CSV results; created up front (no error if it
# already exists) so that later writes do not fail on a missing folder.
data_dir = "../../data/reports/data_exceeding_lpa_bounds/"
os.makedirs(data_dir, exist_ok=True)

# Run options. Answers are trimmed and lower-cased as they are read so that
# input such as " Yes " still matches the `== "yes"` comparisons made
# wherever these options are checked.
download = input("Do you want to download the result? (yes/no): ").strip().lower()
show_maps = input("Do you want to display entities outside LPA bounds on interactive maps? (yes/no): ").strip().lower()

### Functions

In [None]:
import pandas as pd
import geopandas as gpd
import urllib


def nrow(df):
    """Print the number of records in ``df`` with thousands separators.

    Returns None (the value of the underlying ``print`` call).
    """
    record_count = len(df)
    print(f"No. of records in df: {record_count:,}")

def get_all_organisations():
    """Download the organisation lookup table from the planning data datasette.

    Queries the ``organisation`` table of the ``digital-land`` database and
    reads the CSV response straight into pandas.

    Returns:
        pandas.DataFrame: one row per organisation with the columns
        ``organisation``, ``organisation_name``, ``organisation_entity``,
        ``LPACD`` and ``statistical_geography``.
    """
    # Import the submodule explicitly: a bare ``import urllib`` does not
    # guarantee that ``urllib.parse`` has been loaded.
    from urllib.parse import urlencode

    sql = """
        select organisation, name as organisation_name, entity as organisation_entity, local_planning_authority as LPACD, statistical_geography
        from organisation
        """
    params = urlencode({"sql": sql, "_size": "max"})
    url = f"https://datasette.planning.data.gov.uk/digital-land.csv?{params}"
    return pd.read_csv(url)


def get_provision_orgs():
    """Download the organisations that have an "expected" provision.

    Queries the ``provision`` table of the ``digital-land`` database on the
    planning data datasette and reads the CSV response into pandas.

    Returns:
        pandas.DataFrame: distinct ``organisation`` / ``provision_cohort``
        pairs for rows whose ``provision_reason`` is ``expected``.
    """
    # Import the submodule explicitly: a bare ``import urllib`` does not
    # guarantee that ``urllib.parse`` has been loaded.
    from urllib.parse import urlencode

    # The literal is single-quoted: in SQLite a double-quoted token is an
    # identifier first and only falls back to being a string literal.
    sql = """
        select distinct organisation, cohort as provision_cohort
        from provision
        where provision_reason = 'expected'
        """
    params = urlencode({"sql": sql, "_size": "max"})
    url = f"https://datasette.planning.data.gov.uk/digital-land.csv?{params}"
    return pd.read_csv(url)


# function to grab geojson from pdp, replace hyphens in field names and transform to EPSG:27700
def get_pdp_geo_dataset(dataset, underscore_cols=True, crs_out=27700):
    """Download a dataset's GeoJSON from files.planning.data.gov.uk.

    Args:
        dataset: Dataset name used to build the ``{dataset}.geojson`` URL.
        underscore_cols: When true, hyphens in column names are replaced
            with underscores.
        crs_out: EPSG code the geometries are re-projected to.

    Returns:
        geopandas.GeoDataFrame: the dataset, declared as EPSG:4326 and then
        re-projected to ``crs_out``.
    """
    url = f"https://files.planning.data.gov.uk/dataset/{dataset}.geojson"
    gdf = gpd.read_file(url)

    if underscore_cols:
        gdf.columns = [col.replace("-", "_") for col in gdf.columns]

    # Best-effort cast so the column can be joined to the organisation
    # lookup. Only the failures a missing column or non-integer values can
    # produce are caught; anything else (e.g. KeyboardInterrupt) propagates.
    try:
        gdf["organisation_entity"] = gdf["organisation_entity"].astype(int)
    except (KeyError, TypeError, ValueError, OverflowError):
        print("issue with organisation_entity column")

    gdf.set_crs(epsg=4326, inplace=True)
    gdf.to_crs(epsg=crs_out, inplace=True)

    return gdf


# functions to check a table of geometries and return the ones that don't intersect with a border
def find_geos_beyond_border(geos, border):
    """Return the rows of ``geos`` whose geometry is disjoint from a border.

    Args:
        geos: Table with a ``geometry`` attribute supporting ``disjoint``.
        border: Table of border geometries; only its first row is used.

    Returns:
        The subset of ``geos`` that shares no point with the border. When
        ``border`` has no rows there is nothing to compare against, so an
        empty selection (same columns as ``geos``) is returned instead of
        failing on ``border.iloc[0]``.
    """
    if len(border) == 0:
        return geos.iloc[0:0]

    border_shape = border.iloc[0].geometry
    is_outside = geos.geometry.disjoint(border_shape)

    return geos.loc[is_outside]

# given a geometry gdf (which has a LAD/LPA code field) and a boundary gdf, and the LPA/LAD boundary type
# loop and check geometries against the boundary they should be in
def check_against_border(geos_gdf, border_gdf, code_type):
    """Find geometries that lie outside the boundary matching their area code.

    Args:
        geos_gdf: Geometries to check; must contain the ``code_type`` column.
        border_gdf: Boundaries; must contain the ``code_type`` column.
        code_type: Name of the area-code column shared by both tables.

    Returns:
        The rows of ``geos_gdf`` that are disjoint from the boundary with the
        same code. Rows with a null code, or a code that has no boundary in
        ``border_gdf``, are not checked. An empty selection with the columns
        of ``geos_gdf`` is returned when nothing is checked.
    """
    # distinct, non-null area codes present in the geometries
    geo_codes = geos_gdf[code_type].dropna().drop_duplicates().to_list()

    # codes without a boundary are skipped rather than crashing the whole run
    border_codes = set(border_gdf[code_type].dropna())

    # check geos in each area code against area border
    results = [
        find_geos_beyond_border(
            geos_gdf[geos_gdf[code_type] == code],
            border_gdf.loc[border_gdf[code_type] == code],
        )
        for code in geo_codes
        if code in border_codes
    ]

    # pd.concat raises ValueError on an empty list
    if not results:
        return geos_gdf.iloc[0:0]

    return pd.concat(results)


def check_LPA_codes_valid(pdp_entity_gdf):
    """List organisations whose LPA code has no match in ``LPA_boundary_gdf``.

    Args:
        pdp_entity_gdf: Entity table with ``LPACD`` and ``organisation_name``
            columns.

    Returns:
        The distinct ``LPACD`` / ``organisation_name`` pairs for which the
        left join onto the global ``LPA_boundary_gdf`` codes found no partner.
    """
    report_cols = ["LPACD", "organisation_name"]

    # one row per code / organisation pair found in the data
    distinct_pairs = pdp_entity_gdf[report_cols].drop_duplicates()

    # the indicator column records whether each pair found a boundary code
    joined = distinct_pairs.merge(
        LPA_boundary_gdf[["LPACD"]],
        on="LPACD",
        how="left",
        indicator=True,
    )

    unmatched = joined[joined["_merge"] == "left_only"]
    return unmatched[report_cols]


def map_error_geos(pdp_entity_gdf):
    """Build an interactive map of entities over their LPA boundaries.

    Args:
        pdp_entity_gdf: Entities to plot; must have an ``LPACD`` column plus
            the fields shown in the popup.

    Returns:
        The map returned by ``explore``: entities drawn in red, with the
        matching rows of the global ``LPA_boundary_gdf`` drawn in blue on top.
    """
    # first layer: the entities themselves, lightly filled
    entity_layer = pdp_entity_gdf.explore(
        color="red",
        tooltip=False,
        popup=["organisation_name", "entity", "name", "reference"],
        tiles="CartoDB positron",
        style_kwds={"fillOpacity": "0.1"},
    )

    # only the boundaries whose code appears among the plotted entities
    code_is_plotted = LPA_boundary_gdf["LPACD"].isin(pdp_entity_gdf["LPACD"])
    boundaries_to_draw = LPA_boundary_gdf[code_is_plotted]

    # second layer: unfilled boundaries added to the same map
    return boundaries_to_draw.explore(
        m=entity_layer,
        color="blue",
        tooltip=False,
        popup=["name", "LPACD"],
        style_kwds={"fillOpacity": "0"},
    )

### Data import

In [None]:
# organisations with an "expected" provision and their cohort, fetched from datasette
orgs_prov = get_provision_orgs()
# orgs_prov.head()

In [None]:
# Fetch the full organisation table from datasette.
lookup_org = get_all_organisations()

# The organisation type is the part of the organisation identifier before
# the first colon.
lookup_org["organisation_type"] = lookup_org["organisation"].map(
    lambda org_id: org_id.split(":")[0]
)

# Left-join the provision cohort, so organisations without an expected
# provision are kept.
lookup_org = pd.merge(lookup_org, orgs_prov, on="organisation", how="left")

nrow(lookup_org)
lookup_org.head()

In [None]:
# Load the local-planning-authority boundaries, keeping only the reference,
# name and geometry of each one; "reference" is exposed as "LPACD" so it can
# be joined to the organisation lookup.
LPA_boundary_gdf = get_pdp_geo_dataset("local-planning-authority")
LPA_boundary_gdf = LPA_boundary_gdf[["reference", "name", "geometry"]].rename(
    columns={"reference": "LPACD"}
)

nrow(LPA_boundary_gdf)
LPA_boundary_gdf.head()

In [None]:
# quick counts and cross-checks between LPA codes from org table and LPA boundaries
pdp_la_orgs = lookup_org[lookup_org["organisation_type"] == "local-authority"]

# inner join: only organisations whose LPA code has a boundary remain
LPA_valid = pdp_la_orgs.merge(
    LPA_boundary_gdf,
    how="inner",
    on="LPACD",
)

print(f"n local-authority orgs in PDP organisation entity table: {len(pdp_la_orgs)}")
# the boundaries are loaded from the local-planning-authority dataset, so the
# message names that dataset rather than local-authority-district
print(f"n local planning authorities in pdp local-planning-authority dataset: {len(LPA_boundary_gdf)}")
print(f"n in both (by LPA code): {len(LPA_valid)}")


# Checking expected bounds of data  

## Conservation areas

In [None]:
# Fetch the conservation-area dataset and report its size.
ca_gdf = get_pdp_geo_dataset("conservation-area")
nrow(ca_gdf)

# Attach organisation name, type and LPA code to every entity; the left join
# keeps entities whose organisation is not in the lookup.
ca_gdf = ca_gdf.merge(
    lookup_org[["organisation_name", "organisation_type", "organisation_entity", "LPACD"]],
    on="organisation_entity",
    how="left",
)
nrow(ca_gdf)

# Organisations whose LPA code is missing or absent from the boundary dataset.
ca_missing_codes = check_LPA_codes_valid(ca_gdf)

if not ca_missing_codes.empty:
    print(
        "",
        '''
          Warning: the following LPAs in the dataset are missing LPA codes, or have a code which isn't in the boundary dataset.
          Data for these areas will not be included in the checks''',
        "",
        sep="\n",
    )
    print(ca_missing_codes)


In [None]:
# Conservation areas that are disjoint from their organisation's LPA boundary.
ca_beyond_LPA = check_against_border(ca_gdf, LPA_boundary_gdf, "LPACD")

# Save the flagged entities when a download was requested.
if download.lower() == "yes":
    path = os.path.join(data_dir, "boundary-check_conservation-area_LPA.csv")
    ca_beyond_LPA.to_csv(path, index=False)
    print(f"Results saved as '{path}'")

print(
    "",
    f"No. of entities outside of their expected boundary: {len(ca_beyond_LPA):,}",
    sep="\n",
)

# Count of flagged entities per organisation, largest first; kept as the
# final expression so the notebook renders the table.
(
    ca_beyond_LPA
    .groupby(["LPACD", "organisation_entity", "organisation_name"])
    .size()
    .sort_values(ascending=False)
    .reset_index(name="count")
)

In [None]:
# show the flagged conservation areas on an interactive map, only if maps were requested
if show_maps.lower() == "yes": 
    display(map_error_geos(ca_beyond_LPA))

## Listed Building Outlines

In [None]:
# Fetch the listed-building-outline dataset and report its size.
lb_gdf = get_pdp_geo_dataset("listed-building-outline")
nrow(lb_gdf)

# Attach organisation name, type and LPA code to every entity; the left join
# keeps entities whose organisation is not in the lookup.
lb_gdf = lb_gdf.merge(
    lookup_org[["organisation_name", "organisation_type", "organisation_entity", "LPACD"]],
    on="organisation_entity",
    how="left",
)
nrow(lb_gdf)

# Organisations whose LPA code is missing or absent from the boundary dataset.
lb_missing_codes = check_LPA_codes_valid(lb_gdf)

if not lb_missing_codes.empty:
    print(
        "",
        '''
          Warning: the following LPAs in the dataset are missing LPA codes, or have a code which isn't in the boundary dataset.
          Data for these areas will not be included in the checks''',
        "",
        sep="\n",
    )
    print(lb_missing_codes)


In [None]:
# Listed building outlines that are disjoint from their organisation's LPA boundary.
lb_beyond_LPA = check_against_border(lb_gdf, LPA_boundary_gdf, "LPACD")

# Save the flagged entities when a download was requested.
if download.lower() == "yes":
    path = os.path.join(data_dir, "boundary-check_listed-building-outline_LPA.csv")
    lb_beyond_LPA.to_csv(path, index=False)
    print(f"Results saved as '{path}'")

print(
    "",
    f"No. of entities outside of their expected boundary: {len(lb_beyond_LPA):,}",
    sep="\n",
)

# Count of flagged entities per organisation, largest first; kept as the
# final expression so the notebook renders the table.
(
    lb_beyond_LPA
    .groupby(["LPACD", "organisation_entity", "organisation_name"])
    .size()
    .sort_values(ascending=False)
    .reset_index(name="count")
)

In [None]:
# show the flagged listed building outlines on an interactive map, only if maps were requested
if show_maps.lower() == "yes": 
    display(map_error_geos(lb_beyond_LPA))

## Article 4 direction areas

In [None]:
# Fetch the article-4-direction-area dataset and report its size.
a4_gdf = get_pdp_geo_dataset("article-4-direction-area")
nrow(a4_gdf)

# Attach organisation name, type and LPA code to every entity; the left join
# keeps entities whose organisation is not in the lookup.
a4_gdf = a4_gdf.merge(
    lookup_org[["organisation_name", "organisation_type", "organisation_entity", "LPACD"]],
    on="organisation_entity",
    how="left",
)
nrow(a4_gdf)

# Organisations whose LPA code is missing or absent from the boundary dataset.
a4_missing_codes = check_LPA_codes_valid(a4_gdf)

if not a4_missing_codes.empty:
    print(
        "",
        '''
          Warning: the following LPAs in the dataset are missing LPA codes, or have a code which isn't in the boundary dataset.
          Data for these areas will not be included in the checks''',
        "",
        sep="\n",
    )
    print(a4_missing_codes)


In [None]:
# Article 4 direction areas that are disjoint from their organisation's LPA boundary.
a4_beyond_LPA = check_against_border(a4_gdf, LPA_boundary_gdf, "LPACD")

# Save the flagged entities when a download was requested.
if download.lower() == "yes":
    path = os.path.join(data_dir, "boundary-check_article-4-direction-area_LPA.csv")
    a4_beyond_LPA.to_csv(path, index=False)
    print(f"Results saved as '{path}'")

print(
    "",
    f"No. of entities outside of their expected boundary: {len(a4_beyond_LPA):,}",
    sep="\n",
)

# Count of flagged entities per organisation, largest first; kept as the
# final expression so the notebook renders the table.
(
    a4_beyond_LPA
    .groupby(["LPACD", "organisation_entity", "organisation_name"])
    .size()
    .sort_values(ascending=False)
    .reset_index(name="count")
)

In [None]:
# show the flagged article 4 direction areas on an interactive map, only if maps were requested
if show_maps.lower() == "yes": 
    display(map_error_geos(a4_beyond_LPA))

## Trees

In [None]:
# Fetch the tree dataset and report its size.
tree_gdf = get_pdp_geo_dataset("tree")
nrow(tree_gdf)

# Attach organisation name, type and LPA code to every entity; the left join
# keeps entities whose organisation is not in the lookup.
tree_gdf = tree_gdf.merge(
    lookup_org[["organisation_name", "organisation_type", "organisation_entity", "LPACD"]],
    on="organisation_entity",
    how="left",
)
nrow(tree_gdf)

# Organisations whose LPA code is missing or absent from the boundary dataset.
tree_missing_codes = check_LPA_codes_valid(tree_gdf)

if not tree_missing_codes.empty:
    print(
        "",
        '''
          Warning: the following LPAs in the dataset are missing LPA codes, or have a code which isn't in the boundary dataset.
          Data for these areas will not be included in the checks''',
        "",
        sep="\n",
    )
    print(tree_missing_codes)


In [None]:
# Trees that are disjoint from their organisation's LPA boundary.
tree_beyond_LPA = check_against_border(tree_gdf, LPA_boundary_gdf, "LPACD")

# Save the flagged entities when a download was requested.
if download.lower() == "yes":
    path = os.path.join(data_dir, "boundary-check_tree_LPA.csv")
    tree_beyond_LPA.to_csv(path, index=False)
    print(f"Results saved as '{path}'")

print(
    "",
    f"No. of entities outside of their expected boundary: {len(tree_beyond_LPA):,}",
    sep="\n",
)

# Count of flagged entities per organisation, largest first; kept as the
# final expression so the notebook renders the table.
(
    tree_beyond_LPA
    .groupby(["LPACD", "organisation_entity", "organisation_name"])
    .size()
    .sort_values(ascending=False)
    .reset_index(name="count")
)

In [None]:
# show the flagged trees on an interactive map, only if maps were requested
if show_maps.lower() == "yes": 
    display(map_error_geos(tree_beyond_LPA))

## Tree preservation zones

In [None]:
# Fetch the tree-preservation-zone dataset and report its size.
tpz_gdf = get_pdp_geo_dataset("tree-preservation-zone")
nrow(tpz_gdf)

# Attach organisation name, type and LPA code to every entity; the left join
# keeps entities whose organisation is not in the lookup.
tpz_gdf = tpz_gdf.merge(
    lookup_org[["organisation_name", "organisation_type", "organisation_entity", "LPACD"]],
    on="organisation_entity",
    how="left",
)
nrow(tpz_gdf)

# Organisations whose LPA code is missing or absent from the boundary dataset.
tpz_missing_codes = check_LPA_codes_valid(tpz_gdf)

if not tpz_missing_codes.empty:
    print(
        "",
        '''
          Warning: the following LPAs in the dataset are missing LPA codes, or have a code which isn't in the boundary dataset.
          Data for these areas will not be included in the checks''',
        "",
        sep="\n",
    )
    print(tpz_missing_codes)


In [None]:
# Tree preservation zones that are disjoint from their organisation's LPA boundary.
tpz_beyond_LPA = check_against_border(tpz_gdf, LPA_boundary_gdf, "LPACD")

# Save the flagged entities when a download was requested.
if download.lower() == "yes":
    path = os.path.join(data_dir, "boundary-check_tree-preservation-zone_LPA.csv")
    tpz_beyond_LPA.to_csv(path, index=False)
    print(f"Results saved as '{path}'")

print(
    "",
    f"No. of entities outside of their expected boundary: {len(tpz_beyond_LPA):,}",
    sep="\n",
)

# Count of flagged entities per organisation, largest first; kept as the
# final expression so the notebook renders the table.
(
    tpz_beyond_LPA
    .groupby(["LPACD", "organisation_entity", "organisation_name"])
    .size()
    .sort_values(ascending=False)
    .reset_index(name="count")
)

In [None]:
# show the flagged tree preservation zones on an interactive map, only if maps were requested
if show_maps.lower() == "yes": 
    display(map_error_geos(tpz_beyond_LPA))

## Brownfield Land

In [None]:
# Fetch the brownfield-land dataset.
bf_gdf = get_pdp_geo_dataset("brownfield-land")

# Re-cast the organisation entity to integers here, filling entities that
# have no value with 0, so the column can be joined to the lookup.
bf_gdf["organisation_entity"] = (
    pd.to_numeric(bf_gdf["organisation_entity"])
    .fillna(0)
    .astype(int)
)
nrow(bf_gdf)

# Attach organisation name, type and LPA code to every entity; the left join
# keeps entities whose organisation is not in the lookup.
bf_gdf = bf_gdf.merge(
    lookup_org[["organisation_name", "organisation_type", "organisation_entity", "LPACD"]],
    on="organisation_entity",
    how="left",
)
nrow(bf_gdf)

# Organisations whose LPA code is missing or absent from the boundary dataset.
bf_missing_codes = check_LPA_codes_valid(bf_gdf)

if not bf_missing_codes.empty:
    print(
        "",
        '''
          Warning: the following LPAs in the dataset are missing LPA codes, or have a code which isn't in the boundary dataset.
          Data for these areas will not be included in the LPA boundary checks''',
        "",
        sep="\n",
    )
    print(bf_missing_codes)


In [None]:
# bare expression: the notebook renders the table of brownfield organisations with missing or unmatched LPA codes
bf_missing_codes

In [None]:
# Brownfield sites that are disjoint from their organisation's LPA boundary.
bf_beyond_LPA = check_against_border(bf_gdf, LPA_boundary_gdf, "LPACD")

# Record, per flagged entity, whether its geometry is valid, and pull out the
# invalid ones once for the counts reported below.
bf_beyond_LPA["geometry_valid"] = bf_beyond_LPA["geometry"].is_valid
bad_geoms = bf_beyond_LPA[bf_beyond_LPA["geometry_valid"].eq(False)]

print("Number of errors found where geometry is invalid: ")
print(len(bad_geoms))

# Save the flagged entities when a download was requested.
if download.lower() == "yes":
    path = os.path.join(data_dir, "boundary-check_brownfield-land_LPA.csv")
    bf_beyond_LPA.to_csv(path, index=False)
    print(f"Results saved as '{path}'")

print(
    "",
    f"No. of entities outside of their expected boundary: {len(bf_beyond_LPA):,}",
    f"No. of entities outside of their expected boundary (with invalid geometries): {len(bad_geoms):,}",
    sep="\n",
)

# Count of flagged entities per organisation and validity flag, largest
# first; kept as the final expression so the notebook renders the table.
(
    bf_beyond_LPA
    .groupby(["LPACD", "organisation_entity", "organisation_name", "geometry_valid"])
    .size()
    .sort_values(ascending=False)
    .reset_index(name="count")
)

In [None]:
# note - the map below only displays entities which have valid geometries:
# rows are filtered on `is_valid` before being passed to the map function,
# and nothing is shown unless maps were requested

if show_maps.lower() == "yes": 
    display(map_error_geos(bf_beyond_LPA[bf_beyond_LPA["geometry"].is_valid]))