In [None]:
import spatialite
import pandas as pd
import geopandas as gpd
import os
import itertools
import shapely.wkt
import logging
import urllib
import numpy as np

# Show up to 100 rows when a DataFrame is displayed
pd.options.display.max_rows = 100


In [None]:
# if running on Colab, uncomment and run this line below too:
# !pip install mapclassify

In [None]:
# Global variables
# Data directory for this notebook, relative to the working directory;
# it is created here if it does not already exist.
data_dir = "../data/geo_analysis/area_bounds/"
os.makedirs(name=data_dir, exist_ok=True)

### Functions

In [None]:
def nrow(df):
    """Print the number of records in ``df`` with thousands separators.

    Returns None (the result of ``print``), so nothing extra is displayed
    when this is the last expression of a cell.
    """
    record_count = len(df)
    return print("No. of records in df: {:,}".format(record_count))

def get_all_organisations():
    """Fetch the organisation table from the planning data datasette.

    Returns
    -------
    pandas.DataFrame
        One row per organisation with the columns selected by the SQL query:
        ``organisation``, ``name``, ``organisation_entity`` and
        ``statistical_geography``.
    """
    # Imported explicitly: a bare `import urllib` does not guarantee that the
    # `urllib.parse` submodule has been loaded.
    from urllib.parse import urlencode

    sql = """
        select organisation, name, entity as organisation_entity, statistical_geography
        from organisation
        """
    # "_size": "max" asks datasette for its maximum page size in one response
    params = urlencode({"sql": sql, "_size": "max"})
    url = f"https://datasette.planning.data.gov.uk/digital-land.csv?{params}"
    return pd.read_csv(url)

In [None]:
def get_pdp_geo_dataset(dataset, underscore_cols=True, crs_out=27700):
    """Download a planning.data.gov.uk dataset as a GeoDataFrame.

    Parameters
    ----------
    dataset : str
        Dataset name, used to build the ``.geojson`` download URL.
    underscore_cols : bool, default True
        If True, hyphens in column names are replaced with underscores.
    crs_out : int, default 27700
        EPSG code that the returned geometries are projected to.

    Returns
    -------
    geopandas.GeoDataFrame
        The dataset, declared as EPSG:4326 and re-projected to ``crs_out``.
    """
    url = f"https://files.planning.data.gov.uk/dataset/{dataset}.geojson"
    gdf = gpd.read_file(url)

    if underscore_cols:
        gdf.columns = [col.replace("-", "_") for col in gdf.columns]

    # Cast the organisation key to int. Failure is reported rather than raised
    # (best-effort), but only the expected failure modes are caught: a missing
    # column (KeyError) or values that cannot be converted (ValueError/TypeError).
    try:
        gdf["organisation_entity"] = gdf["organisation_entity"].astype(int)
    except (KeyError, ValueError, TypeError) as err:
        print(f"issue with organisation_entity column: {err!r}")

    # Declare the source CRS, then project to the requested output CRS
    gdf = gdf.set_crs(epsg=4326).to_crs(epsg=crs_out)

    return gdf

### Data import

In [None]:
# get LAD to LPA lookup from github
# Columns are renamed by name rather than by position: `usecols` selects
# columns but does not control their order, so assigning a list of new
# names would silently mislabel them if the file's column order changed.
lookup_lad_lpa = pd.read_csv(
    "https://github.com/digital-land/organisation-collection/raw/main/data/local-authority.csv",
    usecols=["entity", "local-authority-district", "local-planning-authority"],
).rename(
    columns={
        "entity": "organisation_entity",
        "local-authority-district": "LADCD",
        "local-planning-authority": "LPACD",
    }
)

nrow(lookup_lad_lpa)
lookup_lad_lpa.head()

**Note on LAD to LPA mapping**   
Currently this [lookup file from github](https://github.com/digital-land/organisation-collection/raw/main/data/local-authority.csv) just records a 1:1 link between LADs and LPAs, but according to the ONS this relationship is actually 1:many. 
See [2020 lookup file](https://geoportal.statistics.gov.uk/datasets/ons::local-planning-authority-to-local-authority-district-april-2020-in-the-united-kingdom-lookup-1/about) and the example of Ryedale [`E07000167`], which is mapped to the following two LPAs:

* Ryedale LPA [`E60000061`]
* North York Moors National Park LPA [`E60000322`]

We need to agree some validation rules around this, i.e. can we expect Ryedale to submit data that might sit within either of these LPA areas, or for any London Boroughs to submit within the "London Legacy Development Corporation LPA" area?
For simplicity, and to get things up and running (as per Owen's advice), we will test with the existing 1:1 mapping for now and aim to develop the logic once there is more clarity about how multiple areas should be handled.

The git lookup file also seems to be missing some areas, e.g. "Peak District National Park Authority" entity 405.

In [None]:
# Organisation table from datasette; the query returns exactly these four
# columns in this order, so they are relabelled positionally.
lookup_org = get_all_organisations()
lookup_org.columns = [
    "organisation",
    "organisation_name",
    "organisation_entity",
    "statistical_geography",
]

# The organisation type is the part of the `organisation` value before the first ":"
lookup_org["organisation_type"] = lookup_org["organisation"].map(
    lambda org: org.partition(":")[0]
)

# Left join keeps every organisation, with LAD/LPA codes where the lookup has them
lookup_org = lookup_org.merge(lookup_lad_lpa, on="organisation_entity", how="left")

nrow(lookup_org)
lookup_org.head()

In [None]:
# Number of local-authority organisations that have no LADCD after the lookup join
len(
    lookup_org.loc[
        (lookup_org["LADCD"].isnull())
        & (lookup_org["organisation_type"] == "local-authority")
    ]
)

In [None]:
# LPA boundary data from planning.data.gov
# `reference` is renamed by name rather than by position: `usecols` does not
# control column order, so positional relabelling could silently swap columns.
LPA_boundary_df = pd.read_csv(
    "https://files.planning.data.gov.uk/dataset/local-planning-authority.csv",
    usecols=["reference", "name", "geometry"],
).rename(columns={"reference": "LPACD"})

# Parse the WKT strings into shapely geometries and create the GeoDataFrame
LPA_boundary_df["geometry"] = LPA_boundary_df["geometry"].apply(shapely.wkt.loads)
LPA_boundary_gdf = gpd.GeoDataFrame(LPA_boundary_df, geometry="geometry")

# Transform to EPSG:27700 for more interpretable area units
LPA_boundary_gdf.set_crs(epsg=4326, inplace=True)
LPA_boundary_gdf.to_crs(epsg=27700, inplace=True)

nrow(LPA_boundary_gdf)
LPA_boundary_gdf.head()


In [None]:
# LAD boundary data from planning.data.gov
# `reference` is renamed by name rather than by position: `usecols` does not
# control column order, so positional relabelling could silently swap columns.
LAD_boundary_df = pd.read_csv(
    "https://files.planning.data.gov.uk/dataset/local-authority-district.csv",
    usecols=["reference", "name", "geometry"],
).rename(columns={"reference": "LADCD"})

# Parse the WKT strings into shapely geometries and create the GeoDataFrame
LAD_boundary_df["geometry"] = LAD_boundary_df["geometry"].apply(shapely.wkt.loads)
LAD_boundary_gdf = gpd.GeoDataFrame(LAD_boundary_df, geometry="geometry")

# Transform to EPSG:27700 for more interpretable area units
LAD_boundary_gdf.set_crs(epsg=4326, inplace=True)
LAD_boundary_gdf.to_crs(epsg=27700, inplace=True)

nrow(LAD_boundary_gdf)
LAD_boundary_gdf.head()


In [None]:
# Local-authority organisations from the PDP organisation table
pdp_la_orgs = lookup_org.loc[lookup_org["organisation_type"] == "local-authority"]

# Keep only the organisations whose LAD code also appears in the LAD boundary dataset
LAD_valid = pd.merge(pdp_la_orgs, LAD_boundary_df, on="LADCD", how="inner")

print(f"n local-authority orgs in PDP organisation entity table: {len(pdp_la_orgs)}")
print(f"n local authorities in pdp local-authority-district dataset : {len(LAD_boundary_df)}")
print(f"n in both (by LAD code): {len(LAD_valid)}")

# Checking expected bounds of data  

In [None]:
def find_geos_beyond_border(geos, border):
    """Return the rows of ``geos`` whose geometry is disjoint from a border.

    Only the first row of ``border`` is used as the border geometry, so the
    caller is expected to pass a table that holds a single border.
    """
    border_geometry = border.iloc[0].geometry
    is_outside = geos.geometry.disjoint(border_geometry)
    return geos.loc[is_outside]

def check_against_border(geos_gdf, border_gdf, code_type):
    """Find geometries that lie outside the border of their own area code.

    Parameters
    ----------
    geos_gdf : GeoDataFrame
        Geometries to check; must contain the ``code_type`` column.
    border_gdf : GeoDataFrame
        Border geometries; must contain the ``code_type`` column.
    code_type : str
        Name of the area-code column shared by both tables (e.g. "LADCD").

    Returns
    -------
    GeoDataFrame
        Rows of ``geos_gdf`` flagged by ``find_geos_beyond_border``. Rows with
        a null code are never checked. An empty slice of ``geos_gdf`` is
        returned when there is nothing to check or nothing could be checked.
    """
    # distinct, non-null area codes present in the input geometries
    code_list = geos_gdf[code_type].dropna().drop_duplicates().to_list()

    results = []
    codes_without_border = []
    for code in code_list:
        border = border_gdf.loc[border_gdf[code_type] == code]
        if border.empty:
            # No border to compare against: skip rather than fail with an
            # opaque IndexError inside find_geos_beyond_border.
            codes_without_border.append(code)
            continue
        results.append(
            find_geos_beyond_border(geos_gdf[geos_gdf[code_type] == code], border)
        )

    if codes_without_border:
        print(
            f"No {code_type} border found for {len(codes_without_border)} code(s), "
            f"not checked: {codes_without_border}"
        )

    # pd.concat raises on an empty list, so return an empty frame with the same columns
    if not results:
        return geos_gdf.iloc[0:0]

    return pd.concat(results)


## Conservation areas

In [None]:
# Download the conservation-area dataset
ca_gdf = get_pdp_geo_dataset("conservation-area")
nrow(ca_gdf)

# Attach organisation details and LPA/LAD codes (left join on the organisation key)
ca_gdf = ca_gdf.merge(
    lookup_org.loc[:, ["organisation_name", "organisation_type", "organisation_entity", "LPACD", "LADCD"]],
    on="organisation_entity",
    how="left",
)
nrow(ca_gdf)

ca_gdf.head(2)

In [None]:
# Conservation areas that are disjoint from the LAD border of their organisation
ca_beyond_LAD = check_against_border(
    geos_gdf=ca_gdf, border_gdf=LAD_boundary_gdf, code_type="LADCD"
)
print(f"No. of entities not contained by their expected boundary: {len(ca_beyond_LAD):,}")

# Flagged entities per organisation, largest count first
(
    ca_beyond_LAD
    .groupby(["LPACD", "organisation_entity", "organisation_name"])
    .size()
    .sort_values(ascending=False)
    .reset_index(name="count")
)

In [None]:
# Conservation areas that are disjoint from the LPA border of their organisation
ca_beyond_LPA = check_against_border(
    geos_gdf=ca_gdf, border_gdf=LPA_boundary_gdf, code_type="LPACD"
)
print(f"No. of entities not contained by their expected boundary: {len(ca_beyond_LPA):,}")

# Flagged entities per organisation, largest count first
(
    ca_beyond_LPA
    .groupby(["LPACD", "organisation_entity", "organisation_name"])
    .size()
    .sort_values(ascending=False)
    .reset_index(name="count")
)

Checking conservation areas against LAD boundaries throws up issues with Babergh entities which are outside of the Babergh area.
However, using LPA boundaries throws issues with other LPAs which have part of their admin area covered by a development corporation or a national park, and so the LPA boundary doesn't cover the full extent of the admin area.

See map below, which shows areas outside of LPA boundaries alongside the boundaries themselves.

In [None]:
# Plot all conservation areas outside of their expected LPA boundary (red),
# alongside the LPA boundaries themselves (blue outline, no fill)
map_entities = ca_beyond_LPA.explore(
    tiles="CartoDB positron",
    color="red",
    style_kwds={"fillOpacity": "0.1"},
    popup=["organisation_name", "entity", "name", "reference"],
    tooltip=False,
)

# Only boundaries of LPAs that have at least one flagged entity are drawn
LPA_boundary_gdf.loc[LPA_boundary_gdf["LPACD"].isin(ca_beyond_LPA["LPACD"])].explore(
    m=map_entities,
    color="blue",
    style_kwds={"fillOpacity": "0"},
    popup=["name", "LPACD"],
    tooltip=False,
)

In [None]:
# Map showing issues with Hammersmith & Fulham, and Ealing entities which fall
# inside a development corporation LPA boundary

# Base layer: the three LPA boundaries of interest (blue outline, no fill)
map_bounds = LPA_boundary_gdf.loc[
    LPA_boundary_gdf["LPACD"].isin(["E60000191", "E60000208", "E60000330"])
].explore(
    tiles="CartoDB positron",
    color="blue",
    style_kwds={"fillOpacity": "0"},
    popup=["name", "LPACD"],
    tooltip=False,
)

# Overlay: flagged conservation areas for two of those LPA codes (red)
ca_beyond_LPA.loc[ca_beyond_LPA["LPACD"].isin(["E60000191", "E60000208"])].explore(
    m=map_bounds,
    color="red",
    style_kwds={"fillOpacity": "0.1"},
    popup=["organisation_name", "entity", "name", "reference"],
    tooltip=False,
)

## Listed Building Outlines

In [None]:
# Download the listed-building-outline dataset
lbo_gdf = get_pdp_geo_dataset("listed-building-outline")
nrow(lbo_gdf)

# Attach organisation details and LPA/LAD codes (left join on the organisation key)
lbo_gdf = lbo_gdf.merge(
    lookup_org.loc[:, ["organisation_name", "organisation_type", "organisation_entity", "LPACD", "LADCD"]],
    on="organisation_entity",
    how="left",
)
nrow(lbo_gdf)

lbo_gdf.head(2)

In [None]:
# Listed building outlines that are disjoint from the LAD border of their organisation
lbo_beyond_LAD = check_against_border(
    geos_gdf=lbo_gdf, border_gdf=LAD_boundary_gdf, code_type="LADCD"
)
print(f"No. of entities not contained by their expected boundary: {len(lbo_beyond_LAD):,}")

# Flagged entities per organisation, largest count first
(
    lbo_beyond_LAD
    .groupby(["LPACD", "organisation_entity", "organisation_name"])
    .size()
    .sort_values(ascending=False)
    .reset_index(name="count")
)

In [None]:
# Listed building outlines that are disjoint from the LPA border of their organisation
lbo_beyond_LPA = check_against_border(
    geos_gdf=lbo_gdf, border_gdf=LPA_boundary_gdf, code_type="LPACD"
)
print(f"No. of entities not contained by their expected boundary: {len(lbo_beyond_LPA):,}")

# Flagged entities per organisation, largest count first
(
    lbo_beyond_LPA
    .groupby(["LPACD", "organisation_entity", "organisation_name"])
    .size()
    .sort_values(ascending=False)
    .reset_index(name="count")
)

In [None]:
# Plot all listed building outlines outside of their expected LAD boundary (red),
# alongside the LAD boundaries themselves (blue outline, no fill)
map_entities = lbo_beyond_LAD.explore(
    tiles="CartoDB positron",
    color="red",
    style_kwds={"fillOpacity": "0.1"},
    popup=["organisation_name", "entity", "name", "reference"],
    tooltip=False,
)

# Only boundaries of LADs that have at least one flagged entity are drawn
LAD_boundary_gdf.loc[LAD_boundary_gdf["LADCD"].isin(lbo_beyond_LAD["LADCD"])].explore(
    m=map_entities,
    color="blue",
    style_kwds={"fillOpacity": "0"},
    popup=["name", "LADCD"],
    tooltip=False,
)

## Article 4 direction areas

In [None]:
# Download the article-4-direction-area dataset
afa_gdf = get_pdp_geo_dataset("article-4-direction-area")
nrow(afa_gdf)

# Attach organisation details and LPA/LAD codes (left join on the organisation key)
afa_gdf = afa_gdf.merge(
    lookup_org.loc[:, ["organisation_name", "organisation_type", "organisation_entity", "LPACD", "LADCD"]],
    on="organisation_entity",
    how="left",
)
nrow(afa_gdf)

afa_gdf.head(2)

In [None]:
# Article 4 direction areas that are disjoint from the LAD border of their organisation
afa_beyond_LAD = check_against_border(
    geos_gdf=afa_gdf, border_gdf=LAD_boundary_gdf, code_type="LADCD"
)
print(f"No. of entities not contained by their expected boundary: {len(afa_beyond_LAD):,}")

# Flagged entities per organisation, largest count first
(
    afa_beyond_LAD
    .groupby(["LPACD", "organisation_entity", "organisation_name"])
    .size()
    .sort_values(ascending=False)
    .reset_index(name="count")
)

In [None]:
# Article 4 direction areas that are disjoint from the LPA border of their organisation
afa_beyond_LPA = check_against_border(
    geos_gdf=afa_gdf, border_gdf=LPA_boundary_gdf, code_type="LPACD"
)
print(f"No. of entities not contained by their expected boundary: {len(afa_beyond_LPA):,}")

# Flagged entities per organisation, largest count first
(
    afa_beyond_LPA
    .groupby(["LPACD", "organisation_entity", "organisation_name"])
    .size()
    .sort_values(ascending=False)
    .reset_index(name="count")
)

In [None]:
# Plot all article 4 direction areas outside of their expected LPA boundary (red),
# alongside the LPA boundaries themselves (blue outline, no fill)
map_entities = afa_beyond_LPA.explore(
    tiles="CartoDB positron",
    color="red",
    style_kwds={"fillOpacity": "0.1"},
    popup=["organisation_name", "entity", "name", "reference"],
    tooltip=False,
)

# Only boundaries of LPAs that have at least one flagged entity are drawn
LPA_boundary_gdf.loc[LPA_boundary_gdf["LPACD"].isin(afa_beyond_LPA["LPACD"])].explore(
    m=map_entities,
    color="blue",
    style_kwds={"fillOpacity": "0"},
    popup=["name", "LPACD"],
    tooltip=False,
)

## Trees

In [None]:
# Download the tree dataset
tree_gdf = get_pdp_geo_dataset("tree")
nrow(tree_gdf)

# Attach organisation details and LPA/LAD codes (left join on the organisation key)
tree_gdf = tree_gdf.merge(
    lookup_org.loc[:, ["organisation_name", "organisation_type", "organisation_entity", "LPACD", "LADCD"]],
    on="organisation_entity",
    how="left",
)
nrow(tree_gdf)

tree_gdf.head(2)

In [None]:
# Trees that are disjoint from the LAD border of their organisation
tree_beyond_LAD = check_against_border(
    geos_gdf=tree_gdf, border_gdf=LAD_boundary_gdf, code_type="LADCD"
)
print(f"No. of entities not contained by their expected boundary: {len(tree_beyond_LAD):,}")

# Flagged entities per organisation, largest count first
(
    tree_beyond_LAD
    .groupby(["LPACD", "organisation_entity", "organisation_name"])
    .size()
    .sort_values(ascending=False)
    .reset_index(name="count")
)

In [None]:
# Trees that are disjoint from the LPA border of their organisation
tree_beyond_LPA = check_against_border(
    geos_gdf=tree_gdf, border_gdf=LPA_boundary_gdf, code_type="LPACD"
)
print(f"No. of entities not contained by their expected boundary: {len(tree_beyond_LPA):,}")

# Flagged entities per organisation, largest count first
(
    tree_beyond_LPA
    .groupby(["LPACD", "organisation_entity", "organisation_name"])
    .size()
    .sort_values(ascending=False)
    .reset_index(name="count")
)

None of the organisations with issues here are those which have big differences between LAD and LPA boundaries, so these small differences in the number of issues with each boundary type appear to be caused by very minor variations in the LAD vs. LPA boundary.

The main result to note is that there are a high number of issues with Barnet trees being outside of the LAD boundary.

In [None]:
# Plot all trees outside of their expected LAD boundary (red),
# alongside the LAD boundaries themselves (blue outline, no fill)
map_entities = tree_beyond_LAD.explore(
    tiles="CartoDB positron",
    color="red",
    style_kwds={"fillOpacity": "0.1"},
    popup=["organisation_name", "entity", "name", "reference"],
    tooltip=False,
)

# Only boundaries of LADs that have at least one flagged entity are drawn
LAD_boundary_gdf.loc[LAD_boundary_gdf["LADCD"].isin(tree_beyond_LAD["LADCD"])].explore(
    m=map_entities,
    color="blue",
    style_kwds={"fillOpacity": "0"},
    popup=["name", "LADCD"],
    tooltip=False,
)

In [None]:
# Plot all trees outside of their expected LPA boundary (red),
# alongside the LPA boundaries themselves (blue outline, no fill)
map_entities = tree_beyond_LPA.explore(
    tiles="CartoDB positron",
    color="red",
    style_kwds={"fillOpacity": "0.1"},
    popup=["organisation_name", "entity", "name", "reference"],
    tooltip=False,
)

# Only boundaries of LPAs that have at least one flagged entity are drawn
LPA_boundary_gdf.loc[LPA_boundary_gdf["LPACD"].isin(tree_beyond_LPA["LPACD"])].explore(
    m=map_entities,
    color="blue",
    style_kwds={"fillOpacity": "0"},
    popup=["name", "LPACD"],
    tooltip=False,
)

## Tree preservation zones

In [None]:
# Download the tree-preservation-zone dataset
tpz_gdf = get_pdp_geo_dataset("tree-preservation-zone")
nrow(tpz_gdf)

# Attach organisation details and LPA/LAD codes (left join on the organisation key)
tpz_gdf = tpz_gdf.merge(
    lookup_org.loc[:, ["organisation_name", "organisation_type", "organisation_entity", "LPACD", "LADCD"]],
    on="organisation_entity",
    how="left",
)
nrow(tpz_gdf)

tpz_gdf.head(2)

In [None]:
# Tree preservation zones that are disjoint from the LAD border of their organisation
tpz_beyond_LAD = check_against_border(
    geos_gdf=tpz_gdf, border_gdf=LAD_boundary_gdf, code_type="LADCD"
)
print(f"No. of entities not contained by their expected boundary: {len(tpz_beyond_LAD):,}")

# Flagged entities per organisation, largest count first
(
    tpz_beyond_LAD
    .groupby(["LPACD", "organisation_entity", "organisation_name"])
    .size()
    .sort_values(ascending=False)
    .reset_index(name="count")
)

In [None]:
# Tree preservation zones that are disjoint from the LPA border of their organisation
tpz_beyond_LPA = check_against_border(
    geos_gdf=tpz_gdf, border_gdf=LPA_boundary_gdf, code_type="LPACD"
)
print(f"No. of entities not contained by their expected boundary: {len(tpz_beyond_LPA):,}")

# Flagged entities per organisation, largest count first
(
    tpz_beyond_LPA
    .groupby(["LPACD", "organisation_entity", "organisation_name"])
    .size()
    .sort_values(ascending=False)
    .reset_index(name="count")
)

Here the results are similar to those for trees, where the results are largely the same for LAD vs. LPA as these are all organisations where those borders don't differ much. Again, there seems to be a bigger issue with data from Barnet.

In [None]:
# Plot all tree preservation zones outside of their expected LAD boundary (red),
# alongside the LAD boundaries themselves (blue outline, no fill)
map_entities = tpz_beyond_LAD.explore(
    tiles="CartoDB positron",
    color="red",
    style_kwds={"fillOpacity": "0.1"},
    popup=["organisation_name", "entity", "name", "reference"],
    tooltip=False,
)

# Only boundaries of LADs that have at least one flagged entity are drawn
LAD_boundary_gdf.loc[LAD_boundary_gdf["LADCD"].isin(tpz_beyond_LAD["LADCD"])].explore(
    m=map_entities,
    color="blue",
    style_kwds={"fillOpacity": "0"},
    popup=["name", "LADCD"],
    tooltip=False,
)

In [None]:
# Plot all tree preservation zones outside of their expected LPA boundary (red),
# alongside the LPA boundaries themselves (blue outline, no fill).
# Uses tpz_beyond_LPA: this section previously re-plotted the tree results
# (tree_beyond_LPA) because of a copy-paste slip.
map_entities = tpz_beyond_LPA.explore(
    color = "red",
    tooltip = False,
    popup = ["organisation_name", "entity", "name", "reference"],
    tiles = "CartoDB positron",  # use "CartoDB positron" tiles
    style_kwds = {
        "fillOpacity" : "0.1"
    }
)

# Only boundaries of LPAs that have at least one flagged entity are drawn
LPA_boundary_gdf[LPA_boundary_gdf["LPACD"].isin(tpz_beyond_LPA["LPACD"])].explore(
    m = map_entities,
    color = "blue",
    tooltip = False,
    popup = ["name", "LPACD"],
    style_kwds = {
        "fillOpacity" : "0"
    }
)