Notebook to get the Local Authority (LA) names for conservation area (CA) entities on planning.data.gov.uk based on their location rather than their organisation. This is being used by Mark and Dave in their data-matching work.

In [1]:
import pandas as pd
import geopandas as gpd
import shapely.wkt
import urllib
import numpy as np
import os

In [2]:
# Folder that receives the files this notebook exports; create it up front
# (a no-op when it already exists).
output_dir = "output/"
os.makedirs(name=output_dir, exist_ok=True)

In [3]:
def get_all_organisations():
    """Fetch the organisation table from the planning.data.gov.uk datasette.

    Runs a SQL query against the ``digital-land`` database's CSV endpoint and
    loads the response with pandas.

    Returns
    -------
    pandas.DataFrame
        One row per record returned by the query, with the columns
        ``organisation_entity`` (the ``entity`` column, aliased), ``name``,
        ``organisation``, ``dataset`` and ``local_planning_authority``.
    """
    # Import the submodule explicitly: a bare `import urllib` does not guarantee
    # that `urllib.parse` is loaded, so `urllib.parse.urlencode` only works when
    # some other import happens to have pulled the submodule in already.
    from urllib.parse import urlencode

    # Plain string (no f-prefix): the query has no placeholders to interpolate.
    params = urlencode({
        "sql": """
        select entity as organisation_entity, name, organisation, dataset, local_planning_authority
        from organisation
        """,
        "_size": "max"
        })
    url = f"https://datasette.planning.data.gov.uk/digital-land.csv?{params}"
    df = pd.read_csv(url)
    return df

def get_pdp_geo_dataset(dataset, underscore_cols=True, crs_out=27700):
    """Download a planning.data.gov.uk dataset as GeoJSON and reproject it.

    Parameters
    ----------
    dataset : str
        Dataset name, inserted into the files.planning.data.gov.uk URL.
    underscore_cols : bool, default True
        When true, hyphens in the column names are replaced with underscores.
    crs_out : int, default 27700
        EPSG code the returned frame is projected to.

    Returns
    -------
    geopandas.GeoDataFrame
        The dataset, declared as EPSG:4326 and then transformed to ``crs_out``.
    """
    source_url = f"https://files.planning.data.gov.uk/dataset/{dataset}.geojson"
    frame = gpd.read_file(source_url)

    if underscore_cols:
        frame.columns = [col.replace("-", "_") for col in frame.columns]

    # Declare the source CRS as EPSG:4326 first, then reproject to the requested one.
    return frame.set_crs(epsg=4326).to_crs(epsg=crs_out)

## Data in

In [4]:
# Load the organisation table and report how many rows came back
org_df = get_all_organisations()
print(org_df.shape[0])

452


In [41]:
# Conservation areas (CA) from planning.data.gov.uk
ca_in = pd.read_csv(
    "https://files.planning.data.gov.uk/dataset/conservation-area.csv",
    usecols=["entity", "name", "organisation-entity", "reference", "entry-date", "point", "geometry"],
)

# Keep only the rows that have a point, then tidy the column names
ca_df = ca_in.loc[ca_in["point"].notna()].copy()
ca_df.columns = [col.replace("-", "_") for col in ca_df.columns]
ca_df = ca_df.rename(columns={"name": "entity_name"})

# Parse the WKT points and use them as the active geometry column
ca_df["point"] = ca_df["point"].map(shapely.wkt.loads)
ca_gdf = gpd.GeoDataFrame(ca_df, geometry="point")

# Declare the points as EPSG:4326, then transform to EPSG:27700 for more interpretable area units
ca_gdf = ca_gdf.set_crs(epsg=4326).to_crs(epsg=27700)

In [6]:
# Local planning authority (LPA) boundaries from the PDP site, with the
# generic "name" column relabelled so it stays identifiable after joins
lpa_gdf = get_pdp_geo_dataset("local-planning-authority").rename(columns={"name": "lpa_name"})

print(lpa_gdf.shape[0])

337


## Analysis

### Spatial joining - LPA boundaries to conservation area points

In [42]:
# Step 1: spatially match each LPA polygon with the conservation-area points it intersects
lpa_ca_join = lpa_gdf[["reference", "lpa_name", "geometry"]].sjoin(
    ca_gdf[["entity", "entity_name", "organisation_entity", "point"]],
    how="inner",
    predicate="intersects",
)

print(len(lpa_ca_join))

# Step 2: attach the organisation (LA) name whose local_planning_authority
# code equals the LPA reference, then relabel columns and make entity an int
lpa_ca_la_join = (
    lpa_ca_join
    .merge(
        org_df[["name", "local_planning_authority"]],
        how="inner",
        left_on="reference",
        right_on="local_planning_authority",
    )
    .rename(columns={"reference": "lpa_reference", "name": "la_name"})
    .assign(entity=lambda joined: joined["entity"].astype(int))
)

print(len(lpa_ca_la_join))

lpa_ca_la_join.head()


9169
9091


Unnamed: 0,lpa_reference,lpa_name,geometry,index_right,entity,entity_name,organisation_entity,la_name,local_planning_authority
0,E60000001,County Durham LPA,"POLYGON ((428366.003 554230.393, 428288.990 55...",4462,44004818,Aycliffe Village,16,Durham County Council,E60000001
1,E60000001,County Durham LPA,"POLYGON ((428366.003 554230.393, 428288.990 55...",4466,44004822,Mordon,16,Durham County Council,E60000001
2,E60000001,County Durham LPA,"POLYGON ((428366.003 554230.393, 428288.990 55...",4467,44004823,Hardwick Park,16,Durham County Council,E60000001
3,E60000001,County Durham LPA,"POLYGON ((428366.003 554230.393, 428288.990 55...",5161,44005530,Bishop Middleham,16,Durham County Council,E60000001
4,E60000001,County Durham LPA,"POLYGON ((428366.003 554230.393, 428288.990 55...",4465,44004821,Mainsforth,16,Durham County Council,E60000001


In [45]:
# Write the CA -> LPA / LA lookup. The path is built from output_dir (the folder
# created at the top of the notebook) instead of repeating "output/" as a literal,
# so the directory that gets created and the directory written to cannot drift apart.
lpa_ca_la_join[["entity", "entity_name", "lpa_reference", "lpa_name", "la_name"]].to_csv(
    os.path.join(output_dir, "ca_la_lpa_lookup_2024-09-30.csv"),
    index=False,
)