# Brownfield land entities located outside the UK boundary
**Author**:  Greg Slater <br>
**Date**:  24th September 2024 <br>
**Dataset Scope**: `brownfield-land` <br>
**Report Type**: Ad-hoc analysis <br>

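## Purpose

Identify `brownfield-land` entities whose `point` does not intersect the UK country boundaries (i.e. the point is in the sea), count them per organisation, and export those that are end-dated or listed in the old-entities file as candidates for retirement.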


In [2]:
import os
import urllib
import urllib.parse
from datetime import datetime

import geopandas as gpd
import numpy as np
import pandas as pd
import shapely
import shapely.wkt
# from sqlite_query_functions import DatasetSqlite

# Display up to 100 rows when a DataFrame is rendered in a cell output.
pd.set_option("display.max_rows", 100)

# Working directory for this analysis' input/output files; created if missing.
data_dir = "../../data/deleted_entities/"
os.makedirs(data_dir, exist_ok=True)

# Today's date as YYYY-MM-DD, for stamping output filenames.
td = f"{datetime.today():%Y-%m-%d}"
# fn = os.path.join(data_dir, f"FILENAME_{td}.csv")


In [3]:
def get_pdp_dataset(dataset, geometry_field = "geometry", crs_out=4326, underscore_cols=True):
    """Download a planning.data.gov.uk dataset CSV and return it as a GeoDataFrame.

    Rows whose ``geometry_field`` is null are dropped, because they cannot be
    parsed as WKT.

    Parameters
    ----------
    dataset : str
        Dataset name, used to build
        ``https://files.planning.data.gov.uk/dataset/{dataset}.csv``.
    geometry_field : str, default "geometry"
        Column holding WKT text to use as the active geometry. It must match
        the column name *after* any hyphen-to-underscore renaming.
    crs_out : int, default 4326
        EPSG code of the CRS the returned GeoDataFrame is projected to
        (e.g. 27700 for British National Grid, which gives metre units).
    underscore_cols : bool, default True
        When True, hyphens in column names are replaced with underscores
        (``organisation-entity`` -> ``organisation_entity``). When False the
        original column names are kept.

    Returns
    -------
    geopandas.GeoDataFrame
        The dataset rows with a non-null geometry, in EPSG:``crs_out``.

    Raises
    ------
    KeyError
        If ``geometry_field`` is not a column of the downloaded dataset.
    """
    df = pd.read_csv(
        f"https://files.planning.data.gov.uk/dataset/{dataset}.csv",
        # nullable integer so that missing organisation ids do not force the column to float
        dtype={"organisation-entity": pd.Int64Dtype()},
    )

    if underscore_cols:
        df.columns = [col.replace("-", "_") for col in df.columns]

    if geometry_field not in df.columns:
        raise KeyError(
            f"geometry_field {geometry_field!r} not found in dataset {dataset!r}; "
            f"available columns: {list(df.columns)}"
        )

    # WKT parsing fails on NaN, so keep only rows that actually have a geometry
    df_valid_geom = df[df[geometry_field].notnull()].copy()
    df_valid_geom[geometry_field] = df_valid_geom[geometry_field].apply(shapely.wkt.loads)

    # The source coordinates are treated as WGS84 (EPSG:4326) and then
    # reprojected to the requested output CRS.
    gdf = gpd.GeoDataFrame(df_valid_geom, geometry=geometry_field, crs="EPSG:4326")

    return gdf.to_crs(epsg=crs_out)

In [4]:
def get_all_organisations():
    """Fetch the organisation lookup table from the planning.data.gov.uk Datasette.

    Runs a SQL query against the ``organisation`` table of the ``digital-land``
    database through the CSV endpoint.

    Returns
    -------
    pandas.DataFrame
        One row per organisation with columns ``organisation_entity``
        (nullable Int64), ``org_name``, ``organisation``, ``org_type`` and
        ``end_date``.
    """
    # NOTE(review): `_size=max` asks for the largest page the server allows;
    # presumably the organisation table fits in one page - confirm, otherwise
    # the result is silently truncated.
    params = urllib.parse.urlencode({
        "sql": """
        select entity as organisation_entity, name as org_name, organisation, dataset as org_type, end_date
        from organisation
        """,
        "_size": "max"
        })
    url = f"https://datasette.planning.data.gov.uk/digital-land.csv?{params}"
    # nullable integer so the key matches `organisation_entity` as loaded by get_pdp_dataset
    return pd.read_csv(url, dtype={"organisation_entity": pd.Int64Dtype()})

## Data Import

In [5]:
# Organisation lookup (organisation_entity -> org_name / organisation), merged onto the per-organisation counts in the Analysis section.
lookup_org = get_all_organisations()

In [29]:
# Brownfield-land entities with the `point` column as geometry, reprojected to EPSG:27700.
# Rows with a null `point` are dropped by get_pdp_dataset.
bfl_gdf = get_pdp_dataset("brownfield-land", geometry_field="point", crs_out=27700)

In [7]:
# UK boundary layer, read from a GeoPackage in the notebook's working directory.
# NOTE(review): judging by the filename this is the ONS "Countries (December 2023) Boundaries UK BFC"
# download - confirm the source and record where to obtain it, since the file is not fetched by this notebook.
uk_gdf = gpd.read_file("Countries_December_2023_Boundaries_UK_BFC_-7514124880420163797.gpkg")

In [32]:
# Pre-prepared list of "old" entities, read from data_dir; it must contain an `entity` column,
# which is matched against bfl_gdf["entity"] when flagging old entities.
# NOTE(review): how this file was produced is not shown in this notebook - confirm provenance.
old_ents_df = pd.read_csv(os.path.join(data_dir, "test - old ents - 2025-01-23.csv"))

In [33]:
# An entity is "old" when it is listed in the old-entities file OR it has an end date.
bfl_gdf["old_entity"] = (
    bfl_gdf["entity"].isin(old_ents_df["entity"])
    | bfl_gdf["end_date"].notnull()
)

In [None]:
# Number of brownfield-land entities flagged as old (True) vs not (False).
bfl_gdf["old_entity"].value_counts()

## Analysis

In [None]:
# Spatial join between the UK extent and the BFL points.
# The UK layer is reprojected to the points' CRS first: sjoin compares raw
# coordinates, so mismatched CRSs would give meaningless results. This is a
# no-op when both layers already share a CRS.
bfl_uk = gpd.sjoin(
    bfl_gdf,
    uk_gdf.to_crs(bfl_gdf.crs),
    how="inner",
    predicate="intersects",
)

# "beyond" = BFL points that did not intersect any UK polygon, i.e. whose
# entity is absent from the joined table
bfl_beyond = bfl_gdf[~bfl_gdf["entity"].isin(bfl_uk["entity"])]

# row counts: all points, joined rows, points outside the UK
print(len(bfl_gdf))
print(len(bfl_uk))
print(len(bfl_beyond))

In [None]:
# Of the points outside the UK boundary, how many are flagged as old (True) vs not (False).
bfl_beyond["old_entity"].value_counts()

In [None]:
# Number of beyond-UK entities per organisation, labelled with the organisation's
# name and code. The inner merge drops organisations missing from lookup_org.
bfl_beyond_org_count = (
    bfl_beyond
    .groupby(["organisation_entity"], as_index=False)
    .agg("size")
    .merge(
        lookup_org[["organisation_entity", "org_name", "organisation"]],
        on="organisation_entity",
        how="inner",
    )
)

# Top 20 organisations by number of beyond-UK entities
bfl_beyond_org_count.sort_values("size", ascending=False).head(20)

In [None]:
# count n beyond UK per org
bfl_beyond_org_count = bfl_beyond.groupby(["organisation_entity", "old_entity"], as_index=False).agg(
        "size"
    ).merge(
        lookup_org[["organisation_entity", "org_name", "organisation"]],
        how = "inner",
        on = "organisation_entity"
    )

# bfl_beyond_org_count.sort_values("size", ascending=False).to_csv(f"bfl_beyond_uk_by_org-{td}.csv", index = False)
bfl_beyond_org_count[bfl_beyond_org_count["old_entity"] == False].sort_values("size", ascending=False).head(20)

In [None]:
# bfl_beyond_org_count[bfl_beyond_org_count["old_entity"] == True].sort_values("size", ascending=False).head(20)

In [None]:
# Inspect the beyond-UK entities of a single organisation (organisation_entity 315).
# To look at current entities only, additionally require bfl_beyond["old_entity"] == False.
bfl_beyond.loc[
    bfl_beyond["organisation_entity"] == 315,
    ["entity", "reference", "name", "point", "site_address", "end_date", "old_entity"],
]

In [None]:
# Candidates for retirement: beyond-UK entities flagged as old, labelled with
# the organisation's name and code from lookup_org.
# NOTE(review): the later use of `organisation_y` implies bfl_beyond already has an
# `organisation` column, so the merge suffixes both as `organisation_x` / `organisation_y` - confirm.
old_beyond = bfl_beyond[bfl_beyond["old_entity"]]
bfl_to_retire = old_beyond.merge(
    lookup_org[["organisation_entity", "org_name", "organisation"]],
    on="organisation_entity",
    how="inner",
)

bfl_to_retire.head()

In [22]:
# Build a comma-separated retire line per entity: "<entity>,410,,retire <organisation> entity which is <reason>,,,".
# The reason depends on whether the entity has an end date.
# NOTE(review): the meaning of the literal 410 and of the empty fields is not visible in this notebook - confirm with the consumer of these lines.
retire_prefix = bfl_to_retire["entity"].astype(str) + ",410,,retire " + bfl_to_retire["organisation_y"]

bfl_to_retire["old_entity_message"] = np.where(
    bfl_to_retire["end_date"].notnull(),
    retire_prefix + " entity which is end-dated and is in the sea,,,",
    retire_prefix + " entity which is not on latest resource and is in the sea,,,",
)

# Written to the notebook's working directory (not data_dir)
bfl_to_retire.to_csv("bfl_entities_to_retire.csv", index = False)

In [None]:
# Interactive map of the entities to retire, as a visual check of where their points fall.
bfl_to_retire.explore()

In [None]:
# Spot check: all brownfield-land entities whose site address mentions "Port Royal"
# (rows with a missing address are treated as non-matching).
bfl_gdf.loc[bfl_gdf["site_address"].str.contains("Port Royal", na=False)]