In [1]:
import logging
import urllib
import urllib.parse

import geopandas as gpd
import numpy as np
import pandas as pd
import shapely.wkt

pd.set_option("display.max_rows", 100)


### Functions

In [2]:
def nrow(df):
    """Print the number of records in ``df``, formatted with thousands separators.

    Returns
    -------
    None
        The function only prints; nothing useful is returned.
    """
    record_count = len(df)
    print(f"No. of records in df: {record_count:,}")
    return None


def plot_issues_map(gdf:gpd.GeoDataFrame, entity_list, chloro_var, palette):
    """Draw an interactive map of selected entities, coloured by one column.

    Parameters
    ----------
    gdf : gpd.GeoDataFrame
        Layer to plot. Must contain an "entity" column (used for filtering) and
        the popup columns "org_name", "name" and "entry_date".
    entity_list : list
        Values of "entity" to keep; all other rows are left off the map.
    chloro_var : str
        Column whose values drive the choropleth colouring.
    palette : str
        Colour map, passed to ``explore`` as ``cmap``.

    Returns
    -------
    The map object returned by ``GeoDataFrame.explore``.

    Raises
    ------
    TypeError
        If ``gdf`` is not a GeoDataFrame.
    """
    if not isinstance(gdf, gpd.GeoDataFrame):
        # Fail here with a clear message instead of continuing into an
        # unrelated error further down.
        logging.error('input is not a GeodataFrame')
        raise TypeError(f"gdf must be a GeoDataFrame, got {type(gdf).__name__}")

    selected = gdf[gdf["entity"].isin(entity_list)]

    base = selected.explore(
        column = chloro_var,  # make choropleth based on the `chloro_var` column
        cmap = palette,
        tooltip = False,
        popup = ["org_name", "entity", "name", "entry_date"],
        tiles = "CartoDB positron",  # use "CartoDB positron" tiles
        highlight = True,
        style_kwds = {
            "fillOpacity" : 0.1  # numeric opacity rather than the string "0.1"
        }
    )

    return base

def get_issue_entities(issues_df):
    """Return the distinct entities that appear on either side of an intersection.

    Values from "entity_1" are stacked on top of those from "entity_2" and
    duplicates are dropped, keeping the first occurrence, so the result is in
    order of first appearance.
    """
    both_sides = pd.concat([issues_df["entity_1"], issues_df["entity_2"]])
    distinct_entities = both_sides.drop_duplicates()
    return distinct_entities.to_list()


def get_all_organisations():
    """Download the organisation lookup table from the planning data datasette.

    Returns
    -------
    pd.DataFrame
        One row per record in the `organisation` table, with the columns
        aliased by the query: org, org_entity, org_name, org_dataset, LPACD.

    Notes
    -----
    Makes a network request; `pd.read_csv` raises if the URL cannot be read.
    """
    # Plain string: nothing is interpolated, so no f-prefix is needed.
    sql = """
        select organisation as org, entity as org_entity, name as org_name, dataset as org_dataset, local_planning_authority as LPACD
        from organisation
        """
    # `_size=max` is forwarded to datasette as-is (presumably requests the
    # largest page the server allows; confirm against the datasette docs).
    params = urllib.parse.urlencode({
        "sql": sql,
        "_size": "max"
        })
    url = f"https://datasette.planning.data.gov.uk/digital-land.csv?{params}"
    df = pd.read_csv(url)
    return df


def get_pdp_geo_dataset(dataset, underscore_cols=True, crs_out=27700):
    """Read a dataset's GeoJSON from files.planning.data.gov.uk as a GeoDataFrame.

    Parameters
    ----------
    dataset : str
        Dataset name, interpolated into the download URL.
    underscore_cols : bool, default True
        When true, hyphens in column names are replaced with underscores.
    crs_out : int, default 27700
        EPSG code the returned frame is reprojected to.

    Returns
    -------
    gpd.GeoDataFrame
        The dataset, declared as EPSG:4326 and then reprojected to `crs_out`.
    """
    source_url = f"https://files.planning.data.gov.uk/dataset/{dataset}.geojson"
    gdf = gpd.read_file(source_url)

    if underscore_cols:
        gdf.columns = [col.replace("-", "_") for col in gdf.columns]

    # Declare the source CRS first, then reproject to the requested one.
    return gdf.set_crs(epsg=4326).to_crs(epsg=crs_out)


def get_provisions():
    """Download the "expected" open-digital-planning provisions from datasette.

    Returns
    -------
    pd.DataFrame
        Columns cohort, notes, organisation, project, provision_reason and
        start_date, grouped by organisation and ordered by cohort.

    Notes
    -----
    * Makes a network request; `pd.read_csv` raises if the URL cannot be read.
    * The result is also stored in the module-level name `provisions_df`.
      That side effect is kept for backward compatibility; prefer using the
      return value.
    * NOTE(review): the query selects non-aggregated columns alongside
      GROUP BY organisation, so which row's values are returned for each
      organisation is left to the database. Confirm this is intended.
    """
    global provisions_df
    # String values use single quotes: SQLite resolves double-quoted tokens as
    # identifiers first and only falls back to a string literal when no such
    # column exists, which is fragile. No f-prefix, as nothing is interpolated.
    sql = """
            SELECT
                cohort, notes, organisation, project, provision_reason, start_date
            FROM
                provision
            WHERE
                provision_reason = 'expected'
                AND project = 'open-digital-planning'
            GROUP BY organisation
            ORDER BY cohort
        """
    params = urllib.parse.urlencode({
        "sql": sql,
        "_size": "max"
    })
    url = f"https://datasette.planning.data.gov.uk/digital-land.csv?{params}"
    provisions_df = pd.read_csv(url)
    return provisions_df

### Data import

In [3]:
# provisions table
provisions_df = get_provisions()

# organisation lookup
org_df = get_all_organisations()

# flag ODP: True where the organisation appears in the provisions table
org_df["odp_flag"] = org_df["org"].isin(provisions_df["organisation"])

print(len(org_df))

452


In [4]:
# CA (conservation areas) from the PDP bulk CSV download; only the listed columns are read
ca_df = pd.read_csv("https://files.planning.data.gov.uk/dataset/conservation-area.csv",
                            usecols = ["entity", "name", "organisation-entity", "reference", "entry-date", "point", "geometry"])

# replace hyphens in column names with underscores
ca_df.columns = [x.replace("-", "_") for x in ca_df.columns]

# rename so the column matches the join key used in org_df
ca_df.rename(columns={"organisation_entity":"org_entity"}, inplace=True)

# join organisation name and LPA codes from lookup (left join: every CA row is kept)
ca_df = ca_df.merge(
    org_df[["org_entity", "org_name", "org_dataset",  "LPACD"]], 
    how = "left",
    on = "org_entity")

# load to gdf, both point and poly versions
# the "point" column is parsed from WKT text and becomes the active geometry of ca_point_gdf
ca_df["point"] = ca_df["point"].apply(shapely.wkt.loads)
ca_point_gdf = gpd.GeoDataFrame(ca_df, geometry="point")

# NOTE(review): ca_df is modified in place after ca_point_gdf was built from it, so the
# statement order here matters; whether the two frames share data depends on the
# pandas/geopandas copy behaviour - confirm before reordering or refactoring.
ca_df.drop("point", axis=1, inplace=True)

# the "geometry" column is parsed from WKT text and becomes the active geometry of ca_poly_gdf
ca_df["geometry"] = ca_df["geometry"].apply(shapely.wkt.loads)
ca_poly_gdf = gpd.GeoDataFrame(ca_df, geometry="geometry")

# Transform to EPSG:27700 for more interpretable area units
# (the frames are first declared as EPSG:4326, then reprojected)
ca_point_gdf.set_crs(epsg=4326, inplace=True)
ca_point_gdf.to_crs(epsg=27700, inplace=True)

ca_poly_gdf.set_crs(epsg=4326, inplace=True)
ca_poly_gdf.to_crs(epsg=27700, inplace=True)

# calculate area (in the units of the reprojected CRS)
ca_poly_gdf["area"] = ca_poly_gdf["geometry"].area

# sanity check: both versions should have the same number of rows
print(len(ca_poly_gdf))
print(len(ca_point_gdf))

8310
8310


In [5]:
# LPA boundaries from PDP site, with `name` / `reference` renamed to `lpa_name` / `LPACD`
lpa_gdf = get_pdp_geo_dataset("local-planning-authority").rename(
    columns={"name": "lpa_name", "reference": "LPACD"}
)

print(len(lpa_gdf))

337


## Geo

In [6]:
# lower limit of the shared overlap between two entities to be called a match
MATCH_LOWER_THRESH = 0.9
# upper limit of the shared overlap between two entities to be called an edge intersection
EDGE_UPPER_THRESH = 0.1

# full join of all geometries: the conservation-area layer is intersected with itself,
# so every column comes back twice with `_1` / `_2` suffixes
overlay_cols = ["entity", "name", "org_entity", "org_name", "org_dataset", "area", "geometry"]
entity_join_all = gpd.overlay(
    ca_poly_gdf[overlay_cols],
    ca_poly_gdf[overlay_cols],
    how="intersection",
    keep_geom_type=False,
)

# remove self-intersections
is_distinct_pair = entity_join_all["entity_1"] != entity_join_all["entity_2"]
entity_join_all = entity_join_all[is_distinct_pair].copy()

# flag the types of intersections between organisations
# is org the same
same_org = entity_join_all["org_entity_1"] == entity_join_all["org_entity_2"]
entity_join_all["ee_same_org_flag"] = np.where(same_org, "within org", "between orgs")

# organisation types of the pair, sorted so that A:B and B:A give the same label
entity_join_all["ee_org_types"] = [
    " : ".join(sorted(org_type_pair))
    for org_type_pair in zip(entity_join_all["org_dataset_1"], entity_join_all["org_dataset_2"])
]

entity_join_all["ee_org_match_class"] = entity_join_all["ee_same_org_flag"].str.cat(
    entity_join_all["ee_org_types"], sep=" - "
)

# calculate overlap %'s
entity_join_all["area_int"] = entity_join_all["geometry"].area

area_int = entity_join_all["area_int"]
area_1 = entity_join_all["area_1"]
area_2 = entity_join_all["area_2"]

# intersection as a share of the first entity, of the union of both, and of the second entity
entity_join_all["p_pct_intersect"] = area_int / area_1
entity_join_all["pct_intersection"] = area_int / (area_1 + area_2 - area_int)
entity_join_all["s_pct_intersect"] = area_int / area_2

# intersection area as % of smallest primary or secondary area
entity_join_all["pct_min_intersection"] = area_int / entity_join_all[["area_1", "area_2"]].min(axis=1)

# classify each intersection; np.select takes the first condition that holds
p_is_match = entity_join_all["p_pct_intersect"] >= MATCH_LOWER_THRESH
s_is_match = entity_join_all["s_pct_intersect"] >= MATCH_LOWER_THRESH
is_edge = entity_join_all["pct_min_intersection"] < EDGE_UPPER_THRESH

entity_join_all["ee_int_type"] = np.select(
    [p_is_match & s_is_match, is_edge, p_is_match | s_is_match],
    ["Complete match (two-way)", "Edge overlap", "Single match (one-way)"],
    default="Partial match",
)

In [7]:
# remove edge intersections and reduce columns
issue_cols = [
    "entity_1", "entity_2", "p_pct_intersect", "s_pct_intersect", "ee_org_match_class", "ee_int_type"
]
not_edge_overlap = entity_join_all["ee_int_type"] != "Edge overlap"

entity_issues = entity_join_all[issue_cols].loc[not_edge_overlap].copy()

# this table is (probably) the minimum amount of info we'd like captured in the issues table:
# - the type of organisation match
# - the type of intersection
# - the % overlap of each entity
#
# note: intersections are duplicated in this table, e.g. 44000170-44000126 will also appear
# as 44000126-44000170, and an entity can have multiple intersections, e.g. 44000170 could
# appear many times in the `entity_1` field

print(len(entity_join_all))
print(len(entity_issues))
entity_issues.head()

3706
580


Unnamed: 0,entity_1,entity_2,p_pct_intersect,s_pct_intersect,ee_org_match_class,ee_int_type
134,44000170,44000126,0.012364,0.999973,within org - government-organisation : governm...,Single match (one-way)
135,44000126,44000170,0.999973,0.012364,within org - government-organisation : governm...,Single match (one-way)
136,44000128,44000170,0.999995,0.01243,within org - government-organisation : governm...,Single match (one-way)
137,44000164,44000170,0.999878,0.002437,within org - government-organisation : governm...,Single match (one-way)
138,44000166,44000170,0.999962,0.002061,within org - government-organisation : governm...,Single match (one-way)


## Map

In [8]:
# list the distinct values available for filtering, intersection type first
print("ORG MATCH CLASS & INTERSECTION TYPE OPTIONS")
for option_col in ("ee_int_type", "ee_org_match_class"):
    print(entity_join_all[option_col].drop_duplicates().to_list())

ORG MATCH CLASS & INTERSECTION TYPE OPTIONS
['Edge overlap', 'Single match (one-way)', 'Partial match', 'Complete match (two-way)']
['within org - government-organisation : government-organisation', 'within org - local-authority : local-authority', 'between orgs - government-organisation : local-authority', 'between orgs - local-authority : local-authority', 'between orgs - government-organisation : national-park-authority']


In [9]:
# map to view and check issue classifications
# use issue_subset table to specify conditions for `ee_org_match_class` and `ee_int_type`
issue_subset = entity_join_all[
    # (entity_join_all["ee_org_match_class"] == m_class) &
    (entity_join_all["ee_int_type"] == "Complete match (two-way)")
    ]

print(f"n entities with issues = {len(issue_subset)}")

plot_issues_map(ca_poly_gdf, 
                get_issue_entities(issue_subset),
                "org_dataset", 
                "Accent")

n entities with issues = 38
