In [1]:
import pandas as pd
import numpy as np
import urllib


In [2]:
# get org data from datasette - flag LPAs (note - Purbeck and North Dorset are incorrectly missing LPA codes, so manually add in to flag)

def get_all_organisations():
    params = urllib.parse.urlencode({
        "sql": f"""
        select entity as organisation_entity, name as org_name, organisation, dataset as org_type, end_date, 
        local_planning_authority as LPACD, local_authority_district,
        case when local_planning_authority != "" or organisation in ("local-authority:NDO", "local-authority:PUR") then 1 else 0 end as lpa_flag
        from organisation
        where name != "Waveney District Council"
        """,
        "_size": "max"
        })
    url = f"https://datasette.planning.data.gov.uk/digital-land.csv?{params}"
    df = pd.read_csv(url, dtype = str)
    return df

In [3]:
org_lookup = get_all_organisations()


In [4]:
# conservation_area lookup file
data = pd.read_csv('https://raw.githubusercontent.com/digital-land/config/refs/heads/main/pipeline/conservation-area/lookup.csv')
print(len(data))

data = data.merge(
    org_lookup[["organisation", "lpa_flag"]],
    how = "left", 
    on = "organisation"
)
print(len(data))

17848
17848


In [5]:
# check which organisations in lookup aren't flagged as LPAs
data[data["lpa_flag"] == "0"].groupby(["organisation"]).size()

organisation
government-organisation:D1342     4545
government-organisation:PB1164    7683
local-authority:GLA                 13
dtype: int64

In [6]:
# we only want to assign ranges for LPAs
data_lpa = data[data["lpa_flag"] == "1"].copy()

data_lpa = data_lpa.dropna(subset=['entity', 'organisation'])

data_lpa['entity'] = pd.to_numeric(data_lpa['entity'])
data_lpa = data_lpa.sort_values(by=['prefix', 'entity']).reset_index(drop=True)

# increment_id tracks when the organisation or non-consecutive entity changes
data_lpa['increment_id'] = (data_lpa['organisation'] != data_lpa['organisation'].shift(1)) | \
                           (data_lpa['prefix'] != data_lpa['prefix'].shift(1)) | \
                           ((data['entity'].shift(1) - data['entity']) == 1)


# Cumulatively sum the 'increment_id' to get the unique range IDs
data_lpa['increment_id'] = data_lpa['increment_id'].cumsum()

# Group by organisation and the 'increment_id' to calculate min and max entities for each range
entity_ranges = data_lpa.groupby(['prefix','organisation', 'increment_id']).agg(
    min_entity=('entity', 'min'),
    max_entity=('entity', 'max')
).reset_index()

# entity_ranges = entity_ranges.drop(columns=['increment_id'])

entity_ranges


Unnamed: 0,prefix,organisation,increment_id,min_entity,max_entity
0,conservation-area,development-corporation:Q6670544,183,44003422,44003423
1,conservation-area,development-corporation:Q6670544,246,44006543,44006543
2,conservation-area,development-corporation:Q6670544,434,44009061,44009061
3,conservation-area,local-authority:BAB,233,44005968,44005997
4,conservation-area,local-authority:BAB,305,44008683,44008684
...,...,...,...,...,...
522,conservation-area-document,local-authority:PTE,520,6300158,6300213
523,conservation-area-document,local-authority:ROH,521,6300214,6300229
524,conservation-area-document,local-authority:ROS,518,6300062,6300095
525,conservation-area-document,local-authority:SAL,516,6300000,6300044


In [15]:
entity_organisation = entity_ranges.copy()
entity_organisation.drop("increment_id", axis=1, inplace=True)
entity_organisation.rename(columns={"prefix":"dataset", "min_entity": "entity-minimum", "max_entity": "entity-maximum"}, inplace=True)
entity_organisation.to_csv('entity-organisation.csv', index=False)


## Check ranges

In [21]:
# check if there are any entities in multiple ranges

df = pd.read_csv('entity-organisation.csv')  

# need to test conservation-area and conservation-area-document ranges separately
er_test = df[df["dataset"] == "conservation-area"]

# entity range for chunk
e_range = np.arange(
    er_test["entity-minimum"].min(), 
    er_test["entity-maximum"].max()
    )

print(f"checking ranges for {len(e_range)} entities")

# check how many ranges in range table each entity has
range_checks = [len(er_test[(er_test["entity-minimum"] <= e) & (er_test["entity-maximum"] >= e)]) for e in e_range]

# df for results
check_df = pd.DataFrame(
    {
        "entity" : e_range,
        "n_ranges" : range_checks
    }
)

# test if any with > 1 range
check_df[check_df["n_ranges"] > 1]

checking ranges for 12344 entities


Unnamed: 0,entity,n_ranges
6431,44006432,2
6542,44006543,2
8379,44008380,2


In [22]:

df = pd.read_csv('entity-organisation.csv')  

# Ensure numeric  entities
df['entity-minimum'] = pd.to_numeric(df['entity-minimum'], errors='coerce')
df['entity-maximum'] = pd.to_numeric(df['entity-maximum'], errors='coerce')

overlapping_ranges = []

# Compare each row with every other row
for i in range(len(df)):
    for j in range(i+1, len(df)):
        #  organisations are different to avoid comparing same 
        if df.loc[i, 'organisation'] != df.loc[j, 'organisation']:
            # Check  overlap
            overlap = not (
                df.loc[i, 'entity-maximum'] < df.loc[j, 'entity-minimum'] or 
                df.loc[i, 'entity-minimum'] > df.loc[j, 'entity-maximum']
            )
            
            if overlap:
                overlap_info = {
                    'Dataset 1': df.loc[i, 'dataset'],
                    'Organisation 1': df.loc[i, 'organisation'],
                    'Entity Min 1': df.loc[i, 'entity-minimum'],
                    'Entity Max 1': df.loc[i, 'entity-maximum'],
                    'Dataset 2': df.loc[j, 'dataset'],
                    'Organisation 2': df.loc[j, 'organisation'],
                    'Entity Min 2': df.loc[j, 'entity-minimum'],
                    'Entity Max 2': df.loc[j, 'entity-maximum']
                }
                overlapping_ranges.append(overlap_info)

overlap_df = pd.DataFrame(overlapping_ranges)
overlap_df.to_csv('entity-organisation-overlap.csv', index=False)
print("Total Overlapping Ranges:", len(overlap_df))
display(overlap_df)

# by organisation
print("\nOverlaps by Organisation:")
overlap_summary = overlap_df.groupby(['Organisation 1', 'Organisation 2']).size().reset_index(name='Overlap Count')
display(overlap_summary)




Total Overlapping Ranges: 3


Unnamed: 0,Dataset 1,Organisation 1,Entity Min 1,Entity Max 1,Dataset 2,Organisation 2,Entity Min 2,Entity Max 2
0,conservation-area,development-corporation:Q6670544,44006543,44006543,conservation-area,local-authority:TWH,44006543,44006545
1,conservation-area,local-authority:BUC,44006432,44006432,conservation-area,local-authority:CON,44006432,44006432
2,conservation-area,local-authority:HMF,44008347,44008380,conservation-area,local-authority:KEC,44008380,44008380



Overlaps by Organisation:


Unnamed: 0,Organisation 1,Organisation 2,Overlap Count
0,development-corporation:Q6670544,local-authority:TWH,1
1,local-authority:BUC,local-authority:CON,1
2,local-authority:HMF,local-authority:KEC,1
