# Assign entity-organisation ranges
**Author**:  Greg Slater <br>
**Date**:  11th February 2025 <br>
**Dataset Scope**: ODP datasets <br>
**Report Type**: Tool / analysis aid <br>
**Purpose**: Scripts which use the `lookup.csv` file to generate the ranges required in the `entity-organisation.csv` config file. The "Assign ranges" section generates the required file, while the "Check ranges" section runs some standard QA checks on the outputs - these should be used to sense-check the output before it's used. This will likely require some iteration between checking and fixing the lookup file, then re-running the file generation part.

In [None]:
import pandas as pd
import numpy as np
import os
import urllib
from datetime import datetime

# Today's date formatted as YYYY-MM-DD
td = datetime.today().strftime("%Y-%m-%d")

# Allow up to 100 rows to be shown when a dataframe is displayed
pd.set_option("display.max_rows", 100)

# Output locations (relative to the working directory); both are created if missing
data_dir = "../../data/"
out_dir = "../../data/org-entity-ranges/"
os.makedirs(data_dir, exist_ok=True)
os.makedirs(out_dir, exist_ok=True)

## Functions

In [16]:
# Get organisation data from datasette and flag LPAs.
# NOTE(review): an earlier version of this comment said Purbeck and North Dorset are
# incorrectly missing LPA codes and should be manually flagged. Nothing in this function
# does that, so confirm whether a manual fix is still needed.

def get_all_organisations():
    """Download the current organisations from the digital-land datasette.

    Runs a SQL query against the `organisation` table, keeping only rows with an
    empty `end_date` and excluding "Waveney District Council".

    Returns:
        A DataFrame in which every column is read as a string, with columns
        `organisation_entity`, `org_name`, `organisation`, `org_type`, `end_date`,
        `LPACD`, `local_authority_district` and `lpa_flag`. `lpa_flag` is "1" when
        `local_planning_authority` is non-empty and "0" otherwise.
    """
    # A bare `import urllib` does not guarantee the `urllib.parse` submodule is loaded,
    # so import the function explicitly rather than relying on another package doing it.
    from urllib.parse import urlencode

    # String literals use single quotes (standard SQL); double-quoted literals only
    # work in SQLite through a legacy fallback that can be disabled.
    sql = """
        select entity as organisation_entity, name as org_name, organisation, dataset as org_type, end_date,
        local_planning_authority as LPACD, local_authority_district,
        case when local_planning_authority != '' then 1 else 0 end as lpa_flag
        from organisation
        where name != 'Waveney District Council'
        and end_date = ''
        """
    params = urlencode({"sql": sql, "_size": "max"})
    url = f"https://datasette.planning.data.gov.uk/digital-land.csv?{params}"

    # dtype=str keeps codes and flags as text, so lpa_flag is "1"/"0" rather than 1/0
    return pd.read_csv(url, dtype=str)

## Data Import

In [15]:
# Set the dataset to assign organisation-entity ranges for.
# This name is used to build the lookup.csv / old-entity.csv URLs, to filter ranges
# by prefix in the checks, and as the prefix of every output file name.
dataset = "brownfield-land"

In [None]:
# Organisation reference data from datasette
org_lookup = get_all_organisations()
print(len(org_lookup))

# Location of this dataset's pipeline config files in the digital-land/config repo
config_url = f"https://raw.githubusercontent.com/digital-land/config/refs/heads/main/pipeline/{dataset}"

# lookup file
lookup_df = pd.read_csv(f"{config_url}/lookup.csv")
print(len(lookup_df))

# old-entity file
old_ent_df = pd.read_csv(f"{config_url}/old-entity.csv")
print(len(old_ent_df))

# Attach organisation name and LPA flag to each lookup row. The row count is printed
# again afterwards so it can be compared with the count before the join.
org_columns = org_lookup[["organisation", "org_name", "lpa_flag"]]
lookup_df = pd.merge(lookup_df, org_columns, how="left", on="organisation")
print(len(lookup_df))

## Assign ranges

In [None]:
# Ranges are only assigned for LPAs (as they're the authoritative source), and only for
# entities which haven't been retired or redirected (we don't want to give ranges to orgs
# which were wrongly assigned from bad OrganisationURI values).
is_lpa = lookup_df["lpa_flag"] == "1"
is_old_entity = lookup_df["entity"].isin(old_ent_df["old-entity"])
lookup_lpa = lookup_df.loc[is_lpa & ~is_old_entity].copy()

print(f"len of lookup_lpa: {len(lookup_lpa)}")

# Drop rows without an entity or organisation, make entity numeric, then order by
# prefix and entity so that consecutive entities sit on adjacent rows.
lookup_lpa = (
    lookup_lpa
    .dropna(subset=["entity", "organisation"])
    .assign(entity=lambda frame: pd.to_numeric(frame["entity"]))
    .sort_values(by=["prefix", "entity"])
    .reset_index(drop=True)
)

# 'increment' is True on every row that starts a new range: the organisation changed,
# the prefix changed, or the entity is not exactly one more than the previous row's.
organisation_changed = lookup_lpa["organisation"].ne(lookup_lpa["organisation"].shift(1))
prefix_changed = lookup_lpa["prefix"].ne(lookup_lpa["prefix"].shift(1))
not_consecutive = lookup_lpa["entity"].diff().ne(1)
lookup_lpa["increment"] = organisation_changed | prefix_changed | not_consecutive

# The running total of 'increment' gives each range its own 'increment_id'
lookup_lpa["increment_id"] = lookup_lpa["increment"].cumsum()

incremented_path = os.path.join(out_dir, f"{dataset}_lookup_lpa_incremented.csv")
lookup_lpa.to_csv(incremented_path, index=False)

# One row per (prefix, organisation, increment_id) with the lowest and highest entity
range_groups = lookup_lpa.groupby(["prefix", "organisation", "increment_id"])
entity_ranges = range_groups.agg(
    entity_min=pd.NamedAgg(column="entity", aggfunc="min"),
    entity_max=pd.NamedAgg(column="entity", aggfunc="max"),
).reset_index()

# Range size, measured as max minus min (so a single-entity range has size 0)
entity_ranges["entity_range"] = entity_ranges["entity_max"].sub(entity_ranges["entity_min"])

print(f"count of ranges: {len(entity_ranges)}")
entity_ranges.head(10)


In [None]:
# Preview the first rows of the filtered lookup, including the 'increment' and
# 'increment_id' columns added above
lookup_lpa.head()

In [None]:
# Save entity-organisation.csv: drop the helper columns and rename the remaining
# ones to the names used in the output file
entity_organisation = (
    entity_ranges
    .drop(columns=["increment_id", "entity_range"])
    .rename(
        columns={
            "prefix": "dataset",
            "entity_min": "entity-minimum",
            "entity_max": "entity-maximum",
        }
    )
)

entity_organisation_path = os.path.join(out_dir, f"{dataset}_entity-organisation.csv")
entity_organisation.to_csv(entity_organisation_path, index=False)


## Check ranges
### Entities in multiple ranges

In [None]:
# Check whether any entity falls inside more than one range.

# df = pd.read_csv('entity-organisation.csv')

# Need to test conservation-area and conservation-area-document ranges separately,
# so only keep ranges whose prefix matches the dataset being processed.
er_test = entity_ranges[entity_ranges["prefix"] == dataset].copy()

# Every entity value from the lowest range start to the highest range end, inclusive.
if er_test.empty:
    # No ranges for this prefix, so there is no min/max to build a range from
    e_range = np.array([], dtype="int64")
else:
    # np.arange excludes its stop value, so add 1 to make sure the highest entity is checked
    e_range = np.arange(
        er_test["entity_min"].min(),
        er_test["entity_max"].max() + 1
    )

print(f"checking ranges for {len(e_range)} entities")

# Count how many ranges contain each entity. Comparing against plain arrays avoids
# filtering the dataframe once per entity.
range_starts = er_test["entity_min"].to_numpy()
range_ends = er_test["entity_max"].to_numpy()
range_checks = [
    int(((range_starts <= e) & (range_ends >= e)).sum())
    for e in e_range
]

# df for results
check_df = pd.DataFrame(
    {
        "entity": e_range,
        "n_ranges": range_checks
    }
)

# Entities covered by more than one range
entity_dupes = check_df[check_df["n_ranges"] > 1]
print(f"Found {len(entity_dupes)} entities which appear more than once in lookup.csv")


In [None]:
# Display the entities found in more than one range (empty if the check passed)
entity_dupes

In [67]:
# Pull the lookup rows for every entity found in more than one range, ordered by
# entity, and write them out for manual review
in_multiple_ranges = lookup_lpa["entity"].isin(entity_dupes["entity"])
lookup_entity_dupes = lookup_lpa.loc[in_multiple_ranges].sort_values("entity")

entity_dupes_path = os.path.join(out_dir, f"{dataset}_lookup_entity_dupes.csv")
lookup_entity_dupes.to_csv(entity_dupes_path, index=False)

### Short entity-org ranges (a sign of funny entities which may need manual fixing)

In [112]:
# Ranges whose size (max minus min) is 5 or less
is_short = entity_ranges["entity_range"].le(5)
short_ranges = entity_ranges.loc[is_short]

# Join the range size onto the full lookup. The join is on the range's first entity,
# so only the lookup row that starts a short range gets an 'entity_range' value.
lookup_range_flagged = pd.merge(
    lookup_df,
    short_ranges[["entity_min", "entity_range"]],
    how="left",
    left_on="entity",
    right_on="entity_min",
)

# Columns written to the flagged output file
flagged_columns = [
    "prefix", "resource", "endpoint", "entry-number", "organisation",
    "reference", "entity", "entry-date", "start-date", "end-date", "entity_range",
]
flagged_path = os.path.join(out_dir, f"{dataset}_lookup_short-ranges-flagged.csv")
lookup_range_flagged[flagged_columns].to_csv(flagged_path, index=False)