In [30]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


In [2]:
# Read the conservation-area lookup table from the working directory
data = pd.read_csv(filepath_or_buffer='lookup.csv')

In [3]:
# Preview the first five lookup rows
data.head(n=5)

Unnamed: 0,prefix,resource,endpoint,entry-number,organisation,reference,entity,entry-date,start-date,end-date
0,conservation-area,,,,government-organisation:PB1164,5080,44000001,,,
1,conservation-area,,,,government-organisation:PB1164,5071,44000002,,,
2,conservation-area,,,,government-organisation:PB1164,5074,44000003,,,
3,conservation-area,,,,government-organisation:PB1164,5075,44000004,,,
4,conservation-area,,,,government-organisation:PB1164,5078,44000005,,,


In [14]:
# Rows lacking an entity or an organisation cannot be placed in any range.
# Sort by prefix as well as organisation so that every (prefix, organisation)
# group is contiguous and ascending by entity before runs are detected below;
# sorting on organisation alone only works while prefixes never interleave.
data = (
    data.dropna(subset=['entity', 'organisation'])
        .assign(entity=lambda frame: pd.to_numeric(frame['entity']))
        .sort_values(by=['prefix', 'organisation', 'entity'])
        .reset_index(drop=True)
)

# A new range starts whenever the organisation or prefix changes, or the entity
# jumps by more than one. A gap of 0 (the same entity appearing on several
# lookup rows) continues the current range instead of splitting it.
# The first row has a NaN gap, which `between` reports as False -> new range.
entity_gap = data['entity'].diff()
starts_new_range = (
    (data['organisation'] != data['organisation'].shift(1))
    | (data['prefix'] != data['prefix'].shift(1))
    | ~entity_gap.between(0, 1)
)

# Cumulatively summing the start flags gives every run its own id
data['increment_id'] = starts_new_range.cumsum()

# Min and max entity of each run, one row per (prefix, organisation, run)
entity_ranges = (
    data.groupby(['prefix', 'organisation', 'increment_id'])
        .agg(min_entity=('entity', 'min'), max_entity=('entity', 'max'))
        .reset_index()
        .drop(columns=['increment_id'])
)

entity_ranges


Unnamed: 0,prefix,organisation,min_entity,max_entity
0,conservation-area,development-corporation:Q6670544,44003422,44003423
1,conservation-area,development-corporation:Q6670544,44006543,44006543
2,conservation-area,development-corporation:Q6670544,44009061,44009061
3,conservation-area,government-organisation:D1342,44010311,44010413
4,conservation-area,government-organisation:D1342,44010623,44012250
...,...,...,...,...
1427,conservation-area-document,local-authority:ROH,6300214,6300229
1428,conservation-area-document,local-authority:ROS,6300062,6300095
1429,conservation-area-document,local-authority:SAL,6300000,6300044
1430,conservation-area-document,local-authority:WRL,6303118,6303261


In [5]:
# Persist the per-run ranges (without the index) so later cells can reload them
entity_ranges.to_csv(path_or_buf='entity_ranges_pandas_approach.csv', index=False)


In [20]:
# ======== RETURNS: DATASET, ORGANISATION, MIN and MAX ENTITY RANGE
def find_organization_entity_ranges(df):
    """
    Collapse per-run entity ranges into one overall range per organisation.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'prefix', 'organisation', 'min_entity'
        and 'max_entity'.

    Returns
    -------
    pandas.DataFrame
        One row per (prefix, organisation) pair with the columns 'prefix',
        'organisation', 'entity-minimum' (smallest 'min_entity'),
        'entity-maximum' (largest 'max_entity') and 'range_span'
        (maximum - minimum + 1). Rows are ordered by 'entity-minimum';
        the index is not reset after sorting.
    """
    # Named aggregation produces the hyphenated output column names directly
    overall = (
        df.groupby(['prefix', 'organisation'])
          .agg(**{
              'entity-minimum': ('min_entity', 'min'),
              'entity-maximum': ('max_entity', 'max'),
          })
          .reset_index()
          .sort_values('entity-minimum')
    )

    # Inclusive size of the range, hence the +1
    overall['range_span'] = overall['entity-maximum'] - overall['entity-minimum'] + 1
    return overall

# Reload the per-run ranges written to disk by the earlier cell
entity_ranges = pd.read_csv('entity_ranges_pandas_approach.csv')

# One overall range per (prefix, organisation)
final_org_ranges = find_organization_entity_ranges(entity_ranges)

divider = "-" * 50
print("Organization Entity Ranges:")
print(divider)
# Walk the three reported columns in parallel, one printed record per row
for org_name, first_entity, last_entity in zip(
    final_org_ranges['organisation'],
    final_org_ranges['entity-minimum'],
    final_org_ranges['entity-maximum'],
):
    print(f"Organisation: {org_name}")
    print(f"First Entity: {first_entity}")
    print(f"Last Entity:  {last_entity}")
    print(divider)

# Export the summary without the index column
final_org_ranges.to_csv('organization_final_entity_ranges.csv', index=False)

final_org_ranges


Organization Entity Ranges:
--------------------------------------------------
Organisation: government-organisation:D1342
First Entity: 4210000
Last Entity:  4210003
--------------------------------------------------
Organisation: local-authority:SAL
First Entity: 6300000
Last Entity:  6300044
--------------------------------------------------
Organisation: local-authority:NLN
First Entity: 6300045
Last Entity:  6300061
--------------------------------------------------
Organisation: local-authority:ROS
First Entity: 6300062
Last Entity:  6300095
--------------------------------------------------
Organisation: local-authority:CMD
First Entity: 6300096
Last Entity:  6300157
--------------------------------------------------
Organisation: local-authority:PTE
First Entity: 6300158
Last Entity:  6300213
--------------------------------------------------
Organisation: local-authority:ROH
First Entity: 6300214
Last Entity:  6300229
--------------------------------------------------
Organisa

PermissionError: [Errno 13] Permission denied: 'organization_final_entity_ranges.csv'

In [29]:
# ======== RETURNS: DATASET, ORGANISATION, MIN and MAX ENTITY RANGE (RANGE_SPAN and FILLED_SPACES are computed below but dropped before printing and export)

def find_organization_entity_ranges(df):
    """
    Summarise the overall entity range and its span for each organisation.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'prefix', 'organisation', 'min_entity'
        and 'max_entity'.

    Returns
    -------
    pandas.DataFrame
        One row per (prefix, organisation) pair with the columns 'prefix',
        'organisation', 'min_entity' (smallest value in the group),
        'max_entity' (largest value in the group) and 'range_span'
        (max_entity - min_entity + 1). Rows are ordered by 'min_entity';
        the index is not reset after sorting.
    """
    # Overall bounds of every (prefix, organisation) group, ordered by lower bound
    bounds = (
        df.groupby(['prefix', 'organisation'])
          .agg(min_entity=('min_entity', 'min'), max_entity=('max_entity', 'max'))
          .reset_index()
          .sort_values('min_entity')
    )

    # Count of entity numbers the range could hold (inclusive, hence the +1)
    bounds['range_span'] = bounds['max_entity'] - bounds['min_entity'] + 1
    return bounds


# Per-run ranges saved by the earlier cell, collapsed to one range per (prefix, organisation)
entity_ranges = pd.read_csv('entity_ranges_pandas_approach.csv')
final_org_ranges = find_organization_entity_ranges(entity_ranges)

# Number of lookup rows recorded for each (prefix, organisation)
lookup_df = pd.read_csv('lookup.csv')
filled_counts = (
    lookup_df.groupby(['prefix', 'organisation'])
             .size()
             .reset_index(name='filled_spaces')
)

final_org_ranges = (
    final_org_ranges
    .merge(filled_counts, on=['prefix', 'organisation'], how='left')
    # pairs with no lookup rows get 0 instead of NaN
    .assign(filled_spaces=lambda merged: merged['filled_spaces'].fillna(0).astype(int))
    .rename(columns={"prefix":"dataset", "min_entity": "entity-minimum", "max_entity": "entity-maximum"})
)

# Only these four columns are printed and exported; range_span and filled_spaces are dropped here
final_org_ranges = final_org_ranges[["dataset","entity-minimum", "entity-maximum", "organisation"]]

divider = "-" * 50
print("Organization Entity Ranges with Filled Spaces:")
print(divider)
# Walk the three reported columns in parallel, one printed record per row
for org_name, first_entity, last_entity in zip(
    final_org_ranges['organisation'],
    final_org_ranges['entity-minimum'],
    final_org_ranges['entity-maximum'],
):
    print(f"Organisation: {org_name}")
    print(f"First Entity: {first_entity}")
    print(f"Last Entity:  {last_entity}")
    print(divider)

# Export without the index column
final_org_ranges.to_csv('entity-organisation.csv', index=False)

final_org_ranges


Organization Entity Ranges with Filled Spaces:
--------------------------------------------------
Organisation: government-organisation:D1342
First Entity: 4210000
Last Entity:  4210003
--------------------------------------------------
Organisation: local-authority:SAL
First Entity: 6300000
Last Entity:  6300044
--------------------------------------------------
Organisation: local-authority:NLN
First Entity: 6300045
Last Entity:  6300061
--------------------------------------------------
Organisation: local-authority:ROS
First Entity: 6300062
Last Entity:  6300095
--------------------------------------------------
Organisation: local-authority:CMD
First Entity: 6300096
Last Entity:  6300157
--------------------------------------------------
Organisation: local-authority:PTE
First Entity: 6300158
Last Entity:  6300213
--------------------------------------------------
Organisation: local-authority:ROH
First Entity: 6300214
Last Entity:  6300229
----------------------------------------

Unnamed: 0,dataset,entity-minimum,entity-maximum,organisation
0,conservation-area-document-type,4210000,4210003,government-organisation:D1342
1,conservation-area-document,6300000,6300044,local-authority:SAL
2,conservation-area-document,6300045,6300061,local-authority:NLN
3,conservation-area-document,6300062,6300095,local-authority:ROS
4,conservation-area-document,6300096,6300157,local-authority:CMD
...,...,...,...,...
112,conservation-area,44010296,44010305,local-authority:ROH
113,conservation-area,44010309,44010310,local-authority:CAS
114,conservation-area,44010311,44012250,government-organisation:D1342
115,conservation-area,44010589,44010622,local-authority:BST


In [34]:

df = pd.read_csv('entity-organisation.csv')

# Ensure numeric entities; values that cannot be parsed become NaN
df['entity-minimum'] = pd.to_numeric(df['entity-minimum'], errors='coerce')
df['entity-maximum'] = pd.to_numeric(df['entity-maximum'], errors='coerce')

# Every comparison against NaN is False, so a row with an unparseable bound
# would be reported as overlapping every other organisation. Exclude such rows
# and say so, rather than emitting spurious overlaps.
invalid_bounds = df[['entity-minimum', 'entity-maximum']].isna().any(axis=1)
if invalid_bounds.any():
    print(f"Skipping {int(invalid_bounds.sum())} row(s) with non-numeric entity bounds")
    df = df.loc[~invalid_bounds].reset_index(drop=True)

# Fixed output schema so an empty result still has the columns used further down
overlap_columns = [
    'Dataset 1', 'Organisation 1', 'Entity Min 1', 'Entity Max 1',
    'Dataset 2', 'Organisation 2', 'Entity Min 2', 'Entity Max 2',
]

# Materialise the rows once instead of doing a .loc lookup per cell in the loop
records = df.to_dict('records')
overlapping_ranges = []

# Compare each row with every later row (each unordered pair exactly once)
for position, first in enumerate(records):
    for second in records[position + 1:]:
        # Ranges belonging to the same organisation are not reported
        if first['organisation'] == second['organisation']:
            continue

        # Two inclusive ranges intersect when each starts no later than the other ends
        overlap = (
            first['entity-maximum'] >= second['entity-minimum']
            and first['entity-minimum'] <= second['entity-maximum']
        )

        if overlap:
            overlapping_ranges.append({
                'Dataset 1': first['dataset'],
                'Organisation 1': first['organisation'],
                'Entity Min 1': first['entity-minimum'],
                'Entity Max 1': first['entity-maximum'],
                'Dataset 2': second['dataset'],
                'Organisation 2': second['organisation'],
                'Entity Min 2': second['entity-minimum'],
                'Entity Max 2': second['entity-maximum'],
            })

# Passing `columns` keeps the frame well-formed when no overlaps were found;
# without it the groupby below raises KeyError on an empty, column-less frame.
overlap_df = pd.DataFrame(overlapping_ranges, columns=overlap_columns)
overlap_df.to_csv('entity-organisation-overlap.csv', index=False)
print("Total Overlapping Ranges:", len(overlap_df))
display(overlap_df)

# by organisation
print("\nOverlaps by Organisation:")
overlap_summary = overlap_df.groupby(['Organisation 1', 'Organisation 2']).size().reset_index(name='Overlap Count')
display(overlap_summary)




Total Overlapping Ranges: 3123


Unnamed: 0,Dataset 1,Organisation 1,Entity Min 1,Entity Max 1,Dataset 2,Organisation 2,Entity Min 2,Entity Max 2
0,conservation-area-document,government-organisation:D1342,6300232,6303117,conservation-area-document,local-authority:BIR,6300438,6300479
1,conservation-area-document,government-organisation:D1342,6300232,6303117,conservation-area-document,local-authority:BST,6300480,6300513
2,conservation-area,local-authority:SAL,44000001,44010124,conservation-area,government-organisation:PB1164,44000001,44009885
3,conservation-area,local-authority:SAL,44000001,44010124,conservation-area,local-authority:BRT,44000071,44007977
4,conservation-area,local-authority:SAL,44000001,44010124,conservation-area,national-park-authority:Q72617988,44000191,44007439
...,...,...,...,...,...,...,...,...
3118,conservation-area,local-authority:GAT,44009857,44010270,conservation-area,local-authority:ROS,44010125,44010134
3119,conservation-area,local-authority:GAT,44009857,44010270,conservation-area,local-authority:NSM,44010141,44010179
3120,conservation-area,local-authority:GAT,44009857,44010270,conservation-area,local-authority:ECA,44010180,44010207
3121,conservation-area,local-authority:GAT,44009857,44010270,conservation-area,local-authority:SAW,44010208,44010216



Overlaps by Organisation:


Unnamed: 0,Organisation 1,Organisation 2,Overlap Count
0,development-corporation:Q6670544,local-authority:BAB,1
1,development-corporation:Q6670544,local-authority:BEN,1
2,development-corporation:Q6670544,local-authority:BRD,1
3,development-corporation:Q6670544,local-authority:BUC,1
4,development-corporation:Q6670544,local-authority:DUD,1
...,...,...,...
3117,national-park-authority:Q72617988,local-authority:WND,1
3118,national-park-authority:Q72617988,local-authority:WOX,1
3119,national-park-authority:Q72617988,local-authority:WRL,1
3120,national-park-authority:Q72617988,local-authority:WSM,1
