In [3]:
import urllib.request
import os
import sqlite3
import pandas as pd
import geopandas as gpd
import time
import numpy as np
from datetime import datetime
import shapely.wkt


In [4]:
FILES_URL = 'https://datasette.planning.data.gov.uk/'

def download_dataset(dataset, output_dir_path, overwrite=False):
    """Download ``<dataset>.db`` from ``FILES_URL`` into ``output_dir_path``.

    Args:
        dataset: Dataset name without the ``.db`` extension
            (e.g. ``"conservation-area"``).
        output_dir_path: Directory to save the file into; created if missing.
        overwrite: When falsy and the target file already exists, nothing is
            downloaded. When truthy the file is always fetched again.

    Returns:
        None. The file is written to ``<output_dir_path>/<dataset>.db``.

    Raises:
        Whatever ``urllib.request.urlretrieve`` raises on a failed request;
        in that case no partial file is left behind and any existing copy of
        the target is kept as it was.
    """
    dataset_file_name = f'{dataset}.db'

    os.makedirs(output_dir_path, exist_ok=True)
    output_file_path = os.path.join(output_dir_path, dataset_file_name)

    if not overwrite and os.path.exists(output_file_path):
        return

    # URLs always use "/" as separator; os.path.join would insert "\" on Windows.
    final_url = f"{FILES_URL.rstrip('/')}/{dataset_file_name}"
    print(f'downloading data from {final_url}')
    print(f'to: {output_file_path}')

    # Download next to the target and move into place only on success, so an
    # interrupted download never looks like a finished file on the next run.
    partial_path = f'{output_file_path}.part'
    try:
        urllib.request.urlretrieve(final_url, partial_path)
        os.replace(partial_path, output_file_path)
    finally:
        if os.path.exists(partial_path):
            os.remove(partial_path)
    print('download complete')

In [5]:
def query_sqlite(db_path, query_string, params=None):
    """Run a row-returning SQL statement against a SQLite file and return a DataFrame.

    Args:
        db_path: Path to the SQLite database file.
        query_string: SQL to execute; it must produce a result set, because the
            column names are read from the cursor description.
        params: Optional sequence or mapping of values bound to ``?`` / ``:name``
            placeholders in ``query_string``. Defaults to no parameters.

    Returns:
        pandas.DataFrame with one column per result column and one row per
        result row (an empty frame with the right columns if no rows match).
    """
    con = sqlite3.connect(db_path)
    try:
        # The connection's context manager only commits/rolls back the
        # transaction; it does not close the connection, hence the finally.
        with con:
            cursor = con.execute(query_string, params or ())
            cols = [column[0] for column in cursor.description]
            results_df = pd.DataFrame.from_records(data=cursor.fetchall(), columns=cols)
    finally:
        con.close()

    return results_df

In [6]:
def get_all_organisations():
    """Fetch the ``organisation`` table from the digital-land Datasette as a DataFrame.

    The SQL is sent as a query-string parameter to the CSV endpoint and the
    response is parsed by pandas. The selected columns are ``org_entity``,
    ``org_name``, ``organisation`` and ``dataset``.
    """
    sql = """
        select entity as org_entity, name as org_name, organisation, dataset
        from organisation
        """
    query_string = urllib.parse.urlencode({"sql": sql, "_size": "max"})
    return pd.read_csv(
        f"https://datasette.planning.data.gov.uk/digital-land.csv?{query_string}"
    )

In [7]:
# Directory (relative to the notebook's working directory) where the downloaded
# SQLite databases are stored.
data_dir = "../../data/db_downloads/"
os.makedirs(data_dir, exist_ok=True)  # no error if the directory already exists

## Get data

In [83]:
# digital-land.db is queried later in this notebook (via dl_db_path), so it must
# exist on a fresh run. overwrite=False fetches it only when it is not already
# in data_dir, so re-running this cell does not download it again.
download_dataset("digital-land", data_dir, overwrite=False)

# The conservation-area database is always refreshed.
download_dataset("conservation-area", data_dir, overwrite=True)

# Paths to the two local SQLite files used by the queries below.
dl_db_path = os.path.join(data_dir, "digital-land.db")
ca_db_path = os.path.join(data_dir, "conservation-area.db")

downloading data from https://datasette.planning.data.gov.uk/conservation-area.db
to: ../../data/db_downloads/conservation-area.db
download complete


In [9]:
# Organisation reference table (org_entity, org_name, organisation, dataset),
# used below to attach organisation names and types.
org_df = get_all_organisations()

## Analysis

In [84]:
# for each entity get all geometry facts and the resources they've come from
#
# A fact can be linked to several resources through fact_resource. The window
# function ranks those resources per fact by entry_date (newest first) and the
# outer select keeps only rank 1, i.e. one row per geometry fact.

q = """
    with facts_latest_resource as (

        select f.entity, f.fact, fr.resource, f.field, fr.entry_date, f.value as geometry, row_number() over (partition by f.fact order by fr.entry_date desc) as res_rank
        from fact f
        inner join fact_resource fr on f.fact = fr.fact
        where f.field = 'geometry'
        )
  
    select * 
    from facts_latest_resource
    where res_rank = 1
"""

# geometry is still text at this point (the raw fact value)
ca_geom_facts = query_sqlite(ca_db_path, q)

print(len(ca_geom_facts))
ca_geom_facts.head()


11233


Unnamed: 0,entity,fact,resource,field,entry_date,geometry,res_rank
0,44010606,00005bab21d5b17176f33e8792d8d5bcaf731472087a70...,a1435d709a70c881b3e2ae5dc4b4767d681c04c43e2b42...,geometry,2024-06-14,"MULTIPOLYGON (((-0.141906 51.545525,-0.141950 ...",1
1,44009989,000358afb6c5b2efde45ede6cbdbadca39e64b7b75f979...,c7f486b33d60ace51e993c99540a055851ebe913ae499c...,geometry,2023-07-22,"MULTIPOLYGON (((1.178727 51.357590,1.178593 51...",1
2,44003286,0004a551d59e521aee3c27af1bd67375e00acabb461573...,c7f486b33d60ace51e993c99540a055851ebe913ae499c...,geometry,2023-07-22,"MULTIPOLYGON (((-3.472297 50.639420,-3.472308 ...",1
3,44001057,00063bf7e87227f2731ef36a9fe0fd04d76ad9e822e34d...,c7f486b33d60ace51e993c99540a055851ebe913ae499c...,geometry,2023-07-22,"MULTIPOLYGON (((-0.162925 51.432615,-0.162994 ...",1
4,44005468,0007afcfc1d037d7f7a6198ff881ab6b598e2b58d4038a...,c7f486b33d60ace51e993c99540a055851ebe913ae499c...,geometry,2023-07-22,"MULTIPOLYGON (((-2.615800 51.490436,-2.615912 ...",1


In [85]:
# get resource to org lookup and join on other org details

q = """
    select * from resource_organisation
"""

res_org_lookup = query_sqlite(dl_db_path, q)

# Strip the "-eng" marker so the codes line up with org_df["organisation"]
# (presumably a legacy prefix form such as "local-authority-eng:XXX" - confirm).
# The vectorised .str accessor leaves missing values as NaN, whereas calling
# x.replace on each element raises AttributeError on a missing organisation.
res_org_lookup["organisation"] = res_org_lookup["organisation"].str.replace(
    "-eng", "", regex=False
)

# Left join keeps every resource row even when the organisation is unknown;
# "dataset" in org_df holds the organisation type, so rename it accordingly.
res_org_lookup = res_org_lookup.merge(
    org_df[["organisation", "org_name", "dataset"]],
    how="left",
    on="organisation",
).rename(columns={"dataset": "org_type"})


# total rows, distinct rows, distinct resources: fewer distinct resources than
# rows means some resources are attributed to more than one organisation
print(len(res_org_lookup))
print(len(res_org_lookup.drop_duplicates()))
print(len(res_org_lookup["resource"].drop_duplicates()))
res_org_lookup.head()


8305
8305
8151


Unnamed: 0,resource,organisation,org_name,org_type
0,0b4f1ab864ad163800ef661121c26cac26bccd5518fcb4...,government-organisation:PB202,Natural England,government-organisation
1,1d5336e3a650cb037328bf9c2911309328cdc743935842...,government-organisation:PB202,Natural England,government-organisation
2,80709f042768e421a82f4aaa523f34b837e77af71b4c8a...,government-organisation:PB202,Natural England,government-organisation
3,87a4898736b177e886dc0722c3403b12989a563c272a6c...,government-organisation:PB202,Natural England,government-organisation
4,89632d544d34d2745cbfc2ec034fbdae2a74c235ea07ac...,government-organisation:PB202,Natural England,government-organisation


In [86]:
# find resources which are duplicated across orgs
# res_org_count = res_org_lookup.groupby(["resource"]).size().reset_index(name = "count")
# res_dupes = res_org_count[res_org_count["count"] > 1]

# res_org_lookup[res_org_lookup["resource"].isin(res_dupes["resource"])].sort_values("resource")

In [87]:
# row count before the join, to compare with the count printed after it
print(len(ca_geom_facts))

# join resource org lookup to entity_fact_resource table, parse the WKT text into
# shapely geometries and wrap the result in a GeoDataFrame (EPSG:4326) for mapping later
ca_geom_facts_org = gpd.GeoDataFrame(
    ca_geom_facts.merge(res_org_lookup, on="resource", how="left").assign(
        geometry=lambda facts: facts["geometry"].apply(shapely.wkt.loads)
    ),
    geometry="geometry",
).set_crs(4326)

print(len(ca_geom_facts_org))
ca_geom_facts_org.head()

11233
11233


Unnamed: 0,entity,fact,resource,field,entry_date,geometry,res_rank,organisation,org_name,org_type
0,44010606,00005bab21d5b17176f33e8792d8d5bcaf731472087a70...,a1435d709a70c881b3e2ae5dc4b4767d681c04c43e2b42...,geometry,2024-06-14,"MULTIPOLYGON (((-0.14191 51.54552, -0.14195 51...",1,local-authority:CMD,London Borough of Camden,local-authority
1,44009989,000358afb6c5b2efde45ede6cbdbadca39e64b7b75f979...,c7f486b33d60ace51e993c99540a055851ebe913ae499c...,geometry,2023-07-22,"MULTIPOLYGON (((1.17873 51.35759, 1.17859 51.3...",1,government-organisation:PB1164,Historic England,government-organisation
2,44003286,0004a551d59e521aee3c27af1bd67375e00acabb461573...,c7f486b33d60ace51e993c99540a055851ebe913ae499c...,geometry,2023-07-22,"MULTIPOLYGON (((-3.47230 50.63942, -3.47231 50...",1,government-organisation:PB1164,Historic England,government-organisation
3,44001057,00063bf7e87227f2731ef36a9fe0fd04d76ad9e822e34d...,c7f486b33d60ace51e993c99540a055851ebe913ae499c...,geometry,2023-07-22,"MULTIPOLYGON (((-0.16292 51.43261, -0.16299 51...",1,government-organisation:PB1164,Historic England,government-organisation
4,44005468,0007afcfc1d037d7f7a6198ff881ab6b598e2b58d4038a...,c7f486b33d60ace51e993c99540a055851ebe913ae499c...,geometry,2023-07-22,"MULTIPOLYGON (((-2.61580 51.49044, -2.61591 51...",1,government-organisation:PB1164,Historic England,government-organisation


In [88]:
# per entity: number of distinct organisation types and distinct organisations
# that supplied a geometry fact
ent_fact_org_count = (
    ca_geom_facts_org
    .groupby(["entity"], as_index=False)
    .agg(
        n_org_types=("org_type", "nunique"),
        n_orgs=("organisation", "nunique"),
    )
)

# an entity is suspect when it has more organisations than organisation types
# (e.g. two different LPA resources for just one entity)
bad_ents = ent_fact_org_count.loc[
    lambda counts: counts["n_orgs"] > counts["n_org_types"]
]
print(len(bad_ents))
bad_ents

176


Unnamed: 0,entity,n_org_types,n_orgs
2219,44002494,2,7
2220,44002495,2,12
2221,44002496,2,12
2222,44002497,2,10
2223,44002498,2,9
...,...,...,...
8432,44010620,1,5
8433,44010621,1,4
8434,44010622,1,5
8437,44012283,1,11


In [89]:
# Spot check: list every geometry fact (and its source organisation) for one entity.
ca_geom_facts_org[ca_geom_facts_org["entity"] == 44005888]

Unnamed: 0,entity,fact,resource,field,entry_date,geometry,res_rank,organisation,org_name,org_type
908,44005888,145ebb5003451d1ddfd914f1bf26029c6367d3e8be0fb7...,e39e0a1256d297795a32fa37a508718e7dd1f14d4bdfb8...,geometry,2024-10-08,"MULTIPOLYGON (((1.06972 51.28429, 1.06976 51.2...",1,local-authority:CAT,Canterbury City Council,local-authority
2941,44005888,43005be3852dfba97940537ecdf35957305c1ac71c8df8...,08fbb782cc005066ce927d643cfd0300769648c98e1520...,geometry,2020-09-08,"MULTIPOLYGON (((-2.07404 53.27970, -2.07398 53...",1,national-park-authority:Q72617988,Peak District National Park Authority,national-park-authority
8250,44005888,bc4303fffc4f1fba940d7e7a99cc466ed240a38bb42f2e...,c7f486b33d60ace51e993c99540a055851ebe913ae499c...,geometry,2023-07-22,"MULTIPOLYGON (((-2.07091 53.28381, -2.07116 53...",1,government-organisation:PB1164,Historic England,government-organisation


In [None]:
# geometry facts belonging to the flagged entities only, ordered by entity
bad_ents_gdf = (
    ca_geom_facts_org
    .loc[ca_geom_facts_org["entity"].isin(bad_ents["entity"])]
    .sort_values(by="entity")
)
bad_ents_gdf.head()

In [20]:
# for each entity, flag the geometry facts which don't intersect with any of the others (i.e. they're likely here from a bad merge)
def flag_non_intersecting(group):
    """Return a copy of ``group`` with a boolean ``non_intersecting`` column.

    Args:
        group: The rows for a single entity, exposing a ``geometry`` attribute
            (as a GeoDataFrame does).

    Returns:
        A new frame with the same rows plus ``non_intersecting``, which is True
        for a row whose geometry intersects at most one geometry in the group.
        The frame passed in is not modified: pandas does not support functions
        that mutate the object handed to them by ``groupby.apply``.
    """
    geoms = group.geometry
    # The intersect count includes the geometry itself (assuming it is valid and
    # non-empty), so a count above 1 means it overlaps at least one other row.
    intersects_another = geoms.apply(lambda geom: geoms.intersects(geom).sum() > 1)
    return group.assign(non_intersecting=~intersects_another)

# Apply the function to each group
# (one group per entity; reset_index(drop=True) discards the index produced by
# the groupby/apply and renumbers the rows from 0)
bad_ents_gdf_flagged = bad_ents_gdf.groupby('entity').apply(flag_non_intersecting).reset_index(drop=True)

# save
# (written to the current working directory, without the row index)
bad_ents_gdf_flagged[["entity", "org_name", "non_intersecting", "geometry"]].to_csv("conservation-area_entity-bad-merges.csv", index=False)

In [81]:
# check facts for specific entity
# NOTE(review): the recorded output for this cell has no rows, although the same
# entity shows three facts in ca_geom_facts_org earlier - presumably the cells
# were executed out of order; confirm with a clean Restart & Run All.
bad_ents_gdf_flagged[bad_ents_gdf_flagged["entity"] == 44005888]

Unnamed: 0,entity,fact,resource,field,entry_date,geometry,res_rank,organisation,org_name,org_type,non_intersecting


In [78]:
# map entity fact
# .iloc[[2]] (list indexer) selects the third row of this entity's facts as a
# one-row frame rather than a Series
bad_ents_gdf_flagged[bad_ents_gdf_flagged["entity"] == 44009888].iloc[[2]].explore(  # colour by organisation name
    column = "org_name"
)

In [80]:
# map all entity facts
bad_ents_gdf_flagged[bad_ents_gdf_flagged["entity"] == 44009888].explore(  # colour by organisation name
    column = "org_name"
)

In [72]:
# Load the lookup table from lookup.csv in the current working directory.
lookup = pd.read_csv("lookup.csv")

# row count before the join
print(len(lookup))

# Attach organisation name and type; the left join keeps every lookup row.
# Note the organisation type column keeps the name "dataset" here.
lookup = lookup.merge(
    org_df[["organisation", "org_name", "dataset"]],
    how = "left",
    on = "organisation"
)

# row count after the join, for comparison with the count printed above
print(len(lookup))
lookup.head()

17848
17848


Unnamed: 0,prefix,resource,endpoint,entry-number,organisation,reference,entity,entry-date,start-date,end-date,org_name,dataset
0,conservation-area,,,,government-organisation:PB1164,5080,44000001,,,,Historic England,government-organisation
1,conservation-area,,,,government-organisation:PB1164,5071,44000002,,,,Historic England,government-organisation
2,conservation-area,,,,government-organisation:PB1164,5074,44000003,,,,Historic England,government-organisation
3,conservation-area,,,,government-organisation:PB1164,5075,44000004,,,,Historic England,government-organisation
4,conservation-area,,,,government-organisation:PB1164,5078,44000005,,,,Historic England,government-organisation


In [73]:
# per entity in the lookup: number of distinct organisation types and organisations

# rows attributed to local-authority:GLA are left out of the counts
lookup_no_gla = lookup.loc[lookup["organisation"].ne("local-authority:GLA")]

lookup_ent_count = (
    lookup_no_gla
    .groupby(["entity"], as_index=False)
    .agg(
        n_org_types=("dataset", "nunique"),
        n_orgs=("organisation", "nunique"),
    )
)

# an entity is suspect when it has more organisations than organisation types
# (e.g. two different LPA resources for just one entity)
bad_lookup_ents = lookup_ent_count.loc[
    lambda counts: counts["n_orgs"] > counts["n_org_types"]
]

bad_lookup_ents

Unnamed: 0,entity,n_org_types,n_orgs
9636,44006432,2,3
11406,44008380,1,2


In [None]:
# count number of org_types and org per entity
# NOTE(review): this cell's code is identical to the previous cell, but its
# recorded output lists an extra entity - presumably it was run against a
# different version of lookup.csv; confirm and remove one of the two cells.

# rows attributed to local-authority:GLA are left out of the counts
lookup_no_gla = lookup[lookup["organisation"] != "local-authority:GLA"]

lookup_ent_count = lookup_no_gla.groupby(
        ["entity"], as_index=False
    ).agg(
        n_org_types = ("dataset", "nunique"),
        n_orgs = ("organisation", "nunique")
    )

# issues are when we have a higher number of organisations than org types (e.g. two different LPA resources for just one entity)
bad_lookup_ents = lookup_ent_count[lookup_ent_count["n_orgs"] > lookup_ent_count["n_org_types"]]

bad_lookup_ents

Unnamed: 0,entity,n_org_types,n_orgs
8281,44005091,2,3
9621,44006432,2,3
11390,44008380,1,2


In [57]:
# Display the organisation reference table loaded by get_all_organisations().
org_df

Unnamed: 0,org_entity,org_name,organisation,dataset
0,1,Old Oak and Park Royal Development Corporation,development-corporation:Q20648596,development-corporation
1,2,Birmingham Heartlands Development Corporation,development-corporation:Q4916714,development-corporation
2,3,London Legacy Development Corporation,development-corporation:Q6670544,development-corporation
3,4,London Thames Gateway Development Corporation,development-corporation:Q6670837,development-corporation
4,5,South Tees Development Corporation,development-corporation:Q72456968,development-corporation
...,...,...,...,...
450,416,East London Waste Authority,waste-authority:Q20711950,waste-authority
451,417,North London Waste Authority,waste-authority:Q20713477,waste-authority
452,418,West London Waste Authority,waste-authority:Q21921612,waste-authority
453,419,Western Riverside Waste Authority,waste-authority:Q21935501,waste-authority
