In [3]:
import os
import sqlite3
import time
import urllib.parse
import urllib.request
from datetime import datetime

import geopandas as gpd
import numpy as np
import pandas as pd
import shapely.wkt


In [4]:
FILES_URL = 'https://datasette.planning.data.gov.uk/'

def download_dataset(dataset, output_dir_path, overwrite=False):
    """Download ``<dataset>.db`` from ``FILES_URL`` into ``output_dir_path``.

    Parameters
    ----------
    dataset : str
        Dataset name; the file fetched is ``f'{dataset}.db'``.
    output_dir_path : str
        Directory to save the file in. Created (including parents) if missing.
    overwrite : bool, default False
        When falsy and the target file already exists, nothing is downloaded.

    Returns
    -------
    None

    Raises
    ------
    Whatever ``urllib.request.urlretrieve`` raises on a failed download; the
    partially written file is removed before the error propagates.
    """
    dataset_file_name = f'{dataset}.db'

    # exist_ok avoids the separate exists() check and the race that comes with it
    os.makedirs(output_dir_path, exist_ok=True)

    output_file_path = os.path.join(output_dir_path, dataset_file_name)

    if not overwrite and os.path.exists(output_file_path):
        return

    # URLs always use "/" - os.path.join would insert "\" on Windows
    final_url = f"{FILES_URL.rstrip('/')}/{dataset_file_name}"
    print(f'downloading data from {final_url}')
    print(f'to: {output_file_path}')

    # Download to a temporary name and move it into place only on success, so an
    # interrupted download is never mistaken for a complete cached file.
    partial_path = f'{output_file_path}.part'
    try:
        urllib.request.urlretrieve(final_url, partial_path)
        os.replace(partial_path, output_file_path)
    except BaseException:
        # BaseException so a KeyboardInterrupt (kernel interrupt) also cleans up
        if os.path.exists(partial_path):
            os.remove(partial_path)
        raise
    print('download complete')

In [5]:
def query_sqlite(db_path, query_string, params=None):
    """Run a SQL statement against a SQLite file and return the rows as a DataFrame.

    Parameters
    ----------
    db_path : str
        Path of the SQLite database file.
    query_string : str
        SQL to execute.
    params : sequence or dict, optional
        Values bound to ``?`` / ``:name`` placeholders in ``query_string``.

    Returns
    -------
    pandas.DataFrame
        One column per result column, named from the cursor description.
        Statements that produce no result columns give an empty frame.
    """
    con = sqlite3.connect(db_path)
    try:
        # `with con` only commits/rolls back the transaction; it does NOT close
        # the connection, hence the explicit close() in `finally`.
        with con:
            cursor = con.execute(query_string, params or ())
            # description is None for statements that return no rows at all
            cols = [column[0] for column in cursor.description or ()]
            results_df = pd.DataFrame.from_records(data=cursor.fetchall(), columns=cols)
    finally:
        con.close()

    return results_df

In [6]:
def get_all_organisations():
    """Fetch the organisation table from the digital-land Datasette as a DataFrame.

    Returns
    -------
    pandas.DataFrame
        Columns ``org_entity``, ``org_name``, ``organisation`` and ``dataset``
        as selected by the SQL below. Requires network access.
    """
    # Plain (non-f) string: the SQL has no placeholders to interpolate.
    # NOTE(review): confirm `_size=max` is honoured for custom-SQL CSV exports,
    # otherwise the result may be capped by the server's row limit.
    params = urllib.parse.urlencode({
        "sql": """
        select entity as org_entity, name as org_name, organisation, dataset
        from organisation
        """,
        "_size": "max"
        })
    url = f"https://datasette.planning.data.gov.uk/digital-land.csv?{params}"
    return pd.read_csv(url)

In [7]:
# Directory (relative to the notebook's working directory) where the SQLite
# database files are stored; created up front, no error if it already exists.
data_dir = "../../data/db_downloads/"
os.makedirs(data_dir, exist_ok=True)

## Get data

In [8]:
# Fetch the SQLite files if they are not already in data_dir. With the default
# overwrite=False an existing local copy is reused, so this cell is cheap to
# re-run and the notebook works from a fresh checkout.
# Pass overwrite=True to force a refresh of the data.
download_dataset("digital-land", data_dir)
download_dataset("conservation-area", data_dir)

dl_db_path = os.path.join(data_dir, "digital-land.db")
ca_db_path = os.path.join(data_dir, "conservation-area.db")

In [9]:
# Organisation reference table (org_entity, org_name, organisation, dataset),
# fetched over the network; joined onto the resource lookup further down.
org_df = get_all_organisations()

## Analysis

In [None]:
# For each entity, get all geometry facts and the resource each one came from.
# A fact can appear in several resources, so the window function ranks them by
# entry_date (newest first) and only the rank-1 row per fact is kept.

q = """
    with facts_latest_resource as (

        select f.entity, f.fact, fr.resource, f.field, fr.entry_date, f.value as geometry, row_number() over (partition by f.fact order by fr.entry_date desc) as res_rank
        from fact f
        inner join fact_resource fr on f.fact = fr.fact
        where f.field = 'geometry'
        )
  
    select * 
    from facts_latest_resource
    where res_rank = 1
"""

ca_geom_facts = query_sqlite(db_path=ca_db_path, query_string=q)

# row count first, then a preview of the frame
print(ca_geom_facts.shape[0])
ca_geom_facts.head(5)


In [None]:
# get resource to org lookup and join on other org details

q = """
    select * from resource_organisation
"""

res_org_lookup = query_sqlite(dl_db_path, q)

# Strip the "-eng" marker so the codes match those in org_df.
# The vectorised .str accessor leaves missing organisations as NaN, whereas
# .apply(lambda x: x.replace(...)) raises AttributeError on them.
res_org_lookup["organisation"] = res_org_lookup["organisation"].str.replace("-eng", "", regex=False)

# Left join keeps every resource row even when no organisation details match;
# the organisation table's "dataset" column is used as the organisation type.
res_org_lookup = res_org_lookup.merge(
    org_df[["organisation", "org_name", "dataset"]],
    how = "left",
    on = "organisation"
).rename(columns={"dataset": "org_type"})


# sanity checks: total rows vs distinct rows vs distinct resources
print(len(res_org_lookup))
print(len(res_org_lookup.drop_duplicates()))
print(len(res_org_lookup["resource"].drop_duplicates()))
res_org_lookup.head()


In [15]:
# find resources which are duplicated across orgs
# res_org_count = res_org_lookup.groupby(["resource"]).size().reset_index(name = "count")
# res_dupes = res_org_count[res_org_count["count"] > 1]

# res_org_lookup[res_org_lookup["resource"].isin(res_dupes["resource"])].sort_values("resource")

In [None]:
# row count before the join, to compare with the count printed afterwards
print(len(ca_geom_facts))

# join resource org lookup to the geometry facts (left join on resource)
ca_geom_facts_org = ca_geom_facts.merge(res_org_lookup, how="left", on="resource")

# make gdf for mapping later: parse the WKT strings into shapely geometries
# and declare the coordinates as EPSG:4326
ca_geom_facts_org = gpd.GeoDataFrame(
    ca_geom_facts_org.assign(
        geometry=ca_geom_facts_org["geometry"].map(shapely.wkt.loads)
    ),
    geometry="geometry",
    crs=4326,
)

print(ca_geom_facts_org.shape[0])
ca_geom_facts_org.head(5)

In [None]:
# count number of org_types and org per entity
# per entity: number of distinct org types and distinct organisations
# among its geometry facts
ent_fact_org_count = (
    ca_geom_facts_org
    .groupby(["entity"], as_index=False)
    .agg(
        n_org_types=("org_type", "nunique"),
        n_orgs=("organisation", "nunique"),
    )
)

# issues are when we have a higher number of organisations than org types
# (e.g. two different LPA resources for just one entity)
bad_ents = ent_fact_org_count.loc[
    lambda counts: counts["n_orgs"] > counts["n_org_types"]
]
print(bad_ents.shape[0])
bad_ents

In [None]:
# all geometry facts belonging to the flagged entities, ordered by entity
bad_ents_gdf = (
    ca_geom_facts_org
    .loc[lambda facts: facts["entity"].isin(bad_ents["entity"])]
    .sort_values(by="entity")
)
bad_ents_gdf.head(5)

In [20]:
# for each entity, flag the geometry facts which don't intersect with any of the others (i.e. they're likely here from a bad merge)
def flag_non_intersecting(group):
    """Return a copy of ``group`` with a boolean ``non_intersecting`` column.

    Parameters
    ----------
    group : GeoDataFrame
        Rows for a single entity; must expose ``.geometry`` whose values
        support ``intersects``.

    Returns
    -------
    Same kind of frame as ``group.assign`` returns, with ``non_intersecting``
    True for rows whose geometry intersects no more than one geometry in the
    group (each row is compared against the whole group, itself included).
    The input frame is left unmodified.
    """
    geoms = group.geometry
    # compares every geometry with every geometry in the group, so the count
    # includes the row itself - hence "> 1" rather than "> 0"
    has_neighbour = geoms.apply(lambda geom: geoms.intersects(geom).sum() > 1)
    # assign() returns a new frame: mutating the object that groupby.apply
    # passes in is not supported by pandas and warns on sliced inputs
    return group.assign(non_intersecting=~has_neighbour.astype(bool))

# Run the flagging one entity at a time, then discard the index produced by
# the grouped apply in favour of a fresh 0..n-1 row index
bad_ents_gdf_flagged = (
    bad_ents_gdf
    .groupby('entity')
    .apply(flag_non_intersecting)
    .reset_index(drop=True)
)

# save the columns needed for review to a CSV in the working directory
bad_ents_gdf_flagged.loc[
    :, ["entity", "org_name", "non_intersecting", "geometry"]
].to_csv("conservation-area_entity-bad-merges.csv", index=False)

In [53]:
# check facts for specific entity
# bad_ents_gdf_flagged[bad_ents_gdf_flagged["entity"] == 44009929]

In [51]:
# map entity fact
# bad_ents_gdf_flagged[bad_ents_gdf_flagged["entity"] == 44009916].iloc[[0]].explore(  #
#     column = "org_name"
# )

In [52]:
# map all entity facts
# bad_ents_gdf_flagged[bad_ents_gdf_flagged["entity"] == 44009916].explore(  #
#     column = "org_name"
# )