In [1]:
import pandas as pd
import urllib.parse
import ipywidgets as widgets

# Merging the Entity Tables of Two Datasets
The following cell lets you select a combination of datasets, then fetches the two corresponding entity tables and merges them into a single dataframe. Because the merge is an outer join, rows often have null values in the `_x`/`_y` fields wherever an entry in one dataset has no match in the other dataset.

In [2]:
# Base URL of the Datasette instance; endpoint paths are appended directly to it.
datasette_url = "https://datasette.planning.data.gov.uk/"

# Dropdown label -> [first dataset, second dataset].
# The pair is passed to the functions below as `dataset_combination`; the first
# name is also the column in the second dataset's CSV that is joined against
# the first dataset's `reference`.
collection_options = {
    "Article 4 Direction and Article 4 Direction Area": [
        "article-4-direction",
        "article-4-direction-area",
    ],
    "Conservation Area and Documents": [
        "conservation-area",
        "conservation-area-document",
    ],
    "Tree Preservation Order and Tree": [
        "tree-preservation-order",
        "tree",
    ],
    "Tree Preservation Order and Tree Preservation Zones": [
        "tree-preservation-order",
        "tree-preservation-zone",
    ],
}

# Standalone dropdown over the label -> dataset-pair mapping.
# Its `.value` is read right after each `widgets.interact(...)` call below.
collection_dropdown = widgets.Dropdown(
    description="Select dataset combination:",
    options=collection_options,
)

def get_organisations():
    """Download the `organisation` table from Datasette as a DataFrame.

    Sends a `select *` over the `organisation` table to the
    `digital-land.csv` endpoint under `datasette_url`, with `_size=max`,
    and parses the CSV response with `pd.read_csv`.

    Side effect:
        The resulting frame is also stored in the module-level `org_df`.

    Returns:
        The DataFrame parsed from the CSV response.
    """
    global org_df
    # The query text (including its whitespace) is sent URL-encoded as-is.
    organisation_sql = """
        select
          *
        from
          organisation o
        """
    query_string = urllib.parse.urlencode({"sql": organisation_sql, "_size": "max"})
    org_df = pd.read_csv(f"{datasette_url}digital-land.csv?{query_string}")
    return org_df

def get_spatial_doc_matched_df(dataset_combination):
    """Outer-join the entity tables of two related datasets and add organisation names.

    Both dataset CSVs are downloaded from files.planning.data.gov.uk. Rows of
    the first dataset are matched to rows of the second where the first
    dataset's `reference` equals the second dataset's column named after the
    first dataset, and `organisation-entity` is equal on both sides. Rows
    without a partner are kept (outer join) with nulls on the missing side.

    Args:
        dataset_combination: Sequence whose item 0 is the first dataset name
            and item 1 the second dataset name. The second dataset's CSV must
            contain a column named after the first dataset.

    Side effect:
        The result is also stored in the module-level `spatial_doc_matched_df`.

    Returns:
        DataFrame with columns `entity_x`, `entity_y`, `dataset_x`,
        `dataset_y`, `name`, `reference` and `<first dataset name>`, where
        `_x` comes from the first dataset and `_y` from the second.

    Raises:
        ValueError: from `pd.read_csv` if an expected column is absent from a
            downloaded CSV (`usecols` validation).
    """
    global spatial_doc_matched_df
    first_dataset = dataset_combination[0]
    second_dataset = dataset_combination[1]
    files_url = "https://files.planning.data.gov.uk/dataset/"

    first_columns = ['entity', 'dataset', 'organisation-entity', 'reference']
    second_columns = ['entity', 'dataset', 'organisation-entity', first_dataset]

    # Read both join keys as text. With inferred dtypes, one file whose
    # references are all digits (int64) and another with alphanumeric
    # references (object) make pd.merge raise ValueError instead of joining.
    # `usecols` skips parsing of every column that is not needed here.
    df_one = pd.read_csv(
        f"{files_url}{first_dataset}.csv",
        usecols=first_columns,
        dtype={'reference': str},
    )[first_columns]
    df_two = pd.read_csv(
        f"{files_url}{second_dataset}.csv",
        usecols=second_columns,
        dtype={first_dataset: str},
    )[second_columns]

    # The differently named key columns are both kept in the result, so an
    # unmatched row shows a null in the key of the side that is missing.
    # NOTE(review): pandas matches null keys to each other, so rows with a null
    # key on both sides (same organisation) would pair up - confirm acceptable.
    merged_df = pd.merge(
        df_one,
        df_two,
        how='outer',
        left_on=['reference', 'organisation-entity'],
        right_on=[first_dataset, 'organisation-entity'],
    )

    # Attach the organisation name; left join keeps rows with unknown organisations.
    organisation_names = get_organisations()[['name', 'entity']]
    merged_df = pd.merge(
        merged_df,
        organisation_names,
        how='left',
        left_on='organisation-entity',
        right_on='entity',
    )

    spatial_doc_matched_df = merged_df[
        ['entity_x', 'entity_y', 'dataset_x', 'dataset_y', 'name', 'reference', first_dataset]
    ]
    return spatial_doc_matched_df

# `interact` renders its own dropdown built from `collection_options` and passes
# the selected [first, second] dataset pair to `get_spatial_doc_matched_df`.
widgets.interact(get_spatial_doc_matched_df, dataset_combination=collection_options)
# Current value of the separate `collection_dropdown` widget (a dataset pair,
# despite the variable name). No other visible cell reads this variable.
# Keep this assignment last: it stops the cell from echoing interact's return value.
initial_organisation = collection_dropdown.value

interactive(children=(Dropdown(description='dataset_combination', options={'Article 4 Direction and Article 4 …

In [3]:
# Ask whether to save the merged table produced by the cell above.
# `spatial_doc_matched_df` is the global set by `get_spatial_doc_matched_df`,
# so that function must have run successfully before this cell.
download = input("Do you want to download the table? (yes/no): ")

# Ignore surrounding whitespace and letter case, and accept the short form "y";
# previously an answer such as "yes " was silently treated as "no".
if download.strip().lower() in ("yes", "y"):
    spatial_doc_matched_df.to_csv("spatial_doc_matched_df.csv", index=False)
    print("Query result downloaded as 'spatial_doc_matched_df.csv'")

Do you want to download the table? (yes/no):  yes


Query result downloaded as 'spatial_doc_matched_df.csv'


# Filtering for Rows with Null Values
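The following cell rebuilds the merged dataframe described above and filters it, keeping only rows that contain at least one null value. This is to identify which entities from dataframe 'x' do not match with entities in dataframe 'y' (and vice versa). Note that the null check covers every column, so a row whose organisation name is missing is also included.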

In [4]:
def get_mismatched_entities(dataset_combination):
    """Return the merged rows that have a null in at least one column.

    Builds the merged table by calling `get_spatial_doc_matched_df` with
    `dataset_combination`, keeps every row containing any null value, and
    renumbers the index from zero.

    Side effect:
        The result is also stored in the module-level `mismatched_entities`.

    Returns:
        DataFrame with the same columns as the merged table.
    """
    global mismatched_entities
    merged = get_spatial_doc_matched_df(dataset_combination)
    # Row-wise flag: True when any column of that row is null.
    has_null = merged.isna().any(axis="columns")
    mismatched_entities = merged.loc[has_null].reset_index(drop=True)
    return mismatched_entities
    
# `interact` renders its own dropdown built from `collection_options` and passes
# the selected [first, second] dataset pair to `get_mismatched_entities`.
widgets.interact(get_mismatched_entities, dataset_combination=collection_options)
# Current value of the separate `collection_dropdown` widget (a dataset pair,
# despite the variable name). No other visible cell reads this variable.
# Keep this assignment last: it stops the cell from echoing interact's return value.
initial_organisation = collection_dropdown.value

interactive(children=(Dropdown(description='dataset_combination', options={'Article 4 Direction and Article 4 …

In [5]:
# Ask whether to save the filtered table produced by the cell above.
# `mismatched_entities` is the global set by `get_mismatched_entities`,
# so that function must have run successfully before this cell.
download = input("Do you want to download the table? (yes/no): ")

# Ignore surrounding whitespace and letter case, and accept the short form "y";
# previously an answer such as "yes " was silently treated as "no".
if download.strip().lower() in ("yes", "y"):
    mismatched_entities.to_csv("mismatched_entities.csv", index=False)
    print("Query result downloaded as 'mismatched_entities.csv'")

Do you want to download the table? (yes/no):  yes


Query result downloaded as 'mismatched_entities.csv'
