In [None]:
import urllib
import urllib.parse

import pandas as pd

This notebook takes a .csv file with the columns `organisation`, `name` and `start_date` as input. For each organisation in that file, it finds the most recently added endpoint in the specified collection, and then looks up the entity count of the resource linked to that endpoint.

In [None]:
# Display options: never truncate cell contents or rows when a frame is displayed.
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_rows', None)

# Configuration: Datasette instance, collection to inspect, and the input file.
datasette_url = "https://datasette.planning.data.gov.uk/"

collection_input = "brownfield-land"

last_updated_file_input = "brownfield-land-last-updated.csv"

last_updated_df = pd.read_csv(last_updated_file_input)

# One row per organisation: the endpoint with the latest entry_date for the
# collection, together with a resource linked to that endpoint.
# :organisation and :collection are Datasette named parameters; their values
# travel as separate query-string arguments instead of being spliced into the
# SQL text, so a value containing a quote cannot break the query.
latest_endpoint_sql = """
    select x.organisation, x.endpoint, x.entry_date, x.resource
    from (
        select e.endpoint, e.entry_date, s.organisation, re.resource,
            row_number() over (partition by s.organisation order by e.entry_date desc) as row_number
        from endpoint e
        inner join source s
        on e.endpoint = s.endpoint
        inner join resource_endpoint re
        on e.endpoint = re.endpoint
        where s.organisation = :organisation
        and s.collection = :collection
    ) x
    where x.row_number=1
"""

# Query each organisation once, even if it appears on several input rows.
endpoint_frames = []
for organisation in last_updated_df["organisation"].dropna().unique():
    params = urllib.parse.urlencode({
        "sql": latest_endpoint_sql,
        "organisation": organisation,
        "collection": collection_input,
        "_size": "max",
    })
    url = f"{datasette_url}digital-land.csv?{params}"
    endpoint_frames.append(pd.read_csv(url))

# pd.concat raises ValueError on an empty list, so fall back to an empty frame
# with the expected columns when the input file lists no organisations.
if endpoint_frames:
    endpoints_df = pd.concat(endpoint_frames, ignore_index=True)
else:
    endpoints_df = pd.DataFrame(columns=["organisation", "endpoint", "entry_date", "resource"])

In [None]:
# Entity count recorded in the collection's dataset_resource table for one resource.
# :resource is a Datasette named parameter supplied as its own query-string
# argument rather than being formatted into the SQL text.
entity_count_sql = """
    select resource, entity_count
    from dataset_resource
    where resource = :resource
"""

# Look up each distinct resource once.
entity_count_frames = []
for resource in endpoints_df["resource"].dropna().unique():
    params = urllib.parse.urlencode({
        "sql": entity_count_sql,
        "resource": resource,
        "_size": "max",
    })
    url = f"{datasette_url}{collection_input}.csv?{params}"
    entity_count_frames.append(pd.read_csv(url))

# pd.concat raises ValueError on an empty list, so fall back to an empty frame
# with the expected columns when no endpoints were found.
if entity_count_frames:
    entity_counts_df = pd.concat(entity_count_frames, ignore_index=True)
else:
    entity_counts_df = pd.DataFrame(columns=["resource", "entity_count"])

In [None]:
# Attach the entity count to each organisation's latest endpoint via the shared
# resource column, then bring in the remaining columns from the input file.
# The input frame is last_updated_df (loaded from last_updated_file_input).
output_df = pd.merge(endpoints_df, entity_counts_df, on="resource")
output_df = pd.merge(output_df, last_updated_df, on="organisation")

# Keep the reporting columns, parse start_date as day-first dates, order by it
# (oldest first) and drop repeated rows. .assign builds a new frame, which
# avoids writing a column onto a column-subset of another frame.
output_df = (
    output_df[["organisation", "name", "entity_count", "start_date"]]
    .assign(start_date=lambda frame: pd.to_datetime(frame["start_date"], dayfirst=True))
    .sort_values(by=["start_date"], ascending=True)
    .drop_duplicates(ignore_index=True)
)

display(output_df)
output_df.to_csv(f"{collection_input}-entity-count-last-updated.csv", index=False)