# Identify Updated Endpoints
**Author**:  Kena Vyas <br>
**Date**:  23rd Feb 2024 <br>
**Data Scope**: First four datasets <br>
**Report Type**: Recurring daily <br>

## Purpose
This report lists the endpoints that have been updated, along with the change in entity count for each.

In [1]:
import urllib
import urllib.parse

import ipywidgets as widgets
import pandas as pd

In [2]:
datasette_url = "https://datasette.planning.data.gov.uk/"
# Holds the endpoints returned by the most recent get_all_endpoints() call;
# get_all_endpoints() overwrites it through `global`.
result_df = pd.DataFrame()

def get_all_endpoints(dataset):
    """Fetch the endpoints of one dataset that appear more than once in `resource_endpoint`.

    The query runs against the `digital-land` Datasette database. The result
    is returned and also stored in the module-level `result_df`.

    Parameters
    ----------
    dataset : str
        Value matched against `source_pipeline.pipeline`
        (e.g. "conservation-area").

    Returns
    -------
    pandas.DataFrame
        One row per match, with columns `endpoint` and `pipeline`.
    """
    global result_df
    # The dataset is sent as a Datasette named parameter (:dataset) rather than
    # being spliced into the SQL text, so quoting in the value cannot alter the query.
    params = urllib.parse.urlencode({
        "sql": """
        select t1.endpoint,
        t3.pipeline from(select endpoint from resource_endpoint 
        group by endpoint having count(endpoint)>1) t1 
        join source t2 on t1.endpoint=t2.endpoint 
        join source_pipeline t3 on t2.source=t3.source 
        where t3.pipeline=:dataset
        """,
        "dataset": dataset,
        "_size": "max"
    })
    url = f"{datasette_url}digital-land.csv?{params}"
    df = pd.read_csv(url)
    result_df = df
    return df
   
# Display label -> dataset value passed to get_all_endpoints().
dataset_options = {
    "Article 4 Direction": "article-4-direction",
    "Article 4 Direction Area": "article-4-direction-area",
    "Conservation Area": "conservation-area",
    "Listed Building Outline": "listed-building-outline",
    "Tree": "tree",
    "Tree Preservation Zone": "tree-preservation-zone",
    "Tree Preservation Order": "tree-preservation-order",
}

dataset_dropdown = widgets.Dropdown(
    options=dataset_options,
    description="dataset",
)

# Pass the dropdown itself to interact so that the widget on screen and
# `dataset_dropdown` are the same object; `dataset_dropdown.value` then
# always reflects the dataset whose endpoints were last fetched.
widgets.interact(get_all_endpoints, dataset=dataset_dropdown)
initial_dataset = dataset_dropdown.value

interactive(children=(Dropdown(description='dataset', options={'Article 4 Direction': 'article-4-direction', '…

This table lists each updated endpoint along with its endpoint URL and the number of entities added or deleted.

In [3]:
def _query_datasette(database, sql, **sql_params):
    """Run `sql` against `database` on the Datasette instance and return a DataFrame.

    Keyword arguments are sent as Datasette named parameters, i.e. `name=value`
    fills the `:name` placeholder in `sql`.
    """
    query = urllib.parse.urlencode({"sql": sql, "_size": "max", **sql_params})
    return pd.read_csv(f"{datasette_url}{database}.csv?{query}")


def _describe_change(diff):
    """Turn a non-zero entity-count difference into text such as '20 entities added'."""
    noun = "entity" if abs(diff) == 1 else "entities"
    verb = "added" if diff > 0 else "deleted"
    return f"{abs(diff)} {noun} {verb}"


# `result_df` is filled in by get_all_endpoints(). It is deliberately not
# reset in this cell, so the cell can be re-run for the same selection.
if "pipeline" not in result_df.columns:
    raise RuntimeError(
        "No endpoints have been loaded - run the dataset selection cell first."
    )
selected_dataset = result_df["pipeline"].iloc[0] if not result_df.empty else None

# Step 1: for every endpoint, find its two most recent resources (by start_date).
get_resource = {}
for endpoint in result_df["endpoint"].drop_duplicates():
    resources = _query_datasette(
        "digital-land",
        """
        select re.resource, re.endpoint, r.start_date
        from resource_endpoint re
        inner join resource r on re.resource = r.resource
        where re.endpoint = :endpoint
        """,
        endpoint=endpoint,
    )
    latest_two = resources.sort_values(by="start_date", ascending=False).head(2)
    if len(latest_two) < 2:
        # Without a previous resource there is nothing to compare against.
        continue
    get_resource[endpoint] = latest_two[["resource", "start_date"]].to_dict("records")

# Step 2: count the distinct entry numbers recorded for each of those resources.
# The resulting list is ordered [latest, previous].
all_resource_count = {}
for endpoint, resources in get_resource.items():
    all_resource_count[endpoint] = [
        int(
            _query_datasette(
                selected_dataset,
                """
                select count(*)
                from (select entry_number from fact_resource
                      where resource = :resource group by entry_number)
                """,
                resource=resource["resource"],
            ).iloc[0, 0]
        )
        for resource in resources
    ]

# Step 3: keep only endpoints whose count changed and describe the change.
endpoints_with_diff_count = {
    endpoint: _describe_change(latest - previous)
    for endpoint, (latest, previous) in all_resource_count.items()
    if latest != previous
}

# Step 4: look up the URL of each changed endpoint. When nothing changed the
# lookup is skipped and an empty frame is used, so the cell still finishes.
if endpoints_with_diff_count:
    placeholders = {
        f"p{position}": endpoint
        for position, endpoint in enumerate(endpoints_with_diff_count)
    }
    in_list = ", ".join(f":{name}" for name in placeholders)
    endpoint_urls = _query_datasette(
        "digital-land",
        f"select endpoint, endpoint_url from endpoint where endpoint in ({in_list})",
        **placeholders,
    )
else:
    endpoint_urls = pd.DataFrame(columns=["endpoint", "endpoint_url"])

# Step 5: combine the change descriptions with the endpoint URLs.
updates = pd.DataFrame(
    list(endpoints_with_diff_count.items()), columns=["endpoint", "update"]
)
updated_endpoints = updates.merge(endpoint_urls, on="endpoint", how="left")
updated_endpoints

Unnamed: 0,endpoint,update,endpoint_url
0,03b17776d0e7707bdb98768bdcac82cf7d01595a353603...,20 entities added,https://services-eu1.arcgis.com/xk4RA36G57mVH7...


In [4]:
# Ask before writing a file so that running the notebook does not silently
# overwrite an earlier export.
download = input("Do you want to download the result? (yes/no): ")

# Ignore surrounding whitespace and letter case, and accept "y" as well as
# "yes"; any other answer skips the export.
if download.strip().lower() in {"yes", "y"}:
    updated_endpoints.to_csv("updated_endpoints.csv", index=False)
    print("Query result downloaded as 'updated_endpoints.csv'")

Do you want to download the result? (yes/no):  yes


Query result downloaded as 'updated_endpoints.csv'
