# Compare Entity Count
**Author**:  Kena Vyas <br>
**Date**:  2nd May 2024 <br>
**Data Scope**: First four datasets <br>
**Report Type**: Recurring daily <br>

## Purpose
This report compares, for each organisation and dataset, the number of entities in the latest resource collected from each endpoint with the number of entities held on the platform.

In [1]:
import urllib
import urllib.parse

import ipywidgets as widgets
import pandas as pd

In [2]:
# Empty placeholder DataFrame held at module scope.
result_df = pd.DataFrame()

# Base URL of the Datasette instance that every query below is sent to.
datasette_url = "https://datasette.planning.data.gov.uk/"

def get_endpoint_resource_info(dataset):
    """Return the status-200 endpoints, with their resources, for one dataset.

    Sends a SQL query to the ``digital-land`` Datasette database that joins
    ``provision`` to ``reporting_latest_endpoints`` (matching organisations
    after stripping ``-eng``), restricted to the ODP/RIPA cohorts listed in
    the query, to rows whose pipeline equals *dataset* and whose status is
    ``'200'``, grouped by endpoint.

    Args:
        dataset: Pipeline name interpolated into the SQL filter.

    Returns:
        A pandas DataFrame parsed from the CSV response.
    """
    query = f"""
        select p.organisation,p.start_date,re.name,re.collection,re.pipeline,re.endpoint,
        re.endpoint_url,re.status,re.resource,re.latest_log_entry_date,re.endpoint_entry_date
        from provision p join reporting_latest_endpoints re 
        on p.organisation= replace(re.organisation, '-eng', '') 
        where cohort in ('ODP-Track1','ODP-Track2','ODP-Track3','ODP-Track4','RIPA-Beta','RIPA-BOPS') and 
        re.pipeline = '{dataset}' and status='200' 
        group by endpoint
        """
    query_string = urllib.parse.urlencode({"sql": query, "_size": "max"})
    endpoint_csv = f"{datasette_url}digital-land.csv?{query_string}"
    return pd.read_csv(endpoint_csv)
 
# Dropdown label -> dataset (pipeline) name passed to the query functions.
dataset_options = {
    "Article 4 direction": "article-4-direction",
    "Article 4 direction area": "article-4-direction-area",
    "Conservation area": "conservation-area",
    "Listed building outline": "listed-building-outline",
    "Tree": "tree",
    "Tree preservation order": "tree-preservation-order",
    "Tree preservation zone": "tree-preservation-zone",
}

dataset_dropdown = widgets.Dropdown(options=dataset_options, description="Select Dataset:")

# Hook the dropdown up to the endpoint query so it is called with the selected dataset.
widgets.interact(get_endpoint_resource_info, dataset=dataset_dropdown)

# Remember the dataset the dropdown currently has selected.
initial_dataset = dataset_dropdown.value

interactive(children=(Dropdown(description='Select Dataset:', options={'Article 4 direction': 'article-4-direc…

In [3]:
def _query_csv(database, sql):
    """Run *sql* against the named Datasette database and return the CSV result as a DataFrame."""
    params = urllib.parse.urlencode({"sql": sql, "_size": "max"})
    return pd.read_csv(f"{datasette_url}{database}.csv?{params}")


def _fetch_entity_rows(database, build_sql, cursor_column, page_size=1000):
    """Page through a query and return its [entity, reference, organisation_entity] rows.

    Args:
        database: Datasette database name to query.
        build_sql: Callable taking the last *cursor_column* value seen so far
            (0 for the first page) and returning SQL that selects only rows
            beyond it, ordered by that column.
        cursor_column: Result column used as the paging cursor.
        page_size: A page shorter than this is treated as the last one.
            NOTE(review): 1000 is the value the original loops hard-coded;
            confirm it matches the server's row cap.

    Returns:
        A list of ``[entity, reference, organisation_entity]`` lists.
    """
    rows = []
    cursor = 0
    while True:
        page = _query_csv(database, build_sql(cursor))
        rows.extend(page[['entity', 'reference', 'organisation_entity']].values.tolist())
        if len(page) < page_size:
            return rows
        # Pages are ordered by the cursor column, so the last row carries the
        # highest value fetched so far.
        cursor = page[cursor_column].iloc[-1]


def _latest_resource_rows(database, resource):
    """Return one entity row per entry_number of *resource* from fact_resource."""
    def build_sql(cursor):
        return f"""
            select fe.end_date, fe.fact, fe.entry_date, fe.entry_number, fe.resource, fe.start_date,f.entity,e.reference,e.organisation_entity
            from fact_resource fe join fact f on fe.fact=f.fact join entity e on f.entity=e.entity
            where resource='{resource}' and fe.entry_number>{cursor}
            group by entry_number
            order by fe.entry_number
            """

    return _fetch_entity_rows(database, build_sql, 'entry_number')


def _platform_rows(database, organisation_entity):
    """Return the entity rows in *database* that belong to *organisation_entity*."""
    def build_sql(cursor):
        # Only the three columns that are actually used are requested.
        return f"""
            select entity, reference, organisation_entity
            from entity where organisation_entity = '{organisation_entity}'
            and entity>{cursor}
            order by entity
            """

    return _fetch_entity_rows(database, build_sql, 'entity')


def get_endpoint_resource_info_(dataset):
    """Compare latest-resource entity counts with platform entity counts.

    For every organisation returned by ``get_endpoint_resource_info(dataset)``
    the number of distinct entries in its latest resource is compared with the
    number of entities recorded for that organisation in the dataset database.
    When at least one organisation differs, the platform entities that do not
    appear in any of the latest resources are written to ``result.csv`` in the
    current working directory.

    If an organisation has several endpoint rows, the resource of the last row
    is used for both the count and the entity listing.

    Args:
        dataset: Dataset (pipeline) name, e.g. a value of ``dataset_options``.

    Returns:
        A DataFrame indexed by organisation with the columns
        ``'Latest resource entity count'`` and ``'Platform entity count'`` for
        the organisations whose counts differ, or a message string when there
        are no endpoints or every count matches.
    """
    endpoints = get_endpoint_resource_info(dataset)
    if endpoints.empty:
        # Nothing to compare; indexing the first pipeline value would raise.
        return "No endpoints found for the selected dataset"
    dataset_input = endpoints['pipeline'].iloc[0]

    # Count the entries of each distinct resource once, and remember which
    # resource represents each organisation (the last row wins).
    resource_counts = {}
    org_resource = {}
    for organisation, resource in zip(endpoints['organisation'], endpoints['resource']):
        if resource not in resource_counts:
            count_df = _query_csv(dataset_input, f"""
                select count(*) from (
                    select rowid, end_date, fact, entry_date, entry_number, resource, start_date
                    from fact_resource
                    where "resource" ='{resource}' group by entry_number
                );
                """)
            resource_counts[resource] = count_df.iloc[0, 0]
        org_resource[organisation] = resource

    # Look up each organisation's entity number once and count its platform entities.
    org_entity = {}
    counts = {}
    for organisation, resource in org_resource.items():
        org_entity[organisation] = _query_csv(
            "digital-land",
            f"select entity from organisation where organisation = '{organisation}'",
        ).iloc[0, 0]
        platform_count = _query_csv(
            dataset_input,
            "select count(*) from (select * from entity "
            f"where \"organisation_entity\" = '{org_entity[organisation]}')",
        ).iloc[0, 0]
        # A fresh list per organisation, so organisations sharing a resource
        # never share (and corrupt) one another's counts.
        counts[organisation] = [resource_counts[resource], platform_count]

    mismatches = {org: pair for org, pair in counts.items() if pair[0] != pair[1]}
    if not mismatches:
        return "Entities Match for all endpoints"

    res_df = pd.DataFrame(mismatches).transpose()
    res_df.columns = ['Latest resource entity count', 'Platform entity count']
    print("Dataset : ", dataset_input)

    # Every organisation is scanned here (not only the mismatching ones), as before.
    latest_entities = set()
    platform_rows = []
    for organisation, resource in org_resource.items():
        latest_entities.update(
            row[0] for row in _latest_resource_rows(dataset_input, resource)
        )
        platform_rows.extend(_platform_rows(dataset_input, org_entity[organisation]))

    # Platform entities that none of the latest resources mention.
    missing = [row for row in platform_rows if row[0] not in latest_entities]
    missing_df = pd.DataFrame(missing, columns=['entity', 'reference', 'organisation_entity'])
    missing_df.to_csv('result.csv', index=False)

    print("File result.csv downloaded")
    return res_df

# Bind the same dataset dropdown to the count-comparison function so it is
# called with the selected dataset.
widgets.interact(get_endpoint_resource_info_, dataset=dataset_dropdown)
# Record the dataset the dropdown currently has selected.
initial_dataset = dataset_dropdown.value

interactive(children=(Dropdown(description='Select Dataset:', options={'Article 4 direction': 'article-4-direc…