In [1]:
from datetime import datetime as dt
from IPython.display import display
import urllib
import numpy as np
import math
import pandas as pd
import ipywidgets as widgets
import requests
import urllib.parse

# All Endpoints Information

In [2]:
# Base URL of the Datasette instance that the queries in this notebook are sent to
datasette_url = "https://datasette.planning.data.gov.uk/"

# Module-level holder for the most recent result of update_dataframe(); starts empty
result_df = pd.DataFrame()

def update_dataframe(organisation):
    """Fetch the endpoint log summary for one organisation, or for all of them.

    Sends a SQL query to the ``digital-land`` database at ``datasette_url`` and
    loads the CSV response.  Rows are grouped by endpoint and status, and the
    ``brownfield-land`` collection is excluded.

    Args:
        organisation: Organisation identifier to filter on; a falsy value
            (e.g. ``None``) selects every organisation.

    Returns:
        pandas.DataFrame: the query result.  The same frame is also stored in
        the module-level ``result_df``.
    """
    global result_df
    # A falsy organisation falls back to a match-everything LIKE pattern
    query = f" s.organisation = '{organisation}'" if organisation else " s.organisation LIKE '%'"
    sql = f"""
        select
          e.endpoint_url,
          l.status,
          l.exception,
          s.collection,
          group_concat(DISTINCT sp.pipeline) as pipelines,
          s.organisation,
          o.name,
          max(l.entry_date) maxentrydate,
          max(e.entry_date) entrydate,
          e.end_date
        from
          log l
          inner join source s on l.endpoint = s.endpoint
          inner join organisation o on s.organisation=o.organisation
          inner join endpoint e on l.endpoint = e.endpoint
          inner join source_pipeline sp on s.source = sp.source
        where
           {query} and not collection="brownfield-land"
        group by
          l.endpoint,
          l.status
        order by
          l.endpoint,
          s.collection,
          maxentrydate desc
        """
    params = urllib.parse.urlencode({"sql": sql, "_size": "max"})

    # Datasette serves the query result as CSV, which pandas reads straight from the URL
    result_df = pd.read_csv(f"{datasette_url}digital-land.csv?{params}")
    return result_df

# Display label -> organisation identifier; None means "no organisation filter"
organisation_options = {
    "All LPA's": None,
    "Newcastle": "local-authority-eng:NET",
    "Medway": "local-authority-eng:MDW",
    "Lambeth": "local-authority-eng:LBH",
    "Gloucester": "local-authority-eng:GLO",
    "Doncaster": "local-authority-eng:DNC",
    "Buckinghamshire": "local-authority-eng:BUC",
    "Epsom and Ewell": "local-authority-eng:EPS",
    "Canterbury": "local-authority-eng:CAT",
    "Bolton": "local-authority-eng:BOL",
    "London Borough of Southwark": "local-authority-eng:SWK",
}

# One dropdown instance, passed to every organisation-filtered interact() call in the notebook
organisation_dropdown = widgets.Dropdown(
    description="Select LPA:",
    options=organisation_options,
)

# Bind the dropdown to update_dataframe so the table follows the selection
widgets.interact(update_dataframe, organisation=organisation_dropdown)
initial_organisation = organisation_dropdown.value

interactive(children=(Dropdown(description='Select LPA:', options={"All LPA's": None, 'Newcastle': 'local-auth…

In [3]:
# Ask whether the table held in result_df should be exported
download = input("Do you want to download the table with all endpoints? (yes/no): ")

# Only the answer "yes" (compared case-insensitively) triggers the export
if download.lower() == "yes":
    # Relative path: the CSV is written to the current working directory, without the index column
    result_df.to_csv("endpoints_with_all_status.csv", index=False)
    print("Query result downloaded as 'endpoints_with_all_status.csv'") 

Do you want to download the table with all endpoints? (yes/no):  no


# Endpoints with current/latest status

In [4]:
# Module-level holder for the most recent result of update_dataframe_latest_status(); starts empty
new_df = pd.DataFrame()

def update_dataframe_latest_status(organisation):
    """Reduce the endpoint log summary to one row per endpoint url.

    Starts from ``update_dataframe(organisation)`` and adds three columns:

    * ``last_status`` / ``last_updated_date``: for a row whose status is not
      200, the status and ``maxentrydate`` of the row directly below it, when
      that row has the same ``endpoint_url``.
    * ``date_last_status_200``: when ``last_status`` is set and is not 200, the
      first 19 characters of ``maxentrydate`` from the first status-200 row
      for the same ``endpoint_url``.

    Only the first row per ``endpoint_url`` is kept.

    Args:
        organisation: Organisation identifier, or a falsy value for all.

    Returns:
        pandas.DataFrame: the reduced table, also stored in the module-level
        ``new_df``.
    """
    global new_df
    all_endpoints = update_dataframe(organisation)

    new_df = all_endpoints.copy()
    new_df['maxentrydate'] = pd.to_datetime(new_df['maxentrydate'])
    for column in ('last_status', 'last_updated_date', 'date_last_status_200'):
        new_df[column] = None

    # Pass 1: pair each non-200 row with the row directly below it.
    # Relies on the default 0..n-1 index, so position + 1 is the next row.
    last_position = len(new_df) - 1
    for position, current in new_df.iterrows():
        if position >= last_position:
            continue
        if not (current['status'] != 200 or pd.isna(current['status'])):
            continue
        following = position + 1
        if current['endpoint_url'] != new_df.at[following, 'endpoint_url']:
            continue
        new_df.at[position, 'last_status'] = new_df.at[following, 'status']
        new_df.at[position, 'last_updated_date'] = new_df.at[following, 'maxentrydate']

    # Keep only the first row of every endpoint url
    new_df.drop_duplicates(subset='endpoint_url', keep='first', inplace=True)
    new_df.reset_index(drop=True, inplace=True)

    # Pass 2: when the neighbouring status was not 200 either, look up a status-200 row
    for position, current in new_df.iterrows():
        previous_status = current['last_status']
        if previous_status is None or previous_status == 200:
            continue
        same_url = all_endpoints['endpoint_url'] == current['endpoint_url']
        ok_rows = all_endpoints[same_url & (all_endpoints['status'] == 200)]
        if not ok_rows.empty:
            # all_endpoints still holds maxentrydate as read from the CSV; keep its first 19 characters
            new_df.at[position, 'date_last_status_200'] = ok_rows['maxentrydate'].values[0][:19]
    return new_df

# Bind the shared organisation dropdown to update_dataframe_latest_status
widgets.interact(update_dataframe_latest_status, organisation=organisation_dropdown)
# NOTE(review): initial_organisation is not read anywhere in the visible notebook — confirm it is still needed
initial_organisation = organisation_dropdown.value

interactive(children=(Dropdown(description='Select LPA:', options={"All LPA's": None, 'Newcastle': 'local-auth…

In [5]:
# Ask whether the table held in new_df should be exported
download = input("Do you want to download the table with latest endpoints? (yes/no): ")

# Only the answer "yes" (compared case-insensitively) triggers the export
if download.lower() == "yes":
    # Relative path: the CSV is written to the current working directory, without the index column
    new_df.to_csv("endpoints_with_latest_status.csv", index=False)
    print("Query result downloaded as 'endpoints_with_latest_status.csv'")

Do you want to download the table with latest endpoints? (yes/no):  no


# Endpoints with status NOT 200

In [6]:
# Module-level holder for the most recent result of update_dataframe_erroring_endpoints(); starts empty
filtered_df = pd.DataFrame()

def update_dataframe_erroring_endpoints(organisation):
    """Return the latest-status table restricted to rows whose status is not 200.

    Args:
        organisation: Organisation identifier, or a falsy value for all.

    Returns:
        pandas.DataFrame: rows of ``update_dataframe_latest_status(organisation)``
        with ``status != 200``, re-indexed from 0.  The same frame is also
        stored in the module-level ``filtered_df``.
    """
    global filtered_df
    latest = update_dataframe_latest_status(organisation)
    not_ok = latest['status'] != 200
    filtered_df = latest[not_ok].reset_index(drop=True)
    return filtered_df

# Bind the shared organisation dropdown to update_dataframe_erroring_endpoints
widgets.interact(update_dataframe_erroring_endpoints, organisation=organisation_dropdown)
# NOTE(review): initial_organisation is not read anywhere in the visible notebook — confirm it is still needed
initial_organisation = organisation_dropdown.value

interactive(children=(Dropdown(description='Select LPA:', options={"All LPA's": None, 'Newcastle': 'local-auth…

In [7]:
# Ask whether the table held in filtered_df should be exported
download = input("Do you want to download the table with erroring endpoints being collected till date? (yes/no): ")

# Only the answer "yes" (compared case-insensitively) triggers the export
if download.lower() == "yes":
    # Relative path: the CSV is written to the current working directory, without the index column
    filtered_df.to_csv("endpoints_not_200.csv", index=False)
    print("Query result downloaded as 'endpoints_not_200.csv'")

Do you want to download the table with erroring endpoints being collected till date? (yes/no):  no


# First 4 datasets - Endpoints with status NOT 200 - All LPA's

In [8]:
# Base URL of the Datasette instance; same value as assigned earlier in the notebook
datasette_url = "https://datasette.planning.data.gov.uk/"

# Module-level holder for the most recent result of update_dataframe1(); starts empty.
# This cell previously reset result_df here instead, which discarded the table built by
# update_dataframe() and left df1 undefined for the download cell when the first query failed.
df1 = pd.DataFrame()

def update_dataframe1(collection):
    """Fetch the endpoints whose first listed status is not 200, for one collection or a fixed set.

    Queries the ``digital-land`` database at ``datasette_url`` (rows grouped by
    endpoint and status), then:

    * for each non-200 row, copies the status and ``maxentrydate`` of the row
      directly below it into ``last_status`` / ``last_updated_date`` when both
      rows share an ``endpoint_url``;
    * keeps only the first row per ``endpoint_url``;
    * when ``last_status`` is set and is not 200, fills ``date_last_status_200``
      from the first status-200 row for that ``endpoint_url``;
    * drops rows whose status is 200.

    Args:
        collection: Collection name to filter on; a falsy value selects the
            fixed list of collections written into the query below.

    Returns:
        pandas.DataFrame: the filtered table, re-indexed from 0.  The same
        frame is also stored in the module-level ``df1``.
    """
    global df1
    if collection:
        query = f" s.collection = '{collection}'"
    else:
        query = " s.collection IN ('article-4-direction', 'listed-building', 'tree-preservation-order', 'conservation-area','article-4-direction-area','article-4-direction-rule','listed-building-grade','listed-building-outline','listed-building-building','tree','tree-preservation-zone','tree-preservation-zone-type')"
    sql = f"""
        select
          e.endpoint_url,
          l.status,
          l.exception,
          s.collection,
          group_concat(DISTINCT sp.pipeline) as pipelines,
          s.organisation,
          o.name,
          max(l.entry_date) maxentrydate,
          max(e.entry_date) entrydate,
          e.end_date
        from
          log l
          inner join source s on l.endpoint = s.endpoint
          inner join organisation o on s.organisation=o.organisation
          inner join endpoint e on l.endpoint = e.endpoint
          inner join source_pipeline sp on s.source = sp.source
        where
           {query} 
        group by
          l.endpoint,
          l.status
        order by
          pipelines,
          o.name,
          maxentrydate desc
        """
    params = urllib.parse.urlencode({"sql": sql, "_size": "max"})

    df1 = pd.read_csv(f"{datasette_url}digital-land.csv?{params}")
    # Snapshot taken before maxentrydate is converted; used for the status-200 lookup below
    raw_rows = df1.copy()

    df1['maxentrydate'] = pd.to_datetime(df1['maxentrydate'])
    for column in ('last_status', 'last_updated_date', 'date_last_status_200'):
        df1[column] = None

    # Pass 1: pair each non-200 row with the row directly below it.
    # Relies on the default 0..n-1 index, so position + 1 is the next row.
    last_position = len(df1) - 1
    for position, current in df1.iterrows():
        if position >= last_position:
            continue
        if not (current['status'] != 200 or pd.isna(current['status'])):
            continue
        following = position + 1
        if current['endpoint_url'] != df1.at[following, 'endpoint_url']:
            continue
        df1.at[position, 'last_status'] = df1.at[following, 'status']
        df1.at[position, 'last_updated_date'] = df1.at[following, 'maxentrydate']

    # Keep only the first row of every endpoint url
    df1.drop_duplicates(subset='endpoint_url', keep='first', inplace=True)
    df1.reset_index(drop=True, inplace=True)

    # Pass 2: when the neighbouring status was not 200 either, look up a status-200 row
    for position, current in df1.iterrows():
        previous_status = current['last_status']
        if previous_status is None or previous_status == 200:
            continue
        same_url = raw_rows['endpoint_url'] == current['endpoint_url']
        ok_rows = raw_rows[same_url & (raw_rows['status'] == 200)]
        if not ok_rows.empty:
            # Keep the first 19 characters of the stored maxentrydate value
            df1.at[position, 'date_last_status_200'] = ok_rows['maxentrydate'].values[0][:19]

    # Only erroring endpoints are reported
    df1 = df1[df1['status'] != 200].reset_index(drop=True)

    return df1


# Display label -> collection name; None means "use the default list of collections"
collection_options = {
    "All 4 datasets": None,
    "article 4 direction": "article-4-direction",
    "conservation area": "conservation-area",
    "listed building": "listed-building",
    "tree preservation order": "tree-preservation-order",
}

# Dropdown used to pick the collection passed to update_dataframe1
collection_dropdown = widgets.Dropdown(
    description="Select dataset:",
    options=collection_options,
)

# Bind the dropdown to update_dataframe1 so the table follows the selection
widgets.interact(update_dataframe1, collection=collection_dropdown)
# NOTE(review): this reads the organisation dropdown, not collection_dropdown, and the
# value is not used in the visible notebook — confirm whether this line is still needed
initial_organisation = organisation_dropdown.value

interactive(children=(Dropdown(description='Select dataset:', options={'All 4 datasets': None, 'article 4 dire…

In [9]:
# Ask whether the table held in df1 should be exported
download = input("Do you want to download the table with erroring endpoints being collected till date? (yes/no): ")

# Only the answer "yes" (compared case-insensitively) triggers the export
if download.lower() == "yes":
    # Relative path: the CSV is written to the current working directory, without the index column
    df1.to_csv("endpoints_not_200_first_4_datasets.csv", index=False)
    print("Query result downloaded as 'endpoints_not_200_first_4_datasets.csv'")

Do you want to download the table with erroring endpoints being collected till date? (yes/no):  no


# Counting the Number of Distinct Non-ended Endpoints

This SQL query will **count** the total number of **distinct** and **active** endpoint urls for each collection dataset, for each organisation, and will only return rows with counts greater than 1. Datasets with more than one endpoint url can then be further investigated, to see whether these additional endpoint urls are dysfunctional or no longer needed.

**N.B:**
- Whether the endpoint_url is active is determined by its end_date value: an endpoint with an empty end_date is counted as active (non-ended).
- Currently the LPA list isn't exhaustive and may need to be added to (defined above) but "all LPAs" can be selected.
- The BFL dataset is not included in this query because of how large the BFL dataset is.
- The number of non-ended endpoints where `status != 200` can also be counted here but it pushes the datasette query limit.


In [10]:
def get_endpoint_aggregate(organisation):
    """Count ended and non-ended endpoint urls per organisation, collection and pipeline.

    An endpoint counts as non-ended when ``e.end_date`` is empty.  Only groups
    with more than one non-ended endpoint are returned, and the
    ``brownfield-land`` collection is excluded.

    Args:
        organisation: Organisation identifier to filter on; a falsy value
            selects every organisation.

    Returns:
        pandas.DataFrame: columns ``organisation``, ``collection``,
        ``pipeline``, ``non_ended_endpoints`` and ``ended_endpoints``.  The
        same frame is also stored in the module-level ``endpoint_aggregate``.
    """
    global endpoint_aggregate
    # A falsy organisation falls back to a match-everything LIKE pattern
    query = f" s.organisation = '{organisation}'" if organisation else " s.organisation LIKE '%'"
    sql = f"""
        SELECT
            s.organisation AS organisation,
            s.collection AS collection,
            sp.pipeline AS pipeline,
            COUNT ( 
                DISTINCT 
                    CASE 
                        WHEN e.end_date = "" THEN e.endpoint_url ELSE NULL END
                )
                AS non_ended_endpoints,
            COUNT ( 
                DISTINCT 
                    CASE 
                        WHEN e.end_date != "" THEN e.endpoint_url ELSE NULL END
                )
                AS ended_endpoints
        FROM
          log l
          INNER JOIN source s ON l.endpoint = s.endpoint
          INNER JOIN endpoint e ON l.endpoint = e.endpoint
          LEFT JOIN source_pipeline sp ON s.source = sp.source
        WHERE
           ({query})
           AND (NOT collection="brownfield-land")
        GROUP BY
            s.organisation,
            s.collection,
            sp.pipeline
        HAVING
            non_ended_endpoints > 1
        ORDER BY
            non_ended_endpoints DESC
        """
    params = urllib.parse.urlencode({"sql": sql, "_size": "max"})

    endpoint_aggregate = pd.read_csv(f"{datasette_url}digital-land.csv?{params}")
    return endpoint_aggregate

# Bind the shared organisation dropdown to get_endpoint_aggregate
widgets.interact(get_endpoint_aggregate, organisation=organisation_dropdown)
# NOTE(review): initial_organisation is not read anywhere in the visible notebook — confirm it is still needed
initial_organisation = organisation_dropdown.value

interactive(children=(Dropdown(description='Select LPA:', options={"All LPA's": None, 'Newcastle': 'local-auth…

In [11]:
# Ask whether the table held in endpoint_aggregate should be exported
download = input("Do you want to download the table? (yes/no): ")

# Only the answer "yes" (compared case-insensitively) triggers the export
if download.lower() == "yes":
    # Relative path: the CSV is written to the current working directory, without the index column
    endpoint_aggregate.to_csv("endpoint_aggregate.csv", index=False)
    print("Query result downloaded as 'endpoint_aggregate.csv'")

Do you want to download the table? (yes/no):  no


# Status and end_date of **all** Distinct Endpoints

This cell should retrieve **all** distinct **non BFL** endpoints, grouped by collection and pipeline **(not just the endpoints counted above)**. The endpoint entry date, end date, most recent log date and most recent status for each endpoint are retrieved to help deduce whether the endpoint is no longer functional and if it has been manually listed as an old endpoint.

**N.B:** 
- This is queried mainly for a merge later on but may be useful nonetheless.
- This query pushes the datasette timeout limit, and often gives an internal server (500) error, rerun this and following cells if this is the case.


In [12]:
def get_most_recent_logs():
    """Fetch the most recent log entry date, and a status, for every endpoint.

    Returns:
        pandas.DataFrame: columns ``endpoint_hash``, ``most_recent_entry_date``
        and ``most_recent_status``, one row per endpoint.  The same frame is
        also stored in the module-level ``most_recent_logs``.
    """
    global most_recent_logs
    # NOTE(review): l.status is selected next to MAX(l.entry_date) without being grouped;
    # this presumably relies on SQLite taking it from the row holding the maximum — confirm
    sql = """
        SELECT e.endpoint AS endpoint_hash, MAX(l.entry_date) AS most_recent_entry_date, l.status AS most_recent_status
        FROM endpoint e
        JOIN log l ON e.endpoint = l.endpoint
        GROUP BY e.endpoint
        """
    params = urllib.parse.urlencode({"sql": sql, "_size": "max"})

    most_recent_logs = pd.read_csv(f"{datasette_url}digital-land.csv?{params}")
    return most_recent_logs

def get_distinct_endpoints_status_and_end_dates(organisation):
    """List endpoints per collection and pipeline with their entry and end dates.

    The ``brownfield-land`` collection is excluded.  Rows are grouped by
    collection, pipeline and endpoint hash.

    Args:
        organisation: Organisation identifier to filter on; a falsy value
            selects every organisation.

    Returns:
        pandas.DataFrame: columns ``organisation``, ``collection``,
        ``pipeline``, ``endpoint`` (the url), ``endpoint_hash``, ``entry_date``
        and ``end_date``.  The same frame is also stored in the module-level
        ``distinct_endpoints_status_and_end_dates_df``.
    """
    global distinct_endpoints_status_and_end_dates_df
    # A falsy organisation falls back to a match-everything LIKE pattern
    query = f" s.organisation = '{organisation}'" if organisation else " s.organisation LIKE '%'"
    # NOTE(review): end_date is read from the source table (s.end_date) here, whereas
    # get_endpoint_aggregate tests the endpoint table (e.end_date) — confirm which is intended
    sql = f"""
        SELECT
            s.organisation AS organisation,
            s.collection AS collection,
            sp.pipeline AS pipeline,
            e.endpoint_url AS endpoint,
            e.endpoint AS endpoint_hash,
            e.entry_date,
            s.end_date AS end_date
        FROM
          log l
          INNER JOIN source s ON l.endpoint = s.endpoint
          INNER JOIN endpoint e ON l.endpoint = e.endpoint
          LEFT JOIN source_pipeline sp ON s.source = sp.source
        WHERE
           ({query})
           AND (NOT collection="brownfield-land")
        GROUP BY
            s.collection,
            sp.pipeline,
            e.endpoint
        ORDER BY
            s.collection,
            sp.pipeline
        """
    params = urllib.parse.urlencode({"sql": sql, "_size": "max"})

    distinct_endpoints_status_and_end_dates_df = pd.read_csv(f"{datasette_url}digital-land.csv?{params}")
    return distinct_endpoints_status_and_end_dates_df

def combine_dataframes(organisation):
    """Join the distinct-endpoint listing with each endpoint's most recent log row.

    Args:
        organisation: Organisation identifier, or a falsy value for all.

    Returns:
        pandas.DataFrame: inner join of
        ``get_distinct_endpoints_status_and_end_dates(organisation)`` and
        ``get_most_recent_logs()`` on ``endpoint_hash``.  The same frame is also
        stored in the module-level
        ``distinct_endpoints_status_and_end_dates_with_status_df``.
    """
    global distinct_endpoints_status_and_end_dates_with_status_df
    endpoints = get_distinct_endpoints_status_and_end_dates(organisation)
    latest_logs = get_most_recent_logs()
    # Default merge is an inner join, so endpoints without a log row are dropped
    distinct_endpoints_status_and_end_dates_with_status_df = endpoints.merge(latest_logs, on='endpoint_hash')
    return distinct_endpoints_status_and_end_dates_with_status_df

# Bind the shared organisation dropdown to combine_dataframes
widgets.interact(combine_dataframes, organisation=organisation_dropdown)
# NOTE(review): initial_organisation is not read anywhere in the visible notebook — confirm it is still needed
initial_organisation = organisation_dropdown.value

interactive(children=(Dropdown(description='Select LPA:', options={"All LPA's": None, 'Newcastle': 'local-auth…

In [13]:
# Ask whether the table built by combine_dataframes() should be exported
download = input("Do you want to download the table? (yes/no): ")

# Only the answer "yes" (compared case-insensitively) triggers the export
if download.lower() == "yes":
    # Relative path: the CSV is written to the current working directory, without the index column
    distinct_endpoints_status_and_end_dates_with_status_df.to_csv("distinct_endpoints_status_and_end_dates_with_status_df.csv", index=False)
    print("Query result downloaded as 'distinct_endpoints_status_and_end_dates_with_status_df.csv'")

Do you want to download the table? (yes/no):  no


# Investigating Datasets with Multiple Endpoint Urls
The `endpoint_aggregate` dataframe counts the number of active endpoints for each organisation dataset, below this is merged with the `distinct_endpoints_status_and_end_dates_with_status_df` to list all suspect endpoints in more detail, rather than their aggregates.

**N.B:** To function, the following cell requires the cells in the "Counting the Number of Distinct Non-ended Endpoints" and "Status and end_date of **all** Distinct Endpoints" sections to have been run successfully.


In [14]:
# Attach the per-group counts to every endpoint row (inner join on the three grouping keys)
df = distinct_endpoints_status_and_end_dates_with_status_df.merge(
    endpoint_aggregate,
    on=['organisation', 'collection', 'pipeline'],
)

# Discard rows without a count, then the count columns themselves
df = df.dropna(subset=["non_ended_endpoints"]).drop(columns=["non_ended_endpoints", "ended_endpoints"])

# Keep only rows with no end_date, drop that column, then sort and renumber the rows
df = (
    df[df["end_date"].isnull()]
    .drop(columns=["end_date"])
    .sort_values(by=["organisation", "collection", "pipeline"])
    .reset_index(drop=True)
)

possible_duplicate_endpoints = df

# Consider when these endpoints were last successfully accessed
Below we add the field describing the timestamp the endpoint was last successfully accessed (gave a 200-like response) to the `possible_duplicate_endpoints` dataframe generated above.


In [15]:
def get_when_successfully_accessed():
    """Fetch, per endpoint url, the latest log entry date whose status starts with "2".

    The ``brownfield-land`` collection is excluded.

    Returns:
        pandas.DataFrame: columns ``last_200_LIKE_response_timestamp`` and
        ``endpoint_hash``.  The same frame is also stored in the module-level
        ``last_successfully_accessed``.
    """
    global last_successfully_accessed
    sql = """
        SELECT
            MAX(l.entry_date) AS last_200_LIKE_response_timestamp,
            e.endpoint AS endpoint_hash
        FROM
          log l
          INNER JOIN endpoint e ON l.endpoint = e.endpoint
          INNER JOIN source s ON l.endpoint = s.endpoint
        WHERE
            (NOT collection="brownfield-land")
            AND l.status LIKE "2%"
        GROUP BY
            e.endpoint_url
        """
    params = urllib.parse.urlencode({"sql": sql, "_size": "max"})

    last_successfully_accessed = pd.read_csv(f"{datasette_url}digital-land.csv?{params}")
    return last_successfully_accessed



# Inner join on the endpoint hash: endpoints with no 2xx log entry are not carried over
possible_duplicate_endpoints_last_200 = possible_duplicate_endpoints.merge(
    get_when_successfully_accessed(),
    on='endpoint_hash',
)

possible_duplicate_endpoints_last_200

Unnamed: 0,organisation,collection,pipeline,endpoint,endpoint_hash,entry_date,most_recent_entry_date,most_recent_status,last_200_LIKE_response_timestamp
0,government-organisation:D303,local-authority-district,local-authority-district,https://opendata.arcgis.com/datasets/fba7a58e8...,1a1fb45966731d26440c51d63c66260dea3bce90d710e7...,2021-12-11T20:20:29Z,2024-01-31T00:02:42Z,404.0,2022-12-05T00:01:39Z
1,government-organisation:D303,local-authority-district,local-authority-district,https://services1.arcgis.com/ESMARspQHYMw9BZ9/...,2ef35572a081e84417104abe180737f5e74d7481b36504...,2023-08-02T10:10:35Z,2024-01-31T00:02:42Z,200.0,2024-01-31T00:02:42Z
2,government-organisation:D303,local-authority-district,local-authority-district,https://opendata.arcgis.com/datasets/4b9e1318d...,3239201775cab1f0d0d240cfd5cfe90b5ffd81d51ecce0...,2021-12-11T20:20:19Z,2024-01-31T00:02:42Z,404.0,2022-12-03T00:01:45Z
3,government-organisation:D303,national-park,national-park,https://opendata.arcgis.com/datasets/2dcdc561b...,48b0d08d547af3d959084610b07ac4c74ccf5306267827...,2020-11-30T00:00:00Z,2024-01-31T00:22:26Z,404.0,2022-12-05T00:23:41Z
4,government-organisation:D303,national-park,national-park,https://opendata.arcgis.com/datasets/6b6603ff4...,6b5a5ff541c241f71ac0c4187d0766c84fac32c2341ac1...,2020-11-30T00:00:00Z,2024-01-31T00:22:26Z,404.0,2022-12-02T00:23:38Z
...,...,...,...,...,...,...,...,...,...
106,local-authority-eng:WDE,developer-contributions,developer-agreement-contribution,https://westdevon.gov.uk/developer-agreement-c...,697dbfe28573767f45aeb099d7babe91561e95353609c8...,2020-12-18T00:00:00Z,2024-01-31T00:21:05Z,200.0,2024-01-31T00:21:05Z
107,national-park-authority:Q72617158,developer-contributions,developer-agreement,https://www.newforestnpa.gov.uk/app/uploads/20...,5d011d068dc5da03d3fca7e3387797480d5cf1250b8ca4...,2021-12-31T11:11:11Z,2024-01-31T00:21:05Z,200.0,2024-01-31T00:21:05Z
108,national-park-authority:Q72617158,developer-contributions,developer-agreement,https://www.newforestnpa.gov.uk/app/uploads/20...,705d90de8c2841af9a725e0e4c0d39166d169e45497d89...,2020-12-18T00:00:00Z,2024-01-31T00:21:05Z,404.0,2021-12-07T01:05:06Z
109,national-park-authority:Q72617158,developer-contributions,developer-agreement-transaction,https://www.newforestnpa.gov.uk/app/uploads/20...,34ef1379df513fb0b55919653336be6a534f9073482e06...,2021-12-31T11:11:15Z,2024-01-31T00:21:05Z,200.0,2024-01-31T00:21:05Z


In [16]:
# Ask whether the table held in possible_duplicate_endpoints_last_200 should be exported
download = input("Do you want to download the table? (yes/no): ")

# Only the answer "yes" (compared case-insensitively) triggers the export
if download.lower() == "yes":
    # Relative path: the CSV is written to the current working directory, without the index column
    possible_duplicate_endpoints_last_200.to_csv("possible_duplicate_endpoints_last_200.csv", index=False)
    # Confirm the export, as the other download cells in this notebook do
    print("Query result downloaded as 'possible_duplicate_endpoints_last_200.csv'")

Do you want to download the table? (yes/no):  no


## - Stale Endpoints
Endpoints which were last successfully accessed over **5 days** ago can be assumed faulty, or effectively ended and can be recommended for removal by the standard process https://docs.google.com/document/d/1Xm1frOBY-J4mLfigXuFdeq976cQGghhnt0gbmZleAyc/edit#heading=h.y6u78drjip12.


In [17]:
# Grab datetime of 5 days ago relative to current date
five_days_ago_timestamp = pd.to_datetime('today').normalize().tz_localize("Europe/London", ambiguous=True) - pd.Timedelta(days=5)

# Work on a copy so the conversions below do not alter possible_duplicate_endpoints_last_200,
# which an earlier cell displays and exports
df = possible_duplicate_endpoints_last_200.copy()

# Convert last_200_response_timestamp field to datetime
df["last_200_LIKE_response_timestamp"] = pd.to_datetime(df["last_200_LIKE_response_timestamp"])

# Compare statuses as numbers. A substring test for "2" would also treat codes such as
# 502, 429 or 302 as successful and silently remove them from the stale list.
df["most_recent_status"] = df["most_recent_status"].astype(float)

# Keep everything whose most recent status is not 2xx; a missing status (NaN) is kept as well
df = df[~df["most_recent_status"].between(200, 299)]

# Filter for entries which only returned 200 over 5 days ago
df = df[df['last_200_LIKE_response_timestamp'] < five_days_ago_timestamp].reset_index(drop=True)

df

Unnamed: 0,organisation,collection,pipeline,endpoint,endpoint_hash,entry_date,most_recent_entry_date,most_recent_status,last_200_LIKE_response_timestamp
0,government-organisation:D303,local-authority-district,local-authority-district,https://opendata.arcgis.com/datasets/fba7a58e8...,1a1fb45966731d26440c51d63c66260dea3bce90d710e7...,2021-12-11T20:20:29Z,2024-01-31T00:02:42Z,404.0,2022-12-05 00:01:39+00:00
1,government-organisation:D303,local-authority-district,local-authority-district,https://opendata.arcgis.com/datasets/4b9e1318d...,3239201775cab1f0d0d240cfd5cfe90b5ffd81d51ecce0...,2021-12-11T20:20:19Z,2024-01-31T00:02:42Z,404.0,2022-12-03 00:01:45+00:00
2,government-organisation:D303,national-park,national-park,https://opendata.arcgis.com/datasets/2dcdc561b...,48b0d08d547af3d959084610b07ac4c74ccf5306267827...,2020-11-30T00:00:00Z,2024-01-31T00:22:26Z,404.0,2022-12-05 00:23:41+00:00
3,government-organisation:D303,national-park,national-park,https://opendata.arcgis.com/datasets/6b6603ff4...,6b5a5ff541c241f71ac0c4187d0766c84fac32c2341ac1...,2020-11-30T00:00:00Z,2024-01-31T00:22:26Z,404.0,2022-12-02 00:23:38+00:00
4,local-authority-eng:CAT,article-4-direction,article-4-direction-area,https://mapping.canterbury.gov.uk/arcgis/rest/...,351fdbd179616dcf25ce0c4498cbd7fd5a917c5bbedcbc...,2021-11-11T14:14:25Z,2024-01-31T00:19:48Z,,2024-01-16 00:16:16+00:00
5,local-authority-eng:COV,developer-contributions,developer-agreement-transaction,https://www.coventry.gov.uk/download/downloads...,066ee597b0526ddbe1d9bee406eab6ccf316055662be33...,2021-12-31T11:11:14Z,2024-01-31T00:21:05Z,500.0,2022-07-13 00:26:08+00:00
6,local-authority-eng:COV,developer-contributions,developer-agreement-transaction,https://www.coventry.gov.uk/download/downloads...,1ae6b8cc4fab55aa8e23b51160c3e51b350eb8d44a04d6...,2021-12-31T11:11:14Z,2024-01-31T00:21:05Z,500.0,2023-08-24 00:18:10+00:00
7,local-authority-eng:COV,developer-contributions,developer-agreement-transaction,https://www.coventry.gov.uk/download/downloads...,323046bda27fe8856575a8cc3aa6895684dd6e7a5c0f34...,2021-12-31T11:11:14Z,2024-01-31T00:21:05Z,500.0,2023-02-09 00:21:52+00:00
8,local-authority-eng:DOV,developer-contributions,developer-agreement,https://www.dover.gov.uk/Planning/Planning-Pol...,739d4ab8af624694d0a2960571e2fadad37c9d2d353138...,2021-12-31T11:11:11Z,2024-01-31T00:21:05Z,404.0,2023-01-18 00:21:34+00:00
9,local-authority-eng:DOV,developer-contributions,developer-agreement-contribution,https://www.dover.gov.uk/Planning/Planning-Pol...,65c04fb830320a32ceb91061f02d85be6fdb0f55fcae67...,2021-10-07T00:00:00Z,2024-01-31T00:21:05Z,404.0,2023-01-18 00:21:34+00:00


# Grouping Endpoints then Converting to Lists
The `possible_duplicate_endpoints` dataframe above is grouped by organisation, collection and pipeline, all unique endpoint urls are then placed into a list so that they can be easily looped through. This is so that the csv contents of each endpoint can be compared in future, and then decide on which endpoints to keep.

**N.B:** This currently includes the endpoints previously highlighted as stale.


In [18]:
# Drop the status column, then collect the endpoint urls of each
# (organisation, collection, pipeline) group into a list, one row per group
df = (
    possible_duplicate_endpoints
    .drop(columns=["most_recent_status"])
    .groupby(["organisation", "collection", "pipeline"])["endpoint"]
    .apply(list)
    .reset_index()
)
possible_duplicate_endpoints_aggregate = df
possible_duplicate_endpoints_aggregate

# The url lists can be looped through programmatically to compare the contents of the
# remaining endpoints and decide which ones to eliminate

Unnamed: 0,organisation,collection,pipeline,endpoint
0,government-organisation:D303,local-authority-district,local-authority-district,[https://opendata.arcgis.com/datasets/fba7a58e...
1,government-organisation:D303,national-park,national-park,[https://opendata.arcgis.com/datasets/2dcdc561...
2,government-organisation:D4,green-belt,green-belt,[http://maps.communities.gov.uk/geoserver/dclg...
3,government-organisation:D69,title-boundary,title-boundary,[https://use-land-property-data.service.gov.uk...
4,government-organisation:EA39,infrastructure-project,infrastructure-project,[https://raw.githubusercontent.com/digital-lan...
...,...,...,...,...
63,local-authority-eng:TON,developer-contributions,developer-agreement-transaction,[https://docs.tmbc.gov.uk/docs/S106/developer-...
64,local-authority-eng:WDE,developer-contributions,developer-agreement,[https://www.westdevon.gov.uk/developer-agreem...
65,local-authority-eng:WDE,developer-contributions,developer-agreement-contribution,[https://westdevon.gov.uk/developer-agreement-...
66,national-park-authority:Q72617158,developer-contributions,developer-agreement,[https://www.newforestnpa.gov.uk/app/uploads/2...


In [19]:
# Ask whether the table held in possible_duplicate_endpoints_aggregate should be exported
download = input("Do you want to download the possible_duplicate_endpoints_aggregate table? (yes/no): ")

# Only the answer "yes" (compared case-insensitively) triggers the export
if download.lower() == "yes":
    # Relative path: the CSV is written to the current working directory, without the index column
    possible_duplicate_endpoints_aggregate.to_csv("possible_duplicate_endpoints_aggregate.csv", index=False)
    # Confirm the export, as the other download cells in this notebook do
    print("Query result downloaded as 'possible_duplicate_endpoints_aggregate.csv'")

Do you want to download the possible_duplicate_endpoints_aggregate table? (yes/no):  no
