# Analysis: green-belt data changes
**Author**:  Greg Slater <br>
**Date**:  15 May 2024 <br>
**Data Scope**: green-belt<br>
**Report Type**: One off analysis <br>

## Purpose
This notebook was used to list all historic green-belt endpoints and resources, and to download the historic cached resources, in order to compare which resources have been end-dated (410'd) with which resources actually contain valid data.

A number of historic green-belt resources were retired (see https://github.com/digital-land/config/tree/main/collection/green-belt). However, the resource history also shows a change where historic endpoints appear to have stopped working and instead returned a WFS service error message. Those responses still have a 200 status even though the data is invalid, so the resources are downloaded here to check which ones are valid and which can be un-end-dated.

In [2]:
import os
import urllib
import urllib.error
import urllib.parse
import urllib.request

import geopandas as gpd
import numpy as np
import pandas as pd

In [33]:
def get_greenbelt_resources():
    """Fetch the green-belt endpoint / resource history from datasette.

    Sends a SQL query over `reporting_historic_endpoints` (filtered to the
    "green-belt" pipeline) to the `digital-land` database and reads the CSV
    response into a DataFrame.

    Returns:
        pandas.DataFrame with one row per endpoint / endpoint_url / status /
        resource combination, plus the resource count, the latest log date
        (formatted YYYY-MM-DD) and the latest resource end date.
    """
    base_url = "https://datasette.planning.data.gov.uk/"

    # Grouped by the first four selected columns; ordered so the most recent
    # log entry for each endpoint comes first.
    query = """
    select 
        endpoint, endpoint_url, status, resource, count(distinct resource) as count_res, STRFTIME("%Y-%m-%d", max(latest_log_entry_date)) as max_log, max(resource_end_date) as max_res_end
        from reporting_historic_endpoints
        where pipeline = "green-belt"
        group by 1, 2, 3, 4
        order by endpoint, max(latest_log_entry_date) desc
    """

    # "_size": "max" asks datasette for its maximum page size.
    query_string = urllib.parse.urlencode({"sql": query, "_size": "max"})

    return pd.read_csv(f"{base_url}digital-land.csv?{query_string}")


def get_resource_file(collection_name, resource_id):
    """Read a cached collection resource from files.planning.data.gov.uk.

    Args:
        collection_name: collection prefix used in the URL, e.g. "green-belt"
            (the "-collection" suffix is added here).
        resource_id: resource hash identifying the file within the collection.

    Returns:
        Whatever `gpd.read_file` returns for the resource URL (a GeoDataFrame),
        or None if the resource could not be read.
    """
    url = f"https://files.planning.data.gov.uk/{collection_name}-collection/collection/resource/{resource_id}"

    try:
        return gpd.read_file(url)
    except Exception as err:
        # Deliberately best-effort: a missing or unparseable resource is reported
        # and skipped. Catching Exception (not a bare except) lets
        # KeyboardInterrupt / SystemExit still stop the notebook.
        print(f"Error reading from url: {url}")
        print(f"  {type(err).__name__}: {err}")
        return None


In [4]:
# Smoke-test the downloader on a single known green-belt resource hash.
test = get_resource_file(
    collection_name="green-belt",
    resource_id="54ff74f45e458c4e7d139689de57a8b7e7d768d9c1ccb11a0026903093842341",
)

print(len(test))
test.head()

185


Unnamed: 0,gml_id,LAD_NM,LAD_CD,GB_Name,Area_ha,geometry
0,England_Green_Belt_2022_23_WGS84,Amber Valley,E07000032,Derby and Nottingham,8647.904471,"MULTIPOLYGON (((-1.34303 53.07100, -1.34304 53..."
1,England_Green_Belt_2022_23_WGS84,Ashfield,E07000170,Derby and Nottingham,4521.598181,"MULTIPOLYGON (((-1.18089 53.11499, -1.18089 53..."
2,England_Green_Belt_2022_23_WGS84,Barking and Dagenham,E09000002,London,527.776471,"MULTIPOLYGON (((0.15314 51.56697, 0.15325 51.5..."
3,England_Green_Belt_2022_23_WGS84,Barnet,E09000003,London,2383.047837,"MULTIPOLYGON (((-0.20927 51.63758, -0.20937 51..."
4,England_Green_Belt_2022_23_WGS84,Barnsley,E08000016,South and West Yorkshire,22393.050084,"MULTIPOLYGON (((-1.27572 53.52946, -1.27574 53..."


In [5]:
# global variables
# Local folder for downloaded resources. Keep the trailing slash: file paths
# are built from this value by plain string formatting.
data_dir = "../data/geo_analysis/green_belt/"

# Create the folder (and any missing parents); no error if it already exists.
os.makedirs(name=data_dir, exist_ok=True)

## Analysis

In [31]:
# get list of endpoints and historic resources from datasette 
gb_df = get_greenbelt_resources()

# get year from URL part: take the 4th "&"-separated piece of the endpoint URL and
# keep the 7 characters that end 6 characters before its end (e.g. "2022_23", "2020-21").
# NOTE(review): assumes every endpoint URL carries the year at that position - confirm
# if new endpoints are added. The .str accessors yield NaN (rather than raising) for a
# missing URL or one with fewer than four parts.
gb_df["year"] = gb_df["endpoint_url"].str.split("&").str[3].str[-13:-6]

print(len(gb_df))

# newest year first, then newest log entry; reassigned rather than sorted in place
gb_df = gb_df.sort_values(["year", "max_log"], ascending=False)
gb_df.head(50)


46


Unnamed: 0,endpoint,endpoint_url,status,resource,count_res,max_log,max_res_end,year
0,0a348ac5a298612f216088f285ab4fd049752bd663f058...,https://maps.communities.gov.uk/geoserver/dclg...,200.0,54ff74f45e458c4e7d139689de57a8b7e7d768d9c1ccb1...,1,2024-05-16,,2022_23
45,e00db39f56de236656d4c5c9bb81923dbd0bf31a216ec2...,https://maps.communities.gov.uk/geoserver/dclg...,200.0,232a469fec0b0af41a628e1cc5203749af3c165156cb8e...,1,2024-05-16,,2021_22
30,b04c507f53aa0e0624b5f529c3335413f1ea384674533f...,http://maps.communities.gov.uk/geoserver/dclg_...,200.0,f3ec886559271bd6fe0e6650dee5e8928593146ea78102...,1,2024-05-16,,2020-21
31,b04c507f53aa0e0624b5f529c3335413f1ea384674533f...,http://maps.communities.gov.uk/geoserver/dclg_...,520.0,,1,2024-02-02,,2020-21
32,b04c507f53aa0e0624b5f529c3335413f1ea384674533f...,http://maps.communities.gov.uk/geoserver/dclg_...,200.0,2858fa51c91289f4ad90ee2e600aa7a586cac68884affc...,1,2023-11-27,2023-11-27,2020-21
33,b04c507f53aa0e0624b5f529c3335413f1ea384674533f...,http://maps.communities.gov.uk/geoserver/dclg_...,524.0,,1,2023-11-03,,2020-21
34,b04c507f53aa0e0624b5f529c3335413f1ea384674533f...,http://maps.communities.gov.uk/geoserver/dclg_...,200.0,f413faf9e8f7e38a38d94003b5248f827cd3dd74218de3...,1,2022-10-06,2022-10-06,2020-21
6,21f6ac0160488dfdab4fae54bea1940ec2827e9c80d3b8...,http://maps.communities.gov.uk/geoserver/dclg_...,200.0,f70c012718b6a8dbd9ac6e92dd54e3a49378e21362bf44...,1,2024-05-16,,2019-20
7,21f6ac0160488dfdab4fae54bea1940ec2827e9c80d3b8...,http://maps.communities.gov.uk/geoserver/dclg_...,520.0,,1,2024-02-02,,2019-20
8,21f6ac0160488dfdab4fae54bea1940ec2827e9c80d3b8...,http://maps.communities.gov.uk/geoserver/dclg_...,200.0,ebf46d50cde4bdd1fec04890e12eebc7158983ab24a7bf...,1,2023-11-27,2023-11-27,2019-20


In [7]:
# get resources which have been 410'd
# (read from the old-resource.csv config file of the green-belt collection)
old_resource_csv_url = "https://raw.githubusercontent.com/digital-land/config/main/collection/green-belt/old-resource.csv"
old_res_df = pd.read_csv(old_resource_csv_url)

old_res_df.head()

Unnamed: 0,old-resource,status,resource,notes
0,0e2cc42c6fcd06aebb3717e9a13cefbaa5a3c6590ba044...,410,,paused processing
1,111622fd0f0d9dd506764bdc361b45d2f7fb1c7a7297e2...,410,,paused processing
2,26dfbfa718b56dbf76a8d2182859371073a63332600a88...,410,,paused processing
3,5d462aa86f638be20e7fe0d531434f3bf10761dee2dbcf...,410,,paused processing
4,6f74ea15b5cd53bc752d23a6d5a1f25388bd330c9c7617...,410,,paused processing


In [32]:
# join 410 status on to the endpoint-resource list
# Both frames have a "status" column, so the suffixes separate the endpoint's
# HTTP status ("status_endpoint") from the old-resource status ("status_resource").
retired_status = old_res_df[["old-resource", "status"]]
gb_res_df = pd.merge(
    gb_df,
    retired_status,
    how="left",
    left_on="resource",
    right_on="old-resource",
    suffixes=("_endpoint", "_resource"),
)

# save a list of 200 endpoints and their resources - will use to check off what should be un-ended and what should be ended.
export_columns = [
    "year",
    "endpoint",
    "status_endpoint",
    "resource",
    "max_log",
    "max_res_end",
    "status_resource",
]
is_200_endpoint = gb_res_df["status_endpoint"] == 200
gb_res_df.loc[is_200_endpoint, export_columns].to_csv(
    "temp-green_belt-endpoint_resource2.csv", index=False
)

In [12]:
# save historic resources to check if valid or not - will manually check through and flag using csv list saved above.

res_unique = gb_df["resource"].dropna().drop_duplicates()
url_base = "https://files.planning.data.gov.uk/green-belt-collection/collection/resource/"

# resources that could not be downloaded, as (resource, error) pairs
failed_downloads = []

# save each resource in data directory
for r in res_unique:

    url = url_base + r
    f_path = f"{data_dir}{r}.xml"

    # skip files that are already present so the cell can be re-run cheaply
    if os.path.exists(f_path):
        print(f"Already downloaded: {f_path}")
        print("-" * 50)
        continue

    print(f"Retrieving: {f_path}")
    tmp_path = f"{f_path}.part"
    try:
        # download under a temporary name first, so an interrupted transfer
        # never leaves a truncated file that looks like a finished download
        urllib.request.urlretrieve(url, tmp_path)
        os.replace(tmp_path, f_path)
    except OSError as err:
        # URLError / HTTPError / ContentTooShortError are all OSError subclasses;
        # record the failure and carry on with the remaining resources
        failed_downloads.append((r, err))
        if os.path.exists(tmp_path):
            os.remove(tmp_path)
        print(f"Failed: {err}")
    else:
        print("Done")
    print("-" * 50)

if failed_downloads:
    print(f"{len(failed_downloads)} resource(s) could not be downloaded")

Retrieving: ../data/geo_analysis/green_belt/ce01675696c982c4ce258c0be5d2ecfbf1322331dc5d55fe42c076e767698cfa.xml
Done
--------------------------------------------------
Retrieving: ../data/geo_analysis/green_belt/78d688e8977bcfeb00ce80679b2443b836b1ae275d87c6c0b8254a54f941efb1.xml
Done
--------------------------------------------------
Retrieving: ../data/geo_analysis/green_belt/111622fd0f0d9dd506764bdc361b45d2f7fb1c7a7297e2310109d662020a26b7.xml
Done
--------------------------------------------------
Retrieving: ../data/geo_analysis/green_belt/f22b92f7b52bf24c17d79ac6dfaa303d1bdd6395aebb99190e7d8e8e332d9036.xml
Done
--------------------------------------------------
Retrieving: ../data/geo_analysis/green_belt/bcc3680ca22f1e011cf1e1484d62f86f9ddf5d6ebfbd15d2c6a770f56b454440.xml
Done
--------------------------------------------------
Retrieving: ../data/geo_analysis/green_belt/d7f1e9256d7001793b0f7004fa741e205f988ad24a1d60cc28970f8334a355a5.xml
Done
-------------------------------------