In [1]:
from osdatahub import FeaturesAPI, Extent
import geojson
import pandas as pd
import geopandas as gpd
import shapely.wkt
import shapely.geometry

from key import ngd_key

pd.set_option("display.max_rows", 100)

In [111]:
# from pyproj import Transformer
# from pyproj.transformer import TransformerGroup

# TransformerGroup("epsg:27700", "epsg:4326").best_available

True

In [5]:
def nrow(df):
    return print(f"No. of records in df: {len(df):,}")

## Data import

In [40]:
# LPA boundary data from planning.data.gov

LAD_boundary_df = pd.read_csv("https://files.planning.data.gov.uk/dataset/local-authority-district.csv", 
                                  usecols = ["reference", "name", "geometry"])

# LAD_boundary_df.columns = ["geometry", "name", "LPACD"]


# load geometry and create GDF
LAD_boundary_df['geometry'] = LAD_boundary_df['geometry'].apply(shapely.wkt.loads)
LAD_boundary_gdf = gpd.GeoDataFrame(LAD_boundary_df, geometry='geometry')

# Transform to ESPG:27700 for more interpretable area units
LAD_boundary_gdf.set_crs(epsg=4326, inplace=True)
LAD_boundary_gdf.to_crs(epsg=27700, inplace=True)

nrow(LAD_boundary_gdf)
LAD_boundary_gdf.head()


No. of records in df: 344


Unnamed: 0,geometry,name,reference
0,"MULTIPOLYGON (((450154.627 525938.188, 450164....",Hartlepool,E06000001
1,"MULTIPOLYGON (((446854.689 517192.726, 446858....",Middlesbrough,E06000002
2,"MULTIPOLYGON (((451747.383 520561.094, 451698....",Redcar and Cleveland,E06000003
3,"MULTIPOLYGON (((447177.708 517811.773, 447198....",Stockton-on-Tees,E06000004
4,"MULTIPOLYGON (((423496.594 524724.326, 423475....",Darlington,E06000005


In [6]:
# get camden listed building data direct from endpoint
cmd_df = pd.read_csv("https://opendata.camden.gov.uk/api/views/uu3n-zgbj/rows.csv?accessType=DOWNLOAD")

# load geometry and create GDF
cmd_df['geometry'] = cmd_df['geometry'].apply(shapely.wkt.loads)
cmd_gdf = gpd.GeoDataFrame(cmd_df, geometry='geometry')

# Transform to ESPG:27700 for more interpretable area units
cmd_gdf.set_crs(epsg=4326, inplace=True)
cmd_gdf.to_crs(epsg=27700, inplace=True)

nrow(cmd_gdf)
cmd_gdf.head()

No. of records in df: 1,961


Unnamed: 0,reference,name,listed-building,listed-building-grade,notes,start-date,end-date,entry-date,geometry
0,LB1859,"(East, off) Court Building, St Pancras Coroner...",10271,II,,2003-09-05,,,"POLYGON ((529794.083 183544.904, 529791.773 18..."
1,LB1481,(West side) Cattle Trough at junction with Her...,477772,II,,1998-07-01,,,"POLYGON ((525383.827 186281.856, 525383.774 18..."
2,LB1872,"HAMPSTEAD, ADELAIDE ROAD Swiss Cottage Regency...",492770,II,,2006-09-18,,,"POLYGON ((526706.983 184142.479, 526706.278 18..."
3,LB1531,"Nos. 64-67, Nos. 2-8",1061382,II,,2002-07-19,,,"POLYGON ((529534.796 181777.495, 529525.359 18..."
4,LB1532,NORTH CRESCENT War Memorial,1061383,II,,2002-07-19,,,"POLYGON ((529598.623 181849.781, 529599.345 18..."


## API test

In [None]:
# get bounds of Camden LAD to use in API call
LAD_boundary_gdf[LAD_boundary_gdf["reference"] == "E09000007"]["geometry"].bounds

Unnamed: 0,minx,miny,maxx,maxy
290,523951.678259,180964.539216,531554.746716,187603.681394


In [None]:
collection = "bld-fts-buildingpart-1"
ngd = NGD(ngd_key, collection)


# mask = (529000,181000,530000,182000)  # small area around bloomsbury to test with
mask = (529000,181000,529010,181010)  # even smaller
# mask = (444369,121216,444486,121244)

extent = Extent.from_bbox(
    mask,
    "EPSG:27700")

results = ngd.query(max_results=10, extent = extent, crs = 27700, offset = 0)
test_gdf = gpd.GeoDataFrame.from_features(results["features"])
test_gdf.set_crs(epsg=27700, inplace=True)

nrow(test_gdf)

No. of records in df: 5


In [None]:
test_gdf.explore()

## Full API query

In [None]:
# API details
collection = "bld-fts-buildingpart-1"
ngd = NGD(ngd_key, collection)

mask = (529000,181000,530000,182000)

extent = Extent.from_bbox(
    mask, 
    "EPSG:27700")

fields = ["geometry", "osid", "versiondate"]

# controls
limit = 5000
interval = 100
fail_limit = 5

# data storage & counter
api_results = []
fail_counter = 0

for count, offset in enumerate(range(0, limit, interval)):

    if fail_counter > fail_limit:
        break
    
    print(f"attempt number {count}")
    try:
        results = ngd.query(extent = extent, crs = 27700, max_results = interval, offset = offset)
        print("query success")

    except:
        print("query fail")
        fail_counter += 1

    
    results_gdf = gpd.GeoDataFrame.from_features(results["features"])
    results_gdf.set_crs(epsg=27700, inplace=True)

    api_results.append(results_gdf[fields])
    # api_results.append(results_gdf)

    # if there isn't a next page in the response then break
    if not any(d["rel"] == "next" for d in results["links"]):
        break
        

os_gdf = pd.concat(api_results)
nrow(os_gdf)

attempt number 0
query success
attempt number 1
query success
attempt number 2
query success
attempt number 3
query success
attempt number 4
query success
No. of records in df: 500


In [None]:
# save or read in API results
# os_gdf.to_file("cmd_bloomsbury_clip_os_bld-fts-building-1.gpkg")
os_building = gpd.read_file("cmd_bloomsbury_clip_os_bld-fts-building-1.gpkg")

In [None]:
# create poly from bounding coordinates used in API and filter camden data to within
coords = ((529000,181000), (530000,181000), (530000,182000), (529000,182000), (529000,181000))
bbox_poly = shapely.geometry.Polygon(coords)

cmd_clip_gdf = cmd_gdf.loc[cmd_gdf.intersects(bbox_poly)].copy()

In [None]:
map_os = os_building.explore(
    color = "blue",
    # tooltip = False,
    tiles = "CartoDB positron"
)

cmd_clip_gdf.explore(
    m = map_os,
    color = "red",
    # tooltip = False,
    style_kwds = {
        "fillOpacity" : "0.3"
        }
)

# Analysis

In [10]:
# tidy names up to use in intersection
cmd_clip_gdf["area"] = cmd_clip_gdf["geometry"].area

cmd_gdf_join = cmd_clip_gdf[["reference", "listed-building", "name", "area", "geometry"]].copy()
cmd_gdf_join.columns = ["cmd_ref", "cmd_listed_building", "cmd_name", "cmd_area", "geometry"]

os_building["area"] = os_building["geometry"].area
os_gdf_join = os_building[["osid", "area", "geometry"]].copy()
os_gdf_join.columns = ["os_ref", "os_area", "geometry"]

In [116]:
# quick area comparison before doing full intersection
# this is the total footprint of listed buildings as a % of the total footprint of OS buildings in the same bounding box
# a good estimate for what the max % of copying OS data might be if all listed building geoms are copied directly

pct_worst_case = sum(cmd_gdf_join["cmd_area"]) / sum(os_gdf_join["os_area"])
print("Worst case estimate (total listed building footprint as % of total OS building footprint in area): {:%}".format(round(pct_worst_case,3)))

Worst case estimate (total listed building footprint as % of total OS building footprint in area): 16.300000%


### Intersection

In [13]:
# first overlay to see which OS geoms match to listed building ones
cmd_os_join_gdf = gpd.overlay(
    cmd_gdf_join, 
    os_gdf_join,
    how = "intersection", 
    keep_geom_type=False,
)

nrow(cmd_gdf_join)
nrow(os_gdf_join)
nrow(cmd_os_join_gdf)

cmd_os_join_gdf["int_area"] = cmd_os_join_gdf["geometry"].area

# calculate intersection areas
cmd_os_join_gdf["cmd_int_pct"] = cmd_os_join_gdf["int_area"] / cmd_os_join_gdf["cmd_area"]
cmd_os_join_gdf["os_int_pct"] = cmd_os_join_gdf["int_area"] / cmd_os_join_gdf["os_area"]

# add a count field for the number of cmd > OS matches
cmd_os_join_gdf["os_match_count"] = cmd_os_join_gdf.groupby("cmd_ref")["cmd_ref"].transform("count")

No. of records in df: 108
No. of records in df: 2,647
No. of records in df: 494


In [347]:
cmd_os_join_gdf.head()

Unnamed: 0,cmd_ref,cmd_listed_building,cmd_name,cmd_area,os_ref,os_area,geometry,int_area,cmd_int_pct,os_int_pct,os_match_count
0,LB1531,1061382,"Nos. 64-67, Nos. 2-8",626.057066,4771a712-7686-4441-8a01-2d6bd81ab4e2,145.97135,"POLYGON ((529545.238 181805.566, 529545.250 18...",0.151564,0.000242,0.001038,4
1,LB1531,1061382,"Nos. 64-67, Nos. 2-8",626.057066,51455257-306e-484f-ba40-57c1896f5382,212.94625,"POLYGON ((529559.639 181786.029, 529559.755 18...",212.920069,0.340097,0.999877,4
2,LB1531,1061382,"Nos. 64-67, Nos. 2-8",626.057066,8b969bdc-88bf-4998-aaf2-d12576db4444,474.604418,"POLYGON ((529530.610 181783.459, 529532.285 18...",0.107056,0.000171,0.000226,4
3,LB1531,1061382,"Nos. 64-67, Nos. 2-8",626.057066,d623fb58-3228-4cc4-bfba-eda7413ba1e4,376.602309,"POLYGON ((529528.932 181785.850, 529531.210 18...",376.598835,0.601541,0.999991,4
4,LB512,1113013,(East side) Nos.1-5 (Consecutive) Elms Lester ...,282.981721,04209200-e4da-41c7-8072-b4905c70157e,283.26125,"POLYGON ((529945.738 181214.149, 529939.100 18...",282.418092,0.998008,0.997023,1


### Direct matches

In [118]:
# CHECK FOR DIRECT MATCHES

threshold = 0.9

direct_matches = cmd_os_join_gdf[(cmd_os_join_gdf["cmd_int_pct"] >= threshold) & (cmd_os_join_gdf["os_int_pct"] > threshold)]
direct_match_pct = len(direct_matches) / len(os_gdf_join)

print(f"no. of direct matches between listed building outlines and OS buildings: {len(direct_matches)}")
print(f"which equates to {round(direct_match_pct, 4):%} of all OS building geometries in area")

map_os = os_gdf_join[os_gdf_join["os_ref"].isin(direct_matches["os_ref"])].explore(
    color = "blue",
    tooltip = False,
    tiles = "CartoDB positron"
)

cmd_clip_gdf[cmd_clip_gdf["reference"].isin(direct_matches["cmd_ref"])].explore(
    m = map_os,
    color = "red",
    # tooltip = False,
    style_kwds = {
        "fillOpacity" : "0"
        }
)

no. of direct matches between listed building outlines and OS buildings: 38
which equates to 1.440000% of all OS building geometries in area


### Extending match limits through dissolving

In [17]:
# dissolve matching OS geometries

# lookup for cmd and os ids where matches are 1:1 or 1:many (where the many OS matches are each > 20%)
cmd_os_lookup = cmd_os_join_gdf[
    (cmd_os_join_gdf["os_match_count"] == 1) | ((cmd_os_join_gdf["os_match_count"] > 1) & (cmd_os_join_gdf["os_int_pct"] > 0.2))
    ][["cmd_ref", "os_ref"]]

# inner join to lookup and then dissolve OS geoms grouped by the listed building ref
os_dissolved = os_gdf_join.merge(
    cmd_os_lookup,
    how = "inner",
    on = "os_ref"
).dissolve(
    by = "cmd_ref"
)

In [80]:
map_os = os_dissolved.explore(
    color = "blue",
    tooltip = False,
    tiles = "CartoDB positron"
)

cmd_clip_gdf.explore(
    m = map_os,
    color = "red",
    # tooltip = False,
    style_kwds = {
        "fillOpacity" : "0"
        }
)

In [26]:
# tidy dissolved table and re-calculate area for new dissolved geometries
os_dissolved.reset_index(inplace = True)
os_dissolved["os_area"] = os_dissolved["geometry"].area

os_dissolved.head()

Unnamed: 0,cmd_ref,geometry,os_ref,os_area
0,LB1101,"POLYGON ((529570.000 181527.150, 529566.550 18...",23a75435-1fdf-49d5-9dca-8c41c7a8a757,449.35
1,LB1102,"POLYGON ((529582.800 181549.400, 529584.651 18...",9a743e50-b868-4148-93a6-1f0fecb0276e,164.27515
2,LB1103,"POLYGON ((529588.500 181553.350, 529584.903 18...",45424cda-419f-47b6-85b0-700997db0f43,605.062425
3,LB1104,"POLYGON ((529634.950 181568.450, 529632.450 18...",592ab32b-5d39-4411-b98b-c3357045b7bf,374.18875
4,LB1105,"POLYGON ((529630.300 181595.750, 529636.150 18...",2ccae62d-75dd-4df3-a5b1-24911888eebb,151.83375


In [35]:
# intersect listed buildings with dissolved OS geometries
cmd_join_diss = gpd.overlay(
    cmd_gdf_join, 
    os_dissolved.reset_index(),
    how = "intersection", 
    keep_geom_type=False,
)

# filter to just those which have matching references (i.e. the dissolved OS geometry is based on the listed building geom)

cmd_join_diss = cmd_join_diss[cmd_join_diss["cmd_ref_1"] == cmd_join_diss["cmd_ref_2"]]
cmd_join_diss["int_area"] = cmd_join_diss["geometry"].area

nrow(cmd_gdf_join)
nrow(cmd_join_diss)
cmd_join_diss.head()

No. of records in df: 108
No. of records in df: 100


Unnamed: 0,cmd_ref_1,cmd_listed_building,cmd_name,cmd_area,index,cmd_ref_2,os_ref,os_area,geometry,int_area
0,LB1531,1061382,"Nos. 64-67, Nos. 2-8",626.057066,40,LB1531,51455257-306e-484f-ba40-57c1896f5382,589.548559,"POLYGON ((529528.932 181785.850, 529531.210 18...",589.518904
1,LB512,1113013,(East side) Nos.1-5 (Consecutive) Elms Lester ...,282.981721,74,LB512,04209200-e4da-41c7-8072-b4905c70157e,283.26125,"POLYGON ((529945.738 181214.149, 529939.100 18...",282.418092
3,LB513,1113014,(West side) No.6,145.182838,75,LB513,317f23cf-8379-41b8-9a52-52535dab6cdd,230.5725,"POLYGON ((529927.334 181241.497, 529934.948 18...",144.969026
4,LB514,1113015,(North side) No.12,252.544533,76,LB514,96cb1fa3-2aa3-4ec6-89cf-08c6bb9d0932,252.94625,"POLYGON ((529903.208 181210.280, 529910.076 18...",252.11319
5,LB573,1113021,(South side) No.35,93.875281,79,LB573,17d94385-e148-48ec-97a4-bfc3233743de,93.865,"POLYGON ((529476.969 181710.012, 529480.430 18...",93.586426


In [99]:
# check some examples from table above to see how area breakdowns work

ref = "LB1531"

map_os = os_dissolved[os_dissolved["cmd_ref"] == ref].explore(
    color = "blue",
    tooltip = False,
    tiles = "CartoDB positron"
)

cmd_clip_gdf[cmd_clip_gdf["reference"] == ref].explore(
    m = map_os,
    color = "red",
    # tooltip = False,
    style_kwds = {
        "fillOpacity" : "0.3"
        }
)

In [44]:
# not all camden listed buildings intersect so rather than using overlay table, left join to it from the original listed building table
cmd_match_areas = cmd_gdf_join[["cmd_ref", "cmd_area"]].merge(
    cmd_join_diss[["cmd_ref_1", "os_area", "int_area"]],
    left_on = "cmd_ref",
    right_on = "cmd_ref_1",
    how = "left"
)

cmd_match_areas["cmd_int_pct"] = cmd_match_areas["int_area"] / cmd_match_areas["cmd_area"]
cmd_match_areas["os_int_pct"] = cmd_match_areas["int_area"] / cmd_match_areas["os_area"]

cmd_match_areas.head()


Unnamed: 0,cmd_ref,cmd_area,cmd_ref_1,os_area,int_area,cmd_int_pct,os_int_pct
0,LB1531,626.057066,LB1531,589.548559,589.518904,0.941638,0.99995
1,LB1532,2.462515,,,,,
2,LB512,282.981721,LB512,283.26125,282.418092,0.998008,0.997023
3,LB513,145.182838,LB513,230.5725,144.969026,0.998527,0.628735
4,LB514,252.544533,LB514,252.94625,252.11319,0.998292,0.996707


In [111]:
# set threshold for combined overlap
threshold = 0.9

# table of listed buildings where the combined match to a dissolved OS geometry is over the threshold 
cmd_match_thresh = cmd_match_areas[(cmd_match_areas["cmd_int_pct"] >= 0.9) & (cmd_match_areas["os_int_pct"] >= 0.9)]

pct_dissolved_matches = len(cmd_match_thresh) / len(cmd_match_areas)


# table of all OS geom refs which went into the dissolved geoms which have a match over threshold
os_match_geoms = cmd_os_lookup[cmd_os_lookup["cmd_ref"].isin(cmd_match_thresh["cmd_ref"])]

pct_os_match_geoms = len(os_match_geoms) / len(os_gdf_join)

print("{} out of {} listed  building geoms have a combined match over threshold to OS dissolved geoms".format(len(cmd_match_thresh), len(cmd_gdf_join)))
print("this equates to {:%} %".format(round(pct_dissolved_matches, 3)))

print("")
print("{} out of {} distinct OS building geoms are included in the dissolved geoms which match listed building outlines".format(len(os_match_geoms), len(os_gdf_join)))
print("this equates to {:%} %".format(round(pct_os_match_geoms, 3)))


63 out of 108 listed  building geoms have a combined match over threshold to OS dissolved geoms
this equates to 58.300000% %

138 out of 2647 distinct OS building geoms are included in the dissolved geoms which match listed building outlines
this equates to 5.200000% %


In [76]:
map_os = os_gdf_join[~os_gdf_join["os_ref"].isin(os_match_geoms["os_ref"])].explore(
    color = "#68afff",  # blue for geoms not matched
    tooltip = False,
    tiles = "CartoDB positron"
)

map_os2 = os_gdf_join[os_gdf_join["os_ref"].isin(os_match_geoms["os_ref"])].explore(
    m = map_os,
    color = "#53ffa2",  # green for geoms matched
    tooltip = False
)

cmd_gdf_join.explore(
    m = map_os2,
    color = "red",
    # tooltip = False,
    style_kwds = {
        "fillOpacity" : "0"
        }
)

In [77]:
map_os = os_dissolved[~os_dissolved["cmd_ref"].isin(cmd_match_thresh["cmd_ref"])].explore(
    color = "#68afff",  # blue for geoms not matched
    tooltip = False,
    tiles = "CartoDB positron"
)

map_os2 = os_dissolved[os_dissolved["cmd_ref"].isin(cmd_match_thresh["cmd_ref"])].explore(
    m = map_os,
    color = "#53ffa2",  # green for geoms matched
    tooltip = False
)

cmd_gdf_join.explore(
    m = map_os2,
    color = "red",
    # tooltip = False,
    style_kwds = {
        "fillOpacity" : "0"
        }
)

### Suggested next steps

* agree estimate approach - using direct matches, or dissolved matches, or other?
* what is the key assessment metric, and are we agreed on the calculation method?
* agree approach to scaling - run for full LAD area?

In [None]:
cmd_match_areas[cmd_match_areas["cmd_ref_1"] == "LB1574"]

Unnamed: 0,cmd_ref,cmd_area,cmd_ref_1,os_area,int_area,cmd_int_pct,os_int_pct
78,LB1574,3098.285096,LB1574,4825.817969,3095.449498,0.999085,0.641435


In [72]:
cmd_match_thresh[cmd_match_thresh["cmd_ref_1"] == "LB1574"]

Unnamed: 0,cmd_ref,cmd_area,cmd_ref_1,os_area,int_area,cmd_int_pct,os_int_pct
78,LB1574,3098.285096,LB1574,4825.817969,3095.449498,0.999085,0.641435
