# Polygon Features

The polygons provided by Ordnance Survey (OS) define building boundaries for domestic properties in Wales. These shapes were used to derive the area, length (perimeter), and uniformity (area/length) of a given building.

In [16]:
import numpy as np
import pandas as pd
import geopandas as gpd
import os
import json
import datetime as dt
from sklearn.neighbors import NearestNeighbors
from itertools import chain

In [29]:
# Load run settings from config.json, which is expected in the parent of the
# current working directory. All file names used by this notebook are read
# from its 'DEFAULT' section.
config_path = os.path.abspath('..')

# Build the path with os.path.join (as the rest of the notebook does) rather
# than concatenating a hard-coded '/' separator.
with open(os.path.join(config_path, 'config.json'), 'r') as f:
    config = json.load(f)

defaults = config['DEFAULT']

# Directory holding every input/output file referenced below
polygon_path = defaults['polygon_path']
# Input file names
polygon_fname = defaults['polygon_fname']
uprn_lookup_fname = defaults['uprn_lookup_fname']
lad_lookup_fname = defaults['lad_lookup_fname']
building_height_fname = defaults['building_height_fname']
built_form_fname = defaults['built_form_fname']
# Output file names
buildings_fname = defaults['buildings_fname']
polygon_features_fname = defaults['polygon_features_fname']

In [30]:
# Read the three CSV lookups stored alongside the polygons:
#   uprn2toid        - joined later on its 'uprn' / 'toid' columns
#   lad_lookup       - joined later on 'uprn' to attach the local authority
#   building_heights - joined later on its 'TOID' column
uprn2toid, lad_lookup, building_heights = (
    pd.read_csv(os.path.join(polygon_path, csv_name))
    for csv_name in (uprn_lookup_fname, lad_lookup_fname, building_height_fname)
)

In [9]:
# Building polygons
# Re-project to EPSG:32630 (WGS 84 / UTM zone 30N) so that the area, length and
# distance calculations further down are in projected units rather than degrees.
# Note: reading and re-projecting takes a long time to run.
buildings = gpd.read_file(os.path.join(polygon_path, polygon_fname))
# Use the `epsg=` keyword: the {'init': 'epsg:...'} dict form is deprecated and
# emits a FutureWarning with pyproj >= 2.
buildings = buildings.to_crs(epsg=32630)

### Calculate features from the polygon shapes

In [11]:
# Shape descriptors derived from each footprint polygon.
footprints = buildings.geometry
hull_area = footprints.convex_hull.area

buildings['area'] = footprints.area
buildings['length'] = footprints.length          # perimeter
# uniformity1: area relative to perimeter
buildings['uniformity1'] = buildings['area'].div(buildings['length'])
buildings['cvx_hull_area'] = hull_area
# uniformity2: share of the convex hull that the footprint actually fills
buildings['uniformity2'] = buildings['area'].div(buildings['cvx_hull_area'])

In [12]:
# Aspect ratio (AR) of each footprint's axis-aligned bounding box:
# longer side divided by shorter side.
bbox = buildings.geometry.bounds
bbox_sides = pd.DataFrame({
    'x': bbox['maxx'] - bbox['minx'],
    'y': bbox['maxy'] - bbox['miny'],
})
aspect_ratio = (bbox_sides.max(axis=1) / bbox_sides.min(axis=1)).to_frame('AR')

# Attach AR to the buildings, matching rows on the shared index
buildings = buildings.merge(aspect_ratio,
                            how='inner',
                            left_index=True,
                            right_index=True)

### Five nearest neighbours based on distances between polygon centroids

In [13]:
# We want each building's five nearest neighbours. To keep the search fast, every
# building is tagged with its local authority (LAD) and the neighbour search is
# later restricted to buildings within the same local authority.
#
# The local authority lookup is keyed by uprn, whereas buildings are keyed by toid.
# An interim lookup holding a single uprn per toid (the first one encountered) is
# therefore built and joined to the local authority lookup.
toid_to_lad = (
    uprn2toid
    .drop_duplicates(subset='toid', keep='first')
    .merge(lad_lookup, how='inner', on='uprn')
)

# Attach the representative uprn and its local authority to every building.
# Left join: buildings without a match are kept with missing values.
buildings = buildings.merge(toid_to_lad, how='left', on='toid')

In [14]:
# Get a list of the LAD entries and drop unwanted entries
lad_list = buildings['lad'].unique().tolist()
lad_list.remove('unknown')
lad_list.remove('Cheshire West and Chester')
lad_list.remove('Shropshire')

Find the nearest neighbours of each TOID using the K-nearest-neighbours (KNN) algorithm. K is set to 6 rather than 5 because each building's closest match is the building itself, which is then excluded.

In [None]:
# Distance from every building's centroid to the centroids of its five nearest
# neighbours, searched separately within each local authority (LAD).
N_NEIGHBOURS = 5
# Output columns: distance to the 1st ... 5th nearest centroid
centroid_dist_cols = ['nn_centroid', 'nn_c2', 'nn_c3', 'nn_c4', 'nn_c5']

buildings_buffer = []
for curr_lad in lad_list:
    print('Processing:\t', curr_lad)

    # Buildings of this LAD, re-indexed 0..n-1 so that row positions line up
    # with the rows of the arrays returned by kneighbors()
    build_lad_slice = (buildings
                       .loc[buildings['lad'] == curr_lad, ['uprn', 'geometry']]
                       .reset_index(drop=True))

    # kneighbors() raises when asked for more neighbours than there are samples,
    # so LADs that are too small are skipped; their buildings simply get no
    # neighbour features when the results are merged back.
    if len(build_lad_slice) <= N_NEIGHBOURS:
        print('Skipping (too few buildings):\t', curr_lad)
        continue

    # Compute the centroids once and stack x / y into an (n, 2) array
    centroids = build_lad_slice.centroid
    nn_array = np.column_stack([centroids.x, centroids.y])

    # Ask for one extra neighbour: the closest match of every point is the
    # point itself at distance 0
    knn = NearestNeighbors(n_neighbors=N_NEIGHBOURS + 1,
                           algorithm='ball_tree').fit(nn_array)
    distances, _ = knn.kneighbors(nn_array)

    # Drop the first column (the zero distance to itself)
    nn_dist = pd.DataFrame(distances[:, 1:], columns=centroid_dist_cols)

    # Number of the five neighbour distances that are non-zero
    nn_dist['nn_centroid_count'] = (nn_dist[centroid_dist_cols]
                                    .astype(bool)
                                    .sum(axis=1))

    # Pair the distances with the uprn of each building (same 0..n-1 index)
    buildings_buffer.append(build_lad_slice[['uprn']].join(nn_dist))

In [None]:
# Stack the per-LAD results into one dataframe with a fresh index
buildings_centroid_nn = pd.concat(objs=buildings_buffer, axis=0, ignore_index=True)

# Attach the centroid neighbour features to the buildings by uprn.
# Left join: buildings that were not processed keep missing values.
buildings = buildings.merge(buildings_centroid_nn, how='left', on='uprn')

### Five nearest neighbours based on distances between polygon geometry

In [17]:
# Distance between each building's polygon and the polygons of its five nearest
# neighbours. Neighbours are chosen by centroid distance within the same local
# authority (LAD); the distances reported are polygon-to-polygon.
N_NEIGHBOURS = 5
neighbour_ranks = range(1, N_NEIGHBOURS + 1)
uprn_cols = ['uprn{}'.format(rank) for rank in neighbour_ranks]   # uprn1 ... uprn5
dist_cols = ['d{}'.format(rank) for rank in neighbour_ranks]      # d1 ... d5

buildings_buffer = []
for curr_lad in lad_list:

    print('Processing:\t', curr_lad)

    # Buildings of this LAD, re-indexed 0..n-1 so that the positions returned by
    # kneighbors() can be used directly as row positions
    build_lad_slice = (buildings
                       .loc[buildings['lad'] == curr_lad, ['uprn', 'geometry']]
                       .reset_index(drop=True))

    # kneighbors() raises when asked for more neighbours than there are samples,
    # so LADs that are too small are skipped; their buildings simply get no
    # neighbour features when the results are merged back.
    if len(build_lad_slice) <= N_NEIGHBOURS:
        print('Skipping (too few buildings):\t', curr_lad)
        continue

    # Compute the centroids once and stack x / y into an (n, 2) array
    centroids = build_lad_slice.centroid
    nn_array = np.column_stack([centroids.x, centroids.y])

    # Ask for one extra neighbour: the closest match of every point is taken to
    # be the point itself.
    # NOTE(review): if two buildings share an identical centroid, column 0 may
    # not be the building itself - confirm whether that can occur in the data.
    knn = NearestNeighbors(n_neighbors=N_NEIGHBOURS + 1,
                           algorithm='ball_tree').fit(nn_array)
    _, indices = knn.kneighbors(nn_array)

    nn_geom = build_lad_slice[['uprn', 'geometry']].copy()
    for rank in neighbour_ranks:
        # Row positions of the rank-th nearest neighbour of every building
        neighbour_pos = indices[:, rank]

        # uprn of that neighbour
        nn_geom['uprn{}'.format(rank)] = build_lad_slice['uprn'].iloc[neighbour_pos].values

        # Polygon of that neighbour, re-indexed to 0..n-1 so that distance()
        # pairs row i of the buildings with the neighbour of row i
        neighbour_geom = (build_lad_slice.geometry
                          .iloc[neighbour_pos]
                          .reset_index(drop=True))
        nn_geom['d{}'.format(rank)] = nn_geom.geometry.distance(neighbour_geom)

    # Smallest polygon-to-polygon distance among the five neighbours
    nn_geom['nn_geometry'] = nn_geom[dist_cols].min(axis=1)
    # Number of the five neighbours whose polygon distance is non-zero
    nn_geom['nn_count'] = nn_geom[dist_cols].astype(bool).sum(axis=1)

    # Add to the LAD buffer
    buildings_buffer.append(
        nn_geom[['uprn', 'nn_geometry', 'nn_count'] + uprn_cols + dist_cols])

Processing:	 Flintshire
Processing:	 Denbighshire
Processing:	 Wrexham
Processing:	 Conwy
Processing:	 Powys
Processing:	 Monmouthshire
Processing:	 Caerphilly
Processing:	 Rhondda Cynon Taf
Processing:	 Merthyr Tydfil
Processing:	 Blaenau Gwent
Processing:	 Torfaen
Processing:	 Vale of Glamorgan
Processing:	 Cardiff
Processing:	 Newport
Processing:	 Gwynedd
Processing:	 Isle of Anglesey
Processing:	 Pembrokeshire
Processing:	 Carmarthenshire
Processing:	 Ceredigion
Processing:	 Swansea
Processing:	 Neath Port Talbot
Processing:	 Bridgend


In [18]:
# Stack the per-LAD results into one dataframe with a fresh index
buildings_geometry_nn = pd.concat(objs=buildings_buffer, axis=0, ignore_index=True)

# Attach the polygon neighbour features to the buildings by uprn.
# Left join: buildings that were not processed keep missing values.
buildings = buildings.merge(buildings_geometry_nn, how='left', on='uprn')

### Export dataset

In [None]:
# Write the interim buildings dataset (without the row index) so that the
# separate block_counts script can read it
interim_path = os.path.join(polygon_path, buildings_fname)
buildings.to_csv(interim_path, index=False)

### Adjusting building metrics

A building may contain multiple properties, for instance a block of flats. The polygons capture the whole building outline and are recorded by topographic identifier (TOID), with a lookup to each Unique Property Reference Number (UPRN). A TOID represents an individual building, so for houses the data provides accurate information: shape, area, length and uniformity. For flats, however, the exact details of each property are unknown, so the area and length have been divided equally between the properties in the building.

In [19]:
# Expand the buildings to one row per property by joining the full uprn lookup.
# Left join: a building without any uprn keeps a single row.
buildings = buildings.merge(uprn2toid, how='left', on='toid')

# Number of rows (one per uprn) that share each toid
uprn_per_toid = (
    buildings['toid']
    .value_counts()
    .rename_axis('toid')
    .reset_index(name='uprn_count')
)
buildings = buildings.merge(uprn_per_toid, how='inner', on='toid')

# Split the whole-building metrics equally between the properties of the building
properties_in_building = buildings['uprn_count']
buildings['adj_area'] = buildings['area'].div(properties_in_building)
buildings['adj_length'] = buildings['length'].div(properties_in_building)
# NOTE(review): the property count cancels in both ratios below, so they are
# algebraically equal to uniformity1 / uniformity2 - confirm this is intended.
buildings['adj_uniformity1'] = buildings['adj_area'].div(buildings['adj_length'])
buildings['adj_cvx_hull_area'] = buildings['cvx_hull_area'].div(properties_in_building)
buildings['adj_uniformity2'] = buildings['adj_area'].div(buildings['adj_cvx_hull_area'])

Flats have been determined by counting the number of UPRNs within a TOID. If the count is greater than one, then the properties within the building are all deemed to be flats.

In [20]:
# Flag flats: 1 when the building holds more than one uprn, otherwise 0.
# Vectorised comparison instead of a row-wise apply(axis=1), which calls a
# Python function once per row and is very slow on a dataset of this size.
buildings['flat'] = (buildings['uprn_count'] > 1).astype('int64')

### Merging building heights

In [31]:
# Attach the building-height attributes. The key is named 'toid' on the buildings
# and 'TOID' in the heights table; left join keeps buildings without a height.
buildings = buildings.merge(
    building_heights,
    how='left',
    left_on='toid',
    right_on='TOID',
)

In [32]:
# Divide the height measures by the number of properties in the building,
# mirroring the adjustment applied to area and length
for adj_col, height_col in (('adj_reH2', 'ReH2'), ('adj_AbsHMax', 'AbsHMax')):
    buildings[adj_col] = buildings[height_col].div(buildings['uprn_count'])

In [33]:
# Two uprn columns exist after the merges ('uprn_x' and 'uprn_y').
# Discard 'uprn_x' and keep 'uprn_y' under the plain name 'uprn'.
buildings = (
    buildings
    .drop(columns='uprn_x')
    .rename(columns={'uprn_y': 'uprn'})
)

### Merging built form from block_counts script

In [34]:
# Read the built-form table (file name taken from the config)
built_form_path = os.path.join(polygon_path, built_form_fname)
built_form = pd.read_csv(built_form_path)

In [35]:
# Attach built form and block count by toid.
# Left join: buildings without an entry keep missing values.
built_form_cols = ['toid', 'built_form', 'block_count']
buildings_complete = buildings.merge(
    built_form[built_form_cols],
    on='toid',
    how='left',
)

### Export data

In [None]:
# Write the final polygon-features dataset (without the row index)
features_path = os.path.join(polygon_path, polygon_features_fname)
buildings_complete.to_csv(features_path, index=False)