# Test Full Code with Birmingham
### Gavin Rolls

This notebook will be a compilation of the methods I've applied to London as my area of study and attempt to recreate the model in Birmingham

# Data Collection

### Imports + Database Configuration

In [2]:
#Library Imports - using DuckDB for Overture Import

#Database
import duckdb

#Basics
import pandas as pd
import geopandas as gpd
import numpy as np
from geopy.geocoders import Nominatim

#OpenStreetMap
import osmnx as ox

#Shapely
from geopy.geocoders import Nominatim
from shapely.geometry import box
from shapely import wkt
import shapely.geometry
from shapely.geometry import Polygon, MultiPolygon
from shapely.geometry import mapping
from shapely.geometry import shape

#Plots
import matplotlib.pyplot as plt
import seaborn as sns

#Warning Supression
import warnings

#Machine Learning
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.preprocessing import OneHotEncoder
import ast

#ML from mljar-supervised
from supervised.automl import AutoML

#Warning Supression
import warnings

In [6]:
#Config SQL
%pip install ipython-sql duckdb duckdb-engine jupysql --quiet
%pip install --upgrade grpcio --quiet
%load_ext sql

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [7]:
%config SqlMagic.autopandas = True
%config SqlMagic.feedback = False
%config SqlMagic.displaycon = False
%sql duckdb:///:memory:

In [8]:
%%sql      
INSTALL httpfs;

LOAD httpfs;

INSTALL spatial;

LOAD spatial;

SET s3_region='us-west-2';

Unnamed: 0,Success


### Download Buildings - Overture

Downloaded as 'data/bham_buildings_overture.geojson'

In [10]:
#Get Birmingham Bounding Box

# Initialize the geolocator
geolocator = Nominatim(user_agent="geoapi")

# Get location data
location = geolocator.geocode("Birmingham")

# Get the bounding box
bounding_box = location.raw['boundingbox']

# Convert bounding box to coordinates
min_lat, max_lat = float(bounding_box[0]), float(bounding_box[1])
min_lon, max_lon = float(bounding_box[2]), float(bounding_box[3])

print(min_lon)
print(min_lat)
print(max_lon)
print(max_lat)

-2.0336486
52.381053
-1.7288417
52.6087058


In [11]:
%%sql

LOAD azure;

SET azure_storage_connection_string = 'DefaultEndpointsProtocol=https;AccountName=overturemapswestus2;AccountKey=;EndpointSuffix=core.windows.net';
COPY (
SELECT
    names.primary as primary_name,
    height,
    level,
    ST_GeomFromWKB(geometry) as geometry
FROM read_parquet('azure://release/2024-05-16-beta.0/theme=buildings/type=building/*', filename=true, hive_partitioning=1)
WHERE primary_name IS NOT NULL
AND bbox.xmin > -2.0336486
AND bbox.xmax < -1.7288417
AND bbox.ymin > 52.381053
AND bbox.ymax < 52.6087058
) TO 'data/overture_data/bham_buildings_overture.geojson'
WITH (FORMAT GDAL, DRIVER 'GeoJSON', SRS 'EPSG:4326');


### Download POIs - Overture

Downloaded as 'data/bham_places_overture.geojson''

In [13]:
%%sql

COPY (
    SELECT
        names.primary AS name,
        categories.main as category,
        ROUND(confidence,2) as confidence,
        ST_GeomFromWKB(geometry) as geometry
FROM read_parquet('s3://overturemaps-us-west-2/release/2024-05-16-beta.0/theme=places/*/*')
WHERE
    bbox.xmin BETWEEN -2.0336486 AND -1.7288417 AND
    bbox.ymin BETWEEN 52.381053 AND 52.6087058
) TO 'data/overture_data/bham_places_overture.geojson' WITH (FORMAT GDAL, DRIVER 'GeoJSON', SRS 'EPSG:4326');

### Overture Download Stats

In [4]:
#Basic overview stats of Birmingham Data - Using the locally saved files here
buildings = gpd.read_file('data/overture_data/bham_buildings_overture.geojson')
places = gpd.read_file('data/overture_data/bham_places_overture.geojson')

#Count of Features
print("Birmingham Building Count: " + str(buildings.shape[0]))
print("Birmingham POI count: " + str(places.shape[0]))

Birmingham Building Count: 11987
Birmingham POI count: 46781


### Download Buildings - OSM

Downloaded as 'data/osmbuildings/building_footprints.geojson'

In [5]:
#Download Data for Birmingham
place_name = 'Birmingham, United Kingdom'

#Test new config
ox.config(use_cache=True, log_console=True)

buildings = ox.features_from_place(place_name, tags={'building': True})
buildings = buildings[buildings.geometry.notnull()]
building_footprints = buildings[buildings.geom_type.isin(['Polygon', 'MultiPolygon'])]

for col in building_footprints.columns:
    if building_footprints.apply(lambda x: isinstance(x, list)).any():
        building_footprints = building_footprints[col].apply(lambda x: str(x) if isinstance(x, list) else x)

building_footprints = building_footprints[['name', 'geometry']].reset_index()
print(building_footprints)
        
# Save the combined GeoDataFrame to a geojson file
building_footprints.to_file("data/osm_data/bham_buildings.geojson", driver="GeoJSON")



       element_type     osmid                                          name  \
0               way  16966777                                           NaN   
1               way  16966784                                           NaN   
2               way  16966785                                           NaN   
3               way  16966788                                           NaN   
4               way  16966789                                           NaN   
...             ...       ...                                           ...   
256261     relation  15607352                   Beauchamp Convenience Store   
256262     relation  15809922                                       Block A   
256263     relation  15823526                                           NaN   
256264     relation  16414329  Birmingham Settlement - 610 Community Centre   
256265     relation  16542773                                           NaN   

                                                 ge

### Extract Building Type Info from OSM - Commercial

In [6]:
#Download Data for Birmingham
place_name = 'Birmingham, United Kingdom'

commercial_buildings = ox.features_from_place(place_name, tags={'building': ['commercial']})

commercial_buildings = commercial_buildings[commercial_buildings.geometry.notnull()]
commercial_building_footprints = commercial_buildings[commercial_buildings.geom_type.isin(['Polygon', 'MultiPolygon'])]

for col in commercial_building_footprints.columns:
    if commercial_building_footprints.apply(lambda x: isinstance(x, list)).any():
        commercial_building_footprints = commercial_building_footprints[col].apply(lambda x: str(x) if isinstance(x, list) else x)

commercial_building_footprints = commercial_building_footprints[['name', 'geometry']].reset_index()
print(commercial_building_footprints)
        
# Save the combined GeoDataFrame to a geojson file
commercial_building_footprints.to_file("data/osm_data/bham_commercial_buildings.geojson", driver="GeoJSON")

    element_type       osmid                     name  \
0            way    17405658                      NaN   
1            way    20002347  Dental Care Partnership   
2            way    22763584            Station House   
3            way    22763760                      NaN   
4            way    22763952                      NaN   
..           ...         ...                      ...   
312          way  1259115143                      NaN   
313          way  1259115144                      NaN   
314          way  1259115145                      NaN   
315     relation      333915      The Custard Factory   
316     relation     7076030                Eagle Two   

                                              geometry  
0    POLYGON ((-1.82146 52.56186, -1.82179 52.56209...  
1    POLYGON ((-1.82680 52.55679, -1.82680 52.55680...  
2    POLYGON ((-1.82027 52.56533, -1.82036 52.56528...  
3    POLYGON ((-1.81624 52.56247, -1.81633 52.56237...  
4    POLYGON ((-1.82061 52.565

### Extract Building Type Info from OSM - Office

In [8]:
#Download Data for Birmingham
place_name = 'Birmingham, United Kingdom'

ox.config(use_cache=True, log_console=True)

office_buildings = ox.features_from_place(place_name, tags={'building': ['office']})

office_buildings = office_buildings[office_buildings.geometry.notnull()]
office_building_footprints = office_buildings[office_buildings.geom_type.isin(['Polygon', 'MultiPolygon'])]

for col in office_building_footprints.columns:
    if office_building_footprints.apply(lambda x: isinstance(x, list)).any():
        office_building_footprints[col] = office_building_footprints[col].apply(lambda x: str(x) if isinstance(x, list) else x)

office_building_footprints = office_building_footprints[['name', 'geometry']].reset_index()
print(office_building_footprints)
        
# Save the combined GeoDataFrame to a geojson file
office_building_footprints.to_file("data/osm_data/bham_office_buildings.geojson", driver="GeoJSON")


    element_type     osmid                     name  \
0            way  22820818        Baskerville House   
1            way  28292471     No. 1 Colmore Square   
2            way  28425056         McLaren Building   
3            way  28455010        Gazette Buildings   
4            way  28514237                      NaN   
..           ...       ...                      ...   
190     relation   1298933           Fountain Court   
191     relation   3417218     Eight Brindley Place   
192     relation   3417219     Seven Brindley Place   
193     relation   3417220       Ten Brindley Place   
194     relation   4631334  Birmingham Royal Ballet   

                                              geometry  
0    POLYGON ((-1.90809 52.48018, -1.90735 52.48039...  
1    POLYGON ((-1.89668 52.48235, -1.89675 52.48258...  
2    POLYGON ((-1.89265 52.48192, -1.89247 52.48183...  
3    POLYGON ((-1.89313 52.48261, -1.89339 52.48236...  
4    POLYGON ((-1.90172 52.47568, -1.90125 52.47528...



### Extract Building Type Info from OSM - Residential

In [9]:
#Download Data for Birmingham
place_name = 'Birmingham, United Kingdom'

ox.config(use_cache=True, log_console=True)

residential_buildings = ox.features_from_place(place_name, tags={'building': ['residential']})

residential_buildings = residential_buildings[residential_buildings.geometry.notnull()]
residential_building_footprints = residential_buildings[residential_buildings.geom_type.isin(['Polygon', 'MultiPolygon'])]

for col in residential_building_footprints.columns:
    if residential_building_footprints.apply(lambda x: isinstance(x, list)).any():
        residential_building_footprints[col] = residential_building_footprints[col].apply(lambda x: str(x) if isinstance(x, list) else x)

residential_building_footprints = residential_building_footprints[['name', 'geometry']].reset_index()
print(residential_building_footprints)
        
# Save the combined GeoDataFrame to a geojson file
residential_building_footprints.to_file("data/osm_data/bham_residential_buildings.geojson", driver="GeoJSON")




       element_type     osmid           name  \
0               way  16966777            NaN   
1               way  16966784            NaN   
2               way  16966785            NaN   
3               way  16966788            NaN   
4               way  16966789            NaN   
...             ...       ...            ...   
189381     relation  10407216            NaN   
189382     relation  11382596  Bentley House   
189383     relation  12197160            NaN   
189384     relation  15809922        Block A   
189385     relation  16542773            NaN   

                                                 geometry  
0       POLYGON ((-1.82461 52.55549, -1.82450 52.55543...  
1       POLYGON ((-1.82324 52.55438, -1.82301 52.55434...  
2       POLYGON ((-1.82273 52.55403, -1.82264 52.55400...  
3       POLYGON ((-1.82341 52.55414, -1.82338 52.55412...  
4       POLYGON ((-1.82311 52.55396, -1.82302 52.55391...  
...                                                   ...  
189

### Extract Building Type Info from OSM - Retail

In [10]:
#Download Data for Birmingham
place_name = 'Birmingham, United Kingdom'

ox.config(use_cache=True, log_console=True)

retail_buildings = ox.features_from_place(place_name, tags={'building': ['retail']})

retail_buildings = retail_buildings[retail_buildings.geometry.notnull()]
retail_building_footprints = retail_buildings[retail_buildings.geom_type.isin(['Polygon', 'MultiPolygon'])]

for col in retail_building_footprints.columns:
    if retail_building_footprints.apply(lambda x: isinstance(x, list)).any():
        retail_building_footprints[col] = retail_building_footprints[col].apply(lambda x: str(x) if isinstance(x, list) else x)

retail_building_footprints = retail_building_footprints[['name', 'geometry']].reset_index()
print(retail_building_footprints)
        
# Save the combined GeoDataFrame to a geojson file
retail_building_footprints.to_file("data/osm_data/bham_retail_buildings.geojson", driver="GeoJSON")




     element_type     osmid                         name  \
0             way  16969083                        Boots   
1             way  17273169                          NaN   
2             way  20002348                  Vesey Manor   
3             way  20002352             Driffold Gallery   
4             way  23585174                     One Stop   
...           ...       ...                          ...   
3499     relation   2912290                  Sainsbury's   
3500     relation   5536524                Grand Central   
3501     relation  10748407             Picadilly Arcade   
3502     relation  14397641                          NaN   
3503     relation  15607352  Beauchamp Convenience Store   

                                               geometry  
0     POLYGON ((-1.82369 52.55768, -1.82379 52.55771...  
1     POLYGON ((-1.89325 52.47748, -1.89313 52.47745...  
2     POLYGON ((-1.82665 52.55648, -1.82665 52.55630...  
3     POLYGON ((-1.82722 52.55565, -1.82734 52.

# Data Cleaning
### Taken from v2

### UK BRES Employment Data

In [11]:
#Skip the first six rows because they're header information
empl_data = pd.read_csv('data/employment_data/lsoa_by_industry.csv', skiprows=7, delimiter=',')

unnamed_cols = empl_data.columns[empl_data.columns.str.contains('^Unnamed:')]
empl_data.drop(columns=unnamed_cols, inplace=True)

#Separate name into LSOA11CD and LSOA11NM
def split_column(value):
    if isinstance(value, str) and 'lsoa2011:' in value:
        parts = value.split('lsoa2011:')[1]
        code, name = parts.split(' : ')
        return code.strip(), name.strip()
    else:
        return None, None

empl_data[['LSOA11CD', 'LSOA11NM']] = empl_data['Area'].apply(lambda x: pd.Series(split_column(x)))

# Drop rows not in Birmingham
empl_data.dropna(subset=['LSOA11NM'], inplace=True)
empl_data = empl_data[empl_data['LSOA11NM'].str.contains('Birmingham')]

print("Num Rows (LSOAs) Before Cleaning: " + str(empl_data.shape[0]))

#There appear to be a bunch of duplicates so I'm going to get rid of them now
empl_data.drop_duplicates(inplace=True)

print("Num Rows (LSOAs) After Cleaning: " + str(empl_data.shape[0]))

Num Rows (LSOAs) Before Cleaning: 639
Num Rows (LSOAs) After Cleaning: 639


In [12]:
#Create Total Employment Column
empl_data[empl_data.columns[1:-2]] = empl_data[empl_data.columns[1:-2]].apply(pd.to_numeric, errors='coerce')
empl_data['total_employment'] = empl_data[empl_data.columns[1:-2]].sum(axis=1)

empl_data.head()

max_index = empl_data['total_employment'].idxmax()
max_row = empl_data.loc[max_index]
print(max_row)

Area                                                                                                lsoa2011:E01033620 : Birmingham 138A
01 : Crop and animal production, hunting and related service activities                                                              0.0
02 : Forestry and logging                                                                                                            0.0
03 : Fishing and aquaculture                                                                                                         0.0
05 : Mining of coal and lignite                                                                                                      0.0
                                                                                                                    ...                 
98 : Undifferentiated goods- and services-producing activities of private households for own use                                     0.0
99 : Activities of extraterritorial organ

### LSOA Geographic Data

In [13]:
#Get LSOA Shapefile Data
lsoa_geo = gpd.read_file('data/lsoa_data/LSOA_2011_EW_BFE_V3.shp')

#Convert to WGS for consistency
lsoa_geo = lsoa_geo.to_crs(epsg=4326)

print("Num Rows (LSOAs): " + str(lsoa_geo.shape[0]))

#Get rid of columns I'm not using for now
lsoa_geo = lsoa_geo.drop(columns=['BNG_E', 'BNG_N', 'LONG_', 'LAT', 'GlobalID', 'Shape_Leng'])

#Join with population
lsoa_pop = pd.read_csv('data/lsoa_data/lsoa_pop.csv')

lsoa_geo.drop(columns=['LSOA11NM'])

#Get population estimates - had to be pulled in from separate Census dataset
lsoa_geo = lsoa_geo.merge(lsoa_pop, on='LSOA11CD')

lsoa_geo.head()

Num Rows (LSOAs): 34753


Unnamed: 0,LSOA11CD,LSOA11NM_x,geometry,LSOA11NM_y,Unnamed: 2,population
0,E01000034,Barking and Dagenham 003A,"POLYGON ((0.17380 51.56013, 0.17379 51.56012, ...",Barking and Dagenham 003A,,1444
1,E01000035,Barking and Dagenham 010A,"POLYGON ((0.16598 51.55250, 0.16601 51.55248, ...",Barking and Dagenham 010A,,1610
2,E01000036,Barking and Dagenham 010B,"POLYGON ((0.15965 51.54776, 0.15974 51.54771, ...",Barking and Dagenham 010B,,1353
3,E01000037,Barking and Dagenham 003B,"POLYGON ((0.18508 51.56480, 0.18476 51.56454, ...",Barking and Dagenham 003B,,1688
4,E01000038,Barking and Dagenham 003C,"POLYGON ((0.17599 51.56476, 0.17601 51.56475, ...",Barking and Dagenham 003C,,1447


### Overture Places Data

In [14]:
overture_places = gpd.read_file('data/overture_data/bham_places_overture.geojson')

print('Number of POIs in Overture (Birmingham): ' + str(overture_places.shape[0]))
overture_places.head()

Number of POIs in Overture (Birmingham): 46781


Unnamed: 0,name,category,confidence,geometry
0,Beacon Hill Toposcope,landmark_and_historical_building,0.96,POINT (-2.01965 52.38213)
1,Lickey Hills Country Park - Beacon Hill,park,0.67,POINT (-2.01972 52.38229)
2,Lickey Monument,monument,0.55,POINT (-2.01956 52.38227)
3,Beacon Castle,park,0.67,POINT (-2.01961 52.38234)
4,Barnett Fair Unisex Hairdressing,beauty_salon,0.96,POINT (-2.00227 52.38278)


### OpenStreetMap Buildings Data

I will query each set of building types and then collapse the dataset down so that tags are preserved

In [15]:
osm_all_buildings = gpd.read_file('data/osm_data/bham_buildings.geojson')

print('Number of Buildings in OSM (Birmingham): ' + str(osm_all_buildings.shape[0]))
osm_all_buildings.head()

Number of Buildings in OSM (Birmingham): 256266


Unnamed: 0,element_type,osmid,name,geometry
0,way,16966777,,"POLYGON ((-1.82461 52.55549, -1.82450 52.55543..."
1,way,16966784,,"POLYGON ((-1.82324 52.55438, -1.82301 52.55434..."
2,way,16966785,,"POLYGON ((-1.82273 52.55403, -1.82264 52.55400..."
3,way,16966788,,"POLYGON ((-1.82341 52.55414, -1.82338 52.55412..."
4,way,16966789,,"POLYGON ((-1.82311 52.55396, -1.82302 52.55391..."


In [16]:
# Commercial Buildings

osm_commercial_buildings = gpd.read_file('data/osm_data/bham_commercial_buildings.geojson')

print('Number of Commercial Buildings in OSM (Birmingham): ' + str(osm_commercial_buildings.shape[0]))
osm_commercial_buildings.head()

Number of Commercial Buildings in OSM (Birmingham): 317


Unnamed: 0,element_type,osmid,name,geometry
0,way,17405658,,"POLYGON ((-1.82146 52.56186, -1.82179 52.56209..."
1,way,20002347,Dental Care Partnership,"POLYGON ((-1.82680 52.55679, -1.82680 52.55680..."
2,way,22763584,Station House,"POLYGON ((-1.82027 52.56533, -1.82036 52.56528..."
3,way,22763760,,"POLYGON ((-1.81624 52.56247, -1.81633 52.56237..."
4,way,22763952,,"POLYGON ((-1.82061 52.56543, -1.82042 52.56531..."


In [17]:
# Office Buildings

osm_office_buildings = gpd.read_file('data/osm_data/bham_office_buildings.geojson')

print('Number of Office Buildings in OSM (Birmingham): ' + str(osm_office_buildings.shape[0]))
osm_office_buildings.head()

Number of Office Buildings in OSM (Birmingham): 195


Unnamed: 0,element_type,osmid,name,geometry
0,way,22820818,Baskerville House,"POLYGON ((-1.90809 52.48018, -1.90735 52.48039..."
1,way,28292471,No. 1 Colmore Square,"POLYGON ((-1.89668 52.48235, -1.89675 52.48258..."
2,way,28425056,McLaren Building,"POLYGON ((-1.89265 52.48192, -1.89247 52.48183..."
3,way,28455010,Gazette Buildings,"POLYGON ((-1.89313 52.48261, -1.89339 52.48236..."
4,way,28514237,,"POLYGON ((-1.90172 52.47568, -1.90125 52.47528..."


In [18]:
# Residential Buildings

osm_residential_buildings = gpd.read_file('data/osm_data/bham_residential_buildings.geojson')

print('Number of Residential Buildings in OSM (Birmingham): ' + str(osm_residential_buildings.shape[0]))
osm_residential_buildings.head()

Number of Residential Buildings in OSM (Birmingham): 189386


Unnamed: 0,element_type,osmid,name,geometry
0,way,16966777,,"POLYGON ((-1.82461 52.55549, -1.82450 52.55543..."
1,way,16966784,,"POLYGON ((-1.82324 52.55438, -1.82301 52.55434..."
2,way,16966785,,"POLYGON ((-1.82273 52.55403, -1.82264 52.55400..."
3,way,16966788,,"POLYGON ((-1.82341 52.55414, -1.82338 52.55412..."
4,way,16966789,,"POLYGON ((-1.82311 52.55396, -1.82302 52.55391..."


In [19]:
# Retail Buildings

osm_retail_buildings = gpd.read_file('data/osm_data/bham_retail_buildings.geojson')

print('Number of Retail Buildings in OSM (Birmingham): ' + str(osm_retail_buildings.shape[0]))
osm_retail_buildings.head()

Number of Retail Buildings in OSM (Birmingham): 3504


Unnamed: 0,element_type,osmid,name,geometry
0,way,16969083,Boots,"POLYGON ((-1.82369 52.55768, -1.82379 52.55771..."
1,way,17273169,,"POLYGON ((-1.89325 52.47748, -1.89313 52.47745..."
2,way,20002348,Vesey Manor,"POLYGON ((-1.82665 52.55648, -1.82665 52.55630..."
3,way,20002352,Driffold Gallery,"POLYGON ((-1.82722 52.55565, -1.82734 52.55554..."
4,way,23585174,One Stop,"POLYGON ((-1.90532 52.51745, -1.90573 52.51726..."


In [20]:
# Assign a building type in the main DataFrame if the osmid is found in one of the other DataFrames

# I know that office and retail buildings could have commercial or office but office and retail will overwrite commercia
# (and are thus checked after) because they're more specific

osm_all_buildings['building_type'] = 'none'

osm_all_buildings.loc[osm_all_buildings['osmid'].isin(osm_commercial_buildings['osmid']), 'building_type'] = 'commercial'
osm_all_buildings.loc[osm_all_buildings['osmid'].isin(osm_retail_buildings['osmid']), 'building_type'] = 'retail'
osm_all_buildings.loc[osm_all_buildings['osmid'].isin(osm_office_buildings['osmid']), 'building_type'] = 'office'
osm_all_buildings.loc[osm_all_buildings['osmid'].isin(osm_residential_buildings['osmid']), 'building_type'] = 'residential'

osm_all_buildings.head(50)

Unnamed: 0,element_type,osmid,name,geometry,building_type
0,way,16966777,,"POLYGON ((-1.82461 52.55549, -1.82450 52.55543...",residential
1,way,16966784,,"POLYGON ((-1.82324 52.55438, -1.82301 52.55434...",residential
2,way,16966785,,"POLYGON ((-1.82273 52.55403, -1.82264 52.55400...",residential
3,way,16966788,,"POLYGON ((-1.82341 52.55414, -1.82338 52.55412...",residential
4,way,16966789,,"POLYGON ((-1.82311 52.55396, -1.82302 52.55391...",residential
5,way,16966791,,"POLYGON ((-1.82282 52.55433, -1.82266 52.55431...",residential
6,way,16966794,Royal (Proposed),"POLYGON ((-1.82619 52.55661, -1.82611 52.55650...",none
7,way,16966799,,"POLYGON ((-1.82425 52.55803, -1.82413 52.55804...",residential
8,way,16966800,James Preston Health Centre,"POLYGON ((-1.82348 52.55799, -1.82331 52.55823...",none
9,way,16966807,,"POLYGON ((-1.82428 52.55791, -1.82431 52.55807...",residential


## Data Joining + Prep

### Joining LSOA Geography and Employment Data

In [21]:
#Note that total Birmingham statistics are dropped here because there's no equivalent column in lsoa_geo
empl_geog = pd.merge(lsoa_geo, empl_data, on = "LSOA11CD")

print("Num Rows (LSOAS): " + str(empl_geog.shape[0]))

empl_geog.head()

Num Rows (LSOAS): 639


Unnamed: 0,LSOA11CD,LSOA11NM_x,geometry,LSOA11NM_y,Unnamed: 2,population,Area,"01 : Crop and animal production, hunting and related service activities",02 : Forestry and logging,03 : Fishing and aquaculture,...,92 : Gambling and betting activities,93 : Sports activities and amusement and recreation activities,94 : Activities of membership organisations,95 : Repair of computers and personal and household goods,96 : Other personal service activities,97 : Activities of households as employers of domestic personnel,98 : Undifferentiated goods- and services-producing activities of private households for own use,99 : Activities of extraterritorial organisations and bodies,LSOA11NM,total_employment
0,E01008881,Birmingham 067A,"POLYGON ((-1.80957 52.47014, -1.80953 52.47009...",Birmingham 067A,,1599,lsoa2011:E01008881 : Birmingham 067A,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,Birmingham 067A,455.0
1,E01008882,Birmingham 066A,"POLYGON ((-1.82792 52.46893, -1.82768 52.46891...",Birmingham 066A,,1747,lsoa2011:E01008882 : Birmingham 066A,0.0,0.0,0.0,...,0.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0,Birmingham 066A,30.0
2,E01008883,Birmingham 078A,"POLYGON ((-1.82144 52.46633, -1.82143 52.46632...",Birmingham 078A,,1816,lsoa2011:E01008883 : Birmingham 078A,0.0,0.0,0.0,...,0.0,0.0,20.0,0.0,0.0,0.0,0.0,0.0,Birmingham 078A,1810.0
3,E01008884,Birmingham 078B,"POLYGON ((-1.82576 52.46527, -1.82579 52.46516...",Birmingham 078B,,1870,lsoa2011:E01008884 : Birmingham 078B,0.0,0.0,0.0,...,0.0,0.0,0.0,5.0,40.0,0.0,0.0,0.0,Birmingham 078B,2590.0
4,E01008885,Birmingham 076A,"POLYGON ((-1.80450 52.46160, -1.80414 52.46135...",Birmingham 076A,,1308,lsoa2011:E01008885 : Birmingham 076A,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,15.0,0.0,0.0,0.0,Birmingham 076A,90.0


### Add POI and Building Data

In [22]:
#Dealing with filter warnings for empty geometries in particular LSOAs
warnings.filterwarnings("ignore", category=FutureWarning)

#Init Columns (This was causing an error at first when I didn't put it here to begin)
empl_geog['num_buildings'] = 0
empl_geog['num_retail_buildings'] = 0
empl_geog['num_residential_buildings'] = 0
empl_geog['num_commercial_buildings'] = 0
empl_geog['num_office_buildings'] = 0

empl_geog['num_places'] = 0

empl_geog['building_poly'] = None
empl_geog['commercial_building_poly'] = None
empl_geog['retail_building_poly'] = None
empl_geog['office_building_poly'] = None
empl_geog['residential_building_poly'] = None

empl_geog['place_points'] = None
empl_geog['category_list'] = None

#Iterate through all LSOAs and grab OSM building counts and subcategories - add to empl_geog dataframe

for index, row in empl_geog.iterrows():

    #Status report
    if (index % 20 == 0):
        print(index)
    
    geom = row['geometry']
    filter_geom = gpd.GeoSeries([geom], crs=lsoa_geo.crs)

    #All Buildings - OSM
    filtered_osm_buildings = osm_all_buildings[osm_all_buildings.geometry.intersects(filter_geom.unary_union)]

    #Get subcategories based on tags
    commercial_buildings = osm_all_buildings[osm_all_buildings['building_type'] == 'commercial']
    office_buildings = osm_all_buildings[osm_all_buildings['building_type'] == 'office']
    retail_buildings = osm_all_buildings[osm_all_buildings['building_type'] == 'retail']
    residential_buildings = osm_all_buildings[osm_all_buildings['building_type'] == 'residential']

    #Subcategories of each building
    filtered_commercial_buildings = commercial_buildings[commercial_buildings.geometry.intersects(filter_geom.unary_union)]
    filtered_office_buildings = office_buildings[office_buildings.geometry.intersects(filter_geom.unary_union)]
    filtered_retail_buildings = retail_buildings[retail_buildings.geometry.intersects(filter_geom.unary_union)]
    filtered_residential_buildings = residential_buildings[residential_buildings.geometry.intersects(filter_geom.unary_union)]

    #Multipolygon of each LSOA's (OSM) buildings - category separated
    combined_multipolygon = filtered_osm_buildings.geometry.unary_union
    combined_commercial_multipolygon = filtered_commercial_buildings.geometry.unary_union
    combined_office_polygon = filtered_office_buildings.geometry.unary_union
    combined_retail_polygon = filtered_retail_buildings.geometry.unary_union
    combined_residential_polygon = filtered_residential_buildings.geometry.unary_union
    
    #All Places
    filtered_places = overture_places[overture_places.geometry.intersects(filter_geom.unary_union)]

    #Multipoint of each LSOA's places
    combined_multipoint = filtered_places.geometry.unary_union
    
    #Add place category information
    category_list = filtered_places['category'].dropna().tolist()
    

    #Add back to Dataframe
    empl_geog.at[index, 'num_buildings'] = len(filtered_osm_buildings)
    
    empl_geog['num_retail_buildings'] = len(filtered_retail_buildings)
    empl_geog['num_residential_buildings'] = len(filtered_residential_buildings)
    empl_geog['num_commercial_buildings'] = len(filtered_commercial_buildings)
    empl_geog['num_office_buildings'] = len(filtered_office_buildings)
    
    empl_geog.at[index, 'num_places'] = len(filtered_places)
    
    empl_geog.at[index, 'building_poly'] = combined_multipolygon
    empl_geog.at[index, 'commercial_building_poly'] = combined_commercial_multipolygon
    empl_geog.at[index, 'retail_building_poly'] = combined_office_polygon
    empl_geog.at[index, 'office_building_poly'] = combined_retail_polygon
    empl_geog.at[index, 'residential_building_poly'] = combined_residential_polygon

    
    empl_geog.at[index, 'place_points'] = combined_multipoint
    empl_geog.at[index, 'category_list'] = category_list
    
#Check
empl_geog.head()

0
20
40
60
80
100
120
140
160
180
200
220
240
260
280
300
320
340
360
380
400
420
440
460
480
500
520
540
560
580
600
620


Unnamed: 0,LSOA11CD,LSOA11NM_x,geometry,LSOA11NM_y,Unnamed: 2,population,Area,"01 : Crop and animal production, hunting and related service activities",02 : Forestry and logging,03 : Fishing and aquaculture,...,num_commercial_buildings,num_office_buildings,num_places,building_poly,commercial_building_poly,retail_building_poly,office_building_poly,residential_building_poly,place_points,category_list
0,E01008881,Birmingham 067A,"POLYGON ((-1.80957 52.47014, -1.80953 52.47009...",Birmingham 067A,,1599,lsoa2011:E01008881 : Birmingham 067A,0.0,0.0,0.0,...,0,1,79,"MULTIPOLYGON (((-1.8213621 52.4666609, -1.8210...",,,"MULTIPOLYGON (((-1.8122732 52.4644594, -1.8121...","MULTIPOLYGON (((-1.8207928 52.467573, -1.82066...","MULTIPOINT ((-1.8208857 52.4670509), (-1.82066...","[convenience_store, pizza_restaurant, fast_foo..."
1,E01008882,Birmingham 066A,"POLYGON ((-1.82792 52.46893, -1.82768 52.46891...",Birmingham 066A,,1747,lsoa2011:E01008882 : Birmingham 066A,0.0,0.0,0.0,...,0,1,5,"MULTIPOLYGON (((-1.8336333 52.4637367, -1.8332...",,,"MULTIPOLYGON (((-1.8332702 52.4634641, -1.8336...","MULTIPOLYGON (((-1.833377 52.4638622, -1.83337...","MULTIPOINT ((-1.8329422 52.4640839), (-1.83285...","[church_cathedral, home_security, school]"
2,E01008883,Birmingham 078A,"POLYGON ((-1.82144 52.46633, -1.82143 52.46632...",Birmingham 078A,,1816,lsoa2011:E01008883 : Birmingham 078A,0.0,0.0,0.0,...,0,1,39,"MULTIPOLYGON (((-1.8286578 52.4605087, -1.8276...",,,"MULTIPOLYGON (((-1.825245 52.4631362, -1.82474...","MULTIPOLYGON (((-1.8270907 52.4634663, -1.8270...","MULTIPOINT ((-1.8271863 52.463742), (-1.827072...","[computer_store, professional_services, sporti..."
3,E01008884,Birmingham 078B,"POLYGON ((-1.82576 52.46527, -1.82579 52.46516...",Birmingham 078B,,1870,lsoa2011:E01008884 : Birmingham 078B,0.0,0.0,0.0,...,0,1,173,"MULTIPOLYGON (((-1.8378055 52.4575343, -1.8380...",,"POLYGON ((-1.8354944 52.4612255, -1.8352535 52...","MULTIPOLYGON (((-1.8394717 52.4632192, -1.8388...","MULTIPOLYGON (((-1.8377769 52.4595681, -1.8377...","MULTIPOINT ((-1.8443173 52.4598824), (-1.84310...","[car_window_tinting, powder_coating_service, l..."
4,E01008885,Birmingham 076A,"POLYGON ((-1.80450 52.46160, -1.80414 52.46135...",Birmingham 076A,,1308,lsoa2011:E01008885 : Birmingham 076A,0.0,0.0,0.0,...,0,1,28,"MULTIPOLYGON (((-1.8092612 52.4561461, -1.8092...",,,"MULTIPOLYGON (((-1.8052836 52.4607015, -1.8052...","MULTIPOLYGON (((-1.8093482 52.4561539, -1.8092...","MULTIPOINT ((-1.8092852 52.4618985), (-1.80896...","[hotel, dry_cleaning, international_business_a..."


In [23]:
#Save to Dataframe on Desktop

#Convert multipolygons to WKT for export
empl_geog['building_poly'] = empl_geog['building_poly'].apply(lambda x: x.wkt if isinstance(x, shapely.geometry.base.BaseGeometry) else x)
empl_geog['commercial_building_poly'] = empl_geog['commercial_building_poly'].apply(lambda x: x.wkt if isinstance(x, shapely.geometry.base.BaseGeometry) else x)
empl_geog['retail_building_poly'] = empl_geog['retail_building_poly'].apply(lambda x: x.wkt if isinstance(x, shapely.geometry.base.BaseGeometry) else x)
empl_geog['office_building_poly'] = empl_geog['office_building_poly'].apply(lambda x: x.wkt if isinstance(x, shapely.geometry.base.BaseGeometry) else x)
empl_geog['residential_building_poly'] = empl_geog['residential_building_poly'].apply(lambda x: x.wkt if isinstance(x, shapely.geometry.base.BaseGeometry) else x)

empl_geog['place_points'] = empl_geog['place_points'].apply(lambda x: x.wkt if isinstance(x, shapely.geometry.base.BaseGeometry) else x)

# Convert category_list to string for export
empl_geog['category_list'] = empl_geog['category_list'].apply(lambda x: str(x) if x else None)

#Export
empl_geog.to_file("data/bham_data/empl_geog.geojson", driver="GeoJSON")

In [24]:
# Set back to Geometries for use in plot

# WKT strings back to Shapely geometries
empl_geog['building_poly'] = empl_geog['building_poly'].apply(lambda x: shapely.wkt.loads(x) if isinstance(x, str) else x)
empl_geog['commercial_building_poly'] = empl_geog['commercial_building_poly'].apply(lambda x: shapely.wkt.loads(x) if isinstance(x, str) else x)
empl_geog['retail_building_poly'] = empl_geog['retail_building_poly'].apply(lambda x: shapely.wkt.loads(x) if isinstance(x, str) else x)
empl_geog['office_building_poly'] = empl_geog['office_building_poly'].apply(lambda x: shapely.wkt.loads(x) if isinstance(x, str) else x)
empl_geog['residential_building_poly'] = empl_geog['residential_building_poly'].apply(lambda x: shapely.wkt.loads(x) if isinstance(x, str) else x)

empl_geog['place_points'] = empl_geog['place_points'].apply(lambda x: shapely.wkt.loads(x) if isinstance(x, str) else x)

In [25]:
#Get building subset - Birmingham 138A

#Code from earlier - isolate Birmingham 138A
lsoa = empl_geog[empl_geog['LSOA11NM_x'] == 'Birmingham 138A']
geom = lsoa['geometry'].iloc[0]
filter_geom = gpd.GeoSeries([geom], crs=lsoa_geo.crs)

#Get buildings
lsoa_all_buildings = gpd.GeoDataFrame(lsoa, geometry='building_poly')
lsoa_commercial_buildings = gpd.GeoDataFrame(lsoa, geometry='commercial_building_poly')
lsoa_office_buildings = gpd.GeoDataFrame(lsoa, geometry='office_building_poly')
lsoa_residential_buildings = gpd.GeoDataFrame(lsoa, geometry='residential_building_poly')
lsoa_retail_buildings = gpd.GeoDataFrame(lsoa, geometry='retail_building_poly')

# Plot OSM buildings
fig, ax = plt.subplots(figsize=(20, 20))
lsoa_all_buildings.plot(ax=ax, color='blue', edgecolor='black')
lsoa_commercial_buildings.plot(ax=ax, color='orange', edgecolor='black')
lsoa_office_buildings.plot(ax=ax, color='red', edgecolor='black')
lsoa_residential_buildings.plot(ax=ax, color='green', edgecolor='black')
lsoa_retail_buildings.plot(ax=ax, color='purple', edgecolor='black')

filter_geom.boundary.plot(ax=ax, color='red', linewidth=2)

plt.savefig('Plots/Birmingham_138A.png')

In [26]:
#Get all Birmingham

# Plot OSM buildings
fig, ax = plt.subplots(figsize=(20, 20))
osm_all_buildings.plot(ax=ax, legend=True, color='blue', edgecolor='black')
ax.set_axis_off()
plt.title('All OSM Birmingham Buildings', fontsize=16, fontweight='bold')

plt.savefig('Plots/Birmingham_All.png')

# Data Exploration

### Encoding POI Categories

In [27]:
# Encode POI categories in data - using One-Hot encoding

#Make sure category list is a list
empl_geog['category_list'] = empl_geog['category_list'].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else x)

#Explode category lists
empl_geog_exploded = empl_geog.explode('category_list')

#Set up encoder
encoder = OneHotEncoder(sparse_output=False)

#Encode
encoded_categories = encoder.fit_transform(empl_geog_exploded[['category_list']])
encoded_df = pd.DataFrame(encoded_categories, columns=encoder.get_feature_names_out(['category_list']))

empl_geog_exploded = pd.concat([empl_geog_exploded.reset_index(drop=True), encoded_df.reset_index(drop=True)], axis=1)

# Reaggregate
empl_geog_exploded.drop(columns=['category_list'], inplace=True)

# Group by LSOA11CD - sum numeric only
numeric_columns = encoded_df.columns.tolist()
empl_geog_encoded = empl_geog_exploded.groupby('LSOA11CD')[numeric_columns].sum().reset_index()

#Store category columns
category_columns = empl_geog_encoded.columns[1:]

empl_geog_encoded.head()

Unnamed: 0,LSOA11CD,category_list_3d_printing_service,category_list_abuse_and_addiction_treatment,category_list_accommodation,category_list_accountant,category_list_active_life,category_list_acupuncture,category_list_addiction_rehabilitation_center,category_list_adoption_services,category_list_adult_education,...,category_list_windshield_installation_and_repair,category_list_wine_bar,category_list_women's_clothing_store,category_list_women's_health_clinic,category_list_wood_and_pulp,category_list_woodworking_supply_store,category_list_writing_service,category_list_yoga_studio,category_list_youth_organizations,category_list_zoo
0,E01008881,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,E01008882,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,E01008883,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,E01008884,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,E01008885,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [28]:
# Geometric Features Extraction (copied from earlier but includes building types now)

#I'm going to exclude num polygons, average perimeter, and total perimeter cause they're not so helpful (cause Collinearity matrix shows its very close to count)

def extract_multipolygon_features(multipolygon, lsoa_geometry):
    if isinstance(multipolygon, MultiPolygon):
        polygons = list(multipolygon.geoms)
    elif isinstance(multipolygon, Polygon):
        polygons = [multipolygon]
    else:
        return pd.Series({
            'total_area': 0,
            'avg_building_area': 0,
            'lsoa_area_ratio': 0,
        })

    num_polygons = len(polygons)
    areas = [polygon.area for polygon in polygons]
    total_area = sum(areas)
    avg_building_area = total_area / num_polygons if num_polygons > 0 else 0

    #Built-up area ratio could result in a div by zero error if there are no buildings in an LSOA so this logic has to be included here
    try:
        lsoa_area_ratio = total_area / lsoa_geometry.area
    except ZeroDivisionError:
        lsoa_area_ratio = 0

    return pd.Series({
        'total_area': total_area,
        'lsoa_area_ratio': lsoa_area_ratio,
        'avg_building_area': avg_building_area,
    })


In [29]:
# All Buildings
all_buildings_geometry_features = empl_geog.apply(
    lambda row: extract_multipolygon_features(row['building_poly'], row['geometry']),
    axis=1
).add_prefix('all_')

# Residential
residential_buildings_geometry_features = empl_geog.apply(
    lambda row: extract_multipolygon_features(row['residential_building_poly'], row['geometry']),
    axis=1
).add_prefix('residential_')

# Commercial
commercial_buildings_geometry_features = empl_geog.apply(
    lambda row: extract_multipolygon_features(row['commercial_building_poly'], row['geometry']),
    axis=1
).add_prefix('commercial_')

# Office
office_buildings_geometry_features = empl_geog.apply(
    lambda row: extract_multipolygon_features(row['office_building_poly'], row['geometry']),
    axis=1
).add_prefix('office_')

# Retail
retail_buildings_geometry_features = empl_geog.apply(
    lambda row: extract_multipolygon_features(row['retail_building_poly'], row['geometry']),
    axis=1
).add_prefix('retail_')

# Combine all geometry features into one DataFrame
all_geom_features = pd.concat([
    all_buildings_geometry_features,
    residential_buildings_geometry_features,
    commercial_buildings_geometry_features,
    office_buildings_geometry_features,
    retail_buildings_geometry_features
], axis=1, ignore_index=False)

# Capture Column Names
geo_features = all_geom_features.columns

# Display the first 50 rows
all_geom_features.head(50)

Unnamed: 0,all_total_area,all_lsoa_area_ratio,all_avg_building_area,residential_avg_building_area,residential_lsoa_area_ratio,residential_total_area,commercial_avg_building_area,commercial_lsoa_area_ratio,commercial_total_area,office_avg_building_area,office_lsoa_area_ratio,office_total_area,retail_avg_building_area,retail_lsoa_area_ratio,retail_total_area
0,6e-06,0.122619,3.285543e-08,2.302835e-08,0.063581,3e-06,0.0,0.0,0.0,6.707288e-08,0.016603,8.719474e-07,0.0,0.0,0.0
1,6e-06,0.175438,2.692668e-08,2.244685e-08,0.076574,2e-06,0.0,0.0,0.0,1.186504e-07,0.010939,3.559512e-07,0.0,0.0,0.0
2,1.3e-05,0.244526,7.209237e-08,3.779565e-08,0.093431,5e-06,0.0,0.0,0.0,8.688264e-08,0.004995,2.606479e-07,0.0,0.0,0.0
3,2.6e-05,0.276558,1.156413e-07,5.742557e-08,0.050335,5e-06,0.0,0.0,0.0,1.090375e-07,0.018879,1.7446e-06,6.686906e-08,0.000724,6.686906e-08
4,4e-06,0.155249,1.559469e-08,1.455804e-08,0.109552,3e-06,0.0,0.0,0.0,5.695903e-08,0.00893,2.278361e-07,0.0,0.0,0.0
5,9e-06,0.15004,6.116768e-08,3.388691e-08,0.048629,3e-06,0.0,0.0,0.0,4.629078e-07,0.038622,2.314539e-06,1.727449e-07,0.008648,5.182347e-07
6,1.4e-05,0.237079,6.470391e-08,2.455759e-08,0.060392,4e-06,0.0,0.0,0.0,5.124669e-08,0.005075,3.074801e-07,1.683305e-07,0.008335,5.049915e-07
7,4e-06,0.129026,1.246474e-08,1.11736e-08,0.104359,3e-06,0.0,0.0,0.0,6.357958e-08,0.006431,1.907387e-07,0.0,0.0,0.0
8,3.2e-05,0.287574,1.111652e-07,2.86973e-08,0.04845,5e-06,0.0,0.0,0.0,1.840669e-07,0.006683,7.362675e-07,9.919489e-08,0.0009,9.919489e-08
9,8e-06,0.197336,2.713345e-08,2.083696e-08,0.113785,5e-06,0.0,0.0,0.0,2.299217e-08,0.001126,4.598433e-08,1.982874e-08,0.000486,1.982874e-08


In [30]:
# Combine Datasets

empl_geog['place_points'] = empl_geog['place_points'].apply(lambda x: shapely.wkt.loads(x) if isinstance(x, str) else x)

data_with_geom = pd.concat([empl_geog, all_geom_features], axis=1)
all_data = pd.concat([data_with_geom, empl_geog_encoded], axis=1)

all_data.head()

Unnamed: 0,LSOA11CD,LSOA11NM_x,geometry,LSOA11NM_y,Unnamed: 2,population,Area,"01 : Crop and animal production, hunting and related service activities",02 : Forestry and logging,03 : Fishing and aquaculture,...,category_list_windshield_installation_and_repair,category_list_wine_bar,category_list_women's_clothing_store,category_list_women's_health_clinic,category_list_wood_and_pulp,category_list_woodworking_supply_store,category_list_writing_service,category_list_yoga_studio,category_list_youth_organizations,category_list_zoo
0,E01008881,Birmingham 067A,"POLYGON ((-1.80957 52.47014, -1.80953 52.47009...",Birmingham 067A,,1599,lsoa2011:E01008881 : Birmingham 067A,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,E01008882,Birmingham 066A,"POLYGON ((-1.82792 52.46893, -1.82768 52.46891...",Birmingham 066A,,1747,lsoa2011:E01008882 : Birmingham 066A,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,E01008883,Birmingham 078A,"POLYGON ((-1.82144 52.46633, -1.82143 52.46632...",Birmingham 078A,,1816,lsoa2011:E01008883 : Birmingham 078A,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,E01008884,Birmingham 078B,"POLYGON ((-1.82576 52.46527, -1.82579 52.46516...",Birmingham 078B,,1870,lsoa2011:E01008884 : Birmingham 078B,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,E01008885,Birmingham 076A,"POLYGON ((-1.80450 52.46160, -1.80414 52.46135...",Birmingham 076A,,1308,lsoa2011:E01008885 : Birmingham 076A,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Test AutoML with new Data

In [33]:
# create training and testing data
features = ['num_buildings', 'num_places', 'population'] + list(geo_features) + list(category_columns)
target = 'total_employment'

# Split the dataset - 80/20 train test
X_train, X_test, y_train, y_test = train_test_split(all_data[features], all_data[target], test_size=0.2, random_state=3)

# Save results and fit
automl = AutoML(results_path="automl_results_bham/basic_building_categories/", mode='Explain')
# fit the model
automl.fit(X_train, y_train)

predictions = automl.predict(X_test)
r2 = r2_score(y_test, predictions)
rmse = np.sqrt(mean_squared_error(y_test, predictions))

print(f'R^2 Score: {r2}')
print(f'RMSE: {rmse}')

#Save results for plotting
predictions_all = automl.predict(all_data[features])
geometries = all_data.loc[all_data[target].index, 'geometry']

results_basic_building = pd.DataFrame({
    'geometry': geometries,
    'observed': all_data[target],
    'predicted': predictions_all,
})


Linear algorithm was disabled.
This model has already been fitted. You can use predict methods or select a new 'results_path' for a new 'fit()'.




R^2 Score: 0.9067948096459529
RMSE: 428.62019971385496




## Filter POIs and Collapse Categories

In [31]:
#Filter to quality places only
quality_places = overture_places[overture_places['confidence'] >= 0.6]
quality_places.head()

Unnamed: 0,name,category,confidence,geometry
0,Beacon Hill Toposcope,landmark_and_historical_building,0.96,POINT (-2.01965 52.38213)
1,Lickey Hills Country Park - Beacon Hill,park,0.67,POINT (-2.01972 52.38229)
3,Beacon Castle,park,0.67,POINT (-2.01961 52.38234)
4,Barnett Fair Unisex Hairdressing,beauty_salon,0.96,POINT (-2.00227 52.38278)
6,Rednal Tyres | Cheap Car Tyres Birmingham,automotive_repair,0.96,POINT (-2.00208 52.38210)


In [32]:
#Dealing with filter warnings for empty geometries in particular LSOAs
warnings.filterwarnings("ignore", category=FutureWarning)

# Generating a version of empl_geog that has only quality places - I'm using my data_with_geom file as a starting point
empl_geog_quality_places = data_with_geom

#Copy back quality places using logic from DataCleaning.ipynb

empl_geog_quality_places['category_list'] = None

for index, row in empl_geog.iterrows():
    geom = row['geometry']
    filter_geom = gpd.GeoSeries([geom], crs='EPSG:4326')

    #All Places
    filtered_places = quality_places[quality_places.geometry.intersects(filter_geom.unary_union)]

    #Multipoint of each LSOA's places
    combined_multipoint = filtered_places.geometry.unary_union
    
    #Add place category information
    category_list = filtered_places['category'].dropna().tolist()

    #Add back to Dataframe
    empl_geog_quality_places.at[index, 'category_list'] = category_list

empl_geog_quality_places.head()

Unnamed: 0,LSOA11CD,LSOA11NM_x,geometry,LSOA11NM_y,Unnamed: 2,population,Area,"01 : Crop and animal production, hunting and related service activities",02 : Forestry and logging,03 : Fishing and aquaculture,...,residential_total_area,commercial_avg_building_area,commercial_lsoa_area_ratio,commercial_total_area,office_avg_building_area,office_lsoa_area_ratio,office_total_area,retail_avg_building_area,retail_lsoa_area_ratio,retail_total_area
0,E01008881,Birmingham 067A,"POLYGON ((-1.80957 52.47014, -1.80953 52.47009...",Birmingham 067A,,1599,lsoa2011:E01008881 : Birmingham 067A,0.0,0.0,0.0,...,3e-06,0.0,0.0,0.0,6.707288e-08,0.016603,8.719474e-07,0.0,0.0,0.0
1,E01008882,Birmingham 066A,"POLYGON ((-1.82792 52.46893, -1.82768 52.46891...",Birmingham 066A,,1747,lsoa2011:E01008882 : Birmingham 066A,0.0,0.0,0.0,...,2e-06,0.0,0.0,0.0,1.186504e-07,0.010939,3.559512e-07,0.0,0.0,0.0
2,E01008883,Birmingham 078A,"POLYGON ((-1.82144 52.46633, -1.82143 52.46632...",Birmingham 078A,,1816,lsoa2011:E01008883 : Birmingham 078A,0.0,0.0,0.0,...,5e-06,0.0,0.0,0.0,8.688264e-08,0.004995,2.606479e-07,0.0,0.0,0.0
3,E01008884,Birmingham 078B,"POLYGON ((-1.82576 52.46527, -1.82579 52.46516...",Birmingham 078B,,1870,lsoa2011:E01008884 : Birmingham 078B,0.0,0.0,0.0,...,5e-06,0.0,0.0,0.0,1.090375e-07,0.018879,1.7446e-06,6.686906e-08,0.000724,6.686906e-08
4,E01008885,Birmingham 076A,"POLYGON ((-1.80450 52.46160, -1.80414 52.46135...",Birmingham 076A,,1308,lsoa2011:E01008885 : Birmingham 076A,0.0,0.0,0.0,...,3e-06,0.0,0.0,0.0,5.695903e-08,0.00893,2.278361e-07,0.0,0.0,0.0


In [33]:
# One-Hot Encoding - yet again

#Explode category lists
empl_geog_quality_places_exploded = empl_geog_quality_places.explode('category_list')

#Set up encoder
encoder = OneHotEncoder(sparse_output=False)

#Encode
encoded_categories_quality = encoder.fit_transform(empl_geog_quality_places_exploded[['category_list']])
encoded_df = pd.DataFrame(encoded_categories_quality, columns=encoder.get_feature_names_out(['category_list']))

empl_geog_quality_places_exploded = pd.concat([empl_geog_quality_places_exploded.reset_index(drop=True), encoded_df.reset_index(drop=True)], axis=1)

# Reaggregate
empl_geog_quality_places_exploded.drop(columns=['category_list'], inplace=True)

# Group by LSOA11CD - sum numeric only
numeric_columns = encoded_df.columns.tolist()
empl_geog_quality_places_encoded = empl_geog_quality_places_exploded.groupby('LSOA11CD')[numeric_columns].sum().reset_index()

#Save columns of interest for analysis

quality_categories = empl_geog_quality_places_encoded.columns[1:]

empl_geog_quality_places_encoded.head()


Unnamed: 0,LSOA11CD,category_list_3d_printing_service,category_list_abuse_and_addiction_treatment,category_list_accommodation,category_list_accountant,category_list_active_life,category_list_acupuncture,category_list_addiction_rehabilitation_center,category_list_adoption_services,category_list_adult_education,...,category_list_windshield_installation_and_repair,category_list_wine_bar,category_list_women's_clothing_store,category_list_women's_health_clinic,category_list_wood_and_pulp,category_list_woodworking_supply_store,category_list_writing_service,category_list_yoga_studio,category_list_youth_organizations,category_list_zoo
0,E01008881,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,E01008882,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,E01008883,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,E01008884,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,E01008885,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [34]:
#Rejoin with all data

all_data_quality_places = pd.merge(empl_geog_quality_places, empl_geog_quality_places_encoded, on='LSOA11CD')

all_data_quality_places.head()

Unnamed: 0,LSOA11CD,LSOA11NM_x,geometry,LSOA11NM_y,Unnamed: 2,population,Area,"01 : Crop and animal production, hunting and related service activities",02 : Forestry and logging,03 : Fishing and aquaculture,...,category_list_windshield_installation_and_repair,category_list_wine_bar,category_list_women's_clothing_store,category_list_women's_health_clinic,category_list_wood_and_pulp,category_list_woodworking_supply_store,category_list_writing_service,category_list_yoga_studio,category_list_youth_organizations,category_list_zoo
0,E01008881,Birmingham 067A,"POLYGON ((-1.80957 52.47014, -1.80953 52.47009...",Birmingham 067A,,1599,lsoa2011:E01008881 : Birmingham 067A,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,E01008882,Birmingham 066A,"POLYGON ((-1.82792 52.46893, -1.82768 52.46891...",Birmingham 066A,,1747,lsoa2011:E01008882 : Birmingham 066A,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,E01008883,Birmingham 078A,"POLYGON ((-1.82144 52.46633, -1.82143 52.46632...",Birmingham 078A,,1816,lsoa2011:E01008883 : Birmingham 078A,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,E01008884,Birmingham 078B,"POLYGON ((-1.82576 52.46527, -1.82579 52.46516...",Birmingham 078B,,1870,lsoa2011:E01008884 : Birmingham 078B,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,E01008885,Birmingham 076A,"POLYGON ((-1.80450 52.46160, -1.80414 52.46135...",Birmingham 076A,,1308,lsoa2011:E01008885 : Birmingham 076A,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Collapse POI Categories

In [35]:
# Collapse Categories (Also Copied from DataExploration)

# Automated Condensing of Column Types

# List all columns

all_columns = empl_geog_encoded.columns.tolist()

# I'll group them by 'suffix' to find commonalities
suffix_groups = {}

# Iterate through columns to identify suffixes
for column in all_columns:
    lower_column = column.lower()
    parts = lower_column.split('_')
    if len(parts) > 1:
        suffix = parts[-1]
        if suffix in suffix_groups:
            suffix_groups[suffix].append(column)
        else:
            suffix_groups[suffix] = [column]
    else:
        # Handle columns without suffixes
        suffix_groups['other'] = suffix_groups.get('other', []) + [column]

# Create a list of DataFrames to concatenate
concatenated_dfs = []

# Iterate through suffix groups and aggregate columns
for suffix, columns in suffix_groups.items():
    if columns:
        if len(columns) == 1:
            # Preserve col name if only one col with suffix
            new_column_name = columns[0].replace('category_list_', '')
            concatenated_dfs.append(empl_geog_encoded[columns].rename(columns={columns[0]: new_column_name}))
        else:
            # Aggregate columns with more than one column in the group
            new_column_name = f'all_{suffix}'
            concatenated_dfs.append(empl_geog_encoded[columns].sum(axis=1).rename(new_column_name))

# Concatenate all DataFrames
condensed_categories = pd.concat(concatenated_dfs, axis=1)

# Print summary information (groups only)
for suffix, columns in suffix_groups.items():
    num_columns = len(columns)
    if num_columns > 1:
        print(f'Number of {suffix.capitalize()} Categories: {num_columns}')

condensed_categories.head()

Number of Service Categories: 56
Number of Center Categories: 17
Number of Services Categories: 64
Number of Education Categories: 3
Number of Entertainment Categories: 2
Number of Agency Categories: 11
Number of Restaurant Categories: 63
Number of Lounge Categories: 2
Number of Centers Categories: 2
Number of Contractor Categories: 5
Number of Medicine Categories: 2
Number of Supplier Categories: 18
Number of Team Categories: 3
Number of Park Categories: 5
Number of Church Categories: 5
Number of Hospital Categories: 3
Number of Shelter Categories: 2
Number of Studio Categories: 6
Number of Store Categories: 76
Number of Apartments Categories: 2
Number of Manufacturer Categories: 5
Number of Range Categories: 3
Number of Architect Categories: 2
Number of Designer Categories: 3
Number of Museum Categories: 4
Number of Restoration Categories: 2
Number of School Categories: 21
Number of Facility Categories: 4
Number of House Categories: 3
Number of Rental Categories: 4
Number of Shop Cat

Unnamed: 0,LSOA11CD,all_service,abuse_and_addiction_treatment,accommodation,accountant,active_life,acupuncture,all_center,all_services,all_education,...,waterproofing,waxing,welders,wellness_program,wholesale_grocer,wildlife_sanctuary,wills_trusts_and_probate,window_washing,wood_and_pulp,zoo
0,E01008881,3.0,0.0,0.0,0.0,0.0,0.0,1.0,2.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,E01008882,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,E01008883,1.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,E01008884,6.0,0.0,0.0,0.0,1.0,0.0,1.0,17.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,E01008885,0.0,0.0,0.0,2.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [36]:
#Remove all categories that have less than five in Birmingham (arbitrary cutoff for now)

column_sums = condensed_categories.iloc[:, 1:].sum()
columns_to_drop = column_sums[column_sums < 5].index.tolist()
print(columns_to_drop)

condensed_categories = condensed_categories.drop(columns=columns_to_drop)

#Store new categories
condensed_category_columns = condensed_categories.columns[1:]

condensed_categories.head()

['abuse_and_addiction_treatment', 'agricultural_cooperatives', 'agriculture', 'airlines', 'airport_shuttles', 'airport_terminal', 'all_centers', 'allergist', 'aquarium', 'all_range', 'astrologer', 'auditorium', 'auto_upholstery', 'automobile_leasing', 'ayurveda', 'b2b_apparel', 'b2b_furniture_and_housewares', 'b2b_jewelers', 'b2b_rubber_and_plastics', 'baby_gear_and_furniture', 'banks', 'bartender', 'beer_wine_and_spirits', 'bicycle_sharing_location', 'bike_repair_maintenance', 'bookbinding', 'bookkeeper', 'boot_camp', 'brasserie', 'brokers', 'business_office_supplies_and_stationery', 'business_schools', 'campground', 'car_auction', 'car_sharing', 'car_wash', 'castle', 'all_loans', 'chimney_sweep', 'choir', 'circus', 'coach_bus', 'colonics', 'commercial_printer', 'commercial_refrigeration', 'convents_and_monasteries', 'cottage', 'cricket_ground', 'customized_merchandise', 'data_recovery', 'all_laboratories', 'diagnostic_imaging', 'distillery', 'drugstore', 'embroidery_and_crochet', 'en

Unnamed: 0,LSOA11CD,all_service,accommodation,accountant,active_life,acupuncture,all_center,all_services,all_education,all_entertainment,...,travel,travel_agents,trusts,university_housing,used_vintage_and_consignment,veterinarian,videographer,vitamins_and_supplements,warehouses,window_washing
0,E01008881,3.0,0.0,0.0,0.0,0.0,1.0,2.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,E01008882,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,E01008883,1.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0
3,E01008884,6.0,0.0,0.0,1.0,0.0,1.0,17.0,1.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,E01008885,0.0,0.0,2.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [37]:
# Join Condensed Version with data (Same Logic as in EarlyModels.ipynb and above)

all_data_cleaned = pd.merge(data_with_geom, condensed_categories, on='LSOA11CD')

all_data_cleaned.head()

Unnamed: 0,LSOA11CD,LSOA11NM_x,geometry,LSOA11NM_y,Unnamed: 2,population,Area,"01 : Crop and animal production, hunting and related service activities",02 : Forestry and logging,03 : Fishing and aquaculture,...,travel,travel_agents,trusts,university_housing,used_vintage_and_consignment,veterinarian,videographer,vitamins_and_supplements,warehouses,window_washing
0,E01008881,Birmingham 067A,"POLYGON ((-1.80957 52.47014, -1.80953 52.47009...",Birmingham 067A,,1599,lsoa2011:E01008881 : Birmingham 067A,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,E01008882,Birmingham 066A,"POLYGON ((-1.82792 52.46893, -1.82768 52.46891...",Birmingham 066A,,1747,lsoa2011:E01008882 : Birmingham 066A,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,E01008883,Birmingham 078A,"POLYGON ((-1.82144 52.46633, -1.82143 52.46632...",Birmingham 078A,,1816,lsoa2011:E01008883 : Birmingham 078A,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0
3,E01008884,Birmingham 078B,"POLYGON ((-1.82576 52.46527, -1.82579 52.46516...",Birmingham 078B,,1870,lsoa2011:E01008884 : Birmingham 078B,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,E01008885,Birmingham 076A,"POLYGON ((-1.80450 52.46160, -1.80414 52.46135...",Birmingham 076A,,1308,lsoa2011:E01008885 : Birmingham 076A,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Test Model with Collapsed and Quality/Occurrence Cleaned Categories

In [49]:

# Create training and testing data
features = ['num_buildings', 'num_places', 'population'] + list(geo_features) + list(condensed_category_columns)
target = 'total_employment'

# Split the dataset - 80/20 train test
X_train, X_test, y_train, y_test = train_test_split(all_data_cleaned[features], all_data_cleaned[target], test_size=0.2, random_state=3)

# Save results and fit
automl = AutoML(results_path="automl_results_bham/raw_employment_category_cleaned/", mode='Explain')
# fit the model
automl.fit(X_train, y_train)

predictions = automl.predict(X_test)
r2 = r2_score(y_test, predictions)
rmse = np.sqrt(mean_squared_error(y_test, predictions))

print(f'R^2 Score: {r2}')
print(f'RMSE: {rmse}')

#Save results for plotting
predictions_all = automl.predict(all_data_cleaned[features])
geometries = all_data_cleaned.loc[all_data_cleaned[target].index, 'geometry']

results_raw_employment_category_cleaned = pd.DataFrame({
    'geometry': geometries,
    'observed': all_data_cleaned[target],
    'predicted': predictions_all,
})




This model has already been fitted. You can use predict methods or select a new 'results_path' for a new 'fit()'.
R^2 Score: 0.967032184164977
RMSE: 254.91633927633055




## Target Employment Density

In [38]:
# Create a new column Employment Density and create new model to target that
all_data_cleaned['employment_density'] = all_data_cleaned['total_employment'] / (all_data_cleaned['geometry']).to_crs("EPSG:27700").area

In [55]:

# Create training and testing data
features = ['num_buildings', 'num_places', 'population'] + list(geo_features) + list(condensed_category_columns)
target = 'employment_density'

# Split the dataset - 80/20 train test
X_train, X_test, y_train, y_test = train_test_split(all_data_cleaned[features], all_data_cleaned[target], test_size=0.2, random_state=42)

# Save results and fit
automl = AutoML(results_path="automl_results_bham/employment_density_category_cleaned/", mode='Explain')
# fit the model
automl.fit(X_train, y_train)

predictions = automl.predict(X_test)
r2 = r2_score(y_test, predictions)
rmse = np.sqrt(mean_squared_error(y_test, predictions))

print(f'R^2 Score: {r2}')
print(f'RMSE: {rmse}')

#Save results for plotting
predictions_all = automl.predict(all_data_cleaned[features])
geometries = all_data_cleaned.loc[all_data_cleaned[target].index, 'geometry']

results_empl_density = pd.DataFrame({
    'geometry': geometries,
    'observed': all_data_cleaned[target],
    'predicted': predictions_all,
})



This model has already been fitted. You can use predict methods or select a new 'results_path' for a new 'fit()'.
R^2 Score: 0.33991992112381764
RMSE: 0.011419765764439736


## Target Office Worker Employment Density

In [39]:
# Use office work cols from previous analysis

office_work_cols = [
    '42 : Civil engineering',
    '58 : Publishing activities',
    '59 : Motion picture, video and television programme production, sound recording and music publishing activities',
    '60 : Programming and broadcasting activities',
    '61 : Telecommunications',
    '62 : Computer programming, consultancy and related activities',
    '63 : Information service activities',
    '64 : Financial service activities, except insurance and pension funding',
    '65 : Insurance, reinsurance and pension funding, except compulsory social security',
    '66 : Activities auxiliary to financial services and insurance activities',
    '68 : Real estate activities',
    '69 : Legal and accounting activities',
    '70 : Activities of head offices; management consultancy activities',
    '71 : Architectural and engineering activities; technical testing and analysis',
    '72 : Scientific research and development',
    '73 : Advertising and market research',
    '74 : Other professional, scientific and technical activities',
    '77 : Rental and leasing activities',
    '78 : Employment activities',
    '79 : Travel agency, tour operator and other reservation service and related activities',
    '80 : Security and investigation activities',
    '82 : Office administrative, office support and other business support activities',
    '84 : Public administration and defence; compulsory social security'
]

# Create a new Office Work Total Column :
all_data_cleaned['office_total_employment'] = all_data_cleaned[office_work_cols].sum(axis=1)

# And an Office Work Density Column
all_data_cleaned['office_employment_density'] = all_data_cleaned['office_total_employment'] / (all_data_cleaned['geometry']).to_crs("EPSG:27700").area

all_data_cleaned.head()


Unnamed: 0,LSOA11CD,LSOA11NM_x,geometry,LSOA11NM_y,Unnamed: 2,population,Area,"01 : Crop and animal production, hunting and related service activities",02 : Forestry and logging,03 : Fishing and aquaculture,...,university_housing,used_vintage_and_consignment,veterinarian,videographer,vitamins_and_supplements,warehouses,window_washing,employment_density,office_total_employment,office_employment_density
0,E01008881,Birmingham 067A,"POLYGON ((-1.80957 52.47014, -1.80953 52.47009...",Birmingham 067A,,1599,lsoa2011:E01008881 : Birmingham 067A,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.001146,35.0,8.8e-05
1,E01008882,Birmingham 066A,"POLYGON ((-1.82792 52.46893, -1.82768 52.46891...",Birmingham 066A,,1747,lsoa2011:E01008882 : Birmingham 066A,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000122,0.0,0.0
2,E01008883,Birmingham 078A,"POLYGON ((-1.82144 52.46633, -1.82143 52.46632...",Birmingham 078A,,1816,lsoa2011:E01008883 : Birmingham 078A,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.004589,650.0,0.001648
3,E01008884,Birmingham 078B,"POLYGON ((-1.82576 52.46527, -1.82579 52.46516...",Birmingham 078B,,1870,lsoa2011:E01008884 : Birmingham 078B,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.003708,485.0,0.000694
4,E01008885,Birmingham 076A,"POLYGON ((-1.80450 52.46160, -1.80414 52.46135...",Birmingham 076A,,1308,lsoa2011:E01008885 : Birmingham 076A,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000467,30.0,0.000156


In [82]:
# Run the model

# Create training and testing data
features = ['num_buildings', 'num_places', 'population'] + list(geo_features) + list(condensed_category_columns)
target = 'office_employment_density'

# Split the dataset - 80/20 train test
X_train, X_test, y_train, y_test = train_test_split(all_data_cleaned[features], all_data_cleaned[target], test_size=0.2, random_state=3)

# Save results and fit
automl = AutoML(results_path="automl_results_bham/office_employment_density_category_cleaned/", mode='Explain')
# fit the model
automl.fit(X_train, y_train)

predictions = automl.predict(X_test)
r2 = r2_score(y_test, predictions)
rmse = np.sqrt(mean_squared_error(y_test, predictions))

print(f'R^2 Score: {r2}')
print(f'RMSE: {rmse}')

#Save results for plotting
predictions_all = automl.predict(all_data_cleaned[features])
geometries = all_data_cleaned.loc[all_data_cleaned[target].index, 'geometry']

results_office_density_cleaned = pd.DataFrame({
    'geometry': geometries,
    'observed': all_data_cleaned[target],
    'predicted': predictions_all,
})



This model has already been fitted. You can use predict methods or select a new 'results_path' for a new 'fit()'.
R^2 Score: 0.8731816051982462
RMSE: 0.0006703192984289862


In [None]:
# Performance Model

# Create training and testing data
features = ['num_buildings', 'num_places', 'population'] + list(geo_features) + list(condensed_category_columns)
target = 'office_employment_density'

# Split the dataset - 80/20 train test
X_train, X_test, y_train, y_test = train_test_split(all_data_cleaned[features], all_data_cleaned[target], test_size=0.2, random_state=3)

automl = AutoML(
    results_path ='automl_results_bham/office_employment_density_category_cleaned_perform/',
    algorithms=["CatBoost", "Xgboost", "Random Forest"],
    model_time_limit=1*60,
    start_random_models=5,
    hill_climbing_steps=3,
    top_models_to_improve=3,
    golden_features=True,
    features_selection=False,
    stack_models=True,
    train_ensemble=True,
    explain_level=0,
    validation_strategy={
        "validation_type": "kfold",
        "k_folds": 4,
        "shuffle": False,
        "stratify": True,
    }
)

# fit the model
automl.fit(X_train, y_train)

predictions = automl.predict(X_test)
r2 = r2_score(y_test, predictions)
rmse = np.sqrt(mean_squared_error(y_test, predictions))

print(f'R^2 Score: {r2}')
print(f'RMSE: {rmse}')

#Save results for plotting
predictions_all = automl.predict(all_data_cleaned[features])
geometries = all_data_cleaned.loc[all_data_cleaned[target].index, 'geometry']

results_office_density_cleaned = pd.DataFrame({
    'geometry': geometries,
    'observed': all_data_cleaned[target],
    'predicted': predictions_all,
})

AutoML directory: automl_results_bham/office_employment_density_category_cleaned_perform/
The task is regression with evaluation metric rmse
AutoML will use algorithms: ['CatBoost', 'Xgboost', 'Random Forest']
AutoML will stack models
AutoML will ensemble available models
AutoML steps: ['simple_algorithms', 'default_algorithms', 'not_so_random', 'golden_features', 'hill_climbing_1', 'hill_climbing_2', 'hill_climbing_3', 'ensemble', 'stack', 'ensemble_stacked']
Skip simple_algorithms because no parameters were generated.
* Step default_algorithms will try to check up to 3 models
1_Default_Xgboost rmse 0.005276 trained in 4.93 seconds
2_Default_CatBoost rmse 0.005492 trained in 5.31 seconds
3_Default_RandomForest rmse 0.005017 trained in 6.89 seconds
* Step not_so_random will try to check up to 12 models
4_Xgboost rmse 0.004894 trained in 4.24 seconds
8_CatBoost rmse 0.005579 trained in 14.12 seconds
12_RandomForest rmse 0.004994 trained in 5.71 seconds
5_Xgboost rmse 0.005271 trained in



13_RandomForest_GoldenFeatures rmse 0.00487 trained in 29.5 seconds




4_Xgboost_GoldenFeatures rmse 0.004672 trained in 4.61 seconds




6_Xgboost_GoldenFeatures rmse 0.004983 trained in 4.65 seconds
* Step hill_climbing_1 will try to check up to 17 models




16_Xgboost_GoldenFeatures rmse 0.004666 trained in 4.77 seconds




17_Xgboost_GoldenFeatures rmse 0.00466 trained in 4.64 seconds
18_RandomForest rmse 0.004961 trained in 9.09 seconds
19_RandomForest rmse 0.004797 trained in 8.63 seconds




20_RandomForest_GoldenFeatures rmse 0.004854 trained in 7.6 seconds




21_RandomForest_GoldenFeatures rmse 0.004809 trained in 6.68 seconds
22_Xgboost rmse 0.004905 trained in 4.95 seconds
23_Xgboost rmse 0.004872 trained in 4.32 seconds




24_Xgboost_GoldenFeatures rmse 0.004988 trained in 4.56 seconds




25_Xgboost_GoldenFeatures rmse 0.004988 trained in 4.26 seconds
26_RandomForest rmse 0.00497 trained in 6.2 seconds
27_CatBoost rmse 0.005373 trained in 4.98 seconds
28_CatBoost rmse 0.005351 trained in 6.95 seconds
29_CatBoost rmse 0.005414 trained in 5.98 seconds
30_CatBoost rmse 0.005551 trained in 7.14 seconds
31_CatBoost rmse 0.005441 trained in 5.36 seconds
32_CatBoost rmse 0.005437 trained in 6.19 seconds
* Step hill_climbing_2 will try to check up to 7 models




## Plots

In [40]:
# Birmingham Residual Plots

# Basic Model

results_basic_building = gpd.GeoDataFrame(results_basic_building, geometry='geometry')
results_basic_building['residual'] = results_basic_building['observed'] - results_basic_building['predicted']

fig, ax = plt.subplots(1, 1, figsize=(10, 10))
results_basic_building.plot(ax=ax, column='residual', legend=True, cmap='viridis')
plt.title('Residuals - Birmingham Basic Model')
ax.set_axis_off()
plt.savefig('Plots/from_code/residuals_bham/basic_model_residuals.png')
plt.close(fig)


# Category Cleaned

results_raw_employment_category_cleaned = gpd.GeoDataFrame(results_raw_employment_category_cleaned, geometry='geometry')
results_raw_employment_category_cleaned['residual'] = results_raw_employment_category_cleaned['observed'] - results_raw_employment_category_cleaned['predicted']

fig, ax = plt.subplots(1, 1, figsize=(10, 10))
results_raw_employment_category_cleaned.plot(ax=ax, column='residual', legend=True, cmap='viridis')
plt.title('Residuals - Birmingham Category Cleaned Model')
ax.set_axis_off()
plt.savefig('Plots/from_code/residuals_bham/category_cleaned_model_residuals.png')
plt.close(fig)


# Category Cleaned Employment Density

results_empl_density = gpd.GeoDataFrame(results_empl_density, geometry='geometry')
results_empl_density['residual'] = results_empl_density['observed'] - results_empl_density['predicted']

fig, ax = plt.subplots(1, 1, figsize=(10, 10))
results_empl_density.plot(ax=ax, column='residual', legend=True, cmap='viridis')
plt.title('Residuals - Birmingham Employment Density Model')
ax.set_axis_off()
plt.savefig('Plots/from_code/residuals_bham/density_model_residuals.png')
plt.close(fig)

# Category Cleaned Employment Density Office Employment

results_office_density_cleaned = gpd.GeoDataFrame(results_office_density_cleaned, geometry='geometry')
results_office_density_cleaned['residual'] = results_office_density_cleaned['observed'] - results_office_density_cleaned['predicted']

fig, ax = plt.subplots(1, 1, figsize=(10, 10))
results_office_density_cleaned.plot(ax=ax, column='residual', legend=True, cmap='viridis')
plt.title('Residuals - Birmingham Office Employment Density Model')
ax.set_axis_off()
plt.savefig('Plots/from_code/residuals_bham/office_density_model_residuals.png')
plt.close(fig)

NameError: name 'results_basic_building' is not defined

In [80]:
# General Data Plots (Not from Model)

# Employment Count
fig, ax = plt.subplots(1, 1, figsize=(10, 10))
all_data_cleaned.plot(ax=ax, column='total_employment', legend=True, cmap='viridis')
plt.title('Employment Count Map of Birmingham LSOAs')
ax.set_axis_off()
plt.title('Employment Count Map of Birmingham LSOAs')
plt.savefig('Plots/from_code/stats_bham/employment_count_map_bham.png')
plt.close(fig)

# Employment Density
fig, ax = plt.subplots(1, 1, figsize=(10, 10))
all_data_cleaned.plot(ax=ax, column='employment_density', legend=True, cmap='viridis')
plt.title('Employment Density Map of Birmingham LSOAs')
ax.set_axis_off()
plt.title('Employment Density Map of Birmingham LSOAs')
plt.savefig('Plots/from_code/stats_bham/employment_density_map_bham.png')
plt.close(fig)

# Office Employment Density
fig, ax = plt.subplots(1, 1, figsize=(10, 10))
all_data_cleaned.plot(ax=ax, column='office_employment_density', legend=True, cmap='viridis')
plt.title('Office Employment Density Map of Birmingham LSOAs')
ax.set_axis_off()
plt.title('Office Employment Density Map of Birmingham LSOAs')
plt.savefig('Plots/from_code/stats_bham/office_employment_density_map_bham.png')
plt.close(fig)


# Overture POI Density
all_data_cleaned['poi_density'] = all_data_cleaned['num_places'] / (all_data_cleaned['geometry']).to_crs("EPSG:27700").area

fig, ax = plt.subplots(1, 1, figsize=(10, 10))
all_data_cleaned.plot(ax=ax, column='poi_density', legend=True, cmap='viridis')
plt.title('Overture POI Density Map of Birmingham LSOAs')
ax.set_axis_off()
plt.title('Overture POI Density Map of Birmingham LSOAs')
plt.savefig('Plots/from_code/stats_bham/poi_density_map_bham.png')
plt.close(fig)

fig, ax = plt.subplots(1, 1, figsize=(10, 10))
all_data_cleaned.plot(ax=ax, column='num_places', legend=True, cmap='viridis')
plt.title('Overture POI Count Map of Birmingham LSOAs')
ax.set_axis_off()
plt.title('Overture POI Count Map of Birmingham LSOAs')
plt.savefig('Plots/from_code/stats_bham/poi_count_map_bham.png')
plt.close(fig)
