# Data Joining and Cleaning
### Gavin Rolls

This script takes the Overture and OSM data queried from DataQuerying.ipynb as well as downloaded BRES and LSOA-level data and joins it 

## Library Imports

In [3]:
#Basics
import pandas as pd
import geopandas as gpd
import numpy as np

#Shapely
from geopy.geocoders import Nominatim
from shapely.geometry import box
from shapely import wkt
import shapely.geometry
from shapely.geometry import Polygon, MultiPolygon
from shapely.geometry import mapping
from shapely.geometry import shape

#Plots
import matplotlib.pyplot as plt

# Machine Learning
from sklearn.preprocessing import OneHotEncoder
import ast

#Warning Supression
import warnings

## UK BRES Employment Data

### London

I created a separate CSV for London employment data because it was simply easier to do outside of code (harder to filter cause it's done by borough name)

In [128]:
#Skip the first six rows because they're header information
empl_data_london = pd.read_csv('data/employment_data/lsoa_by_industry_london.csv', skiprows=7, delimiter=',')

unnamed_cols_london = empl_data_london.columns[empl_data_london.columns.str.contains('^Unnamed:')]
empl_data_london.drop(columns=unnamed_cols_london, inplace=True)

#Separate name into LSOA11CD and LSOA11NM
def split_column(value):
    #Keep Greater London stats
    if value.startswith('gor:'):
        return value, value
        #Split into name and code
    else:
        parts = value.split('lsoa2011:')[1]
        code, name = parts.split(' : ')
        return code.strip(), name.strip()
        return code, name

empl_data_london[['LSOA11CD', 'LSOA11NM']] = empl_data_london['Area'].apply(lambda x: pd.Series(split_column(x)))

print("Num Rows (LSOAs) Before Cleaning: " + str(empl_data_london.shape[0]))

#There appear to be a bunch of duplicates so I'm going to get rid of them now
empl_data_london.drop_duplicates(inplace=True)

print("Num Rows (LSOAs) After Cleaning: " + str(empl_data_london.shape[0]))

Num Rows (LSOAs) Before Cleaning: 9478
Num Rows (LSOAs) After Cleaning: 4836


### Birmingham

In [130]:
# Get rid of value set on copy of slice warning
warnings.filterwarnings('ignore')

#Skip the first six rows because they're header information
empl_data_bham = pd.read_csv('data/employment_data/lsoa_by_industry.csv', skiprows=7, delimiter=',')

unnamed_cols_bham = empl_data_bham.columns[empl_data_bham.columns.str.contains('^Unnamed:')]
empl_data_bham.drop(columns=unnamed_cols_bham, inplace=True)

#Separate name into LSOA11CD and LSOA11NM
def split_column(value):
    if isinstance(value, str) and 'lsoa2011:' in value:
        parts = value.split('lsoa2011:')[1]
        code, name = parts.split(' : ')
        return code.strip(), name.strip()
    else:
        return None, None

empl_data_bham[['LSOA11CD', 'LSOA11NM']] = empl_data_bham['Area'].apply(lambda x: pd.Series(split_column(x)))

# Drop rows not in Birmingham
empl_data_bham.dropna(subset=['LSOA11NM'], inplace=True)
empl_data_bham = empl_data_bham[empl_data_bham['LSOA11NM'].str.contains('Birmingham')]

print("Num Rows (LSOAs) Before Cleaning: " + str(empl_data_bham.shape[0]))

#There appear to be a bunch of duplicates so I'm going to get rid of them now
empl_data_bham.drop_duplicates(inplace=True)

print("Num Rows (LSOAs) After Cleaning: " + str(empl_data_bham.shape[0]))

Num Rows (LSOAs) Before Cleaning: 639
Num Rows (LSOAs) After Cleaning: 639


In [132]:
# Get rid of value set on copy of slice warning
warnings.filterwarnings('ignore')

# Create Total Employment and Log Total Employment Column for London
empl_data_london[empl_data_london.columns[1:-2]] = empl_data_london[empl_data_london.columns[1:-2]].apply(pd.to_numeric, errors='coerce')
empl_data_london['total_employment'] = empl_data_london[empl_data_london.columns[1:-2]].sum(axis=1)
empl_data_london['log_total_employment'] = np.log(empl_data_london['total_employment'].replace(0, np.nan))

empl_data_london.head()

Unnamed: 0,Area,"01 : Crop and animal production, hunting and related service activities",02 : Forestry and logging,03 : Fishing and aquaculture,05 : Mining of coal and lignite,06 : Extraction of crude petroleum and natural gas,07 : Mining of metal ores,08 : Other mining and quarrying,09 : Mining support service activities,10 : Manufacture of food products,...,94 : Activities of membership organisations,95 : Repair of computers and personal and household goods,96 : Other personal service activities,97 : Activities of households as employers of domestic personnel,98 : Undifferentiated goods- and services-producing activities of private households for own use,99 : Activities of extraterritorial organisations and bodies,LSOA11CD,LSOA11NM,total_employment,log_total_employment
0,gor:London,1250,1250,400,0,1500,0,450,350,32000,...,57000,17000,62000,0,0,0,gor:London,gor:London,5605320,15.539227
1,lsoa2011:E01000907 : Camden 001A,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,E01000907,Camden 001A,115,4.744932
2,lsoa2011:E01000908 : Camden 001B,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,E01000908,Camden 001B,180,5.192957
3,lsoa2011:E01000909 : Camden 001C,0,0,0,0,0,0,0,0,0,...,0,0,5,0,0,0,E01000909,Camden 001C,200,5.298317
4,lsoa2011:E01000912 : Camden 001D,0,0,0,0,0,0,0,0,0,...,0,0,5,0,0,0,E01000912,Camden 001D,825,6.715383


In [134]:
# Get rid of value set on copy of slice warning
warnings.filterwarnings('ignore')

# Create Total Employment and Log Total Employment Column for Birmingham
empl_data_bham[empl_data_bham.columns[1:-2]] = empl_data_bham[empl_data_bham.columns[1:-2]].apply(pd.to_numeric, errors='coerce')
empl_data_bham['total_employment'] = empl_data_bham[empl_data_bham.columns[1:-2]].sum(axis=1)
empl_data_bham['log_total_employment'] = np.log(empl_data_bham['total_employment'].replace(0, np.nan))

empl_data_bham.head()

Unnamed: 0,Area,"01 : Crop and animal production, hunting and related service activities",02 : Forestry and logging,03 : Fishing and aquaculture,05 : Mining of coal and lignite,06 : Extraction of crude petroleum and natural gas,07 : Mining of metal ores,08 : Other mining and quarrying,09 : Mining support service activities,10 : Manufacture of food products,...,94 : Activities of membership organisations,95 : Repair of computers and personal and household goods,96 : Other personal service activities,97 : Activities of households as employers of domestic personnel,98 : Undifferentiated goods- and services-producing activities of private households for own use,99 : Activities of extraterritorial organisations and bodies,LSOA11CD,LSOA11NM,total_employment,log_total_employment
13485,lsoa2011:E01008881 : Birmingham 067A,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,5.0,0.0,0.0,0.0,E01008881,Birmingham 067A,455.0,6.120297
13486,lsoa2011:E01008882 : Birmingham 066A,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,E01008882,Birmingham 066A,30.0,3.401197
13487,lsoa2011:E01008883 : Birmingham 078A,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,150.0,...,20.0,0.0,0.0,0.0,0.0,0.0,E01008883,Birmingham 078A,1810.0,7.501082
13488,lsoa2011:E01008884 : Birmingham 078B,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,30.0,...,0.0,5.0,40.0,0.0,0.0,0.0,E01008884,Birmingham 078B,2590.0,7.859413
13489,lsoa2011:E01008885 : Birmingham 076A,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,15.0,0.0,0.0,0.0,E01008885,Birmingham 076A,90.0,4.49981


## LSOA Geographic Data

In [37]:
#Get LSOA Shapefile Data
lsoa_geo = gpd.read_file('data/lsoa_data/LSOA_2011_EW_BFE_V3.shp')

#Convert to WGS for consistency
lsoa_geo = lsoa_geo.to_crs(epsg=4326)

print("Num Rows (LSOAs): " + str(lsoa_geo.shape[0]))

#Get rid of columns I'm not using for now
lsoa_geo = lsoa_geo.drop(columns=['BNG_E', 'BNG_N', 'LONG_', 'LAT', 'GlobalID', 'Shape_Leng'])

#Join with population
lsoa_pop = pd.read_csv('data/lsoa_data/lsoa_pop.csv')

lsoa_geo.drop(columns=['LSOA11NM'])

#Get population estimates - had to be pulled in from separate Census dataset
lsoa_geo = lsoa_geo.merge(lsoa_pop, on='LSOA11CD')

lsoa_geo.head()

Num Rows (LSOAs): 34753


Unnamed: 0,LSOA11CD,LSOA11NM_x,geometry,LSOA11NM_y,Unnamed: 2,population
0,E01000034,Barking and Dagenham 003A,"POLYGON ((0.17380 51.56013, 0.17379 51.56012, ...",Barking and Dagenham 003A,,1444
1,E01000035,Barking and Dagenham 010A,"POLYGON ((0.16598 51.55250, 0.16601 51.55248, ...",Barking and Dagenham 010A,,1610
2,E01000036,Barking and Dagenham 010B,"POLYGON ((0.15965 51.54776, 0.15974 51.54771, ...",Barking and Dagenham 010B,,1353
3,E01000037,Barking and Dagenham 003B,"POLYGON ((0.18508 51.56480, 0.18476 51.56454, ...",Barking and Dagenham 003B,,1688
4,E01000038,Barking and Dagenham 003C,"POLYGON ((0.17599 51.56476, 0.17601 51.56475, ...",Barking and Dagenham 003C,,1447


## Overture Places Data

### London

In [32]:
london_places = gpd.read_file('data/overture_data/london_places.geojson')

print('Number of POIs in Overture (London): ' + str(london_places.shape[0]))
london_places.head()

Number of POIs in Overture (London): 343712


Unnamed: 0,name,category,confidence,geometry
0,The Heating Professionals,hvac_services,0.77,POINT (-0.50991 51.28709)
1,Hub Property Care,plumbing,0.77,POINT (-0.50991 51.28709)
2,F & S,plumbing,0.72,POINT (-0.50911 51.28766)
3,Hookins Accountants,accountant,0.75,POINT (-0.50881 51.28878)
4,Muddy Paws Grooming Service,pet_groomer,0.55,POINT (-0.50741 51.28859)


### Birmingham

In [33]:
bham_places = gpd.read_file('data/overture_data/bham_places.geojson')

print('Number of POIs in Overture (Birmingham): ' + str(bham_places.shape[0]))
bham_places.head()

Number of POIs in Overture (Birmingham): 46781


Unnamed: 0,name,category,confidence,geometry
0,Beacon Hill Toposcope,landmark_and_historical_building,0.96,POINT (-2.01965 52.38213)
1,Lickey Hills Country Park - Beacon Hill,park,0.67,POINT (-2.01972 52.38229)
2,Lickey Monument,monument,0.55,POINT (-2.01956 52.38227)
3,Beacon Castle,park,0.67,POINT (-2.01961 52.38234)
4,Barnett Fair Unisex Hairdressing,beauty_salon,0.96,POINT (-2.00227 52.38278)


### OpenStreetMap Buildings Data

I will query each set of building types and then collapse the dataset down so that tags are preserved

### London

In [54]:
london_all_buildings = gpd.read_file('data/osm_data/all_london_commercial_buildings.geojson')

print('Number of Buildings in OSM (London): ' + str(london_all_buildings.shape[0]))
london_all_buildings.head()

Number of Buildings in OSM (London): 5619


Unnamed: 0,element_type,osmid,name,geometry
0,way,4076571,Bromley Road Retail Park,"POLYGON ((-0.02063 51.43669, -0.02047 51.43631..."
1,way,4076587,Catford Delivery Office,"POLYGON ((-0.02001 51.43859, -0.02004 51.43865..."
2,way,5790535,Refuge House,"POLYGON ((-0.07800 51.65301, -0.07777 51.65303..."
3,way,5986754,One Canada Square,"POLYGON ((-0.02002 51.50477, -0.01996 51.50500..."
4,way,5987126,,"POLYGON ((-0.02588 51.50520, -0.02596 51.50531..."


In [57]:
# Commercial Buildings

london_commercial_buildings = gpd.read_file('data/osm_data/all_london_commercial_buildings.geojson')

print('Number of Commercial Buildings in OSM (London): ' + str(london_commercial_buildings.shape[0]))
london_commercial_buildings.head()

Number of Commercial Buildings in OSM (London): 5619


Unnamed: 0,element_type,osmid,name,geometry
0,way,4076571,Bromley Road Retail Park,"POLYGON ((-0.02063 51.43669, -0.02047 51.43631..."
1,way,4076587,Catford Delivery Office,"POLYGON ((-0.02001 51.43859, -0.02004 51.43865..."
2,way,5790535,Refuge House,"POLYGON ((-0.07800 51.65301, -0.07777 51.65303..."
3,way,5986754,One Canada Square,"POLYGON ((-0.02002 51.50477, -0.01996 51.50500..."
4,way,5987126,,"POLYGON ((-0.02588 51.50520, -0.02596 51.50531..."


In [59]:
# Office Buildings

london_office_buildings = gpd.read_file('data/osm_data/all_london_office_buildings.geojson')

print('Number of Office Buildings in OSM (London): ' + str(london_office_buildings.shape[0]))
london_office_buildings.head()

Number of Office Buildings in OSM (London): 1910


Unnamed: 0,element_type,osmid,name,geometry
0,way,4237383,ITN,"POLYGON ((-0.11445 51.52304, -0.11478 51.52344..."
1,way,5251705,,"POLYGON ((-0.23366 51.40761, -0.23281 51.40822..."
2,way,5986805,HSBC UK,"POLYGON ((-0.01792 51.50522, -0.01794 51.50525..."
3,way,5986920,KPMG,"POLYGON ((-0.01680 51.50508, -0.01663 51.50569..."
4,way,5986929,Barclays,"POLYGON ((-0.01494 51.50486, -0.01493 51.50490..."


In [61]:
# Residential Buildings

london_residential_buildings = gpd.read_file('data/osm_data/all_london_residential_buildings.geojson')

print('Number of Residential Buildings in OSM (London): ' + str(london_residential_buildings.shape[0]))
london_residential_buildings.head()

Number of Residential Buildings in OSM (London): 83828


Unnamed: 0,element_type,osmid,name,geometry
0,way,4948114,Kemble House,"POLYGON ((-0.10656 51.46576, -0.10645 51.46582..."
1,way,4948116,Kettleby House,"POLYGON ((-0.10571 51.46618, -0.10559 51.46624..."
2,way,4948118,Woolley House,"POLYGON ((-0.10486 51.46659, -0.10473 51.46665..."
3,way,5131193,,"POLYGON ((-0.19992 51.46981, -0.19992 51.46987..."
4,way,5131194,,"POLYGON ((-0.19955 51.46940, -0.19955 51.46980..."


In [62]:
# Retail Buildings

london_retail_buildings = gpd.read_file('data/osm_data/all_london_retail_buildings.geojson')

print('Number of Retail Buildings in OSM (London): ' + str(london_retail_buildings.shape[0]))
london_retail_buildings.head()

Number of Retail Buildings in OSM (London): 11719


Unnamed: 0,element_type,osmid,name,geometry
0,way,4270351,,"POLYGON ((-0.03794 51.63273, -0.03778 51.63280..."
1,way,4425414,,"POLYGON ((-0.21897 51.57513, -0.21821 51.57486..."
2,way,4425469,,"POLYGON ((-0.23039 51.57118, -0.22974 51.57071..."
3,way,4770296,Kingston Rowing Club / Tiffin School Boat Club,"POLYGON ((-0.30581 51.41876, -0.30566 51.41905..."
4,way,4775619,St Richard's Court,"POLYGON ((-0.32066 51.43665, -0.32049 51.43647..."


In [72]:
# Assign a building type in the main DataFrame if the osmid is found in one of the other DataFrames

# I know that office and retail buildings could have commercial or office but office and retail will overwrite commercial
# (and are thus checked after) because they're more specific

london_all_buildings['building_type'] = 'none'

london_all_buildings.loc[london_all_buildings['osmid'].isin(london_commercial_buildings['osmid']), 'building_type'] = 'commercial'
london_all_buildings.loc[london_all_buildings['osmid'].isin(london_retail_buildings['osmid']), 'building_type'] = 'retail'
london_all_buildings.loc[london_all_buildings['osmid'].isin(london_office_buildings['osmid']), 'building_type'] = 'office'
london_all_buildings.loc[london_all_buildings['osmid'].isin(london_residential_buildings['osmid']), 'building_type'] = 'residential'

london_all_buildings.head(50)

Unnamed: 0,element_type,osmid,name,geometry,building_type
0,way,4076571,Bromley Road Retail Park,"POLYGON ((-0.02063 51.43669, -0.02047 51.43631...",commercial
1,way,4076587,Catford Delivery Office,"POLYGON ((-0.02001 51.43859, -0.02004 51.43865...",commercial
2,way,5790535,Refuge House,"POLYGON ((-0.07800 51.65301, -0.07777 51.65303...",commercial
3,way,5986754,One Canada Square,"POLYGON ((-0.02002 51.50477, -0.01996 51.50500...",commercial
4,way,5987126,,"POLYGON ((-0.02588 51.50520, -0.02596 51.50531...",commercial
5,way,6065001,Costco,"POLYGON ((-0.03228 51.60995, -0.03218 51.60994...",commercial
6,way,7732788,Windsor House,"POLYGON ((-0.13505 51.49795, -0.13533 51.49792...",commercial
7,way,8076474,BNP Paribas,"POLYGON ((-0.16446 51.52331, -0.16451 51.52330...",commercial
8,way,12349637,,"POLYGON ((-0.02384 51.67477, -0.02281 51.67472...",commercial
9,way,14397579,Lawn House,"POLYGON ((-0.22530 51.50517, -0.22502 51.50513...",commercial


### Birmingham

In [74]:
bham_all_buildings = gpd.read_file('data/osm_data/bham_buildings.geojson')

print('Number of Buildings in OSM (Birmingham): ' + str(bham_all_buildings.shape[0]))
bham_all_buildings.head()

Number of Buildings in OSM (Birmingham): 256279


Unnamed: 0,element_type,osmid,name,geometry
0,way,16966777,,"POLYGON ((-1.82461 52.55549, -1.82450 52.55543..."
1,way,16966784,,"POLYGON ((-1.82324 52.55438, -1.82301 52.55434..."
2,way,16966785,,"POLYGON ((-1.82273 52.55403, -1.82264 52.55400..."
3,way,16966788,,"POLYGON ((-1.82341 52.55414, -1.82338 52.55412..."
4,way,16966789,,"POLYGON ((-1.82311 52.55396, -1.82302 52.55391..."


In [75]:
# Commercial Buildings

bham_commercial_buildings = gpd.read_file('data/osm_data/bham_commercial_buildings.geojson')

print('Number of Commercial Buildings in OSM (Birmingham): ' + str(bham_commercial_buildings.shape[0]))
bham_commercial_buildings.head()

Number of Commercial Buildings in OSM (Birmingham): 317


Unnamed: 0,element_type,osmid,name,geometry
0,way,17405658,,"POLYGON ((-1.82146 52.56186, -1.82179 52.56209..."
1,way,20002347,Dental Care Partnership,"POLYGON ((-1.82680 52.55679, -1.82680 52.55680..."
2,way,22763584,Station House,"POLYGON ((-1.82027 52.56533, -1.82036 52.56528..."
3,way,22763760,,"POLYGON ((-1.81624 52.56247, -1.81633 52.56237..."
4,way,22763952,,"POLYGON ((-1.82061 52.56543, -1.82042 52.56531..."


In [76]:
# Office Buildings

bham_office_buildings = gpd.read_file('data/osm_data/bham_office_buildings.geojson')

print('Number of Office Buildings in OSM (Birmingham): ' + str(bham_office_buildings.shape[0]))
bham_office_buildings.head()

Number of Office Buildings in OSM (Birmingham): 195


Unnamed: 0,element_type,osmid,name,geometry
0,way,22820818,Baskerville House,"POLYGON ((-1.90809 52.48018, -1.90735 52.48039..."
1,way,28292471,No. 1 Colmore Square,"POLYGON ((-1.89668 52.48235, -1.89675 52.48258..."
2,way,28425056,McLaren Building,"POLYGON ((-1.89265 52.48192, -1.89247 52.48183..."
3,way,28455010,Gazette Buildings,"POLYGON ((-1.89313 52.48261, -1.89339 52.48236..."
4,way,28514237,,"POLYGON ((-1.90172 52.47568, -1.90125 52.47528..."


In [77]:
# Residential Buildings

bham_residential_buildings = gpd.read_file('data/osm_data/bham_residential_buildings.geojson')

print('Number of Residential Buildings in OSM (Birmingham): ' + str(bham_residential_buildings.shape[0]))
bham_residential_buildings.head()

Number of Residential Buildings in OSM (Birmingham): 189382


Unnamed: 0,element_type,osmid,name,geometry
0,way,16966777,,"POLYGON ((-1.82461 52.55549, -1.82450 52.55543..."
1,way,16966784,,"POLYGON ((-1.82324 52.55438, -1.82301 52.55434..."
2,way,16966785,,"POLYGON ((-1.82273 52.55403, -1.82264 52.55400..."
3,way,16966788,,"POLYGON ((-1.82341 52.55414, -1.82338 52.55412..."
4,way,16966789,,"POLYGON ((-1.82311 52.55396, -1.82302 52.55391..."


In [78]:
# Retail Buildings

bham_retail_buildings = gpd.read_file('data/osm_data/bham_retail_buildings.geojson')

print('Number of Retail Buildings in OSM (Birmingham): ' + str(bham_retail_buildings.shape[0]))
bham_retail_buildings.head()

Number of Retail Buildings in OSM (Birmingham): 3504


Unnamed: 0,element_type,osmid,name,geometry
0,way,16969083,Boots,"POLYGON ((-1.82369 52.55768, -1.82379 52.55771..."
1,way,17273169,,"POLYGON ((-1.89325 52.47748, -1.89313 52.47745..."
2,way,20002348,Vesey Manor,"POLYGON ((-1.82665 52.55648, -1.82665 52.55630..."
3,way,20002352,Driffold Gallery,"POLYGON ((-1.82722 52.55565, -1.82734 52.55554..."
4,way,23585174,One Stop,"POLYGON ((-1.90532 52.51745, -1.90573 52.51726..."


In [79]:
# Assign a building type in the main DataFrame if the osmid is found in one of the other DataFrames

# I know that office and retail buildings could have commercial or office but office and retail will overwrite commercial
# (and are thus checked after) because they're more specific

bham_all_buildings['building_type'] = 'none'

bham_all_buildings.loc[bham_all_buildings['osmid'].isin(bham_commercial_buildings['osmid']), 'building_type'] = 'commercial'
bham_all_buildings.loc[bham_all_buildings['osmid'].isin(bham_retail_buildings['osmid']), 'building_type'] = 'retail'
bham_all_buildings.loc[bham_all_buildings['osmid'].isin(bham_office_buildings['osmid']), 'building_type'] = 'office'
bham_all_buildings.loc[bham_all_buildings['osmid'].isin(bham_residential_buildings['osmid']), 'building_type'] = 'residential'

bham_all_buildings.head(50)

Unnamed: 0,element_type,osmid,name,geometry,building_type
0,way,16966777,,"POLYGON ((-1.82461 52.55549, -1.82450 52.55543...",residential
1,way,16966784,,"POLYGON ((-1.82324 52.55438, -1.82301 52.55434...",residential
2,way,16966785,,"POLYGON ((-1.82273 52.55403, -1.82264 52.55400...",residential
3,way,16966788,,"POLYGON ((-1.82341 52.55414, -1.82338 52.55412...",residential
4,way,16966789,,"POLYGON ((-1.82311 52.55396, -1.82302 52.55391...",residential
5,way,16966791,,"POLYGON ((-1.82282 52.55433, -1.82266 52.55431...",residential
6,way,16966794,Royal (Proposed),"POLYGON ((-1.82619 52.55661, -1.82611 52.55650...",none
7,way,16966799,,"POLYGON ((-1.82425 52.55803, -1.82413 52.55804...",residential
8,way,16966800,James Preston Health Centre,"POLYGON ((-1.82348 52.55799, -1.82331 52.55823...",none
9,way,16966807,,"POLYGON ((-1.82428 52.55791, -1.82431 52.55807...",residential


## Data Joining

### Joining LSOA Geography and Employment Data

### London

In [138]:
#Note that total London statistics are dropped here because there's no equivalent column in lsoa_geo
empl_geog_london = pd.merge(lsoa_geo, empl_data_london, on = "LSOA11CD")

print("Num Rows (LSOAS): " + str(empl_geog_london.shape[0]))

empl_geog_london.head()

Num Rows (LSOAS): 4835


Unnamed: 0,LSOA11CD,LSOA11NM_x,geometry,LSOA11NM_y,Unnamed: 2,population,Area,"01 : Crop and animal production, hunting and related service activities",02 : Forestry and logging,03 : Fishing and aquaculture,...,93 : Sports activities and amusement and recreation activities,94 : Activities of membership organisations,95 : Repair of computers and personal and household goods,96 : Other personal service activities,97 : Activities of households as employers of domestic personnel,98 : Undifferentiated goods- and services-producing activities of private households for own use,99 : Activities of extraterritorial organisations and bodies,LSOA11NM,total_employment,log_total_employment
0,E01000034,Barking and Dagenham 003A,"POLYGON ((0.17380 51.56013, 0.17379 51.56012, ...",Barking and Dagenham 003A,,1444,lsoa2011:E01000034 : Barking and Dagenham 003A,0,0,0,...,0,0,0,10,0,0,0,Barking and Dagenham 003A,710,6.565265
1,E01000035,Barking and Dagenham 010A,"POLYGON ((0.16598 51.55250, 0.16601 51.55248, ...",Barking and Dagenham 010A,,1610,lsoa2011:E01000035 : Barking and Dagenham 010A,0,0,0,...,150,0,0,25,0,0,0,Barking and Dagenham 010A,4255,8.35585
2,E01000036,Barking and Dagenham 010B,"POLYGON ((0.15965 51.54776, 0.15974 51.54771, ...",Barking and Dagenham 010B,,1353,lsoa2011:E01000036 : Barking and Dagenham 010B,0,0,0,...,0,10,0,0,0,0,0,Barking and Dagenham 010B,320,5.768321
3,E01000037,Barking and Dagenham 003B,"POLYGON ((0.18508 51.56480, 0.18476 51.56454, ...",Barking and Dagenham 003B,,1688,lsoa2011:E01000037 : Barking and Dagenham 003B,0,0,0,...,0,0,0,0,0,0,0,Barking and Dagenham 003B,205,5.32301
4,E01000038,Barking and Dagenham 003C,"POLYGON ((0.17599 51.56476, 0.17601 51.56475, ...",Barking and Dagenham 003C,,1447,lsoa2011:E01000038 : Barking and Dagenham 003C,0,0,0,...,0,0,0,10,0,0,0,Barking and Dagenham 003C,295,5.686975


### Birmingham

In [92]:
#Note that total Birmingham statistics are dropped here because there's no equivalent column in lsoa_geo
empl_geog_bham = pd.merge(lsoa_geo, empl_data_bham, on = "LSOA11CD")

print("Num Rows (LSOAS): " + str(empl_geog_bham.shape[0]))

empl_geog_bham.head()

Num Rows (LSOAS): 639


Unnamed: 0,LSOA11CD,LSOA11NM_x,geometry,LSOA11NM_y,Unnamed: 2,population,Area,"01 : Crop and animal production, hunting and related service activities",02 : Forestry and logging,03 : Fishing and aquaculture,...,93 : Sports activities and amusement and recreation activities,94 : Activities of membership organisations,95 : Repair of computers and personal and household goods,96 : Other personal service activities,97 : Activities of households as employers of domestic personnel,98 : Undifferentiated goods- and services-producing activities of private households for own use,99 : Activities of extraterritorial organisations and bodies,LSOA11NM,total_employment,log_total_employment
0,E01008881,Birmingham 067A,"POLYGON ((-1.80957 52.47014, -1.80953 52.47009...",Birmingham 067A,,1599,lsoa2011:E01008881 : Birmingham 067A,0.0,0.0,0.0,...,0.0,0.0,0.0,5.0,0.0,0.0,0.0,Birmingham 067A,455.0,6.120297
1,E01008882,Birmingham 066A,"POLYGON ((-1.82792 52.46893, -1.82768 52.46891...",Birmingham 066A,,1747,lsoa2011:E01008882 : Birmingham 066A,0.0,0.0,0.0,...,10.0,0.0,0.0,0.0,0.0,0.0,0.0,Birmingham 066A,30.0,3.401197
2,E01008883,Birmingham 078A,"POLYGON ((-1.82144 52.46633, -1.82143 52.46632...",Birmingham 078A,,1816,lsoa2011:E01008883 : Birmingham 078A,0.0,0.0,0.0,...,0.0,20.0,0.0,0.0,0.0,0.0,0.0,Birmingham 078A,1810.0,7.501082
3,E01008884,Birmingham 078B,"POLYGON ((-1.82576 52.46527, -1.82579 52.46516...",Birmingham 078B,,1870,lsoa2011:E01008884 : Birmingham 078B,0.0,0.0,0.0,...,0.0,0.0,5.0,40.0,0.0,0.0,0.0,Birmingham 078B,2590.0,7.859413
4,E01008885,Birmingham 076A,"POLYGON ((-1.80450 52.46160, -1.80414 52.46135...",Birmingham 076A,,1308,lsoa2011:E01008885 : Birmingham 076A,0.0,0.0,0.0,...,0.0,0.0,0.0,15.0,0.0,0.0,0.0,Birmingham 076A,90.0,4.49981


## Add POI and Building Data

In [153]:
# London

# Dealing with filter warnings for empty geometries in particular LSOAs
warnings.filterwarnings("ignore", category=FutureWarning)

# Init Columns (This was causing an error at first when I didn't put it here to begin)
empl_geog_london['num_buildings'] = 0
empl_geog_london['num_retail_buildings'] = 0
empl_geog_london['num_residential_buildings'] = 0
empl_geog_london['num_commercial_buildings'] = 0
empl_geog_london['num_office_buildings'] = 0

empl_geog_london['num_places'] = 0

empl_geog_london['building_poly'] = None
empl_geog_london['commercial_building_poly'] = None
empl_geog_london['retail_building_poly'] = None
empl_geog_london['office_building_poly'] = None
empl_geog_london['residential_building_poly'] = None

empl_geog_london['place_points'] = None
empl_geog_london['category_list'] = None

# Iterate through all LSOAs and grab OSM building counts and subcategories - add to empl_geog_london dataframe
for index, row in empl_geog_london.iterrows():

    # Status report
    if (index % 20 == 0):
        print(index)
    
    geom = row['geometry']
    filter_geom = gpd.GeoSeries([geom], crs=lsoa_geo.crs)

    # All Buildings - OSM
    filtered_osm_buildings = london_all_buildings[london_all_buildings.geometry.intersects(filter_geom.unary_union)]

    # Get subcategories based on tags
    commercial_buildings = london_all_buildings[london_all_buildings['building_type'] == 'commercial']
    office_buildings = london_all_buildings[london_all_buildings['building_type'] == 'office']
    retail_buildings = london_all_buildings[london_all_buildings['building_type'] == 'retail']
    residential_buildings = london_all_buildings[london_all_buildings['building_type'] == 'residential']

    # Subcategories of each building
    filtered_commercial_buildings = commercial_buildings[commercial_buildings.geometry.intersects(filter_geom.unary_union)]
    filtered_office_buildings = office_buildings[office_buildings.geometry.intersects(filter_geom.unary_union)]
    filtered_retail_buildings = retail_buildings[retail_buildings.geometry.intersects(filter_geom.unary_union)]
    filtered_residential_buildings = residential_buildings[residential_buildings.geometry.intersects(filter_geom.unary_union)]

    # Multipolygon of each LSOA's (OSM) buildings - category separated
    combined_multipolygon = filtered_osm_buildings.geometry.unary_union
    combined_commercial_multipolygon = filtered_commercial_buildings.geometry.unary_union
    combined_office_polygon = filtered_office_buildings.geometry.unary_union
    combined_retail_polygon = filtered_retail_buildings.geometry.unary_union
    combined_residential_polygon = filtered_residential_buildings.geometry.unary_union
    
    # All Places
    filtered_places = london_places[london_places.geometry.intersects(filter_geom.unary_union)]

    # Multipoint of each LSOA's places
    combined_multipoint = filtered_places.geometry.unary_union
    
    # Add place category information
    category_list = filtered_places['category'].dropna().tolist()
    
    # Add back to Dataframe
    empl_geog_london.at[index, 'num_buildings'] = len(filtered_osm_buildings)
    
    empl_geog_london.at[index, 'num_retail_buildings'] = len(filtered_retail_buildings)
    empl_geog_london.at[index, 'num_residential_buildings'] = len(filtered_residential_buildings)
    empl_geog_london.at[index, 'num_commercial_buildings'] = len(filtered_commercial_buildings)
    empl_geog_london.at[index, 'num_office_buildings'] = len(filtered_office_buildings)
    
    empl_geog_london.at[index, 'num_places'] = len(filtered_places)
    empl_geog_london.at[index, 'log_num_places'] = np.log(len(filtered_places))
    
    empl_geog_london.at[index, 'building_poly'] = combined_multipolygon
    empl_geog_london.at[index, 'commercial_building_poly'] = combined_commercial_multipolygon
    empl_geog_london.at[index, 'retail_building_poly'] = combined_office_polygon
    empl_geog_london.at[index, 'office_building_poly'] = combined_retail_polygon
    empl_geog_london.at[index, 'residential_building_poly'] = combined_residential_polygon

    empl_geog_london.at[index, 'place_points'] = combined_multipoint
    empl_geog_london.at[index, 'category_list'] = category_list
    
# Check
empl_geog_london.head()


0
20
40
60
80
100
120
140
160
180
200
220
240
260
280
300
320
340
360
380
400
420
440
460
480
500
520
540
560
580
600
620
640
660
680
700
720
740
760
780
800
820
840
860
880
900
920
940
960
980
1000
1020
1040
1060
1080
1100
1120
1140
1160
1180
1200
1220
1240
1260
1280
1300
1320
1340
1360
1380
1400
1420
1440
1460
1480
1500
1520
1540
1560
1580
1600
1620
1640
1660
1680
1700
1720
1740
1760
1780
1800
1820
1840
1860
1880
1900
1920
1940
1960
1980
2000
2020
2040
2060
2080
2100
2120
2140
2160
2180
2200
2220
2240
2260
2280
2300
2320
2340
2360
2380
2400
2420
2440
2460
2480
2500
2520
2540
2560
2580
2600
2620
2640
2660
2680
2700
2720
2740
2760
2780
2800
2820
2840
2860
2880
2900
2920
2940
2960
2980
3000
3020
3040
3060
3080
3100
3120
3140
3160
3180
3200
3220
3240
3260
3280
3300
3320
3340
3360
3380
3400
3420
3440
3460
3480
3500
3520
3540
3560
3580
3600
3620
3640
3660
3680
3700
3720
3740
3760
3780
3800
3820
3840
3860
3880
3900
3920
3940
3960
3980
4000
4020
4040
4060
4080
4100
4120
4140
4160
4180
4200
4

Unnamed: 0,LSOA11CD,LSOA11NM_x,geometry,LSOA11NM_y,Unnamed: 2,population,Area,"01 : Crop and animal production, hunting and related service activities",02 : Forestry and logging,03 : Fishing and aquaculture,...,num_office_buildings,num_places,building_poly,commercial_building_poly,retail_building_poly,office_building_poly,residential_building_poly,place_points,category_list,log_num_places
0,E01000034,Barking and Dagenham 003A,"POLYGON ((0.17380 51.56013, 0.17379 51.56012, ...",Barking and Dagenham 003A,,1444,lsoa2011:E01000034 : Barking and Dagenham 003A,0,0,0,...,0,14,,,,,,"MULTIPOINT ((0.165082 51.553969), (0.16642 51....","[college_university, engineering_services, pub...",2.639057
1,E01000035,Barking and Dagenham 010A,"POLYGON ((0.16598 51.55250, 0.16601 51.55248, ...",Barking and Dagenham 010A,,1610,lsoa2011:E01000035 : Barking and Dagenham 010A,0,0,0,...,0,115,,,,,,"MULTIPOINT ((0.1563954 51.5504463), (0.1564236...","[sports_club_and_league, professional_services...",4.744932
2,E01000036,Barking and Dagenham 010B,"POLYGON ((0.15965 51.54776, 0.15974 51.54771, ...",Barking and Dagenham 010B,,1353,lsoa2011:E01000036 : Barking and Dagenham 010B,0,0,0,...,0,35,,,,,,"MULTIPOINT ((0.1570608 51.548477), (0.1573769 ...","[engineering_services, hospital, tattoo_and_pi...",3.555348
3,E01000037,Barking and Dagenham 003B,"POLYGON ((0.18508 51.56480, 0.18476 51.56454, ...",Barking and Dagenham 003B,,1688,lsoa2011:E01000037 : Barking and Dagenham 003B,0,0,0,...,0,13,,,,,,"MULTIPOINT ((0.1747285 51.5605037), (0.1751643...","[beauty_salon, courier_and_delivery_services, ...",2.564949
4,E01000038,Barking and Dagenham 003C,"POLYGON ((0.17599 51.56476, 0.17601 51.56475, ...",Barking and Dagenham 003C,,1447,lsoa2011:E01000038 : Barking and Dagenham 003C,0,0,0,...,0,19,,,,,,"MULTIPOINT ((0.1707281 51.5619532), (0.170907 ...","[business_management_services, automotive_repa...",2.944439


In [155]:
# Birmingham

# Dealing with filter warnings for empty geometries in particular LSOAs
warnings.filterwarnings("ignore", category=FutureWarning)

# Init Columns (This was causing an error at first when I didn't put it here to begin)
empl_geog_bham['num_buildings'] = 0
empl_geog_bham['num_retail_buildings'] = 0
empl_geog_bham['num_residential_buildings'] = 0
empl_geog_bham['num_commercial_buildings'] = 0
empl_geog_bham['num_office_buildings'] = 0

empl_geog_bham['num_places'] = 0

empl_geog_bham['building_poly'] = None
empl_geog_bham['commercial_building_poly'] = None
empl_geog_bham['retail_building_poly'] = None
empl_geog_bham['office_building_poly'] = None
empl_geog_bham['residential_building_poly'] = None

empl_geog_bham['place_points'] = None
empl_geog_bham['category_list'] = None

# Iterate through all LSOAs and grab OSM building counts and subcategories - add to empl_geog_bham dataframe
for index, row in empl_geog_bham.iterrows():

    # Status report
    if (index % 20 == 0):
        print(index)
    
    geom = row['geometry']
    filter_geom = gpd.GeoSeries([geom], crs=lsoa_geo.crs)

    # All Buildings - OSM
    filtered_osm_buildings = bham_all_buildings[bham_all_buildings.geometry.intersects(filter_geom.unary_union)]

    # Get subcategories based on tags
    commercial_buildings = bham_all_buildings[bham_all_buildings['building_type'] == 'commercial']
    office_buildings = bham_all_buildings[bham_all_buildings['building_type'] == 'office']
    retail_buildings = bham_all_buildings[bham_all_buildings['building_type'] == 'retail']
    residential_buildings = bham_all_buildings[bham_all_buildings['building_type'] == 'residential']

    # Subcategories of each building
    filtered_commercial_buildings = commercial_buildings[commercial_buildings.geometry.intersects(filter_geom.unary_union)]
    filtered_office_buildings = office_buildings[office_buildings.geometry.intersects(filter_geom.unary_union)]
    filtered_retail_buildings = retail_buildings[retail_buildings.geometry.intersects(filter_geom.unary_union)]
    filtered_residential_buildings = residential_buildings[residential_buildings.geometry.intersects(filter_geom.unary_union)]

    # Multipolygon of each LSOA's (OSM) buildings - category separated
    combined_multipolygon = filtered_osm_buildings.geometry.unary_union
    combined_commercial_multipolygon = filtered_commercial_buildings.geometry.unary_union
    combined_office_polygon = filtered_office_buildings.geometry.unary_union
    combined_retail_polygon = filtered_retail_buildings.geometry.unary_union
    combined_residential_polygon = filtered_residential_buildings.geometry.unary_union
    
    # All Places
    filtered_places = bham_places[bham_places.geometry.intersects(filter_geom.unary_union)]

    # Multipoint of each LSOA's places
    combined_multipoint = filtered_places.geometry.unary_union
    
    # Add place category information
    category_list = filtered_places['category'].dropna().tolist()
    
    # Add back to Dataframe
    empl_geog_bham.at[index, 'num_buildings'] = len(filtered_osm_buildings)
    
    empl_geog_bham.at[index, 'num_retail_buildings'] = len(filtered_retail_buildings)
    empl_geog_bham.at[index, 'num_residential_buildings'] = len(filtered_residential_buildings)
    empl_geog_bham.at[index, 'num_commercial_buildings'] = len(filtered_commercial_buildings)
    empl_geog_bham.at[index, 'num_office_buildings'] = len(filtered_office_buildings)
    
    empl_geog_bham.at[index, 'num_places'] = len(filtered_places)
    empl_geog_bham.at[index, 'log_num_places'] = np.log(len(filtered_places))
        
    empl_geog_bham.at[index, 'building_poly'] = combined_multipolygon
    empl_geog_bham.at[index, 'commercial_building_poly'] = combined_commercial_multipolygon
    empl_geog_bham.at[index, 'retail_building_poly'] = combined_office_polygon
    empl_geog_bham.at[index, 'office_building_poly'] = combined_retail_polygon
    empl_geog_bham.at[index, 'residential_building_poly'] = combined_residential_polygon
    
    empl_geog_bham.at[index, 'place_points'] = combined_multipoint
    empl_geog_bham.at[index, 'category_list'] = category_list
    
# Check
empl_geog_bham.head()

0
20
40
60
80
100
120
140
160
180
200
220
240
260
280
300
320
340
360
380
400
420
440
460
480
500
520
540
560
580
600
620


Unnamed: 0,LSOA11CD,LSOA11NM_x,geometry,LSOA11NM_y,Unnamed: 2,population,Area,"01 : Crop and animal production, hunting and related service activities",02 : Forestry and logging,03 : Fishing and aquaculture,...,num_office_buildings,num_places,building_poly,commercial_building_poly,retail_building_poly,office_building_poly,residential_building_poly,place_points,category_list,log_num_places
0,E01008881,Birmingham 067A,"POLYGON ((-1.80957 52.47014, -1.80953 52.47009...",Birmingham 067A,,1599,lsoa2011:E01008881 : Birmingham 067A,0.0,0.0,0.0,...,0,79,"MULTIPOLYGON (((-1.8213621 52.4666609, -1.8210...",,,"MULTIPOLYGON (((-1.8122732 52.4644594, -1.8121...","MULTIPOLYGON (((-1.8207928 52.467573, -1.82066...","MULTIPOINT ((-1.8208857 52.4670509), (-1.82066...","[convenience_store, pizza_restaurant, fast_foo...",4.369448
1,E01008882,Birmingham 066A,"POLYGON ((-1.82792 52.46893, -1.82768 52.46891...",Birmingham 066A,,1747,lsoa2011:E01008882 : Birmingham 066A,0.0,0.0,0.0,...,0,5,"MULTIPOLYGON (((-1.8336333 52.4637367, -1.8332...",,,"MULTIPOLYGON (((-1.8332702 52.4634641, -1.8336...","MULTIPOLYGON (((-1.833377 52.4638622, -1.83337...","MULTIPOINT ((-1.8329422 52.4640839), (-1.83285...","[church_cathedral, home_security, school]",1.609438
2,E01008883,Birmingham 078A,"POLYGON ((-1.82144 52.46633, -1.82143 52.46632...",Birmingham 078A,,1816,lsoa2011:E01008883 : Birmingham 078A,0.0,0.0,0.0,...,0,39,"MULTIPOLYGON (((-1.8286578 52.4605087, -1.8276...",,,"MULTIPOLYGON (((-1.825245 52.4631362, -1.82474...","MULTIPOLYGON (((-1.8270907 52.4634663, -1.8270...","MULTIPOINT ((-1.8271863 52.463742), (-1.827072...","[computer_store, professional_services, sporti...",3.663562
3,E01008884,Birmingham 078B,"POLYGON ((-1.82576 52.46527, -1.82579 52.46516...",Birmingham 078B,,1870,lsoa2011:E01008884 : Birmingham 078B,0.0,0.0,0.0,...,1,173,"MULTIPOLYGON (((-1.8378055 52.4575343, -1.8380...",,"POLYGON ((-1.8354944 52.4612255, -1.8352535 52...","MULTIPOLYGON (((-1.8394717 52.4632192, -1.8388...","MULTIPOLYGON (((-1.8377769 52.4595681, -1.8377...","MULTIPOINT ((-1.8443173 52.4598824), (-1.84310...","[car_window_tinting, powder_coating_service, l...",5.153292
4,E01008885,Birmingham 076A,"POLYGON ((-1.80450 52.46160, -1.80414 52.46135...",Birmingham 076A,,1308,lsoa2011:E01008885 : Birmingham 076A,0.0,0.0,0.0,...,0,28,"MULTIPOLYGON (((-1.8092612 52.4561461, -1.8092...",,,"MULTIPOLYGON (((-1.8052836 52.4607015, -1.8052...","MULTIPOLYGON (((-1.8093482 52.4561539, -1.8092...","MULTIPOINT ((-1.8092852 52.4618985), (-1.80896...","[hotel, dry_cleaning, international_business_a...",3.332205


### Export to Desktop

In [30]:
# Save London to Dataframe on Desktop

# Convert multipolygons to WKT for export
empl_geog_london['building_poly'] = empl_geog_london['building_poly'].apply(lambda x: x.wkt if isinstance(x, shapely.geometry.base.BaseGeometry) else x)
empl_geog_london['commercial_building_poly'] = empl_geog_london['commercial_building_poly'].apply(lambda x: x.wkt if isinstance(x, shapely.geometry.base.BaseGeometry) else x)
empl_geog_london['retail_building_poly'] = empl_geog_london['retail_building_poly'].apply(lambda x: x.wkt if isinstance(x, shapely.geometry.base.BaseGeometry) else x)
empl_geog_london['office_building_poly'] = empl_geog_london['office_building_poly'].apply(lambda x: x.wkt if isinstance(x, shapely.geometry.base.BaseGeometry) else x)
empl_geog_london['residential_building_poly'] = empl_geog_london['residential_building_poly'].apply(lambda x: x.wkt if isinstance(x, shapely.geometry.base.BaseGeometry) else x)

empl_geog_london['place_points'] = empl_geog_london['place_points'].apply(lambda x: x.wkt if isinstance(x, shapely.geometry.base.BaseGeometry) else x)

# Convert category_list to string for export
empl_geog_london['category_list'] = empl_geog_london['category_list'].apply(lambda x: str(x) if x else None)

# Export
empl_geog_london.to_file("data/combined_data/empl_geog_london.geojson", driver="GeoJSON")


In [31]:
# Save Birmingham to Dataframe on Desktop

# Convert multipolygons to WKT for export
empl_geog_bham['building_poly'] = empl_geog_bham['building_poly'].apply(lambda x: x.wkt if isinstance(x, shapely.geometry.base.BaseGeometry) else x)
empl_geog_bham['commercial_building_poly'] = empl_geog_bham['commercial_building_poly'].apply(lambda x: x.wkt if isinstance(x, shapely.geometry.base.BaseGeometry) else x)
empl_geog_bham['retail_building_poly'] = empl_geog_bham['retail_building_poly'].apply(lambda x: x.wkt if isinstance(x, shapely.geometry.base.BaseGeometry) else x)
empl_geog_bham['office_building_poly'] = empl_geog_bham['office_building_poly'].apply(lambda x: x.wkt if isinstance(x, shapely.geometry.base.BaseGeometry) else x)
empl_geog_bham['residential_building_poly'] = empl_geog_bham['residential_building_poly'].apply(lambda x: x.wkt if isinstance(x, shapely.geometry.base.BaseGeometry) else x)

empl_geog_bham['place_points'] = empl_geog_bham['place_points'].apply(lambda x: x.wkt if isinstance(x, shapely.geometry.base.BaseGeometry) else x)

# Convert category_list to string for export
empl_geog_bham['category_list'] = empl_geog_bham['category_list'].apply(lambda x: str(x) if x else None)

# Export
empl_geog_bham.to_file("data/combined_data/empl_geog_bham.geojson", driver="GeoJSON")


## Checkpoint

When restarting the kernel, the exported files above can be loaded back in for further work

In [128]:
# London
empl_geog_london = gpd.read_file("data/combined_data/empl_geog_london.geojson")

# WKT strings back to Shapely geometries
empl_geog_london['building_poly'] = empl_geog_london['building_poly'].apply(lambda x: shapely.wkt.loads(x) if isinstance(x, str) else x)
empl_geog_london['commercial_building_poly'] = empl_geog_london['commercial_building_poly'].apply(lambda x: shapely.wkt.loads(x) if isinstance(x, str) else x)
empl_geog_london['retail_building_poly'] = empl_geog_london['retail_building_poly'].apply(lambda x: shapely.wkt.loads(x) if isinstance(x, str) else x)
empl_geog_london['office_building_poly'] = empl_geog_london['office_building_poly'].apply(lambda x: shapely.wkt.loads(x) if isinstance(x, str) else x)
empl_geog_london['residential_building_poly'] = empl_geog_london['residential_building_poly'].apply(lambda x: shapely.wkt.loads(x) if isinstance(x, str) else x)

empl_geog_london['place_points'] = empl_geog_london['place_points'].apply(lambda x: shapely.wkt.loads(x) if isinstance(x, str) else x)
# Convert string representations of lists back to lists
empl_geog_london['category_list'] = empl_geog_london['category_list'].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else x)

# Birmingham
empl_geog_bham = gpd.read_file("data/combined_data/empl_geog_bham.geojson")

# WKT strings back to Shapely geometries
empl_geog_bham['building_poly'] = empl_geog_bham['building_poly'].apply(lambda x: shapely.wkt.loads(x) if isinstance(x, str) else x)
empl_geog_bham['commercial_building_poly'] = empl_geog_bham['commercial_building_poly'].apply(lambda x: shapely.wkt.loads(x) if isinstance(x, str) else x)
empl_geog_bham['retail_building_poly'] = empl_geog_bham['retail_building_poly'].apply(lambda x: shapely.wkt.loads(x) if isinstance(x, str) else x)
empl_geog_bham['office_building_poly'] = empl_geog_bham['office_building_poly'].apply(lambda x: shapely.wkt.loads(x) if isinstance(x, str) else x)
empl_geog_bham['residential_building_poly'] = empl_geog_bham['residential_building_poly'].apply(lambda x: shapely.wkt.loads(x) if isinstance(x, str) else x)

empl_geog_bham['place_points'] = empl_geog_bham['place_points'].apply(lambda x: shapely.wkt.loads(x) if isinstance(x, str) else x)
# Convert string representations of lists back to lists
empl_geog_bham['category_list'] = empl_geog_bham['category_list'].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else x)

# Test
empl_geog_london.head()

Unnamed: 0,LSOA11CD,LSOA11NM_x,population,Area,"01 : Crop and animal production, hunting and related service activities",02 : Forestry and logging,03 : Fishing and aquaculture,05 : Mining of coal and lignite,06 : Extraction of crude petroleum and natural gas,07 : Mining of metal ores,...,num_places,building_poly,commercial_building_poly,retail_building_poly,office_building_poly,residential_building_poly,place_points,category_list,log_num_places,geometry
0,E01000001,City of London 001A,1465,lsoa2011:E01000001 : City of London 001A,0,0,0,0,10,0,...,243,"MULTIPOLYGON (((-0.0983297 51.5158158, -0.0983...","MULTIPOLYGON (((-0.0966079 51.5147477, -0.0967...","MULTIPOLYGON (((-0.098412 51.5155493, -0.09835...",,"MULTIPOLYGON (((-0.0980623 51.5182674, -0.0983...","MULTIPOINT ((-0.0987982 51.5176724), (-0.09860...","[pub, coffee_shop, immigration_law, shared_off...",5.493061,"POLYGON ((-0.09729 51.52158, -0.09652 51.52027..."
1,E01000002,City of London 001B,1436,lsoa2011:E01000002 : City of London 001B,0,0,0,0,0,0,...,419,"MULTIPOLYGON (((-0.0945314 51.5147658, -0.0944...","MULTIPOLYGON (((-0.0951296 51.5145545, -0.0951...","MULTIPOLYGON (((-0.0940575 51.5145934, -0.0940...",,"MULTIPOLYGON (((-0.0936221 51.5181541, -0.0946...","MULTIPOINT ((-0.0958288 51.5168656), (-0.09565...","[business_manufacturing_and_supply, education,...",6.037871,"POLYGON ((-0.08813 51.51941, -0.08929 51.51752..."
2,E01000003,City of London 001C,1346,lsoa2011:E01000003 : City of London 001C,0,0,0,0,0,0,...,55,"MULTIPOLYGON (((-0.0959637 51.5209148, -0.0959...",,"POLYGON ((-0.0959633 51.5209414, -0.0955256 51...",,"MULTIPOLYGON (((-0.0959637 51.5209148, -0.0959...","MULTIPOINT ((-0.0976057 51.5226912), (-0.09757...","[health_and_medical, hospital, parking, indust...",4.007333,"POLYGON ((-0.09679 51.52325, -0.09647 51.52282..."
3,E01000005,City of London 001E,985,lsoa2011:E01000005 : City of London 001E,0,10,0,0,0,0,...,534,"MULTIPOLYGON (((-0.0757822 51.511071, -0.07577...","MULTIPOLYGON (((-0.0752739 51.5111313, -0.0754...","MULTIPOLYGON (((-0.0779263 51.5147252, -0.0780...","POLYGON ((-0.0731987 51.5118923, -0.0731919 51...",,"MULTIPOINT ((-0.0799241 51.5156172), (-0.07992...","[professional_services, dance_club, insurance_...",6.280396,"POLYGON ((-0.07323 51.51000, -0.07553 51.50974..."
4,E01000006,Barking and Dagenham 016A,1703,lsoa2011:E01000006 : Barking and Dagenham 016A,0,0,0,0,0,0,...,11,"MULTIPOLYGON (((0.085895 51.5386732, 0.0859413...",,,,,"MULTIPOINT ((0.0856198 51.538706), (0.0858424 ...","[landmark_and_historical_building, constructio...",2.397895,"POLYGON ((0.09115 51.53909, 0.09326 51.53787, ..."


## POI Encoding

### London

In [130]:
# Encode POI categories in data - using One-Hot encoding

#Make sure category list is a list
empl_geog_london['category_list'] = empl_geog_london['category_list'].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else x)

#Explode category lists
empl_geog_london_exploded = empl_geog_london.explode('category_list')

#Set up encoder
encoder = OneHotEncoder(sparse_output=False)

#Encode
encoded_categories = encoder.fit_transform(empl_geog_london_exploded[['category_list']])
encoded_df = pd.DataFrame(encoded_categories, columns=encoder.get_feature_names_out(['category_list']))

empl_geog_london_exploded = pd.concat([empl_geog_london_exploded.reset_index(drop=True), encoded_df.reset_index(drop=True)], axis=1)

# Reaggregate
empl_geog_london_exploded.drop(columns=['category_list'], inplace=True)

# Group by LSOA11CD - sum numeric only
numeric_columns = encoded_df.columns.tolist()
empl_geog_london_encoded = empl_geog_london_exploded.groupby('LSOA11CD')[numeric_columns].sum().reset_index()

#Store category columns
category_columns = empl_geog_london_encoded.columns[1:]

empl_geog_london_encoded.head()

Unnamed: 0,LSOA11CD,category_list_3d_printing_service,category_list_abortion_clinic,category_list_abuse_and_addiction_treatment,category_list_accommodation,category_list_accountant,category_list_acoustical_consultant,category_list_active_life,category_list_acupuncture,category_list_addiction_rehabilitation_center,...,category_list_women's_clothing_store,category_list_women's_health_clinic,category_list_wood_and_pulp,category_list_woodworking_supply_store,category_list_writing_service,category_list_yoga_instructor,category_list_yoga_studio,category_list_youth_organizations,category_list_zoo,category_list_None
0,E01000001,0.0,0.0,0.0,2.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,E01000002,0.0,0.0,0.0,1.0,5.0,0.0,0.0,1.0,0.0,...,3.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
2,E01000003,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,E01000005,0.0,0.0,0.0,1.0,2.0,0.0,2.0,1.0,0.0,...,3.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
4,E01000006,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Birmingham

In [131]:
# Encode POI categories in data - using One-Hot encoding

#Make sure category list is a list
empl_geog_bham['category_list'] = empl_geog_bham['category_list'].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else x)

#Explode category lists
empl_geog_bham_exploded = empl_geog_bham.explode('category_list')

#Set up encoder
encoder = OneHotEncoder(sparse_output=False)

#Encode
encoded_categories = encoder.fit_transform(empl_geog_bham_exploded[['category_list']])
encoded_df = pd.DataFrame(encoded_categories, columns=encoder.get_feature_names_out(['category_list']))

empl_geog_bham_exploded = pd.concat([empl_geog_bham_exploded.reset_index(drop=True), encoded_df.reset_index(drop=True)], axis=1)

# Reaggregate
empl_geog_bham_exploded.drop(columns=['category_list'], inplace=True)

# Group by LSOA11CD - sum numeric only
numeric_columns = encoded_df.columns.tolist()
empl_geog_bham_encoded = empl_geog_bham_exploded.groupby('LSOA11CD')[numeric_columns].sum().reset_index()

#Store category columns
category_columns = empl_geog_bham_encoded.columns[1:]

empl_geog_bham_encoded.head()

Unnamed: 0,LSOA11CD,category_list_3d_printing_service,category_list_abuse_and_addiction_treatment,category_list_accommodation,category_list_accountant,category_list_active_life,category_list_acupuncture,category_list_addiction_rehabilitation_center,category_list_adoption_services,category_list_adult_education,...,category_list_windshield_installation_and_repair,category_list_wine_bar,category_list_women's_clothing_store,category_list_women's_health_clinic,category_list_wood_and_pulp,category_list_woodworking_supply_store,category_list_writing_service,category_list_yoga_studio,category_list_youth_organizations,category_list_zoo
0,E01008881,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,E01008882,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,E01008883,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,E01008884,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,E01008885,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Geometric Feature Extraction

In [132]:
# Geometric Features Extraction (copied from earlier but includes building types now)

#I'm going to exclude num polygons, average perimeter, and total perimeter cause they're not so helpful (cause Collinearity matrix shows its very close to count)

def extract_multipolygon_features(multipolygon, lsoa_geometry):
    if isinstance(multipolygon, MultiPolygon):
        polygons = list(multipolygon.geoms)
    elif isinstance(multipolygon, Polygon):
        polygons = [multipolygon]
    else:
        return pd.Series({
            'total_area': 0,
            'avg_building_area': 0,
            'lsoa_area_ratio': 0,
        })

    num_polygons = len(polygons)
    areas = [polygon.area for polygon in polygons]
    total_area = sum(areas)
    avg_building_area = total_area / num_polygons if num_polygons > 0 else 0

    #Built-up area ratio could result in a div by zero error if there are no buildings in an LSOA so this logic has to be included here
    try:
        lsoa_area_ratio = total_area / lsoa_geometry.area
    except ZeroDivisionError:
        lsoa_area_ratio = 0

    return pd.Series({
        'total_area': total_area,
        'lsoa_area_ratio': lsoa_area_ratio,
        'avg_building_area': avg_building_area,
    })


### London

In [133]:
# All Buildings
all_buildings_geometry_features = empl_geog_london.apply(
    lambda row: extract_multipolygon_features(row['building_poly'], row['geometry']),
    axis=1
).add_prefix('all_')

# Residential
residential_buildings_geometry_features = empl_geog_london.apply(
    lambda row: extract_multipolygon_features(row['residential_building_poly'], row['geometry']),
    axis=1
).add_prefix('residential_')

# Commercial
commercial_buildings_geometry_features = empl_geog_london.apply(
    lambda row: extract_multipolygon_features(row['commercial_building_poly'], row['geometry']),
    axis=1
).add_prefix('commercial_')

# Office
office_buildings_geometry_features = empl_geog_london.apply(
    lambda row: extract_multipolygon_features(row['office_building_poly'], row['geometry']),
    axis=1
).add_prefix('office_')

# Retail
retail_buildings_geometry_features = empl_geog_london.apply(
    lambda row: extract_multipolygon_features(row['retail_building_poly'], row['geometry']),
    axis=1
).add_prefix('retail_')

# Combine all geometry features into one DataFrame
all_geom_features_london = pd.concat([
    all_buildings_geometry_features,
    residential_buildings_geometry_features,
    commercial_buildings_geometry_features,
    office_buildings_geometry_features,
    retail_buildings_geometry_features
], axis=1, ignore_index=False)

# Capture Column Names
geo_features_london = all_geom_features_london.columns

# Display the first 50 rows
all_geom_features_london.head(50)

Unnamed: 0,all_avg_building_area,all_lsoa_area_ratio,all_total_area,residential_avg_building_area,residential_lsoa_area_ratio,residential_total_area,commercial_avg_building_area,commercial_lsoa_area_ratio,commercial_total_area,office_avg_building_area,office_lsoa_area_ratio,office_total_area,retail_avg_building_area,retail_lsoa_area_ratio,retail_total_area
0,6.954275e-07,0.966502,1.669026e-05,1.142252e-07,0.026458,4.569009e-07,2.656669e-07,0.076921,1.328335e-06,0.0,0.0,0.0,2.200346e-07,0.101934,1.760277e-06
1,4.409557e-07,0.722431,2.116588e-05,1.405123e-07,0.014388,4.215369e-07,2.245372e-07,0.09963,2.918983e-06,0.0,0.0,0.0,4.283602e-07,0.131587,3.855242e-06
2,7.60204e-07,1.126545,8.362244e-06,1.051174e-07,0.056645,4.204698e-07,0.0,0.0,0.0,0.0,0.0,0.0,1.375725e-07,0.018533,1.375725e-07
3,2.474802e-07,0.540976,1.336393e-05,0.0,0.0,0.0,3.214321e-07,0.065058,1.60716e-06,2.773609e-08,0.001123,2.773609e-08,3.156538e-07,0.127778,3.156538e-06
4,1.598686e-07,0.179683,3.35724e-06,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,7.216192e-08,0.275445,7.071869e-06,4.436179e-08,0.006911,1.774472e-07,0.0,0.0,0.0,2.032864e-07,0.007918,2.032864e-07,5.505318e-08,0.004289,1.101064e-07
6,1.2218e-07,0.209613,5.25374e-06,0.0,0.0,0.0,0.0,0.0,0.0,2.35158e-07,0.009382,2.35158e-07,2.354605e-08,0.000939,2.354605e-08
7,8.681309e-08,0.317807,5.295598e-06,0.0,0.0,0.0,0.0,0.0,0.0,5.894408e-08,0.021225,3.536645e-07,0.0,0.0,0.0
8,1.452742e-07,0.298902,1.35105e-05,5.623035e-08,0.007464,3.373821e-07,6.004655e-08,0.002657,1.200931e-07,1.248022e-07,0.057983,2.620846e-06,8.596902e-08,0.00951,4.298451e-07
9,1.955352e-08,0.243971,2.854813e-06,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Birmingham

In [134]:
# All Buildings
all_buildings_geometry_features = empl_geog_bham.apply(
    lambda row: extract_multipolygon_features(row['building_poly'], row['geometry']),
    axis=1
).add_prefix('all_')

# Residential
residential_buildings_geometry_features = empl_geog_bham.apply(
    lambda row: extract_multipolygon_features(row['residential_building_poly'], row['geometry']),
    axis=1
).add_prefix('residential_')

# Commercial
commercial_buildings_geometry_features = empl_geog_bham.apply(
    lambda row: extract_multipolygon_features(row['commercial_building_poly'], row['geometry']),
    axis=1
).add_prefix('commercial_')

# Office
office_buildings_geometry_features = empl_geog_bham.apply(
    lambda row: extract_multipolygon_features(row['office_building_poly'], row['geometry']),
    axis=1
).add_prefix('office_')

# Retail
retail_buildings_geometry_features = empl_geog_bham.apply(
    lambda row: extract_multipolygon_features(row['retail_building_poly'], row['geometry']),
    axis=1
).add_prefix('retail_')

# Combine all geometry features into one DataFrame
all_geom_features_bham = pd.concat([
    all_buildings_geometry_features,
    residential_buildings_geometry_features,
    commercial_buildings_geometry_features,
    office_buildings_geometry_features,
    retail_buildings_geometry_features
], axis=1, ignore_index=False)

# Capture Column Names
geo_features_bham = all_geom_features_bham.columns

# Display the first 50 rows
all_geom_features_bham.head(50)

Unnamed: 0,all_total_area,all_lsoa_area_ratio,all_avg_building_area,residential_avg_building_area,residential_lsoa_area_ratio,residential_total_area,commercial_avg_building_area,commercial_lsoa_area_ratio,commercial_total_area,office_avg_building_area,office_lsoa_area_ratio,office_total_area,retail_avg_building_area,retail_lsoa_area_ratio,retail_total_area
0,6e-06,0.122619,3.285543e-08,2.302835e-08,0.063581,3e-06,0.0,0.0,0.0,6.707288e-08,0.016603,8.719474e-07,0.0,0.0,0.0
1,6e-06,0.175438,2.692668e-08,2.244685e-08,0.076574,2e-06,0.0,0.0,0.0,1.186504e-07,0.010939,3.559512e-07,0.0,0.0,0.0
2,1.3e-05,0.244526,7.209237e-08,3.779565e-08,0.093431,5e-06,0.0,0.0,0.0,8.688264e-08,0.004995,2.606479e-07,0.0,0.0,0.0
3,2.6e-05,0.276558,1.156413e-07,5.742557e-08,0.050335,5e-06,0.0,0.0,0.0,1.090375e-07,0.018879,1.7446e-06,6.686906e-08,0.000724,6.686906e-08
4,4e-06,0.155249,1.559469e-08,1.455804e-08,0.109552,3e-06,0.0,0.0,0.0,5.695903e-08,0.00893,2.278361e-07,0.0,0.0,0.0
5,9e-06,0.15004,6.116768e-08,3.388691e-08,0.048629,3e-06,0.0,0.0,0.0,4.629078e-07,0.038622,2.314539e-06,1.727449e-07,0.008648,5.182347e-07
6,1.4e-05,0.237079,6.470391e-08,2.455759e-08,0.060392,4e-06,0.0,0.0,0.0,5.124669e-08,0.005075,3.074801e-07,1.683305e-07,0.008335,5.049915e-07
7,4e-06,0.129026,1.246474e-08,1.11736e-08,0.104359,3e-06,0.0,0.0,0.0,6.357958e-08,0.006431,1.907387e-07,0.0,0.0,0.0
8,3.2e-05,0.287574,1.111652e-07,2.86973e-08,0.04845,5e-06,0.0,0.0,0.0,1.840669e-07,0.006683,7.362675e-07,9.919489e-08,0.0009,9.919489e-08
9,8e-06,0.197336,2.713345e-08,2.083696e-08,0.113785,5e-06,0.0,0.0,0.0,2.299217e-08,0.001126,4.598433e-08,1.982874e-08,0.000486,1.982874e-08


In [135]:
# Combine Datasets

# London

empl_geog_london['place_points'] = empl_geog_london['place_points'].apply(lambda x: shapely.wkt.loads(x) if isinstance(x, str) else x)

data_with_geom_london = pd.concat([empl_geog_london, all_geom_features_london], axis=1)
all_data_london = pd.concat([data_with_geom_london, empl_geog_london_encoded], axis=1)

# Birmingham
empl_geog_bham['place_points'] = empl_geog_bham['place_points'].apply(lambda x: shapely.wkt.loads(x) if isinstance(x, str) else x)

data_with_geom_bham = pd.concat([empl_geog_bham, all_geom_features_london], axis=1)
all_data_bham = pd.concat([data_with_geom_bham, empl_geog_bham_encoded], axis=1)


## Filter POIs

In [136]:
# Filter to quality places only

# London
quality_places_london = london_places[london_places['confidence'] >= 0.6]
quality_places_london.head()

# Birmingham
quality_places_bham = bham_places[bham_places['confidence'] >= 0.6]
quality_places_bham.head()

Unnamed: 0,name,category,confidence,geometry
0,Beacon Hill Toposcope,landmark_and_historical_building,0.96,POINT (-2.01965 52.38213)
1,Lickey Hills Country Park - Beacon Hill,park,0.67,POINT (-2.01972 52.38229)
3,Beacon Castle,park,0.67,POINT (-2.01961 52.38234)
4,Barnett Fair Unisex Hairdressing,beauty_salon,0.96,POINT (-2.00227 52.38278)
6,Rednal Tyres | Cheap Car Tyres Birmingham,automotive_repair,0.96,POINT (-2.00208 52.38210)


In [137]:
# London

# Dealing with filter warnings for empty geometries in particular LSOAs
warnings.filterwarnings("ignore", category=FutureWarning)

# Generating a version of empl_geog that has only quality places - I'm using my data_with_geom file as a starting point
empl_geog_quality_places_london = data_with_geom_london

#Copy back quality places using logic from DataCleaning.ipynb

empl_geog_quality_places_london['category_list'] = None

for index, row in empl_geog_london.iterrows():
    geom = row['geometry']
    filter_geom = gpd.GeoSeries([geom], crs='EPSG:4326')

    #All Places
    filtered_places_london = quality_places_london[quality_places_london.geometry.intersects(filter_geom.unary_union)]

    #Multipoint of each LSOA's places
    combined_multipoint = filtered_places_london.geometry.unary_union
    
    #Add place category information
    category_list = filtered_places_london['category'].dropna().tolist()

    #Add back to Dataframe
    empl_geog_quality_places_london.at[index, 'category_list'] = category_list

empl_geog_quality_places_london.head()

Unnamed: 0,LSOA11CD,LSOA11NM_x,population,Area,"01 : Crop and animal production, hunting and related service activities",02 : Forestry and logging,03 : Fishing and aquaculture,05 : Mining of coal and lignite,06 : Extraction of crude petroleum and natural gas,07 : Mining of metal ores,...,residential_total_area,commercial_avg_building_area,commercial_lsoa_area_ratio,commercial_total_area,office_avg_building_area,office_lsoa_area_ratio,office_total_area,retail_avg_building_area,retail_lsoa_area_ratio,retail_total_area
0,E01000001,City of London 001A,1465,lsoa2011:E01000001 : City of London 001A,0,0,0,0,10,0,...,4.569009e-07,2.656669e-07,0.076921,1e-06,0.0,0.0,0.0,2.200346e-07,0.101934,1.760277e-06
1,E01000002,City of London 001B,1436,lsoa2011:E01000002 : City of London 001B,0,0,0,0,0,0,...,4.215369e-07,2.245372e-07,0.09963,3e-06,0.0,0.0,0.0,4.283602e-07,0.131587,3.855242e-06
2,E01000003,City of London 001C,1346,lsoa2011:E01000003 : City of London 001C,0,0,0,0,0,0,...,4.204698e-07,0.0,0.0,0.0,0.0,0.0,0.0,1.375725e-07,0.018533,1.375725e-07
3,E01000005,City of London 001E,985,lsoa2011:E01000005 : City of London 001E,0,10,0,0,0,0,...,0.0,3.214321e-07,0.065058,2e-06,2.773609e-08,0.001123,2.773609e-08,3.156538e-07,0.127778,3.156538e-06
4,E01000006,Barking and Dagenham 016A,1703,lsoa2011:E01000006 : Barking and Dagenham 016A,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [138]:
# Birmingham

# Dealing with filter warnings for empty geometries in particular LSOAs
warnings.filterwarnings("ignore", category=FutureWarning)

# Generating a version of empl_geog that has only quality places - I'm using my data_with_geom file as a starting point
empl_geog_quality_places_bham = data_with_geom_bham

#Copy back quality places using logic from DataCleaning.ipynb

empl_geog_quality_places_bham['category_list'] = None

for index, row in empl_geog_bham.iterrows():
    geom = row['geometry']
    filter_geom = gpd.GeoSeries([geom], crs='EPSG:4326')

    #All Places
    filtered_places_bham = quality_places_bham[quality_places_bham.geometry.intersects(filter_geom.unary_union)]

    #Multipoint of each LSOA's places
    combined_multipoint = filtered_places_bham.geometry.unary_union
    
    #Add place category information
    category_list = filtered_places_bham['category'].dropna().tolist()

    #Add back to Dataframe
    empl_geog_quality_places_bham.at[index, 'category_list'] = category_list

empl_geog_quality_places_bham.head()

Unnamed: 0,LSOA11CD,LSOA11NM_x,LSOA11NM_y,Unnamed: 2,population,Area,"01 : Crop and animal production, hunting and related service activities",02 : Forestry and logging,03 : Fishing and aquaculture,05 : Mining of coal and lignite,...,residential_total_area,commercial_avg_building_area,commercial_lsoa_area_ratio,commercial_total_area,office_avg_building_area,office_lsoa_area_ratio,office_total_area,retail_avg_building_area,retail_lsoa_area_ratio,retail_total_area
0,E01008881,Birmingham 067A,Birmingham 067A,,1599,lsoa2011:E01008881 : Birmingham 067A,0.0,0.0,0.0,0.0,...,4.569009e-07,2.656669e-07,0.076921,1e-06,0.0,0.0,0.0,2.200346e-07,0.101934,1.760277e-06
1,E01008882,Birmingham 066A,Birmingham 066A,,1747,lsoa2011:E01008882 : Birmingham 066A,0.0,0.0,0.0,0.0,...,4.215369e-07,2.245372e-07,0.09963,3e-06,0.0,0.0,0.0,4.283602e-07,0.131587,3.855242e-06
2,E01008883,Birmingham 078A,Birmingham 078A,,1816,lsoa2011:E01008883 : Birmingham 078A,0.0,0.0,0.0,0.0,...,4.204698e-07,0.0,0.0,0.0,0.0,0.0,0.0,1.375725e-07,0.018533,1.375725e-07
3,E01008884,Birmingham 078B,Birmingham 078B,,1870,lsoa2011:E01008884 : Birmingham 078B,0.0,0.0,0.0,0.0,...,0.0,3.214321e-07,0.065058,2e-06,2.773609e-08,0.001123,2.773609e-08,3.156538e-07,0.127778,3.156538e-06
4,E01008885,Birmingham 076A,Birmingham 076A,,1308,lsoa2011:E01008885 : Birmingham 076A,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [139]:
# One-Hot Encoding - yet again on London

#Explode category lists
empl_geog_quality_places_exploded_london = empl_geog_quality_places_london.explode('category_list')

#Set up encoder
encoder = OneHotEncoder(sparse_output=False)

#Encode
encoded_categories_quality = encoder.fit_transform(empl_geog_quality_places_exploded_london[['category_list']])
encoded_df = pd.DataFrame(encoded_categories_quality, columns=encoder.get_feature_names_out(['category_list']))

empl_geog_quality_places_exploded_london = pd.concat([empl_geog_quality_places_exploded_london.reset_index(drop=True), encoded_df.reset_index(drop=True)], axis=1)

# Reaggregate
empl_geog_quality_places_exploded_london.drop(columns=['category_list'], inplace=True)

# Group by LSOA11CD - sum numeric only
numeric_columns = encoded_df.columns.tolist()
empl_geog_quality_places_encoded_london = empl_geog_quality_places_exploded_london.groupby('LSOA11CD')[numeric_columns].sum().reset_index()

#Save columns of interest for analysis

quality_categories = empl_geog_quality_places_encoded_london.columns[1:]

empl_geog_quality_places_encoded_london.head()


Unnamed: 0,LSOA11CD,category_list_3d_printing_service,category_list_abortion_clinic,category_list_abuse_and_addiction_treatment,category_list_accommodation,category_list_accountant,category_list_acoustical_consultant,category_list_active_life,category_list_acupuncture,category_list_addiction_rehabilitation_center,...,category_list_women's_clothing_store,category_list_women's_health_clinic,category_list_wood_and_pulp,category_list_woodworking_supply_store,category_list_writing_service,category_list_yoga_instructor,category_list_yoga_studio,category_list_youth_organizations,category_list_zoo,category_list_nan
0,E01000001,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,E01000002,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,...,3.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
2,E01000003,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,E01000005,0.0,0.0,0.0,1.0,2.0,0.0,2.0,1.0,0.0,...,2.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
4,E01000006,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [140]:
# One-Hot Encoding - yet again on Bham

#Explode category lists
empl_geog_quality_places_exploded_bham = empl_geog_quality_places_bham.explode('category_list')

#Set up encoder
encoder = OneHotEncoder(sparse_output=False)

#Encode
encoded_categories_quality = encoder.fit_transform(empl_geog_quality_places_exploded_bham[['category_list']])
encoded_df = pd.DataFrame(encoded_categories_quality, columns=encoder.get_feature_names_out(['category_list']))

empl_geog_quality_places_exploded_bham = pd.concat([empl_geog_quality_places_exploded_bham.reset_index(drop=True), encoded_df.reset_index(drop=True)], axis=1)

# Reaggregate
empl_geog_quality_places_exploded_bham.drop(columns=['category_list'], inplace=True)

# Group by LSOA11CD - sum numeric only
numeric_columns = encoded_df.columns.tolist()
empl_geog_quality_places_encoded_bham = empl_geog_quality_places_exploded_bham.groupby('LSOA11CD')[numeric_columns].sum().reset_index()

#Save columns of interest for analysis

quality_categories = empl_geog_quality_places_encoded_bham.columns[1:]

empl_geog_quality_places_encoded_bham.head()


Unnamed: 0,LSOA11CD,category_list_3d_printing_service,category_list_abuse_and_addiction_treatment,category_list_accommodation,category_list_accountant,category_list_active_life,category_list_acupuncture,category_list_addiction_rehabilitation_center,category_list_adoption_services,category_list_adult_education,...,category_list_wine_bar,category_list_women's_clothing_store,category_list_women's_health_clinic,category_list_wood_and_pulp,category_list_woodworking_supply_store,category_list_writing_service,category_list_yoga_studio,category_list_youth_organizations,category_list_zoo,category_list_None
0,E01008881,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,E01008882,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,E01008883,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,E01008884,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,E01008885,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [141]:
#Rejoin with all data

all_data_quality_places_london = pd.merge(empl_geog_quality_places_london, empl_geog_quality_places_encoded_london, on='LSOA11CD')
all_data_quality_places_bham = pd.merge(empl_geog_quality_places_bham, empl_geog_quality_places_encoded_bham, on='LSOA11CD')

all_data_quality_places_london.head()

Unnamed: 0,LSOA11CD,LSOA11NM_x,population,Area,"01 : Crop and animal production, hunting and related service activities",02 : Forestry and logging,03 : Fishing and aquaculture,05 : Mining of coal and lignite,06 : Extraction of crude petroleum and natural gas,07 : Mining of metal ores,...,category_list_women's_clothing_store,category_list_women's_health_clinic,category_list_wood_and_pulp,category_list_woodworking_supply_store,category_list_writing_service,category_list_yoga_instructor,category_list_yoga_studio,category_list_youth_organizations,category_list_zoo,category_list_nan
0,E01000001,City of London 001A,1465,lsoa2011:E01000001 : City of London 001A,0,0,0,0,10,0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,E01000002,City of London 001B,1436,lsoa2011:E01000002 : City of London 001B,0,0,0,0,0,0,...,3.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
2,E01000003,City of London 001C,1346,lsoa2011:E01000003 : City of London 001C,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,E01000005,City of London 001E,985,lsoa2011:E01000005 : City of London 001E,0,10,0,0,0,0,...,2.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
4,E01000006,Barking and Dagenham 016A,1703,lsoa2011:E01000006 : Barking and Dagenham 016A,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Collapse Categories

### London

In [142]:
# Automated Condensing of Column Types

# List all columns

all_columns_london = empl_geog_london_encoded.columns.tolist()

# I'll group them by 'suffix' to find commonalities
suffix_groups = {}

# Iterate through columns to identify suffixes
for column in all_columns_london:
    lower_column = column.lower()
    parts = lower_column.split('_')
    if len(parts) > 1:
        suffix = parts[-1]
        if suffix in suffix_groups:
            suffix_groups[suffix].append(column)
        else:
            suffix_groups[suffix] = [column]
    else:
        # Handle columns without suffixes
        suffix_groups['other'] = suffix_groups.get('other', []) + [column]

# Create a list of DataFrames to concatenate
concatenated_dfs_london = []

# Iterate through suffix groups and aggregate columns
for suffix, columns in suffix_groups.items():
    if columns:
        if len(columns) == 1:
            # Preserve col name if only one col with suffix
            new_column_name = columns[0].replace('category_list_', '')
            concatenated_dfs_london.append(empl_geog_london_encoded[columns].rename(columns={columns[0]: new_column_name}))
        else:
            # Aggregate columns with more than one column in the group
            new_column_name = f'all_{suffix}'
            concatenated_dfs_london.append(empl_geog_london_encoded[columns].sum(axis=1).rename(new_column_name))

# Concatenate all DataFrames
condensed_categories_london = pd.concat(concatenated_dfs_london, axis=1)

# Print summary information (groups only)
for suffix, columns in suffix_groups.items():
    num_columns = len(columns)
    if num_columns > 1:
        print(f'Number of {suffix.capitalize()} Categories: {num_columns}')

condensed_categories_london.head()

Number of Service Categories: 81
Number of Clinic Categories: 8
Number of Accommodation Categories: 2
Number of Consultant Categories: 9
Number of Center Categories: 31
Number of Services Categories: 85
Number of Education Categories: 3
Number of Entertainment Categories: 2
Number of Store Categories: 100
Number of Agency Categories: 12
Number of Restaurant Categories: 110
Number of Program Categories: 2
Number of Supplier Categories: 25
Number of Production Categories: 6
Number of Dealer Categories: 14
Number of Manufacturer Categories: 8
Number of Repair Categories: 21
Number of Lounge Categories: 2
Number of Centers Categories: 2
Number of Contractor Categories: 5
Number of Medicine Categories: 6
Number of League Categories: 4
Number of Team Categories: 4
Number of Park Categories: 9
Number of Church Categories: 5
Number of Hospital Categories: 3
Number of Shelter Categories: 2
Number of Studio Categories: 7
Number of Agent Categories: 2
Number of Apartments Categories: 2
Number of 

Unnamed: 0,LSOA11CD,all_service,all_clinic,abuse_and_addiction_treatment,all_accommodation,accountant,all_consultant,active_life,acupuncture,all_center,...,waxing,welders,well_drilling,wholesale_grocer,wildlife_control,wildlife_sanctuary,wills_trusts_and_probate,winery,wood_and_pulp,None
0,E01000001,12.0,0.0,0.0,2.0,1.0,4.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,E01000002,21.0,0.0,0.0,1.0,5.0,4.0,0.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,E01000003,2.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,E01000005,26.0,0.0,0.0,1.0,2.0,2.0,2.0,1.0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,E01000006,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Birmingham

In [146]:
# Automated Condensing of Column Types

# List all columns

all_columns_bham = empl_geog_bham_encoded.columns.tolist()

# I'll group them by 'suffix' to find commonalities
suffix_groups = {}

# Iterate through columns to identify suffixes
for column in all_columns_bham:
    lower_column = column.lower()
    parts = lower_column.split('_')
    if len(parts) > 1:
        suffix = parts[-1]
        if suffix in suffix_groups:
            suffix_groups[suffix].append(column)
        else:
            suffix_groups[suffix] = [column]
    else:
        # Handle columns without suffixes
        suffix_groups['other'] = suffix_groups.get('other', []) + [column]

# Create a list of DataFrames to concatenate
concatenated_dfs_bham = []

# Iterate through suffix groups and aggregate columns
for suffix, columns in suffix_groups.items():
    if columns:
        if len(columns) == 1:
            # Preserve col name if only one col with suffix
            new_column_name = columns[0].replace('category_list_', '')
            concatenated_dfs_bham.append(empl_geog_bham_encoded[columns].rename(columns={columns[0]: new_column_name}))
        else:
            # Aggregate columns with more than one column in the group
            new_column_name = f'all_{suffix}'
            concatenated_dfs_bham.append(empl_geog_bham_encoded[columns].sum(axis=1).rename(new_column_name))

# Concatenate all DataFrames
condensed_categories_bham = pd.concat(concatenated_dfs_bham, axis=1)

# Print summary information (groups only)
for suffix, columns in suffix_groups.items():
    num_columns = len(columns)
    if num_columns > 1:
        print(f'Number of {suffix.capitalize()} Categories: {num_columns}')

condensed_categories_bham.head()

Number of Service Categories: 56
Number of Center Categories: 17
Number of Services Categories: 64
Number of Education Categories: 3
Number of Entertainment Categories: 2
Number of Agency Categories: 11
Number of Restaurant Categories: 63
Number of Lounge Categories: 2
Number of Centers Categories: 2
Number of Contractor Categories: 5
Number of Medicine Categories: 2
Number of Supplier Categories: 18
Number of Team Categories: 3
Number of Park Categories: 5
Number of Church Categories: 5
Number of Hospital Categories: 3
Number of Shelter Categories: 2
Number of Studio Categories: 6
Number of Store Categories: 76
Number of Apartments Categories: 2
Number of Manufacturer Categories: 5
Number of Range Categories: 3
Number of Architect Categories: 2
Number of Designer Categories: 3
Number of Museum Categories: 4
Number of Restoration Categories: 2
Number of School Categories: 21
Number of Facility Categories: 4
Number of House Categories: 3
Number of Rental Categories: 4
Number of Shop Cat

Unnamed: 0,LSOA11CD,all_service,abuse_and_addiction_treatment,accommodation,accountant,active_life,acupuncture,all_center,all_services,all_education,...,waterproofing,waxing,welders,wellness_program,wholesale_grocer,wildlife_sanctuary,wills_trusts_and_probate,window_washing,wood_and_pulp,zoo
0,E01008881,3.0,0.0,0.0,0.0,0.0,0.0,1.0,2.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,E01008882,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,E01008883,1.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,E01008884,6.0,0.0,0.0,0.0,1.0,0.0,1.0,17.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,E01008885,0.0,0.0,0.0,2.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [148]:
#Remove all categories that have less than five in London

column_sums_london = condensed_categories_london.iloc[:, 1:].sum()
columns_to_drop_london = column_sums_london[column_sums_london < 5].index.tolist()
print(columns_to_drop_london)

condensed_categories_london = condensed_categories_london.drop(columns=columns_to_drop_london)

#Store new categories
condensed_category_columns_london = condensed_categories_london.columns[1:]

condensed_categories_london.head()

['aesthetician', 'agricultural_cooperatives', 'allergist', 'anesthesiologist', 'apiaries_and_beekeepers', 'artificial_turf', 'ayurveda', 'b2b_machinery_and_tools', 'all_area', 'batting_cage', 'bicycle_sharing_location', 'bistro', 'business_financing', 'cabaret', 'car_auction', 'car_sharing', 'castle', 'chambers_of_commerce', 'coffee_roastery', 'community_gardens', 'convents_and_monasteries', 'customized_merchandise', 'drive_in_theater', 'empanadas', 'feng_shui', 'festival', 'fidelity_and_surety_bonds', 'firewood', 'forest', 'fort', 'gemstone_and_mineral', 'gents_tailor', 'glass_blowing', 'greengrocer', 'greenhouses', 'heliports', 'hydrotherapy', 'all_specialist', 'inn', 'irrigation', 'island', 'knife_sharpening', 'light_rail_and_subway_stations', 'macarons', 'marching_band', 'metal_materials_and_experts', 'mills', 'mission', 'mountain_bike_parks', 'mountain_bike_trails', 'office_of_vital_records', 'oncologist', 'orthotics', 'outdoor_movies', 'package_locker', 'pediatric_cardiology', 'p

Unnamed: 0,LSOA11CD,all_service,all_clinic,abuse_and_addiction_treatment,all_accommodation,accountant,all_consultant,active_life,acupuncture,all_center,...,veterinarian,videographer,vitamins_and_supplements,warehouses,waterproofing,waxing,wholesale_grocer,wildlife_sanctuary,wills_trusts_and_probate,winery
0,E01000001,12.0,0.0,0.0,2.0,1.0,4.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,E01000002,21.0,0.0,0.0,1.0,5.0,4.0,0.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,E01000003,2.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,E01000005,26.0,0.0,0.0,1.0,2.0,2.0,2.0,1.0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,E01000006,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [151]:
#Remove all categories that have less than five in Birmingham

column_sums_bham = condensed_categories_bham.iloc[:, 1:].sum()
columns_to_drop_bham = column_sums_bham[column_sums_bham < 5].index.tolist()
print(columns_to_drop_bham)

condensed_categories_bham = condensed_categories_bham.drop(columns=columns_to_drop_bham)

#Store new categories
condensed_category_columns_bham = condensed_categories_bham.columns[1:]

condensed_categories_bham.head()

['abuse_and_addiction_treatment', 'agricultural_cooperatives', 'agriculture', 'airlines', 'airport_shuttles', 'airport_terminal', 'all_centers', 'allergist', 'aquarium', 'all_range', 'astrologer', 'auditorium', 'auto_upholstery', 'automobile_leasing', 'ayurveda', 'b2b_apparel', 'b2b_furniture_and_housewares', 'b2b_jewelers', 'b2b_rubber_and_plastics', 'baby_gear_and_furniture', 'banks', 'bartender', 'beer_wine_and_spirits', 'bicycle_sharing_location', 'bike_repair_maintenance', 'bookbinding', 'bookkeeper', 'boot_camp', 'brasserie', 'brokers', 'business_office_supplies_and_stationery', 'business_schools', 'campground', 'car_auction', 'car_sharing', 'car_wash', 'castle', 'all_loans', 'chimney_sweep', 'choir', 'circus', 'coach_bus', 'colonics', 'commercial_printer', 'commercial_refrigeration', 'convents_and_monasteries', 'cottage', 'cricket_ground', 'customized_merchandise', 'data_recovery', 'all_laboratories', 'diagnostic_imaging', 'distillery', 'drugstore', 'embroidery_and_crochet', 'en

Unnamed: 0,LSOA11CD,all_service,accommodation,accountant,active_life,acupuncture,all_center,all_services,all_education,all_entertainment,...,travel,travel_agents,trusts,university_housing,used_vintage_and_consignment,veterinarian,videographer,vitamins_and_supplements,warehouses,window_washing
0,E01008881,3.0,0.0,0.0,0.0,0.0,1.0,2.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,E01008882,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,E01008883,1.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0
3,E01008884,6.0,0.0,0.0,1.0,0.0,1.0,17.0,1.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,E01008885,0.0,0.0,2.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [153]:
# Join data

all_data_cleaned_london = pd.merge(data_with_geom_london, condensed_categories_london, on='LSOA11CD')
all_data_cleaned_bham = pd.merge(data_with_geom_bham, condensed_categories_bham, on='LSOA11CD')

all_data_cleaned_london.head()

Unnamed: 0,LSOA11CD,LSOA11NM_x,population,Area,"01 : Crop and animal production, hunting and related service activities",02 : Forestry and logging,03 : Fishing and aquaculture,05 : Mining of coal and lignite,06 : Extraction of crude petroleum and natural gas,07 : Mining of metal ores,...,veterinarian,videographer,vitamins_and_supplements,warehouses,waterproofing,waxing,wholesale_grocer,wildlife_sanctuary,wills_trusts_and_probate,winery
0,E01000001,City of London 001A,1465,lsoa2011:E01000001 : City of London 001A,0,0,0,0,10,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,E01000002,City of London 001B,1436,lsoa2011:E01000002 : City of London 001B,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,E01000003,City of London 001C,1346,lsoa2011:E01000003 : City of London 001C,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,E01000005,City of London 001E,985,lsoa2011:E01000005 : City of London 001E,0,10,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,E01000006,Barking and Dagenham 016A,1703,lsoa2011:E01000006 : Barking and Dagenham 016A,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Add Employment and Office Employment Density Attributes

In [154]:
# Create a new column Employment Density and create new model to target that
all_data_cleaned_london['employment_density'] = all_data_cleaned_london['total_employment'] / (all_data_cleaned_london['geometry']).to_crs("EPSG:27700").area
all_data_cleaned_bham['employment_density'] = all_data_cleaned_bham['total_employment'] / (all_data_cleaned_bham['geometry']).to_crs("EPSG:27700").area

In [155]:
# Use office work cols from previous analysis

office_work_cols = [
    '42 : Civil engineering',
    '58 : Publishing activities',
    '59 : Motion picture, video and television programme production, sound recording and music publishing activities',
    '60 : Programming and broadcasting activities',
    '61 : Telecommunications',
    '62 : Computer programming, consultancy and related activities',
    '63 : Information service activities',
    '64 : Financial service activities, except insurance and pension funding',
    '65 : Insurance, reinsurance and pension funding, except compulsory social security',
    '66 : Activities auxiliary to financial services and insurance activities',
    '68 : Real estate activities',
    '69 : Legal and accounting activities',
    '70 : Activities of head offices; management consultancy activities',
    '71 : Architectural and engineering activities; technical testing and analysis',
    '72 : Scientific research and development',
    '73 : Advertising and market research',
    '74 : Other professional, scientific and technical activities',
    '77 : Rental and leasing activities',
    '78 : Employment activities',
    '79 : Travel agency, tour operator and other reservation service and related activities',
    '80 : Security and investigation activities',
    '82 : Office administrative, office support and other business support activities',
    '84 : Public administration and defence; compulsory social security'
]

# Create a new Office Work Total Column :
all_data_cleaned_london['office_total_employment'] = all_data_cleaned_london[office_work_cols].sum(axis=1)
all_data_cleaned_bham['office_total_employment'] = all_data_cleaned_bham[office_work_cols].sum(axis=1)

# And an Office Work Density Column
all_data_cleaned_london['office_employment_density'] = all_data_cleaned_london['office_total_employment'] / (all_data_cleaned_london['geometry']).to_crs("EPSG:27700").area
all_data_cleaned_bham['office_employment_density'] = all_data_cleaned_bham['office_total_employment'] / (all_data_cleaned_bham['geometry']).to_crs("EPSG:27700").area

all_data_cleaned_bham.head()


Unnamed: 0,LSOA11CD,LSOA11NM_x,LSOA11NM_y,Unnamed: 2,population,Area,"01 : Crop and animal production, hunting and related service activities",02 : Forestry and logging,03 : Fishing and aquaculture,05 : Mining of coal and lignite,...,university_housing,used_vintage_and_consignment,veterinarian,videographer,vitamins_and_supplements,warehouses,window_washing,employment_density,office_total_employment,office_employment_density
0,E01008881,Birmingham 067A,Birmingham 067A,,1599,lsoa2011:E01008881 : Birmingham 067A,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.001146,35.0,8.8e-05
1,E01008882,Birmingham 066A,Birmingham 066A,,1747,lsoa2011:E01008882 : Birmingham 066A,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000122,0.0,0.0
2,E01008883,Birmingham 078A,Birmingham 078A,,1816,lsoa2011:E01008883 : Birmingham 078A,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.004589,650.0,0.001648
3,E01008884,Birmingham 078B,Birmingham 078B,,1870,lsoa2011:E01008884 : Birmingham 078B,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.003708,485.0,0.000694
4,E01008885,Birmingham 076A,Birmingham 076A,,1308,lsoa2011:E01008885 : Birmingham 076A,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000467,30.0,0.000156


## Save Checkpoint

In [156]:
# Save CSV

all_data_cleaned_london.to_csv("data/combined_data/all_data_london.csv")
all_data_cleaned_bham.to_csv("data/combined_data/all_data_bham.csv")

In [157]:
# Save London to Dataframe on Desktop

# Convert multipolygons to WKT for export
all_data_cleaned_london['building_poly'] = all_data_cleaned_london['building_poly'].apply(lambda x: x.wkt if isinstance(x, shapely.geometry.base.BaseGeometry) else x)
all_data_cleaned_london['commercial_building_poly'] = all_data_cleaned_london['commercial_building_poly'].apply(lambda x: x.wkt if isinstance(x, shapely.geometry.base.BaseGeometry) else x)
all_data_cleaned_london['retail_building_poly'] = all_data_cleaned_london['retail_building_poly'].apply(lambda x: x.wkt if isinstance(x, shapely.geometry.base.BaseGeometry) else x)
all_data_cleaned_london['office_building_poly'] = all_data_cleaned_london['office_building_poly'].apply(lambda x: x.wkt if isinstance(x, shapely.geometry.base.BaseGeometry) else x)
all_data_cleaned_london['residential_building_poly'] = all_data_cleaned_london['residential_building_poly'].apply(lambda x: x.wkt if isinstance(x, shapely.geometry.base.BaseGeometry) else x)

all_data_cleaned_london['place_points'] = all_data_cleaned_london['place_points'].apply(lambda x: x.wkt if isinstance(x, shapely.geometry.base.BaseGeometry) else x)

# Convert category_list to string for export
all_data_cleaned_london['category_list'] = all_data_cleaned_london['category_list'].apply(lambda x: str(x) if x else None)

# Export
all_data_cleaned_london.to_file("data/combined_data/all_data_london.geojson", driver="GeoJSON")

In [158]:
# Save Birmingham to Dataframe on Desktop

# Convert multipolygons to WKT for export
all_data_cleaned_bham['building_poly'] = all_data_cleaned_bham['building_poly'].apply(lambda x: x.wkt if isinstance(x, shapely.geometry.base.BaseGeometry) else x)
all_data_cleaned_bham['commercial_building_poly'] = all_data_cleaned_bham['commercial_building_poly'].apply(lambda x: x.wkt if isinstance(x, shapely.geometry.base.BaseGeometry) else x)
all_data_cleaned_bham['retail_building_poly'] = all_data_cleaned_bham['retail_building_poly'].apply(lambda x: x.wkt if isinstance(x, shapely.geometry.base.BaseGeometry) else x)
all_data_cleaned_bham['office_building_poly'] = all_data_cleaned_bham['office_building_poly'].apply(lambda x: x.wkt if isinstance(x, shapely.geometry.base.BaseGeometry) else x)
all_data_cleaned_bham['residential_building_poly'] = all_data_cleaned_bham['residential_building_poly'].apply(lambda x: x.wkt if isinstance(x, shapely.geometry.base.BaseGeometry) else x)

all_data_cleaned_bham['place_points'] = all_data_cleaned_bham['place_points'].apply(lambda x: x.wkt if isinstance(x, shapely.geometry.base.BaseGeometry) else x)

# Convert category_list to string for export
all_data_cleaned_bham['category_list'] = all_data_cleaned_bham['category_list'].apply(lambda x: str(x) if x else None)

# Export
all_data_cleaned_bham.to_file("data/combined_data/all_data_bham.geojson", driver="GeoJSON")

## Checkpoint

In [5]:
# London
all_data_cleaned_london = gpd.read_file("data/combined_data/all_data_london.geojson")

# WKT strings back to Shapely geometries
all_data_cleaned_london['building_poly'] = all_data_cleaned_london['building_poly'].apply(lambda x: shapely.wkt.loads(x) if isinstance(x, str) else x)
all_data_cleaned_london['commercial_building_poly'] = all_data_cleaned_london['commercial_building_poly'].apply(lambda x: shapely.wkt.loads(x) if isinstance(x, str) else x)
all_data_cleaned_london['retail_building_poly'] = all_data_cleaned_london['retail_building_poly'].apply(lambda x: shapely.wkt.loads(x) if isinstance(x, str) else x)
all_data_cleaned_london['office_building_poly'] = all_data_cleaned_london['office_building_poly'].apply(lambda x: shapely.wkt.loads(x) if isinstance(x, str) else x)
all_data_cleaned_london['residential_building_poly'] = all_data_cleaned_london['residential_building_poly'].apply(lambda x: shapely.wkt.loads(x) if isinstance(x, str) else x)

all_data_cleaned_london['place_points'] = all_data_cleaned_london['place_points'].apply(lambda x: shapely.wkt.loads(x) if isinstance(x, str) else x)
# Convert string representations of lists back to lists
all_data_cleaned_london['category_list'] = all_data_cleaned_london['category_list'].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else x)

# London
all_data_cleaned_bham = gpd.read_file("data/combined_data/all_data_bham.geojson")

# WKT strings back to Shapely geometries
all_data_cleaned_bham['building_poly'] = all_data_cleaned_bham['building_poly'].apply(lambda x: shapely.wkt.loads(x) if isinstance(x, str) else x)
all_data_cleaned_bham['commercial_building_poly'] = all_data_cleaned_bham['commercial_building_poly'].apply(lambda x: shapely.wkt.loads(x) if isinstance(x, str) else x)
all_data_cleaned_bham['retail_building_poly'] = all_data_cleaned_bham['retail_building_poly'].apply(lambda x: shapely.wkt.loads(x) if isinstance(x, str) else x)
all_data_cleaned_bham['office_building_poly'] = all_data_cleaned_bham['office_building_poly'].apply(lambda x: shapely.wkt.loads(x) if isinstance(x, str) else x)
all_data_cleaned_bham['residential_building_poly'] = all_data_cleaned_bham['residential_building_poly'].apply(lambda x: shapely.wkt.loads(x) if isinstance(x, str) else x)

all_data_cleaned_bham['place_points'] = all_data_cleaned_bham['place_points'].apply(lambda x: shapely.wkt.loads(x) if isinstance(x, str) else x)
# Convert string representations of lists back to lists
all_data_cleaned_bham['category_list'] = all_data_cleaned_bham['category_list'].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else x)

# Test
all_data_cleaned_london.head()

NameError: name 'all_data_cleaned_bham' is not defined

## Add Spatially Lagged Parameters

In [None]:
# Construct London Adjacency Matrix (K = 6)

# Calculate centroids (British National Grid)

all_data_cleaned_london['centroid'] = all_data_cleaned_london.geometry.to_crs("EPSG:27700").centroid

# Extract centroid coordinates and construct KDTree
centroids = np.array(list(all_data_cleaned_london['centroid'].apply(lambda geom: (geom.x, geom.y))))
tree = KDTree(centroids)

# Identify six nearest neighbours (+ self point)
distances, indices = tree.query(centroids, k=7)

# Store results
nearest_neighbors = {}
for idx, neighbors in enumerate(indices):
    #Track progress
    if idx % 20 == 0:
        print(idx)
        
    nearest_neighbors[idx] = neighbors[1:]

neighbors_df_london = pd.DataFrame.from_dict(nearest_neighbors, orient='index')

# Appply to all data
all_data_cleaned_london['nearest_neighbors'] = all_data_cleaned_london.index.map(nearest_neighbors)

all_data_cleaned_london.head()

In [None]:
# Construct Birmingam Adjacency Matrix (K = 6)

# Calculate centroids (British National Grid)

all_data_cleaned_bham['centroid'] = all_data_cleaned_bham.geometry.to_crs("EPSG:27700").centroid

# Extract centroid coordinates and construct KDTree
centroids = np.array(list(all_data_cleaned_bham['centroid'].apply(lambda geom: (geom.x, geom.y))))
tree = KDTree(centroids)

# Identify six nearest neighbours (+ self point)
distances, indices = tree.query(centroids, k=7)

# Store results
nearest_neighbors = {}
for idx, neighbors in enumerate(indices):
    #Track progress
    if idx % 20 == 0:
        print(idx)
        
    nearest_neighbors[idx] = neighbors[1:]

neighbors_df_bham = pd.DataFrame.from_dict(nearest_neighbors, orient='index')

# Appply to all data
all_data_cleaned_bham['nearest_neighbors'] = all_data_cleaned_bham.index.map(nearest_neighbors)

all_data_cleaned_bham.head()

In [None]:
# Construct adjacent features and add back to feature list

feature_columns_london = ['num_buildings', 'log_num_places', 'population'] + list(geo_features_london) + list(condensed_category_columns_london)

# Create new columns
for col in feature_columns_london:
    all_data_cleaned_london[f'lag_{col}'] = None

# Store average of adjacent features
average_features_london = {f'lag_{col}': [] for col in feature_columns}

# For each row in the dataframe
for idx, row in all_data_cleaned_london.iterrows():

    #Track progress
    if idx % 20 == 0:
        print(idx)
    # Get the six closest LSOAs computed earlier
    neighbor_indices = row['nearest_neighbors']
    
    # Calculate the average for each feature across those six
    for col in feature_columns_london:
        avg_value = all_data_cleaned_london.iloc[neighbor_indices][col].mean()
        average_features_london[f'lag_{col}'].append(avg_value)

# Add the lagged features to the original GeoDataFrame
for col in average_features_london:
    all_data_cleaned_london[col] = average_features_london[col]

# Display the result
all_data_cleaned_london.head()

In [None]:
# Construct adjacent features and add back to feature list

feature_columns_bham = ['num_buildings', 'log_num_places', 'population'] + list(geo_features_bham) + list(condensed_category_columns_bham)

# Create new columns
for col in feature_columns_bham:
    all_data_cleaned_bham[f'lag_{col}'] = None

# Store average of adjacent features
average_features_bham = {f'lag_{col}': [] for col in feature_columns}

# For each row in the dataframe
for idx, row in all_data_cleaned_bham.iterrows():

    #Track progress
    if idx % 20 == 0:
        print(idx)
    # Get the six closest LSOAs computed earlier
    neighbor_indices = row['nearest_neighbors']
    
    # Calculate the average for each feature across those six
    for col in feature_columns_bham:
        avg_value = all_data_cleaned_bham.iloc[neighbor_indices][col].mean()
        average_features_bham[f'lag_{col}'].append(avg_value)

# Add the lagged features to the original GeoDataFrame
for col in average_features_bham:
    all_data_cleaned_bham[col] = average_features_bham[col]

# Display the result
all_data_cleaned_bham.head()