## Dissertation Dataset Exploration (v2)
### Gavin Rolls

I've queried OSM data using different building type indicators. In this notebook I try some additional modelling and exploration with that data.

## Imports and Clean Up

In [386]:
# Library Imports

#Standard Library (ast parses list strings, warnings is used for warning suppression)
import ast
import warnings

#Basics
import pandas as pd
import geopandas as gpd
import numpy as np

#Shapely
from shapely import wkt
import shapely.geometry
from shapely.geometry import Polygon, MultiPolygon

#Plots and Stats
import matplotlib.pyplot as plt
import seaborn as sns

#Machine Learning
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.preprocessing import OneHotEncoder

#ML from mljar-supervised
from supervised.automl import AutoML

In [181]:
# Import Cleaned Employment Geography File (Version 2)

empl_geog = gpd.read_file('data/combined_data_v2/empl_geog.geojson')

# Columns that hold geometries serialised as WKT text in the GeoJSON properties
wkt_columns = [
    'building_poly',
    'commercial_building_poly',
    'retail_building_poly',
    'office_building_poly',
    'residential_building_poly',
    'place_points',
]

# WKT strings back to Shapely geometries.
# Non-string values (e.g. missing entries for LSOAs without that building type) are left untouched.
for wkt_column in wkt_columns:
    empl_geog[wkt_column] = empl_geog[wkt_column].apply(
        lambda value: wkt.loads(value) if isinstance(value, str) else value
    )

# Convert string representations of lists back to lists.
# Relies on `ast` being imported in the imports cell; literal_eval only parses
# Python literals, so it does not execute arbitrary code from the file.
empl_geog['category_list'] = empl_geog['category_list'].apply(
    lambda value: ast.literal_eval(value) if isinstance(value, str) else value
)

empl_geog.head()

Unnamed: 0,LSOA11CD,LSOA11NM_x,population,Area,"01 : Crop and animal production, hunting and related service activities",02 : Forestry and logging,03 : Fishing and aquaculture,05 : Mining of coal and lignite,06 : Extraction of crude petroleum and natural gas,07 : Mining of metal ores,...,num_office_buildings,num_places,building_poly,commercial_building_poly,retail_building_poly,office_building_poly,residential_building_poly,place_points,category_list,geometry
0,E01000001,City of London 001A,1465,lsoa2011:E01000001 : City of London 001A,0,0,0,0,10,0,...,0,243,"MULTIPOLYGON (((-0.0983297 51.5158158, -0.0983...","MULTIPOLYGON (((-0.0966079 51.5147477, -0.0967...","MULTIPOLYGON (((-0.098412 51.5155493, -0.09835...",,"MULTIPOLYGON (((-0.0980623 51.5182674, -0.0983...","MULTIPOINT ((-0.0987982 51.5176724), (-0.09860...","[pub, coffee_shop, immigration_law, shared_off...","POLYGON ((-0.09729 51.52158, -0.09652 51.52027..."
1,E01000002,City of London 001B,1436,lsoa2011:E01000002 : City of London 001B,0,0,0,0,0,0,...,0,419,"MULTIPOLYGON (((-0.0945314 51.5147658, -0.0944...","MULTIPOLYGON (((-0.0951296 51.5145545, -0.0951...","MULTIPOLYGON (((-0.0940575 51.5145934, -0.0940...",,"MULTIPOLYGON (((-0.0936221 51.5181541, -0.0946...","MULTIPOINT ((-0.0958288 51.5168656), (-0.09565...","[business_manufacturing_and_supply, education,...","POLYGON ((-0.08813 51.51941, -0.08929 51.51752..."
2,E01000003,City of London 001C,1346,lsoa2011:E01000003 : City of London 001C,0,0,0,0,0,0,...,0,55,"MULTIPOLYGON (((-0.0959637 51.5209148, -0.0959...",,"POLYGON ((-0.0959633 51.5209414, -0.0955256 51...",,"MULTIPOLYGON (((-0.0959637 51.5209148, -0.0959...","MULTIPOINT ((-0.0976057 51.5226912), (-0.09757...","[health_and_medical, hospital, parking, indust...","POLYGON ((-0.09679 51.52325, -0.09647 51.52282..."
3,E01000005,City of London 001E,985,lsoa2011:E01000005 : City of London 001E,0,10,0,0,0,0,...,0,534,"MULTIPOLYGON (((-0.0757822 51.511071, -0.07577...","MULTIPOLYGON (((-0.0752739 51.5111313, -0.0754...","MULTIPOLYGON (((-0.0779263 51.5147252, -0.0780...","POLYGON ((-0.0731987 51.5118923, -0.0731919 51...",,"MULTIPOINT ((-0.0799241 51.5156172), (-0.07992...","[professional_services, dance_club, insurance_...","POLYGON ((-0.07323 51.51000, -0.07553 51.50974..."
4,E01000006,Barking and Dagenham 016A,1703,lsoa2011:E01000006 : Barking and Dagenham 016A,0,0,0,0,0,0,...,0,11,"MULTIPOLYGON (((0.085895 51.5386732, 0.0859413...",,,,,"MULTIPOINT ((0.0856198 51.538706), (0.0858424 ...","[landmark_and_historical_building, constructio...","POLYGON ((0.09115 51.53909, 0.09326 51.53787, ..."


In [182]:
# Load the one-hot encoded POI category counts, keyed by LSOA11CD
# NOTE(review): this path points at data/combined_data/, not data/combined_data_v2/
# like the geography file - confirm that the two files describe the same LSOAs.
encoded_categories = (
    pd.read_csv('data/combined_data/empl_geog_category_encoded.csv')
    .drop(columns=['Unnamed: 0'])  # leftover index column written by an earlier to_csv
)

# Everything after the leading LSOA11CD key column is a category count column
category_columns = encoded_categories.columns[1:]

encoded_categories.head()

Unnamed: 0,LSOA11CD,category_list_3d_printing_service,category_list_abortion_clinic,category_list_abuse_and_addiction_treatment,category_list_accommodation,category_list_accountant,category_list_acoustical_consultant,category_list_active_life,category_list_acupuncture,category_list_addiction_rehabilitation_center,...,category_list_women's_clothing_store,category_list_women's_health_clinic,category_list_wood_and_pulp,category_list_woodworking_supply_store,category_list_writing_service,category_list_yoga_instructor,category_list_yoga_studio,category_list_youth_organizations,category_list_zoo,category_list_None
0,E01000001,0.0,0.0,0.0,2.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,E01000002,0.0,0.0,0.0,1.0,5.0,0.0,0.0,1.0,0.0,...,3.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
2,E01000003,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,E01000005,0.0,0.0,0.0,1.0,2.0,0.0,2.0,1.0,0.0,...,3.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
4,E01000006,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [310]:
# Geometric Features Extraction (copied from earlier but includes building types now)

#I'm going to exclude num polygons, average perimeter, and total perimeter because they're not very helpful (the collinearity matrix shows they're very close to the building count)

def extract_multipolygon_features(multipolygon, lsoa_geometry):
    """Summarise one LSOA's building footprints as three area features.

    Parameters
    ----------
    multipolygon : shapely Polygon, MultiPolygon, or any other value
        Building footprint(s) for the LSOA. Anything that is neither a Polygon
        nor a MultiPolygon (e.g. None when the LSOA has no buildings of this
        type) produces all-zero features.
    lsoa_geometry : shapely geometry
        Boundary of the LSOA; only its ``.area`` is read, and only when
        ``multipolygon`` is a Polygon or MultiPolygon.

    Returns
    -------
    pd.Series
        Keys ``avg_building_area``, ``lsoa_area_ratio`` and ``total_area``,
        always in that (alphabetical) order.

    Notes
    -----
    Areas are taken straight from ``.area`` with no reprojection, so they are
    in the squared units of whatever CRS the input geometries use.
    ``lsoa_area_ratio`` is not clamped and may exceed 1.
    """
    # Both return paths share one key order. Previously the two branches listed
    # the keys in different orders, so the column order after DataFrame.apply
    # depended on which branches the data happened to hit. Alphabetical order
    # matches the columns shown when both branches occur.
    if isinstance(multipolygon, MultiPolygon):
        polygons = list(multipolygon.geoms)
    elif isinstance(multipolygon, Polygon):
        polygons = [multipolygon]
    else:
        return pd.Series({
            'avg_building_area': 0,
            'lsoa_area_ratio': 0,
            'total_area': 0,
        })

    num_polygons = len(polygons)
    total_area = sum(polygon.area for polygon in polygons)
    # An empty MultiPolygon has no parts, so guard the mean against 0 / 0
    avg_building_area = total_area / num_polygons if num_polygons > 0 else 0

    # The ratio divides by the LSOA's own area, so division by zero can only
    # happen when the LSOA geometry is empty/degenerate. Test the denominator
    # explicitly: a numpy float would yield inf instead of raising
    # ZeroDivisionError, so a try/except is not a reliable guard.
    lsoa_area = lsoa_geometry.area
    lsoa_area_ratio = total_area / lsoa_area if lsoa_area != 0 else 0

    return pd.Series({
        'avg_building_area': avg_building_area,
        'lsoa_area_ratio': lsoa_area_ratio,
        'total_area': total_area,
    })


In [314]:
# Geometry features per building type, each prefixed with the type name
def _geometry_features_for(poly_column, prefix):
    """Run extract_multipolygon_features on one footprint column of empl_geog, row by row."""
    per_lsoa = empl_geog.apply(
        lambda row: extract_multipolygon_features(row[poly_column], row['geometry']),
        axis=1,
    )
    return per_lsoa.add_prefix(prefix)


all_buildings_geometry_features = _geometry_features_for('building_poly', 'all_')
residential_buildings_geometry_features = _geometry_features_for('residential_building_poly', 'residential_')
commercial_buildings_geometry_features = _geometry_features_for('commercial_building_poly', 'commercial_')
office_buildings_geometry_features = _geometry_features_for('office_building_poly', 'office_')
retail_buildings_geometry_features = _geometry_features_for('retail_building_poly', 'retail_')

# Side-by-side join; every frame carries empl_geog's row index, so rows line up
all_geom_features = pd.concat(
    [
        all_buildings_geometry_features,
        residential_buildings_geometry_features,
        commercial_buildings_geometry_features,
        office_buildings_geometry_features,
        retail_buildings_geometry_features,
    ],
    axis=1,
)

# Names of the geometry feature columns, reused when assembling model inputs
geo_features = all_geom_features.columns

# Preview the first 50 rows
all_geom_features.head(50)

Unnamed: 0,all_avg_building_area,all_lsoa_area_ratio,all_total_area,residential_avg_building_area,residential_lsoa_area_ratio,residential_total_area,commercial_avg_building_area,commercial_lsoa_area_ratio,commercial_total_area,office_avg_building_area,office_lsoa_area_ratio,office_total_area,retail_avg_building_area,retail_lsoa_area_ratio,retail_total_area
0,6.954275e-07,0.966502,1.669026e-05,1.142252e-07,0.026458,4.569009e-07,2.656669e-07,0.076921,1.328335e-06,0.0,0.0,0.0,2.200346e-07,0.101934,1.760277e-06
1,4.409557e-07,0.722431,2.116588e-05,1.405123e-07,0.014388,4.215369e-07,2.245372e-07,0.09963,2.918983e-06,0.0,0.0,0.0,4.283602e-07,0.131587,3.855242e-06
2,7.60204e-07,1.126545,8.362244e-06,1.051174e-07,0.056645,4.204698e-07,0.0,0.0,0.0,0.0,0.0,0.0,1.375725e-07,0.018533,1.375725e-07
3,2.474802e-07,0.540976,1.336393e-05,0.0,0.0,0.0,3.214321e-07,0.065058,1.60716e-06,2.773609e-08,0.001123,2.773609e-08,3.156538e-07,0.127778,3.156538e-06
4,1.598686e-07,0.179683,3.35724e-06,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,7.216192e-08,0.275445,7.071869e-06,4.436179e-08,0.006911,1.774472e-07,0.0,0.0,0.0,2.032864e-07,0.007918,2.032864e-07,5.505318e-08,0.004289,1.101064e-07
6,1.2218e-07,0.209613,5.25374e-06,0.0,0.0,0.0,0.0,0.0,0.0,2.35158e-07,0.009382,2.35158e-07,2.354605e-08,0.000939,2.354605e-08
7,8.681309e-08,0.317807,5.295598e-06,0.0,0.0,0.0,0.0,0.0,0.0,5.894408e-08,0.021225,3.536645e-07,0.0,0.0,0.0
8,1.452742e-07,0.298902,1.35105e-05,5.623035e-08,0.007464,3.373821e-07,6.004655e-08,0.002657,1.200931e-07,1.248022e-07,0.057983,2.620846e-06,8.596902e-08,0.00951,4.298451e-07
9,1.955352e-08,0.243971,2.854813e-06,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [316]:
# Combine Datasets

# The geometry features were computed row-by-row from empl_geog, so they share
# its index and a positional (axis=1) concat is safe here.
data_with_geom = pd.concat([empl_geog, all_geom_features], axis=1)

# encoded_categories comes from a separate CSV, so join it on the LSOA code
# rather than by row position. A positional concat would silently pair the wrong
# rows if the two files ever differ in order or length, and it leaves two
# 'LSOA11CD' columns in the result. An inner join matches the later merges in
# this notebook; LSOAs missing from either side are dropped.
all_data = pd.merge(data_with_geom, encoded_categories, on='LSOA11CD', how='inner')

all_data.head()

Unnamed: 0,LSOA11CD,LSOA11NM_x,population,Area,"01 : Crop and animal production, hunting and related service activities",02 : Forestry and logging,03 : Fishing and aquaculture,05 : Mining of coal and lignite,06 : Extraction of crude petroleum and natural gas,07 : Mining of metal ores,...,category_list_women's_clothing_store,category_list_women's_health_clinic,category_list_wood_and_pulp,category_list_woodworking_supply_store,category_list_writing_service,category_list_yoga_instructor,category_list_yoga_studio,category_list_youth_organizations,category_list_zoo,category_list_None
0,E01000001,City of London 001A,1465,lsoa2011:E01000001 : City of London 001A,0,0,0,0,10,0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,E01000002,City of London 001B,1436,lsoa2011:E01000002 : City of London 001B,0,0,0,0,0,0,...,3.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
2,E01000003,City of London 001C,1346,lsoa2011:E01000003 : City of London 001C,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,E01000005,City of London 001E,985,lsoa2011:E01000005 : City of London 001E,0,10,0,0,0,0,...,3.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
4,E01000006,Barking and Dagenham 016A,1703,lsoa2011:E01000006 : Barking and Dagenham 016A,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Test AutoML with new Data

In [388]:
# Predictors: raw counts, building geometry features and every POI category count
features = ['num_buildings', 'num_places', 'population', *geo_features, *category_columns]
target = 'total_employment'

# Hold out 20% of LSOAs for testing (fixed seed so the split is repeatable)
X_train, X_test, y_train, y_test = train_test_split(
    all_data[features],
    all_data[target],
    test_size=0.2,
    random_state=42,
)

# AutoML writes its results under results_path. The recorded output shows that
# fit() is skipped when that folder already holds a fitted model.
automl = AutoML(
    results_path="automl_results_v2/basic_building_categories/",
    mode='Explain',
)
automl.fit(X_train, y_train)

# Score on the held-out LSOAs
predictions = automl.predict(X_test)
r2 = r2_score(y_test, predictions)
rmse = np.sqrt(mean_squared_error(y_test, predictions))

print(f'R^2 Score: {r2}')
print(f'RMSE: {rmse}')


Linear algorithm was disabled.
This model has already been fitted. You can use predict methods or select a new 'results_path' for a new 'fit()'.
R^2 Score: 0.6588749182919613
RMSE: 1485.5213970935156


## Filter POIs and Collapse Categories

In [327]:
# This work is currently stored in DataExploration.ipynb - some adaptation will be needed to migrate things over

# Overture places for London (name, category, confidence, point geometry per the preview below)
places_path = 'data/overture_data/london_places_overture.geojson'
all_places = gpd.read_file(places_path)
all_places.head()

Unnamed: 0,name,category,confidence,geometry
0,The Heating Professionals,hvac_services,0.77,POINT (-0.50991 51.28709)
1,Hub Property Care,plumbing,0.77,POINT (-0.50991 51.28709)
2,F & S,plumbing,0.72,POINT (-0.50911 51.28766)
3,Hookins Accountants,accountant,0.75,POINT (-0.50881 51.28878)
4,Muddy Paws Grooming Service,pet_groomer,0.55,POINT (-0.50741 51.28859)


In [332]:
#Keep only places whose confidence score is at least 0.6
is_quality_place = all_places['confidence'] >= 0.6
quality_places = all_places[is_quality_place]

In [338]:
#Also copied from original DataExploration file

#Dealing with filter warnings for empty geometries in particular LSOAs
warnings.filterwarnings("ignore", category=FutureWarning)

# Generating a version of empl_geog that has only quality places, starting from data_with_geom.
# Take a copy: a plain assignment would alias data_with_geom, so overwriting
# 'category_list' below would silently change data_with_geom for every later cell.
empl_geog_quality_places = data_with_geom.copy()

#Copy back quality places using logic from DataCleaning.ipynb

# Reset to an object column so each cell can hold a Python list
empl_geog_quality_places['category_list'] = None

for index, lsoa_geom in empl_geog['geometry'].items():
    # Places that touch or fall inside this LSOA's boundary.
    # Passing the polygon directly is equivalent to wrapping it in a one-element
    # GeoSeries and taking its union; the unused per-LSOA multipoint union was dropped.
    in_lsoa = quality_places.geometry.intersects(lsoa_geom)

    #Add place category information
    category_list = quality_places.loc[in_lsoa, 'category'].dropna().tolist()

    #Add back to Dataframe
    empl_geog_quality_places.at[index, 'category_list'] = category_list

empl_geog_quality_places.head()

Unnamed: 0,LSOA11CD,LSOA11NM_x,population,Area,"01 : Crop and animal production, hunting and related service activities",02 : Forestry and logging,03 : Fishing and aquaculture,05 : Mining of coal and lignite,06 : Extraction of crude petroleum and natural gas,07 : Mining of metal ores,...,residential_total_area,commercial_avg_building_area,commercial_lsoa_area_ratio,commercial_total_area,office_avg_building_area,office_lsoa_area_ratio,office_total_area,retail_avg_building_area,retail_lsoa_area_ratio,retail_total_area
0,E01000001,City of London 001A,1465,lsoa2011:E01000001 : City of London 001A,0,0,0,0,10,0,...,4.569009e-07,2.656669e-07,0.076921,1e-06,0.0,0.0,0.0,2.200346e-07,0.101934,1.760277e-06
1,E01000002,City of London 001B,1436,lsoa2011:E01000002 : City of London 001B,0,0,0,0,0,0,...,4.215369e-07,2.245372e-07,0.09963,3e-06,0.0,0.0,0.0,4.283602e-07,0.131587,3.855242e-06
2,E01000003,City of London 001C,1346,lsoa2011:E01000003 : City of London 001C,0,0,0,0,0,0,...,4.204698e-07,0.0,0.0,0.0,0.0,0.0,0.0,1.375725e-07,0.018533,1.375725e-07
3,E01000005,City of London 001E,985,lsoa2011:E01000005 : City of London 001E,0,10,0,0,0,0,...,0.0,3.214321e-07,0.065058,2e-06,2.773609e-08,0.001123,2.773609e-08,3.156538e-07,0.127778,3.156538e-06
4,E01000006,Barking and Dagenham 016A,1703,lsoa2011:E01000006 : Barking and Dagenham 016A,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [340]:
# One-Hot Encoding - yet again

# Explode category lists: one row per (LSOA, place category).
# Only the key and the category are needed here. Exploding the full frame would
# repeat every employment column and geometry once per place for no benefit.
# LSOAs with an empty list explode to a single NaN row, which the encoder turns
# into the 'category_list_nan' column.
empl_geog_quality_places_exploded = (
    empl_geog_quality_places[['LSOA11CD', 'category_list']]
    .explode('category_list')
    .reset_index(drop=True)
)

#Set up encoder
encoder = OneHotEncoder(sparse_output=False)

#Encode
encoded_categories_quality = encoder.fit_transform(empl_geog_quality_places_exploded[['category_list']])
encoded_df = pd.DataFrame(encoded_categories_quality, columns=encoder.get_feature_names_out(['category_list']))

# Attach the indicator columns to their LSOA key (both sides have a fresh 0..n-1 index);
# the raw category column is left out instead of being dropped in place afterwards
empl_geog_quality_places_exploded = pd.concat(
    [empl_geog_quality_places_exploded[['LSOA11CD']], encoded_df],
    axis=1,
)

# Reaggregate: group by LSOA11CD and sum the indicator columns to get per-LSOA counts
numeric_columns = encoded_df.columns.tolist()
empl_geog_quality_places_encoded = (
    empl_geog_quality_places_exploded
    .groupby('LSOA11CD')[numeric_columns]
    .sum()
    .reset_index()
)

#Save columns of interest for analysis (everything after the LSOA11CD key)
quality_categories = empl_geog_quality_places_encoded.columns[1:]

empl_geog_quality_places_encoded.head()


Unnamed: 0,LSOA11CD,category_list_3d_printing_service,category_list_abortion_clinic,category_list_abuse_and_addiction_treatment,category_list_accommodation,category_list_accountant,category_list_acoustical_consultant,category_list_active_life,category_list_acupuncture,category_list_addiction_rehabilitation_center,...,category_list_women's_clothing_store,category_list_women's_health_clinic,category_list_wood_and_pulp,category_list_woodworking_supply_store,category_list_writing_service,category_list_yoga_instructor,category_list_yoga_studio,category_list_youth_organizations,category_list_zoo,category_list_nan
0,E01000001,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,E01000002,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,...,3.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
2,E01000003,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,E01000005,0.0,0.0,0.0,1.0,2.0,0.0,2.0,1.0,0.0,...,2.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
4,E01000006,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [342]:
#Attach the per-LSOA quality category counts to the main table (inner join on the LSOA code)
all_data_quality_places = pd.merge(
    left=empl_geog_quality_places,
    right=empl_geog_quality_places_encoded,
    on='LSOA11CD',
    how='inner',
)

all_data_quality_places.head()

Unnamed: 0,LSOA11CD,LSOA11NM_x,population,Area,"01 : Crop and animal production, hunting and related service activities",02 : Forestry and logging,03 : Fishing and aquaculture,05 : Mining of coal and lignite,06 : Extraction of crude petroleum and natural gas,07 : Mining of metal ores,...,category_list_women's_clothing_store,category_list_women's_health_clinic,category_list_wood_and_pulp,category_list_woodworking_supply_store,category_list_writing_service,category_list_yoga_instructor,category_list_yoga_studio,category_list_youth_organizations,category_list_zoo,category_list_nan
0,E01000001,City of London 001A,1465,lsoa2011:E01000001 : City of London 001A,0,0,0,0,10,0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,E01000002,City of London 001B,1436,lsoa2011:E01000002 : City of London 001B,0,0,0,0,0,0,...,3.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
2,E01000003,City of London 001C,1346,lsoa2011:E01000003 : City of London 001C,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,E01000005,City of London 001E,985,lsoa2011:E01000005 : City of London 001E,0,10,0,0,0,0,...,2.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
4,E01000006,Barking and Dagenham 016A,1703,lsoa2011:E01000006 : Barking and Dagenham 016A,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [381]:
# Collapse Categories (Also Copied from DataExploration)

# Automated Condensing of Column Types

# NOTE(review): the input here is the unfiltered `encoded_categories`, not the
# confidence-filtered `empl_geog_quality_places_encoded` built earlier, although
# the section headings suggest the quality-filtered counts were intended.
# Behaviour is kept as-is; confirm the intent, then change this one assignment if needed.
categories_to_condense = encoded_categories

# List all columns
all_columns = categories_to_condense.columns.tolist()

# Group columns by 'suffix' (the last '_'-separated token of the lowercased name)
suffix_groups = {}

# Columns with no '_' at all (the LSOA11CD key) are passed through untouched.
# They used to share an 'other' bucket, where a real category ending in '_other'
# would have been summed together with the identifier column.
passthrough_columns = []

for column in all_columns:
    parts = column.lower().split('_')
    if len(parts) > 1:
        suffix_groups.setdefault(parts[-1], []).append(column)
    else:
        passthrough_columns.append(column)

# Create a list of DataFrames/Series to concatenate, key column(s) first so that
# later cells can keep treating column 0 as the identifier
concatenated_dfs = []
if passthrough_columns:
    concatenated_dfs.append(categories_to_condense[passthrough_columns])

# Iterate through suffix groups and aggregate columns
for suffix, columns in suffix_groups.items():
    if len(columns) == 1:
        # Preserve col name if only one col with suffix (minus the encoder prefix)
        new_column_name = columns[0].replace('category_list_', '')
        concatenated_dfs.append(
            categories_to_condense[columns].rename(columns={columns[0]: new_column_name})
        )
    else:
        # Several categories share the suffix: collapse them into one summed count
        new_column_name = f'all_{suffix}'
        concatenated_dfs.append(
            categories_to_condense[columns].sum(axis=1).rename(new_column_name)
        )

# Concatenate all DataFrames
condensed_categories = pd.concat(concatenated_dfs, axis=1)

# Print summary information (groups only)
for suffix, columns in suffix_groups.items():
    num_columns = len(columns)
    if num_columns > 1:
        print(f'Number of {suffix.capitalize()} Categories: {num_columns}')

condensed_categories.head()

Number of Service Categories: 81
Number of Clinic Categories: 8
Number of Accommodation Categories: 2
Number of Consultant Categories: 9
Number of Center Categories: 31
Number of Services Categories: 85
Number of Education Categories: 3
Number of Entertainment Categories: 2
Number of Store Categories: 100
Number of Agency Categories: 12
Number of Restaurant Categories: 110
Number of Program Categories: 2
Number of Supplier Categories: 25
Number of Production Categories: 6
Number of Dealer Categories: 14
Number of Manufacturer Categories: 8
Number of Repair Categories: 21
Number of Lounge Categories: 2
Number of Centers Categories: 2
Number of Contractor Categories: 5
Number of Medicine Categories: 6
Number of League Categories: 4
Number of Team Categories: 4
Number of Park Categories: 9
Number of Church Categories: 5
Number of Hospital Categories: 3
Number of Shelter Categories: 2
Number of Studio Categories: 7
Number of Agent Categories: 2
Number of Apartments Categories: 2
Number of 

Unnamed: 0,LSOA11CD,all_service,all_clinic,abuse_and_addiction_treatment,all_accommodation,accountant,all_consultant,active_life,acupuncture,all_center,...,waxing,welders,well_drilling,wholesale_grocer,wildlife_control,wildlife_sanctuary,wills_trusts_and_probate,winery,wood_and_pulp,None
0,E01000001,12.0,0.0,0.0,2.0,1.0,4.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,E01000002,21.0,0.0,0.0,1.0,5.0,4.0,0.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,E01000003,2.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,E01000005,26.0,0.0,0.0,1.0,2.0,2.0,2.0,1.0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,E01000006,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [348]:
#Drop every condensed category with fewer than five occurrences across London (arbitrary cutoff for now)

# Column totals over all LSOAs, skipping the leading LSOA11CD key column
column_sums = condensed_categories.iloc[:, 1:].sum()
is_rare = column_sums < 5
columns_to_drop = column_sums.index[is_rare].tolist()
print(columns_to_drop)

condensed_categories = condensed_categories.drop(columns=columns_to_drop)

#Remaining category columns, used as model features later
condensed_category_columns = condensed_categories.columns[1:]

condensed_categories.head()

['aesthetician', 'agricultural_cooperatives', 'allergist', 'anesthesiologist', 'apiaries_and_beekeepers', 'artificial_turf', 'ayurveda', 'b2b_machinery_and_tools', 'all_area', 'batting_cage', 'bicycle_sharing_location', 'bistro', 'business_financing', 'cabaret', 'car_auction', 'car_sharing', 'castle', 'chambers_of_commerce', 'coffee_roastery', 'community_gardens', 'convents_and_monasteries', 'customized_merchandise', 'drive_in_theater', 'empanadas', 'feng_shui', 'festival', 'fidelity_and_surety_bonds', 'firewood', 'forest', 'fort', 'gemstone_and_mineral', 'gents_tailor', 'glass_blowing', 'greengrocer', 'greenhouses', 'heliports', 'hydrotherapy', 'all_specialist', 'inn', 'irrigation', 'island', 'knife_sharpening', 'light_rail_and_subway_stations', 'macarons', 'marching_band', 'metal_materials_and_experts', 'mills', 'mission', 'mountain_bike_parks', 'mountain_bike_trails', 'office_of_vital_records', 'oncologist', 'orthotics', 'outdoor_movies', 'package_locker', 'pediatric_cardiology', 'p

Unnamed: 0,LSOA11CD,all_service,all_clinic,abuse_and_addiction_treatment,all_accommodation,accountant,all_consultant,active_life,acupuncture,all_center,...,veterinarian,videographer,vitamins_and_supplements,warehouses,waterproofing,waxing,wholesale_grocer,wildlife_sanctuary,wills_trusts_and_probate,winery
0,E01000001,12.0,0.0,0.0,2.0,1.0,4.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,E01000002,21.0,0.0,0.0,1.0,5.0,4.0,0.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,E01000003,2.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,E01000005,26.0,0.0,0.0,1.0,2.0,2.0,2.0,1.0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,E01000006,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [353]:
# Attach the condensed category counts to the geometry-enriched table (inner join on the LSOA code)
all_data_cleaned = pd.merge(
    left=data_with_geom,
    right=condensed_categories,
    on='LSOA11CD',
    how='inner',
)

all_data_cleaned.head()

Unnamed: 0,LSOA11CD,LSOA11NM_x,population,Area,"01 : Crop and animal production, hunting and related service activities",02 : Forestry and logging,03 : Fishing and aquaculture,05 : Mining of coal and lignite,06 : Extraction of crude petroleum and natural gas,07 : Mining of metal ores,...,veterinarian,videographer,vitamins_and_supplements,warehouses,waterproofing,waxing,wholesale_grocer,wildlife_sanctuary,wills_trusts_and_probate,winery
0,E01000001,City of London 001A,1465,lsoa2011:E01000001 : City of London 001A,0,0,0,0,10,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,E01000002,City of London 001B,1436,lsoa2011:E01000002 : City of London 001B,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,E01000003,City of London 001C,1346,lsoa2011:E01000003 : City of London 001C,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,E01000005,City of London 001E,985,lsoa2011:E01000005 : City of London 001E,0,10,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,E01000006,Barking and Dagenham 016A,1703,lsoa2011:E01000006 : Barking and Dagenham 016A,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Test Model with Collapsed and Quality/Occurrence Cleaned Categories

In [390]:

# Predictors: raw counts, building geometry features and the condensed category counts
features = ['num_buildings', 'num_places', 'population', *geo_features, *condensed_category_columns]
target = 'total_employment'

# Hold out 20% of LSOAs for testing (fixed seed so the split is repeatable)
X_train, X_test, y_train, y_test = train_test_split(
    all_data_cleaned[features],
    all_data_cleaned[target],
    test_size=0.2,
    random_state=42,
)

# AutoML writes its results under results_path. The recorded output shows that
# fit() is skipped when that folder already holds a fitted model.
automl = AutoML(
    results_path="automl_results_v2/raw_employment_category_cleaned/",
    mode='Explain',
)
automl.fit(X_train, y_train)

# Score on the held-out LSOAs
predictions = automl.predict(X_test)
r2 = r2_score(y_test, predictions)
rmse = np.sqrt(mean_squared_error(y_test, predictions))

print(f'R^2 Score: {r2}')
print(f'RMSE: {rmse}')


This model has already been fitted. You can use predict methods or select a new 'results_path' for a new 'fit()'.
R^2 Score: 0.65287891862071
RMSE: 1498.5201266686245


## Target Employment Density

In [358]:
# Employment density = total employment divided by LSOA area, used as a new model target.
# The area is measured after reprojecting to EPSG:27700 (presumably so it comes out in square metres - confirm).
lsoa_area = all_data_cleaned['geometry'].to_crs("EPSG:27700").area
all_data_cleaned['employment_density'] = all_data_cleaned['total_employment'] / lsoa_area


In [392]:

# Same predictors as before; the target is now employment per unit area
features = ['num_buildings', 'num_places', 'population', *geo_features, *condensed_category_columns]
target = 'employment_density'

# Hold out 20% of LSOAs for testing (fixed seed so the split is repeatable)
X_train, X_test, y_train, y_test = train_test_split(
    all_data_cleaned[features],
    all_data_cleaned[target],
    test_size=0.2,
    random_state=42,
)

# AutoML writes its results under results_path. The recorded output shows that
# fit() is skipped when that folder already holds a fitted model.
automl = AutoML(
    results_path="automl_results_v2/employment_density_category_cleaned/",
    mode='Explain',
)
automl.fit(X_train, y_train)

# Score on the held-out LSOAs (RMSE is in density units, not head counts)
predictions = automl.predict(X_test)
r2 = r2_score(y_test, predictions)
rmse = np.sqrt(mean_squared_error(y_test, predictions))

print(f'R^2 Score: {r2}')
print(f'RMSE: {rmse}')


This model has already been fitted. You can use predict methods or select a new 'results_path' for a new 'fit()'.
R^2 Score: 0.611789894148709
RMSE: 0.009895010757975303




## Target Office Worker Employment Density

In [369]:
# Industry columns treated as office work (list carried over from previous analysis)

office_work_cols = [
    '42 : Civil engineering',
    '58 : Publishing activities',
    '59 : Motion picture, video and television programme production, sound recording and music publishing activities',
    '60 : Programming and broadcasting activities',
    '61 : Telecommunications',
    '62 : Computer programming, consultancy and related activities',
    '63 : Information service activities',
    '64 : Financial service activities, except insurance and pension funding',
    '65 : Insurance, reinsurance and pension funding, except compulsory social security',
    '66 : Activities auxiliary to financial services and insurance activities',
    '68 : Real estate activities',
    '69 : Legal and accounting activities',
    '70 : Activities of head offices; management consultancy activities',
    '71 : Architectural and engineering activities; technical testing and analysis',
    '72 : Scientific research and development',
    '73 : Advertising and market research',
    '74 : Other professional, scientific and technical activities',
    '77 : Rental and leasing activities',
    '78 : Employment activities',
    '79 : Travel agency, tour operator and other reservation service and related activities',
    '80 : Security and investigation activities',
    '82 : Office administrative, office support and other business support activities',
    '84 : Public administration and defence; compulsory social security'
]

# Office head count per LSOA: row-wise sum over the office industry columns
office_headcount = all_data_cleaned[office_work_cols].sum(axis=1)
all_data_cleaned['office_total_employment'] = office_headcount

# Office employment density: head count over LSOA area measured in EPSG:27700
lsoa_area = all_data_cleaned['geometry'].to_crs("EPSG:27700").area
all_data_cleaned['office_employment_density'] = all_data_cleaned['office_total_employment'] / lsoa_area

all_data_cleaned.head()


Unnamed: 0,LSOA11CD,LSOA11NM_x,population,Area,"01 : Crop and animal production, hunting and related service activities",02 : Forestry and logging,03 : Fishing and aquaculture,05 : Mining of coal and lignite,06 : Extraction of crude petroleum and natural gas,07 : Mining of metal ores,...,warehouses,waterproofing,waxing,wholesale_grocer,wildlife_sanctuary,wills_trusts_and_probate,winery,employment_density,office_total_employment,office_employment_density
0,E01000001,City of London 001A,1465,lsoa2011:E01000001 : City of London 001A,0,0,0,0,10,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.10771,10800,0.081008
1,E01000002,City of London 001B,1436,lsoa2011:E01000002 : City of London 001B,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.201621,39550,0.174852
2,E01000003,City of London 001C,1346,lsoa2011:E01000003 : City of London 001C,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.013263,425,0.007417
3,E01000005,City of London 001E,985,lsoa2011:E01000005 : City of London 001E,0,10,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.119194,18070,0.094737
4,E01000006,Barking and Dagenham 016A,1703,lsoa2011:E01000006 : Barking and Dagenham 016A,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000728,5,3.5e-05


In [394]:
# Fit the AutoML model against office employment density

# Same predictors as before; the target is office employment per unit area
features = ['num_buildings', 'num_places', 'population', *geo_features, *condensed_category_columns]
target = 'office_employment_density'

# Hold out 20% of LSOAs for testing (fixed seed so the split is repeatable)
X_train, X_test, y_train, y_test = train_test_split(
    all_data_cleaned[features],
    all_data_cleaned[target],
    test_size=0.2,
    random_state=42,
)

# AutoML writes its results under results_path. The recorded output shows that
# fit() is skipped when that folder already holds a fitted model.
automl = AutoML(
    results_path="automl_results_v2/office_employment_density_category_cleaned/",
    mode='Explain',
)
automl.fit(X_train, y_train)

# Score on the held-out LSOAs (RMSE is in density units, not head counts)
predictions = automl.predict(X_test)
r2 = r2_score(y_test, predictions)
rmse = np.sqrt(mean_squared_error(y_test, predictions))

print(f'R^2 Score: {r2}')
print(f'RMSE: {rmse}')


This model has already been fitted. You can use predict methods or select a new 'results_path' for a new 'fit()'.
R^2 Score: 0.5788239598822451
RMSE: 0.006880789687387832


