# Modelling

This is the script where I store all my ML model runs

## Library Imports

In [3]:
# Library Imports

#Basics
import pandas as pd
import geopandas as gpd
import numpy as np
import csv

#Shapely / Spatial
from shapely import wkt
import shapely.geometry
from shapely.geometry import Polygon, MultiPolygon

#Machine Learning
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.preprocessing import OneHotEncoder

#ML from mljar-supervised
from supervised.automl import AutoML

#Warning Supression
import warnings

### Import Datasets

In [4]:
# Read London CSV
all_data_london = pd.read_csv("data/combined_data/lag/all_data_london_lag.csv")

# Read in feature column set
with open("data/combined_data/total_feature_columns_london.csv", 'r') as file:
    lines = file.readlines()

# Reconstruct
feature_columns_london_lag = [''.join(line.strip().split(',')) for line in lines]

# Create non-laged column set
feature_columns_london = [col for col in feature_columns_london_lag if not col.startswith('lag_')]

# ---

# Read Birmingham CSV
all_data_bham = pd.read_csv("data/combined_data/lag/all_data_bham_lag.csv")

# Read in feature column set
with open("data/combined_data/total_feature_columns_bham.csv", 'r') as file:
    lines = file.readlines()

# Reconstruct
feature_columns_bham_lag = [''.join(line.strip().split(',')) for line in lines]

# Create non-lagged column set
feature_columns_bham = [col for col in feature_columns_bham_lag if not col.startswith('lag_')]

# Fix null values ending up in logged variables
all_data_london.fillna(0)
all_data_bham.fillna(0)

Unnamed: 0.1,Unnamed: 0,LSOA11CD,LSOA11NM_x,LSOA11NM_y,Unnamed: 2,population,Area,"01 : Crop and animal production, hunting and related service activities",02 : Forestry and logging,03 : Fishing and aquaculture,...,lag_travel,lag_travel_agents,lag_trusts,lag_university_housing,lag_used_vintage_and_consignment,lag_veterinarian,lag_videographer,lag_vitamins_and_supplements,lag_warehouses,lag_window_washing
0,0,E01008881,Birmingham 067A,Birmingham 067A,0.0,1599,lsoa2011:E01008881 : Birmingham 067A,0.0,0.0,0.0,...,0.000000,0.166667,0.0,0.0,0.0,0.0,0.0,0.000000,0.333333,0.0
1,1,E01008882,Birmingham 066A,Birmingham 066A,0.0,1747,lsoa2011:E01008882 : Birmingham 066A,0.0,0.0,0.0,...,0.166667,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.500000,0.0
2,2,E01008883,Birmingham 078A,Birmingham 078A,0.0,1816,lsoa2011:E01008883 : Birmingham 078A,0.0,0.0,0.0,...,0.166667,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0
3,3,E01008884,Birmingham 078B,Birmingham 078B,0.0,1870,lsoa2011:E01008884 : Birmingham 078B,0.0,0.0,0.0,...,0.333333,0.000000,0.0,0.0,0.0,0.0,0.0,0.166667,0.500000,0.0
4,4,E01008885,Birmingham 076A,Birmingham 076A,0.0,1308,lsoa2011:E01008885 : Birmingham 076A,0.0,0.0,0.0,...,0.000000,0.166667,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
634,634,E01033646,Birmingham 031I,Birmingham 031I,0.0,1624,lsoa2011:E01033646 : Birmingham 031I,0.0,0.0,0.0,...,0.166667,0.000000,0.0,0.0,0.0,0.0,0.0,0.166667,0.000000,0.0
635,635,E01033647,Birmingham 058E,Birmingham 058E,0.0,1398,lsoa2011:E01033647 : Birmingham 058E,0.0,0.0,0.0,...,0.166667,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0
636,636,E01033648,Birmingham 084F,Birmingham 084F,0.0,2715,lsoa2011:E01033648 : Birmingham 084F,0.0,0.0,0.0,...,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0
637,637,E01033649,Birmingham 058F,Birmingham 058F,0.0,1801,lsoa2011:E01033649 : Birmingham 058F,0.0,0.0,0.0,...,0.333333,0.000000,0.0,0.0,0.0,0.0,0.0,0.166667,0.000000,0.0


In [5]:
# Create Lag (but no building cols) feature spaces

# List all building footprint related columns here
columns_to_remove = ['num_buildings','num_retail_buildings','num_residential_buildings','num_office_buildings','num_commercial_buildings',
                    'log_num_buildings','all_avg_building_area','all_lsoa_area_ratio','all_total_area',
                     'retail_avg_building_area','retail_lsoa_area_ratio','retail_total_area',
                     'residential_avg_building_area','residential_lsoa_area_ratio','residential_total_area',
                     'commercial_avg_building_area','commercial_lsoa_area_ratio','commercial_total_area',
                     'office_avg_building_area','office_lsoa_area_ratio','office_total_area',
                     'lag_num_retail_buildings','lag_num_residential_buildings','lag_num_office_buildings','lag_num_commercial_buildings',
                    'lag_log_num_buildings','lag_all_avg_building_area','lag_all_lsoa_area_ratio','lag_all_total_area',
                     'lag_retail_avg_building_area','lag_retail_lsoa_area_ratio','lag_retail_total_area',
                     'lag_residential_avg_building_area','lag_residential_lsoa_area_ratio','lag_residential_total_area',
                     'lag_commercial_avg_building_area','lag_commercial_lsoa_area_ratio','lag_commercial_total_area',
                     'lag_office_avg_building_area','lag_office_lsoa_area_ratio','lag_office_total_area',
                    ]

feature_columns_london_lag_poi = [col for col in feature_columns_london_lag if col not in columns_to_remove]

feature_columns_bham_lag_poi = [col for col in feature_columns_bham_lag if col not in columns_to_remove]

In [7]:
# Download POI & auxillary-only feature spaces (No building footprint information)

# London
# Read in POI feature column set
with open("data/combined_data/feature_columns_london_poi.csv", 'r') as file:
    lines = file.readlines()

# Reconstruct
feature_columns_london_poi = [''.join(line.strip().split(',')) for line in lines]

# Birmingham
# Read in POI feature column set
with open("data/combined_data/feature_columns_bham_poi.csv", 'r') as file:
    lines = file.readlines()

# Reconstruct
feature_columns_bham_poi = [''.join(line.strip().split(',')) for line in lines]

In [11]:
# Get log of density columns


min_non_zero_london_employment_density = all_data_london['employment_density'].loc[all_data_london['employment_density'] > 0].min()
min_non_zero_london_office_employment_density = all_data_london['office_employment_density'].loc[all_data_london['office_employment_density'] > 0].min()

epsilon_london_employment_density = min_non_zero_london_employment_density / 10
epsilon_london_office_employment_density = min_non_zero_london_office_employment_density / 10

all_data_london['log_employment_density'] = np.log(all_data_london['employment_density'].replace(0, epsilon_london_employment_density))
all_data_london['log_office_employment_density'] = np.log(all_data_london['office_employment_density'].replace(0, epsilon_london_office_employment_density))

# Birmingham
min_non_zero_bham_employment_density = all_data_bham['employment_density'].loc[all_data_bham['employment_density'] > 0].min()
min_non_zero_bham_office_employment_density = all_data_bham['office_employment_density'].loc[all_data_bham['office_employment_density'] > 0].min()

epsilon_bham_employment_density = min_non_zero_bham_employment_density / 10
epsilon_bham_office_employment_density = min_non_zero_bham_office_employment_density / 10

all_data_bham['log_employment_density'] = np.log(all_data_bham['employment_density'].replace(0, epsilon_bham_employment_density))
all_data_bham['log_office_employment_density'] = np.log(all_data_bham['office_employment_density'].replace(0, epsilon_bham_office_employment_density))


In [13]:
# Save Geometries and Names

london_geometries = all_data_london['geometry']
bham_geometries = all_data_bham['geometry']

london_names = all_data_london['LSOA11CD']
bham_names = all_data_bham['LSOA11CD']

### Test Model

In [8]:
# Ignore depracation warnings
warnings.filterwarnings('ignore')

# Create training and testing data
features = feature_columns_london_lag
target = 'log_total_employment'

# Split the dataset - 80/20 train test
X_train, X_test, y_train, y_test = train_test_split(all_data_london[features], all_data_london[target], test_size=0.2, random_state=3)

automl = AutoML(results_path="ml_results/dummy_models/test", mode='Explain')

y_train = y_train.astype(np.float32)
y_test = y_test.astype(np.float32)

# fit the model
automl.fit(X_train, y_train)

predictions = automl.predict(X_test)
predictions = predictions.astype(np.float32)

# Fix NaN values
y_test = y_test.fillna(0)

r2_log_employment = r2_score(y_test, predictions)
rmse_log_employment = np.sqrt(mean_squared_error(y_test, predictions))

print(f'R^2 Score: {r2_log_employment}')
print(f'RMSE: {rmse_log_employment}')

#Save results for plotting
predictions_all = automl.predict(all_data_london[features])

results_test = pd.DataFrame({
    'geometry': london_geometries,
    'observed': all_data_london[target],
    'predicted': predictions_all,
})

This model has already been fitted. You can use predict methods or select a new 'results_path' for a new 'fit()'.
R^2 Score: 0.7207009026018532
RMSE: 0.678412914276123


## Models

In [23]:
### Employment (log), NO Spatial Lag, London (1)

# Create training and testing data
features = feature_columns_london
target = 'log_total_employment'

# Split the dataset - 80/20 train test
X_train, X_test, y_train, y_test = train_test_split(all_data_london[features], all_data_london[target], test_size=0.2, random_state=3)

automl = AutoML(
    results_path ='ml_results/log_employment_london/',
    algorithms=["CatBoost", "Xgboost", "Random Forest"],
    model_time_limit=1*60,
    start_random_models=5,
    hill_climbing_steps=2,
    top_models_to_improve=3,
    features_selection=False,
    stack_models=False,
    train_ensemble=True,
    explain_level=1,
    validation_strategy={
        "validation_type": "kfold",
        "k_folds": 4,
        "shuffle": False,
        "stratify": True,
    }
)

# fit the model
automl.fit(X_train, y_train)

y_train = y_train.astype(np.float32)
y_test = y_test.astype(np.float32)

predictions = automl.predict(X_test)
predictions = predictions.astype(np.float32)

# Fix NaN values
y_test = y_test.fillna(0)

r2_1 = r2_score(y_test, predictions)
rmse_1 = np.sqrt(mean_squared_error(y_test, predictions))

print(f'R^2 Score: {r2_1}')
print(f'RMSE: {rmse_1}')

#Save results for plotting
predictions_all_1 = automl.predict(all_data_london[features])
london_geometries = all_data_london.loc[all_data_london[target].index, 'geometry']

results_london = pd.DataFrame({
    'geometry': london_geometries,
    'observed': all_data_london[target],
    'predicted': predictions_all_1,
})

results_london.to_csv("data/combined_data/model_results_london.csv")



This model has already been fitted. You can use predict methods or select a new 'results_path' for a new 'fit()'.


 [ 6.80946013]
 [ 4.38591787]
 [ 5.83407018]
 [ 3.702589  ]
 [ 6.0516915 ]
 [ 5.03109929]
 [ 6.84453532]
 [ 5.79870698]
 [ 8.45781323]
 [ 5.4395375 ]
 [ 5.67026278]
 [ 7.32026383]
 [ 4.35279486]
 [ 4.23863384]
 [ 5.93759248]
 [ 4.83369157]
 [ 5.55383155]
 [ 4.33399865]
 [ 6.14339587]
 [ 7.94330499]
 [ 6.09279343]
 [ 5.17478511]
 [ 6.77350041]
 [ 4.97250935]
 [ 4.492506  ]
 [ 5.84509274]
 [ 4.67164895]
 [ 5.25216767]
 [ 5.09976909]
 [ 7.9933624 ]
 [ 5.35254857]
 [ 4.51752016]
 [ 5.41707036]
 [ 4.64347693]
 [ 4.0013468 ]
 [ 5.93353793]
 [ 5.76821181]
 [ 4.68915936]
 [ 6.39604136]
 [ 6.37537715]
 [ 5.72583863]
 [ 4.98765799]
 [ 7.27753064]
 [ 5.69480273]
 [ 6.68142554]
 [ 5.30826709]
 [ 6.96415707]
 [ 4.73020408]
 [ 4.68304488]
 [ 4.88760564]
 [ 5.36963984]
 [ 5.51725766]
 [ 5.62433955]
 [ 5.37293002]
 [ 6.8143892 ]
 [ 7.36085126]
 [ 6.54017779]
 [ 4.41897103]
 [ 7.20856807]
 [ 7.79765269]
 [ 6.64372107]
 [ 5.58249041]
 [ 6.61215302]
 [ 5.98662946]
 [ 5.95831963]
 [ 6.2509236 ]
 [ 5.25161

R^2 Score: 0.7578170639491384
RMSE: 0.6317294239997864


 [8.11968228]
 [5.98403355]
 ...
 [4.90988776]
 [6.04245421]
 [5.33173081]]' has dtype incompatible with float32, please explicitly cast to a compatible dtype first.
 [7.97952744]
 [5.96857115]
 ...
 [4.9867284 ]
 [6.20635363]
 [5.65408465]]' has dtype incompatible with float32, please explicitly cast to a compatible dtype first.
 [7.47653672]
 [6.47881743]
 ...
 [5.14291188]
 [6.16592261]
 [5.31699988]]' has dtype incompatible with float32, please explicitly cast to a compatible dtype first.
 [8.05625007]
 [6.02841184]
 ...
 [5.3221688 ]
 [6.05186745]
 [5.21711442]]' has dtype incompatible with float32, please explicitly cast to a compatible dtype first.
 [8.13455531]
 [6.6509175 ]
 ...
 [4.79327437]
 [6.23482653]
 [5.33121487]]' has dtype incompatible with float32, please explicitly cast to a compatible dtype first.
 [7.81974694]
 [6.73709199]
 ...
 [4.69893953]
 [6.38250491]
 [5.52998397]]' has dtype incompatible with float32, please explicitly cast to a compatible dtype first.
 [7.

In [40]:
### Employment (log), NO Spatial Lag, Birmingham (2)

# Create training and testing data
features = feature_columns_bham
target = 'log_total_employment'

# Split the dataset - 80/20 train test
X_train, X_test, y_train, y_test = train_test_split(all_data_bham[features], all_data_bham[target], test_size=0.2, random_state=3)

automl = AutoML(
    results_path ='ml_results/log_employment_bham/',
    algorithms=["CatBoost", "Xgboost", "Random Forest"],
    model_time_limit=1*60,
    start_random_models=5,
    hill_climbing_steps=2,
    top_models_to_improve=3,
    features_selection=False,
    stack_models=False,
    train_ensemble=True,
    explain_level=1,
    validation_strategy={
        "validation_type": "kfold",
        "k_folds": 4,
        "shuffle": False,
        "stratify": True,
    }
)

# fit the model
automl.fit(X_train, y_train)

y_train = y_train.astype(np.float32)
y_test = y_test.astype(np.float32)

predictions = automl.predict(X_test)
predictions = predictions.astype(np.float32)

# Fix NaN values
y_test = y_test.fillna(0)

r2_2 = r2_score(y_test, predictions)
rmse_2 = np.sqrt(mean_squared_error(y_test, predictions))

print(f'R^2 Score: {r2_2}')
print(f'RMSE: {rmse_2}')

#Save results for plotting
predictions_all_2 = automl.predict(all_data_bham[features])

results_office_density_cleaned_perform = pd.DataFrame({
    'geometry': bham_geometries,
    'observed': all_data_bham[target],
    'predicted': predictions_all_2,
})


AutoML directory: ml_results/log_employment_bham/
The task is regression with evaluation metric rmse
AutoML will use algorithms: ['CatBoost', 'Xgboost', 'Random Forest']
AutoML will ensemble available models
AutoML steps: ['simple_algorithms', 'default_algorithms', 'not_so_random', 'hill_climbing_1', 'hill_climbing_2', 'ensemble']
Skip simple_algorithms because no parameters were generated.
* Step default_algorithms will try to check up to 3 models
1_Default_Xgboost rmse 0.803686 trained in 5.65 seconds
2_Default_CatBoost rmse 0.783334 trained in 18.54 seconds
3_Default_RandomForest rmse 0.821087 trained in 24.78 seconds
* Step not_so_random will try to check up to 12 models
4_Xgboost rmse 0.807126 trained in 5.22 seconds
8_CatBoost rmse 0.810248 trained in 22.31 seconds
12_RandomForest rmse 0.816138 trained in 57.95 seconds
5_Xgboost rmse 0.808929 trained in 5.8 seconds
9_CatBoost rmse 0.830576 trained in 20.01 seconds
13_RandomForest rmse 0.848962 trained in 56.66 seconds
6_Xgboost r

In [41]:
### Employment (log), Spatial Lag, London (3)

# Create training and testing data
features = feature_columns_london_lag
target = 'log_total_employment'

# Split the dataset - 80/20 train test
X_train, X_test, y_train, y_test = train_test_split(all_data_london[features], all_data_london[target], test_size=0.2, random_state=3)

automl = AutoML(
    results_path ='ml_results/log_employment_london_lag/',
    algorithms=["CatBoost", "Xgboost", "Random Forest"],
    model_time_limit=1*60,
    start_random_models=5,
    hill_climbing_steps=2,
    top_models_to_improve=3,
    features_selection=False,
    stack_models=False,
    train_ensemble=True,
    explain_level=1,
    validation_strategy={
        "validation_type": "kfold",
        "k_folds": 4,
        "shuffle": False,
        "stratify": True,
    }
)

# fit the model
automl.fit(X_train, y_train)

y_train = y_train.astype(np.float32)
y_test = y_test.astype(np.float32)

predictions = automl.predict(X_test)
predictions = predictions.astype(np.float32)

# Fix NaN values
y_test = y_test.fillna(0)

r2_3 = r2_score(y_test, predictions)
rmse_3 = np.sqrt(mean_squared_error(y_test, predictions))

print(f'R^2 Score: {r2_3}')
print(f'RMSE: {rmse_3}')

#Save results for plotting
predictions_all_3 = automl.predict(all_data_london[features])

results_office_density_cleaned_perform = pd.DataFrame({
    'geometry': london_geometries,
    'observed': all_data_london[target],
    'predicted': predictions_all_3,
})


AutoML directory: ml_results/log_employment_london_lag/
The task is regression with evaluation metric rmse
AutoML will use algorithms: ['CatBoost', 'Xgboost', 'Random Forest']
AutoML will ensemble available models
AutoML steps: ['simple_algorithms', 'default_algorithms', 'not_so_random', 'hill_climbing_1', 'hill_climbing_2', 'ensemble']
Skip simple_algorithms because no parameters were generated.
* Step default_algorithms will try to check up to 3 models
1_Default_Xgboost rmse 0.638593 trained in 1086.36 seconds
2_Default_CatBoost rmse 0.63385 trained in 2029.01 seconds
3_Default_RandomForest rmse 0.693922 trained in 2068.46 seconds
Skip not_so_random because of the time limit.
Skip hill_climbing_1 because of the time limit.
Skip hill_climbing_2 because of the time limit.
* Step ensemble will try to check up to 1 model
Ensemble rmse 0.627695 trained in 0.05 seconds
AutoML fit time: 5186.08 seconds
AutoML best model: Ensemble
R^2 Score: 0.7547717661754779
RMSE: 0.635688841342926


In [22]:
### Employment (log), Spatial Lag, Birmingham (4)

# Create training and testing data
features = feature_columns_bham_lag
target = 'log_total_employment'

# Split the dataset - 80/20 train test
X_train, X_test, y_train, y_test = train_test_split(all_data_bham[features], all_data_bham[target], test_size=0.2, random_state=3)

automl = AutoML(
    results_path ='ml_results/log_employment_bham_lag/',
    algorithms=["CatBoost", "Xgboost", "Random Forest"],
    model_time_limit=1*60,
    start_random_models=5,
    hill_climbing_steps=2,
    top_models_to_improve=3,
    features_selection=False,
    stack_models=False,
    train_ensemble=True,
    explain_level=1,
    validation_strategy={
        "validation_type": "kfold",
        "k_folds": 4,
        "shuffle": False,
        "stratify": True,
    }
)

# fit the model
automl.fit(X_train, y_train)

y_train = y_train.astype(np.float32)
y_test = y_test.astype(np.float32)

predictions = automl.predict(X_test)
predictions = predictions.astype(np.float32)

# Fix NaN values
y_test = y_test.fillna(0)

r2_4 = r2_score(y_test, predictions)
rmse_4 = np.sqrt(mean_squared_error(y_test, predictions))

print(f'R^2 Score: {r2_4}')
print(f'RMSE: {rmse_4}')

#Save results for plotting
predictions_all_4 = automl.predict(all_data_bham[features])

results_office_density_cleaned_perform = pd.DataFrame({
    'geometry': bham_geometries,
    'observed': all_data_bham[target],
    'predicted': predictions_all_4,
})




This model has already been fitted. You can use predict methods or select a new 'results_path' for a new 'fit()'.
R^2 Score: 0.6180309508339799
RMSE: 0.7356722354888916


In [21]:
### Employment Density, NO Spatial Lag, London (5)

# Create training and testing data
features = feature_columns_london
target = 'log_employment_density'

# Split the dataset - 80/20 train test
X_train, X_test, y_train, y_test = train_test_split(all_data_london[features], all_data_london[target], test_size=0.2, random_state=3)

automl = AutoML(
    results_path ='ml_results/employment_density_london/',
    algorithms=["CatBoost", "Xgboost", "Random Forest"],
    model_time_limit=1*60,
    start_random_models=5,
    hill_climbing_steps=2,
    top_models_to_improve=3,
    features_selection=False,
    stack_models=False,
    train_ensemble=True,
    explain_level=1,
    validation_strategy={
        "validation_type": "kfold",
        "k_folds": 4,
        "shuffle": False,
        "stratify": True,
    }
)

# fit the model
automl.fit(X_train, y_train)

y_train = y_train.astype(np.float32)
y_test = y_test.astype(np.float32)

predictions = automl.predict(X_test)
predictions = predictions.astype(np.float32)

# Fix NaN values
y_test = y_test.fillna(0)

r2_5 = r2_score(y_test, predictions)
rmse_5 = np.sqrt(mean_squared_error(y_test, predictions))

print(f'R^2 Score: {r2_5}')
print(f'RMSE: {rmse_5}')

#Save results for plotting
predictions_all_5 = automl.predict(all_data_london[features])

results_london_density = pd.DataFrame({
    'geometry': london_geometries,
    'observed': all_data_london[target],
    'predicted': predictions_all_5,
})

results_london_density.to_csv("data/combined_data/model_results_london_density.csv")

This model has already been fitted. You can use predict methods or select a new 'results_path' for a new 'fit()'.
R^2 Score: 0.7560692195505975
RMSE: 0.6752620339393616


In [68]:
### Employment Density, NO Spatial Lag, Birmingham (6)

# Create training and testing data
features = feature_columns_bham
target = 'log_employment_density'

# Split the dataset - 80/20 train test
X_train, X_test, y_train, y_test = train_test_split(all_data_bham[features], all_data_bham[target], test_size=0.2, random_state=3)

automl = AutoML(
    results_path ='ml_results/employment_density_bham/',
    algorithms=["CatBoost", "Xgboost", "Random Forest"],
    model_time_limit=1*60,
    start_random_models=5,
    hill_climbing_steps=2,
    top_models_to_improve=3,
    features_selection=False,
    stack_models=False,
    train_ensemble=True,
    explain_level=1,
    validation_strategy={
        "validation_type": "kfold",
        "k_folds": 4,
        "shuffle": False,
        "stratify": True,
    }
)

# fit the model
automl.fit(X_train, y_train)

y_train = y_train.astype(np.float32)
y_test = y_test.astype(np.float32)

predictions = automl.predict(X_test)
predictions = predictions.astype(np.float32)

# Fix NaN values
y_test = y_test.fillna(0)

r2_6 = r2_score(y_test, predictions)
rmse_6 = np.sqrt(mean_squared_error(y_test, predictions))

print(f'R^2 Score: {r2_6}')
print(f'RMSE: {rmse_6}')

#Save results for plotting
predictions_all_6 = automl.predict(all_data_bham[features])

results_office_density_cleaned_perform = pd.DataFrame({
    'geometry': bham_geometries,
    'observed': all_data_bham[target],
    'predicted': predictions_all_6,
})


AutoML directory: ml_results/employment_density_bham/
The task is regression with evaluation metric rmse
AutoML will use algorithms: ['CatBoost', 'Xgboost', 'Random Forest']
AutoML will ensemble available models
AutoML steps: ['simple_algorithms', 'default_algorithms', 'not_so_random', 'hill_climbing_1', 'hill_climbing_2', 'ensemble']
Skip simple_algorithms because no parameters were generated.
* Step default_algorithms will try to check up to 3 models
1_Default_Xgboost rmse 0.944998 trained in 6.53 seconds
2_Default_CatBoost rmse 0.898496 trained in 19.96 seconds
3_Default_RandomForest rmse 0.910195 trained in 20.69 seconds
* Step not_so_random will try to check up to 12 models
4_Xgboost rmse 0.92632 trained in 6.46 seconds
8_CatBoost rmse 0.919499 trained in 25.68 seconds
12_RandomForest rmse 0.917293 trained in 22.01 seconds
5_Xgboost rmse 0.968809 trained in 7.21 seconds
9_CatBoost rmse 0.918586 trained in 29.07 seconds
13_RandomForest rmse 0.92085 trained in 21.78 seconds
6_Xgboos

In [69]:
### Employment Density, Spatial Lag, London (7)

# Create training and testing data
features = feature_columns_london_lag
target = 'log_employment_density'

# Split the dataset - 80/20 train test
X_train, X_test, y_train, y_test = train_test_split(all_data_london[features], all_data_london[target], test_size=0.2, random_state=3)

automl = AutoML(
    results_path ='ml_results/employment_density_london_lag/',
    algorithms=["CatBoost", "Xgboost", "Random Forest"],
    model_time_limit=1*60,
    start_random_models=5,
    hill_climbing_steps=2,
    top_models_to_improve=3,
    features_selection=False,
    stack_models=False,
    train_ensemble=True,
    explain_level=1,
    validation_strategy={
        "validation_type": "kfold",
        "k_folds": 4,
        "shuffle": False,
        "stratify": True,
    }
)

# fit the model
automl.fit(X_train, y_train)

y_train = y_train.astype(np.float32)
y_test = y_test.astype(np.float32)

predictions = automl.predict(X_test)
predictions = predictions.astype(np.float32)

# Fix NaN values
y_test = y_test.fillna(0)

r2_7 = r2_score(y_test, predictions)
rmse_7 = np.sqrt(mean_squared_error(y_test, predictions))

print(f'R^2 Score: {r2_7}')
print(f'RMSE: {rmse_7}')

#Save results for plotting
predictions_all_7 = automl.predict(all_data_london[features])

results_office_density_cleaned_perform = pd.DataFrame({
    'geometry': london_geometries,
    'observed': all_data_london[target],
    'predicted': predictions_all_7,
})


AutoML directory: ml_results/employment_density_london_lag/
The task is regression with evaluation metric rmse
AutoML will use algorithms: ['CatBoost', 'Xgboost', 'Random Forest']
AutoML will ensemble available models
AutoML steps: ['simple_algorithms', 'default_algorithms', 'not_so_random', 'hill_climbing_1', 'hill_climbing_2', 'ensemble']
Skip simple_algorithms because no parameters were generated.
* Step default_algorithms will try to check up to 3 models
1_Default_Xgboost rmse 0.657881 trained in 215.07 seconds
2_Default_CatBoost rmse 0.642791 trained in 266.85 seconds
3_Default_RandomForest rmse 0.757284 trained in 138.25 seconds
* Step not_so_random will try to check up to 12 models
4_Xgboost rmse 0.669616 trained in 207.17 seconds
8_CatBoost rmse 0.64591 trained in 322.24 seconds
12_RandomForest rmse 0.762435 trained in 209.54 seconds
5_Xgboost rmse 0.675766 trained in 263.41 seconds
9_CatBoost rmse 0.676002 trained in 333.31 seconds
13_RandomForest rmse 0.79277 trained in 200.3

In [70]:
### Employment Density, Spatial Lag, Birmingham (8)

# Create training and testing data
features = feature_columns_bham_lag
target = 'log_employment_density'

# Split the dataset - 80/20 train test
X_train, X_test, y_train, y_test = train_test_split(all_data_bham[features], all_data_bham[target], test_size=0.2, random_state=3)

automl = AutoML(
    results_path ='ml_results/employment_density_bham_lag/',
    algorithms=["CatBoost", "Xgboost", "Random Forest"],
    model_time_limit=1*60,
    start_random_models=5,
    hill_climbing_steps=2,
    top_models_to_improve=3,
    features_selection=False,
    stack_models=False,
    train_ensemble=True,
    explain_level=1,
    validation_strategy={
        "validation_type": "kfold",
        "k_folds": 4,
        "shuffle": False,
        "stratify": True,
    }
)

# fit the model
automl.fit(X_train, y_train)

y_train = y_train.astype(np.float32)
y_test = y_test.astype(np.float32)

predictions = automl.predict(X_test)
predictions = predictions.astype(np.float32)

# Fix NaN values
y_test = y_test.fillna(0)

r2_8 = r2_score(y_test, predictions)
rmse_8 = np.sqrt(mean_squared_error(y_test, predictions))

print(f'R^2 Score: {r2_8}')
print(f'RMSE: {rmse_8}')

#Save results for plotting
predictions_all_8 = automl.predict(all_data_bham[features])

results_office_density_cleaned_perform = pd.DataFrame({
    'geometry': bham_geometries,
    'observed': all_data_bham[target],
    'predicted': predictions_all_8,
})


AutoML directory: ml_results/employment_density_bham_lag/
The task is regression with evaluation metric rmse
AutoML will use algorithms: ['CatBoost', 'Xgboost', 'Random Forest']
AutoML will ensemble available models
AutoML steps: ['simple_algorithms', 'default_algorithms', 'not_so_random', 'hill_climbing_1', 'hill_climbing_2', 'ensemble']
Skip simple_algorithms because no parameters were generated.
* Step default_algorithms will try to check up to 3 models
1_Default_Xgboost rmse 0.917768 trained in 13.2 seconds
2_Default_CatBoost rmse 0.896484 trained in 48.55 seconds
3_Default_RandomForest rmse 0.903104 trained in 46.17 seconds
* Step not_so_random will try to check up to 12 models
4_Xgboost rmse 0.873724 trained in 11.28 seconds
8_CatBoost rmse 0.891091 trained in 54.12 seconds
12_RandomForest rmse 0.912287 trained in 52.29 seconds
5_Xgboost rmse 0.956213 trained in 10.76 seconds
9_CatBoost rmse 0.903869 trained in 50.05 seconds
13_RandomForest rmse 0.931725 trained in 40.16 seconds


In [71]:
### Office Employment Density , NO Spatial Lag, London (9)

# Create training and testing data
features = feature_columns_london
target = 'log_office_employment_density'

# Split the dataset - 80/20 train test
X_train, X_test, y_train, y_test = train_test_split(all_data_london[features], all_data_london[target], test_size=0.2, random_state=3)

automl = AutoML(
    results_path ='ml_results/office_employment_density_london/',
    algorithms=["CatBoost", "Xgboost", "Random Forest"],
    model_time_limit=1*60,
    start_random_models=5,
    hill_climbing_steps=2,
    top_models_to_improve=3,
    features_selection=False,
    stack_models=False,
    train_ensemble=True,
    explain_level=1,
    validation_strategy={
        "validation_type": "kfold",
        "k_folds": 4,
        "shuffle": False,
        "stratify": True,
    }
)

# fit the model
automl.fit(X_train, y_train)

y_train = y_train.astype(np.float32)
y_test = y_test.astype(np.float32)

predictions = automl.predict(X_test)
predictions = predictions.astype(np.float32)

# Fix NaN values
y_test = y_test.fillna(0)

r2_9 = r2_score(y_test, predictions)
rmse_9 = np.sqrt(mean_squared_error(y_test, predictions))

print(f'R^2 Score: {r2_9}')
print(f'RMSE: {rmse_9}')

#Save results for plotting
predictions_all_9 = automl.predict(all_data_london[features])

results_office_density_cleaned_perform = pd.DataFrame({
    'geometry': london_geometries,
    'observed': all_data_london[target],
    'predicted': predictions_all_9,
})


AutoML directory: ml_results/office_employment_density_london/
The task is regression with evaluation metric rmse
AutoML will use algorithms: ['CatBoost', 'Xgboost', 'Random Forest']
AutoML will ensemble available models
AutoML steps: ['simple_algorithms', 'default_algorithms', 'not_so_random', 'hill_climbing_1', 'hill_climbing_2', 'ensemble']
Skip simple_algorithms because no parameters were generated.
* Step default_algorithms will try to check up to 3 models
1_Default_Xgboost rmse 1.504599 trained in 45.38 seconds
2_Default_CatBoost rmse 1.49624 trained in 50.4 seconds
3_Default_RandomForest rmse 1.580713 trained in 54.52 seconds
* Step not_so_random will try to check up to 12 models
4_Xgboost rmse 1.519549 trained in 41.03 seconds
8_CatBoost rmse 1.49358 trained in 73.17 seconds
12_RandomForest rmse 1.579707 trained in 61.09 seconds
5_Xgboost rmse 1.52396 trained in 39.45 seconds
9_CatBoost rmse 1.529924 trained in 62.62 seconds
13_RandomForest rmse 1.599111 trained in 43.35 second

In [72]:
### Office Employment Density , NO Spatial Lag, Birmingham (10)

# Create training and testing data
features = feature_columns_bham
target = 'log_office_employment_density'

# Split the dataset - 80/20 train test
X_train, X_test, y_train, y_test = train_test_split(all_data_bham[features], all_data_bham[target], test_size=0.2, random_state=3)

automl = AutoML(
    results_path ='ml_results/office_employment_density_bham/',
    algorithms=["CatBoost", "Xgboost", "Random Forest"],
    model_time_limit=1*60,
    start_random_models=5,
    hill_climbing_steps=2,
    top_models_to_improve=3,
    features_selection=False,
    stack_models=False,
    train_ensemble=True,
    explain_level=1,
    validation_strategy={
        "validation_type": "kfold",
        "k_folds": 4,
        "shuffle": False,
        "stratify": True,
    }
)

# fit the model
automl.fit(X_train, y_train)

y_train = y_train.astype(np.float32)
y_test = y_test.astype(np.float32)

predictions = automl.predict(X_test)
predictions = predictions.astype(np.float32)

# Fix NaN values
y_test = y_test.fillna(0)

r2_10 = r2_score(y_test, predictions)
rmse_10 = np.sqrt(mean_squared_error(y_test, predictions))

print(f'R^2 Score: {r2_10}')
print(f'RMSE: {rmse_10}')

#Save results for plotting
predictions_all_10 = automl.predict(all_data_bham[features])

results_office_density_cleaned_perform = pd.DataFrame({
    'geometry': bham_geometries,
    'observed': all_data_bham[target],
    'predicted': predictions_all_10,
})


AutoML directory: ml_results/office_employment_density_bham/
The task is regression with evaluation metric rmse
AutoML will use algorithms: ['CatBoost', 'Xgboost', 'Random Forest']
AutoML will ensemble available models
AutoML steps: ['simple_algorithms', 'default_algorithms', 'not_so_random', 'hill_climbing_1', 'hill_climbing_2', 'ensemble']
Skip simple_algorithms because no parameters were generated.
* Step default_algorithms will try to check up to 3 models
1_Default_Xgboost rmse 2.101584 trained in 5.72 seconds
2_Default_CatBoost rmse 2.049278 trained in 16.15 seconds
3_Default_RandomForest rmse 2.06467 trained in 18.63 seconds
* Step not_so_random will try to check up to 12 models
4_Xgboost rmse 2.121226 trained in 5.28 seconds
8_CatBoost rmse 2.067713 trained in 17.37 seconds
12_RandomForest rmse 2.041724 trained in 20.69 seconds
5_Xgboost rmse 2.16027 trained in 5.37 seconds
9_CatBoost rmse 2.133641 trained in 16.67 seconds
13_RandomForest rmse 2.028177 trained in 16.64 seconds
6

In [73]:
### Office Employment Density, Spatial Lag, London (11)

# Create training and testing data
features = feature_columns_london_lag
target = 'log_office_employment_density'

# Split the dataset - 80/20 train test
X_train, X_test, y_train, y_test = train_test_split(all_data_london[features], all_data_london[target], test_size=0.2, random_state=3)

automl = AutoML(
    results_path ='ml_results/office_employment_density_london_lag/',
    algorithms=["CatBoost", "Xgboost", "Random Forest"],
    model_time_limit=1*60,
    start_random_models=5,
    hill_climbing_steps=2,
    top_models_to_improve=3,
    features_selection=False,
    stack_models=False,
    train_ensemble=True,
    explain_level=1,
    validation_strategy={
        "validation_type": "kfold",
        "k_folds": 4,
        "shuffle": False,
        "stratify": True,
    }
)

# fit the model
automl.fit(X_train, y_train)

y_train = y_train.astype(np.float32)
y_test = y_test.astype(np.float32)

predictions = automl.predict(X_test)
predictions = predictions.astype(np.float32)

# Fix NaN values
y_test = y_test.fillna(0)

r2_11 = r2_score(y_test, predictions)
rmse_11 = np.sqrt(mean_squared_error(y_test, predictions))

print(f'R^2 Score: {r2_11}')
print(f'RMSE: {rmse_11}')

#Save results for plotting
predictions_all_11 = automl.predict(all_data_london[features])

results_office_density_cleaned_perform = pd.DataFrame({
    'geometry': london_geometries,
    'observed': all_data_london[target],
    'predicted': predictions_all_11,
})


AutoML directory: ml_results/office_employment_density_london_lag/
The task is regression with evaluation metric rmse
AutoML will use algorithms: ['CatBoost', 'Xgboost', 'Random Forest']
AutoML will ensemble available models
AutoML steps: ['simple_algorithms', 'default_algorithms', 'not_so_random', 'hill_climbing_1', 'hill_climbing_2', 'ensemble']
Skip simple_algorithms because no parameters were generated.
* Step default_algorithms will try to check up to 3 models
1_Default_Xgboost rmse 1.484074 trained in 302.33 seconds
2_Default_CatBoost rmse 1.470179 trained in 259.86 seconds
3_Default_RandomForest rmse 1.56113 trained in 139.17 seconds
* Step not_so_random will try to check up to 12 models
4_Xgboost rmse 1.495399 trained in 185.21 seconds
8_CatBoost rmse 1.473112 trained in 285.45 seconds
12_RandomForest rmse 1.551056 trained in 205.92 seconds
5_Xgboost rmse 1.494743 trained in 152.86 seconds
9_CatBoost rmse 1.514128 trained in 266.47 seconds
13_RandomForest rmse 1.57424 trained i

In [74]:
### Office Employment Density (log), Spatial Lag, Birmingham (12)

# Create training and testing data
features = feature_columns_bham_lag
target = 'log_office_employment_density'

# Split the dataset - 80/20 train test
X_train, X_test, y_train, y_test = train_test_split(all_data_bham[features], all_data_bham[target], test_size=0.2, random_state=3)

automl = AutoML(
    results_path ='ml_results/office_employment_density_bham_lag/',
    algorithms=["CatBoost", "Xgboost", "Random Forest"],
    model_time_limit=1*60,
    start_random_models=5,
    hill_climbing_steps=2,
    top_models_to_improve=3,
    features_selection=False,
    stack_models=False,
    train_ensemble=True,
    explain_level=1,
    validation_strategy={
        "validation_type": "kfold",
        "k_folds": 4,
        "shuffle": False,
        "stratify": True,
    }
)

# fit the model
automl.fit(X_train, y_train)

y_train = y_train.astype(np.float32)
y_test = y_test.astype(np.float32)

predictions = automl.predict(X_test)
predictions = predictions.astype(np.float32)

# Fix NaN values
y_test = y_test.fillna(0)

r2_12 = r2_score(y_test, predictions)
rmse_12 = np.sqrt(mean_squared_error(y_test, predictions))

print(f'R^2 Score: {r2_12}')
print(f'RMSE: {rmse_12}')

#Save results for plotting
predictions_all_12 = automl.predict(all_data_bham[features])

results_office_density_cleaned_perform = pd.DataFrame({
    'geometry': bham_geometries,
    'observed': all_data_bham[target],
    'predicted': predictions_all_12,
})


AutoML directory: ml_results/office_employment_density_bham_lag/
The task is regression with evaluation metric rmse
AutoML will use algorithms: ['CatBoost', 'Xgboost', 'Random Forest']
AutoML will ensemble available models
AutoML steps: ['simple_algorithms', 'default_algorithms', 'not_so_random', 'hill_climbing_1', 'hill_climbing_2', 'ensemble']
Skip simple_algorithms because no parameters were generated.
* Step default_algorithms will try to check up to 3 models
1_Default_Xgboost rmse 2.062851 trained in 14.2 seconds
2_Default_CatBoost rmse 1.991828 trained in 61.23 seconds
3_Default_RandomForest rmse 2.010472 trained in 72.42 seconds
* Step not_so_random will try to check up to 12 models
4_Xgboost rmse 2.063623 trained in 20.46 seconds
8_CatBoost rmse 2.01133 trained in 67.34 seconds
12_RandomForest rmse 1.985058 trained in 44.24 seconds
5_Xgboost rmse 2.121576 trained in 15.44 seconds
9_CatBoost rmse 2.10722 trained in 62.0 seconds
13_RandomForest rmse 2.010341 trained in 52.61 seco

In [45]:
### Employment (log), NO Spatial Lag, London, POI only (13)

# Ignore depracation warnings
warnings.filterwarnings('ignore')

# Create training and testing data
features = feature_columns_london_poi
target = 'log_total_employment'

# Split the dataset - 80/20 train test
X_train, X_test, y_train, y_test = train_test_split(all_data_london[features], all_data_london[target], test_size=0.2, random_state=3)

automl = AutoML(
    results_path ='ml_results/log_employment_london_poi/',
    algorithms=["CatBoost", "Xgboost", "Random Forest"],
    model_time_limit=1*60,
    start_random_models=5,
    hill_climbing_steps=2,
    top_models_to_improve=3,
    features_selection=False,
    stack_models=False,
    train_ensemble=True,
    explain_level=1,
    validation_strategy={
        "validation_type": "kfold",
        "k_folds": 4,
        "shuffle": False,
        "stratify": True,
    }
)

# fit the model
automl.fit(X_train, y_train)

y_train = y_train.astype(np.float32)
y_test = y_test.astype(np.float32)

predictions = automl.predict(X_test)
predictions = predictions.astype(np.float32)

# Fix NaN values
y_test = y_test.fillna(0)

r2_13 = r2_score(y_test, predictions)
rmse_13 = np.sqrt(mean_squared_error(y_test, predictions))

print(f'R^2 Score: {r2_13}')
print(f'RMSE: {rmse_13}')

#Save results for plotting
predictions_all_13 = automl.predict(all_data_london[features])

results_office_density_cleaned_perform = pd.DataFrame({
    'geometry': london_geometries,
    'observed': all_data_london[target],
    'predicted': predictions_all_13,
})


This model has already been fitted. You can use predict methods or select a new 'results_path' for a new 'fit()'.
R^2 Score: 0.7256287050510382
RMSE: 0.672401487827301


In [36]:
### Employment (log), NO Spatial Lag, Brimingham, POI only (14)

# Create training and testing data
features = feature_columns_bham_poi
target = 'log_total_employment'

# Split the dataset - 80/20 train test
X_train, X_test, y_train, y_test = train_test_split(all_data_bham[features], all_data_bham[target], test_size=0.2, random_state=3)

automl = AutoML(
    results_path ='ml_results/log_employment_bham_poi/',
    algorithms=["CatBoost", "Xgboost", "Random Forest"],
    model_time_limit=1*60,
    start_random_models=5,
    hill_climbing_steps=2,
    top_models_to_improve=3,
    features_selection=False,
    stack_models=False,
    train_ensemble=True,
    explain_level=1,
    validation_strategy={
        "validation_type": "kfold",
        "k_folds": 4,
        "shuffle": False,
        "stratify": True,
    }
)

# fit the model
automl.fit(X_train, y_train)

y_train = y_train.astype(np.float32)
y_test = y_test.astype(np.float32)

predictions = automl.predict(X_test)
predictions = predictions.astype(np.float32)

# Fix NaN values
y_test = y_test.fillna(0)

r2_14 = r2_score(y_test, predictions)
rmse_14 = np.sqrt(mean_squared_error(y_test, predictions))

print(f'R^2 Score: {r2_14}')
print(f'RMSE: {rmse_14}')

#Save results for plotting
predictions_all_14 = automl.predict(all_data_bham[features])

results_office_density_cleaned_perform = pd.DataFrame({
    'geometry': bham_geometries,
    'observed': all_data_bham[target],
    'predicted': predictions_all_14,
})




AutoML directory: ml_results/log_employment_bham_poi/
The task is regression with evaluation metric rmse
AutoML will use algorithms: ['CatBoost', 'Xgboost', 'Random Forest']
AutoML will ensemble available models
AutoML steps: ['simple_algorithms', 'default_algorithms', 'not_so_random', 'hill_climbing_1', 'hill_climbing_2', 'ensemble']
Skip simple_algorithms because no parameters were generated.
* Step default_algorithms will try to check up to 3 models
1_Default_Xgboost rmse 0.796247 trained in 1077.34 seconds
2_Default_CatBoost rmse 0.811589 trained in 10.74 seconds
3_Default_RandomForest rmse 0.819108 trained in 18.32 seconds
* Step not_so_random will try to check up to 12 models
4_Xgboost rmse 0.814556 trained in 4.52 seconds
8_CatBoost rmse 0.820528 trained in 17.07 seconds
12_RandomForest rmse 0.81786 trained in 18.95 seconds
5_Xgboost rmse 0.818373 trained in 5.14 seconds
9_CatBoost rmse 0.835969 trained in 14.25 seconds
13_RandomForest rmse 0.847451 trained in 20.41 seconds
6_Xg

In [75]:
### Employment Density, NO Spatial Lag, London, POI only (15)

# Create training and testing data
features = feature_columns_london_poi
target = 'log_employment_density'

# Split the dataset - 80/20 train test
X_train, X_test, y_train, y_test = train_test_split(all_data_london[features], all_data_london[target], test_size=0.2, random_state=3)

automl = AutoML(
    results_path ='ml_results/employment_density_london_poi/',
    algorithms=["CatBoost", "Xgboost", "Random Forest"],
    model_time_limit=1*60,
    start_random_models=5,
    hill_climbing_steps=2,
    top_models_to_improve=3,
    features_selection=False,
    stack_models=False,
    train_ensemble=True,
    explain_level=1,
    validation_strategy={
        "validation_type": "kfold",
        "k_folds": 4,
        "shuffle": False,
        "stratify": True,
    }
)

# fit the model
automl.fit(X_train, y_train)

y_train = y_train.astype(np.float32)
y_test = y_test.astype(np.float32)

predictions = automl.predict(X_test)
predictions = predictions.astype(np.float32)

# Fix NaN values
y_test = y_test.fillna(0)

r2_15 = r2_score(y_test, predictions)
rmse_15 = np.sqrt(mean_squared_error(y_test, predictions))

print(f'R^2 Score: {r2_15}')
print(f'RMSE: {rmse_15}')

#Save results for plotting
predictions_all_15 = automl.predict(all_data_london[features])

results_office_density_cleaned_perform = pd.DataFrame({
    'geometry': london_geometries,
    'observed': all_data_london[target],
    'predicted': predictions_all_15,
})


AutoML directory: ml_results/employment_density_london_poi/
The task is regression with evaluation metric rmse
AutoML will use algorithms: ['CatBoost', 'Xgboost', 'Random Forest']
AutoML will ensemble available models
AutoML steps: ['simple_algorithms', 'default_algorithms', 'not_so_random', 'hill_climbing_1', 'hill_climbing_2', 'ensemble']
Skip simple_algorithms because no parameters were generated.
* Step default_algorithms will try to check up to 3 models
1_Default_Xgboost rmse 0.810964 trained in 41.43 seconds
2_Default_CatBoost rmse 0.784844 trained in 51.37 seconds
3_Default_RandomForest rmse 0.922467 trained in 78.52 seconds
* Step not_so_random will try to check up to 12 models
4_Xgboost rmse 0.815659 trained in 41.2 seconds
8_CatBoost rmse 0.782558 trained in 62.33 seconds
12_RandomForest rmse 0.918514 trained in 45.81 seconds
5_Xgboost rmse 0.826209 trained in 42.8 seconds
9_CatBoost rmse 0.799203 trained in 58.45 seconds
13_RandomForest rmse 0.944806 trained in 45.33 seconds

In [76]:
### Employment Density, NO Spatial Lag, Brimingham, POI only (16)

# Create training and testing data
features = feature_columns_bham_poi
target = 'log_employment_density'

# Split the dataset - 80/20 train test
X_train, X_test, y_train, y_test = train_test_split(all_data_bham[features], all_data_bham[target], test_size=0.2, random_state=3)

automl = AutoML(
    results_path ='ml_results/employment_density_bham_poi/',
    algorithms=["CatBoost", "Xgboost", "Random Forest"],
    model_time_limit=1*60,
    start_random_models=5,
    hill_climbing_steps=2,
    top_models_to_improve=3,
    features_selection=False,
    stack_models=False,
    train_ensemble=True,
    explain_level=1,
    validation_strategy={
        "validation_type": "kfold",
        "k_folds": 4,
        "shuffle": False,
        "stratify": True,
    }
)

# fit the model
automl.fit(X_train, y_train)

y_train = y_train.astype(np.float32)
y_test = y_test.astype(np.float32)

predictions = automl.predict(X_test)
predictions = predictions.astype(np.float32)

# Fix NaN values
y_test = y_test.fillna(0)

r2_16 = r2_score(y_test, predictions)
rmse_16 = np.sqrt(mean_squared_error(y_test, predictions))

print(f'R^2 Score: {r2_16}')
print(f'RMSE: {rmse_16}')

#Save results for plotting
predictions_all_16 = automl.predict(all_data_bham[features])

results_office_density_cleaned_perform = pd.DataFrame({
    'geometry': bham_geometries,
    'observed': all_data_bham[target],
    'predicted': predictions_all_16,
})


AutoML directory: ml_results/employment_density_bham_poi/
The task is regression with evaluation metric rmse
AutoML will use algorithms: ['CatBoost', 'Xgboost', 'Random Forest']
AutoML will ensemble available models
AutoML steps: ['simple_algorithms', 'default_algorithms', 'not_so_random', 'hill_climbing_1', 'hill_climbing_2', 'ensemble']
Skip simple_algorithms because no parameters were generated.
* Step default_algorithms will try to check up to 3 models
1_Default_Xgboost rmse 0.933653 trained in 5.96 seconds
2_Default_CatBoost rmse 0.90717 trained in 14.21 seconds
3_Default_RandomForest rmse 0.9083 trained in 23.18 seconds
* Step not_so_random will try to check up to 12 models
4_Xgboost rmse 0.927255 trained in 5.39 seconds
8_CatBoost rmse 0.916364 trained in 18.89 seconds
12_RandomForest rmse 0.92552 trained in 25.7 seconds
5_Xgboost rmse 0.974737 trained in 5.45 seconds
9_CatBoost rmse 0.936304 trained in 16.28 seconds
13_RandomForest rmse 0.935442 trained in 21.21 seconds
6_Xgboo

In [77]:
### Office Employment Density, NO Spatial Lag, London, POI only (17)


# Create training and testing data
features = feature_columns_london_poi
target = 'log_office_employment_density'

# Split the dataset - 80/20 train test
X_train, X_test, y_train, y_test = train_test_split(all_data_london[features], all_data_london[target], test_size=0.2, random_state=3)

automl = AutoML(
    results_path ='ml_results/office_employment_density_london_poi/',
    algorithms=["CatBoost", "Xgboost", "Random Forest"],
    model_time_limit=1*60,
    start_random_models=5,
    hill_climbing_steps=2,
    top_models_to_improve=3,
    features_selection=False,
    stack_models=False,
    train_ensemble=True,
    explain_level=1,
    validation_strategy={
        "validation_type": "kfold",
        "k_folds": 4,
        "shuffle": False,
        "stratify": True,
    }
)

# fit the model
automl.fit(X_train, y_train)

y_train = y_train.astype(np.float32)
y_test = y_test.astype(np.float32)

predictions = automl.predict(X_test)
predictions = predictions.astype(np.float32)

# Fix NaN values
y_test = y_test.fillna(0)

r2_17 = r2_score(y_test, predictions)
rmse_17 = np.sqrt(mean_squared_error(y_test, predictions))

print(f'R^2 Score: {r2_17}')
print(f'RMSE: {rmse_17}')

#Save results for plotting
predictions_all_17 = automl.predict(all_data_london[features])

results_office_density_cleaned_perform = pd.DataFrame({
    'geometry': london_geometries,
    'observed': all_data_london[target],
    'predicted': predictions_all_17,
})


AutoML directory: ml_results/office_employment_density_london_poi/
The task is regression with evaluation metric rmse
AutoML will use algorithms: ['CatBoost', 'Xgboost', 'Random Forest']
AutoML will ensemble available models
AutoML steps: ['simple_algorithms', 'default_algorithms', 'not_so_random', 'hill_climbing_1', 'hill_climbing_2', 'ensemble']
Skip simple_algorithms because no parameters were generated.
* Step default_algorithms will try to check up to 3 models
1_Default_Xgboost rmse 1.594738 trained in 34.7 seconds
2_Default_CatBoost rmse 1.579013 trained in 48.37 seconds
3_Default_RandomForest rmse 1.690103 trained in 51.23 seconds
* Step not_so_random will try to check up to 12 models
4_Xgboost rmse 1.606491 trained in 36.18 seconds
8_CatBoost rmse 1.579068 trained in 58.23 seconds
12_RandomForest rmse 1.687728 trained in 67.06 seconds
5_Xgboost rmse 1.608838 trained in 35.24 seconds
9_CatBoost rmse 1.602372 trained in 56.5 seconds
13_RandomForest rmse 1.710323 trained in 51.08 

In [78]:
### Office Employment Density, NO Spatial Lag, Birmingham, POI only (18)

# Create training and testing data
features = feature_columns_bham_poi
target = 'log_office_employment_density'

# Split the dataset - 80/20 train test
X_train, X_test, y_train, y_test = train_test_split(all_data_bham[features], all_data_bham[target], test_size=0.2, random_state=3)

automl = AutoML(
    results_path ='ml_results/office_employment_density_bham_poi/',
    algorithms=["CatBoost", "Xgboost", "Random Forest"],
    model_time_limit=1*60,
    start_random_models=5,
    hill_climbing_steps=2,
    top_models_to_improve=3,
    features_selection=False,
    stack_models=False,
    train_ensemble=True,
    explain_level=1,
    validation_strategy={
        "validation_type": "kfold",
        "k_folds": 4,
        "shuffle": False,
        "stratify": True,
    }
)

# fit the model
automl.fit(X_train, y_train)

y_train = y_train.astype(np.float32)
y_test = y_test.astype(np.float32)

predictions = automl.predict(X_test)
predictions = predictions.astype(np.float32)

# Fix NaN values
y_test = y_test.fillna(0)

r2_18 = r2_score(y_test, predictions)
rmse_18 = np.sqrt(mean_squared_error(y_test, predictions))

print(f'R^2 Score: {r2_18}')
print(f'RMSE: {rmse_18}')

#Save results for plotting
predictions_all_18 = automl.predict(all_data_bham[features])

results_office_density_cleaned_perform = pd.DataFrame({
    'geometry': bham_geometries,
    'observed': all_data_bham[target],
    'predicted': predictions_all_18,
})


AutoML directory: ml_results/office_employment_density_bham_poi/
The task is regression with evaluation metric rmse
AutoML will use algorithms: ['CatBoost', 'Xgboost', 'Random Forest']
AutoML will ensemble available models
AutoML steps: ['simple_algorithms', 'default_algorithms', 'not_so_random', 'hill_climbing_1', 'hill_climbing_2', 'ensemble']
Skip simple_algorithms because no parameters were generated.
* Step default_algorithms will try to check up to 3 models
1_Default_Xgboost rmse 2.103662 trained in 6.26 seconds
2_Default_CatBoost rmse 2.032205 trained in 14.76 seconds
3_Default_RandomForest rmse 2.06174 trained in 19.28 seconds
* Step not_so_random will try to check up to 12 models
4_Xgboost rmse 2.115326 trained in 5.91 seconds
8_CatBoost rmse 2.063828 trained in 17.41 seconds
12_RandomForest rmse 2.032244 trained in 21.91 seconds
5_Xgboost rmse 2.156681 trained in 6.02 seconds
9_CatBoost rmse 2.116306 trained in 17.34 seconds
13_RandomForest rmse 2.046611 trained in 17.34 seco

In [48]:
### Employment (log), Spatial Lag, NO building footprints, London (19)

# Create training and testing data
features = feature_columns_london_lag_poi
target = 'log_total_employment'

# Split the dataset - 80/20 train test
X_train, X_test, y_train, y_test = train_test_split(all_data_london[features], all_data_london[target], test_size=0.2, random_state=3)

automl = AutoML(
    results_path ='ml_results/employment_london_lag_poi/',
    algorithms=["CatBoost", "Xgboost", "Random Forest"],
    model_time_limit=1*60,
    start_random_models=5,
    hill_climbing_steps=2,
    top_models_to_improve=3,
    features_selection=False,
    stack_models=False,
    train_ensemble=True,
    explain_level=1,
    validation_strategy={
        "validation_type": "kfold",
        "k_folds": 4,
        "shuffle": False,
        "stratify": True,
    }
)

# fit the model
automl.fit(X_train, y_train)

y_train = y_train.astype(np.float32)
y_test = y_test.astype(np.float32)

predictions = automl.predict(X_test)
predictions = predictions.astype(np.float32)

# Fix NaN values
y_test = y_test.fillna(0)

r2_19 = r2_score(y_test, predictions)
rmse_19 = np.sqrt(mean_squared_error(y_test, predictions))

print(f'R^2 Score: {r2_19}')
print(f'RMSE: {rmse_19}')

#Save results for plotting
predictions_all_19 = automl.predict(all_data_london[features])

results_office_density_cleaned_perform = pd.DataFrame({
    'geometry': london_geometries,
    'observed': all_data_london[target],
    'predicted': predictions_all_19,
})


AutoML directory: ml_results/employment_london_lag_poi/
The task is regression with evaluation metric rmse
AutoML will use algorithms: ['CatBoost', 'Xgboost', 'Random Forest']
AutoML will ensemble available models
AutoML steps: ['simple_algorithms', 'default_algorithms', 'not_so_random', 'hill_climbing_1', 'hill_climbing_2', 'ensemble']
Skip simple_algorithms because no parameters were generated.
* Step default_algorithms will try to check up to 3 models
1_Default_Xgboost rmse 0.677317 trained in 178.57 seconds
2_Default_CatBoost rmse 0.667763 trained in 219.62 seconds
3_Default_RandomForest rmse 0.705324 trained in 156.94 seconds
* Step not_so_random will try to check up to 12 models
4_Xgboost rmse 0.682807 trained in 181.02 seconds
8_CatBoost rmse 0.669091 trained in 270.9 seconds
12_RandomForest rmse 0.708888 trained in 198.86 seconds
5_Xgboost rmse 0.684443 trained in 240.9 seconds
9_CatBoost rmse 0.682518 trained in 308.91 seconds
13_RandomForest rmse 0.723025 trained in 193.24 se

In [59]:
### Employment (log), Spatial Lag, NO building footprints, Birmingham (20)

# Create training and testing data
features = feature_columns_bham_lag_poi
target = 'log_total_employment'

# Split the dataset - 80/20 train test
X_train, X_test, y_train, y_test = train_test_split(all_data_bham[features], all_data_bham[target], test_size=0.2, random_state=3)

automl = AutoML(
    results_path ='ml_results/employment_bham_lag_poi/',
    algorithms=["CatBoost", "Xgboost", "Random Forest"],
    model_time_limit=1*60,
    start_random_models=5,
    hill_climbing_steps=2,
    top_models_to_improve=3,
    features_selection=False,
    stack_models=False,
    train_ensemble=True,
    explain_level=1,
    validation_strategy={
        "validation_type": "kfold",
        "k_folds": 4,
        "shuffle": False,
        "stratify": True,
    }
)

# fit the model
automl.fit(X_train, y_train)

y_train = y_train.astype(np.float32)
y_test = y_test.astype(np.float32)

predictions = automl.predict(X_test)
predictions = predictions.astype(np.float32)

# Fix NaN values
y_test = y_test.fillna(0)

r2_20 = r2_score(y_test, predictions)
rmse_20 = np.sqrt(mean_squared_error(y_test, predictions))

print(f'R^2 Score: {r2_20}')
print(f'RMSE: {rmse_20}')

#Save results for plotting
predictions_all_20 = automl.predict(all_data_bham[features])

results_office_density_cleaned_perform = pd.DataFrame({
    'geometry': bham_geometries,
    'observed': all_data_bham[target],
    'predicted': predictions_all_20,
})

AutoML directory: ml_results/employment_bham_lag_poi/
The task is regression with evaluation metric rmse
AutoML will use algorithms: ['CatBoost', 'Xgboost', 'Random Forest']
AutoML will ensemble available models
AutoML steps: ['simple_algorithms', 'default_algorithms', 'not_so_random', 'hill_climbing_1', 'hill_climbing_2', 'ensemble']
Skip simple_algorithms because no parameters were generated.
* Step default_algorithms will try to check up to 3 models
1_Default_Xgboost rmse 0.834064 trained in 14.83 seconds
2_Default_CatBoost rmse 0.828002 trained in 52.99 seconds
3_Default_RandomForest rmse 0.831382 trained in 47.92 seconds
* Step not_so_random will try to check up to 12 models
4_Xgboost rmse 0.856098 trained in 13.66 seconds
8_CatBoost rmse 0.846562 trained in 68.72 seconds
12_RandomForest rmse 0.817336 trained in 38.78 seconds
5_Xgboost rmse 0.869507 trained in 11.26 seconds
9_CatBoost rmse 0.888438 trained in 50.76 seconds
13_RandomForest rmse 0.850046 trained in 52.13 seconds
6_X

In [79]:
### Employment density (log), Spatial Lag, NO building footprints, London (21)

# Create training and testing data
features = feature_columns_london_lag_poi
target = 'log_employment_density'

# Split the dataset - 80/20 train test
X_train, X_test, y_train, y_test = train_test_split(all_data_london[features], all_data_london[target], test_size=0.2, random_state=3)

automl = AutoML(
    results_path ='ml_results/employment_density_london_lag_poi/',
    algorithms=["CatBoost", "Xgboost", "Random Forest"],
    model_time_limit=1*60,
    start_random_models=5,
    hill_climbing_steps=2,
    top_models_to_improve=3,
    features_selection=False,
    stack_models=False,
    train_ensemble=True,
    explain_level=1,
    validation_strategy={
        "validation_type": "kfold",
        "k_folds": 4,
        "shuffle": False,
        "stratify": True,
    }
)

# fit the model
automl.fit(X_train, y_train)

y_train = y_train.astype(np.float32)
y_test = y_test.astype(np.float32)

predictions = automl.predict(X_test)
predictions = predictions.astype(np.float32)

# Fix NaN values
y_test = y_test.fillna(0)

r2_21 = r2_score(y_test, predictions)
rmse_21 = np.sqrt(mean_squared_error(y_test, predictions))

print(f'R^2 Score: {r2_21}')
print(f'RMSE: {rmse_21}')

#Save results for plotting
predictions_all_21 = automl.predict(all_data_london[features])

results_office_density_cleaned_perform = pd.DataFrame({
    'geometry': london_geometries,
    'observed': all_data_london[target],
    'predicted': predictions_all_21,
})


AutoML directory: ml_results/employment_density_london_lag_poi/
The task is regression with evaluation metric rmse
AutoML will use algorithms: ['CatBoost', 'Xgboost', 'Random Forest']
AutoML will ensemble available models
AutoML steps: ['simple_algorithms', 'default_algorithms', 'not_so_random', 'hill_climbing_1', 'hill_climbing_2', 'ensemble']
Skip simple_algorithms because no parameters were generated.
* Step default_algorithms will try to check up to 3 models
1_Default_Xgboost rmse 0.752636 trained in 182.62 seconds
2_Default_CatBoost rmse 0.741426 trained in 227.56 seconds
3_Default_RandomForest rmse 0.856403 trained in 126.46 seconds
* Step not_so_random will try to check up to 12 models
4_Xgboost rmse 0.764233 trained in 222.06 seconds
8_CatBoost rmse 0.742621 trained in 260.06 seconds
12_RandomForest rmse 0.85494 trained in 162.99 seconds
5_Xgboost rmse 0.768562 trained in 203.36 seconds
9_CatBoost rmse 0.766185 trained in 241.98 seconds
13_RandomForest rmse 0.87985 trained in 1

In [80]:
### Employment density (log), Spatial Lag, NO building footprints, Birmingham (22)

# Create training and testing data
features = feature_columns_bham_lag_poi
target = 'log_employment_density'

# Split the dataset - 80/20 train test
X_train, X_test, y_train, y_test = train_test_split(all_data_bham[features], all_data_bham[target], test_size=0.2, random_state=3)

automl = AutoML(
    results_path ='ml_results/employment_density_bham_lag_poi/',
    algorithms=["CatBoost", "Xgboost", "Random Forest"],
    model_time_limit=1*60,
    start_random_models=5,
    hill_climbing_steps=2,
    top_models_to_improve=3,
    features_selection=False,
    stack_models=False,
    train_ensemble=True,
    explain_level=1,
    validation_strategy={
        "validation_type": "kfold",
        "k_folds": 4,
        "shuffle": False,
        "stratify": True,
    }
)

# fit the model
automl.fit(X_train, y_train)

y_train = y_train.astype(np.float32)
y_test = y_test.astype(np.float32)

predictions = automl.predict(X_test)
predictions = predictions.astype(np.float32)

# Fix NaN values
y_test = y_test.fillna(0)

r2_22 = r2_score(y_test, predictions)
rmse_22 = np.sqrt(mean_squared_error(y_test, predictions))

print(f'R^2 Score: {r2_22}')
print(f'RMSE: {rmse_22}')

#Save results for plotting
predictions_all_22 = automl.predict(all_data_bham[features])

results_office_density_cleaned_perform = pd.DataFrame({
    'geometry': bham_geometries,
    'observed': all_data_bham[target],
    'predicted': predictions_all_22,
})

AutoML directory: ml_results/employment_density_bham_lag_poi/
The task is regression with evaluation metric rmse
AutoML will use algorithms: ['CatBoost', 'Xgboost', 'Random Forest']
AutoML will ensemble available models
AutoML steps: ['simple_algorithms', 'default_algorithms', 'not_so_random', 'hill_climbing_1', 'hill_climbing_2', 'ensemble']
Skip simple_algorithms because no parameters were generated.
* Step default_algorithms will try to check up to 3 models
1_Default_Xgboost rmse 0.910647 trained in 15.7 seconds
2_Default_CatBoost rmse 0.87662 trained in 54.19 seconds
3_Default_RandomForest rmse 0.905614 trained in 45.36 seconds
* Step not_so_random will try to check up to 12 models
4_Xgboost rmse 0.864314 trained in 18.83 seconds
8_CatBoost rmse 0.903975 trained in 64.34 seconds
12_RandomForest rmse 0.907049 trained in 54.09 seconds
5_Xgboost rmse 0.953977 trained in 13.77 seconds
9_CatBoost rmse 0.944015 trained in 58.04 seconds
13_RandomForest rmse 0.9308 trained in 37.84 seconds

In [81]:
### Office Employment Density (log), Spatial Lag, NO building footprints, London (23)

# Create training and testing data
features = feature_columns_london_lag_poi
target = 'log_office_employment_density'

# Split the dataset - 80/20 train test
X_train, X_test, y_train, y_test = train_test_split(all_data_london[features], all_data_london[target], test_size=0.2, random_state=3)

automl = AutoML(
    results_path ='ml_results/office_employment_density_london_lag_poi/',
    algorithms=["CatBoost", "Xgboost", "Random Forest"],
    model_time_limit=1*60,
    start_random_models=5,
    hill_climbing_steps=2,
    top_models_to_improve=3,
    features_selection=False,
    stack_models=False,
    train_ensemble=True,
    explain_level=1,
    validation_strategy={
        "validation_type": "kfold",
        "k_folds": 4,
        "shuffle": False,
        "stratify": True,
    }
)

# fit the model
automl.fit(X_train, y_train)

y_train = y_train.astype(np.float32)
y_test = y_test.astype(np.float32)

predictions = automl.predict(X_test)
predictions = predictions.astype(np.float32)

# Fix NaN values
y_test = y_test.fillna(0)

r2_23 = r2_score(y_test, predictions)
rmse_23 = np.sqrt(mean_squared_error(y_test, predictions))

print(f'R^2 Score: {r2_23}')
print(f'RMSE: {rmse_23}')

#Save results for plotting
predictions_all_23 = automl.predict(all_data_london[features])

results_office_density_cleaned_perform = pd.DataFrame({
    'geometry': london_geometries,
    'observed': all_data_london[target],
    'predicted': predictions_all_23,
})


AutoML directory: ml_results/office_employment_density_london_lag_poi/
The task is regression with evaluation metric rmse
AutoML will use algorithms: ['CatBoost', 'Xgboost', 'Random Forest']
AutoML will ensemble available models
AutoML steps: ['simple_algorithms', 'default_algorithms', 'not_so_random', 'hill_climbing_1', 'hill_climbing_2', 'ensemble']
Skip simple_algorithms because no parameters were generated.
* Step default_algorithms will try to check up to 3 models
1_Default_Xgboost rmse 1.538698 trained in 152.26 seconds
2_Default_CatBoost rmse 1.522154 trained in 229.66 seconds
3_Default_RandomForest rmse 1.609395 trained in 133.37 seconds
* Step not_so_random will try to check up to 12 models
4_Xgboost rmse 1.554216 trained in 138.95 seconds
8_CatBoost rmse 1.516223 trained in 239.88 seconds
12_RandomForest rmse 1.601996 trained in 155.01 seconds
5_Xgboost rmse 1.556924 trained in 162.92 seconds
9_CatBoost rmse 1.557065 trained in 233.53 seconds
13_RandomForest rmse 1.62409 trai

In [82]:
### Office Employment Density (log), Spatial Lag, NO building footprints, Birmingham (24)

# Create training and testing data
features = feature_columns_bham_lag_poi
target = 'log_office_employment_density'

# Split the dataset - 80/20 train test
X_train, X_test, y_train, y_test = train_test_split(all_data_bham[features], all_data_bham[target], test_size=0.2, random_state=3)

automl = AutoML(
    results_path ='ml_results/office_employment_density_bham_lag_poi/',
    algorithms=["CatBoost", "Xgboost", "Random Forest"],
    model_time_limit=1*60,
    start_random_models=5,
    hill_climbing_steps=2,
    top_models_to_improve=3,
    features_selection=False,
    stack_models=False,
    train_ensemble=True,
    explain_level=1,
    validation_strategy={
        "validation_type": "kfold",
        "k_folds": 4,
        "shuffle": False,
        "stratify": True,
    }
)

# fit the model
automl.fit(X_train, y_train)

y_train = y_train.astype(np.float32)
y_test = y_test.astype(np.float32)

predictions = automl.predict(X_test)
predictions = predictions.astype(np.float32)

# Fix NaN values
y_test = y_test.fillna(0)

r2_24 = r2_score(y_test, predictions)
rmse_24 = np.sqrt(mean_squared_error(y_test, predictions))

print(f'R^2 Score: {r2_24}')
print(f'RMSE: {rmse_24}')

#Save results for plotting
predictions_all_24 = automl.predict(all_data_bham[features])

results_office_density_cleaned_perform = pd.DataFrame({
    'geometry': bham_geometries,
    'observed': all_data_bham[target],
    'predicted': predictions_all_24,
})

AutoML directory: ml_results/office_employment_density_bham_lag_poi/
The task is regression with evaluation metric rmse
AutoML will use algorithms: ['CatBoost', 'Xgboost', 'Random Forest']
AutoML will ensemble available models
AutoML steps: ['simple_algorithms', 'default_algorithms', 'not_so_random', 'hill_climbing_1', 'hill_climbing_2', 'ensemble']
Skip simple_algorithms because no parameters were generated.
* Step default_algorithms will try to check up to 3 models
1_Default_Xgboost rmse 2.005497 trained in 17.1 seconds
2_Default_CatBoost rmse 2.001502 trained in 54.7 seconds
3_Default_RandomForest rmse 1.997111 trained in 51.12 seconds
* Step not_so_random will try to check up to 12 models
4_Xgboost rmse 2.04148 trained in 15.99 seconds
8_CatBoost rmse 2.035985 trained in 58.98 seconds
12_RandomForest rmse 1.980989 trained in 45.21 seconds
5_Xgboost rmse 2.074815 trained in 13.59 seconds
9_CatBoost rmse 2.043254 trained in 57.29 seconds
13_RandomForest rmse 2.018145 trained in 48.2 

In [84]:
### Employment (log), NO Spatial Lag, London,  POI EXCLUSIVE (25)

# Ignore depracation warnings
warnings.filterwarnings('ignore')

# Create training and testing data
features = feature_columns_london_poi = [column for column in feature_columns_london_poi if column != 'log_num_places']
target = 'log_total_employment'

# Split the dataset - 80/20 train test
X_train, X_test, y_train, y_test = train_test_split(all_data_london[features], all_data_london[target], test_size=0.2, random_state=3)

automl = AutoML(
    results_path ='ml_results/poi_exclusive_london/',
    algorithms=["CatBoost", "Xgboost", "Random Forest"],
    model_time_limit=1*60,
    start_random_models=5,
    hill_climbing_steps=2,
    top_models_to_improve=3,
    features_selection=False,
    stack_models=False,
    train_ensemble=True,
    explain_level=1,
    validation_strategy={
        "validation_type": "kfold",
        "k_folds": 4,
        "shuffle": False,
        "stratify": True,
    }
)

# fit the model
automl.fit(X_train, y_train)

y_train = y_train.astype(np.float32)
y_test = y_test.astype(np.float32)

predictions = automl.predict(X_test)
predictions = predictions.astype(np.float32)

# Fix NaN values
y_test = y_test.fillna(0)

r2_25 = r2_score(y_test, predictions)
rmse_25 = np.sqrt(mean_squared_error(y_test, predictions))

print(f'R^2 Score: {r2_25}')
print(f'RMSE: {rmse_25}')

#Save results for plotting
predictions_all_25 = automl.predict(all_data_london[features])

results_london_poi = pd.DataFrame({
    'name': london_names,
    'geometry': london_geometries,
    'observed': all_data_london[target],
    'predicted': predictions_all_25,
})

# Save to project folder
results_london_poi.to_csv("data/combined_data/model_results_london.csv")

This model has already been fitted. You can use predict methods or select a new 'results_path' for a new 'fit()'.
R^2 Score: 0.7102393476371849
RMSE: 0.6910015940666199


In [86]:
### Employment (log), NO Spatial Lag, Birmingham, POI EXCLUSIVE (26)

# Ignore depracation warnings
warnings.filterwarnings('ignore')

# Create training and testing data
features = feature_columns_bham_poi = [column for column in feature_columns_bham_poi if column != 'log_num_places']
target = 'log_total_employment'

# Split the dataset - 80/20 train test
X_train, X_test, y_train, y_test = train_test_split(all_data_bham[features], all_data_bham[target], test_size=0.2, random_state=3)

automl = AutoML(
    results_path ='ml_results/poi_exclusive_bham/',
    algorithms=["CatBoost", "Xgboost", "Random Forest"],
    model_time_limit=1*60,
    start_random_models=5,
    hill_climbing_steps=2,
    top_models_to_improve=3,
    features_selection=False,
    stack_models=False,
    train_ensemble=True,
    explain_level=1,
    validation_strategy={
        "validation_type": "kfold",
        "k_folds": 4,
        "shuffle": False,
        "stratify": True,
    }
)

# fit the model
automl.fit(X_train, y_train)

y_train = y_train.astype(np.float32)
y_test = y_test.astype(np.float32)

predictions = automl.predict(X_test)
predictions = predictions.astype(np.float32)

# Fix NaN values
y_test = y_test.fillna(0)

r2_26 = r2_score(y_test, predictions)
rmse_26 = np.sqrt(mean_squared_error(y_test, predictions))

print(f'R^2 Score: {r2_26}')
print(f'RMSE: {rmse_26}')

#Save results for plotting
predictions_all_26 = automl.predict(all_data_bham[features])

results_bham_poi = pd.DataFrame({
    'name': bham_names,
    'geometry': bham_geometries,
    'observed': all_data_bham[target],
    'predicted': predictions_all_26,
})

# Save to project folder
results_bham_poi.to_csv("data/combined_data/model_results_bham_poi.csv")

This model has already been fitted. You can use predict methods or select a new 'results_path' for a new 'fit()'.
R^2 Score: 0.6226217466175756
RMSE: 0.7312378883361816


In [83]:
### Employment Density, NO Spatial Lag, London,  POI EXCLUSIVE (27)

# Ignore depracation warnings
warnings.filterwarnings('ignore')

# Create training and testing data
features = feature_columns_london_poi = [column for column in feature_columns_london_poi if column != 'log_num_places']
target = 'log_employment_density'

# Split the dataset - 80/20 train test
X_train, X_test, y_train, y_test = train_test_split(all_data_london[features], all_data_london[target], test_size=0.2, random_state=3)

automl = AutoML(
    results_path ='ml_results/poi_exclusive_london_density/',
    algorithms=["CatBoost", "Xgboost", "Random Forest"],
    model_time_limit=1*60,
    start_random_models=5,
    hill_climbing_steps=2,
    top_models_to_improve=3,
    features_selection=False,
    stack_models=False,
    train_ensemble=True,
    explain_level=1,
    validation_strategy={
        "validation_type": "kfold",
        "k_folds": 4,
        "shuffle": False,
        "stratify": True,
    }
)

# fit the model
automl.fit(X_train, y_train)

y_train = y_train.astype(np.float32)
y_test = y_test.astype(np.float32)

predictions = automl.predict(X_test)
predictions = predictions.astype(np.float32)

# Fix NaN values
y_test = y_test.fillna(0)

r2_27 = r2_score(y_test, predictions)
rmse_27 = np.sqrt(mean_squared_error(y_test, predictions))

print(f'R^2 Score: {r2_27}')
print(f'RMSE: {rmse_27}')

#Save results for plotting
predictions_all_27 = automl.predict(all_data_london[features])

results_london_poi_density = pd.DataFrame({
    'name': london_names,
    'geometry': london_geometries,
    'observed': all_data_london[target],
    'predicted': predictions_all_27,
})

# Save to project folder
results_london_poi_density.to_csv("data/combined_data/model_results_london_poi_density.csv")

AutoML directory: ml_results/poi_exclusive_london_density/
The task is regression with evaluation metric rmse
AutoML will use algorithms: ['CatBoost', 'Xgboost', 'Random Forest']
AutoML will ensemble available models
AutoML steps: ['simple_algorithms', 'default_algorithms', 'not_so_random', 'hill_climbing_1', 'hill_climbing_2', 'ensemble']
Skip simple_algorithms because no parameters were generated.
* Step default_algorithms will try to check up to 3 models
1_Default_Xgboost rmse 0.810964 trained in 38.17 seconds
2_Default_CatBoost rmse 0.784844 trained in 49.5 seconds
3_Default_RandomForest rmse 0.922467 trained in 76.45 seconds
* Step not_so_random will try to check up to 12 models
4_Xgboost rmse 0.815659 trained in 37.39 seconds
8_CatBoost rmse 0.782558 trained in 60.3 seconds
12_RandomForest rmse 0.918514 trained in 44.62 seconds
5_Xgboost rmse 0.826209 trained in 37.36 seconds
9_CatBoost rmse 0.799203 trained in 54.8 seconds
13_RandomForest rmse 0.944806 trained in 43.15 seconds
6

In [84]:
### Employment Density, NO Spatial Lag, Birmingham, POI EXCLUSIVE (28)

# Ignore depracation warnings
warnings.filterwarnings('ignore')

# Create training and testing data
features = feature_columns_bham_poi = [column for column in feature_columns_bham_poi if column != 'log_num_places']
target = 'log_employment_density'

# Split the dataset - 80/20 train test
X_train, X_test, y_train, y_test = train_test_split(all_data_bham[features], all_data_bham[target], test_size=0.2, random_state=3)

automl = AutoML(
    results_path ='ml_results/poi_exclusive_bham_density/',
    algorithms=["CatBoost", "Xgboost", "Random Forest"],
    model_time_limit=1*60,
    start_random_models=5,
    hill_climbing_steps=2,
    top_models_to_improve=3,
    features_selection=False,
    stack_models=False,
    train_ensemble=True,
    explain_level=1,
    validation_strategy={
        "validation_type": "kfold",
        "k_folds": 4,
        "shuffle": False,
        "stratify": True,
    }
)

# fit the model
automl.fit(X_train, y_train)

y_train = y_train.astype(np.float32)
y_test = y_test.astype(np.float32)

predictions = automl.predict(X_test)
predictions = predictions.astype(np.float32)

# Fix NaN values
y_test = y_test.fillna(0)

r2_28 = r2_score(y_test, predictions)
rmse_28 = np.sqrt(mean_squared_error(y_test, predictions))

print(f'R^2 Score: {r2_28}')
print(f'RMSE: {rmse_28}')

#Save results for plotting
predictions_all_28 = automl.predict(all_data_bham[features])

results_bham_poi_density = pd.DataFrame({
    'name': bham_names,
    'geometry': bham_geometries,
    'observed': all_data_bham[target],
    'predicted': predictions_all_28,
})

# Save to project folder
results_bham_poi_density.to_csv("data/combined_data/model_results_bham_poi_density.csv")

AutoML directory: ml_results/poi_exclusive_bham_density/
The task is regression with evaluation metric rmse
AutoML will use algorithms: ['CatBoost', 'Xgboost', 'Random Forest']
AutoML will ensemble available models
AutoML steps: ['simple_algorithms', 'default_algorithms', 'not_so_random', 'hill_climbing_1', 'hill_climbing_2', 'ensemble']
Skip simple_algorithms because no parameters were generated.
* Step default_algorithms will try to check up to 3 models
1_Default_Xgboost rmse 0.975145 trained in 6.78 seconds
2_Default_CatBoost rmse 0.937522 trained in 14.95 seconds
3_Default_RandomForest rmse 1.024099 trained in 19.68 seconds
* Step not_so_random will try to check up to 12 models
4_Xgboost rmse 0.943804 trained in 7.71 seconds
8_CatBoost rmse 0.937614 trained in 21.5 seconds
12_RandomForest rmse 0.991453 trained in 27.94 seconds
5_Xgboost rmse 0.968603 trained in 6.12 seconds
9_CatBoost rmse 0.978892 trained in 16.57 seconds
13_RandomForest rmse 1.013362 trained in 22.69 seconds
6_Xg

In [85]:
### Office Employment Density, NO Spatial Lag, London, POI EXCLUSIVE (29)

# Ignore depracation warnings
warnings.filterwarnings('ignore')

# Create training and testing data
features = feature_columns_london_poi = [column for column in feature_columns_london_poi if column != 'log_num_places']
target = 'log_office_employment_density'

# Split the dataset - 80/20 train test
X_train, X_test, y_train, y_test = train_test_split(all_data_london[features], all_data_london[target], test_size=0.2, random_state=3)

automl = AutoML(
    results_path ='ml_results/poi_exclusive_london_office_density/',
    algorithms=["CatBoost", "Xgboost", "Random Forest"],
    model_time_limit=1*60,
    start_random_models=5,
    hill_climbing_steps=2,
    top_models_to_improve=3,
    features_selection=False,
    stack_models=False,
    train_ensemble=True,
    explain_level=1,
    validation_strategy={
        "validation_type": "kfold",
        "k_folds": 4,
        "shuffle": False,
        "stratify": True,
    }
)

# fit the model
automl.fit(X_train, y_train)

y_train = y_train.astype(np.float32)
y_test = y_test.astype(np.float32)

predictions = automl.predict(X_test)
predictions = predictions.astype(np.float32)

# Fix NaN values
y_test = y_test.fillna(0)

r2_29 = r2_score(y_test, predictions)
rmse_29 = np.sqrt(mean_squared_error(y_test, predictions))

print(f'R^2 Score: {r2_29}')
print(f'RMSE: {rmse_29}')

#Save results for plotting
predictions_all_29 = automl.predict(all_data_london[features])

results_london_poi_density = pd.DataFrame({
    'name': london_names,
    'geometry': london_geometries,
    'observed': all_data_london[target],
    'predicted': predictions_all_29,
})


AutoML directory: ml_results/poi_exclusive_london_office_density/
The task is regression with evaluation metric rmse
AutoML will use algorithms: ['CatBoost', 'Xgboost', 'Random Forest']
AutoML will ensemble available models
AutoML steps: ['simple_algorithms', 'default_algorithms', 'not_so_random', 'hill_climbing_1', 'hill_climbing_2', 'ensemble']
Skip simple_algorithms because no parameters were generated.
* Step default_algorithms will try to check up to 3 models
1_Default_Xgboost rmse 1.594738 trained in 35.25 seconds
2_Default_CatBoost rmse 1.579013 trained in 46.64 seconds
3_Default_RandomForest rmse 1.690103 trained in 48.36 seconds
* Step not_so_random will try to check up to 12 models
4_Xgboost rmse 1.606491 trained in 34.34 seconds
8_CatBoost rmse 1.579068 trained in 55.67 seconds
12_RandomForest rmse 1.687728 trained in 65.8 seconds
5_Xgboost rmse 1.608838 trained in 34.87 seconds
9_CatBoost rmse 1.602372 trained in 60.39 seconds
13_RandomForest rmse 1.710323 trained in 56.55 

In [107]:
### Office Employment Density, NO Spatial Lag, Birmingham, POI EXCLUSIVE (30)

# Ignore depracation warnings
warnings.filterwarnings('ignore')

# Create training and testing data
features = feature_columns_bham_poi = [column for column in feature_columns_bham_poi if column != 'log_num_places']
target = 'log_office_employment_density'

# Split the dataset - 80/20 train test
X_train, X_test, y_train, y_test = train_test_split(all_data_bham[features], all_data_bham[target], test_size=0.2, random_state=3)

automl = AutoML(
    results_path ='ml_results/poi_exclusive_bham_office_density/',
    algorithms=["CatBoost", "Xgboost", "Random Forest"],
    model_time_limit=1*60,
    start_random_models=5,
    hill_climbing_steps=2,
    top_models_to_improve=3,
    features_selection=False,
    stack_models=False,
    train_ensemble=True,
    explain_level=1,
    validation_strategy={
        "validation_type": "kfold",
        "k_folds": 4,
        "shuffle": False,
        "stratify": True,
    }
)

# fit the model
automl.fit(X_train, y_train)

y_train = y_train.astype(np.float32)
y_test = y_test.astype(np.float32)

predictions = automl.predict(X_test)
predictions = predictions.astype(np.float32)

# Fix NaN values
y_test = y_test.fillna(0)

r2_30 = r2_score(y_test, predictions)
rmse_30 = np.sqrt(mean_squared_error(y_test, predictions))

print(f'R^2 Score: {r2_30}')
print(f'RMSE: {rmse_30}')

#Save results for plotting
predictions_all_30 = automl.predict(all_data_bham[features])

results_bham_poi_density = pd.DataFrame({
    'name': bham_names,
    'geometry': bham_geometries,
    'observed': all_data_bham[target],
    'predicted': predictions_all_30,
})

AutoML directory: ml_results/poi_exclusive_bham_office_density/
The task is regression with evaluation metric rmse
AutoML will use algorithms: ['CatBoost', 'Xgboost', 'Random Forest']
AutoML will ensemble available models
AutoML steps: ['simple_algorithms', 'default_algorithms', 'not_so_random', 'hill_climbing_1', 'hill_climbing_2', 'ensemble']
Skip simple_algorithms because no parameters were generated.
* Step default_algorithms will try to check up to 3 models
1_Default_Xgboost rmse 2.166387 trained in 11.1 seconds
2_Default_CatBoost rmse 2.074265 trained in 14.75 seconds
3_Default_RandomForest rmse 2.117923 trained in 19.14 seconds
* Step not_so_random will try to check up to 12 models
4_Xgboost rmse 2.125148 trained in 4.07 seconds
8_CatBoost rmse 2.08122 trained in 11.85 seconds
12_RandomForest rmse 2.105842 trained in 21.06 seconds
5_Xgboost rmse 2.208737 trained in 5.83 seconds
9_CatBoost rmse 2.192014 trained in 14.64 seconds
13_RandomForest rmse 2.110464 trained in 18.27 secon

### Attempt with Demographic Data

In [98]:
# Same Analysis on Education, Employment Status, and Multiple Deprivation Data

general_health = pd.read_csv("data/lsoa_data/TS037_general_health.csv", skiprows = 7, header = 0)
employment_residential = pd.read_csv("data/lsoa_data/TS066_economic_activity_status.csv", skiprows = 7, header = 0)
education = pd.read_csv("data/lsoa_data/TS067_highest_qualification.csv", skiprows = 7, header = 0)
household_comp = pd.read_csv("data/lsoa_data/TS003_household_composition.csv", skiprows = 6, header = 0)
age_bands = pd.read_csv("data/lsoa_data/TS007B_age_broad_band.csv", skiprows = 4, header = 0)
english_lang = pd.read_csv("data/lsoa_data/TS029_english_language.csv", skiprows = 6, header = 0)

#Separate name into LSOA11CD and LSOA11NM (taken from DataCleaning.ipynb)
def split_column(value):
    if isinstance(value, str):
        code, name = value.split(' : ')
        return code.strip(), name.strip()
    else:
        return None, None

# Parse Code and Name out
general_health[['LSOA11CD', 'LSOA11NM']] = general_health['2021 super output area - lower layer'].apply(lambda x: pd.Series(split_column(x)))
employment_residential[['LSOA11CD', 'LSOA11NM']] = employment_residential['2021 super output area - lower layer'].apply(lambda x: pd.Series(split_column(x)))
education[['LSOA11CD', 'LSOA11NM']] = education['2021 super output area - lower layer'].apply(lambda x: pd.Series(split_column(x)))
household_comp[['LSOA11CD', 'LSOA11NM']] = household_comp['2021 super output area - lower layer'].apply(lambda x: pd.Series(split_column(x)))
age_bands[['LSOA11CD', 'LSOA11NM']] = age_bands['2021 super output area - lower layer'].apply(lambda x: pd.Series(split_column(x)))
english_lang[['LSOA11CD', 'LSOA11NM']] = english_lang['2021 super output area - lower layer'].apply(lambda x: pd.Series(split_column(x)))

# Drop original column
general_health = general_health.drop(columns=['2021 super output area - lower layer'])
employment_residential = employment_residential.drop(columns=['2021 super output area - lower layer'])
education = education.drop(columns=['2021 super output area - lower layer'])
household_comp = household_comp.drop(columns=['2021 super output area - lower layer'])
age_bands = age_bands.drop(columns=['2021 super output area - lower layer'])
english_lang = english_lang.drop(columns=['2021 super output area - lower layer'])


multiple_deprivation = pd.read_csv("data/lsoa_data/multiple_deprivation.csv", header = 0)
multiple_deprivation.rename(columns = {'LSOA code (2011)':'LSOA11CD', 'LSOA name (2011)':'LSOA11NM'}, inplace=True)
multiple_deprivation = multiple_deprivation.drop(columns=["Local Authority District code (2019)", "Local Authority District name (2019)"])


multiple_deprivation.head()


Unnamed: 0,LSOA11CD,LSOA11NM,Index of Multiple Deprivation (IMD) Rank,Index of Multiple Deprivation (IMD) Decile
0,E01000001,City of London 001A,29199,9
1,E01000002,City of London 001B,30379,10
2,E01000003,City of London 001C,14915,5
3,E01000005,City of London 001E,8678,3
4,E01000006,Barking and Dagenham 016A,14486,5


In [116]:
# Join all data together

combined_census = pd.merge(general_health, employment_residential, on='LSOA11CD', suffixes=('', '_drop'))
combined_census = pd.merge(combined_census, education, on='LSOA11CD', suffixes=('', '_drop'))
combined_census = pd.merge(combined_census, household_comp, on='LSOA11CD', suffixes=('', '_drop'))
combined_census = pd.merge(combined_census, age_bands, on='LSOA11CD', suffixes=('', '_drop'))
combined_census = pd.merge(combined_census, english_lang, on='LSOA11CD', suffixes=('', '_drop'))
combined_census = pd.merge(combined_census, multiple_deprivation, on='LSOA11CD', suffixes=('', '_drop'))

# Get column lists
combined_census = combined_census[[col for col in combined_census.columns if not col.endswith('_drop')]]
combined_census_columns = list(combined_census.columns)
exclude_columns = ['LSOA11CD', 'LSOA11NM', 'residual', 'Total: All usual residents', 'Total: All usual residents aged 16 years and over']
census_feature_columns = [col for col in combined_census.columns if col not in exclude_columns]

# Join with london and birmingham model output data
combined_model = all_data_london
combined_model = combined_model.drop(columns=['LSOA11NM'])
combined_census = combined_census.merge(combined_model, on='LSOA11CD', how='inner')

# Fix string rank data
combined_census['Index of Multiple Deprivation (IMD) Rank'] = combined_census['Index of Multiple Deprivation (IMD) Rank'].str.replace(',', '')
combined_census['Index of Multiple Deprivation (IMD) Rank']  = pd.to_numeric(combined_census['Index of Multiple Deprivation (IMD) Rank'] )   

combined_census.head()

Unnamed: 0,Total: All usual residents,Very good health,Good health,Fair health,Bad health,Very bad health,LSOA11CD,LSOA11NM,Total: All usual residents aged 16 years and over,Economically active (excluding full-time students),...,lag_veterinarian,lag_videographer,lag_vitamins_and_supplements,lag_warehouses,lag_waterproofing,lag_waxing,lag_wholesale_grocer,lag_wildlife_sanctuary,lag_wills_trusts_and_probate,lag_winery
0,100.0,58.2,31.7,8.1,1.2,0.7,E01000001,City of London 001A,100.0,65.7,...,0.166667,0.333333,0.0,0.166667,0.0,0.0,0.0,0.0,0.333333,0.0
1,100.0,60.4,30.6,6.7,1.7,0.6,E01000002,City of London 001B,100.0,69.3,...,0.0,0.0,0.333333,0.833333,0.0,0.166667,0.0,0.0,0.166667,0.5
2,100.0,49.0,36.4,11.5,2.7,0.4,E01000003,City of London 001C,100.0,70.3,...,0.166667,0.166667,0.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0
3,100.0,45.5,35.3,12.0,5.7,1.5,E01000005,City of London 001E,100.0,55.8,...,0.0,0.0,0.0,0.166667,0.0,0.166667,0.0,0.0,0.0,0.0
4,100.0,64.6,28.8,4.9,1.5,0.1,E01032739,City of London 001F,100.0,78.4,...,0.0,0.0,0.0,0.333333,0.0,0.0,0.0,0.0,0.0,0.5


In [130]:
### Employment (log), NO Spatial Lag, London, Demographic Data

# Ignore depracation warnings
warnings.filterwarnings('ignore')

# Create training and testing data
features = feature_columns_london + census_feature_columns
target = 'log_total_employment'

# Split the dataset - 80/20 train test
X_train, X_test, y_train, y_test = train_test_split(combined_census[features], combined_census[target], test_size=0.2, random_state=3)

automl = AutoML(
    results_path ='ml_results/london_employment_demographic/',
    algorithms=["CatBoost", "Xgboost", "Random Forest"],
    model_time_limit=1*60,
    start_random_models=5,
    hill_climbing_steps=2,
    top_models_to_improve=3,
    features_selection=False,
    stack_models=False,
    train_ensemble=True,
    explain_level=1,
    validation_strategy={
        "validation_type": "kfold",
        "k_folds": 4,
        "shuffle": False,
        "stratify": True,
    }
)

# fit the model
automl.fit(X_train, y_train)

y_train = y_train.astype(np.float32)
y_test = y_test.astype(np.float32)

predictions = automl.predict(X_test)
predictions = predictions.astype(np.float32)

# Fix NaN values
y_test = y_test.fillna(0)

r2_23 = r2_score(y_test, predictions)
rmse_23 = np.sqrt(mean_squared_error(y_test, predictions))

print(f'R^2 Score: {r2_23}')
print(f'RMSE: {rmse_23}')

#Save results for plotting
predictions_all_23 = automl.predict(combined_census[features])

results_london_demographic = pd.DataFrame({
    'name': london_names,
    'geometry': london_geometries,
    'observed': combined_census[target],
    'predicted': predictions_all_23,
})

# Save to project folder
results_london_demographic.to_csv("data/combined_data/model_results_london_demographic.csv")

This model has already been fitted. You can use predict methods or select a new 'results_path' for a new 'fit()'.
R^2 Score: 0.7474297689503127
RMSE: 0.6102148294448853


ValueError: array length 4659 does not match index length 4835