# Modelling

This is the script where I store all my ML model runs

## Library Imports

In [60]:
# Library Imports

#Basics
import pandas as pd
import geopandas as gpd
import numpy as np
import csv

#Shapely / Spatial
from shapely import wkt
import shapely.geometry
from shapely.geometry import Polygon, MultiPolygon

#Machine Learning
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.preprocessing import OneHotEncoder

#ML from mljar-supervised
from supervised.automl import AutoML

#Warning Supression
import warnings

### Import Datasets

In [62]:
# Read London CSV
all_data_london = pd.read_csv("data/combined_data/lag/all_data_london_lag.csv")

# Read in feature column set
with open("data/combined_data/total_feature_columns_london.csv", 'r') as file:
    lines = file.readlines()

# Reconstruct
feature_columns_london_lag = [''.join(line.strip().split(',')) for line in lines]

# Create non-laged column set
feature_columns_london = [col for col in feature_columns_london_lag if not col.startswith('lag_')]

# ---

# Read Birmingham CSV
all_data_bham = pd.read_csv("data/combined_data/lag/all_data_bham_lag.csv")

# Read in feature column set
with open("data/combined_data/total_feature_columns_bham.csv", 'r') as file:
    lines = file.readlines()

# Reconstruct
feature_columns_bham_lag = [''.join(line.strip().split(',')) for line in lines]

# Create non-lagged column set
feature_columns_bham = [col for col in feature_columns_bham_lag if not col.startswith('lag_')]

# Fix null values ending up in logged variables
all_data_london.fillna(0)
all_data_bham.fillna(0)

Unnamed: 0.1,Unnamed: 0,LSOA11CD,LSOA11NM_x,LSOA11NM_y,Unnamed: 2,population,Area,"01 : Crop and animal production, hunting and related service activities",02 : Forestry and logging,03 : Fishing and aquaculture,...,lag_travel,lag_travel_agents,lag_trusts,lag_university_housing,lag_used_vintage_and_consignment,lag_veterinarian,lag_videographer,lag_vitamins_and_supplements,lag_warehouses,lag_window_washing
0,0,E01008881,Birmingham 067A,Birmingham 067A,0.0,1599,lsoa2011:E01008881 : Birmingham 067A,0.0,0.0,0.0,...,0.000000,0.166667,0.0,0.0,0.0,0.0,0.0,0.000000,0.333333,0.0
1,1,E01008882,Birmingham 066A,Birmingham 066A,0.0,1747,lsoa2011:E01008882 : Birmingham 066A,0.0,0.0,0.0,...,0.166667,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.500000,0.0
2,2,E01008883,Birmingham 078A,Birmingham 078A,0.0,1816,lsoa2011:E01008883 : Birmingham 078A,0.0,0.0,0.0,...,0.166667,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0
3,3,E01008884,Birmingham 078B,Birmingham 078B,0.0,1870,lsoa2011:E01008884 : Birmingham 078B,0.0,0.0,0.0,...,0.333333,0.000000,0.0,0.0,0.0,0.0,0.0,0.166667,0.500000,0.0
4,4,E01008885,Birmingham 076A,Birmingham 076A,0.0,1308,lsoa2011:E01008885 : Birmingham 076A,0.0,0.0,0.0,...,0.000000,0.166667,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
634,634,E01033646,Birmingham 031I,Birmingham 031I,0.0,1624,lsoa2011:E01033646 : Birmingham 031I,0.0,0.0,0.0,...,0.166667,0.000000,0.0,0.0,0.0,0.0,0.0,0.166667,0.000000,0.0
635,635,E01033647,Birmingham 058E,Birmingham 058E,0.0,1398,lsoa2011:E01033647 : Birmingham 058E,0.0,0.0,0.0,...,0.166667,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0
636,636,E01033648,Birmingham 084F,Birmingham 084F,0.0,2715,lsoa2011:E01033648 : Birmingham 084F,0.0,0.0,0.0,...,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0
637,637,E01033649,Birmingham 058F,Birmingham 058F,0.0,1801,lsoa2011:E01033649 : Birmingham 058F,0.0,0.0,0.0,...,0.333333,0.000000,0.0,0.0,0.0,0.0,0.0,0.166667,0.000000,0.0


In [64]:
# Download POI & auxillary-only feature spaces (No building footprint information)

# London
# Read in POI feature column set
with open("data/combined_data/feature_columns_london_poi.csv", 'r') as file:
    lines = file.readlines()

# Reconstruct
feature_columns_london_poi = [''.join(line.strip().split(',')) for line in lines]

# Birmingham
# Read in POI feature column set
with open("data/combined_data/feature_columns_bham_poi.csv", 'r') as file:
    lines = file.readlines()

# Reconstruct
feature_columns_bham_poi = [''.join(line.strip().split(',')) for line in lines]

In [73]:
# Save Geometries and Names

london_geometries = all_data_london.loc[all_data_london[target].index, 'geometry']
bham_geometries = all_data_bham.loc[all_data_bham[target].index, 'geometry']

london_names = all_data_london.loc[all_data_london[target].index, 'LSOA11CD']
bham_names = all_data_bham.loc[all_data_bham[target].index, 'LSOA11CD']

### Test Model

In [8]:
# Ignore depracation warnings
warnings.filterwarnings('ignore')

# Create training and testing data
features = feature_columns_london_lag
target = 'log_total_employment'

# Split the dataset - 80/20 train test
X_train, X_test, y_train, y_test = train_test_split(all_data_london[features], all_data_london[target], test_size=0.2, random_state=3)

automl = AutoML(results_path="ml_results/dummy_models/test", mode='Explain')

y_train = y_train.astype(np.float32)
y_test = y_test.astype(np.float32)

# fit the model
automl.fit(X_train, y_train)

predictions = automl.predict(X_test)
predictions = predictions.astype(np.float32)

# Fix NaN values
y_test = y_test.fillna(0)

r2_log_employment = r2_score(y_test, predictions)
rmse_log_employment = np.sqrt(mean_squared_error(y_test, predictions))

print(f'R^2 Score: {r2_log_employment}')
print(f'RMSE: {rmse_log_employment}')

#Save results for plotting
predictions_all = automl.predict(all_data_london[features])

results_test = pd.DataFrame({
    'geometry': london_geometries,
    'observed': all_data_london[target],
    'predicted': predictions_all,
})

This model has already been fitted. You can use predict methods or select a new 'results_path' for a new 'fit()'.
R^2 Score: 0.7207009026018532
RMSE: 0.678412914276123


## Models

In [39]:
### Employment (log), NO Spatial Lag, London (1)

# Create training and testing data
features = feature_columns_london
target = 'log_total_employment'

# Split the dataset - 80/20 train test
X_train, X_test, y_train, y_test = train_test_split(all_data_london[features], all_data_london[target], test_size=0.2, random_state=3)

automl = AutoML(
    results_path ='ml_results/log_employment_london/',
    algorithms=["CatBoost", "Xgboost", "Random Forest"],
    model_time_limit=1*60,
    start_random_models=5,
    hill_climbing_steps=2,
    top_models_to_improve=3,
    features_selection=False,
    stack_models=False,
    train_ensemble=True,
    explain_level=1,
    validation_strategy={
        "validation_type": "kfold",
        "k_folds": 4,
        "shuffle": False,
        "stratify": True,
    }
)

# fit the model
automl.fit(X_train, y_train)

y_train = y_train.astype(np.float32)
y_test = y_test.astype(np.float32)

predictions = automl.predict(X_test)
predictions = predictions.astype(np.float32)

# Fix NaN values
y_test = y_test.fillna(0)

r2_1 = r2_score(y_test, predictions)
rmse_1 = np.sqrt(mean_squared_error(y_test, predictions))

print(f'R^2 Score: {r2_1}')
print(f'RMSE: {rmse_1}')

#Save results for plotting
predictions_all_1 = automl.predict(all_data_london[features])
london_geometries = all_data_london.loc[all_data_london[target].index, 'geometry']

results_office_density_cleaned_perform = pd.DataFrame({
    'geometry': london_geometries,
    'observed': all_data_london[target],
    'predicted': predictions_all_1,
})


AutoML directory: ml_results/log_employment_london/
The task is regression with evaluation metric rmse
AutoML will use algorithms: ['CatBoost', 'Xgboost', 'Random Forest']
AutoML will ensemble available models
AutoML steps: ['simple_algorithms', 'default_algorithms', 'not_so_random', 'hill_climbing_1', 'hill_climbing_2', 'ensemble']
Skip simple_algorithms because no parameters were generated.
* Step default_algorithms will try to check up to 3 models
1_Default_Xgboost rmse 0.638237 trained in 44.72 seconds
2_Default_CatBoost rmse 0.624872 trained in 39.45 seconds
3_Default_RandomForest rmse 0.692124 trained in 1092.47 seconds
* Step not_so_random will try to check up to 12 models
4_Xgboost rmse 0.647595 trained in 31.74 seconds
8_CatBoost rmse 0.62888 trained in 57.43 seconds
12_RandomForest rmse 0.689969 trained in 42.49 seconds
5_Xgboost rmse 0.646129 trained in 955.1 seconds
9_CatBoost rmse 0.640378 trained in 954.04 seconds
13_RandomForest rmse 0.713245 trained in 57.77 seconds
6_X

In [40]:
### Employment (log), NO Spatial Lag, Birmingham (2)

# Create training and testing data
features = feature_columns_bham
target = 'log_total_employment'

# Split the dataset - 80/20 train test
X_train, X_test, y_train, y_test = train_test_split(all_data_bham[features], all_data_bham[target], test_size=0.2, random_state=3)

automl = AutoML(
    results_path ='ml_results/log_employment_bham/',
    algorithms=["CatBoost", "Xgboost", "Random Forest"],
    model_time_limit=1*60,
    start_random_models=5,
    hill_climbing_steps=2,
    top_models_to_improve=3,
    features_selection=False,
    stack_models=False,
    train_ensemble=True,
    explain_level=1,
    validation_strategy={
        "validation_type": "kfold",
        "k_folds": 4,
        "shuffle": False,
        "stratify": True,
    }
)

# fit the model
automl.fit(X_train, y_train)

y_train = y_train.astype(np.float32)
y_test = y_test.astype(np.float32)

predictions = automl.predict(X_test)
predictions = predictions.astype(np.float32)

# Fix NaN values
y_test = y_test.fillna(0)

r2_2 = r2_score(y_test, predictions)
rmse_2 = np.sqrt(mean_squared_error(y_test, predictions))

print(f'R^2 Score: {r2_2}')
print(f'RMSE: {rmse_2}')

#Save results for plotting
predictions_all_2 = automl.predict(all_data_bham[features])

results_office_density_cleaned_perform = pd.DataFrame({
    'geometry': bham_geometries,
    'observed': all_data_bham[target],
    'predicted': predictions_all_2,
})


AutoML directory: ml_results/log_employment_bham/
The task is regression with evaluation metric rmse
AutoML will use algorithms: ['CatBoost', 'Xgboost', 'Random Forest']
AutoML will ensemble available models
AutoML steps: ['simple_algorithms', 'default_algorithms', 'not_so_random', 'hill_climbing_1', 'hill_climbing_2', 'ensemble']
Skip simple_algorithms because no parameters were generated.
* Step default_algorithms will try to check up to 3 models
1_Default_Xgboost rmse 0.803686 trained in 5.65 seconds
2_Default_CatBoost rmse 0.783334 trained in 18.54 seconds
3_Default_RandomForest rmse 0.821087 trained in 24.78 seconds
* Step not_so_random will try to check up to 12 models
4_Xgboost rmse 0.807126 trained in 5.22 seconds
8_CatBoost rmse 0.810248 trained in 22.31 seconds
12_RandomForest rmse 0.816138 trained in 57.95 seconds
5_Xgboost rmse 0.808929 trained in 5.8 seconds
9_CatBoost rmse 0.830576 trained in 20.01 seconds
13_RandomForest rmse 0.848962 trained in 56.66 seconds
6_Xgboost r

In [41]:
### Employment (log), Spatial Lag, London (3)

# Create training and testing data
features = feature_columns_london_lag
target = 'log_total_employment'

# Split the dataset - 80/20 train test
X_train, X_test, y_train, y_test = train_test_split(all_data_london[features], all_data_london[target], test_size=0.2, random_state=3)

automl = AutoML(
    results_path ='ml_results/log_employment_london_lag/',
    algorithms=["CatBoost", "Xgboost", "Random Forest"],
    model_time_limit=1*60,
    start_random_models=5,
    hill_climbing_steps=2,
    top_models_to_improve=3,
    features_selection=False,
    stack_models=False,
    train_ensemble=True,
    explain_level=1,
    validation_strategy={
        "validation_type": "kfold",
        "k_folds": 4,
        "shuffle": False,
        "stratify": True,
    }
)

# fit the model
automl.fit(X_train, y_train)

y_train = y_train.astype(np.float32)
y_test = y_test.astype(np.float32)

predictions = automl.predict(X_test)
predictions = predictions.astype(np.float32)

# Fix NaN values
y_test = y_test.fillna(0)

r2_3 = r2_score(y_test, predictions)
rmse_3 = np.sqrt(mean_squared_error(y_test, predictions))

print(f'R^2 Score: {r2_3}')
print(f'RMSE: {rmse_3}')

#Save results for plotting
predictions_all_3 = automl.predict(all_data_london[features])

results_office_density_cleaned_perform = pd.DataFrame({
    'geometry': london_geometries,
    'observed': all_data_london[target],
    'predicted': predictions_all_3,
})


AutoML directory: ml_results/log_employment_london_lag/
The task is regression with evaluation metric rmse
AutoML will use algorithms: ['CatBoost', 'Xgboost', 'Random Forest']
AutoML will ensemble available models
AutoML steps: ['simple_algorithms', 'default_algorithms', 'not_so_random', 'hill_climbing_1', 'hill_climbing_2', 'ensemble']
Skip simple_algorithms because no parameters were generated.
* Step default_algorithms will try to check up to 3 models
1_Default_Xgboost rmse 0.638593 trained in 1086.36 seconds
2_Default_CatBoost rmse 0.63385 trained in 2029.01 seconds
3_Default_RandomForest rmse 0.693922 trained in 2068.46 seconds
Skip not_so_random because of the time limit.
Skip hill_climbing_1 because of the time limit.
Skip hill_climbing_2 because of the time limit.
* Step ensemble will try to check up to 1 model
Ensemble rmse 0.627695 trained in 0.05 seconds
AutoML fit time: 5186.08 seconds
AutoML best model: Ensemble
R^2 Score: 0.7547717661754779
RMSE: 0.635688841342926


In [22]:
### Employment (log), Spatial Lag, Birmingham (4)

# Create training and testing data
features = feature_columns_bham_lag
target = 'log_total_employment'

# Split the dataset - 80/20 train test
X_train, X_test, y_train, y_test = train_test_split(all_data_bham[features], all_data_bham[target], test_size=0.2, random_state=3)

automl = AutoML(
    results_path ='ml_results/log_employment_bham_lag/',
    algorithms=["CatBoost", "Xgboost", "Random Forest"],
    model_time_limit=1*60,
    start_random_models=5,
    hill_climbing_steps=2,
    top_models_to_improve=3,
    features_selection=False,
    stack_models=False,
    train_ensemble=True,
    explain_level=1,
    validation_strategy={
        "validation_type": "kfold",
        "k_folds": 4,
        "shuffle": False,
        "stratify": True,
    }
)

# fit the model
automl.fit(X_train, y_train)

y_train = y_train.astype(np.float32)
y_test = y_test.astype(np.float32)

predictions = automl.predict(X_test)
predictions = predictions.astype(np.float32)

# Fix NaN values
y_test = y_test.fillna(0)

r2_4 = r2_score(y_test, predictions)
rmse_4 = np.sqrt(mean_squared_error(y_test, predictions))

print(f'R^2 Score: {r2_4}')
print(f'RMSE: {rmse_4}')

#Save results for plotting
predictions_all_4 = automl.predict(all_data_bham[features])

results_office_density_cleaned_perform = pd.DataFrame({
    'geometry': bham_geometries,
    'observed': all_data_bham[target],
    'predicted': predictions_all_4,
})




This model has already been fitted. You can use predict methods or select a new 'results_path' for a new 'fit()'.
R^2 Score: 0.6180309508339799
RMSE: 0.7356722354888916


In [60]:
### Employment Density, NO Spatial Lag, London (5)

# Create training and testing data
features = feature_columns_london
target = 'employment_density'

# Split the dataset - 80/20 train test
X_train, X_test, y_train, y_test = train_test_split(all_data_london[features], all_data_london[target], test_size=0.2, random_state=3)

automl = AutoML(
    results_path ='ml_results/employment_density_london/',
    algorithms=["CatBoost", "Xgboost", "Random Forest"],
    model_time_limit=1*60,
    start_random_models=5,
    hill_climbing_steps=2,
    top_models_to_improve=3,
    features_selection=False,
    stack_models=False,
    train_ensemble=True,
    explain_level=1,
    validation_strategy={
        "validation_type": "kfold",
        "k_folds": 4,
        "shuffle": False,
        "stratify": True,
    }
)

# fit the model
automl.fit(X_train, y_train)

y_train = y_train.astype(np.float32)
y_test = y_test.astype(np.float32)

predictions = automl.predict(X_test)
predictions = predictions.astype(np.float32)

# Fix NaN values
y_test = y_test.fillna(0)

r2_5 = r2_score(y_test, predictions)
rmse_5 = np.sqrt(mean_squared_error(y_test, predictions))

print(f'R^2 Score: {r2_5}')
print(f'RMSE: {rmse_5}')

#Save results for plotting
predictions_all_5 = automl.predict(all_data_london[features])

results_office_density_cleaned_perform = pd.DataFrame({
    'geometry': london_geometries,
    'observed': all_data_london[target],
    'predicted': predictions_all_5,
})


AutoML directory: ml_results/employment_density_london/
The task is regression with evaluation metric rmse
AutoML will use algorithms: ['CatBoost', 'Xgboost', 'Random Forest']
AutoML will ensemble available models
AutoML steps: ['simple_algorithms', 'default_algorithms', 'not_so_random', 'hill_climbing_1', 'hill_climbing_2', 'ensemble']
Skip simple_algorithms because no parameters were generated.
* Step default_algorithms will try to check up to 3 models
1_Default_Xgboost rmse 0.008187 trained in 90.4 seconds
2_Default_CatBoost rmse 0.007915 trained in 175.05 seconds
3_Default_RandomForest rmse 0.008384 trained in 72.8 seconds
* Step not_so_random will try to check up to 12 models
4_Xgboost rmse 0.007647 trained in 542.18 seconds
8_CatBoost rmse 0.008524 trained in 320.1 seconds
12_RandomForest rmse 0.008001 trained in 128.94 seconds
5_Xgboost rmse 0.008374 trained in 763.09 seconds
9_CatBoost rmse 0.008263 trained in 55.59 seconds
13_RandomForest rmse 0.008615 trained in 297.25 second

In [61]:
### Employment Density, NO Spatial Lag, Birmingham (6)

# Create training and testing data
features = feature_columns_bham
target = 'employment_density'

# Split the dataset - 80/20 train test
X_train, X_test, y_train, y_test = train_test_split(all_data_bham[features], all_data_bham[target], test_size=0.2, random_state=3)

automl = AutoML(
    results_path ='ml_results/employment_density_bham/',
    algorithms=["CatBoost", "Xgboost", "Random Forest"],
    model_time_limit=1*60,
    start_random_models=5,
    hill_climbing_steps=2,
    top_models_to_improve=3,
    features_selection=False,
    stack_models=False,
    train_ensemble=True,
    explain_level=1,
    validation_strategy={
        "validation_type": "kfold",
        "k_folds": 4,
        "shuffle": False,
        "stratify": True,
    }
)

# fit the model
automl.fit(X_train, y_train)

y_train = y_train.astype(np.float32)
y_test = y_test.astype(np.float32)

predictions = automl.predict(X_test)
predictions = predictions.astype(np.float32)

# Fix NaN values
y_test = y_test.fillna(0)

r2_6 = r2_score(y_test, predictions)
rmse_6 = np.sqrt(mean_squared_error(y_test, predictions))

print(f'R^2 Score: {r2_6}')
print(f'RMSE: {rmse_6}')

#Save results for plotting
predictions_all_6 = automl.predict(all_data_bham[features])

results_office_density_cleaned_perform = pd.DataFrame({
    'geometry': bham_geometries,
    'observed': all_data_bham[target],
    'predicted': predictions_all_6,
})


AutoML directory: ml_results/employment_density_bham/
The task is regression with evaluation metric rmse
AutoML will use algorithms: ['CatBoost', 'Xgboost', 'Random Forest']
AutoML will ensemble available models
AutoML steps: ['simple_algorithms', 'default_algorithms', 'not_so_random', 'hill_climbing_1', 'hill_climbing_2', 'ensemble']
Skip simple_algorithms because no parameters were generated.
* Step default_algorithms will try to check up to 3 models
1_Default_Xgboost rmse 0.006401 trained in 54.94 seconds
2_Default_CatBoost rmse 0.006805 trained in 20.89 seconds
3_Default_RandomForest rmse 0.005864 trained in 21.74 seconds
* Step not_so_random will try to check up to 12 models
4_Xgboost rmse 0.005535 trained in 28.11 seconds
8_CatBoost rmse 0.006953 trained in 27.14 seconds
12_RandomForest rmse 0.006101 trained in 1000.47 seconds
5_Xgboost rmse 0.006364 trained in 5.41 seconds
9_CatBoost rmse 0.007136 trained in 20.74 seconds
13_RandomForest rmse 0.006015 trained in 340.22 seconds
6

In [28]:
### Employment Density, Spatial Lag, London (7)

# Create training and testing data
features = feature_columns_london_lag
target = 'employment_density'

# Split the dataset - 80/20 train test
X_train, X_test, y_train, y_test = train_test_split(all_data_london[features], all_data_london[target], test_size=0.2, random_state=3)

automl = AutoML(
    results_path ='ml_results/employment_density_london_lag/',
    algorithms=["CatBoost", "Xgboost", "Random Forest"],
    model_time_limit=1*60,
    start_random_models=5,
    hill_climbing_steps=2,
    top_models_to_improve=3,
    features_selection=False,
    stack_models=False,
    train_ensemble=True,
    explain_level=1,
    validation_strategy={
        "validation_type": "kfold",
        "k_folds": 4,
        "shuffle": False,
        "stratify": True,
    }
)

# fit the model
automl.fit(X_train, y_train)

y_train = y_train.astype(np.float32)
y_test = y_test.astype(np.float32)

predictions = automl.predict(X_test)
predictions = predictions.astype(np.float32)

# Fix NaN values
y_test = y_test.fillna(0)

r2_7 = r2_score(y_test, predictions)
rmse_7 = np.sqrt(mean_squared_error(y_test, predictions))

print(f'R^2 Score: {r2_7}')
print(f'RMSE: {rmse_7}')

#Save results for plotting
predictions_all_7 = automl.predict(all_data_london[features])

results_office_density_cleaned_perform = pd.DataFrame({
    'geometry': london_geometries,
    'observed': all_data_london[target],
    'predicted': predictions_all_7,
})


This model has already been fitted. You can use predict methods or select a new 'results_path' for a new 'fit()'.
R^2 Score: 0.7209666533175008
RMSE: 0.008803357370197773


In [30]:
### Employment Density, Spatial Lag, Birmingham (8)

# Create training and testing data
features = feature_columns_bham_lag
target = 'employment_density'

# Split the dataset - 80/20 train test
X_train, X_test, y_train, y_test = train_test_split(all_data_bham[features], all_data_bham[target], test_size=0.2, random_state=3)

automl = AutoML(
    results_path ='ml_results/employment_density_bham_lag/',
    algorithms=["CatBoost", "Xgboost", "Random Forest"],
    model_time_limit=1*60,
    start_random_models=5,
    hill_climbing_steps=2,
    top_models_to_improve=3,
    features_selection=False,
    stack_models=False,
    train_ensemble=True,
    explain_level=1,
    validation_strategy={
        "validation_type": "kfold",
        "k_folds": 4,
        "shuffle": False,
        "stratify": True,
    }
)

# fit the model
automl.fit(X_train, y_train)

y_train = y_train.astype(np.float32)
y_test = y_test.astype(np.float32)

predictions = automl.predict(X_test)
predictions = predictions.astype(np.float32)

# Fix NaN values
y_test = y_test.fillna(0)

r2_8 = r2_score(y_test, predictions)
rmse_8 = np.sqrt(mean_squared_error(y_test, predictions))

print(f'R^2 Score: {r2_8}')
print(f'RMSE: {rmse_8}')

#Save results for plotting
predictions_all_8 = automl.predict(all_data_bham[features])

results_office_density_cleaned_perform = pd.DataFrame({
    'geometry': bham_geometries,
    'observed': all_data_bham[target],
    'predicted': predictions_all_8,
})


AutoML directory: ml_results/employment_density_bham_lag/
The task is regression with evaluation metric rmse
AutoML will use algorithms: ['CatBoost', 'Xgboost', 'Random Forest']
AutoML will ensemble available models
AutoML steps: ['simple_algorithms', 'default_algorithms', 'not_so_random', 'hill_climbing_1', 'hill_climbing_2', 'ensemble']
Skip simple_algorithms because no parameters were generated.
* Step default_algorithms will try to check up to 3 models
1_Default_Xgboost rmse 0.006931 trained in 18.03 seconds
2_Default_CatBoost rmse 0.006576 trained in 44.52 seconds
3_Default_RandomForest rmse 0.006011 trained in 33.96 seconds
* Step not_so_random will try to check up to 12 models
4_Xgboost rmse 0.006089 trained in 173.69 seconds
8_CatBoost rmse 0.006753 trained in 417.24 seconds
12_RandomForest rmse 0.006154 trained in 351.31 seconds
5_Xgboost rmse 0.006984 trained in 9.58 seconds
9_CatBoost rmse 0.006702 trained in 58.83 seconds
13_RandomForest rmse 0.006145 trained in 39.22 secon

In [31]:
### Office Employment Density , NO Spatial Lag, London (9)

# Create training and testing data
features = feature_columns_london
target = 'office_employment_density'

# Split the dataset - 80/20 train test
X_train, X_test, y_train, y_test = train_test_split(all_data_london[features], all_data_london[target], test_size=0.2, random_state=3)

automl = AutoML(
    results_path ='ml_results/office_employment_density_london/',
    algorithms=["CatBoost", "Xgboost", "Random Forest"],
    model_time_limit=1*60,
    start_random_models=5,
    hill_climbing_steps=2,
    top_models_to_improve=3,
    features_selection=False,
    stack_models=False,
    train_ensemble=True,
    explain_level=1,
    validation_strategy={
        "validation_type": "kfold",
        "k_folds": 4,
        "shuffle": False,
        "stratify": True,
    }
)

# fit the model
automl.fit(X_train, y_train)

y_train = y_train.astype(np.float32)
y_test = y_test.astype(np.float32)

predictions = automl.predict(X_test)
predictions = predictions.astype(np.float32)

# Fix NaN values
y_test = y_test.fillna(0)

r2_9 = r2_score(y_test, predictions)
rmse_9 = np.sqrt(mean_squared_error(y_test, predictions))

print(f'R^2 Score: {r2_9}')
print(f'RMSE: {rmse_9}')

#Save results for plotting
predictions_all_9 = automl.predict(all_data_london[features])

results_office_density_cleaned_perform = pd.DataFrame({
    'geometry': london_geometries,
    'observed': all_data_london[target],
    'predicted': predictions_all_9,
})


This model has already been fitted. You can use predict methods or select a new 'results_path' for a new 'fit()'.
R^2 Score: 0.5798782180896167
RMSE: 0.007589899934828281


In [32]:
### Office Employment Density , NO Spatial Lag, Birmingham (10)

# Create training and testing data
features = feature_columns_bham
target = 'office_employment_density'

# Split the dataset - 80/20 train test
X_train, X_test, y_train, y_test = train_test_split(all_data_bham[features], all_data_bham[target], test_size=0.2, random_state=3)

automl = AutoML(
    results_path ='ml_results/office_employment_density_bham/',
    algorithms=["CatBoost", "Xgboost", "Random Forest"],
    model_time_limit=1*60,
    start_random_models=5,
    hill_climbing_steps=2,
    top_models_to_improve=3,
    features_selection=False,
    stack_models=False,
    train_ensemble=True,
    explain_level=1,
    validation_strategy={
        "validation_type": "kfold",
        "k_folds": 4,
        "shuffle": False,
        "stratify": True,
    }
)

# fit the model
automl.fit(X_train, y_train)

y_train = y_train.astype(np.float32)
y_test = y_test.astype(np.float32)

predictions = automl.predict(X_test)
predictions = predictions.astype(np.float32)

# Fix NaN values
y_test = y_test.fillna(0)

r2_10 = r2_score(y_test, predictions)
rmse_10 = np.sqrt(mean_squared_error(y_test, predictions))

print(f'R^2 Score: {r2_10}')
print(f'RMSE: {rmse_10}')

#Save results for plotting
predictions_all_10 = automl.predict(all_data_bham[features])

results_office_density_cleaned_perform = pd.DataFrame({
    'geometry': bham_geometries,
    'observed': all_data_bham[target],
    'predicted': predictions_all_10,
})


AutoML directory: ml_results/office_employment_density_bham/
The task is regression with evaluation metric rmse
AutoML will use algorithms: ['CatBoost', 'Xgboost', 'Random Forest']
AutoML will ensemble available models
AutoML steps: ['simple_algorithms', 'default_algorithms', 'not_so_random', 'hill_climbing_1', 'hill_climbing_2', 'ensemble']
Skip simple_algorithms because no parameters were generated.
* Step default_algorithms will try to check up to 3 models
1_Default_Xgboost rmse 0.00504 trained in 5.81 seconds
2_Default_CatBoost rmse 0.005388 trained in 15.23 seconds
3_Default_RandomForest rmse 0.004535 trained in 15.5 seconds
* Step not_so_random will try to check up to 12 models
4_Xgboost rmse 0.00489 trained in 4.56 seconds
8_CatBoost rmse 0.00553 trained in 31.96 seconds
12_RandomForest rmse 0.005026 trained in 19.02 seconds
5_Xgboost rmse 0.005062 trained in 5.02 seconds
9_CatBoost rmse 0.005488 trained in 14.59 seconds
13_RandomForest rmse 0.00493 trained in 17.42 seconds
6_Xg

In [33]:
### Office Employment Density, Spatial Lag, London (11)

# Create training and testing data
features = feature_columns_london_lag
target = 'office_employment_density'

# Split the dataset - 80/20 train test
X_train, X_test, y_train, y_test = train_test_split(all_data_london[features], all_data_london[target], test_size=0.2, random_state=3)

automl = AutoML(
    results_path ='ml_results/office_employment_density_london_lag/',
    algorithms=["CatBoost", "Xgboost", "Random Forest"],
    model_time_limit=1*60,
    start_random_models=5,
    hill_climbing_steps=2,
    top_models_to_improve=3,
    features_selection=False,
    stack_models=False,
    train_ensemble=True,
    explain_level=1,
    validation_strategy={
        "validation_type": "kfold",
        "k_folds": 4,
        "shuffle": False,
        "stratify": True,
    }
)

# fit the model
automl.fit(X_train, y_train)

y_train = y_train.astype(np.float32)
y_test = y_test.astype(np.float32)

predictions = automl.predict(X_test)
predictions = predictions.astype(np.float32)

# Fix NaN values
y_test = y_test.fillna(0)

r2_11 = r2_score(y_test, predictions)
rmse_11 = np.sqrt(mean_squared_error(y_test, predictions))

print(f'R^2 Score: {r2_11}')
print(f'RMSE: {rmse_11}')

#Save results for plotting
predictions_all_11 = automl.predict(all_data_london[features])

results_office_density_cleaned_perform = pd.DataFrame({
    'geometry': london_geometries,
    'observed': all_data_london[target],
    'predicted': predictions_all_11,
})


AutoML directory: ml_results/office_employment_density_london_lag/
The task is regression with evaluation metric rmse
AutoML will use algorithms: ['CatBoost', 'Xgboost', 'Random Forest']
AutoML will ensemble available models
AutoML steps: ['simple_algorithms', 'default_algorithms', 'not_so_random', 'hill_climbing_1', 'hill_climbing_2', 'ensemble']
Skip simple_algorithms because no parameters were generated.
* Step default_algorithms will try to check up to 3 models
1_Default_Xgboost rmse 0.005845 trained in 575.16 seconds
2_Default_CatBoost rmse 0.005881 trained in 394.81 seconds
3_Default_RandomForest rmse 0.005943 trained in 167.17 seconds
* Step not_so_random will try to check up to 12 models
4_Xgboost rmse 0.005577 trained in 163.09 seconds
8_CatBoost rmse 0.006433 trained in 2035.32 seconds
12_RandomForest rmse 0.006022 trained in 121.81 seconds
5_Xgboost rmse 0.005969 trained in 231.43 seconds
9_CatBoost rmse 0.006262 trained in 1942.51 seconds
13_RandomForest rmse 0.006054 train

In [34]:
### Office Employment Density (log), Spatial Lag, Birmingham (12)

# Create training and testing data
features = feature_columns_bham_lag
target = 'office_employment_density'

# Split the dataset - 80/20 train test
X_train, X_test, y_train, y_test = train_test_split(all_data_bham[features], all_data_bham[target], test_size=0.2, random_state=3)

automl = AutoML(
    results_path ='ml_results/office_employment_density_bham_lag/',
    algorithms=["CatBoost", "Xgboost", "Random Forest"],
    model_time_limit=1*60,
    start_random_models=5,
    hill_climbing_steps=2,
    top_models_to_improve=3,
    features_selection=False,
    stack_models=False,
    train_ensemble=True,
    explain_level=1,
    validation_strategy={
        "validation_type": "kfold",
        "k_folds": 4,
        "shuffle": False,
        "stratify": True,
    }
)

# fit the model
automl.fit(X_train, y_train)

y_train = y_train.astype(np.float32)
y_test = y_test.astype(np.float32)

predictions = automl.predict(X_test)
predictions = predictions.astype(np.float32)

# Fix NaN values
y_test = y_test.fillna(0)

r2_12 = r2_score(y_test, predictions)
rmse_12 = np.sqrt(mean_squared_error(y_test, predictions))

print(f'R^2 Score: {r2_12}')
print(f'RMSE: {rmse_12}')

#Save results for plotting
predictions_all_12 = automl.predict(all_data_bham[features])

results_office_density_cleaned_perform = pd.DataFrame({
    'geometry': bham_geometries,
    'observed': all_data_bham[target],
    'predicted': predictions_all_12,
})


AutoML directory: ml_results/office_employment_density_bham_lag/
The task is regression with evaluation metric rmse
AutoML will use algorithms: ['CatBoost', 'Xgboost', 'Random Forest']
AutoML will ensemble available models
AutoML steps: ['simple_algorithms', 'default_algorithms', 'not_so_random', 'hill_climbing_1', 'hill_climbing_2', 'ensemble']
Skip simple_algorithms because no parameters were generated.
* Step default_algorithms will try to check up to 3 models
1_Default_Xgboost rmse 0.005194 trained in 12.15 seconds
2_Default_CatBoost rmse 0.005375 trained in 51.11 seconds
3_Default_RandomForest rmse 0.004895 trained in 36.26 seconds
* Step not_so_random will try to check up to 12 models
4_Xgboost rmse 0.005175 trained in 11.0 seconds
8_CatBoost rmse 0.005434 trained in 1160.44 seconds
12_RandomForest rmse 0.005042 trained in 33.76 seconds
5_Xgboost rmse 0.005152 trained in 10.12 seconds
9_CatBoost rmse 0.005301 trained in 51.18 seconds
13_RandomForest rmse 0.004905 trained in 39.18

In [45]:
### Employment (log), NO Spatial Lag, London, POI only (13)

# Ignore depracation warnings
warnings.filterwarnings('ignore')

# Create training and testing data
features = feature_columns_london_poi
target = 'log_total_employment'

# Split the dataset - 80/20 train test
X_train, X_test, y_train, y_test = train_test_split(all_data_london[features], all_data_london[target], test_size=0.2, random_state=3)

automl = AutoML(
    results_path ='ml_results/log_employment_london_poi/',
    algorithms=["CatBoost", "Xgboost", "Random Forest"],
    model_time_limit=1*60,
    start_random_models=5,
    hill_climbing_steps=2,
    top_models_to_improve=3,
    features_selection=False,
    stack_models=False,
    train_ensemble=True,
    explain_level=1,
    validation_strategy={
        "validation_type": "kfold",
        "k_folds": 4,
        "shuffle": False,
        "stratify": True,
    }
)

# fit the model
automl.fit(X_train, y_train)

y_train = y_train.astype(np.float32)
y_test = y_test.astype(np.float32)

predictions = automl.predict(X_test)
predictions = predictions.astype(np.float32)

# Fix NaN values
y_test = y_test.fillna(0)

r2_13 = r2_score(y_test, predictions)
rmse_13 = np.sqrt(mean_squared_error(y_test, predictions))

print(f'R^2 Score: {r2_13}')
print(f'RMSE: {rmse_13}')

#Save results for plotting
predictions_all_13 = automl.predict(all_data_london[features])

results_office_density_cleaned_perform = pd.DataFrame({
    'geometry': london_geometries,
    'observed': all_data_london[target],
    'predicted': predictions_all_13,
})


This model has already been fitted. You can use predict methods or select a new 'results_path' for a new 'fit()'.
R^2 Score: 0.7256287050510382
RMSE: 0.672401487827301


In [36]:
### Employment (log), NO Spatial Lag, Brimingham, POI only (14)

# Create training and testing data
features = feature_columns_bham_poi
target = 'log_total_employment'

# Split the dataset - 80/20 train test
X_train, X_test, y_train, y_test = train_test_split(all_data_bham[features], all_data_bham[target], test_size=0.2, random_state=3)

automl = AutoML(
    results_path ='ml_results/log_employment_bham_poi/',
    algorithms=["CatBoost", "Xgboost", "Random Forest"],
    model_time_limit=1*60,
    start_random_models=5,
    hill_climbing_steps=2,
    top_models_to_improve=3,
    features_selection=False,
    stack_models=False,
    train_ensemble=True,
    explain_level=1,
    validation_strategy={
        "validation_type": "kfold",
        "k_folds": 4,
        "shuffle": False,
        "stratify": True,
    }
)

# fit the model
automl.fit(X_train, y_train)

y_train = y_train.astype(np.float32)
y_test = y_test.astype(np.float32)

predictions = automl.predict(X_test)
predictions = predictions.astype(np.float32)

# Fix NaN values
y_test = y_test.fillna(0)

r2_14 = r2_score(y_test, predictions)
rmse_14 = np.sqrt(mean_squared_error(y_test, predictions))

print(f'R^2 Score: {r2_14}')
print(f'RMSE: {rmse_14}')

#Save results for plotting
predictions_all_14 = automl.predict(all_data_bham[features])

results_office_density_cleaned_perform = pd.DataFrame({
    'geometry': bham_geometries,
    'observed': all_data_bham[target],
    'predicted': predictions_all_14,
})




AutoML directory: ml_results/log_employment_bham_poi/
The task is regression with evaluation metric rmse
AutoML will use algorithms: ['CatBoost', 'Xgboost', 'Random Forest']
AutoML will ensemble available models
AutoML steps: ['simple_algorithms', 'default_algorithms', 'not_so_random', 'hill_climbing_1', 'hill_climbing_2', 'ensemble']
Skip simple_algorithms because no parameters were generated.
* Step default_algorithms will try to check up to 3 models
1_Default_Xgboost rmse 0.796247 trained in 1077.34 seconds
2_Default_CatBoost rmse 0.811589 trained in 10.74 seconds
3_Default_RandomForest rmse 0.819108 trained in 18.32 seconds
* Step not_so_random will try to check up to 12 models
4_Xgboost rmse 0.814556 trained in 4.52 seconds
8_CatBoost rmse 0.820528 trained in 17.07 seconds
12_RandomForest rmse 0.81786 trained in 18.95 seconds
5_Xgboost rmse 0.818373 trained in 5.14 seconds
9_CatBoost rmse 0.835969 trained in 14.25 seconds
13_RandomForest rmse 0.847451 trained in 20.41 seconds
6_Xg

In [37]:
### Employment Density, NO Spatial Lag, London, POI only (15)


# Create training and testing data
features = feature_columns_london_poi
target = 'employment_density'

# Split the dataset - 80/20 train test
X_train, X_test, y_train, y_test = train_test_split(all_data_london[features], all_data_london[target], test_size=0.2, random_state=3)

automl = AutoML(
    results_path ='ml_results/employment_density_london_poi/',
    algorithms=["CatBoost", "Xgboost", "Random Forest"],
    model_time_limit=1*60,
    start_random_models=5,
    hill_climbing_steps=2,
    top_models_to_improve=3,
    features_selection=False,
    stack_models=False,
    train_ensemble=True,
    explain_level=1,
    validation_strategy={
        "validation_type": "kfold",
        "k_folds": 4,
        "shuffle": False,
        "stratify": True,
    }
)

# fit the model
automl.fit(X_train, y_train)

y_train = y_train.astype(np.float32)
y_test = y_test.astype(np.float32)

predictions = automl.predict(X_test)
predictions = predictions.astype(np.float32)

# Fix NaN values
y_test = y_test.fillna(0)

r2_15 = r2_score(y_test, predictions)
rmse_15 = np.sqrt(mean_squared_error(y_test, predictions))

print(f'R^2 Score: {r2_15}')
print(f'RMSE: {rmse_15}')

#Save results for plotting
predictions_all_15 = automl.predict(all_data_london[features])

results_office_density_cleaned_perform = pd.DataFrame({
    'geometry': london_geometries,
    'observed': all_data_london[target],
    'predicted': predictions_all_15,
})


AutoML directory: ml_results/employment_density_london_poi/
The task is regression with evaluation metric rmse
AutoML will use algorithms: ['CatBoost', 'Xgboost', 'Random Forest']
AutoML will ensemble available models
AutoML steps: ['simple_algorithms', 'default_algorithms', 'not_so_random', 'hill_climbing_1', 'hill_climbing_2', 'ensemble']
Skip simple_algorithms because no parameters were generated.
* Step default_algorithms will try to check up to 3 models
1_Default_Xgboost rmse 0.008769 trained in 957.67 seconds
2_Default_CatBoost rmse 0.00864 trained in 46.56 seconds
3_Default_RandomForest rmse 0.008734 trained in 36.43 seconds
* Step not_so_random will try to check up to 12 models
4_Xgboost rmse 0.008414 trained in 36.38 seconds
8_CatBoost rmse 0.009213 trained in 976.73 seconds
12_RandomForest rmse 0.008739 trained in 40.06 seconds
5_Xgboost rmse 0.008733 trained in 33.48 seconds
9_CatBoost rmse 0.009017 trained in 51.28 seconds
13_RandomForest rmse 0.00894 trained in 952.44 seco

In [38]:
### Employment Density, NO Spatial Lag, Brimingham, POI only (16)

# Create training and testing data
features = feature_columns_bham_poi
target = 'employment_density'

# Split the dataset - 80/20 train test
X_train, X_test, y_train, y_test = train_test_split(all_data_bham[features], all_data_bham[target], test_size=0.2, random_state=3)

automl = AutoML(
    results_path ='ml_results/employment_density_bham_poi/',
    algorithms=["CatBoost", "Xgboost", "Random Forest"],
    model_time_limit=1*60,
    start_random_models=5,
    hill_climbing_steps=2,
    top_models_to_improve=3,
    features_selection=False,
    stack_models=False,
    train_ensemble=True,
    explain_level=1,
    validation_strategy={
        "validation_type": "kfold",
        "k_folds": 4,
        "shuffle": False,
        "stratify": True,
    }
)

# fit the model
automl.fit(X_train, y_train)

y_train = y_train.astype(np.float32)
y_test = y_test.astype(np.float32)

predictions = automl.predict(X_test)
predictions = predictions.astype(np.float32)

# Fix NaN values
y_test = y_test.fillna(0)

r2_16 = r2_score(y_test, predictions)
rmse_16 = np.sqrt(mean_squared_error(y_test, predictions))

print(f'R^2 Score: {r2_16}')
print(f'RMSE: {rmse_16}')

#Save results for plotting
predictions_all_16 = automl.predict(all_data_bham[features])

results_office_density_cleaned_perform = pd.DataFrame({
    'geometry': bham_geometries,
    'observed': all_data_bham[target],
    'predicted': predictions_all_16,
})


AutoML directory: ml_results/employment_density_bham_poi/
The task is regression with evaluation metric rmse
AutoML will use algorithms: ['CatBoost', 'Xgboost', 'Random Forest']
AutoML will ensemble available models
AutoML steps: ['simple_algorithms', 'default_algorithms', 'not_so_random', 'hill_climbing_1', 'hill_climbing_2', 'ensemble']
Skip simple_algorithms because no parameters were generated.
* Step default_algorithms will try to check up to 3 models
1_Default_Xgboost rmse 0.006297 trained in 5.41 seconds
2_Default_CatBoost rmse 0.006803 trained in 13.56 seconds
3_Default_RandomForest rmse 0.005603 trained in 564.01 seconds
* Step not_so_random will try to check up to 12 models
4_Xgboost rmse 0.005603 trained in 5.44 seconds
8_CatBoost rmse 0.007077 trained in 17.55 seconds
12_RandomForest rmse 0.005912 trained in 18.75 seconds
5_Xgboost rmse 0.006318 trained in 4.55 seconds
9_CatBoost rmse 0.007382 trained in 20.47 seconds
13_RandomForest rmse 0.005977 trained in 19.38 seconds
6

In [39]:
### Office Employment Density, NO Spatial Lag, London, POI only (17)


# Create training and testing data
features = feature_columns_london_poi
target = 'office_employment_density'

# Split the dataset - 80/20 train test
X_train, X_test, y_train, y_test = train_test_split(all_data_london[features], all_data_london[target], test_size=0.2, random_state=3)

automl = AutoML(
    results_path ='ml_results/office_employment_density_london_poi/',
    algorithms=["CatBoost", "Xgboost", "Random Forest"],
    model_time_limit=1*60,
    start_random_models=5,
    hill_climbing_steps=2,
    top_models_to_improve=3,
    features_selection=False,
    stack_models=False,
    train_ensemble=True,
    explain_level=1,
    validation_strategy={
        "validation_type": "kfold",
        "k_folds": 4,
        "shuffle": False,
        "stratify": True,
    }
)

# fit the model
automl.fit(X_train, y_train)

y_train = y_train.astype(np.float32)
y_test = y_test.astype(np.float32)

predictions = automl.predict(X_test)
predictions = predictions.astype(np.float32)

# Fix NaN values
y_test = y_test.fillna(0)

r2_17 = r2_score(y_test, predictions)
rmse_17 = np.sqrt(mean_squared_error(y_test, predictions))

print(f'R^2 Score: {r2_17}')
print(f'RMSE: {rmse_17}')

#Save results for plotting
predictions_all_17 = automl.predict(all_data_london[features])

results_office_density_cleaned_perform = pd.DataFrame({
    'geometry': london_geometries,
    'observed': all_data_london[target],
    'predicted': predictions_all_17,
})


AutoML directory: ml_results/office_employment_density_london_poi/
The task is regression with evaluation metric rmse
AutoML will use algorithms: ['CatBoost', 'Xgboost', 'Random Forest']
AutoML will ensemble available models
AutoML steps: ['simple_algorithms', 'default_algorithms', 'not_so_random', 'hill_climbing_1', 'hill_climbing_2', 'ensemble']
Skip simple_algorithms because no parameters were generated.
* Step default_algorithms will try to check up to 3 models
1_Default_Xgboost rmse 0.006148 trained in 30.61 seconds
2_Default_CatBoost rmse 0.006144 trained in 1963.44 seconds
3_Default_RandomForest rmse 0.006119 trained in 127.65 seconds
* Step not_so_random will try to check up to 12 models
4_Xgboost rmse 0.005762 trained in 846.96 seconds
8_CatBoost rmse 0.006526 trained in 478.61 seconds
12_RandomForest rmse 0.006101 trained in 91.97 seconds
5_Xgboost rmse 0.006205 trained in 29.41 seconds
9_CatBoost rmse 0.006437 trained in 47.23 seconds
13_RandomForest rmse 0.006152 trained in

In [43]:
### Office Employment Density, NO Spatial Lag, Birmingham, POI only (18)

# Create training and testing data
features = feature_columns_bham_poi
target = 'office_employment_density'

# Split the dataset - 80/20 train test
X_train, X_test, y_train, y_test = train_test_split(all_data_bham[features], all_data_bham[target], test_size=0.2, random_state=3)

automl = AutoML(
    results_path ='ml_results/office_employment_density_bham_poi/',
    algorithms=["CatBoost", "Xgboost", "Random Forest"],
    model_time_limit=1*60,
    start_random_models=5,
    hill_climbing_steps=2,
    top_models_to_improve=3,
    features_selection=False,
    stack_models=False,
    train_ensemble=True,
    explain_level=1,
    validation_strategy={
        "validation_type": "kfold",
        "k_folds": 4,
        "shuffle": False,
        "stratify": True,
    }
)

# fit the model
automl.fit(X_train, y_train)

y_train = y_train.astype(np.float32)
y_test = y_test.astype(np.float32)

predictions = automl.predict(X_test)
predictions = predictions.astype(np.float32)

# Fix NaN values
y_test = y_test.fillna(0)

r2_18 = r2_score(y_test, predictions)
rmse_18 = np.sqrt(mean_squared_error(y_test, predictions))

print(f'R^2 Score: {r2_18}')
print(f'RMSE: {rmse_18}')

#Save results for plotting
predictions_all_18 = automl.predict(all_data_bham[features])

results_office_density_cleaned_perform = pd.DataFrame({
    'geometry': bham_geometries,
    'observed': all_data_bham[target],
    'predicted': predictions_all_18,
})


This model has already been fitted. You can use predict methods or select a new 'results_path' for a new 'fit()'.
R^2 Score: 0.4667612316312225
RMSE: 0.001374521292746067


In [84]:
### Employment (log), NO Spatial Lag, London,  POI EXCLUSIVE (19)

# Ignore depracation warnings
warnings.filterwarnings('ignore')

# Create training and testing data
features = feature_columns_london_poi = [column for column in feature_columns_london_poi if column != 'log_num_places']
target = 'log_total_employment'

# Split the dataset - 80/20 train test
X_train, X_test, y_train, y_test = train_test_split(all_data_london[features], all_data_london[target], test_size=0.2, random_state=3)

automl = AutoML(
    results_path ='ml_results/poi_exclusive_london/',
    algorithms=["CatBoost", "Xgboost", "Random Forest"],
    model_time_limit=1*60,
    start_random_models=5,
    hill_climbing_steps=2,
    top_models_to_improve=3,
    features_selection=False,
    stack_models=False,
    train_ensemble=True,
    explain_level=1,
    validation_strategy={
        "validation_type": "kfold",
        "k_folds": 4,
        "shuffle": False,
        "stratify": True,
    }
)

# fit the model
automl.fit(X_train, y_train)

y_train = y_train.astype(np.float32)
y_test = y_test.astype(np.float32)

predictions = automl.predict(X_test)
predictions = predictions.astype(np.float32)

# Fix NaN values
y_test = y_test.fillna(0)

r2_19 = r2_score(y_test, predictions)
rmse_19 = np.sqrt(mean_squared_error(y_test, predictions))

print(f'R^2 Score: {r2_19}')
print(f'RMSE: {rmse_19}')

#Save results for plotting
predictions_all_19 = automl.predict(all_data_london[features])

results_london_poi = pd.DataFrame({
    'name': london_names,
    'geometry': london_geometries,
    'observed': all_data_london[target],
    'predicted': predictions_all_19,
})

# Save to project folder
results_london_poi.to_csv("data/combined_data/model_results_london_poi.csv")

This model has already been fitted. You can use predict methods or select a new 'results_path' for a new 'fit()'.
R^2 Score: 0.7102393476371849
RMSE: 0.6910015940666199


In [86]:
### Employment (log), NO Spatial Lag, Birmingham, POI EXCLUSIVE (20)

# Ignore depracation warnings
warnings.filterwarnings('ignore')

# Create training and testing data
features = feature_columns_bham_poi = [column for column in feature_columns_bham_poi if column != 'log_num_places']
target = 'log_total_employment'

# Split the dataset - 80/20 train test
X_train, X_test, y_train, y_test = train_test_split(all_data_bham[features], all_data_bham[target], test_size=0.2, random_state=3)

automl = AutoML(
    results_path ='ml_results/poi_exclusive_bham/',
    algorithms=["CatBoost", "Xgboost", "Random Forest"],
    model_time_limit=1*60,
    start_random_models=5,
    hill_climbing_steps=2,
    top_models_to_improve=3,
    features_selection=False,
    stack_models=False,
    train_ensemble=True,
    explain_level=1,
    validation_strategy={
        "validation_type": "kfold",
        "k_folds": 4,
        "shuffle": False,
        "stratify": True,
    }
)

# fit the model
automl.fit(X_train, y_train)

y_train = y_train.astype(np.float32)
y_test = y_test.astype(np.float32)

predictions = automl.predict(X_test)
predictions = predictions.astype(np.float32)

# Fix NaN values
y_test = y_test.fillna(0)

r2_20 = r2_score(y_test, predictions)
rmse_20 = np.sqrt(mean_squared_error(y_test, predictions))

print(f'R^2 Score: {r2_20}')
print(f'RMSE: {rmse_20}')

#Save results for plotting
predictions_all_20 = automl.predict(all_data_bham[features])

results_bham_poi = pd.DataFrame({
    'name': bham_names,
    'geometry': bham_geometries,
    'observed': all_data_bham[target],
    'predicted': predictions_all_20,
})

# Save to project folder
results_bham_poi.to_csv("data/combined_data/model_results_bham_poi.csv")

This model has already been fitted. You can use predict methods or select a new 'results_path' for a new 'fit()'.
R^2 Score: 0.6226217466175756
RMSE: 0.7312378883361816


In [92]:
### Employment Density, NO Spatial Lag, London,  POI EXCLUSIVE (21)

# Ignore depracation warnings
warnings.filterwarnings('ignore')

# Create training and testing data
features = feature_columns_london_poi = [column for column in feature_columns_london_poi if column != 'log_num_places']
target = 'employment_density'

# Split the dataset - 80/20 train test
X_train, X_test, y_train, y_test = train_test_split(all_data_london[features], all_data_london[target], test_size=0.2, random_state=3)

automl = AutoML(
    results_path ='ml_results/poi_exclusive_london_density/',
    algorithms=["CatBoost", "Xgboost", "Random Forest"],
    model_time_limit=1*60,
    start_random_models=5,
    hill_climbing_steps=2,
    top_models_to_improve=3,
    features_selection=False,
    stack_models=False,
    train_ensemble=True,
    explain_level=1,
    validation_strategy={
        "validation_type": "kfold",
        "k_folds": 4,
        "shuffle": False,
        "stratify": True,
    }
)

# fit the model
automl.fit(X_train, y_train)

y_train = y_train.astype(np.float32)
y_test = y_test.astype(np.float32)

predictions = automl.predict(X_test)
predictions = predictions.astype(np.float32)

# Fix NaN values
y_test = y_test.fillna(0)

r2_21 = r2_score(y_test, predictions)
rmse_21 = np.sqrt(mean_squared_error(y_test, predictions))

print(f'R^2 Score: {r2_21}')
print(f'RMSE: {rmse_21}')

#Save results for plotting
predictions_all_21 = automl.predict(all_data_london[features])

results_london_poi_density = pd.DataFrame({
    'name': london_names,
    'geometry': london_geometries,
    'observed': all_data_london[target],
    'predicted': predictions_all_21,
})

# Save to project folder
results_london_poi_density.to_csv("data/combined_data/model_results_london_poi_density.csv")

This model has already been fitted. You can use predict methods or select a new 'results_path' for a new 'fit()'.
R^2 Score: 0.6473549565866394
RMSE: 0.009896671399474144


In [93]:
### Employment density, NO Spatial Lag, Birmingham, POI EXCLUSIVE (22)

# Ignore depracation warnings
warnings.filterwarnings('ignore')

# Create training and testing data
features = feature_columns_bham_poi = [column for column in feature_columns_bham_poi if column != 'log_num_places']
target = 'employment_density'

# Split the dataset - 80/20 train test
X_train, X_test, y_train, y_test = train_test_split(all_data_bham[features], all_data_bham[target], test_size=0.2, random_state=3)

automl = AutoML(
    results_path ='ml_results/poi_exclusive_bham_density/',
    algorithms=["CatBoost", "Xgboost", "Random Forest"],
    model_time_limit=1*60,
    start_random_models=5,
    hill_climbing_steps=2,
    top_models_to_improve=3,
    features_selection=False,
    stack_models=False,
    train_ensemble=True,
    explain_level=1,
    validation_strategy={
        "validation_type": "kfold",
        "k_folds": 4,
        "shuffle": False,
        "stratify": True,
    }
)

# fit the model
automl.fit(X_train, y_train)

y_train = y_train.astype(np.float32)
y_test = y_test.astype(np.float32)

predictions = automl.predict(X_test)
predictions = predictions.astype(np.float32)

# Fix NaN values
y_test = y_test.fillna(0)

r2_22 = r2_score(y_test, predictions)
rmse_22 = np.sqrt(mean_squared_error(y_test, predictions))

print(f'R^2 Score: {r2_22}')
print(f'RMSE: {rmse_22}')

#Save results for plotting
predictions_all_22 = automl.predict(all_data_bham[features])

results_bham_poi_density = pd.DataFrame({
    'name': bham_names,
    'geometry': bham_geometries,
    'observed': all_data_bham[target],
    'predicted': predictions_all_22,
})

# Save to project folder
results_bham_poi_density.to_csv("data/combined_data/model_results_bham_poi_density.csv")

This model has already been fitted. You can use predict methods or select a new 'results_path' for a new 'fit()'.
R^2 Score: 0.7687657039007642
RMSE: 0.001383011694997549
