# Modelling

This is the script where I store all my ML model runs

## Library Imports

In [1]:
# Library Imports

#Basics
import pandas as pd
import geopandas as gpd
import numpy as np
import csv

#Shapely / Spatial
from shapely import wkt
import shapely.geometry
from shapely.geometry import Polygon, MultiPolygon

#Machine Learning
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.preprocessing import OneHotEncoder

#ML from mljar-supervised
from supervised.automl import AutoML

#Warning Supression
import warnings

### Import Datasets

In [3]:
# Read London CSV
all_data_london = pd.read_csv("data/combined_data/lag/all_data_london_lag.csv")

# Read in feature column set
with open("data/combined_data/total_feature_columns_london.csv", 'r') as file:
    lines = file.readlines()

# Reconstruct
feature_columns_london_lag = [''.join(line.strip().split(',')) for line in lines]

# Create non-laged column set
feature_columns_london = [col for col in feature_columns_london_lag if not col.startswith('lag_')]

# ---

# Read Birmingham CSV
all_data_bham = pd.read_csv("data/combined_data/lag/all_data_bham_lag.csv")

# Read in feature column set
with open("data/combined_data/total_feature_columns_bham.csv", 'r') as file:
    lines = file.readlines()

# Reconstruct
feature_columns_bham_lag = [''.join(line.strip().split(',')) for line in lines]

# Create non-lagged column set
feature_columns_bham = [col for col in feature_columns_bham_lag if not col.startswith('lag_')]

# Fix null values ending up in logged variables
all_data_london.fillna(0)
all_data_bham.fillna(0)

Unnamed: 0.1,Unnamed: 0,LSOA11CD,LSOA11NM_x,LSOA11NM_y,Unnamed: 2,population,Area,"01 : Crop and animal production, hunting and related service activities",02 : Forestry and logging,03 : Fishing and aquaculture,...,lag_travel,lag_travel_agents,lag_trusts,lag_university_housing,lag_used_vintage_and_consignment,lag_veterinarian,lag_videographer,lag_vitamins_and_supplements,lag_warehouses,lag_window_washing
0,0,E01008881,Birmingham 067A,Birmingham 067A,0.0,1599,lsoa2011:E01008881 : Birmingham 067A,0.0,0.0,0.0,...,0.000000,0.166667,0.0,0.0,0.0,0.0,0.0,0.000000,0.333333,0.0
1,1,E01008882,Birmingham 066A,Birmingham 066A,0.0,1747,lsoa2011:E01008882 : Birmingham 066A,0.0,0.0,0.0,...,0.166667,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.500000,0.0
2,2,E01008883,Birmingham 078A,Birmingham 078A,0.0,1816,lsoa2011:E01008883 : Birmingham 078A,0.0,0.0,0.0,...,0.166667,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0
3,3,E01008884,Birmingham 078B,Birmingham 078B,0.0,1870,lsoa2011:E01008884 : Birmingham 078B,0.0,0.0,0.0,...,0.333333,0.000000,0.0,0.0,0.0,0.0,0.0,0.166667,0.500000,0.0
4,4,E01008885,Birmingham 076A,Birmingham 076A,0.0,1308,lsoa2011:E01008885 : Birmingham 076A,0.0,0.0,0.0,...,0.000000,0.166667,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
634,634,E01033646,Birmingham 031I,Birmingham 031I,0.0,1624,lsoa2011:E01033646 : Birmingham 031I,0.0,0.0,0.0,...,0.166667,0.000000,0.0,0.0,0.0,0.0,0.0,0.166667,0.000000,0.0
635,635,E01033647,Birmingham 058E,Birmingham 058E,0.0,1398,lsoa2011:E01033647 : Birmingham 058E,0.0,0.0,0.0,...,0.166667,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0
636,636,E01033648,Birmingham 084F,Birmingham 084F,0.0,2715,lsoa2011:E01033648 : Birmingham 084F,0.0,0.0,0.0,...,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0
637,637,E01033649,Birmingham 058F,Birmingham 058F,0.0,1801,lsoa2011:E01033649 : Birmingham 058F,0.0,0.0,0.0,...,0.333333,0.000000,0.0,0.0,0.0,0.0,0.0,0.166667,0.000000,0.0


In [8]:
# Download POI & auxillary-only feature spaces (No building footprint information)

# London
# Read in POI feature column set
with open("data/combined_data/feature_columns_london_poi.csv", 'r') as file:
    lines = file.readlines()

# Reconstruct
feature_columns_london_poi = [''.join(line.strip().split(',')) for line in lines]

# Birmingham
# Read in POI feature column set
with open("data/combined_data/feature_columns_bham_poi.csv", 'r') as file:
    lines = file.readlines()

# Reconstruct
feature_columns_bham_poi = [''.join(line.strip().split(',')) for line in lines]

In [10]:
# Get log of density columns

epsilon = 1e-10

all_data_london['log_employment_density'] = np.log(all_data_london['employment_density'].replace(0, epsilon))
all_data_london['log_office_employment_density'] = np.log(all_data_london['office_employment_density'].replace(0, epsilon))

all_data_bham['log_employment_density'] = np.log(all_data_bham['employment_density'].replace(0, epsilon))
all_data_bham['log_office_employment_density'] = np.log(all_data_bham['office_employment_density'].replace(0, epsilon)) 


In [20]:
# Save Geometries and Names

london_geometries = all_data_london['geometry']
bham_geometries = all_data_bham['geometry']

london_names = all_data_london['LSOA11CD']
bham_names = all_data_bham['LSOA11CD']

### Test Model

In [8]:
# Ignore depracation warnings
warnings.filterwarnings('ignore')

# Create training and testing data
features = feature_columns_london_lag
target = 'log_total_employment'

# Split the dataset - 80/20 train test
X_train, X_test, y_train, y_test = train_test_split(all_data_london[features], all_data_london[target], test_size=0.2, random_state=3)

automl = AutoML(results_path="ml_results/dummy_models/test", mode='Explain')

y_train = y_train.astype(np.float32)
y_test = y_test.astype(np.float32)

# fit the model
automl.fit(X_train, y_train)

predictions = automl.predict(X_test)
predictions = predictions.astype(np.float32)

# Fix NaN values
y_test = y_test.fillna(0)

r2_log_employment = r2_score(y_test, predictions)
rmse_log_employment = np.sqrt(mean_squared_error(y_test, predictions))

print(f'R^2 Score: {r2_log_employment}')
print(f'RMSE: {rmse_log_employment}')

#Save results for plotting
predictions_all = automl.predict(all_data_london[features])

results_test = pd.DataFrame({
    'geometry': london_geometries,
    'observed': all_data_london[target],
    'predicted': predictions_all,
})

This model has already been fitted. You can use predict methods or select a new 'results_path' for a new 'fit()'.
R^2 Score: 0.7207009026018532
RMSE: 0.678412914276123


## Models

In [39]:
### Employment (log), NO Spatial Lag, London (1)

# Create training and testing data
features = feature_columns_london
target = 'log_total_employment'

# Split the dataset - 80/20 train test
X_train, X_test, y_train, y_test = train_test_split(all_data_london[features], all_data_london[target], test_size=0.2, random_state=3)

automl = AutoML(
    results_path ='ml_results/log_employment_london/',
    algorithms=["CatBoost", "Xgboost", "Random Forest"],
    model_time_limit=1*60,
    start_random_models=5,
    hill_climbing_steps=2,
    top_models_to_improve=3,
    features_selection=False,
    stack_models=False,
    train_ensemble=True,
    explain_level=1,
    validation_strategy={
        "validation_type": "kfold",
        "k_folds": 4,
        "shuffle": False,
        "stratify": True,
    }
)

# fit the model
automl.fit(X_train, y_train)

y_train = y_train.astype(np.float32)
y_test = y_test.astype(np.float32)

predictions = automl.predict(X_test)
predictions = predictions.astype(np.float32)

# Fix NaN values
y_test = y_test.fillna(0)

r2_1 = r2_score(y_test, predictions)
rmse_1 = np.sqrt(mean_squared_error(y_test, predictions))

print(f'R^2 Score: {r2_1}')
print(f'RMSE: {rmse_1}')

#Save results for plotting
predictions_all_1 = automl.predict(all_data_london[features])
london_geometries = all_data_london.loc[all_data_london[target].index, 'geometry']

results_office_density_cleaned_perform = pd.DataFrame({
    'geometry': london_geometries,
    'observed': all_data_london[target],
    'predicted': predictions_all_1,
})


AutoML directory: ml_results/log_employment_london/
The task is regression with evaluation metric rmse
AutoML will use algorithms: ['CatBoost', 'Xgboost', 'Random Forest']
AutoML will ensemble available models
AutoML steps: ['simple_algorithms', 'default_algorithms', 'not_so_random', 'hill_climbing_1', 'hill_climbing_2', 'ensemble']
Skip simple_algorithms because no parameters were generated.
* Step default_algorithms will try to check up to 3 models
1_Default_Xgboost rmse 0.638237 trained in 44.72 seconds
2_Default_CatBoost rmse 0.624872 trained in 39.45 seconds
3_Default_RandomForest rmse 0.692124 trained in 1092.47 seconds
* Step not_so_random will try to check up to 12 models
4_Xgboost rmse 0.647595 trained in 31.74 seconds
8_CatBoost rmse 0.62888 trained in 57.43 seconds
12_RandomForest rmse 0.689969 trained in 42.49 seconds
5_Xgboost rmse 0.646129 trained in 955.1 seconds
9_CatBoost rmse 0.640378 trained in 954.04 seconds
13_RandomForest rmse 0.713245 trained in 57.77 seconds
6_X

In [40]:
### Employment (log), NO Spatial Lag, Birmingham (2)

# Create training and testing data
features = feature_columns_bham
target = 'log_total_employment'

# Split the dataset - 80/20 train test
X_train, X_test, y_train, y_test = train_test_split(all_data_bham[features], all_data_bham[target], test_size=0.2, random_state=3)

automl = AutoML(
    results_path ='ml_results/log_employment_bham/',
    algorithms=["CatBoost", "Xgboost", "Random Forest"],
    model_time_limit=1*60,
    start_random_models=5,
    hill_climbing_steps=2,
    top_models_to_improve=3,
    features_selection=False,
    stack_models=False,
    train_ensemble=True,
    explain_level=1,
    validation_strategy={
        "validation_type": "kfold",
        "k_folds": 4,
        "shuffle": False,
        "stratify": True,
    }
)

# fit the model
automl.fit(X_train, y_train)

y_train = y_train.astype(np.float32)
y_test = y_test.astype(np.float32)

predictions = automl.predict(X_test)
predictions = predictions.astype(np.float32)

# Fix NaN values
y_test = y_test.fillna(0)

r2_2 = r2_score(y_test, predictions)
rmse_2 = np.sqrt(mean_squared_error(y_test, predictions))

print(f'R^2 Score: {r2_2}')
print(f'RMSE: {rmse_2}')

#Save results for plotting
predictions_all_2 = automl.predict(all_data_bham[features])

results_office_density_cleaned_perform = pd.DataFrame({
    'geometry': bham_geometries,
    'observed': all_data_bham[target],
    'predicted': predictions_all_2,
})


AutoML directory: ml_results/log_employment_bham/
The task is regression with evaluation metric rmse
AutoML will use algorithms: ['CatBoost', 'Xgboost', 'Random Forest']
AutoML will ensemble available models
AutoML steps: ['simple_algorithms', 'default_algorithms', 'not_so_random', 'hill_climbing_1', 'hill_climbing_2', 'ensemble']
Skip simple_algorithms because no parameters were generated.
* Step default_algorithms will try to check up to 3 models
1_Default_Xgboost rmse 0.803686 trained in 5.65 seconds
2_Default_CatBoost rmse 0.783334 trained in 18.54 seconds
3_Default_RandomForest rmse 0.821087 trained in 24.78 seconds
* Step not_so_random will try to check up to 12 models
4_Xgboost rmse 0.807126 trained in 5.22 seconds
8_CatBoost rmse 0.810248 trained in 22.31 seconds
12_RandomForest rmse 0.816138 trained in 57.95 seconds
5_Xgboost rmse 0.808929 trained in 5.8 seconds
9_CatBoost rmse 0.830576 trained in 20.01 seconds
13_RandomForest rmse 0.848962 trained in 56.66 seconds
6_Xgboost r

In [41]:
### Employment (log), Spatial Lag, London (3)

# Create training and testing data
features = feature_columns_london_lag
target = 'log_total_employment'

# Split the dataset - 80/20 train test
X_train, X_test, y_train, y_test = train_test_split(all_data_london[features], all_data_london[target], test_size=0.2, random_state=3)

automl = AutoML(
    results_path ='ml_results/log_employment_london_lag/',
    algorithms=["CatBoost", "Xgboost", "Random Forest"],
    model_time_limit=1*60,
    start_random_models=5,
    hill_climbing_steps=2,
    top_models_to_improve=3,
    features_selection=False,
    stack_models=False,
    train_ensemble=True,
    explain_level=1,
    validation_strategy={
        "validation_type": "kfold",
        "k_folds": 4,
        "shuffle": False,
        "stratify": True,
    }
)

# fit the model
automl.fit(X_train, y_train)

y_train = y_train.astype(np.float32)
y_test = y_test.astype(np.float32)

predictions = automl.predict(X_test)
predictions = predictions.astype(np.float32)

# Fix NaN values
y_test = y_test.fillna(0)

r2_3 = r2_score(y_test, predictions)
rmse_3 = np.sqrt(mean_squared_error(y_test, predictions))

print(f'R^2 Score: {r2_3}')
print(f'RMSE: {rmse_3}')

#Save results for plotting
predictions_all_3 = automl.predict(all_data_london[features])

results_office_density_cleaned_perform = pd.DataFrame({
    'geometry': london_geometries,
    'observed': all_data_london[target],
    'predicted': predictions_all_3,
})


AutoML directory: ml_results/log_employment_london_lag/
The task is regression with evaluation metric rmse
AutoML will use algorithms: ['CatBoost', 'Xgboost', 'Random Forest']
AutoML will ensemble available models
AutoML steps: ['simple_algorithms', 'default_algorithms', 'not_so_random', 'hill_climbing_1', 'hill_climbing_2', 'ensemble']
Skip simple_algorithms because no parameters were generated.
* Step default_algorithms will try to check up to 3 models
1_Default_Xgboost rmse 0.638593 trained in 1086.36 seconds
2_Default_CatBoost rmse 0.63385 trained in 2029.01 seconds
3_Default_RandomForest rmse 0.693922 trained in 2068.46 seconds
Skip not_so_random because of the time limit.
Skip hill_climbing_1 because of the time limit.
Skip hill_climbing_2 because of the time limit.
* Step ensemble will try to check up to 1 model
Ensemble rmse 0.627695 trained in 0.05 seconds
AutoML fit time: 5186.08 seconds
AutoML best model: Ensemble
R^2 Score: 0.7547717661754779
RMSE: 0.635688841342926


In [22]:
### Employment (log), Spatial Lag, Birmingham (4)

# Create training and testing data
features = feature_columns_bham_lag
target = 'log_total_employment'

# Split the dataset - 80/20 train test
X_train, X_test, y_train, y_test = train_test_split(all_data_bham[features], all_data_bham[target], test_size=0.2, random_state=3)

automl = AutoML(
    results_path ='ml_results/log_employment_bham_lag/',
    algorithms=["CatBoost", "Xgboost", "Random Forest"],
    model_time_limit=1*60,
    start_random_models=5,
    hill_climbing_steps=2,
    top_models_to_improve=3,
    features_selection=False,
    stack_models=False,
    train_ensemble=True,
    explain_level=1,
    validation_strategy={
        "validation_type": "kfold",
        "k_folds": 4,
        "shuffle": False,
        "stratify": True,
    }
)

# fit the model
automl.fit(X_train, y_train)

y_train = y_train.astype(np.float32)
y_test = y_test.astype(np.float32)

predictions = automl.predict(X_test)
predictions = predictions.astype(np.float32)

# Fix NaN values
y_test = y_test.fillna(0)

r2_4 = r2_score(y_test, predictions)
rmse_4 = np.sqrt(mean_squared_error(y_test, predictions))

print(f'R^2 Score: {r2_4}')
print(f'RMSE: {rmse_4}')

#Save results for plotting
predictions_all_4 = automl.predict(all_data_bham[features])

results_office_density_cleaned_perform = pd.DataFrame({
    'geometry': bham_geometries,
    'observed': all_data_bham[target],
    'predicted': predictions_all_4,
})




This model has already been fitted. You can use predict methods or select a new 'results_path' for a new 'fit()'.
R^2 Score: 0.6180309508339799
RMSE: 0.7356722354888916


In [134]:
### Employment Density, NO Spatial Lag, London (5)

# Create training and testing data
features = feature_columns_london
target = 'log_employment_density'

# Split the dataset - 80/20 train test
X_train, X_test, y_train, y_test = train_test_split(all_data_london[features], all_data_london[target], test_size=0.2, random_state=3)

automl = AutoML(
    results_path ='ml_results/employment_density_london/',
    algorithms=["CatBoost", "Xgboost", "Random Forest"],
    model_time_limit=1*60,
    start_random_models=5,
    hill_climbing_steps=2,
    top_models_to_improve=3,
    features_selection=False,
    stack_models=False,
    train_ensemble=True,
    explain_level=1,
    validation_strategy={
        "validation_type": "kfold",
        "k_folds": 4,
        "shuffle": False,
        "stratify": True,
    }
)

# fit the model
automl.fit(X_train, y_train)

y_train = y_train.astype(np.float32)
y_test = y_test.astype(np.float32)

predictions = automl.predict(X_test)
predictions = predictions.astype(np.float32)

# Fix NaN values
y_test = y_test.fillna(0)

r2_5 = r2_score(y_test, predictions)
rmse_5 = np.sqrt(mean_squared_error(y_test, predictions))

print(f'R^2 Score: {r2_5}')
print(f'RMSE: {rmse_5}')

#Save results for plotting
predictions_all_5 = automl.predict(all_data_london[features])

results_office_density_cleaned_perform = pd.DataFrame({
    'geometry': london_geometries,
    'observed': all_data_london[target],
    'predicted': predictions_all_5,
})


AutoML directory: ml_results/employment_density_london/
The task is regression with evaluation metric rmse
AutoML will use algorithms: ['CatBoost', 'Xgboost', 'Random Forest']
AutoML will ensemble available models
AutoML steps: ['simple_algorithms', 'default_algorithms', 'not_so_random', 'hill_climbing_1', 'hill_climbing_2', 'ensemble']
Skip simple_algorithms because no parameters were generated.
* Step default_algorithms will try to check up to 3 models
1_Default_Xgboost rmse 0.711278 trained in 52.53 seconds
2_Default_CatBoost rmse 0.679557 trained in 62.23 seconds
3_Default_RandomForest rmse 0.803243 trained in 43.05 seconds
* Step not_so_random will try to check up to 12 models
4_Xgboost rmse 0.706569 trained in 43.27 seconds
8_CatBoost rmse 0.68361 trained in 76.41 seconds
12_RandomForest rmse 0.807704 trained in 69.05 seconds
5_Xgboost rmse 0.728611 trained in 46.56 seconds
9_CatBoost rmse 0.705359 trained in 70.99 seconds
13_RandomForest rmse 0.83544 trained in 61.51 seconds
6_X

In [16]:
### Employment Density, NO Spatial Lag, Birmingham (6)

# Create training and testing data
features = feature_columns_bham
target = 'log_employment_density'

# Split the dataset - 80/20 train test
X_train, X_test, y_train, y_test = train_test_split(all_data_bham[features], all_data_bham[target], test_size=0.2, random_state=3)

automl = AutoML(
    results_path ='ml_results/employment_density_bham/',
    algorithms=["CatBoost", "Xgboost", "Random Forest"],
    model_time_limit=1*60,
    start_random_models=5,
    hill_climbing_steps=2,
    top_models_to_improve=3,
    features_selection=False,
    stack_models=False,
    train_ensemble=True,
    explain_level=1,
    validation_strategy={
        "validation_type": "kfold",
        "k_folds": 4,
        "shuffle": False,
        "stratify": True,
    }
)

# fit the model
automl.fit(X_train, y_train)

y_train = y_train.astype(np.float32)
y_test = y_test.astype(np.float32)

predictions = automl.predict(X_test)
predictions = predictions.astype(np.float32)

# Fix NaN values
y_test = y_test.fillna(0)

r2_6 = r2_score(y_test, predictions)
rmse_6 = np.sqrt(mean_squared_error(y_test, predictions))

print(f'R^2 Score: {r2_6}')
print(f'RMSE: {rmse_6}')

#Save results for plotting
predictions_all_6 = automl.predict(all_data_bham[features])

results_office_density_cleaned_perform = pd.DataFrame({
    'geometry': bham_geometries,
    'observed': all_data_bham[target],
    'predicted': predictions_all_6,
})


This model has already been fitted. You can use predict methods or select a new 'results_path' for a new 'fit()'.
R^2 Score: 0.5111932494414406
RMSE: 0.7408556938171387


In [None]:
### Employment Density, Spatial Lag, London (7)

# Create training and testing data
features = feature_columns_london_lag
target = 'log_employment_density'

# Split the dataset - 80/20 train test
X_train, X_test, y_train, y_test = train_test_split(all_data_london[features], all_data_london[target], test_size=0.2, random_state=3)

automl = AutoML(
    results_path ='ml_results/employment_density_london_lag/',
    algorithms=["CatBoost", "Xgboost", "Random Forest"],
    model_time_limit=1*60,
    start_random_models=5,
    hill_climbing_steps=2,
    top_models_to_improve=3,
    features_selection=False,
    stack_models=False,
    train_ensemble=True,
    explain_level=1,
    validation_strategy={
        "validation_type": "kfold",
        "k_folds": 4,
        "shuffle": False,
        "stratify": True,
    }
)

# fit the model
automl.fit(X_train, y_train)

y_train = y_train.astype(np.float32)
y_test = y_test.astype(np.float32)

predictions = automl.predict(X_test)
predictions = predictions.astype(np.float32)

# Fix NaN values
y_test = y_test.fillna(0)

r2_7 = r2_score(y_test, predictions)
rmse_7 = np.sqrt(mean_squared_error(y_test, predictions))

print(f'R^2 Score: {r2_7}')
print(f'RMSE: {rmse_7}')

#Save results for plotting
predictions_all_7 = automl.predict(all_data_london[features])

results_office_density_cleaned_perform = pd.DataFrame({
    'geometry': london_geometries,
    'observed': all_data_london[target],
    'predicted': predictions_all_7,
})


AutoML directory: ml_results/employment_density_london_lag/
The task is regression with evaluation metric rmse
AutoML will use algorithms: ['CatBoost', 'Xgboost', 'Random Forest']
AutoML will ensemble available models
AutoML steps: ['simple_algorithms', 'default_algorithms', 'not_so_random', 'hill_climbing_1', 'hill_climbing_2', 'ensemble']
Skip simple_algorithms because no parameters were generated.
* Step default_algorithms will try to check up to 3 models
1_Default_Xgboost rmse 0.697738 trained in 232.55 seconds
2_Default_CatBoost rmse 0.690238 trained in 300.13 seconds
3_Default_RandomForest rmse 0.798875 trained in 162.55 seconds
* Step not_so_random will try to check up to 12 models
4_Xgboost rmse 0.709979 trained in 268.49 seconds
8_CatBoost rmse 0.688124 trained in 397.58 seconds
12_RandomForest rmse 0.800764 trained in 184.92 seconds
5_Xgboost rmse 0.71357 trained in 252.43 seconds
9_CatBoost rmse 0.718557 trained in 361.72 seconds
13_RandomForest rmse 0.831279 trained in 275.

In [None]:
### Employment Density, Spatial Lag, Birmingham (8)

# Create training and testing data
features = feature_columns_bham_lag
target = 'log_employment_density'

# Split the dataset - 80/20 train test
X_train, X_test, y_train, y_test = train_test_split(all_data_bham[features], all_data_bham[target], test_size=0.2, random_state=3)

automl = AutoML(
    results_path ='ml_results/employment_density_bham_lag/',
    algorithms=["CatBoost", "Xgboost", "Random Forest"],
    model_time_limit=1*60,
    start_random_models=5,
    hill_climbing_steps=2,
    top_models_to_improve=3,
    features_selection=False,
    stack_models=False,
    train_ensemble=True,
    explain_level=1,
    validation_strategy={
        "validation_type": "kfold",
        "k_folds": 4,
        "shuffle": False,
        "stratify": True,
    }
)

# fit the model
automl.fit(X_train, y_train)

y_train = y_train.astype(np.float32)
y_test = y_test.astype(np.float32)

predictions = automl.predict(X_test)
predictions = predictions.astype(np.float32)

# Fix NaN values
y_test = y_test.fillna(0)

r2_8 = r2_score(y_test, predictions)
rmse_8 = np.sqrt(mean_squared_error(y_test, predictions))

print(f'R^2 Score: {r2_8}')
print(f'RMSE: {rmse_8}')

#Save results for plotting
predictions_all_8 = automl.predict(all_data_bham[features])

results_office_density_cleaned_perform = pd.DataFrame({
    'geometry': bham_geometries,
    'observed': all_data_bham[target],
    'predicted': predictions_all_8,
})


In [None]:
### Office Employment Density , NO Spatial Lag, London (9)

# Create training and testing data
features = feature_columns_london
target = 'log_office_employment_density'

# Split the dataset - 80/20 train test
X_train, X_test, y_train, y_test = train_test_split(all_data_london[features], all_data_london[target], test_size=0.2, random_state=3)

automl = AutoML(
    results_path ='ml_results/office_employment_density_london/',
    algorithms=["CatBoost", "Xgboost", "Random Forest"],
    model_time_limit=1*60,
    start_random_models=5,
    hill_climbing_steps=2,
    top_models_to_improve=3,
    features_selection=False,
    stack_models=False,
    train_ensemble=True,
    explain_level=1,
    validation_strategy={
        "validation_type": "kfold",
        "k_folds": 4,
        "shuffle": False,
        "stratify": True,
    }
)

# fit the model
automl.fit(X_train, y_train)

y_train = y_train.astype(np.float32)
y_test = y_test.astype(np.float32)

predictions = automl.predict(X_test)
predictions = predictions.astype(np.float32)

# Fix NaN values
y_test = y_test.fillna(0)

r2_9 = r2_score(y_test, predictions)
rmse_9 = np.sqrt(mean_squared_error(y_test, predictions))

print(f'R^2 Score: {r2_9}')
print(f'RMSE: {rmse_9}')

#Save results for plotting
predictions_all_9 = automl.predict(all_data_london[features])

results_office_density_cleaned_perform = pd.DataFrame({
    'geometry': london_geometries,
    'observed': all_data_london[target],
    'predicted': predictions_all_9,
})


In [None]:
### Office Employment Density , NO Spatial Lag, Birmingham (10)

# Create training and testing data
features = feature_columns_bham
target = 'log_office_employment_density'

# Split the dataset - 80/20 train test
X_train, X_test, y_train, y_test = train_test_split(all_data_bham[features], all_data_bham[target], test_size=0.2, random_state=3)

automl = AutoML(
    results_path ='ml_results/office_employment_density_bham/',
    algorithms=["CatBoost", "Xgboost", "Random Forest"],
    model_time_limit=1*60,
    start_random_models=5,
    hill_climbing_steps=2,
    top_models_to_improve=3,
    features_selection=False,
    stack_models=False,
    train_ensemble=True,
    explain_level=1,
    validation_strategy={
        "validation_type": "kfold",
        "k_folds": 4,
        "shuffle": False,
        "stratify": True,
    }
)

# fit the model
automl.fit(X_train, y_train)

y_train = y_train.astype(np.float32)
y_test = y_test.astype(np.float32)

predictions = automl.predict(X_test)
predictions = predictions.astype(np.float32)

# Fix NaN values
y_test = y_test.fillna(0)

r2_10 = r2_score(y_test, predictions)
rmse_10 = np.sqrt(mean_squared_error(y_test, predictions))

print(f'R^2 Score: {r2_10}')
print(f'RMSE: {rmse_10}')

#Save results for plotting
predictions_all_10 = automl.predict(all_data_bham[features])

results_office_density_cleaned_perform = pd.DataFrame({
    'geometry': bham_geometries,
    'observed': all_data_bham[target],
    'predicted': predictions_all_10,
})


In [None]:
### Office Employment Density, Spatial Lag, London (11)

# Create training and testing data
features = feature_columns_london_lag
target = 'log_office_employment_density'

# Split the dataset - 80/20 train test
X_train, X_test, y_train, y_test = train_test_split(all_data_london[features], all_data_london[target], test_size=0.2, random_state=3)

automl = AutoML(
    results_path ='ml_results/office_employment_density_london_lag/',
    algorithms=["CatBoost", "Xgboost", "Random Forest"],
    model_time_limit=1*60,
    start_random_models=5,
    hill_climbing_steps=2,
    top_models_to_improve=3,
    features_selection=False,
    stack_models=False,
    train_ensemble=True,
    explain_level=1,
    validation_strategy={
        "validation_type": "kfold",
        "k_folds": 4,
        "shuffle": False,
        "stratify": True,
    }
)

# fit the model
automl.fit(X_train, y_train)

y_train = y_train.astype(np.float32)
y_test = y_test.astype(np.float32)

predictions = automl.predict(X_test)
predictions = predictions.astype(np.float32)

# Fix NaN values
y_test = y_test.fillna(0)

r2_11 = r2_score(y_test, predictions)
rmse_11 = np.sqrt(mean_squared_error(y_test, predictions))

print(f'R^2 Score: {r2_11}')
print(f'RMSE: {rmse_11}')

#Save results for plotting
predictions_all_11 = automl.predict(all_data_london[features])

results_office_density_cleaned_perform = pd.DataFrame({
    'geometry': london_geometries,
    'observed': all_data_london[target],
    'predicted': predictions_all_11,
})


In [None]:
### Office Employment Density (log), Spatial Lag, Birmingham (12)

# Create training and testing data
features = feature_columns_bham_lag
target = 'log_office_employment_density'

# Split the dataset - 80/20 train test
X_train, X_test, y_train, y_test = train_test_split(all_data_bham[features], all_data_bham[target], test_size=0.2, random_state=3)

automl = AutoML(
    results_path ='ml_results/office_employment_density_bham_lag/',
    algorithms=["CatBoost", "Xgboost", "Random Forest"],
    model_time_limit=1*60,
    start_random_models=5,
    hill_climbing_steps=2,
    top_models_to_improve=3,
    features_selection=False,
    stack_models=False,
    train_ensemble=True,
    explain_level=1,
    validation_strategy={
        "validation_type": "kfold",
        "k_folds": 4,
        "shuffle": False,
        "stratify": True,
    }
)

# fit the model
automl.fit(X_train, y_train)

y_train = y_train.astype(np.float32)
y_test = y_test.astype(np.float32)

predictions = automl.predict(X_test)
predictions = predictions.astype(np.float32)

# Fix NaN values
y_test = y_test.fillna(0)

r2_12 = r2_score(y_test, predictions)
rmse_12 = np.sqrt(mean_squared_error(y_test, predictions))

print(f'R^2 Score: {r2_12}')
print(f'RMSE: {rmse_12}')

#Save results for plotting
predictions_all_12 = automl.predict(all_data_bham[features])

results_office_density_cleaned_perform = pd.DataFrame({
    'geometry': bham_geometries,
    'observed': all_data_bham[target],
    'predicted': predictions_all_12,
})


In [45]:
### Employment (log), NO Spatial Lag, London, POI only (13)

# Ignore depracation warnings
warnings.filterwarnings('ignore')

# Create training and testing data
features = feature_columns_london_poi
target = 'log_total_employment'

# Split the dataset - 80/20 train test
X_train, X_test, y_train, y_test = train_test_split(all_data_london[features], all_data_london[target], test_size=0.2, random_state=3)

automl = AutoML(
    results_path ='ml_results/log_employment_london_poi/',
    algorithms=["CatBoost", "Xgboost", "Random Forest"],
    model_time_limit=1*60,
    start_random_models=5,
    hill_climbing_steps=2,
    top_models_to_improve=3,
    features_selection=False,
    stack_models=False,
    train_ensemble=True,
    explain_level=1,
    validation_strategy={
        "validation_type": "kfold",
        "k_folds": 4,
        "shuffle": False,
        "stratify": True,
    }
)

# fit the model
automl.fit(X_train, y_train)

y_train = y_train.astype(np.float32)
y_test = y_test.astype(np.float32)

predictions = automl.predict(X_test)
predictions = predictions.astype(np.float32)

# Fix NaN values
y_test = y_test.fillna(0)

r2_13 = r2_score(y_test, predictions)
rmse_13 = np.sqrt(mean_squared_error(y_test, predictions))

print(f'R^2 Score: {r2_13}')
print(f'RMSE: {rmse_13}')

#Save results for plotting
predictions_all_13 = automl.predict(all_data_london[features])

results_office_density_cleaned_perform = pd.DataFrame({
    'geometry': london_geometries,
    'observed': all_data_london[target],
    'predicted': predictions_all_13,
})


This model has already been fitted. You can use predict methods or select a new 'results_path' for a new 'fit()'.
R^2 Score: 0.7256287050510382
RMSE: 0.672401487827301


In [36]:
### Employment (log), NO Spatial Lag, Brimingham, POI only (14)

# Create training and testing data
features = feature_columns_bham_poi
target = 'log_total_employment'

# Split the dataset - 80/20 train test
X_train, X_test, y_train, y_test = train_test_split(all_data_bham[features], all_data_bham[target], test_size=0.2, random_state=3)

automl = AutoML(
    results_path ='ml_results/log_employment_bham_poi/',
    algorithms=["CatBoost", "Xgboost", "Random Forest"],
    model_time_limit=1*60,
    start_random_models=5,
    hill_climbing_steps=2,
    top_models_to_improve=3,
    features_selection=False,
    stack_models=False,
    train_ensemble=True,
    explain_level=1,
    validation_strategy={
        "validation_type": "kfold",
        "k_folds": 4,
        "shuffle": False,
        "stratify": True,
    }
)

# fit the model
automl.fit(X_train, y_train)

y_train = y_train.astype(np.float32)
y_test = y_test.astype(np.float32)

predictions = automl.predict(X_test)
predictions = predictions.astype(np.float32)

# Fix NaN values
y_test = y_test.fillna(0)

r2_14 = r2_score(y_test, predictions)
rmse_14 = np.sqrt(mean_squared_error(y_test, predictions))

print(f'R^2 Score: {r2_14}')
print(f'RMSE: {rmse_14}')

#Save results for plotting
predictions_all_14 = automl.predict(all_data_bham[features])

results_office_density_cleaned_perform = pd.DataFrame({
    'geometry': bham_geometries,
    'observed': all_data_bham[target],
    'predicted': predictions_all_14,
})




AutoML directory: ml_results/log_employment_bham_poi/
The task is regression with evaluation metric rmse
AutoML will use algorithms: ['CatBoost', 'Xgboost', 'Random Forest']
AutoML will ensemble available models
AutoML steps: ['simple_algorithms', 'default_algorithms', 'not_so_random', 'hill_climbing_1', 'hill_climbing_2', 'ensemble']
Skip simple_algorithms because no parameters were generated.
* Step default_algorithms will try to check up to 3 models
1_Default_Xgboost rmse 0.796247 trained in 1077.34 seconds
2_Default_CatBoost rmse 0.811589 trained in 10.74 seconds
3_Default_RandomForest rmse 0.819108 trained in 18.32 seconds
* Step not_so_random will try to check up to 12 models
4_Xgboost rmse 0.814556 trained in 4.52 seconds
8_CatBoost rmse 0.820528 trained in 17.07 seconds
12_RandomForest rmse 0.81786 trained in 18.95 seconds
5_Xgboost rmse 0.818373 trained in 5.14 seconds
9_CatBoost rmse 0.835969 trained in 14.25 seconds
13_RandomForest rmse 0.847451 trained in 20.41 seconds
6_Xg

In [None]:
### Employment Density, NO Spatial Lag, London, POI only (15)


# Create training and testing data
features = feature_columns_london_poi
target = 'log_employment_density'

# Split the dataset - 80/20 train test
X_train, X_test, y_train, y_test = train_test_split(all_data_london[features], all_data_london[target], test_size=0.2, random_state=3)

automl = AutoML(
    results_path ='ml_results/employment_density_london_poi/',
    algorithms=["CatBoost", "Xgboost", "Random Forest"],
    model_time_limit=1*60,
    start_random_models=5,
    hill_climbing_steps=2,
    top_models_to_improve=3,
    features_selection=False,
    stack_models=False,
    train_ensemble=True,
    explain_level=1,
    validation_strategy={
        "validation_type": "kfold",
        "k_folds": 4,
        "shuffle": False,
        "stratify": True,
    }
)

# fit the model
automl.fit(X_train, y_train)

y_train = y_train.astype(np.float32)
y_test = y_test.astype(np.float32)

predictions = automl.predict(X_test)
predictions = predictions.astype(np.float32)

# Fix NaN values
y_test = y_test.fillna(0)

r2_15 = r2_score(y_test, predictions)
rmse_15 = np.sqrt(mean_squared_error(y_test, predictions))

print(f'R^2 Score: {r2_15}')
print(f'RMSE: {rmse_15}')

#Save results for plotting
predictions_all_15 = automl.predict(all_data_london[features])

results_office_density_cleaned_perform = pd.DataFrame({
    'geometry': london_geometries,
    'observed': all_data_london[target],
    'predicted': predictions_all_15,
})


In [None]:
### Employment Density, NO Spatial Lag, Brimingham, POI only (16)

# Create training and testing data
features = feature_columns_bham_poi
target = 'log_employment_density'

# Split the dataset - 80/20 train test
X_train, X_test, y_train, y_test = train_test_split(all_data_bham[features], all_data_bham[target], test_size=0.2, random_state=3)

automl = AutoML(
    results_path ='ml_results/employment_density_bham_poi/',
    algorithms=["CatBoost", "Xgboost", "Random Forest"],
    model_time_limit=1*60,
    start_random_models=5,
    hill_climbing_steps=2,
    top_models_to_improve=3,
    features_selection=False,
    stack_models=False,
    train_ensemble=True,
    explain_level=1,
    validation_strategy={
        "validation_type": "kfold",
        "k_folds": 4,
        "shuffle": False,
        "stratify": True,
    }
)

# fit the model
automl.fit(X_train, y_train)

y_train = y_train.astype(np.float32)
y_test = y_test.astype(np.float32)

predictions = automl.predict(X_test)
predictions = predictions.astype(np.float32)

# Fix NaN values
y_test = y_test.fillna(0)

r2_16 = r2_score(y_test, predictions)
rmse_16 = np.sqrt(mean_squared_error(y_test, predictions))

print(f'R^2 Score: {r2_16}')
print(f'RMSE: {rmse_16}')

#Save results for plotting
predictions_all_16 = automl.predict(all_data_bham[features])

results_office_density_cleaned_perform = pd.DataFrame({
    'geometry': bham_geometries,
    'observed': all_data_bham[target],
    'predicted': predictions_all_16,
})


In [None]:
### Office Employment Density, NO Spatial Lag, London, POI only (17)


# Create training and testing data
features = feature_columns_london_poi
target = 'log_office_employment_density'

# Split the dataset - 80/20 train test
X_train, X_test, y_train, y_test = train_test_split(all_data_london[features], all_data_london[target], test_size=0.2, random_state=3)

automl = AutoML(
    results_path ='ml_results/office_employment_density_london_poi/',
    algorithms=["CatBoost", "Xgboost", "Random Forest"],
    model_time_limit=1*60,
    start_random_models=5,
    hill_climbing_steps=2,
    top_models_to_improve=3,
    features_selection=False,
    stack_models=False,
    train_ensemble=True,
    explain_level=1,
    validation_strategy={
        "validation_type": "kfold",
        "k_folds": 4,
        "shuffle": False,
        "stratify": True,
    }
)

# fit the model
automl.fit(X_train, y_train)

y_train = y_train.astype(np.float32)
y_test = y_test.astype(np.float32)

predictions = automl.predict(X_test)
predictions = predictions.astype(np.float32)

# Fix NaN values
y_test = y_test.fillna(0)

r2_17 = r2_score(y_test, predictions)
rmse_17 = np.sqrt(mean_squared_error(y_test, predictions))

print(f'R^2 Score: {r2_17}')
print(f'RMSE: {rmse_17}')

#Save results for plotting
predictions_all_17 = automl.predict(all_data_london[features])

results_office_density_cleaned_perform = pd.DataFrame({
    'geometry': london_geometries,
    'observed': all_data_london[target],
    'predicted': predictions_all_17,
})


In [None]:
### Office Employment Density, NO Spatial Lag, Birmingham, POI only (18)

# Create training and testing data
features = feature_columns_bham_poi
target = 'log_office_employment_density'

# Split the dataset - 80/20 train test
X_train, X_test, y_train, y_test = train_test_split(all_data_bham[features], all_data_bham[target], test_size=0.2, random_state=3)

automl = AutoML(
    results_path ='ml_results/office_employment_density_bham_poi/',
    algorithms=["CatBoost", "Xgboost", "Random Forest"],
    model_time_limit=1*60,
    start_random_models=5,
    hill_climbing_steps=2,
    top_models_to_improve=3,
    features_selection=False,
    stack_models=False,
    train_ensemble=True,
    explain_level=1,
    validation_strategy={
        "validation_type": "kfold",
        "k_folds": 4,
        "shuffle": False,
        "stratify": True,
    }
)

# fit the model
automl.fit(X_train, y_train)

y_train = y_train.astype(np.float32)
y_test = y_test.astype(np.float32)

predictions = automl.predict(X_test)
predictions = predictions.astype(np.float32)

# Fix NaN values
y_test = y_test.fillna(0)

r2_18 = r2_score(y_test, predictions)
rmse_18 = np.sqrt(mean_squared_error(y_test, predictions))

print(f'R^2 Score: {r2_18}')
print(f'RMSE: {rmse_18}')

#Save results for plotting
predictions_all_18 = automl.predict(all_data_bham[features])

results_office_density_cleaned_perform = pd.DataFrame({
    'geometry': bham_geometries,
    'observed': all_data_bham[target],
    'predicted': predictions_all_18,
})


In [None]:
### Employment (log), Spatial Lag, NO building footprints, London (19)

In [None]:
### Employment density (log), Spatial Lag, NO building footprints, London (20)

In [None]:
### Office Employment Density (log), Spatial Lag, NO building footprints, London (21)

In [None]:
### Employment (log), Spatial Lag, NO building footprints, Birmingham (22)

In [None]:
### Employment density (log), Spatial Lag, NO building footprints, Birmingham (23)

In [None]:
### Office Employment Density (log), Spatial Lag, NO building footprints, Birmingham (24)

In [84]:
### Employment (log), NO Spatial Lag, London,  POI EXCLUSIVE (25)

# Ignore depracation warnings
warnings.filterwarnings('ignore')

# Create training and testing data
features = feature_columns_london_poi = [column for column in feature_columns_london_poi if column != 'log_num_places']
target = 'log_total_employment'

# Split the dataset - 80/20 train test
X_train, X_test, y_train, y_test = train_test_split(all_data_london[features], all_data_london[target], test_size=0.2, random_state=3)

automl = AutoML(
    results_path ='ml_results/poi_exclusive_london/',
    algorithms=["CatBoost", "Xgboost", "Random Forest"],
    model_time_limit=1*60,
    start_random_models=5,
    hill_climbing_steps=2,
    top_models_to_improve=3,
    features_selection=False,
    stack_models=False,
    train_ensemble=True,
    explain_level=1,
    validation_strategy={
        "validation_type": "kfold",
        "k_folds": 4,
        "shuffle": False,
        "stratify": True,
    }
)

# fit the model
automl.fit(X_train, y_train)

y_train = y_train.astype(np.float32)
y_test = y_test.astype(np.float32)

predictions = automl.predict(X_test)
predictions = predictions.astype(np.float32)

# Fix NaN values
y_test = y_test.fillna(0)

r2_19 = r2_score(y_test, predictions)
rmse_19 = np.sqrt(mean_squared_error(y_test, predictions))

print(f'R^2 Score: {r2_19}')
print(f'RMSE: {rmse_19}')

#Save results for plotting
predictions_all_19 = automl.predict(all_data_london[features])

results_london_poi = pd.DataFrame({
    'name': london_names,
    'geometry': london_geometries,
    'observed': all_data_london[target],
    'predicted': predictions_all_19,
})

# Save to project folder
results_london_poi.to_csv("data/combined_data/model_results_london_poi.csv")

This model has already been fitted. You can use predict methods or select a new 'results_path' for a new 'fit()'.
R^2 Score: 0.7102393476371849
RMSE: 0.6910015940666199


In [86]:
### Employment (log), NO Spatial Lag, Birmingham, POI EXCLUSIVE (26)

# Ignore depracation warnings
warnings.filterwarnings('ignore')

# Create training and testing data
features = feature_columns_bham_poi = [column for column in feature_columns_bham_poi if column != 'log_num_places']
target = 'log_total_employment'

# Split the dataset - 80/20 train test
X_train, X_test, y_train, y_test = train_test_split(all_data_bham[features], all_data_bham[target], test_size=0.2, random_state=3)

automl = AutoML(
    results_path ='ml_results/poi_exclusive_bham/',
    algorithms=["CatBoost", "Xgboost", "Random Forest"],
    model_time_limit=1*60,
    start_random_models=5,
    hill_climbing_steps=2,
    top_models_to_improve=3,
    features_selection=False,
    stack_models=False,
    train_ensemble=True,
    explain_level=1,
    validation_strategy={
        "validation_type": "kfold",
        "k_folds": 4,
        "shuffle": False,
        "stratify": True,
    }
)

# fit the model
automl.fit(X_train, y_train)

y_train = y_train.astype(np.float32)
y_test = y_test.astype(np.float32)

predictions = automl.predict(X_test)
predictions = predictions.astype(np.float32)

# Fix NaN values
y_test = y_test.fillna(0)

r2_20 = r2_score(y_test, predictions)
rmse_20 = np.sqrt(mean_squared_error(y_test, predictions))

print(f'R^2 Score: {r2_20}')
print(f'RMSE: {rmse_20}')

#Save results for plotting
predictions_all_20 = automl.predict(all_data_bham[features])

results_bham_poi = pd.DataFrame({
    'name': bham_names,
    'geometry': bham_geometries,
    'observed': all_data_bham[target],
    'predicted': predictions_all_20,
})

# Save to project folder
results_bham_poi.to_csv("data/combined_data/model_results_bham_poi.csv")

This model has already been fitted. You can use predict methods or select a new 'results_path' for a new 'fit()'.
R^2 Score: 0.6226217466175756
RMSE: 0.7312378883361816


In [None]:
### Employment Density, NO Spatial Lag, London,  POI EXCLUSIVE (27)

# Ignore depracation warnings
warnings.filterwarnings('ignore')

# Create training and testing data
features = feature_columns_london_poi = [column for column in feature_columns_london_poi if column != 'log_num_places']
target = 'log_employment_density'

# Split the dataset - 80/20 train test
X_train, X_test, y_train, y_test = train_test_split(all_data_london[features], all_data_london[target], test_size=0.2, random_state=3)

automl = AutoML(
    results_path ='ml_results/poi_exclusive_london_density/',
    algorithms=["CatBoost", "Xgboost", "Random Forest"],
    model_time_limit=1*60,
    start_random_models=5,
    hill_climbing_steps=2,
    top_models_to_improve=3,
    features_selection=False,
    stack_models=False,
    train_ensemble=True,
    explain_level=1,
    validation_strategy={
        "validation_type": "kfold",
        "k_folds": 4,
        "shuffle": False,
        "stratify": True,
    }
)

# fit the model
automl.fit(X_train, y_train)

y_train = y_train.astype(np.float32)
y_test = y_test.astype(np.float32)

predictions = automl.predict(X_test)
predictions = predictions.astype(np.float32)

# Fix NaN values
y_test = y_test.fillna(0)

r2_21 = r2_score(y_test, predictions)
rmse_21 = np.sqrt(mean_squared_error(y_test, predictions))

print(f'R^2 Score: {r2_21}')
print(f'RMSE: {rmse_21}')

#Save results for plotting
predictions_all_21 = automl.predict(all_data_london[features])

results_london_poi_density = pd.DataFrame({
    'name': london_names,
    'geometry': london_geometries,
    'observed': all_data_london[target],
    'predicted': predictions_all_21,
})

# Save to project folder
results_london_poi_density.to_csv("data/combined_data/model_results_london_poi_density.csv")

In [None]:
### Employment Density, NO Spatial Lag, Birmingham, POI EXCLUSIVE (28)

# Ignore depracation warnings
warnings.filterwarnings('ignore')

# Create training and testing data
features = feature_columns_bham_poi = [column for column in feature_columns_bham_poi if column != 'log_num_places']
target = 'log_employment_density'

# Split the dataset - 80/20 train test
X_train, X_test, y_train, y_test = train_test_split(all_data_bham[features], all_data_bham[target], test_size=0.2, random_state=3)

automl = AutoML(
    results_path ='ml_results/poi_exclusive_bham_density/',
    algorithms=["CatBoost", "Xgboost", "Random Forest"],
    model_time_limit=1*60,
    start_random_models=5,
    hill_climbing_steps=2,
    top_models_to_improve=3,
    features_selection=False,
    stack_models=False,
    train_ensemble=True,
    explain_level=1,
    validation_strategy={
        "validation_type": "kfold",
        "k_folds": 4,
        "shuffle": False,
        "stratify": True,
    }
)

# fit the model
automl.fit(X_train, y_train)

y_train = y_train.astype(np.float32)
y_test = y_test.astype(np.float32)

predictions = automl.predict(X_test)
predictions = predictions.astype(np.float32)

# Fix NaN values
y_test = y_test.fillna(0)

r2_22 = r2_score(y_test, predictions)
rmse_22 = np.sqrt(mean_squared_error(y_test, predictions))

print(f'R^2 Score: {r2_22}')
print(f'RMSE: {rmse_22}')

#Save results for plotting
predictions_all_22 = automl.predict(all_data_bham[features])

results_bham_poi_density = pd.DataFrame({
    'name': bham_names,
    'geometry': bham_geometries,
    'observed': all_data_bham[target],
    'predicted': predictions_all_22,
})

# Save to project folder
results_bham_poi_density.to_csv("data/combined_data/model_results_bham_poi_density.csv")

### Attempt with Demographic Data

In [98]:
# Same Analysis on Education, Employment Status, and Multiple Deprivation Data

general_health = pd.read_csv("data/lsoa_data/TS037_general_health.csv", skiprows = 7, header = 0)
employment_residential = pd.read_csv("data/lsoa_data/TS066_economic_activity_status.csv", skiprows = 7, header = 0)
education = pd.read_csv("data/lsoa_data/TS067_highest_qualification.csv", skiprows = 7, header = 0)
household_comp = pd.read_csv("data/lsoa_data/TS003_household_composition.csv", skiprows = 6, header = 0)
age_bands = pd.read_csv("data/lsoa_data/TS007B_age_broad_band.csv", skiprows = 4, header = 0)
english_lang = pd.read_csv("data/lsoa_data/TS029_english_language.csv", skiprows = 6, header = 0)

#Separate name into LSOA11CD and LSOA11NM (taken from DataCleaning.ipynb)
def split_column(value):
    if isinstance(value, str):
        code, name = value.split(' : ')
        return code.strip(), name.strip()
    else:
        return None, None

# Parse Code and Name out
general_health[['LSOA11CD', 'LSOA11NM']] = general_health['2021 super output area - lower layer'].apply(lambda x: pd.Series(split_column(x)))
employment_residential[['LSOA11CD', 'LSOA11NM']] = employment_residential['2021 super output area - lower layer'].apply(lambda x: pd.Series(split_column(x)))
education[['LSOA11CD', 'LSOA11NM']] = education['2021 super output area - lower layer'].apply(lambda x: pd.Series(split_column(x)))
household_comp[['LSOA11CD', 'LSOA11NM']] = household_comp['2021 super output area - lower layer'].apply(lambda x: pd.Series(split_column(x)))
age_bands[['LSOA11CD', 'LSOA11NM']] = age_bands['2021 super output area - lower layer'].apply(lambda x: pd.Series(split_column(x)))
english_lang[['LSOA11CD', 'LSOA11NM']] = english_lang['2021 super output area - lower layer'].apply(lambda x: pd.Series(split_column(x)))

# Drop original column
general_health = general_health.drop(columns=['2021 super output area - lower layer'])
employment_residential = employment_residential.drop(columns=['2021 super output area - lower layer'])
education = education.drop(columns=['2021 super output area - lower layer'])
household_comp = household_comp.drop(columns=['2021 super output area - lower layer'])
age_bands = age_bands.drop(columns=['2021 super output area - lower layer'])
english_lang = english_lang.drop(columns=['2021 super output area - lower layer'])


multiple_deprivation = pd.read_csv("data/lsoa_data/multiple_deprivation.csv", header = 0)
multiple_deprivation.rename(columns = {'LSOA code (2011)':'LSOA11CD', 'LSOA name (2011)':'LSOA11NM'}, inplace=True)
multiple_deprivation = multiple_deprivation.drop(columns=["Local Authority District code (2019)", "Local Authority District name (2019)"])


multiple_deprivation.head()


Unnamed: 0,LSOA11CD,LSOA11NM,Index of Multiple Deprivation (IMD) Rank,Index of Multiple Deprivation (IMD) Decile
0,E01000001,City of London 001A,29199,9
1,E01000002,City of London 001B,30379,10
2,E01000003,City of London 001C,14915,5
3,E01000005,City of London 001E,8678,3
4,E01000006,Barking and Dagenham 016A,14486,5


In [116]:
# Join all data together

combined_census = pd.merge(general_health, employment_residential, on='LSOA11CD', suffixes=('', '_drop'))
combined_census = pd.merge(combined_census, education, on='LSOA11CD', suffixes=('', '_drop'))
combined_census = pd.merge(combined_census, household_comp, on='LSOA11CD', suffixes=('', '_drop'))
combined_census = pd.merge(combined_census, age_bands, on='LSOA11CD', suffixes=('', '_drop'))
combined_census = pd.merge(combined_census, english_lang, on='LSOA11CD', suffixes=('', '_drop'))
combined_census = pd.merge(combined_census, multiple_deprivation, on='LSOA11CD', suffixes=('', '_drop'))

# Get column lists
combined_census = combined_census[[col for col in combined_census.columns if not col.endswith('_drop')]]
combined_census_columns = list(combined_census.columns)
exclude_columns = ['LSOA11CD', 'LSOA11NM', 'residual', 'Total: All usual residents', 'Total: All usual residents aged 16 years and over']
census_feature_columns = [col for col in combined_census.columns if col not in exclude_columns]

# Join with london and birmingham model output data
combined_model = all_data_london
combined_model = combined_model.drop(columns=['LSOA11NM'])
combined_census = combined_census.merge(combined_model, on='LSOA11CD', how='inner')

# Fix string rank data
combined_census['Index of Multiple Deprivation (IMD) Rank'] = combined_census['Index of Multiple Deprivation (IMD) Rank'].str.replace(',', '')
combined_census['Index of Multiple Deprivation (IMD) Rank']  = pd.to_numeric(combined_census['Index of Multiple Deprivation (IMD) Rank'] )   

combined_census.head()

Unnamed: 0,Total: All usual residents,Very good health,Good health,Fair health,Bad health,Very bad health,LSOA11CD,LSOA11NM,Total: All usual residents aged 16 years and over,Economically active (excluding full-time students),...,lag_veterinarian,lag_videographer,lag_vitamins_and_supplements,lag_warehouses,lag_waterproofing,lag_waxing,lag_wholesale_grocer,lag_wildlife_sanctuary,lag_wills_trusts_and_probate,lag_winery
0,100.0,58.2,31.7,8.1,1.2,0.7,E01000001,City of London 001A,100.0,65.7,...,0.166667,0.333333,0.0,0.166667,0.0,0.0,0.0,0.0,0.333333,0.0
1,100.0,60.4,30.6,6.7,1.7,0.6,E01000002,City of London 001B,100.0,69.3,...,0.0,0.0,0.333333,0.833333,0.0,0.166667,0.0,0.0,0.166667,0.5
2,100.0,49.0,36.4,11.5,2.7,0.4,E01000003,City of London 001C,100.0,70.3,...,0.166667,0.166667,0.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0
3,100.0,45.5,35.3,12.0,5.7,1.5,E01000005,City of London 001E,100.0,55.8,...,0.0,0.0,0.0,0.166667,0.0,0.166667,0.0,0.0,0.0,0.0
4,100.0,64.6,28.8,4.9,1.5,0.1,E01032739,City of London 001F,100.0,78.4,...,0.0,0.0,0.0,0.333333,0.0,0.0,0.0,0.0,0.0,0.5


In [130]:
### Employment (log), NO Spatial Lag, London, Demographic Data

# Ignore depracation warnings
warnings.filterwarnings('ignore')

# Create training and testing data
features = feature_columns_london + census_feature_columns
target = 'log_total_employment'

# Split the dataset - 80/20 train test
X_train, X_test, y_train, y_test = train_test_split(combined_census[features], combined_census[target], test_size=0.2, random_state=3)

automl = AutoML(
    results_path ='ml_results/london_employment_demographic/',
    algorithms=["CatBoost", "Xgboost", "Random Forest"],
    model_time_limit=1*60,
    start_random_models=5,
    hill_climbing_steps=2,
    top_models_to_improve=3,
    features_selection=False,
    stack_models=False,
    train_ensemble=True,
    explain_level=1,
    validation_strategy={
        "validation_type": "kfold",
        "k_folds": 4,
        "shuffle": False,
        "stratify": True,
    }
)

# fit the model
automl.fit(X_train, y_train)

y_train = y_train.astype(np.float32)
y_test = y_test.astype(np.float32)

predictions = automl.predict(X_test)
predictions = predictions.astype(np.float32)

# Fix NaN values
y_test = y_test.fillna(0)

r2_23 = r2_score(y_test, predictions)
rmse_23 = np.sqrt(mean_squared_error(y_test, predictions))

print(f'R^2 Score: {r2_23}')
print(f'RMSE: {rmse_23}')

#Save results for plotting
predictions_all_23 = automl.predict(combined_census[features])

results_london_demographic = pd.DataFrame({
    'name': london_names,
    'geometry': london_geometries,
    'observed': combined_census[target],
    'predicted': predictions_all_23,
})

# Save to project folder
results_london_demographic.to_csv("data/combined_data/model_results_london_demographic.csv")

This model has already been fitted. You can use predict methods or select a new 'results_path' for a new 'fit()'.
R^2 Score: 0.7474297689503127
RMSE: 0.6102148294448853


ValueError: array length 4659 does not match index length 4835