# Modelling

This is the script where I store all my ML model runs

## Library Imports

In [24]:
# Library Imports

#Basics
import pandas as pd
import geopandas as gpd
import numpy as np
import csv

#Shapely / Spatial
from shapely import wkt
import shapely.geometry
from shapely.geometry import Polygon, MultiPolygon

#Machine Learning
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.preprocessing import OneHotEncoder

#ML from mljar-supervised
from supervised.automl import AutoML

#Warning Supression
import warnings

### Import Datasets

In [52]:
# Read London CSV
all_data_london = pd.read_csv("data/combined_data/lag/all_data_london_lag.csv")

# Read in feature column set
with open("data/combined_data/feature_columns_london.csv", 'r') as file:
    lines = file.readlines()

# Reconstruct
feature_columns_london = [''.join(line.strip().split(',')) for line in lines]

# ---

# Read Birmingham CSV
all_data_bham = pd.read_csv("data/combined_data/lag/all_data_bham_lag.csv")

# Read in feature column set
with open("data/combined_data/feature_columns_bham.csv", 'r') as file:
    lines = file.readlines()

# Reconstruct
feature_columns_bham = [''.join(line.strip().split(',')) for line in lines]

for column in feature_columns_london:
    print(f"{column}: {all_data_london[column].dtype}")

log_num_buildings: float64
log_num_places: float64
population: int64
all_avg_building_area: float64
all_lsoa_area_ratio: float64
all_total_area: float64
residential_total_area: int64
residential_avg_building_area: int64
residential_lsoa_area_ratio: int64
commercial_avg_building_area: float64
commercial_lsoa_area_ratio: float64
commercial_total_area: float64
office_total_area: int64
office_avg_building_area: int64
office_lsoa_area_ratio: int64
retail_total_area: int64
retail_avg_building_area: int64
retail_lsoa_area_ratio: int64
all_service: float64
all_clinic: float64
abuse_and_addiction_treatment: float64
all_accommodation: float64
accountant: float64
all_consultant: float64
active_life: float64
acupuncture: float64
all_center: float64
all_services: float64
all_education: float64
all_entertainment: float64
all_store: float64
all_agency: float64
all_restaurant: float64
all_program: float64
all_supplier: float64
all_production: float64
agriculture: float64
all_dealer: float64
all_manufa

In [56]:

### Employment (log), NO Spatial Lag, London

# Create training and testing data
features = feature_columns_london
target = 'log_total_employment'

# Split the dataset - 80/20 train test
X_train, X_test, y_train, y_test = train_test_split(all_data_london[features], all_data_london[target], test_size=0.2, random_state=3)

automl = AutoML(results_path="ml_results/test/", mode='Explain')

# fit the model
automl.fit(X_train, y_train)

predictions = automl.predict(X_test)
r2_log_employment = r2_score(y_test, predictions)
rmse_log_employment = np.sqrt(mean_squared_error(y_test, predictions))

print(f'R^2 Score: {r2_log_employment}')
print(f'RMSE: {rmse_log_employment}')

#Save results for plotting
predictions_all = automl.predict(all_data_cleaned[features])
geometries = all_data_cleaned.loc[all_data_cleaned[target].index, 'geometry']

results_office_density_cleaned_perform = pd.DataFrame({
    'geometry': geometries,
    'observed': all_data_cleaned[target],
    'predicted': predictions_all,
})

This model has already been fitted. You can use predict methods or select a new 'results_path' for a new 'fit()'.
NaN values in y_test: 2
NaN values in predictions: 0


 [ 6.49748322]
 [ 4.54180142]
 [ 6.01768014]
 [ 4.2415199 ]
 [ 6.04816863]
 [ 4.98589227]
 [ 6.61638305]
 [ 5.85642621]
 [ 8.6102142 ]
 [ 5.70488593]
 [ 5.90392968]
 [ 7.51259801]
 [ 4.82789466]
 [ 4.84221932]
 [ 6.13017413]
 [ 4.46873781]
 [ 5.51133439]
 [ 4.60700342]
 [ 6.47522923]
 [ 8.24740407]
 [ 6.05429932]
 [ 4.99725577]
 [ 6.5463495 ]
 [ 5.18743941]
 [ 4.48004982]
 [ 5.89330909]
 [ 4.52596018]
 [ 5.37712619]
 [ 4.72934768]
 [ 7.71757123]
 [ 5.04870078]
 [ 4.6986303 ]
 [ 5.41329381]
 [ 4.79789826]
 [ 4.74819896]
 [ 5.86121985]
 [ 5.80126139]
 [ 4.19049022]
 [ 6.19252631]
 [ 6.19873711]
 [ 5.55442616]
 [ 5.44455048]
 [ 7.70987746]
 [ 5.6249356 ]
 [ 6.79849812]
 [ 5.12594554]
 [ 6.80634734]
 [ 4.87554404]
 [ 4.5518944 ]
 [ 4.80262706]
 [ 5.25680301]
 [ 5.44003865]
 [ 5.30359217]
 [ 5.29173705]
 [ 6.38352439]
 [ 7.10422084]
 [ 6.64794108]
 [ 4.77217099]
 [ 7.47814557]
 [ 7.54772803]
 [ 6.50596759]
 [ 5.7685003 ]
 [ 6.78653285]
 [ 6.22698113]
 [ 5.88300559]
 [ 6.05913684]
 [ 4.79665

ValueError: Input contains NaN.

## Models

In [None]:
### Employment (log), NO Spatial Lag, London

# Create training and testing data
features = feature_columns_london
target = 'log_total_employment'

# Split the dataset - 80/20 train test
X_train, X_test, y_train, y_test = train_test_split(all_data_cleaned_london[features], all_data_cleaned_london[target], test_size=0.2, random_state=3)

automl = AutoML(
    results_path ='ml_results/log_employment/',
    algorithms=["CatBoost", "Xgboost", "Random Forest"],
    model_time_limit=1*60,
    start_random_models=5,
    hill_climbing_steps=3,
    top_models_to_improve=3,
    features_selection=False,
    stack_models=True,
    train_ensemble=True,
    explain_level=1,
    validation_strategy={
        "validation_type": "kfold",
        "k_folds": 4,
        "shuffle": False,
        "stratify": True,
    }
)

# fit the model
automl.fit(X_train, y_train)

predictions = automl.predict(X_test)
r2_log_employment = r2_score(y_test, predictions)
rmse_log_employment = np.sqrt(mean_squared_error(y_test, predictions))

print(f'R^2 Score: {r2_log_employment}')
print(f'RMSE: {rmse_log_employment}')

#Save results for plotting
predictions_all = automl.predict(all_data_cleaned[features])
geometries = all_data_cleaned.loc[all_data_cleaned[target].index, 'geometry']

results_office_density_cleaned_perform = pd.DataFrame({
    'geometry': geometries,
    'observed': all_data_cleaned[target],
    'predicted': predictions_all,
})

In [None]:
### Employment (log), NO Spatial Lag, Birmingham



In [None]:
### Employment (log), Spatial Lag, London



In [None]:
### Employment (log), Spatial Lag, Birmingham



In [None]:
### Employment Density, NO Spatial Lag, London



In [None]:
### Employment Density, NO Spatial Lag, Birmingham



In [None]:
### Employment Density, Spatial Lag, London



In [None]:
### Employment Density, Spatial Lag, Birmingham



In [None]:
### Office Employment (log), NO Spatial Lag, London



In [None]:
### Office Employment (log), NO Spatial Lag, Birmingham



In [None]:
### Office Employment (log), Spatial Lag, London



In [None]:
### Office Employment (log), Spatial Lag, Birmingham



In [None]:
### Office Employment Density, NO Spatial Lag, London



In [None]:
### Office Employment Density, NO Spatial Lag, Birmingham



In [None]:
### Office Employment Density, Spatial Lag, London



In [None]:
### Office Employment Density, Spatial Lag, Birmingham

