TODO:
- Spatial cross validation
- Include SHAP graphs


In [1]:
import numpy as np
import pandas as pd
import xgboost as xgb
from math import sqrt
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV, KFold, GroupKFold, cross_val_predict
from sklearn.model_selection import train_test_split

In [6]:
#Load in data
#Import csv and remove non-numerical variables
df = pd.read_csv('weekly_new.csv')
# df = df.drop(['city', 'name', 'year', 'week', 'latitude', 'longitude', 'espg'] , axis=1)
# df = df[['counts_week', 'country', '3_way_int_count', '4_way_int_count', 'dist_to_greenspace', 'bike_points',
#             'bus_stops', 'restaurants', 'cycle_length', 'dem_std', 'lst_std', 'pop_sum', 'dem_mean', 'shop_list']]
df

# #Select country and drop column
# country = df[df['country'] == 'Netherlands']
# country = country.drop('country', axis=1)


Unnamed: 0,country,city,GSV,espg,name,year,week,latitude,longitude,counts_week,...,cycle_length,ndvi_mean,ndvi_std,dem_mean,dem_std,lst_mean,lst_std,lc_entropy,pop_sum,build_area
0,Netherlands,Amsterdam,y,32631.0,Plesmanlaan,2022.0,38.0,52.351376,4.834098,128756.0,...,1397.713746,0.442252,0.190051,1.543379,1.493269,15.439598,0.668205,0.829520,778.743835,0.030575
1,Netherlands,Amsterdam,y,32631.0,Erasmusgracht,2022.0,38.0,52.375633,4.839661,102080.0,...,1379.214971,0.317476,0.172268,1.963636,2.013262,17.103036,0.746097,0.760094,662.914734,0.032607
2,Netherlands,Amsterdam,y,32631.0,Nieuwe Hemweg,2022.0,38.0,52.404118,4.842763,25152.0,...,940.962374,0.340138,0.201127,3.389908,4.263058,18.937200,1.111123,0.807145,730.611694,0.015659
3,Netherlands,Amsterdam,y,32631.0,Netwerkweg,2022.0,38.0,52.412605,4.880053,40640.0,...,1112.546007,0.400875,0.225344,-1.476636,2.439342,17.850033,1.267783,0.770912,394.863800,0.035528
4,Netherlands,Amsterdam,y,32631.0,Weesperstraat,2022.0,38.0,52.364513,4.906150,164388.0,...,1141.381386,0.183133,0.112661,6.075117,2.805760,15.368978,2.110428,0.628650,773.353027,0.063414
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
911,USA,Charlotte,n,32617.0,"Selwyn Ave, Bicycles (Q2)",2022.0,6.0,35.179017,-80.837083,174.0,...,499.595921,0.550940,0.167871,213.110092,4.210819,19.641176,0.804630,0.653389,328.154846,0.030143
912,,,,,,,,,,,...,,,,,,15.029029,0.776854,,,
913,,,,,,,,,,,...,,,,,,20.430821,0.937948,,,
914,,,,,,,,,,,...,,,,,,19.271217,0.977173,,,


## Random Forest

In [24]:
#Count variable as stratify
y = country.loc[:,'counts_week']
X = country.drop('counts_week', axis=1)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

# Standardize y variable
scaler = StandardScaler()
y_train = scaler.fit_transform(y_train.values.reshape(-1, 1))
y_train = y_train.ravel()
y_test = scaler.transform(y_test.values.reshape(-1, 1))

# Define the parameter grid to search
param_grid = {'n_estimators': [50, 100, 200],
              'max_depth': [None, 5, 10, 20],
              'min_samples_split': [2, 5, 10]}

# Create the random forest model
rf = RandomForestRegressor()

# Create the K-fold cross-validation object
kf = KFold(n_splits=10)

# Create the grid search object
grid_search = GridSearchCV(rf, param_grid, cv=kf, scoring='neg_mean_squared_error')

# Fit the grid search to the data
grid_search.fit(X_train, y_train)

# Print the best parameters and score
print("Best parameters: ", grid_search.best_params_)
print("Best score: ", grid_search.best_score_)

Best parameters:  {'max_depth': None, 'min_samples_split': 5, 'n_estimators': 100}
Best score:  -0.6777389949241421


In [25]:
# Get the best model from the grid search
model_forest = grid_search.best_estimator_

# Use the best model to make predictions on the test set
y_pred = model_forest.predict(X_test)

# Calculate the R-squared, RMSE, MSE and F1-score
r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)

print("R-squared: ", r2)
print("MSE: ", mse)
print("RMSE: ", rmse)
print("MAE: ", mae)

R-squared:  0.27445806494342984
MSE:  1.6182007721428437
RMSE:  1.2720852063218264
MAE:  0.6021136942769878


## XGBOOST

In [26]:
import xgboost as xgb
from sklearn.model_selection import GridSearchCV, KFold

# Define the parameter grid to search
param_grid = {'n_estimators': [50, 100, 200],
              'max_depth': [3, 5, 10],
              'learning_rate': [0.1, 0.5, 1],
              'subsample': [0.8, 1],
              'colsample_bytree': [0.8, 1]
              }

# Create the XGBoost model
xgb_reg = xgb.XGBRegressor(objective ='reg:squarederror')

# Create the K-fold cross-validation object
kf = KFold(n_splits=5)

# Create the grid search object
grid_search = GridSearchCV(xgb_reg,
                   param_grid=param_grid,
                   scoring='neg_mean_squared_error',
                   cv=5,
                   verbose=True)

# Fit the grid search to the data
grid_search.fit(X_train, y_train)

# Print the best parameters and score
print("Best parameters: ", grid_search.best_params_)
print("Best score: ", grid_search.best_score_)

Fitting 5 folds for each of 108 candidates, totalling 540 fits
Best parameters:  {'colsample_bytree': 1, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 50, 'subsample': 1}
Best score:  -0.6041355675372122


In [27]:
best_params = grid_search.best_params_

model_xgb = xgb.XGBRegressor(**best_params)

# Get the predictions of the model on the test data
model_xgb.fit(X_train, y_train)
y_pred = model_xgb.predict(X_test)

# Calculate the R-squared, RMSE, MSE and F1-score
r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)

print("R-squared: ", r2)
print("MSE: ", mse)
print("RMSE: ", rmse)
print("MAE: ", mae)

R-squared:  0.23169240995894902
MSE:  1.71358246212288
RMSE:  1.3090387550118139
MAE:  0.5947602570184656


## SVM

In [28]:
from sklearn.svm import SVR

# Define the parameter grid to search
param_grid = {'C': [0.1, 1, 10],
              'gamma': [0.1, 1, 10],
              'kernel': ['linear', 'rbf']
              }

# Create the SVM model
svm = SVR()

# Create the K-fold cross-validation object
kf = KFold(n_splits=5)

# Create the grid search object
grid_search = GridSearchCV(svm,
                   param_grid=param_grid,
                   scoring='neg_mean_squared_error',
                   cv=kf,
                   verbose=True)

# Fit the grid search to the data
grid_search.fit(X_train, y_train)

# Print the best parameters and score
print("Best parameters: ", grid_search.best_params_)
print("Best score: ", grid_search.best_score_)

Fitting 5 folds for each of 18 candidates, totalling 90 fits
Best parameters:  {'C': 10, 'gamma': 0.1, 'kernel': 'rbf'}
Best score:  -1.0078467678287928


In [None]:
best_params = grid_search.best_params_

model_xgb = xgb.XGBRegressor(**best_params)

# Get the predictions of the model on the test data
model_xgb.fit(X_train, y_train)
y_pred = model_xgb.predict(X_test)

# Calculate the R-squared, RMSE, MSE and F1-score
r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)

print("R-squared: ", r2)
print("MSE: ", mse)
print("RMSE: ", rmse)
print("MAE: ", mae)

Block variable for spatial autocorrelation

List of variables:
- counts week
- street per node
- streeth length total
- circuity
- 3 way int
- all distance
- all points
- cycle length
- ndvi mean
- dem std
- pop sum
- lst mean
- built env
- entropy