- Figure out which train datset is predicting which test dataset
- Make pandas table in order to be able to evaluate the different models that just combines them all

## All countries

This code uses spatial cross-validation to evaluate Random Forest, XGBoost and SVM models, by dividing the data into group folds according to the country in which the counter is found. 

In [1]:
import shap
import pickle
import numpy as np
import pandas as pd
import xgboost as xgb
import statsmodels.api as sm
from math import sqrt
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.svm import SVR
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV, KFold, GroupKFold
from sklearn.linear_model import PoissonRegressor
from statsmodels.stats.outliers_influence import variance_inflation_factor

  from .autonotebook import tqdm as notebook_tqdm


### Data selection

In [2]:
#Load in data
#Import csv and remove non-numerical variables
df = pd.read_csv('/Users/winke/Documents/University/Thesis/Predicting_cycling/weekly_new.csv')
df = df.drop(['name', 'year', 'week', 'latitude', 'longitude', 'espg'] , axis=1)

df = df[['counts_week', 'country', 'dist_to_greenspace', 'dist_to_edu', 'bike_points', 'bus_stops', 'business_shops', 'traffic_signals', 'cycle_length',
         'lst_mean', 'pop_sum', 'build_area', 'ndvi_mean', 'dist_to_bikePOI', 'dist_to_train', '3_way_int_count', 'median_speed', 'orientation_entropy', 'lc_entropy',
         'restaurants', 'dem_mean', 'dem_std']]

country = df

#Create dependent and independent variable
y = country.loc[:,'counts_week']
X = country.drop(['counts_week', 'country'], axis=1)

# # Normalize dependent variable
# scaler = StandardScaler()
# data_scaled = scaler.fit_transform(y.values.reshape(-1, 1))
# y = pd.Series(data_scaled.ravel())
# y.to_csv('y_og_std.csv')

#Use natural log as normalization
y = np.log(y + 1e-8).round(5)

y.to_csv("log_counts.csv")

Option 1: Spatial cross validation

In [4]:
#Create spatial cross-validation according to countries
k = GroupKFold(n_splits=3)
countries = country['country'].values 
country_kfold = k.split(X, y, countries)

# Create the nested list of train and test indices for each fold
train_indices, test_indices = [list(traintest) for traintest in zip(*country_kfold)]
city_cv = [*zip(train_indices,test_indices)]

Option 2: Regular train/test split

In [3]:
#Create traintestsplit for machine learning models
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

## Linear Regression (OLS)

In [13]:
#Add a constant to the data and run OLS regression
Xc = sm.add_constant(X)
Xc.reset_index(drop=True, inplace=True)
y.reset_index(drop=True, inplace=True)

model = sm.OLS(y, Xc).fit()

# Create a DataFrame with the X variables
X_df = pd.DataFrame(X, columns=X.columns)

# Compute the correlation matrix
corr_matrix = X_df.corr()

corr_matrix.to_csv('correlation_matrix.csv')

print(model.summary())

                            OLS Regression Results                            
Dep. Variable:            counts_week   R-squared:                       0.324
Model:                            OLS   Adj. R-squared:                  0.309
Method:                 Least Squares   F-statistic:                     21.40
Date:                Sat, 25 Feb 2023   Prob (F-statistic):           1.12e-62
Time:                        13:04:20   Log-Likelihood:                -1821.4
No. Observations:                 912   AIC:                             3685.
Df Residuals:                     891   BIC:                             3786.
Df Model:                          20                                         
Covariance Type:            nonrobust                                         
                          coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------------
const                   5.2109    

In [14]:
# Predict target variable using the OLS model
y_pred = model.predict(Xc)

# Calculate evaluation metrics
mse = mean_squared_error(y, y_pred)
rmse = mean_squared_error(y, y_pred, squared=False)
mae = mean_absolute_error(y, y_pred)

# Print evaluation metrics
print("MSE: ", mse)
print("RMSE: ", rmse)
print("MAE: ", mae)

MSE:  3.1782828492864774
RMSE:  1.782773919846955
MAE:  1.2514798545777468


In [22]:
Xnc = Xc.drop('const', axis=1)

#Check multicolinearity with VIF
vif = pd.DataFrame()
vif["VIF"] = [variance_inflation_factor(Xnc.values, i) for i in range(Xnc.shape[1])]
vif["features"] = Xnc.columns

print(vif)

          VIF             features
0    2.917260   dist_to_greenspace
1    3.476426          dist_to_edu
2    1.954642          bike_points
3    2.584574            bus_stops
4    2.448311       business_shops
5    2.815068      traffic_signals
6    4.110663         cycle_length
7   13.175478             lst_mean
8    2.450190              pop_sum
9    7.833051           build_area
10  10.737904            ndvi_mean
11   2.822339      dist_to_bikePOI
12   2.599683        dist_to_train
13   6.508410      3_way_int_count
14   5.072080         median_speed
15  27.279388  orientation_entropy
16  36.031328           lc_entropy
17   4.032751          restaurants
18   1.231853             dem_mean
19   2.871206              dem_std


## Random Forest

With spatial cross validation

In [6]:
# Define the parameter grid to search
param_grid = {'n_estimators': [50, 100, 200, 300, 400, 500],
            'max_depth': [5, 10, 15, 20, 25],
            'min_samples_split': [2, 5, 10]}

# Iterate through the folds
for train_index, test_index in city_cv:
    # Use the indices to split the data
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    trains = df.iloc[train_index]['country'].drop_duplicates().values
    test = df.iloc[test_index]['country'].drop_duplicates().values

    # Create the random forest model
    rf = RandomForestRegressor()

    # Create the K-fold cross-validation object
    kf = KFold(n_splits=2)

    # Create the grid search object
    grid_search = GridSearchCV(rf, param_grid, cv=kf, scoring='neg_mean_squared_error')

    # Fit the grid search to the data
    grid_search.fit(X_train, y_train)

    #Run the model on the test split of the data
    y_pred = grid_search.predict(X_test)
    score = mean_squared_error(y_test, y_pred)
    
    #Calculate model statistics and print them
    r2 = r2_score(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    rmse = sqrt(mse)
    mae = mean_absolute_error(y_test, y_pred)
    print("*** FOLD *** training: " + trains[0]+ " " + trains[1] + " | test: " + test[0])
    print("Fold score:", score)
    print("R-squared: ", r2)
    print("MSE: ", mse)
    print("RMSE: ", rmse)
    print("MAE: ", mae)

    # Print the best parameters and score
    print("Best parameters: ", grid_search.best_params_)
    print("Best score: ", grid_search.best_score_)

NameError: name 'city_cv' is not defined

With K-fold cross validation

In [25]:
# Define the parameter grid to search
param_grid = {'n_estimators': [100, 200, 300, 400, 500],
            'max_depth': [5, 10, 15, 20, 25],
            'min_samples_split': [2, 5, 10]}

# Create the random forest model
rf = RandomForestRegressor()

# Create the K-fold cross-validation object
kf = KFold(n_splits=5)

# Create the grid search object
grid_search = GridSearchCV(rf, param_grid, cv=kf, scoring='neg_mean_squared_error')

# Fit the grid search to the data
grid_search.fit(X_train, y_train)

# Print the best parameters and score
print("Best parameters: ", grid_search.best_params_)
print("Best score: ", grid_search.best_score_)

#Create model from best parameters
best_params = grid_search.best_params_
model_rf = RandomForestRegressor(**best_params)

#Run the model on the test split of the data
model_rf.fit(X_train, y_train)
y_pred = grid_search.predict(X_test)

#Calculate model statistics and print them
r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)

print("R-squared: ", r2)
print("MSE: ", mse)
print("RMSE: ", rmse)
print("MAE: ", mae)

Best parameters:  {'max_depth': 20, 'min_samples_split': 5, 'n_estimators': 400}
Best score:  -2.4375273466785763
R-squared:  0.5858604965781343
MSE:  1.5685782007960256
RMSE:  1.2524289204565764
MAE:  0.888828969122294


In [26]:
# Save the trained model
with open('/Users/winke/Documents/University/Thesis/Predicting_cycling/models/standardized/all_rf_3.pkl', 'wb') as f:
    pickle.dump(model_rf, f)

## XGBOOST

With spatial cross validation

In [13]:
import xgboost as xgb
from sklearn.model_selection import GridSearchCV, KFold

# Define the parameter grid to search
param_grid = {'n_estimators': [50, 100, 200, 250, 300, 400],
              'max_depth': [3, 5, 10, 15],
              'learning_rate': [0.1, 0.3, 0.5, 1],
              'subsample': [0.8, 1],
              'colsample_bytree': [0.8, 1]
              }

# Iterate through the folds
for train_index, test_index in city_cv:
    # Use the indices to split the data
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # Create the XGBoost model
    xgb_reg = xgb.XGBRegressor(objective ='reg:squarederror')
    
    # Create the grid search object
    grid_search = GridSearchCV(xgb_reg,
                   param_grid=param_grid,
                   scoring='neg_mean_squared_error',
                   verbose=True)

    # Fit the grid search to the data
    grid_search.fit(X_train, y_train)

    #Run the model on the test split of the data
    y_pred = grid_search.predict(X_test)
    score = mean_squared_error(y_test, y_pred)
    
    #Calculate model statistics and print them
    r2 = r2_score(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    rmse = sqrt(mse)
    mae = mean_absolute_error(y_test, y_pred)
    print("*** FOLD *** training: " + trains[0]+ " " + trains[1] + " | test: " + test[0])
    print("Fold score:", score)
    print("R-squared: ", r2)
    print("MSE: ", mse)
    print("RMSE: ", rmse)
    print("MAE: ", mae)

    # Print the best parameters and score
    print("Best parameters: ", grid_search.best_params_)
    print("Best score: ", grid_search.best_score_)

NameError: name 'city_cv' is not defined

With regular K-Fold cross validation

In [19]:
import xgboost as xgb
from sklearn.model_selection import GridSearchCV, KFold

# Define the parameter grid to search
param_grid = {'n_estimators': [50, 100, 200, 250, 300],
              'max_depth': [3, 5, 10, 15],
              'learning_rate': [0.1, 0.3, 0.5, 1],
              'subsample': [0.8, 1],
              'colsample_bytree': [0.8, 1]
              }

# Create the XGBoost model
xgb_reg = xgb.XGBRegressor(objective ='reg:squarederror')

# Create the K-fold cross-validation object
kf = KFold(n_splits=5)

# Create the grid search object
grid_search = GridSearchCV(xgb_reg,
                param_grid=param_grid,
                cv=kf,
                scoring='neg_mean_squared_error',
                verbose=True)

# Fit the grid search to the data
grid_search.fit(X_train, y_train)

# Print the best parameters and score
print("Best parameters: ", grid_search.best_params_)
print("Best score: ", grid_search.best_score_)

#Create model from best parameters
best_params = grid_search.best_params_
model_xgb = xgb.XGBRegressor(**best_params)

# Get the predictions of the model on the test data
model_xgb.fit(X_train, y_train)
y_pred = model_xgb.predict(X_test)

# Calculate the R-squared, RMSE, MSE and F1-score
r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)
# f1 = f1_score(y_test, y_pred > 0.5)

print("R-squared: ", r2)
print("MSE: ", mse)
print("RMSE: ", rmse)
print("MAE: ", mae)
# print("F1 score: ", f1)

Fitting 5 folds for each of 320 candidates, totalling 1600 fits
Best parameters:  {'colsample_bytree': 0.8, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 100, 'subsample': 1}
Best score:  -0.50421547972479
R-squared:  0.6365273739220325
MSE:  0.25812837715400255
RMSE:  0.5080633593893605
MAE:  0.2769399810781852


In [20]:
# Save the trained model
with open('/Users/winke/Documents/University/Thesis/Predicting_cycling/models/standardized/all_xgb_1.pkl', 'wb') as f:
    pickle.dump(model_xgb, f)

In [None]:
#Using SHAP to explain things
explainer = shap.Explainer(model_xgb, X_train)
shap_values = explainer(X)

# shap_values.display_data = shap.datasets.adult(display=True)[0].values
shap.plots.bar(shap_values)

shap.plots.beeswarm(shap_values)

shap.plots.beeswarm(shap_values.abs, color="shap_red")

shap.plots.bar(shap_values[1])

shap.plots.scatter(shap_values[:,"pop_sum"], color=shap_values)

## SVM

With spatial cross validation

In [None]:
from sklearn.svm import SVR

# Define the parameter grid to search
param_grid = {'C': [0.1, 1, 10],
              'gamma': [0.1, 1, 10],
              'kernel': ['linear', 'rbf']
              }

# Iterate through the folds
for train_index, test_index in city_cv:
    # Use the indices to split the data
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # Create the SVM model
    svm = SVR()

    # Create the grid search object
    grid_search = GridSearchCV(svm,
                    param_grid=param_grid,
                    scoring='neg_mean_squared_error',
                    verbose=True)

    # Fit the grid search to the data
    grid_search.fit(X_train, y_train)

    #Run the model on the test split of the data
    y_pred = grid_search.predict(X_test)
    score = mean_squared_error(y_test, y_pred)
    
    #Calculate model statistics and print them
    r2 = r2_score(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    rmse = sqrt(mse)
    mae = mean_absolute_error(y_test, y_pred)
    print("*** FOLD *** training: " + trains[0]+ " " + trains[1] + " | test: " + test[0])
    print("Fold score:", score)
    print("R-squared: ", r2)
    print("MSE: ", mse)
    print("RMSE: ", rmse)
    print("MAE: ", mae)

    # Print the best parameters and score
    print("Best parameters: ", grid_search.best_params_)
    print("Best score: ", grid_search.best_score_)

With regular K-fold cross validation

In [7]:
# Define the parameter grid to search
param_grid = {'C': [100, 10, 1, 0.1],
              'gamma': [1, 0.1, 0.01, 0.001],
              'kernel': ['rbf']
              }

# Create the SVM model
svm_reg = SVR()

# Create the K-fold cross-validation object
kf = KFold(n_splits=5)

# Create the grid search object
grid_search = GridSearchCV(svm_reg,
                param_grid=param_grid,
                cv=kf,
                scoring='neg_mean_squared_error')

# Fit the grid search to the data
grid_search.fit(X_train, y_train)

# Print the best parameters and score
print("Best parameters: ", grid_search.best_params_)
print("Best score: ", grid_search.best_score_)

#Create model from best parameters
best_params = grid_search.best_params_
model_svm = SVR(**best_params)

# Get the predictions of the model on the test data
model_svm.fit(X_train, y_train)
y_pred = model_svm.predict(X_test)

# Calculate the R-squared, RMSE, MSE
r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = sqrt(mse) 
mae = mean_absolute_error(y_test, y_pred)

print("R-squared: ", r2)
print("MSE: ", mse)
print("RMSE: ", rmse)
print("MAE: ", mae)

Best parameters:  {'C': 100, 'gamma': 0.001, 'kernel': 'rbf'}
Best score:  -4.719221120924114
R-squared:  0.037234537323651784
MSE:  3.6465319167948893
RMSE:  1.9095894628937629
MAE:  1.4805920938632107
