- Figure out which train datset is predicting which test dataset
- Make pandas table in order to be able to evaluate the different models that just combines them all

## All countries

This code uses spatial cross-validation to evaluate Random Forest, XGBoost and SVM models, by dividing the data into group folds according to the country in which the counter is found. 

In [16]:
import numpy as np
import pandas as pd
import xgboost as xgb
from math import sqrt
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV, KFold, GroupKFold, cross_val_predict

### Data selection

In [19]:
#Load in data
#Import csv and remove non-numerical variables
df = pd.read_csv('/Users/winke/Documents/University/Thesis/Predicting_cycling/weekly_new.csv')
df = df.drop(['name', 'year', 'week', 'latitude', 'longitude', 'espg'] , axis=1)
df = df[['counts_week', 'country', 'dem_mean', 'cycle_length', 'lst_mean', 'pop_sum', 'dem_std', 'build_area', 'street_length_total', 
          'ndvi_mean', 'restaurants', 'dist_to_bikePOI', '3_way_int_count', 'bike_points', 'daily_shops', 'median_speed', 'business_shops',
           'traffic_signals', 'dist_to_edu', 'dist_to_train', 'streets_per_node_avg', 'circuity_avg', 'lc_entropy']]

country = df

#Create dependent and independent variable
y = country.loc[:,'counts_week']
X = country.drop(['counts_week', 'country'], axis=1)

# Normalize dependent variable
scaler = StandardScaler()
data_scaled = scaler.fit_transform(y.values.reshape(-1, 1))
y = pd.Series(data_scaled.ravel())

#Create spatial cross-validation according to countries
k = GroupKFold(n_splits=3)
countries = country['country'].values 
country_kfold = k.split(X, y, countries)

# Create the nested list of train and test indices for each fold
train_indices, test_indices = [list(traintest) for traintest in zip(*country_kfold)]
city_cv = [*zip(train_indices,test_indices)]

## Random Forest

In [20]:
# Define the parameter grid to search
param_grid = {'n_estimators': [50, 100, 200, 300, 400, 500],
            'max_depth': [5, 10, 15, 20, 25],
            'min_samples_split': [2, 5, 10]}

# Iterate through the folds
for train_index, test_index in city_cv:
    # Use the indices to split the data
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    trains = df.iloc[train_index]['country'].drop_duplicates().values
    test = df.iloc[test_index]['country'].drop_duplicates().values

    # Create the random forest model
    rf = RandomForestRegressor()

    # Create the grid search object
    grid_search = GridSearchCV(rf, param_grid, scoring='neg_mean_squared_error')

    # Fit the grid search to the data
    grid_search.fit(X_train, y_train)

    #Run the model on the test split of the data
    y_pred = grid_search.predict(X_test)
    score = mean_squared_error(y_test, y_pred)
    
    #Calculate model statistics and print them
    r2 = r2_score(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    rmse = sqrt(mse)
    mae = mean_absolute_error(y_test, y_pred)
    print("*** FOLD *** training: " + trains[0]+ " " + trains[1] + " | test: " + test[0])
    print("Fold score:", score)
    print("R-squared: ", r2)
    print("MSE: ", mse)
    print("RMSE: ", rmse)
    print("MAE: ", mae)

    # Print the best parameters and score
    print("Best parameters: ", grid_search.best_params_)
    print("Best score: ", grid_search.best_score_)

*** FOLD *** training: USA UK | test: Netherlands
Fold score: 2.8964298136069058
R-squared:  -0.3589703200095624
MSE:  2.8964298136069058
RMSE:  1.7018900709525588
MAE:  0.9156997191240007
Best parameters:  {'max_depth': 5, 'min_samples_split': 2, 'n_estimators': 200}
Best score:  -0.022754330443188475
*** FOLD *** training: Netherlands USA | test: UK
Fold score: 0.1215839639726358
R-squared:  -10.177984576091378
MSE:  0.1215839639726358
RMSE:  0.34868892149398123
MAE:  0.23320196703240645
Best parameters:  {'max_depth': 5, 'min_samples_split': 5, 'n_estimators': 200}
Best score:  -1.495535406168285
*** FOLD *** training: Netherlands UK | test: USA
Fold score: 0.2674602863725074
R-squared:  -6.2906314820217775
MSE:  0.2674602863725074
RMSE:  0.5171656276015523
MAE:  0.36423559352920687
Best parameters:  {'max_depth': 5, 'min_samples_split': 5, 'n_estimators': 50}
Best score:  -1.7334977370224667


## XGBOOST

In [22]:
import xgboost as xgb
from sklearn.model_selection import GridSearchCV, KFold

# Define the parameter grid to search
param_grid = {'n_estimators': [50, 100, 200, 250, 300, 400],
              'max_depth': [3, 5, 10, 15],
              'learning_rate': [0.1, 0.3, 0.5, 1],
              'subsample': [0.8, 1],
              'colsample_bytree': [0.8, 1]
              }

# Iterate through the folds
for train_index, test_index in city_cv:
    # Use the indices to split the data
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # Create the XGBoost model
    xgb_reg = xgb.XGBRegressor(objective ='reg:squarederror')

    # Create the grid search object
    grid_search = GridSearchCV(xgb_reg,
                   param_grid=param_grid,
                   scoring='neg_mean_squared_error',
                   verbose=True)

    # Fit the grid search to the data
    grid_search.fit(X_train, y_train)

    #Run the model on the test split of the data
    y_pred = grid_search.predict(X_test)
    score = mean_squared_error(y_test, y_pred)
    
    #Calculate model statistics and print them
    r2 = r2_score(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    rmse = sqrt(mse)
    mae = mean_absolute_error(y_test, y_pred)
    print("*** FOLD *** training: " + trains[0]+ " " + trains[1] + " | test: " + test[0])
    print("Fold score:", score)
    print("R-squared: ", r2)
    print("MSE: ", mse)
    print("RMSE: ", rmse)
    print("MAE: ", mae)

    # Print the best parameters and score
    print("Best parameters: ", grid_search.best_params_)
    print("Best score: ", grid_search.best_score_)

Fitting 5 folds for each of 384 candidates, totalling 1920 fits
*** FOLD *** training: Netherlands UK | test: USA
Fold score: 2.829087069148973
R-squared:  -0.3273739075722957
MSE:  2.829087069148973
RMSE:  1.681989021708814
MAE:  0.9037642899237963
Best parameters:  {'colsample_bytree': 1, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 50, 'subsample': 0.8}
Best score:  -0.022677074181330213
Fitting 5 folds for each of 384 candidates, totalling 1920 fits
*** FOLD *** training: Netherlands UK | test: USA
Fold score: 0.3218394665700373
R-squared:  -28.58874242747198
MSE:  0.3218394665700373
RMSE:  0.567308969231086
MAE:  0.4134543880908087
Best parameters:  {'colsample_bytree': 0.8, 'learning_rate': 0.1, 'max_depth': 10, 'n_estimators': 50, 'subsample': 1}
Best score:  -1.4113930878591516
Fitting 5 folds for each of 384 candidates, totalling 1920 fits
*** FOLD *** training: Netherlands UK | test: USA
Fold score: 0.3514573618674418
R-squared:  -8.580286261454024
MSE:  0.3514573618

## SVM

In [23]:
from sklearn.svm import SVR

# Define the parameter grid to search
param_grid = {'C': [0.1, 1, 10],
              'gamma': [0.1, 1, 10],
              'kernel': ['linear', 'rbf']
              }

# Iterate through the folds
for train_index, test_index in city_cv:
    # Use the indices to split the data
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # Create the SVM model
    svm = SVR()

    # Create the grid search object
    grid_search = GridSearchCV(svm,
                    param_grid=param_grid,
                    scoring='neg_mean_squared_error',
                    verbose=True)

    # Fit the grid search to the data
    grid_search.fit(X_train, y_train)

    #Run the model on the test split of the data
    y_pred = grid_search.predict(X_test)
    score = mean_squared_error(y_test, y_pred)
    
    #Calculate model statistics and print them
    r2 = r2_score(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    rmse = sqrt(mse)
    mae = mean_absolute_error(y_test, y_pred)
    print("*** FOLD *** training: " + trains[0]+ " " + trains[1] + " | test: " + test[0])
    print("Fold score:", score)
    print("R-squared: ", r2)
    print("MSE: ", mse)
    print("RMSE: ", rmse)
    print("MAE: ", mae)

    # Print the best parameters and score
    print("Best parameters: ", grid_search.best_params_)
    print("Best score: ", grid_search.best_score_)

Fitting 5 folds for each of 18 candidates, totalling 90 fits
