- Figure out which training dataset is used to predict which test dataset
- Make a pandas table that combines all the results, in order to be able to evaluate the different models

## All countries

This code uses spatial cross-validation to evaluate Random Forest, XGBoost and SVM models, by dividing the data into group folds according to the country in which the counter is found. 

In [None]:
import numpy as np
import pandas as pd
import xgboost as xgb
import statsmodels.api as sm
from math import sqrt
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV, KFold, GroupKFold
from statsmodels.stats.outliers_influence import variance_inflation_factor

### Data selection

In [None]:
# Read the weekly counts table and discard the identifier/location columns
df = pd.read_csv('/Users/winke/Documents/University/Thesis/Predicting_cycling/weekly_new.csv')
dropped_columns = ['name', 'year', 'week', 'latitude', 'longitude', 'espg']
df = df.drop(columns=dropped_columns)

# Keep the target, the grouping column and the predictors, in a fixed order
selected_columns = [
    'counts_week', 'country', 'dem_mean', 'cycle_length', 'lst_mean',
    'pop_sum', 'dem_std', 'build_area', 'street_length_total',
    'ndvi_mean', 'restaurants', 'dist_to_bikePOI', '3_way_int_count',
    'bike_points', 'daily_shops', 'median_speed', 'business_shops',
    'traffic_signals', 'dist_to_edu', 'dist_to_train',
    'streets_per_node_avg', 'circuity_avg', 'lc_entropy', 'bus_stops',
    'dist_to_greenspace',
]
df = df[selected_columns]

# `country` is an alias of the full table (all countries are used here)
country = df

# Predictors: everything except the target and the country label
X = country.drop(columns=['counts_week', 'country'])

# Target: weekly counts, standardised with StandardScaler.
# The scaler is fitted on a single-column 2-D array and the result is
# flattened back into a Series with a fresh 0..n-1 index.
target_column = country['counts_week'].to_numpy().reshape(-1, 1)
scaler = StandardScaler()
data_scaled = scaler.fit_transform(target_column)
y = pd.Series(data_scaled.reshape(-1))

Spatial cross validation

In [None]:
# Spatial cross-validation: GroupKFold uses the country label as the group,
# so a country never appears in both the train and the test side of a fold
countries = country['country'].values
k = GroupKFold(n_splits=3)
country_kfold = k.split(X, y, countries)

# Materialise the fold generator as a list of (train, test) index pairs and
# also expose the train and test index arrays as two parallel lists
city_cv = [(fold_train, fold_test) for fold_train, fold_test in country_kfold]
train_indices = [fold_train for fold_train, _ in city_cv]
test_indices = [fold_test for _, fold_test in city_cv]

Regular train/test split

In [None]:
# Random 90/10 hold-out split used by the regular (non-spatial) model runs;
# the fixed random_state makes the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.10,
    random_state=42,
)

## Linear Regression (OLS)

In [None]:
# Design matrix with an intercept column; both the design matrix and the
# target get a clean 0..n-1 index so that OLS aligns them row by row
Xc = sm.add_constant(X).reset_index(drop=True)
y = y.reset_index(drop=True)

# Fit ordinary least squares on all observations and show the summary table
model = sm.OLS(endog=y, exog=Xc).fit()
print(model.summary())

In [None]:
# Feature-only view of the design matrix (kept for the column labels)
Xnc = Xc.drop('const', axis=1)

# Check multicollinearity with VIF.
# variance_inflation_factor regresses column i on the *other columns of the
# matrix it is given* and does not add an intercept itself. The matrix must
# therefore still contain the constant, otherwise the auxiliary regressions
# are forced through the origin and the VIFs are inflated. So the VIF is
# computed on Xc (with 'const') and only the real features are reported.
design_matrix = Xc.values
feature_positions = [
    position for position, column in enumerate(Xc.columns) if column != 'const'
]

vif = pd.DataFrame()
vif["VIF"] = [
    variance_inflation_factor(design_matrix, position)
    for position in feature_positions
]
vif["features"] = Xnc.columns

print(vif)

## Random Forest

With spatial cross validation

In [None]:
# Hyper-parameter grid searched independently inside every spatial fold
param_grid = {'n_estimators': [50, 100, 200, 300, 400, 500],
            'max_depth': [5, 10, 15, 20, 25],
            'min_samples_split': [2, 5, 10]}

# Inner cross-validation used by the grid search (same for every fold)
kf = KFold(n_splits=2)

# Outer loop: one iteration per spatial fold (countries held out together)
for train_index, test_index in city_cv:
    # Use the fold's positional indices to split the data
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # Country names on each side of the split, in order of first appearance
    trains = df.iloc[train_index]['country'].drop_duplicates().values
    test = df.iloc[test_index]['country'].drop_duplicates().values

    # Tune a fresh random forest on the training countries only
    rf = RandomForestRegressor()
    grid_search = GridSearchCV(rf, param_grid, cv=kf, scoring='neg_mean_squared_error')
    grid_search.fit(X_train, y_train)

    # Predict the held-out countries with the refitted best estimator
    y_pred = grid_search.predict(X_test)

    # Model statistics on the held-out countries; the fold score is the MSE
    mse = mean_squared_error(y_test, y_pred)
    score = mse
    r2 = r2_score(y_test, y_pred)
    rmse = sqrt(mse)
    mae = mean_absolute_error(y_test, y_pred)

    # List every country of the fold instead of assuming exactly two training
    # countries and one test country (identical output in that case)
    print("*** FOLD *** training: " + " ".join(map(str, trains))
          + " | test: " + " ".join(map(str, test)))
    print("Fold score:", score)
    print("R-squared: ", r2)
    print("MSE: ", mse)
    print("RMSE: ", rmse)
    print("MAE: ", mae)

    # Print the best parameters and the inner-CV score (negative MSE)
    print("Best parameters: ", grid_search.best_params_)
    print("Best score: ", grid_search.best_score_)

With K-fold cross validation

In [None]:
# Recreate the random 90/10 split: the spatial cross-validation loops in this
# file rebind X_train/X_test/y_train/y_test to their last fold, so without
# this line the "regular" evaluation would silently reuse a spatial fold.
# The fixed random_state makes this identical to the original split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.10, random_state=42)

# Define the parameter grid to search
param_grid = {'n_estimators': [100, 200, 300, 400, 500],
            'max_depth': [5, 10, 15, 20, 25],
            'min_samples_split': [2, 5, 10]}

# Create the random forest model
rf = RandomForestRegressor()

# Create the K-fold cross-validation object
kf = KFold(n_splits=5)

# Create the grid search object
grid_search = GridSearchCV(rf, param_grid, cv=kf, scoring='neg_mean_squared_error')

# Fit the grid search to the training data
grid_search.fit(X_train, y_train)

# Print the best parameters and the cross-validated score (negative MSE)
print("Best parameters: ", grid_search.best_params_)
print("Best score: ", grid_search.best_score_)

# Create model from best parameters
best_params = grid_search.best_params_
model_rf = RandomForestRegressor(**best_params)

# Fit the final model and evaluate *that* model on the test split
# (previously the predictions came from grid_search and model_rf was unused)
model_rf.fit(X_train, y_train)
y_pred = model_rf.predict(X_test)

# Calculate model statistics and print them.
# No F1 score: it is a classification metric, the target here is continuous,
# and f1_score is not imported anywhere in this file.
r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)

print("R-squared: ", r2)
print("MSE: ", mse)
print("RMSE: ", rmse)
print("MAE: ", mae)

## XGBOOST

With spatial cross validation

In [None]:
import xgboost as xgb
from sklearn.model_selection import GridSearchCV, KFold

# Define the parameter grid to search
param_grid = {'n_estimators': [50, 100, 200, 250, 300, 400],
              'max_depth': [3, 5, 10, 15],
              'learning_rate': [0.1, 0.3, 0.5, 1],
              'subsample': [0.8, 1],
              'colsample_bytree': [0.8, 1]
              }

# Outer loop: one iteration per spatial fold (countries held out together)
for train_index, test_index in city_cv:
    # Use the fold's positional indices to split the data
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # Country names of THIS fold. They must be recomputed here: otherwise the
    # label printed below would reuse whatever `trains`/`test` were left over
    # from an earlier cell and be the same (wrong) text for every fold.
    trains = df.iloc[train_index]['country'].drop_duplicates().values
    test = df.iloc[test_index]['country'].drop_duplicates().values

    # Create the XGBoost model
    xgb_reg = xgb.XGBRegressor(objective ='reg:squarederror')

    # Create the grid search object (no cv argument: scikit-learn's default
    # cross-validation splitter is used for the inner search)
    grid_search = GridSearchCV(xgb_reg,
                   param_grid=param_grid,
                   scoring='neg_mean_squared_error',
                   verbose=True)

    # Fit the grid search to the training countries
    grid_search.fit(X_train, y_train)

    # Predict the held-out countries with the refitted best estimator
    y_pred = grid_search.predict(X_test)

    # Model statistics on the held-out countries; the fold score is the MSE
    mse = mean_squared_error(y_test, y_pred)
    score = mse
    r2 = r2_score(y_test, y_pred)
    rmse = sqrt(mse)
    mae = mean_absolute_error(y_test, y_pred)

    # List every country of the fold rather than assuming a 2 / 1 split
    print("*** FOLD *** training: " + " ".join(map(str, trains))
          + " | test: " + " ".join(map(str, test)))
    print("Fold score:", score)
    print("R-squared: ", r2)
    print("MSE: ", mse)
    print("RMSE: ", rmse)
    print("MAE: ", mae)

    # Print the best parameters and the inner-CV score (negative MSE)
    print("Best parameters: ", grid_search.best_params_)
    print("Best score: ", grid_search.best_score_)

With regular K-Fold cross validation

In [None]:
import xgboost as xgb
from sklearn.model_selection import GridSearchCV, KFold

# Recreate the random 90/10 split: the spatial cross-validation loops in this
# file rebind X_train/X_test/y_train/y_test to their last fold, so without
# this line the "regular" evaluation would silently reuse a spatial fold.
# The fixed random_state makes this identical to the original split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.10, random_state=42)

# Define the parameter grid to search
param_grid = {'n_estimators': [50, 100, 200, 250, 300],
              'max_depth': [3, 5, 10, 15],
              'learning_rate': [0.1, 0.3, 0.5, 1],
              'subsample': [0.8, 1],
              'colsample_bytree': [0.8, 1]
              }

# Create the XGBoost model
xgb_reg = xgb.XGBRegressor(objective ='reg:squarederror')

# Create the K-fold cross-validation object
kf = KFold(n_splits=5)

# Create the grid search object
grid_search = GridSearchCV(xgb_reg,
                param_grid=param_grid,
                cv=kf,
                scoring='neg_mean_squared_error',
                verbose=True)

# Fit the grid search to the training data
grid_search.fit(X_train, y_train)

# Print the best parameters and the cross-validated score (negative MSE)
print("Best parameters: ", grid_search.best_params_)
print("Best score: ", grid_search.best_score_)

# Create model from best parameters
best_params = grid_search.best_params_
model_xgb = xgb.XGBRegressor(**best_params)

# Fit the final model and get its predictions on the test data
model_xgb.fit(X_train, y_train)
y_pred = model_xgb.predict(X_test)

# Calculate the R-squared, MSE, RMSE and MAE
r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)

print("R-squared: ", r2)
print("MSE: ", mse)
print("RMSE: ", rmse)
print("MAE: ", mae)

## SVM

With spatial cross validation

In [None]:
from sklearn.svm import SVR

# Define the parameter grid to search
param_grid = {'C': [0.1, 1, 10],
              'gamma': [0.1, 1, 10],
              'kernel': ['linear', 'rbf']
              }

# NOTE(review): only the target is standardised in this file; the features in
# X reach SVR unscaled. SVR is sensitive to feature scale — consider scaling X.

# Outer loop: one iteration per spatial fold (countries held out together)
for train_index, test_index in city_cv:
    # Use the fold's positional indices to split the data
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # Country names of THIS fold. They must be recomputed here: otherwise the
    # label printed below would reuse whatever `trains`/`test` were left over
    # from an earlier cell and be the same (wrong) text for every fold.
    trains = df.iloc[train_index]['country'].drop_duplicates().values
    test = df.iloc[test_index]['country'].drop_duplicates().values

    # Create the SVM model
    svm = SVR()

    # Create the grid search object (no cv argument: scikit-learn's default
    # cross-validation splitter is used for the inner search)
    grid_search = GridSearchCV(svm,
                    param_grid=param_grid,
                    scoring='neg_mean_squared_error',
                    verbose=True)

    # Fit the grid search to the training countries
    grid_search.fit(X_train, y_train)

    # Predict the held-out countries with the refitted best estimator
    y_pred = grid_search.predict(X_test)

    # Model statistics on the held-out countries; the fold score is the MSE
    mse = mean_squared_error(y_test, y_pred)
    score = mse
    r2 = r2_score(y_test, y_pred)
    rmse = sqrt(mse)
    mae = mean_absolute_error(y_test, y_pred)

    # List every country of the fold rather than assuming a 2 / 1 split
    print("*** FOLD *** training: " + " ".join(map(str, trains))
          + " | test: " + " ".join(map(str, test)))
    print("Fold score:", score)
    print("R-squared: ", r2)
    print("MSE: ", mse)
    print("RMSE: ", rmse)
    print("MAE: ", mae)

    # Print the best parameters and the inner-CV score (negative MSE)
    print("Best parameters: ", grid_search.best_params_)
    print("Best score: ", grid_search.best_score_)

With regular K-fold cross validation

In [None]:
# Recreate the random 90/10 split: the spatial cross-validation loops in this
# file rebind X_train/X_test/y_train/y_test to their last fold, so without
# this line the "regular" evaluation would silently reuse a spatial fold.
# The fixed random_state makes this identical to the original split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.10, random_state=42)

# Define the parameter grid to search
param_grid = {'C': [0.1, 1, 10, 100],
              'gamma': [0.001, 0.01, 0.1, 1],
              'kernel': ['linear', 'rbf']
              }

# NOTE(review): only the target is standardised in this file; the features in
# X reach SVR unscaled. SVR is sensitive to feature scale — consider scaling X.

# Create the SVM model
svm_reg = SVR()

# Create the K-fold cross-validation object
kf = KFold(n_splits=5)

# Create the grid search object
grid_search = GridSearchCV(svm_reg,
                param_grid=param_grid,
                cv=kf,
                scoring='neg_mean_squared_error',
                verbose=True)

# Fit the grid search to the training data
grid_search.fit(X_train, y_train)

# Print the best parameters and the cross-validated score (negative MSE)
print("Best parameters: ", grid_search.best_params_)
print("Best score: ", grid_search.best_score_)

# Create model from best parameters
best_params = grid_search.best_params_
model_svm = SVR(**best_params)

# Fit the final model and get its predictions on the test data
model_svm.fit(X_train, y_train)
y_pred = model_svm.predict(X_test)

# Calculate the R-squared, MSE, RMSE and MAE
r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)

print("R-squared: ", r2)
print("MSE: ", mse)
print("RMSE: ", rmse)
print("MAE: ", mae)