In [None]:
import shap
import pickle
import pandas as pd
import pdpbox
import numpy as np
import dalex as dx
import matplotlib.pyplot as plt
from math import sqrt
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.inspection import PartialDependenceDisplay, partial_dependence
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [None]:
# Load the weekly counts table and keep the target, the country label and the
# predictor columns (in this order).
feature_columns = [
    'dist_to_greenspace', 'dist_to_edu', 'bike_points', 'bus_stops', 'business_shops',
    'traffic_signals', 'cycle_length', 'lst_mean', 'pop_sum', 'build_area', 'ndvi_mean',
    'dist_to_bikePOI', 'dist_to_train', '3_way_int_count', 'median_speed',
    'orientation_entropy', 'lc_entropy', 'restaurants', 'dem_mean', 'dem_std',
]
df = pd.read_csv('weekly_new.csv')
df = df[['counts_week', 'country'] + feature_columns]

# Restrict the analysis to one country. Swap the literal for 'Netherlands' or
# 'UK', or assign `df` directly, to analyse another subset.
X_val = df.loc[df['country'] == 'USA']

# Split the subset into the raw target and the predictor matrix.
y_a = X_val['counts_week']
X = X_val.drop(columns=['counts_week', 'country'])

# Natural-log transform of the target (the small offset avoids log(0)),
# rounded to 5 decimals. A StandardScaler-based normalisation was tried
# previously and is no longer used.
y_a = np.log(y_a + 1e-8).round(5)

In [None]:
# Hold out 10% of the rows for evaluation; the fixed random_state keeps the
# split identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_a,
    test_size=0.10,
    random_state=42,
)

In [None]:
# Location of the pickled model that the following cells explain.
model_path = '/Users/winke/Documents/University/Thesis/Predicting_cycling/models/standardized/us_rf_3.pkl'

# Deserialize the model from disk.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load model files that come from a trusted source.
with open(model_path, mode="rb") as model_file:
    loaded_model = pickle.load(model_file)

In [None]:
# Evaluate the model on the held-out split, on the original weekly-count scale.
# The target was log-transformed before the split (y_a = log(count + 1e-8)) and
# the predictions are treated as log values, so BOTH sides are mapped back with
# the inverse transform before the error is computed. Comparing back-transformed
# predictions against the still-logged y_test would mix two scales and yield a
# meaningless RMSE.
y_pred = loaded_model.predict(X_test)
y_pred = np.exp(y_pred) - 1e-8
y_test_counts = np.exp(y_test) - 1e-8

mse = mean_squared_error(y_test_counts, y_pred)
rmse = sqrt(mse)
print(rmse)

In [None]:
# Quick conversion of a single log-scale value back to a weekly count, using the
# inverse of the log(count + 1e-8) transform applied to the target.
y_pred = 7.5
y_actual = -1e-8 + np.exp(y_pred)
print(y_actual)

In [None]:
# Scatter plot of actual vs. predicted weekly counts for a single city, with a
# least-squares trend line and the R2 score annotated in the top-left corner.

city = 'Philadelphia'

df_scat = pd.read_csv('/Users/winke/Documents/University/Thesis/Predicting_cycling/models/weekly_new.csv')

df_scat = df_scat[['counts_week', 'city', 'country', 'dist_to_greenspace', 'dist_to_edu', 'bike_points', 'bus_stops', 'business_shops', 'traffic_signals', 'cycle_length',
         'lst_mean', 'pop_sum', 'build_area', 'ndvi_mean', 'dist_to_bikePOI', 'dist_to_train', '3_way_int_count', 'median_speed', 'orientation_entropy', 'lc_entropy',
         'restaurants', 'dem_mean', 'dem_std']]

df_scat = df_scat[df_scat['city'] == city]
if df_scat.empty:
    # Fail early with a readable message instead of an opaque polyfit error.
    raise ValueError(f"No rows found for city {city!r}; cannot draw the scatter plot")

# NOTE: this cell rebinds the notebook-level names X, y_a and y_pred to the
# city subset (y_a on the raw count scale). Cells executed afterwards that read
# X or y_a see these values, not the country-level ones.
y_a = df_scat.loc[:, 'counts_week']
X = df_scat.drop(['counts_week', 'country', 'city'], axis=1)

#Opening the model for illustration
model_path = '/Users/winke/Documents/University/Thesis/Predicting_cycling/models/standardized/us_rf_3.pkl'

# Load the model from the file.
# NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
with open(model_path, "rb") as f:
    scatter_model = pickle.load(f)

# Predict with the model loaded in this cell (previously the cell loaded
# scatter_model but predicted with loaded_model from an earlier cell), then
# map the log-scale predictions back to weekly counts.
y_pred = scatter_model.predict(X)
y_pred = np.exp(y_pred) - 1e-8

print(y_pred)
scatter = pd.DataFrame({
    'Actual weekly count': y_a,
    'Predicted weekly count': y_pred,
})

fig, ax = plt.subplots()
ax.scatter(y_a, y_pred, color='black')
slope, intercept = np.polyfit(y_a, y_pred, 1)
# r2_score may return a plain Python float, which has no .round() method;
# the built-in round() works for both Python and NumPy floats.
r2 = round(float(r2_score(y_a, y_pred)), 3)

ax.text(.01, .99, s='R2: ' + str(r2), ha='left', va='top', transform=ax.transAxes, fontsize=14)
ax.plot(y_a, slope * y_a + intercept, color='red')
ax.set_xlabel('Actual weekly count', fontname='Times New Roman', fontsize=14)
ax.set_ylabel('Predicted weekly count', fontname='Times New Roman', fontsize=14)
fig.subplots_adjust(left=0.15)
fig.savefig('/Users/winke/Documents/University/Thesis/Predicting_cycling/models/Figures/Scatter plot/'+city+'.png', dpi=300)

plt.show()
print(r2)

In [None]:
# Partial dependence for three predictors, one panel each, on a shared y-axis.
# kind="both" requests the individual (ICE) curves together with their average.
# The figure is bound to `fig` rather than `_` so IPython's last-output
# variable is not overwritten.
fig, ax = plt.subplots(ncols=3, figsize=(12, 4), sharey=True, constrained_layout=True)

features_info = {
    "features": ['dem_mean', 'cycle_length', 'lst_mean'],
    "kind": "both"
}

# Stored as pdp_display, not `display`, so that IPython's built-in display()
# stays usable in the rest of the session.
pdp_display = PartialDependenceDisplay.from_estimator(
    loaded_model,
    X_train,
    **features_info,
    ax=ax
)

In [None]:
# Average partial dependence for ten predictors on a 2x5 grid with a shared
# y-axis; the figure is written to pdp_us.png.
# plt.rc changes the x tick label size for every figure created after this cell.
plt.rc('xtick', labelsize=16)
# Bound to `fig` rather than `_` so IPython's last-output variable is untouched.
fig, ax = plt.subplots(nrows=2, ncols=5, figsize=(20, 8), sharey=True, constrained_layout=True)

# For a single-feature plot, use: {"features": ['dem_mean'], "kind": "average"}
features_info = {
    "features": ['dem_mean', 'cycle_length', 'lst_mean', 'pop_sum', 'traffic_signals', 'dist_to_edu', 'dist_to_greenspace', 'median_speed',
                 'dist_to_train', 'bike_points'],
    "kind": "average"
}

# Stored as pdp_display, not `display`, so that IPython's built-in display()
# stays usable in the rest of the session.
pdp_display = PartialDependenceDisplay.from_estimator(
    loaded_model,
    X_train,
    **features_info,
    ax=ax
)
# Optional horizontal reference lines (disabled):
# plt.axhline(y=7.82, color='red', linestyle='--')
# plt.axhline(y=8.922658, color='red', linestyle='--')
# plt.axhline(y=8.5171931, color='red', linestyle='--')

# Save through the figure handle so the right figure is written even if
# another figure has become current in the meantime.
fig.savefig('pdp_us.png', dpi=300)

In [None]:
# Average partial dependence for pop_sum and traffic_signals, side by side on a
# shared y-axis. The figure is bound to `fig` rather than `_` so IPython's
# last-output variable is not overwritten.
fig, ax = plt.subplots(ncols=2, figsize=(8, 4), sharey=True, constrained_layout=True)

features_info = {
    "features": ['pop_sum', 'traffic_signals'],
    "kind": "average"
}

# Stored as pdp_display, not `display`, so that IPython's built-in display()
# stays usable in the rest of the session.
pdp_display = PartialDependenceDisplay.from_estimator(
    loaded_model,
    X_train,
    **features_info,
    ax=ax
)

In [None]:
# Build a dalex explainer around the loaded model and plot per-observation
# (ICE-style) profiles for three selected predictors.
#
# The target passed to the explainer is y_a. The cell previously referenced
# `y`, a name that is never defined in this notebook, so it raised NameError on
# a fresh kernel. X and y_a are always assigned together, so their rows line up.
# NOTE(review): dalex performance/residual methods expect y on the model's
# output scale (log counts here) — confirm y_a is log-transformed at this point
# before relying on them. model_profile itself only uses X.
exp = dx.Explainer(loaded_model, X, y_a, label="Random Forest")

# Compute model profiles for the three named variables.
pd_rf = exp.model_profile(variables = ['dem_mean', 'cycle_length', 'lst_mean'])

# geom='profiles' draws the individual profiles rather than only the aggregate.
pd_rf.plot(geom = 'profiles')

In [None]:
# Predict on the held-out split and print the raw model output, followed by the
# same predictions mapped back through the inverse of log(count + 1e-8).
y_pred = loaded_model.predict(X_test)
print(y_pred)
y_actual = -1e-8 + np.exp(y_pred)
print(y_actual)

In [None]:
# Compute SHAP values for every row of X and summarise them in a beeswarm plot.
# X_train is handed to the explainer as its second (masker / background data)
# argument; the additivity check is switched off for the explanation call.
explainer = shap.Explainer(loaded_model, X_train)
shap_values = explainer(
    X,
    check_additivity=False,
)

# TODO: SHAP dependence plots and grouped PDPs.

# Show up to 20 features in the summary.
shap.plots.beeswarm(shap_values, max_display=20)


In [None]:
# SHAP scatter for dem_mean, coloured by cycle_length; show=False keeps the
# figure open so it can be written to int_plot1.png.
main_effect = shap_values[:, 'dem_mean']
colour_by = shap_values[:, 'cycle_length']
shap.plots.scatter(main_effect, color=colour_by, show=False)
plt.savefig('int_plot1.png', dpi=300)

In [None]:
# SHAP scatter for cycle_length, coloured by lst_mean; show=False keeps the
# figure open so it can be written to int_plot2.png.
main_effect = shap_values[:, 'cycle_length']
colour_by = shap_values[:, 'lst_mean']
shap.plots.scatter(main_effect, color=colour_by, show=False)
plt.savefig('int_plot2.png', dpi=300)

In [None]:
# SHAP scatter for lst_mean, coloured by pop_sum; show=False keeps the figure
# open so it can be written to int_plot3.png.
main_effect = shap_values[:, 'lst_mean']
colour_by = shap_values[:, 'pop_sum']
shap.plots.scatter(main_effect, color=colour_by, show=False)
plt.savefig('int_plot3.png', dpi=300)

In [None]:
# SHAP scatter for pop_sum, coloured by traffic_signals; show=False keeps the
# figure open so it can be written to int_plot4.png.
main_effect = shap_values[:, 'pop_sum']
colour_by = shap_values[:, 'traffic_signals']
shap.plots.scatter(main_effect, color=colour_by, show=False)
plt.savefig('int_plot4.png', dpi=300)

In [None]:
# SHAP scatter for traffic_signals, coloured by dem_mean; show=False keeps the
# figure open so it can be written to int_plot5.png.
main_effect = shap_values[:, 'traffic_signals']
colour_by = shap_values[:, 'dem_mean']
shap.plots.scatter(main_effect, color=colour_by, show=False)
plt.savefig('int_plot5.png', dpi=300)

In [None]:
# Average partial dependence for five predictors in a single row with a shared
# y-axis; the figure is written to pdp_us_2.png.
# plt.rc changes the x tick label size for every figure created after this cell.
plt.rc('xtick', labelsize=16)
# Bound to `fig` rather than `_` so IPython's last-output variable is untouched.
fig, ax = plt.subplots(nrows=1, ncols=5, figsize=(20, 4), sharey=True, constrained_layout=True)

# For a single-feature plot, use: {"features": ['dem_mean'], "kind": "average"}
features_info = {
    "features": ['dem_std', 'ndvi_mean', 'bike_points', 'lst_mean', 'dem_mean'],
    "kind": "average"
}

# Stored as pdp_display, not `display`, so that IPython's built-in display()
# stays usable in the rest of the session.
pdp_display = PartialDependenceDisplay.from_estimator(
    loaded_model,
    X_train,
    **features_info,
    ax=ax
)
# Optional horizontal reference lines (disabled):
# plt.axhline(y=7.82, color='red', linestyle='--')
# plt.axhline(y=8.922658, color='red', linestyle='--')
# plt.axhline(y=8.5171931, color='red', linestyle='--')

# Save through the figure handle so the right figure is written even if
# another figure has become current in the meantime.
fig.savefig('pdp_us_2.png', dpi=300)