In [None]:
import pandas as pd
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np
from scipy import stats

In [None]:
# Base data directory and the output sub-directory that every CSV below is read from.
# Both strings end with '/' so file names can be appended directly.
file_path = '../Data/'
output_file_path = f'{file_path}Output/'

In [None]:
# Google travel-time result (reference values; `duration` is compared against further down)
google_result_csv = output_file_path + 'googlerouteapi2024allresult.csv'
result = pd.read_csv(google_result_csv)

In [None]:
# Free-flow routing result computed on the simplified graph
ff_sim_csv = ''.join([
    output_file_path,
    'result0319parsimonious/',
    'freeflow_OD3am_all_googlerouteapi_simplified_parsi.csv',
])
ff_sim = pd.read_csv(ff_sim_csv)

In [None]:
# Prefix the free-flow columns with `ff_` so they stay distinguishable after merging,
# then join them onto the Google API result by the (oid, did) key pair
# (presumably origin / destination ids -- confirm against the data dictionary).
ff_column_names = {'distance': 'ff_distance', 'travel_time': 'ff_time', 'route': 'ff_route'}
ff_sim = ff_sim.rename(columns=ff_column_names)
ff_sim_df_result = ff_sim[['oid', 'did', *ff_column_names.values()]]

# Inner join: only key pairs present in both frames survive.
df = result.merge(ff_sim_df_result, on=['oid', 'did'])

In [None]:
# OLS routing result based on both turn and traffic-control penalties
penalized_param_csv = ''.join([
    output_file_path,
    'result0226/',
    'penalized_OD3am_all_googlerouteapi_new_graph_new_turn_control_model2_no_slight.csv',
])
penalized_param = pd.read_csv(penalized_param_csv)

In [None]:
# Tag the OLS (turn + traffic-control) columns with an `all_ols_` prefix and
# inner-join them onto the running comparison frame by (oid, did).
all_ols_column_names = {
    'distance': 'all_ols_distance',
    'total_time_with_turn_penalty': 'all_ols_time',
    'route': 'all_ols_route',
}
penalized_param = penalized_param.rename(columns=all_ols_column_names)
penalized_param_result = penalized_param[['oid', 'did', *all_ols_column_names.values()]]
df = df.merge(penalized_param_result, on=['oid', 'did'])

In [None]:
# Remove the "Unnamed: 0*" columns (presumably index columns left over from earlier
# CSV round-trips -- confirm). errors='ignore' keeps this cell re-runnable: without it,
# a second execution (or an input file that lacks one of these columns) raises KeyError.
df = df.drop(
    columns=['Unnamed: 0.3', 'Unnamed: 0.2', 'Unnamed: 0.1', 'Unnamed: 0'],
    errors='ignore',
)

In [None]:
# OLS routing result based on the turn penalty only, on the non-simplified graph
non_simplify_turn_csv = ''.join([
    output_file_path,
    'result0319parsimonious/',
    'penalized_OD3am_all_googlerouteapi_model2_parsi.csv',
])
non_simplify_turn = pd.read_csv(non_simplify_turn_csv)

In [None]:
# Give the non-simplified turn-penalty columns their own names and
# inner-join them onto the running comparison frame by (oid, did).
non_simplify_column_names = {
    'distance': 'non_simplify_turn_distance',
    'total_time_with_turn_penalty': 'non_simplify_turn_time',
    'route': 'non_simplify_route',
}
non_simplify_turn = non_simplify_turn.rename(columns=non_simplify_column_names)
non_simplify_turn_result = non_simplify_turn[['oid', 'did', *non_simplify_column_names.values()]]
df = df.merge(non_simplify_turn_result, on=['oid', 'did'])


In [None]:
# OLS routing result based on the turn penalty only, on the simplified graph
simplify_turn_csv = ''.join([
    output_file_path,
    'result0319parsimonious/',
    'penalized_OD3am_all_googlerouteapi_simplified_parsi_model2.csv',
])
simplify_turn = pd.read_csv(simplify_turn_csv)

In [None]:
# Give the simplified-graph turn-penalty columns their own names and
# inner-join them onto the running comparison frame by (oid, did).
simplify_column_names = {
    'distance': 'simplify_turn_distance',
    'total_time_with_turn_penalty': 'simplify_turn_time',
    'route': 'simplify_route',
}
simplify_turn = simplify_turn.rename(columns=simplify_column_names)
simplify_turn_result = simplify_turn[['oid', 'did', *simplify_column_names.values()]]
df = df.merge(simplify_turn_result, on=['oid', 'did'])


In [None]:
# Ratio of every modelled travel time to the Google `duration`.
# The `_pct` columns hold plain ratios (1.0 == equal); they are not multiplied by 100.
for time_col in ('ff_time', 'all_ols_time', 'non_simplify_turn_time', 'simplify_turn_time'):
    df[time_col + '_pct'] = df[time_col] / df['duration']

In [None]:
# function for prediction accuracy evaluation
def evaluate(predictions, test_labels):
    """Print error statistics of `predictions` against `test_labels`.

    Both arguments must support element-wise arithmetic with each other
    (e.g. pandas Series or NumPy arrays).

    Prints the mean and median absolute error, plus "accuracy" defined as
    100 minus the mean / median absolute percentage error.

    Returns:
        The mean-based accuracy (100 - MAPE), in percent.
    """
    abs_err = np.abs(predictions - test_labels)
    rel_err = abs_err / test_labels  # relative to the true value

    mean_ape = 100 * np.mean(rel_err)
    median_ape = 100 * np.median(rel_err)
    accuracy_mean = 100 - mean_ape
    accuracy_median = 100 - median_ape

    report = (
        'Model Performance',
        f'Average Error: {np.mean(abs_err):0.4f} seconds.',
        f'Median Error: {np.median(abs_err):0.4f} seconds.',
        f'accuracy_mean = {accuracy_mean:0.2f}%.',
        f'accuracy_median = {accuracy_median:0.2f}%.',
    )
    for line in report:
        print(line)

    return accuracy_mean

In [None]:
# Train/test split: key pairs listed in test3.csv are the test set, everything else is train.
# output_file_path already ends with '/', so the sub-path must not start with one
# (the original produced '../Data/Output//result0319parsimonious/test3.csv').
test = pd.read_csv(output_file_path + 'result0319parsimonious/test3.csv')
# Keep one row per (oid, did): a duplicated key would silently duplicate rows of df in the merge.
test = test[['oid', 'did']].drop_duplicates().assign(split='test')
# Drop a 'split' column left by a previous run so re-executing this cell does not
# produce split_x / split_y and then fail on the fillna below.
df = df.drop(columns='split', errors='ignore').merge(test, how='left', on=['oid', 'did'])
# Rows without a match in the test file belong to the training set.
df['split'] = df['split'].fillna('train')
# .copy() makes test_df independent of df, so later column assignments are unambiguous.
test_df = df[df['split'] == 'test'].copy()

In [None]:
# Accuracy of the free-flow travel time against the Google duration on the test split
evaluate(predictions=test_df['ff_time'], test_labels=test_df['duration'])

In [None]:
# Test-split error metrics of the OLS (turn + traffic-control) time vs. the Google duration
Errors = (test_df['all_ols_time'] - test_df['duration']).abs()
print('Average baseline error:', round(Errors.mean(), 3), ' seconds')
# Evaluating the model
mse = mean_squared_error(y_true=test_df['duration'], y_pred=test_df['all_ols_time'])
print('Mean Squared Error: {}'.format(mse))
r2 = r2_score(y_true=test_df['duration'], y_pred=test_df['all_ols_time'])
print('R-squared: {}'.format(r2))

In [None]:
# Test-split error metrics of the non-simplified turn-penalty time vs. the Google duration
Errors = (test_df['non_simplify_turn_time'] - test_df['duration']).abs()
print('Average baseline error:', round(Errors.mean(), 3), ' seconds')
# Evaluating the model
mse = mean_squared_error(y_true=test_df['duration'], y_pred=test_df['non_simplify_turn_time'])
print('Mean Squared Error: {}'.format(mse))
r2 = r2_score(y_true=test_df['duration'], y_pred=test_df['non_simplify_turn_time'])
print('R-squared: {}'.format(r2))

In [None]:
# Test-split error metrics of the simplified-graph turn-penalty time vs. the Google duration
Errors = (test_df['simplify_turn_time'] - test_df['duration']).abs()
print('Average baseline error:', round(Errors.mean(), 3), ' seconds')
# Evaluating the model
mse = mean_squared_error(y_true=test_df['duration'], y_pred=test_df['simplify_turn_time'])
print('Mean Squared Error: {}'.format(mse))
r2 = r2_score(y_true=test_df['duration'], y_pred=test_df['simplify_turn_time'])
print('R-squared: {}'.format(r2))

In [None]:
# Paired t-test on the test split: simplified-graph turn-penalty time vs. Google duration
stats.ttest_rel(a=test_df['simplify_turn_time'], b=test_df['duration'])

In [None]:
# Paired t-test on the test split: free-flow time vs. Google duration
stats.ttest_rel(a=test_df['ff_time'], b=test_df['duration'])


In [None]:
# Paired t-test on the test split: OLS (turn + traffic-control) time vs. Google duration
stats.ttest_rel(a=test_df['all_ols_time'], b=test_df['duration'])

In [None]:
# libraries & dataset
import seaborn as sns
import matplotlib.pyplot as plt

# A single theme call is enough: sns.set is an alias of sns.set_theme, so the original
# second call simply superseded the first. "darkgrid" gives the grey background.
sns.set_theme(style="darkgrid", rc={'figure.figsize': (11.7, 8.27)})

# Overlay the travel-time distributions of the full sample on one explicit Axes.
fig, ax = plt.subplots()
sns.histplot(data=df, x="duration", color="skyblue", label="google travel time", kde=True, ax=ax)
sns.histplot(data=df, x="ff_time", color="red", label="shortest edge traversal time", kde=True, ax=ax)
sns.histplot(data=df, x="all_ols_time", color="yellow", label="travel time based on OLS parameters", kde=True, ax=ax)

# The automatic x label names only one of the plotted columns; label the shared axis
# explicitly (seconds, per the units printed by the evaluation cells).
ax.set_xlabel("travel time (seconds)")
ax.legend()
plt.show()

# machine learning predictions

In [None]:
# Random-forest predictions (column `best_rf_predict` is used below).
# NOTE(review): this file is named "test1_..." while the test split is read from
# test3.csv -- confirm both refer to the same split; the inner merge further down
# silently drops key pairs that do not match.
rf_predictions_csv = output_file_path + 'result0331/' + 'test1_best_rf_predict0331.csv'
rf = pd.read_csv(rf_predictions_csv)

In [None]:
# Keep only the join keys and the prediction column
rf_result = rf.loc[:, ['oid', 'did', 'best_rf_predict']]

In [None]:
# Attach the random-forest predictions to the test rows (inner join on the key pair)
test_df = pd.merge(test_df, rf_result, on=['oid', 'did'])


In [None]:
# Ratio of the random-forest prediction to each reference travel time.
# Presumably "gg" = Google duration and "uber" = Uber mean travel time -- confirm.
rf_ratio_columns = (
    ('rf_predict_gg_pct', 'duration'),
    ('rf_predict_uber_pct', 'mean_travel_time'),
)
for ratio_col, reference_col in rf_ratio_columns:
    test_df[ratio_col] = test_df['best_rf_predict'] / test_df[reference_col]

In [None]:
# Paired t-test on the test split: random-forest prediction vs. Google duration
stats.ttest_rel(a=test_df['best_rf_predict'], b=test_df['duration'])

In [None]:
# libraries & dataset
import seaborn as sns
import matplotlib.pyplot as plt

# A single theme call is enough: sns.set is an alias of sns.set_theme, so the original
# second call simply superseded the first. "darkgrid" gives the grey background.
sns.set_theme(style="darkgrid", rc={'figure.figsize': (11.7, 8.27)})

# Overlay the travel-time distributions of the test split on one explicit Axes.
fig, ax = plt.subplots()
sns.histplot(data=test_df, x="duration", color="skyblue", label="google travel time", kde=True, ax=ax)
sns.histplot(data=test_df, x="ff_time", color="red", label="shortest freeflow traversal time", kde=True, ax=ax)
sns.histplot(data=test_df, x="all_ols_time", color="yellow", label="travel time based on OLS parameters", kde=True, ax=ax)
# Fixed copy-paste label: this series is the random-forest prediction, not the OLS time.
sns.histplot(data=test_df, x="best_rf_predict", color="green", label="travel time based on random forest", kde=True, ax=ax)

# The automatic x label names only one of the plotted columns; label the shared axis
# explicitly (seconds, per the units printed by the evaluation cells).
ax.set_xlabel("travel time (seconds)")
ax.legend()
plt.show()

In [None]:
# Compare the full-sample and test-split travel-time distributions in one figure.
# A single theme call is enough: sns.set is an alias of sns.set_theme.
sns.set_theme(style="darkgrid", rc={'figure.figsize': (11.7, 8.27)})

# (frame, column, colour, legend label). The test-split series use a darker shade of the
# colour of their full-sample counterpart: the original drew both with identical colours,
# so the legend entries could not be told apart.
hist_series = [
    (df, "duration", "skyblue", "all sample: google travel time"),
    (df, "ff_time", "red", "all sample: shortest freeflow traversal time"),
    (df, "all_ols_time", "yellow", "all sample: travel time based on OLS parameters"),
    (test_df, "duration", "navy", "test: google travel time"),
    (test_df, "ff_time", "darkred", "test:shortest freeflow traversal time"),
    (test_df, "all_ols_time", "goldenrod", "test:travel time based on OLS parameters"),
    (test_df, "best_rf_predict", "green", "test:travel time based on random forest"),
]

fig, ax = plt.subplots()
for frame, column, colour, legend_label in hist_series:
    sns.histplot(data=frame, x=column, color=colour, label=legend_label, kde=True, ax=ax)

# The automatic x label names only one of the plotted columns; label the shared axis
# explicitly (seconds, per the units printed by the evaluation cells).
ax.set_xlabel("travel time (seconds)")
ax.legend()
plt.show()