In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from pprint import pprint
from scipy import stats

In [None]:
# Base data directory and the output sub-directory used for all reads/writes
file_path = '../Data/'
output_file_path = f'{file_path}Output/'

In [None]:
# Load the free-flow result that was computed on the non-simplified graph
ff_csv = output_file_path + 'result0226/' + "freeflow_OD3am_all_googlerouteapi_new_graph_new_turn_control_slight.csv"
ff_df = pd.read_csv(ff_csv)

In [None]:
# Load the Google route API result
gg_csv = output_file_path + 'googlerouteapi2024allresult.csv'
gg_df_result_all = pd.read_csv(gg_csv)
# Combine free-flow and Google travel times into one frame, matched on the
# shared (oid, did) key columns
df = pd.merge(ff_df, gg_df_result_all, on=['oid', 'did'])

In [None]:
# Gap between the Google duration and the free-flow travel time
df['diff'] = df['duration'].sub(df['travel_time'])

# Random Forest Regression Model

In [None]:
# Split into train and test sets (80/20, fixed seed).
# Each partition is copied so that the prediction columns added to test1
# further down are written to an independent frame; assigning columns onto
# the raw split output can trigger pandas' SettingWithCopyWarning.
train1, test1 = (
    part.copy()
    for part in train_test_split(df, test_size=0.2, random_state=123)
)

In [None]:
# Predictors: traffic-control and turn counts plus the free-flow travel time.
feature_cols = [
    'signal_count', 'stop_count', 'crossing_count', 'give_way_count',
    'mini_roundabout_count', 'left_count', 'slight_left_count',
    'right_count', 'slight_right_count', 'u_count', 'travel_time',
]
# Target: the 'duration' column.
x, y = train1[feature_cols], train1['duration']
x_test, y_test = test1[feature_cols], test1['duration']

In [None]:
# Fit a random forest with default hyperparameters to the training data.
# random_state is pinned (same seed as the other forests in this notebook)
# so the baseline numbers are reproducible across Restart & Run All.
regressor = RandomForestRegressor(random_state=123)
# Fit the regressor with x and y data
regressor.fit(x, y)

In [None]:
# Predict the target for the held-out test features with the fitted forest
predictions = regressor.predict(x_test)

In [None]:
# Mean absolute error of the predictions on the test set
Errors = np.abs(predictions - y_test)
mean_abs_error = round(np.mean(Errors), 3)
print('Average baseline error:', mean_abs_error, ' seconds')

In [None]:
# Keep the default forest's test predictions alongside the test rows
default_rf_predictions = regressor.predict(x_test)
test1['rf_predict_default'] = default_rf_predictions

In [None]:
# Default random forest on the test set: mean squared error and R-squared
mse, r2 = mean_squared_error(y_test, predictions), r2_score(y_test, predictions)
print(f'Mean Squared Error: {mse}')
print(f'R-squared: {r2}')


## Freeflow travel time prediction

In [None]:
# Free-flow travel time used directly as the prediction:
# mean squared error and R-squared against y_test
freeflow_pred = test1['travel_time']
mse, r2 = mean_squared_error(y_test, freeflow_pred), r2_score(y_test, freeflow_pred)
print(f'Mean Squared Error: {mse}')
print(f'R-squared: {r2}')

In [None]:
# Mean absolute error of the free-flow travel time against y_test
errors = (y_test - test1['travel_time']).abs()
mean_error = np.mean(errors)
print('Average Error: {:0.3f} seconds.'.format(mean_error))


In [None]:
# Median absolute error of the free-flow travel time against y_test
errors = (y_test - test1['travel_time']).abs()
median_error = np.median(errors)
print('Median Error: {:0.3f} seconds.'.format(median_error))

In [None]:
# Accuracy reported as 100 minus the mean absolute percentage error
relative_errors = errors / y_test
mape = 100 * np.mean(relative_errors)
accuracy = 100 - mape
print('Average Accuracy = {:0.2f}%.'.format(accuracy))

In [None]:
# Accuracy reported as 100 minus the MEDIAN absolute percentage error.
# The label says "Median" so this figure is not mistaken for the mean-based
# accuracy printed by the previous cell.
mape = 100 * np.median(errors / y_test)
accuracy = 100 - mape
print('Median Accuracy = {:0.2f}%.'.format(accuracy))

## Try modelling using the difference as the dependent variable

In [None]:
# Alternative formulation: model the 'diff' column from the count features
# only (travel_time is not among the predictors here).
diff_feature_cols = [
    'signal_count', 'stop_count', 'crossing_count', 'give_way_count',
    'mini_roundabout_count', 'left_count', 'slight_left_count',
    'right_count', 'slight_right_count', 'u_count',
]
x, y = train1[diff_feature_cols], train1['diff']
x_test, y_test = test1[diff_feature_cols], test1['diff']

In [None]:
# Seeded forest with default hyperparameters, fitted on the current x / y
regressor = RandomForestRegressor(random_state=123)
regressor.fit(X=x, y=y)

In [None]:
# Predict the current target for the held-out test features
predictions = regressor.predict(x_test)

In [None]:
# Mean absolute error of the predictions on the test set
Errors = np.abs(predictions - y_test)
mean_abs_error = round(np.mean(Errors), 3)
print('Average baseline error:', mean_abs_error, ' seconds')

In [None]:
# Evaluate the model on the test set: mean squared error and R-squared
mse, r2 = mean_squared_error(y_test, predictions), r2_score(y_test, predictions)
print(f'Mean Squared Error: {mse}')
print(f'R-squared: {r2}')

### It is a lot worse than modelling the Google travel time, so we abandon the model that uses the difference as the dependent variable

## Hyperparameter tuning of random forest regression

In [None]:
# Unfitted, seeded forest; only used below to list its default hyperparameters
regressor = RandomForestRegressor(random_state=123)

In [None]:
# Show the hyperparameters the current regressor is configured with
print('Parameters currently in use:\n')
current_params = regressor.get_params()
pprint(current_params)

In [None]:
# Create a randomized grid of hyperparameters for RandomizedSearchCV.
# Number of trees in the random forest: 100, 200, ..., 1000
n_estimators = [int(n) for n in np.linspace(start=100, stop=1000, num=10)]
# Number of features to consider at every split
max_features = ['sqrt', 'log2', None]
# Maximum number of levels in a tree: 10, 20, ..., 110, plus None (unlimited)
max_depth = [int(d) for d in np.linspace(10, 110, num=11)]
max_depth.append(None)
# Minimum number of samples required to split a node.
# None is not an accepted value for this parameter (it must be an int >= 2
# or a fraction), so it is left out; otherwise every sampled candidate that
# draws it fails to fit and is wasted.
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid; min_samples_leaf is included so the list defined
# above is actually searched.
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
pprint(random_grid)

In [None]:
# Features and labels used for hyperparameter tuning: the count features plus
# travel_time as predictors, 'duration' as the target.
tuning_feature_cols = [
    'signal_count', 'stop_count', 'crossing_count', 'give_way_count',
    'mini_roundabout_count', 'left_count', 'slight_left_count',
    'right_count', 'slight_right_count', 'u_count', 'travel_time',
]
train_features, train_labels = train1[tuning_feature_cols], train1['duration']
test_features, test_labels = test1[tuning_feature_cols], test1['duration']

In [None]:
# Base model whose hyperparameters are tuned
rf = RandomForestRegressor(random_state=123)
# Randomized search settings: 100 sampled combinations, 5-fold cross
# validation, scored by negative mean absolute error, on all available cores
search_options = dict(
    n_iter=100,
    scoring='neg_mean_absolute_error',
    cv=5,
    verbose=2,
    random_state=123,
    n_jobs=-1,
    return_train_score=True,
)
rf_random = RandomizedSearchCV(estimator=rf, param_distributions=random_grid,
                               **search_options)

# Run the search on the training data (trailing ';' suppresses the cell output)
rf_random.fit(train_features, train_labels);

In [None]:
# Display the hyperparameter combination selected by the randomized search
rf_random.best_params_

In [None]:
def evaluate(model, test_features, test_labels):
    """Print and return the accuracy (100 - MAPE) of a fitted model.

    Parameters
    ----------
    model : object exposing ``predict(test_features)``
    test_features : features handed to ``model.predict``
    test_labels : true target values; also the denominator of the
        percentage error

    Returns
    -------
    The accuracy in percent, computed as ``100 - MAPE``.
    """
    predicted = model.predict(test_features)
    abs_errors = abs(predicted - test_labels)
    mean_abs_error = np.mean(abs_errors)
    accuracy = 100 - 100 * np.mean(abs_errors / test_labels)
    report = (
        'Model Performance',
        'Average Error: {:0.4f} seconds.'.format(mean_abs_error),
        'Accuracy = {:0.2f}%.'.format(accuracy),
    )
    for line in report:
        print(line)

    return accuracy

In [None]:
# Baseline for comparison: an untuned, seeded 100-tree forest
base_model = RandomForestRegressor(n_estimators=100, random_state=123)
base_model.fit(train_features, train_labels)
base_accuracy = evaluate(model=base_model,
                         test_features=test_features,
                         test_labels=test_labels)

In [None]:
# Refit a forest with the hyperparameters selected by the randomized search.
# The values are taken from rf_random.best_params_ instead of being copied by
# hand, so this cell stays in sync if the search space or data changes.
best_random = RandomForestRegressor(random_state=123, **rf_random.best_params_)
best_random.fit(train_features, train_labels)
random_accuracy = evaluate(best_random, test_features, test_labels)

In [None]:
# Evaluate the tuned model on the feature/label set it was trained for.
# x_test / y_test cannot be used here: they were rebound in the 'diff'
# experiment to the feature set without travel_time and to the 'diff' target,
# which does not match what best_random was fitted on.
predictions = best_random.predict(test_features)
mse = mean_squared_error(test_labels, predictions)
print(f'Mean Squared Error: {mse}')
r2 = r2_score(test_labels, predictions)
print(f'R-squared: {r2}')

In [None]:
# Store the tuned model's predictions next to the test rows.
# test_features (which includes travel_time) is the feature set best_random
# was fitted on; x_test was rebound earlier to a set without travel_time.
test1['best_rf_predict'] = best_random.predict(test_features)


In [None]:
# Paired t-test between the tuned model's predicted duration and the
# observed 'duration' on the same test rows. y_test is not used because it
# was rebound to the 'diff' column in the earlier experiment.
stats.ttest_rel(test1['best_rf_predict'], test1['duration'])

In [None]:
# Write the test set, including the prediction columns added above, to csv
result_csv = output_file_path + 'result0331/' + 'test1_best_rf_predict0331.csv'
test1.to_csv(result_csv)
