# OLS Model Building for penalty estimation

In [None]:
import pandas as pd
import statsmodels.api as sm
from sklearn.model_selection import train_test_split

In [None]:
# Base data directory (relative to the notebook) and the output subfolder
# that the result CSVs are read from.
file_path = "../Data/"
output_file_path = f"{file_path}Output/"

In [None]:
# Load the free-flow travel time and routing result.
ff_df = pd.read_csv(
    f"{output_file_path}result0226/"
    "freeflow_OD3am_all_googlerouteapi_new_graph_new_turn_control_slight.csv",
)

In [None]:
# Load the travel-time result obtained from the Google route API.
gg_df_result_all = pd.read_csv(f"{output_file_path}googlerouteapi2024allresult.csv")

In [None]:
# Combine the free-flow result and the Google result into one frame, matching
# rows on the shared "oid"/"did" key columns (pandas' default inner join).
df = ff_df.merge(gg_df_result_all, on=["oid", "did"])

In [None]:
# Hold out 20% of the rows as a test set; the fixed random_state keeps the
# split reproducible between runs.
train1, test1 = train_test_split(
    df,
    random_state=123,
    test_size=0.2,
)

## OLS Model 1: based on freeflow routing

In [None]:
# Regression target: Google travel time ("duration") minus the free-flow edge
# traversal travel time ("travel_time").
# `assign` returns a new frame rather than writing a column into the object
# handed back by train_test_split, which avoids pandas' SettingWithCopyWarning.
train1 = train1.assign(diff=train1["duration"] - train1["travel_time"])

In [None]:
# Model 1, full specification: explain the gap between Google travel time and
# free-flow travel time with every traffic-control count and turn count.
# NOTE(review): `assign(intercept=0)` appends a column of zeros, not ones. A
# zero column contributes nothing to the fitted values, so this is effectively
# a fit through the origin. Confirm that is intended; a real intercept needs a
# column of ones (e.g. sm.add_constant).
res = sm.OLS(
    train1["diff"],
    train1.loc[
        :,
        [
            "signal_count",
            "stop_count",
            "crossing_count",
            "give_way_count",
            "mini_roundabout_count",
            "left_count",
            "slight_left_count",
            "right_count",
            "slight_right_count",
            "u_count",
        ],
    ].assign(intercept=0),
).fit()
print(res.summary())

In [None]:
# Refit keeping only the predictors retained as significant and positive:
# give_way_count, mini_roundabout_count and slight_left_count are dropped.
# NOTE(review): the "intercept" column is all zeros, so no constant term is
# actually estimated — confirm a through-origin fit is intended.
res2 = sm.OLS(
    train1["diff"],
    train1.loc[
        :,
        [
            "signal_count",
            "stop_count",
            "crossing_count",
            "left_count",
            "right_count",
            "slight_right_count",
            "u_count",
        ],
    ].assign(intercept=0),
).fit()
print(res2.summary())

In [None]:
# Same reduced specification as the previous fit, additionally leaving out
# slight_right_count.
# NOTE(review): the "intercept" column is all zeros, so no constant term is
# actually estimated — confirm a through-origin fit is intended.
res3 = sm.OLS(
    train1["diff"],
    train1.loc[
        :,
        [
            "signal_count",
            "stop_count",
            "crossing_count",
            "left_count",
            "right_count",
            "u_count",
        ],
    ].assign(intercept=0),
).fit()
print(res3.summary())

In [None]:
# Reduced specification without slight_right_count and without crossing_count.
# NOTE(review): the "intercept" column is all zeros, so no constant term is
# actually estimated — confirm a through-origin fit is intended.
res4 = sm.OLS(
    train1["diff"],
    train1.loc[
        :,
        [
            "signal_count",
            "stop_count",
            "left_count",
            "right_count",
            "u_count",
        ],
    ].assign(intercept=0),
).fit()
print(res4.summary())

In [None]:
# Turn-only specification: the five turn counts are the sole predictors.
# NOTE(review): the "intercept" column is all zeros, so no constant term is
# actually estimated — confirm a through-origin fit is intended.
res5 = sm.OLS(
    train1["diff"],
    train1.loc[
        :,
        [
            "left_count",
            "slight_left_count",
            "right_count",
            "slight_right_count",
            "u_count",
        ],
    ].assign(intercept=0),
).fit()
print(res5.summary())

## OLS Revised Model 1: Re-estimate the OLS model on the routing results produced with the coefficients estimated by the OLS models above


### Use the turn-only coefficients from Model 1 to run the penalized routing (using OD_pairs_time_distance_uber_routes_0224.py)

In [None]:
# Load the penalized-routing result (the file name marks it as the
# "model1_parsi" run).
p_df2 = pd.read_csv(
    f"{output_file_path}result0319parsimonious/"
    "penalized_OD3am_all_googlerouteapi_model1_parsi.csv",
)

In [None]:
# Attach the Google travel times to the penalized-routing result, matching
# rows on the shared "oid"/"did" key columns.
df2 = p_df2.merge(gg_df_result_all, on=["oid", "did"])

In [None]:
# 80/20 train/test split with a fixed random_state for reproducibility.
train2, test2 = train_test_split(
    df2,
    random_state=123,
    test_size=0.2,
)

## OLS Revised Model 2


In [None]:
# Regression target: Google travel time ("duration") minus the routed time
# that already includes the turn penalty ("total_time_with_turn_penalty").
# `assign` returns a new frame rather than writing a column into the object
# handed back by train_test_split, which avoids pandas' SettingWithCopyWarning.
train2 = train2.assign(diff=train2["duration"] - train2["total_time_with_turn_penalty"])

In [None]:
# Regress the remaining gap (after the turn penalty) on the five turn counts.
# NOTE(review): the "intercept" column is all zeros, so no constant term is
# actually estimated — confirm a through-origin fit is intended.
res6 = sm.OLS(
    train2["diff"],
    train2.loc[
        :,
        [
            "left_count",
            "slight_left_count",
            "right_count",
            "slight_right_count",
            "u_count",
        ],
    ].assign(intercept=0),
).fit()
print(res6.summary())


### Add the significant coefficients in new revised model 2 to the previous penalties in model 1 and run the penalized routing (using OD_pairs_time_distance_uber_routes_0224.py)



In [None]:
# Load the penalized-routing result (the file name marks it as the
# "model2_parsi" run).
p_df3 = pd.read_csv(
    f"{output_file_path}result0319parsimonious/"
    "penalized_OD3am_all_googlerouteapi_model2_parsi.csv",
)

In [None]:
# Attach the Google travel times to this penalized-routing result, matching
# rows on the shared "oid"/"did" key columns.
df3 = p_df3.merge(gg_df_result_all, on=["oid", "did"])
# 80/20 train/test split with a fixed random_state for reproducibility.
train3, test3 = train_test_split(
    df3,
    random_state=123,
    test_size=0.2,
)

## OLS Revised Model 3

In [None]:
# Regression target: Google travel time ("duration") minus the routed time
# that already includes the turn penalty ("total_time_with_turn_penalty").
# `assign` returns a new frame rather than writing a column into the object
# handed back by train_test_split, which avoids pandas' SettingWithCopyWarning.
train3 = train3.assign(diff=train3["duration"] - train3["total_time_with_turn_penalty"])

In [None]:
# Regress the remaining gap (after the turn penalty) on the five turn counts.
# NOTE(review): the "intercept" column is all zeros, so no constant term is
# actually estimated — confirm a through-origin fit is intended.
res7 = sm.OLS(
    endog=train3["diff"],
    exog=train3[
        ["left_count", "slight_left_count", "right_count", "slight_right_count", "u_count"]
    ].assign(intercept=0),
).fit()
# Report the model fitted in this cell (previously printed res4 by mistake).
print(res7.summary())