# OLS Model Building and Result Visualization

In [None]:
import matplotlib.pyplot as plt
import osmnx as ox
import pandas as pd
import seaborn as sns
import statsmodels.api as sm
from sklearn.model_selection import train_test_split
from statsmodels.stats.outliers_influence import variance_inflation_factor

In [None]:
# Root data directory and the sub-directory that holds the routing/model outputs
# (both are relative paths, so they resolve against the working directory).
file_path = "../Data/"
output_file_path = f"{file_path}Output/"

In [None]:
# Load the free-flow travel time and routing results for the 10000 OD pairs
# (sampled from OD pairs that have a 3am travel-time reference in Uber Movement).
freeflow_csv = output_file_path + "route_result_uber_hod3am_10000_freeflow.csv"
ff_df = pd.read_csv(freeflow_csv)

In [None]:
# Load the OD pairs with the Google route API travel time (duration) added, then
# keep only the OD identifiers and the Google route fields used further down.
# NOTE(review): this comment used to say 1000 OD pairs, but the file name says 10000 — confirm.
gg_df = pd.read_csv(output_file_path + "routesdataadded10000.csv")
google_columns = ["oid", "did", "distance", "duration", "polyline"]
gg_df_result = gg_df[google_columns]

In [None]:
# Put the free-flow and Google travel times side by side: join the two tables on
# the origin/destination identifiers (merge's default join type is used).
df = ff_df.merge(gg_df_result, on=["oid", "did"])

In [None]:
# Hold out 20% of the merged OD pairs as the test set; the fixed random_state
# makes the split repeatable.
train1, test1 = train_test_split(
    df,
    test_size=0.2,
    random_state=123,
)

## OLS Model 1: based on freeflow routing

In [None]:
# Model 1: regress the Google travel time (duration) on the free-flow travel time
# plus the traffic-control and turn counts of the free-flow route.
response = "duration"
predictors = [
    "freeflow_travel_time",
    "freeflow_signal_count",
    "freeflow_stop_count",
    "freeflow_crossing_count",
    "freeflow_give_way_count",
    "freeflow_mini_roundabout_count",
    "freeflow_left_count",
    "freeflow_right_count",
    "freeflow_u_count",
]
# Fit on complete cases only: rows missing the response or any predictor are dropped.
data = train1[[response, *predictors]].dropna()
X, y = data[predictors], data[response]
# sm.OLS does not add an intercept by itself, hence add_constant.
model1 = sm.OLS(y, sm.add_constant(X))
result1 = model1.fit()
print(result1.summary())

In [None]:
# calculate VIFs
# Build the design matrix the same way the model 1 regression does: same
# predictors, complete cases only, plus an intercept column.
# variance_inflation_factor() regresses each column on all the other columns it
# is given, so leaving the constant out yields uncentered (misleading) VIFs, and
# any NaN left in the matrix would propagate into the result.
vif_features = [
    "freeflow_travel_time",
    "freeflow_signal_count",
    "freeflow_stop_count",
    "freeflow_crossing_count",
    "freeflow_give_way_count",
    "freeflow_mini_roundabout_count",
    "freeflow_left_count",
    "freeflow_right_count",
    "freeflow_u_count",
]
X = train1[[response, *vif_features]].dropna()[vif_features]
# has_constant="add" forces the intercept column even if a predictor happens to be constant.
design = sm.add_constant(X, has_constant="add")
vif_data = pd.DataFrame(
    {
        "feature": vif_features,
        "VIF": [
            variance_inflation_factor(design.values, design.columns.get_loc(name))
            for name in vif_features
        ],
    }
)
# Only the predictors are reported; the intercept column is there for centering only.
print(vif_data)

### Check the traffic control counts in the whole study region and in the sampled OD pairs

##### In the study region

In [None]:
# Load the saved street-network graph (GraphML) from the output directory.
network_graphml = output_file_path + "LA_clip_convex_strong_network.graphml"
G = ox.io.load_graphml(network_graphml)

In [None]:
# Convert the graph into a node GeoDataFrame and an edge GeoDataFrame.
gdf_nodes, gdf_edges = ox.graph_to_gdfs(G, nodes=True, edges=True)

In [None]:
# Count the nodes per value of the node "highway" attribute.
# The column names are set explicitly: value_counts().reset_index() only yields a
# "count" column on pandas >= 2.0 (older versions name the columns "index" and
# "highway"), and the percentage step reads tc_df["count"].
tc_df = (
    gdf_nodes["highway"]
    .value_counts()
    .rename_axis("highway")
    .reset_index(name="count")
)

In [None]:
# Share of each "highway" value among ALL nodes of the graph (the denominator is
# the full node table), rendered as a percentage string with two decimals.
node_total = len(gdf_nodes)
node_share = tc_df["count"] / node_total
tc_df["percentage"] = node_share.astype(float).map(lambda share: f"{share:.2%}")

In [None]:
# Display the per-"highway"-value node counts and their percentage of all nodes.
tc_df

##### In the sampled 10,000 OD pairs

In [None]:
# For each traffic-control type, count the sampled OD pairs whose free-flow route
# contains at least one such control. Summing the boolean mask counts the matching
# rows without materialising a filtered copy of ff_df per control type.
# (A leftover placeholder dict that was overwritten right away has been removed.)
data = [
    ["traffic signal", int((ff_df["freeflow_signal_count"] >= 1).sum())],
    ["stop sign", int((ff_df["freeflow_stop_count"] >= 1).sum())],
    ["crossing", int((ff_df["freeflow_crossing_count"] >= 1).sum())],
    ["give way", int((ff_df["freeflow_give_way_count"] >= 1).sum())],
    ["mini roundabout", int((ff_df["freeflow_mini_roundabout_count"] >= 1).sum())],
]

In [None]:
# Tabulate the (control type, OD-pair count) rows collected in `data`.
tc_od_columns = [
    "Traffic controls",
    "Number of OD pairs with one or more corresponding controls",
]
tc_OD_df = pd.DataFrame(data=data, columns=tc_od_columns)

In [None]:
# Display, per traffic-control type, how many sampled OD pairs have at least one
# such control on their free-flow route.
tc_OD_df

## OLS Revised Model 1: only including traffic signals and stop signs as traffic controls in the model


In [None]:
# Revised model 1: same response, but the only traffic controls kept as
# predictors are signals and stop signs (the turn counts are retained).
response = "duration"
predictors = [
    "freeflow_travel_time",
    "freeflow_signal_count",
    "freeflow_stop_count",
    "freeflow_left_count",
    "freeflow_right_count",
    "freeflow_u_count",
]
# Fit on complete cases only: rows missing the response or any predictor are dropped.
data = train1[[response, *predictors]].dropna()
X, y = data[predictors], data[response]
# NOTE: this rebinds model1/result1, replacing the first model 1 fit.
model1 = sm.OLS(y, sm.add_constant(X))
result1 = model1.fit()
print(result1.summary())

In [None]:
# calculate VIFs
# Build the design matrix the same way the revised model 1 regression does: same
# predictors, complete cases only, plus an intercept column.
# variance_inflation_factor() regresses each column on all the other columns it
# is given, so leaving the constant out yields uncentered (misleading) VIFs, and
# any NaN left in the matrix would propagate into the result.
vif_features = [
    "freeflow_travel_time",
    "freeflow_signal_count",
    "freeflow_stop_count",
    "freeflow_left_count",
    "freeflow_right_count",
    "freeflow_u_count",
]
X = train1[[response, *vif_features]].dropna()[vif_features]
# has_constant="add" forces the intercept column even if a predictor happens to be constant.
design = sm.add_constant(X, has_constant="add")
vif_data = pd.DataFrame(
    {
        "feature": vif_features,
        "VIF": [
            variance_inflation_factor(design.values, design.columns.get_loc(name))
            for name in vif_features
        ],
    }
)
# Only the predictors are reported; the intercept column is there for centering only.
print(vif_data)

### Use the significant parameters in new revised model 1 to run the penalized routing (using OD_pairs_time_distance_uber_routes_1217.py)

In [None]:
# Load the penalized-routing result produced with the new revised model 1 parameters.
penalized_model1_csv = (
    output_file_path + "route_result_uber_hod3am_10000_bo_newmodel1_1215.csv"
)
p_df2 = pd.read_csv(penalized_model1_csv)

In [None]:
# Attach the Google travel time to the penalized-routing result by joining on
# the origin/destination identifiers.
df2 = p_df2.merge(gg_df_result, on=["oid", "did"])

In [None]:
# 80/20 train/test split, using the same test size and seed as the first split.
train2, test2 = train_test_split(
    df2,
    test_size=0.2,
    random_state=123,
)

## OLS Revised Model 2


In [None]:
# New model 2: regress the Google travel time (duration) on the penalized route's
# total time plus its signal, stop-sign and turn counts.
response = "duration"
predictors = [
    "total_time",
    "penalized_signal_count",
    "penalized_stop_count",
    "penalized_left_count",
    "penalized_right_count",
    "penalized_u_count",
]
# Fit on complete cases only: rows missing the response or any predictor are dropped.
data = train2[[response, *predictors]].dropna()
X, y = data[predictors], data[response]
# sm.OLS does not add an intercept by itself, hence add_constant.
model2 = sm.OLS(y, sm.add_constant(X))
result2 = model2.fit()
print(result2.summary())

In [None]:
# VIFs
# Build the design matrix the same way the model 2 regression does: same
# predictors, complete cases only, plus an intercept column.
# variance_inflation_factor() regresses each column on all the other columns it
# is given, so leaving the constant out yields uncentered (misleading) VIFs, and
# any NaN left in the matrix would propagate into the result.
vif_features = [
    "total_time",
    "penalized_signal_count",
    "penalized_stop_count",
    "penalized_left_count",
    "penalized_right_count",
    "penalized_u_count",
]
X = train2[[response, *vif_features]].dropna()[vif_features]
# has_constant="add" forces the intercept column even if a predictor happens to be constant.
design = sm.add_constant(X, has_constant="add")
vif_data = pd.DataFrame(
    {
        "feature": vif_features,
        "VIF": [
            variance_inflation_factor(design.values, design.columns.get_loc(name))
            for name in vif_features
        ],
    }
)
# Only the predictors are reported; the intercept column is there for centering only.
print(vif_data)


### Add the significant parameters in new revised model 2 to the previous parameters and run the penalized routing (using OD_pairs_time_distance_uber_routes_1217.py)


In [None]:
# Load the penalized-routing result produced after adding the new model 2 parameters.
penalized_model2_csv = (
    output_file_path + "route_result_uber_hod3am_10000_bo_newmodel2_1215.csv"
)
p_df3 = pd.read_csv(penalized_model2_csv)

In [None]:
# Attach the Google travel time by joining on the origin/destination identifiers.
df3 = p_df3.merge(gg_df_result, on=["oid", "did"])
# 80/20 train/test split, using the same test size and seed as the earlier splits.
train3, test3 = train_test_split(
    df3,
    test_size=0.2,
    random_state=123,
)

## OLS Revised Model 3

In [None]:
# New model 3: same specification as model 2, fitted on the routing result that
# was produced with the updated penalties (train3).
response = "duration"
predictors = [
    "total_time",
    "penalized_signal_count",
    "penalized_stop_count",
    "penalized_left_count",
    "penalized_right_count",
    "penalized_u_count",
]
# Fit on complete cases only: rows missing the response or any predictor are dropped.
data = train3[[response, *predictors]].dropna()
X, y = data[predictors], data[response]
# sm.OLS does not add an intercept by itself, hence add_constant.
model3 = sm.OLS(y, sm.add_constant(X))
result3 = model3.fit()
print(result3.summary())

In [None]:
# VIFs
# Build the design matrix the same way the model 3 regression does: same
# predictors, complete cases only, plus an intercept column.
# variance_inflation_factor() regresses each column on all the other columns it
# is given, so leaving the constant out yields uncentered (misleading) VIFs, and
# any NaN left in the matrix would propagate into the result.
vif_features = [
    "total_time",
    "penalized_signal_count",
    "penalized_stop_count",
    "penalized_left_count",
    "penalized_right_count",
    "penalized_u_count",
]
X = train3[[response, *vif_features]].dropna()[vif_features]
# has_constant="add" forces the intercept column even if a predictor happens to be constant.
design = sm.add_constant(X, has_constant="add")
vif_data = pd.DataFrame(
    {
        "feature": vif_features,
        "VIF": [
            variance_inflation_factor(design.values, design.columns.get_loc(name))
            for name in vif_features
        ],
    }
)
# Only the predictors are reported; the intercept column is there for centering only.
print(vif_data)

## Result Visualization

In [None]:
# Load the routing result that used the original parameters based on R5 and OSRM.
original_param_csv = (
    output_file_path + "route_result_uber_hod3am_10000_originalparam1215.csv"
)
original_df4 = pd.read_csv(original_param_csv)

In [None]:
# Attach the Google API travel time by joining on the origin/destination identifiers.
df4 = original_df4.merge(gg_df_result, on=["oid", "did"])

In [None]:
# 80/20 train/test split, using the same test size and seed as the earlier splits.
train4, test4 = train_test_split(
    df4,
    test_size=0.2,
    random_state=123,
)

In [None]:
# Pair the two test sets by OD identifiers: columns present in both frames get
# the default suffixes, "_x" for test3 (our model parameters) and "_y" for
# test4 (original R5/OSRM parameters).
test34 = test3.merge(test4, on=["oid", "did"])

In [None]:
# Ratio of each routing's total time to a reference travel time (Google duration
# or Uber time): "_x" columns come from our model's routing, "_y" columns from
# the original R5/OSRM-parameter routing.
ratio_specs = (
    ("penalized_travel_time_google_model", "total_time_x", "duration_x"),
    ("penalized_travel_time_uber_model", "total_time_x", "uber_time_x"),
    ("penalized_travel_time_google_r5_osrm", "total_time_y", "duration_y"),
    ("penalized_travel_time_uber_r5_osrm", "total_time_y", "uber_time_y"),
)
for ratio_col, routed_col, reference_col in ratio_specs:
    test34[ratio_col] = test34[routed_col] / test34[reference_col]

In [None]:
# Result comparison plotting: one box per travel-time ratio column.
ratio_columns = [
    "penalized_travel_time_google_model",
    "penalized_travel_time_uber_model",
    "penalized_travel_time_google_r5_osrm",
    "penalized_travel_time_uber_r5_osrm",
]
plt.figure(figsize=(20, 10))
ax = sns.boxplot(data=test34[ratio_columns], palette="coolwarm")
plt.title("Travel Comparison")
plt.ylabel("proportion compared with google route api and uber movement")
medians = test34[ratio_columns].median().values

# Write each column's median (two decimals) at x = column index, y = median value.
label_style = {
    "horizontalalignment": "center",
    "size": "x-small",
    "color": "b",
    "weight": "semibold",
}
for box_index, box_median in enumerate(medians):
    ax.text(box_index, box_median, f"{box_median:.2f}", **label_style)
plt.show()