#### Training an ARIMA model for FCR estimation to compare its performance with our vehicle-specific metamodels
#### Ehsan Moradi, Ph.D. Candidate

In [2]:
# Load required libraries
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.metrics import r2_score
from sklearn.preprocessing import StandardScaler
from statsmodels.tsa.arima.model import ARIMA


In [3]:
# General settings
# Vehicle identifiers; each one is used both as a folder name and as a file-name
# prefix when reading and writing the Excel workbooks.
VEHICLES = (
    "019 Hyundai Elantra GT 2019 (2.0L Auto)",
    "025 Chevrolet Captiva 2010 (2.4L Auto)",
    "027 Chevrolet Cruze 2011 (1.8L Manual)",
)
# Columns passed to the ARIMA model as exogenous regressors.
FEATURES = [
    "SPD_KH",
    "ACC_MS2",
    "ALT_M",
]
# Column that is modelled and predicted.
DEPENDENT = "FCR_LH"
# INPUT_* select the workbook that is read; OUTPUT_* select the workbook that is written.
SETTINGS = dict(
    INPUT_TYPE="ENSEMBLE",
    INPUT_INDEX="06",
    OUTPUT_TYPE="ARIMA",
    OUTPUT_INDEX="COMPARE",
)


In [4]:
# Load sample data from Excel to a pandas dataframe
def load_from_Excel(vehicle, sheet, settings):
    """Read one sheet of a vehicle's processed input workbook.

    Parameters
    ----------
    vehicle : str
        Vehicle identifier; used as the folder name and the file-name prefix.
    sheet : str or int
        Sheet selector forwarded to ``pandas.read_excel`` as ``sheet_name``.
    settings : dict
        Must contain the keys ``"INPUT_TYPE"`` and ``"INPUT_INDEX"``.

    Returns
    -------
    pandas.DataFrame
        Sheet contents, with the first row used as the header.
    """
    input_type = settings["INPUT_TYPE"]
    # <root>/<vehicle>/Processed/<input type>/
    folder = "".join(
        (
            "../../Academia/PhD/Field Experiments/Veepeak/",
            vehicle,
            "/Processed/",
            input_type,
            "/",
        )
    )
    # <vehicle> - <input type> - <input index>.xlsx
    workbook = vehicle + f" - {input_type} - {settings['INPUT_INDEX']}.xlsx"
    return pd.read_excel(folder + workbook, sheet_name=sheet, header=0)


In [5]:
# Save the predicted field back to Excel file
def save_to_excel(df, vehicle, settings):
    """Write a dataframe to the vehicle's output workbook.

    The workbook is written to
    ``<root>/<vehicle>/Processed/<OUTPUT_TYPE>/<vehicle> - <OUTPUT_TYPE> - <OUTPUT_INDEX>.xlsx``.
    An existing file is overwritten (``mode="w"``), and the output folder is
    created when it does not exist yet.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to write; column headers are written, the index is not.
    vehicle : str
        Vehicle identifier; used as the folder name and the file-name prefix.
    settings : dict
        Must contain the keys ``"OUTPUT_TYPE"`` and ``"OUTPUT_INDEX"``.

    Returns
    -------
    None
    """
    directory = (
        "../../Academia/PhD/Field Experiments/Veepeak/"
        + vehicle
        + "/Processed/"
        + settings["OUTPUT_TYPE"]
        + "/"
    )
    output_file = vehicle + " - {0} - {1}.xlsx".format(
        settings["OUTPUT_TYPE"], settings["OUTPUT_INDEX"]
    )
    output_path = directory + output_file
    # A new OUTPUT_TYPE has no folder yet; create it instead of failing when the
    # writer tries to open the file. Done only after both path parts were built,
    # so invalid settings never touch the filesystem.
    os.makedirs(directory, exist_ok=True)
    with pd.ExcelWriter(output_path, engine="openpyxl", mode="w") as writer:
        # `index` is documented as a bool; False states the intent explicitly.
        df.to_excel(writer, header=True, index=False)
    print("{} -> Data is saved to Excel successfully!".format(vehicle))
    return None


In [None]:
# Training the ARIMA model and generating out-of-sample predictions
# For every vehicle: standardize the data, fit ARIMA(6, 0, 3) with exogenous
# features on the first 70% of the rows, forecast the remaining 30%, undo the
# scaling, and store the predictions next to the observations.
predictions, observations = {}, {}
for vehicle in VEHICLES:
    df = load_from_Excel(vehicle, "Sheet1", SETTINGS)
    # Apply feature scaling
    # NOTE(review): both scalers are fitted on the complete dataset, so the
    # held-out 30% contributes to the scaling statistics. Left unchanged here to
    # keep results comparable; confirm this is intended.
    scaler_features = StandardScaler().fit(df[FEATURES])
    scaler_dependent = StandardScaler().fit(df[[DEPENDENT]])
    df[FEATURES] = scaler_features.transform(df[FEATURES])
    df[[DEPENDENT]] = scaler_dependent.transform(df[[DEPENDENT]])
    # Train-Test splitting (70%-30%)
    split_point = int(.7 * len(df))
    train = df[:split_point].copy(deep=True)
    # Train the ARIMA model
    # The AR order is chosen as 6 (in accordance with our RNN modeling lag order)
    # As the variables could be considered stationary (they are bounded and trendless), "difference" is set to 0.
    # Moving-average order of 3 is applied.
    model_l6 = ARIMA(train[DEPENDENT], exog=train[FEATURES], order=(6, 0, 3))
    fit_l6 = model_l6.fit(method_kwargs={"warn_convergence": False})
    # Out-of-sample prediction
    scaled_predictions = np.asarray(
        fit_l6.predict(
            start=len(train), end=len(df) - 1, exog=df[FEATURES][split_point:]
        )
    )
    # Apply inverse scaling
    df[FEATURES] = scaler_features.inverse_transform(df[FEATURES])
    # The scaler only accepts 2-D input: reshape the 1-D forecast to a single
    # column, then flatten the result so downstream cells get a 1-D array.
    predictions[vehicle] = scaler_dependent.inverse_transform(
        scaled_predictions.reshape(-1, 1)
    ).ravel()
    df[[DEPENDENT]] = scaler_dependent.inverse_transform(
        df[[DEPENDENT]])
    observations[vehicle] = df[DEPENDENT][split_point:]
    # Select the test rows by position so the assignment does not depend on the
    # index labels of the loaded dataframe.
    df.loc[df.index[split_point:], "FCR_LH_PRED_ARIMA"] = predictions[vehicle]
    save_to_excel(df, vehicle, SETTINGS)


In [None]:
# Time-series plot of ARIMA predictions vs. true observations for a selected time-window
for vehicle in VEHICLES:
    # Flatten both series to plain 1-D arrays so they can be sliced by position.
    predicted = np.ravel(predictions[vehicle])
    observed = np.ravel(observations[vehicle])
    # Show at most the first 750 test samples; clamp the window so a shorter test
    # split does not produce an x/y length mismatch.
    window = min(750, len(predicted), len(observed))
    steps = range(window)
    fig, ax = plt.subplots(figsize=(12, 4))
    sns.lineplot(x=steps, y=predicted[:window], color="blue", ax=ax)
    sns.lineplot(x=steps, y=observed[:window], color="red", ax=ax)
    # Labels are matched to the two lines in drawing order.
    plt.legend(labels=["Predictions (AR Order = 6)", "True Observations"])
    plt.show()


In [None]:
# Scatter plot to compare ARIMA predictions and true observations
for vehicle in VEHICLES:
    observed, predicted = observations[vehicle], predictions[vehicle]
    # Both axes share the same upper limit: the larger of the two maxima.
    axis_limit = np.max([np.max(observed), np.max(predicted)])
    fig, ax = plt.subplots(figsize=(4, 4))
    sns.scatterplot(x=observed, y=predicted, ax=ax)
    ax.set_xlim(0, axis_limit)
    ax.set_ylim(0, axis_limit)
    ax.set_xlabel("True Observations")
    ax.set_ylabel("ARIMA Predictions (AR Order = 6)")
    plt.show()


In [None]:
# Calculate R-squared score
# One line per vehicle: "<vehicle>: <R-squared of predictions vs. observations>"
for vehicle in VEHICLES:
    score = r2_score(observations[vehicle], predictions[vehicle])
    print(f"{vehicle}: {score}")
