In [1]:
import pandas as pd

df = pd.read_csv('data_processed_v2.csv', parse_dates=['date'], index_col='date')
df = df.sort_index()

y = df['fare']
X = df.drop('fare', axis=1)

In [2]:
from sklearn.preprocessing import StandardScaler

non_bool_cols = X.select_dtypes(exclude='bool').columns
bool_cols = X.select_dtypes(include='bool').columns
X[bool_cols] = X[bool_cols].astype(int)

scaler = StandardScaler()
X_scaled_non_bool = scaler.fit_transform(X[non_bool_cols])

X_scaled_non_bool = pd.DataFrame(X_scaled_non_bool, columns=non_bool_cols, index=X.index)
X_scaled = pd.concat([X_scaled_non_bool, X[bool_cols]], axis=1)

In [None]:
from itertools import product
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_squared_error
from statsmodels.tsa.statespace.sarimax import SARIMAX
import numpy as np

# Define parameter ranges
p_values = [0, 1, 2]
d_values = [0]
q_values = [0, 1, 2]
s = [4]
parameters = list(product(p_values, d_values, q_values, s))

# Define TimeSeriesSplit and initialize variables
tscv = TimeSeriesSplit(n_splits=5)
best_mse = float("inf")
best_order = None

# Iterate over all parameter combinations
for order in parameters:
    fold_mses = []  # Store MSE for each fold

    # Cross-validation loop
    for train_index, test_index in tscv.split(y):
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]
        X_train, X_test = X_scaled.iloc[train_index], X_scaled.iloc[test_index]

        # Reset indices for compatibility with SARIMAX
        X_train = X_train.reset_index(drop=True)
        y_train = y_train.reset_index(drop=True)
        X_test = X_test.reset_index(drop=True)
        y_test = y_test.reset_index(drop=True)

        # Fit model and make predictions
        try:
            model = SARIMAX(y_train, exog=X_train, seasonal_order=order)
            model_fit = model.fit(disp=False)
            y_pred = model_fit.predict(start=len(y_train), end=len(y_train) + len(y_test) - 1, exog=X_test)
            mse = mean_squared_error(y_test, y_pred)
            fold_mses.append(mse)
        except Exception as e:
            print(f"Error with order {order} in fold: {e}")
            fold_mses.append(np.nan)  # Append NaN if there's an error to exclude from mean calculation

    # Calculate mean MSE across all folds (excluding NaNs)
    mean_mse = np.nanmean(fold_mses)
    print(f"Order {order}, Mean MSE across folds: {mean_mse}")

    # Update best parameters if current mean MSE is lower
    if mean_mse < best_mse:
        best_mse = mean_mse
        best_order = order

print(f"\nBest order: {best_order} with Mean MSE: {best_mse}")



Order (0, 0, 0, 4), Mean MSE across folds: 2092.8115508028204




Order (0, 0, 1, 4), Mean MSE across folds: 2042.936234956817




Order (0, 0, 2, 4), Mean MSE across folds: 2035.5885189738287




Order (1, 0, 0, 4), Mean MSE across folds: 2029.122172628776


