In [None]:
# Cell 1: Imports and Setup
import fireducks.pandas as pd
import numpy as np
from statsmodels.tsa.statespace.sarimax import SARIMAX
from sklearn.metrics import mean_squared_error, mean_absolute_error
import mlflow
import mlflow.sklearn
from tqdm import tqdm
from sklearn.preprocessing import StandardScaler
import warnings
# Silence every warning category for the remainder of the notebook.
warnings.filterwarnings('ignore')

# Point MLflow at the tracking server and select the experiment that
# receives the runs created further down.
mlflow.set_tracking_uri("http://localhost:5000")
mlflow.set_experiment('SARIMAX_Example')

# Read the sequences table, parse TIME as datetimes and use it as the index.
df = (
    pd.read_parquet('../data/sequences.parquet')
    .assign(TIME=lambda frame: pd.to_datetime(frame['TIME']))
    .set_index('TIME')
)

# Split configuration, expressed in rows (24 rows per day).
SEQUENCE_ID = 1        # the single sequence modelled in this example
TRAIN_SIZE = 7 * 24    # 168 rows = 7 days
VAL_SIZE = 2 * 24      # 48 rows = 2 days
TEST_SIZE = 1 * 24     # 24 rows = 1 day

In [None]:
# Cell 2: Define simpler model configurations
# Candidate model specifications, one dict per MLflow run.
#   name           - label used in the run name and in printed results
#   order          - non-seasonal (p, d, q)
#   seasonal_order - (P, D, Q, s); all zeros switches seasonality off
#   lags           - recorded with the run parameters
model_configs = [
    dict(name=label, order=arima_order, seasonal_order=(0, 0, 0, 0), lags=lag_list)
    for label, arima_order, lag_list in (
        ('simple_model', (1, 0, 0), [1]),    # AR(1) only
        ('base_model', (1, 0, 1), [1, 2]),   # ARMA(1, 1)
    )
]

In [None]:
# Cell 3: Define preprocessing function with more robust scaling
def preprocess_data(data):
    """Drop IQR outliers of ``Power`` and add a standardized ``Power_scaled`` column.

    Rows are kept only when ``Power`` lies inside
    ``[Q1 - 1.5 * IQR, Q3 + 1.5 * IQR]``, where the quartiles are computed
    over every row of ``data``. A ``StandardScaler`` is then fitted on the
    surviving ``Power`` values and its output stored as ``Power_scaled``.

    Note that both the outlier bounds and the scaler statistics are derived
    from all rows passed in, so a caller that splits the result afterwards
    gets statistics that also reflect the later rows.

    Args:
        data: DataFrame containing a numeric ``Power`` column.

    Returns:
        tuple: ``(filtered, scaler)`` where ``filtered`` is a copy of the
        retained rows with the extra ``Power_scaled`` column and ``scaler``
        is the fitted ``StandardScaler``.
    """
    power = data['Power']

    # Tukey fences built from the interquartile range
    lower_quartile = power.quantile(0.25)
    upper_quartile = power.quantile(0.75)
    spread = upper_quartile - lower_quartile
    lower_fence = lower_quartile - 1.5 * spread
    upper_fence = upper_quartile + 1.5 * spread

    # Work on a copy so the caller's frame is left untouched
    inside_fences = (power >= lower_fence) & (power <= upper_fence)
    data = data[inside_fences].copy()

    # Standardize the retained values; the scaler is returned so predictions
    # can be mapped back to the original units
    scaler = StandardScaler()
    data['Power_scaled'] = scaler.fit_transform(data[['Power']])

    return data, scaler

# Pick out the rows of the configured sequence, then clean and scale them.
# The copy keeps `df` itself unmodified.
is_target_sequence = df['sequence'] == SEQUENCE_ID
sequence_data, scaler = preprocess_data(df[is_target_sequence].copy())

In [None]:
# Cell 4: Training and evaluation with improved error handling
# Split once, before the loop: the slices do not depend on the model config.
# `.iloc` makes the positional slicing explicit. The model is trained on the
# scaled series, while validation/test targets stay in original units because
# predictions are inverse-transformed before scoring.
val_start = TRAIN_SIZE
test_start = val_start + VAL_SIZE
test_stop = test_start + TEST_SIZE
train_data = sequence_data['Power_scaled'].iloc[:val_start]
val_data = sequence_data['Power'].iloc[val_start:test_start]
test_data = sequence_data['Power'].iloc[test_start:test_stop]

if len(sequence_data) < test_stop:
    # Surface short splits up front instead of silently scoring on fewer rows
    # than the logged val_size/test_size suggest.
    print(
        f"Warning: sequence {SEQUENCE_ID} has {len(sequence_data)} rows after "
        f"preprocessing but the configured split needs {test_stop}; "
        "later splits will be shorter than configured."
    )


def _inverse_scale(scaled_predictions):
    """Map scaled model output back to original ``Power`` units as a 1-D array.

    The predictions are converted with ``np.asarray`` first because the model
    returns a pandas Series when it was fitted on one, and a Series has no
    ``reshape`` method.
    """
    column = np.asarray(scaled_predictions, dtype=float).reshape(-1, 1)
    return scaler.inverse_transform(column).ravel()


for config in model_configs:
    # One MLflow run per configuration
    with mlflow.start_run(run_name=f"sequence_{SEQUENCE_ID}_{config['name']}"):
        try:
            # Log parameters
            mlflow.log_params({
                'sequence': SEQUENCE_ID,
                'model_name': config['name'],
                'order_p': config['order'][0],
                'order_d': config['order'][1],
                'order_q': config['order'][2],
                'seasonal_P': config['seasonal_order'][0],
                'seasonal_D': config['seasonal_order'][1],
                'seasonal_Q': config['seasonal_order'][2],
                'seasonal_period': config['seasonal_order'][3],
                'lags': config['lags'],
                'train_size': TRAIN_SIZE,
                'val_size': VAL_SIZE,
                'test_size': TEST_SIZE
            })

            # Build the model on the scaled training slice.
            # NOTE(review): 'approximate_diffuse' is the documented name of the
            # approximate diffuse initialization; the previous value
            # 'approximate' is not a documented option - confirm against the
            # installed statsmodels version.
            model = SARIMAX(
                train_data,
                order=config['order'],
                seasonal_order=config['seasonal_order'],
                enforce_stationarity=False,
                initialization='approximate_diffuse',
                trend='c'  # Add constant trend
            )

            # Powell optimizer with a capped iteration budget
            fitted_model = model.fit(disp=False,
                                     method='powell',
                                     maxiter=50)

            # Both windows lie beyond the training sample, so these are
            # multi-step forecasts that never see validation/test actuals.
            # The `dynamic` flag only affects in-sample prediction, so it is
            # not passed.
            n_train, n_val, n_test = len(train_data), len(val_data), len(test_data)
            val_predictions = _inverse_scale(
                fitted_model.predict(start=n_train, end=n_train + n_val - 1)
            )
            test_predictions = _inverse_scale(
                fitted_model.predict(
                    start=n_train + n_val,
                    end=n_train + n_val + n_test - 1,
                )
            )

            # Calculate metrics in original units
            metrics = {
                'val_rmse': np.sqrt(mean_squared_error(val_data, val_predictions)),
                'val_mae': mean_absolute_error(val_data, val_predictions),
                'test_rmse': np.sqrt(mean_squared_error(test_data, test_predictions)),
                'test_mae': mean_absolute_error(test_data, test_predictions)
            }

            # Log metrics
            mlflow.log_metrics(metrics)

            # Print results
            print(f"\nResults for {config['name']}:")
            print(f"Validation RMSE: {metrics['val_rmse']:.4f}")
            print(f"Test RMSE: {metrics['test_rmse']:.4f}")

        except Exception as e:
            # Best effort: one failing configuration must not stop the others.
            # Record the failure on the run so it is not mistaken for a
            # successful run that merely has no metrics.
            mlflow.set_tag('error', f"{type(e).__name__}: {e}"[:500])
            print(f"Error in {config['name']}: {str(e)}")