In [3]:
import pandas as pd
import numpy as np
from statsmodels.tsa.arima.model import ARIMA
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import MinMaxScaler
import warnings

# Suppress warnings to keep the output clean during grid search.
# NOTE(review): this filter is process-wide, so it also hides deprecation
# warnings from pandas/statsmodels; consider narrowing it by category.
warnings.filterwarnings("ignore")

# --- 1. Load and Preprocess Data ---
print("Loading and preprocessing data...")
# Load the dataset
data = pd.read_excel('/content/AirQualityUCI-dataset.xlsx')

# Assign clear column names (positional: the sheet must have exactly 15 columns)
data.columns = ['Date', 'Time', 'CO_true', 'PT08_S1', 'NMHC_true', 'Benzene_true', 'PT08_S2',
                'NOx_true', 'PT08_S3', 'NO2_true', 'PT08_S4', 'PT08_S5', 'Temperature', 'RH', 'AH']

# Drop non-numeric columns for modeling
data.drop(['Date', 'Time'], axis=1, inplace=True)

# Replace the placeholder for missing values with NaN
data.replace(-200, np.nan, inplace=True)

# Fill missing values by carrying the last valid observation forward.
# DataFrame.ffill() replaces fillna(method='ffill'), whose `method` keyword
# is deprecated in recent pandas releases. Leading NaNs (no earlier value to
# carry forward) are left as NaN, exactly as before.
data.ffill(inplace=True)

# Define the columns we want to forecast
columns_to_forecast = ['CO_true', 'PT08_S1', 'NMHC_true', 'Benzene_true', 'PT08_S2',
                       'NOx_true', 'PT08_S3', 'NO2_true', 'PT08_S4', 'PT08_S5',
                       'Temperature', 'RH', 'AH']
print("Data loaded and preprocessed successfully.\n")


# --- 2. Hyperparameter Tuning Function ---
def find_best_arima_order(train_data, p_values=range(0, 3), d_values=range(0, 2),
                          q_values=range(0, 3)):
    """
    Finds the best ARIMA (p,d,q) order for a time series using a grid search.

    Every combination of the candidate orders is fitted and the one with the
    lowest AIC wins. Ties keep the first combination encountered (p varies
    slowest, q fastest).

    Args:
        train_data (array-like): The training time series data.
        p_values (iterable of int): Candidate autoregressive orders.
            Defaults to 0, 1, 2.
        d_values (iterable of int): Candidate differencing orders.
            Defaults to 0, 1.
        q_values (iterable of int): Candidate moving-average orders.
            Defaults to 0, 1, 2.
    Returns:
        tuple or None: The (p,d,q) order with the lowest AIC, or None when no
        candidate could be fitted (or the grid is empty).
    """
    from itertools import product

    best_aic = float("inf")
    best_order = None

    for order in product(p_values, d_values, q_values):
        try:
            model_fit = ARIMA(train_data, order=order).fit()
        except Exception:
            # Best-effort search: a candidate that fails to fit is skipped.
            # Catching Exception (not a bare except) keeps KeyboardInterrupt
            # and SystemExit working, so a long search can still be aborted.
            continue
        # A NaN AIC compares False here and is therefore never selected.
        if model_fit.aic < best_aic:
            best_aic = model_fit.aic
            best_order = order
    return best_order


# --- 3. Main Training and Forecasting Loop ---
# Number of trailing observations held out of training and then forecast.
# Defined once so the train/test split and the forecast length cannot drift apart.
FORECAST_HORIZON = 48

# Maps column name -> {'Best Order', 'RMSE', 'Forecast'} for every column
# that could be modeled.
results = {}

for col in columns_to_forecast:
    print(f"--- Processing column: {col} ---")

    # Split data: everything but the last FORECAST_HORIZON points is training
    series = data[col].astype(float)
    train_data = series.iloc[:-FORECAST_HORIZON]
    test_data = series.iloc[-FORECAST_HORIZON:]

    # Feature Scaling (fitted on the training part only, so the hold-out
    # values do not influence the scaler)
    scaler = MinMaxScaler()
    scaled_train = scaler.fit_transform(train_data.values.reshape(-1, 1))

    # Hyperparameter Tuning
    print("Finding optimal ARIMA order...")
    best_order = find_best_arima_order(scaled_train)
    if best_order is None:
        print(f"Could not find a suitable order for {col}. Skipping.")
        continue
    print(f"Best order found for {col}: {best_order}")

    # Train final model
    print("Training final model...")
    model = ARIMA(scaled_train, order=best_order)
    model_fit = model.fit()

    # Forecast exactly as many steps as were held out, then map the values
    # back to the original scale. np.asarray keeps reshape working whether
    # the forecast comes back as an ndarray or a pandas object.
    forecast_scaled = np.asarray(model_fit.forecast(steps=len(test_data)))
    forecast = scaler.inverse_transform(forecast_scaled.reshape(-1, 1)).flatten()

    # Evaluate
    rmse = np.sqrt(mean_squared_error(test_data, forecast))
    print(f"RMSE for {col}: {rmse}\n")

    # Store results
    results[col] = {'Best Order': best_order, 'RMSE': rmse, 'Forecast': forecast}


# --- 4. Display Final Results Summary ---
# One row per column that was successfully modeled.
summary_df = pd.DataFrame(
    [(col, res['Best Order'], res['RMSE']) for col, res in results.items()],
    columns=['Target Variable', 'Best ARIMA Order', 'Final RMSE'],
)

print("\n--- Forecasting Summary ---")
print(summary_df.to_string())

# You can also view the forecast for a specific column, e.g., 'CO_true'
print("\n--- Sample Forecast for CO_true ---")
# `results` only holds columns for which an entry was stored, so look the key
# up defensively instead of raising KeyError when CO_true is absent.
sample_result = results.get('CO_true')
if sample_result is None:
    print("No forecast available for CO_true.")
else:
    print(sample_result['Forecast'])

Loading and preprocessing data...
Data loaded and preprocessed successfully.

--- Processing column: CO_true ---
Finding optimal ARIMA order...
Best order found for CO_true: (2, 1, 2)
Training final model...
RMSE for CO_true: 1.0184698545314455

--- Processing column: PT08_S1 ---
Finding optimal ARIMA order...
Best order found for PT08_S1: (2, 1, 1)
Training final model...
RMSE for PT08_S1: 166.9083656679457

--- Processing column: NMHC_true ---
Finding optimal ARIMA order...
Best order found for NMHC_true: (2, 1, 2)
Training final model...
RMSE for NMHC_true: 2.806418941948365e-13

--- Processing column: Benzene_true ---
Finding optimal ARIMA order...
Best order found for Benzene_true: (2, 1, 2)
Training final model...
RMSE for Benzene_true: 4.645044606493481

--- Processing column: PT08_S2 ---
Finding optimal ARIMA order...
Best order found for PT08_S2: (2, 1, 2)
Training final model...
RMSE for PT08_S2: 200.95145604371706

--- Processing column: NOx_true ---
Finding optimal ARIMA or