In [1]:
import pandas as pd
import torch
import torch.nn as nn
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error
import matplotlib.pyplot as plt
import torch.backends.cudnn as cudnn

# Ask cuDNN for deterministic kernels and switch its autotuner off.
cudnn.benchmark = False
cudnn.deterministic = True

# Seed both random number sources used in this notebook (NumPy and PyTorch).
np.random.seed(42)
torch.manual_seed(42)

# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"Using device: {device}")


def get_true_aqi(C):
    """Map ozone concentrations to AQI values by piecewise-linear interpolation.

    Each entry of the breakpoint table pairs a concentration band
    ``(bp_lo, bp_hi)`` with an index band ``(i_lo, i_hi)``. A concentration
    inside a band is mapped linearly onto that band's index range.

    Bands are treated as contiguous: a value lying strictly between one band's
    ``bp_hi`` and the next band's ``bp_lo`` (the table leaves a 1e-8 hole
    there, which float32 inputs can hit) is assigned to the upper band and
    clamped to that band's ``i_lo`` instead of silently receiving AQI 0.

    Args:
        C: torch.Tensor of concentrations (any shape).
            Presumably 8-hour ozone in ppm - TODO confirm units with the data source.

    Returns:
        A new tensor shaped like ``C`` holding the AQI for every element.
        Elements below 0 or above the last breakpoint (0.200) are 0, except
        elements above 500.4, which are set to 500.
    """
    breakpoints = [
        ((0.000, 0.054), (0, 50)),
        ((0.05400001, 0.070), (51, 100)),
        ((0.07000001, 0.085), (101, 150)),
        ((0.08500001, 0.105), (151, 200)),
        ((0.10500001, 0.200), (201, 300)),
    ]

    aqi = torch.zeros_like(C)

    prev_hi = None
    for (bp_lo, bp_hi), (i_lo, i_hi) in breakpoints:
        # First band keeps its own lower bound; later bands start right after
        # the previous upper bound so no representable value falls between bands.
        lower_ok = (C >= bp_lo) if prev_hi is None else (C > prev_hi)
        mask = lower_ok & (C <= bp_hi)

        slope = (i_hi - i_lo) / (bp_hi - bp_lo)
        # Clamp keeps gap values (slightly below bp_lo) and rounding noise
        # inside the band's index range.
        aqi[mask] = torch.clamp(slope * (C[mask] - bp_lo) + i_lo, min=i_lo, max=i_hi)
        prev_hi = bp_hi

    # NOTE(review): 500.4 is far outside the concentration range of the table
    # above, so values in (0.200, 500.4] keep AQI 0. Kept unchanged for
    # backward compatibility - confirm the intended out-of-range behaviour.
    aqi[C > 500.4] = 500
    return aqi


CSV_PATH = "datasets/final/datasets_lagwise/Ozone_Combined_AQI_2022_2024.csv"

# Load the raw table and parse the "Date" strings (day-month-2-digit-year).
# Unparseable dates become NaT and are removed together with rows that lack
# an ozone or AQI reading.
df = pd.read_csv(CSV_PATH)
df["DATE"] = pd.to_datetime(df["Date"], format="%d-%m-%y", dayfirst=True, errors="coerce")
required_cols = ["DATE", "Daily_Mean_Ozone", "Daily_AQI_Value"]
df = df.dropna(subset=required_cols).reset_index(drop=True)

# Working frame: only the three modelling columns, in chronological order.
data = df[required_cols].sort_values("DATE").reset_index(drop=True)

print("\n--- 1. Original Data Frame (Head) ---", data.head(), "-" * 50, sep="\n")
print("Shape:", data.shape)


class AQIPredictor(nn.Module):
    """Fully connected regressor: 2 inputs -> ``n_hidden`` -> 32 -> 1 output.

    ReLU activations sit between the linear layers; the final layer is linear.
    """

    def __init__(self, n_hidden=64):
        super().__init__()
        # Layers are created in forward order so seeded initialisation is stable.
        layers = [
            nn.Linear(2, n_hidden),
            nn.ReLU(),
            nn.Linear(n_hidden, 32),
            nn.ReLU(),
            nn.Linear(32, 1),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Run ``x`` through the stacked layers and return the result."""
        prediction = self.net(x)
        return prediction


def data_loss(model, C_scaled, AQI_true):
    """Return the mean squared difference between ``model(C_scaled)`` and ``AQI_true``."""
    residual = model(C_scaled) - AQI_true
    return residual.pow(2).mean()


def physics_loss(model, C_unscaled, C_scaled):
    """Return the mean squared gap between the model output and the formula AQI.

    The formula target is ``get_true_aqi(C_unscaled)``; it is detached so it
    acts as a constant and gradients only flow through ``model(C_scaled)``.
    """
    formula_target = get_true_aqi(C_unscaled).detach()
    gap = model(C_scaled) - formula_target
    return gap.pow(2).mean()


# One list per output column; every (lag, lambda pair) run appends one entry to each.
results = {
    column: []
    for column in ("LAG", "Model", "lambda_data", "lambda_phys", "MAE", "RMSE", "NMSE")
}

# Shift sizes passed to the lag loop below.
lag_list = [1, 7, 14, 30]

# (lambda_data, lambda_phys) weight pairs, from physics-only to data-only.
lambda_configs = list(
    zip(
        (0.0, 0.3, 0.5, 0.7, 1.0),
        (1.0, 0.7, 0.5, 0.3, 0.0),
    )
)


# Sweep: for every lag, build a supervised dataset, then train and evaluate one
# freshly initialised model per (lambda_data, lambda_phys) pair and record its
# test metrics in `results`.
for LOOKBACK_DAYS in lag_list:
    print(f"\n========================================")
    print(f"Preparing supervised dataset for LAG = {LOOKBACK_DAYS}")
    print(f"========================================")

    # Targets are the AQI / ozone values found LOOKBACK_DAYS rows later.
    # NOTE(review): shift() moves by rows, not calendar days; this equals a
    # day offset only if the date-sorted table has no missing days - confirm.
    tmp = data.copy()
    tmp[f"AQI_Targeted_Value_LAG_{LOOKBACK_DAYS}"] = tmp["Daily_AQI_Value"].shift(-LOOKBACK_DAYS)
    tmp[f"OZONE_Targeted_Value_LAG_{LOOKBACK_DAYS}"] = tmp["Daily_Mean_Ozone"].shift(-LOOKBACK_DAYS)

    # dropna() removes the trailing rows whose shifted targets do not exist.
    supervised_df = (
        tmp[
            [
                "DATE",
                "Daily_Mean_Ozone",
                "Daily_AQI_Value",
                f"AQI_Targeted_Value_LAG_{LOOKBACK_DAYS}",
                f"OZONE_Targeted_Value_LAG_{LOOKBACK_DAYS}",
            ]
        ]
        .dropna()
        .reset_index(drop=True)
    )

    print("\n--- Supervised Data (Head) ---")
    print(supervised_df.head())
    print("-" * 50)
    print("Shape:", supervised_df.shape)

    # Column vectors of shape (n, 1). Despite the "pm25"/"PM" names, these
    # hold the ozone columns of this dataset.
    C_pm25_data = supervised_df["Daily_Mean_Ozone"].values.astype(np.float32).reshape(-1, 1)
    C_aqi_data = supervised_df["Daily_AQI_Value"].values.astype(np.float32).reshape(-1, 1)
    AQI_data = supervised_df[f"AQI_Targeted_Value_LAG_{LOOKBACK_DAYS}"].values.astype(np.float32).reshape(-1, 1)
    PM_Targeted_Value = supervised_df[f"OZONE_Targeted_Value_LAG_{LOOKBACK_DAYS}"].values.astype(np.float32).reshape(-1, 1)

    # Features: current-row ozone and AQI. Label: shifted AQI.
    # The shifted ozone is carried along only to build the formula-based target.
    X_data = np.hstack((C_pm25_data, C_aqi_data))
    y_data = AQI_data
    tar_pm = PM_Targeted_Value

    # shuffle=False keeps row order: the first 80% of rows train, the last 20% test.
    C_train, C_test, AQI_train, AQI_test, tr_pm, te_pm = train_test_split(
        X_data, y_data, tar_pm, test_size=0.2, random_state=42, shuffle=False
    )

    # Scaling statistics come from the training rows only and are reused for the test rows.
    scaler = StandardScaler()
    C_train_scaled = scaler.fit_transform(C_train)
    C_test_scaled = scaler.transform(C_test)

    C_train_tensor_base = torch.tensor(C_train_scaled, dtype=torch.float32).to(device)
    AQI_train_tensor_base = torch.tensor(AQI_train, dtype=torch.float32).to(device)
    C_test_tensor_base = torch.tensor(C_test_scaled, dtype=torch.float32).to(device)
    AQI_test_tensor_base = torch.tensor(AQI_test, dtype=torch.float32).to(device)

    # Unscaled shifted ozone; only the training part is used below (te_pm_tensor_base is unused in this loop).
    tr_pm_tensor_base = torch.tensor(tr_pm, dtype=torch.float32).to(device)
    te_pm_tensor_base = torch.tensor(te_pm, dtype=torch.float32).to(device)

    for lambda_data, lambda_phys in lambda_configs:
        # Re-seed before building the model so every configuration is seeded identically.
        torch.manual_seed(42)
        np.random.seed(42)
        print(
            f"\n>>> Training MLP+Physics | LAG = {LOOKBACK_DAYS}, "
            f"lambda_data = {lambda_data}, lambda_phys = {lambda_phys}"
        )

        model = AQIPredictor(n_hidden=64).to(device)
        optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

        num_epochs = 3000
        print_every = 200

        # Diagnostic only: compare recorded AQI with the formula AQI for the
        # first training rows. It depends only on the lag-level tensors, so the
        # same lines are printed again for every lambda pair.
        with torch.no_grad():
            aqi_phys_train = get_true_aqi(tr_pm_tensor_base)
            abs_diff = torch.abs(aqi_phys_train - AQI_train_tensor_base)

            print("\n--- First 10 Samples: True vs Physics AQI (Targeted Day) ---")
            for i in range(min(10, len(AQI_train_tensor_base))):
                true_val = float(AQI_train_tensor_base[i].cpu().item())
                phys_val = float(aqi_phys_train[i].cpu().item())
                diff_val = float(abs_diff[i].cpu().item())
                print(f"Sample {i+1:02d}: True={true_val:.3f}, Physics={phys_val:.3f}, Diff={diff_val:.3f}")

        # Full-batch training: each step uses the entire training tensor.
        for epoch in range(num_epochs):
            model.train()
            optimizer.zero_grad()

            # Both terms are evaluated every step (also when their weight is 0)
            # so they can be logged.
            l_data = data_loss(model, C_train_tensor_base, AQI_train_tensor_base)
            l_phys = physics_loss(model, tr_pm_tensor_base, C_train_tensor_base)

            # Weighted sum of the two objectives.
            loss = lambda_data * l_data + lambda_phys * l_phys

            loss.backward()
            optimizer.step()

            # Verbose logging for the first five epochs, then every `print_every` epochs.
            if epoch < 5:
                print(
                    f"Epoch {epoch+1}: "
                    f"Data Loss = {l_data.item():.6f}, "
                    f"Physics Loss = {l_phys.item():.6f}, "
                    f"Total Loss = {loss.item():.6f}"
                )

            if (epoch + 1) % print_every == 0:
                print(
                    f"Epoch {epoch+1}/{num_epochs}, "
                    f"Total Loss = {loss.item():.6f}, "
                    f"Data Loss = {l_data.item():.6f}, "
                    f"Physics Loss = {l_phys.item():.6f}"
                )

        # Evaluation on the held-out (last 20%) rows, against the recorded AQI.
        model.eval()
        with torch.no_grad():
            AQI_pred_test = model(C_test_tensor_base)

            y_true = AQI_test_tensor_base.cpu().numpy()
            y_pred = AQI_pred_test.cpu().numpy()

            mse = mean_squared_error(y_true, y_pred)
            mae = mean_absolute_error(y_true, y_pred)
            rmse = np.sqrt(mse)

            # NMSE = MSE divided by the variance of the true test values;
            # eps avoids division by zero when the targets are constant.
            denom = np.mean((y_true - np.mean(y_true)) ** 2)
            eps = 1e-12
            nmse = mse / (denom + eps)

            print(
                f"\n--- Evaluation | LAG = {LOOKBACK_DAYS}, "
                f"lambda_data = {lambda_data}, lambda_phys = {lambda_phys} ---"
            )
            print(f"Mean Squared Error (MSE):   {mse:.4f}")
            print(f"Mean Absolute Error (MAE):  {mae:.4f}")
            print(f"Root Mean Squared Error (RMSE): {rmse:.4f}")
            print(f"Normalized Mean Squared Error (NMSE): {nmse:.4f}")

        # One result row per (lag, lambda pair); MSE itself is printed but not stored.
        results["LAG"].append(LOOKBACK_DAYS)
        results["Model"].append("MLP+Physics")
        results["lambda_data"].append(lambda_data)
        results["lambda_phys"].append(lambda_phys)
        results["MAE"].append(mae)
        results["RMSE"].append(rmse)
        results["NMSE"].append(nmse)


# Gather every recorded run into one table and write it to the working directory.
results_df = pd.DataFrame.from_dict(results)
results_df.to_csv("OZONE_MLP_Physics_LambdaSweep_Results.csv", index=False)

print("\n===== Final Ozone MLP+Physics Lambda Sweep Results =====", results_df, sep="\n")


Using device: cpu

--- 1. Original Data Frame (Head) ---
        DATE  Daily_Mean_Ozone  Daily_AQI_Value
0 2022-01-01          0.025000        23.000000
1 2022-01-02          0.032333        30.333333
2 2022-01-03          0.029667        27.666667
3 2022-01-04          0.036000        33.333333
4 2022-01-05          0.033000        30.666667
--------------------------------------------------
Shape: (1091, 3)

Preparing supervised dataset for LAG = 1

--- Supervised Data (Head) ---
        DATE  Daily_Mean_Ozone  Daily_AQI_Value  AQI_Targeted_Value_LAG_1  \
0 2022-01-01          0.025000        23.000000                 30.333333   
1 2022-01-02          0.032333        30.333333                 27.666667   
2 2022-01-03          0.029667        27.666667                 33.333333   
3 2022-01-04          0.036000        33.333333                 30.666667   
4 2022-01-05          0.033000        30.666667                 23.000000   

   OZONE_Targeted_Value_LAG_1  
0                 