In [None]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot as plt
from torch.utils.data import DataLoader, Dataset
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# ---------------------- 1. DATA PREPROCESSING ---------------------- #
class AQIDataset(Dataset):
    """Sliding-window dataset for one-step-ahead forecasting.

    Sample ``i`` pairs the window ``data[i : i + seq_length]`` with the
    observation that immediately follows it, ``data[i + seq_length]``. Both are
    returned as ``float32`` tensors.

    Args:
        data: Indexable, sliceable sequence of observations in time order
            (the loader in this file passes a 2-D NumPy array with one column).
        seq_length: Number of consecutive observations in each input window.

    Raises:
        ValueError: If ``seq_length`` is smaller than 1.
    """

    def __init__(self, data, seq_length):
        if seq_length < 1:
            raise ValueError(f"seq_length must be >= 1, got {seq_length}")
        self.data = data
        self.seq_length = seq_length

    def __len__(self):
        # Clamp at zero: a series no longer than one window yields no samples,
        # and a negative value would make len() itself raise ValueError.
        return max(0, len(self.data) - self.seq_length)

    def __getitem__(self, idx):
        """Return the ``(window, target)`` tensor pair for sample ``idx``.

        Negative indices count from the end, as for built-in sequences.

        Raises:
            IndexError: If ``idx`` is outside ``[-len(self), len(self))``.
        """
        size = len(self)
        if idx < 0:
            # Without this, a negative idx would slice an empty/wrong window.
            idx += size
        if not 0 <= idx < size:
            raise IndexError(f"sample index out of range for dataset of size {size}")

        end = idx + self.seq_length
        return (
            torch.tensor(self.data[idx:end], dtype=torch.float32),
            torch.tensor(self.data[end], dtype=torch.float32),
        )

def load_and_preprocess_data(file_path, seq_length=20, train_split=0.8,
                             batch_size=32, target_col='Overall AQI',
                             date_col='From Date'):
    """Load a CSV time series and build train/test loaders of sliding windows.

    The series is split chronologically (no shuffling before the split), the
    target column is min-max scaled to [0, 1] using statistics from the
    training portion only, and each portion is wrapped in an ``AQIDataset``.

    Args:
        file_path: Path to the CSV file.
        seq_length: Length of each input window.
        train_split: Fraction of rows (from the start) used for training.
        batch_size: Batch size for both loaders.
        target_col: Name of the column to forecast.
        date_col: Name of the timestamp column, parsed and used as the index.

    Returns:
        ``(train_loader, test_loader, scaler)`` where ``scaler`` is the fitted
        ``MinMaxScaler`` (needed to map predictions back to original units).

    Raises:
        ValueError: If either split is too short to produce a single window.
    """
    df = pd.read_csv(file_path, parse_dates=[date_col], index_col=date_col)
    # fillna(method='ffill') is deprecated in pandas 2.x; ffill() is equivalent.
    df = df.ffill()

    values = df[[target_col]].values
    train_size = int(len(values) * train_split)
    train_raw, test_raw = values[:train_size], values[train_size:]

    if len(train_raw) <= seq_length or len(test_raw) <= seq_length:
        raise ValueError(
            f"Each split needs more than seq_length={seq_length} rows "
            f"(train={len(train_raw)}, test={len(test_raw)})"
        )

    # Fit on the training rows only so test-set min/max do not leak into
    # training; test values may therefore fall slightly outside [0, 1].
    scaler = MinMaxScaler(feature_range=(0, 1))
    train_data = scaler.fit_transform(train_raw)
    test_data = scaler.transform(test_raw)

    train_dataset = AQIDataset(train_data, seq_length)
    test_dataset = AQIDataset(test_data, seq_length)

    # A trailing batch with exactly one sample makes BatchNorm1d raise in
    # training mode (the GRU model in this file uses one), so drop it only in
    # that case; every other batch layout is left untouched.
    n_train = len(train_dataset)
    drop_singleton = n_train > batch_size and n_train % batch_size == 1

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True,
                              drop_last=drop_singleton)
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

    return train_loader, test_loader, scaler

# ---------------------- 2. GRU MODEL ---------------------- #
class GRUModel(nn.Module):
    """Stacked GRU followed by batch normalisation and a linear output head.

    Args:
        input_dim: Number of features per time step.
        hidden_dim: Size of the GRU hidden state.
        num_layers: Number of stacked GRU layers.
        dropout: Dropout probability applied between stacked GRU layers.
    """

    def __init__(self, input_dim=1, hidden_dim=64, num_layers=2, dropout=0.2):
        super().__init__()
        # nn.GRU applies dropout only between stacked layers; passing a
        # non-zero value with a single layer has no effect and makes PyTorch
        # emit a warning, so it is disabled in that case.
        gru_dropout = dropout if num_layers > 1 else 0.0
        self.gru = nn.GRU(input_dim, hidden_dim, num_layers, batch_first=True, dropout=gru_dropout)
        self.batch_norm = nn.BatchNorm1d(hidden_dim)
        self.fc = nn.Linear(hidden_dim, 1)

    def forward(self, x):
        """Map a batch of sequences to one prediction per sequence.

        ``x`` is indexed batch-first (``batch_first=True``); the result has one
        output value per batch row.
        """
        out, _ = self.gru(x)
        # Only the hidden state at the final time step feeds the output head.
        last_step = out[:, -1, :]
        return self.fc(self.batch_norm(last_step))

# ---------------------- 3. TRAINING FUNCTION ---------------------- #
def _run_epoch(model, loader, criterion, device, optimizer=None):
    """Run one pass over ``loader`` and return the mean per-batch loss.

    The model is trained when ``optimizer`` is given; otherwise it is put in
    evaluation mode and gradients are disabled.
    """
    training = optimizer is not None
    model.train(training)
    total_loss = 0.0
    with torch.set_grad_enabled(training):
        for x_batch, y_batch in loader:
            x_batch, y_batch = x_batch.to(device), y_batch.to(device)
            y_pred = model(x_batch)
            # Flatten both sides: comparing (B,) predictions with (B, 1)
            # targets would broadcast to (B, B) and yield a wrong loss.
            loss = criterion(y_pred.reshape(-1), y_batch.reshape(-1))
            if training:
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
            total_loss += loss.item()
    return total_loss / len(loader)


def train_model(model, train_loader, test_loader, num_epochs=100, lr=0.001, patience=10,
                checkpoint_path="best_model.pth"):
    """Train ``model`` with Adam, LR-on-plateau scheduling and early stopping.

    After every epoch the model is scored on ``test_loader``; the weights with
    the lowest score so far are saved to ``checkpoint_path``.

    Args:
        model: Network to train; batches are moved to the device its
            parameters live on.
        train_loader: Iterable of ``(inputs, targets)`` training batches.
        test_loader: Iterable of ``(inputs, targets)`` batches used for the
            per-epoch validation score.
        num_epochs: Maximum number of epochs.
        lr: Initial learning rate.
        patience: Number of consecutive epochs without improvement after which
            training stops.
        checkpoint_path: Where the best ``state_dict`` is written.

    Returns:
        ``True`` if early stopping ended training, ``False`` if all epochs ran.
    """
    # Derive the device from the model instead of relying on a global that
    # only exists when this file is executed as a script.
    device = next(model.parameters()).device

    criterion = nn.MSELoss()
    optimizer = optim.Adam(model.parameters(), lr=lr)
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=5, factor=0.5)

    best_loss = float('inf')
    patience_counter = 0

    for epoch in range(num_epochs):
        train_loss = _run_epoch(model, train_loader, criterion, device, optimizer)
        test_loss = _run_epoch(model, test_loader, criterion, device)
        scheduler.step(test_loss)

        print(f"Epoch {epoch+1}: Train Loss={train_loss:.5f}, Test Loss={test_loss:.5f}")

        if test_loss < best_loss:
            best_loss = test_loss
            patience_counter = 0
            torch.save(model.state_dict(), checkpoint_path)
        else:
            patience_counter += 1
        if patience_counter >= patience:
            print("Early stopping triggered.")
            return True
    return False

# ---------------------- 4. EVALUATION FUNCTION ---------------------- #
def evaluate_model(model, test_loader, checkpoint_path="best_model.pth", scaler=None):
    """Load the best checkpoint, predict on ``test_loader`` and print metrics.

    Args:
        model: Network whose architecture matches the saved ``state_dict``.
        test_loader: Iterable of ``(inputs, targets)`` batches.
        checkpoint_path: File containing the ``state_dict`` to load.
        scaler: Optional fitted scaler exposing ``inverse_transform``. When
            given, targets and predictions are mapped back to original units
            before the metrics are computed; otherwise everything stays in the
            scale the loader provides.

    Returns:
        ``(actuals, predictions)`` as two flat lists of floats, aligned by
        position.
    """
    # Use the model's own device rather than a script-level global, and remap
    # the checkpoint so weights saved on GPU can be loaded on a CPU-only host.
    device = next(model.parameters()).device
    model.load_state_dict(torch.load(checkpoint_path, map_location=device))
    model.eval()
    actuals, predictions = [], []

    with torch.no_grad():
        for x_batch, y_batch in test_loader:
            y_pred = model(x_batch.to(device))
            # reshape(-1) always yields a 1-D tensor; squeeze() would return a
            # 0-d tensor for a single-sample batch and break list.extend().
            actuals.extend(y_batch.reshape(-1).tolist())
            predictions.extend(y_pred.reshape(-1).tolist())

    y_true = np.asarray(actuals, dtype=float)
    y_hat = np.asarray(predictions, dtype=float)

    if scaler is not None:
        y_true = scaler.inverse_transform(y_true.reshape(-1, 1)).ravel()
        y_hat = scaler.inverse_transform(y_hat.reshape(-1, 1)).ravel()
        actuals, predictions = y_true.tolist(), y_hat.tolist()

    mae = mean_absolute_error(y_true, y_hat)
    mse = mean_squared_error(y_true, y_hat)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_true, y_hat)

    # Both arrays are 1-D here, so the difference is element-wise (an (N, 1)
    # vs (N,) pair would broadcast to N x N). Zero targets are excluded
    # because the percentage error is undefined for them.
    nonzero = y_true != 0
    if nonzero.any():
        mape = np.mean(np.abs((y_true[nonzero] - y_hat[nonzero]) / y_true[nonzero])) * 100
    else:
        mape = float('nan')

    print(f"Evaluation Metrics: MAE={mae:.4f}, MSE={mse:.4f}, RMSE={rmse:.4f}, R²={r2:.4f}, MAPE={mape:.4f}%")
    return actuals, predictions

# ---------------------- 5. VISUALIZATION FUNCTION ---------------------- #
def plot_predictions(actuals, predictions):
    """Show two figures comparing actual and predicted values.

    The first figure overlays both series against the sample index; the second
    is a scatter of predicted versus actual values with a reference diagonal.

    Args:
        actuals: Array-like of observed values (flattened before plotting).
        predictions: Array-like of predicted values (flattened before plotting).
    """
    y_true = np.asarray(actuals).ravel()
    y_hat = np.asarray(predictions).ravel()

    # Figure 1: both series over the sample index.
    _, series_ax = plt.subplots(figsize=(10, 5))
    series_ax.plot(y_true, label="Actual AQI", color='blue')
    series_ax.plot(y_hat, label="Predicted AQI", linestyle='dashed', color='red')
    series_ax.set_xlabel("Samples")
    series_ax.set_ylabel("AQI")
    series_ax.set_title("Actual vs Predicted AQI")
    series_ax.legend()
    plt.show()

    # Figure 2: predicted vs actual, with the y = x line spanning the range of
    # the actual values as the "perfect prediction" reference.
    lowest, highest = min(y_true), max(y_true)
    _, scatter_ax = plt.subplots(figsize=(10, 6))
    scatter_ax.scatter(y_true, y_hat, alpha=0.7, edgecolors='k', label="Predicted vs Actual")
    scatter_ax.plot(
        [lowest, highest],
        [lowest, highest],
        color="red",
        linestyle='--',
        label="Perfect Prediction",
    )
    scatter_ax.set_xlabel("Actual AQI")
    scatter_ax.set_ylabel("Predicted AQI")
    scatter_ax.set_title("Scatter Plot: Actual vs Predicted AQI")
    scatter_ax.legend()
    plt.show()

# ---------------------- 6. MAIN FUNCTION ---------------------- #
if __name__ == "__main__":
    # Select the compute device up front; it is kept as a module-level name.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Data: chronological train/test loaders plus the fitted scaler.
    file_path = "KNN_IMPUTED_HYD_AQI_TIME_SERIES_SORTED_with_AQI_NORMALIZED_data.csv"
    seq_length = 20
    train_loader, test_loader, scaler = load_and_preprocess_data(
        file_path,
        seq_length=seq_length,
    )

    # Model: build with default hyper-parameters, then train, score and plot.
    model = GRUModel()
    model = model.to(device)
    train_model(model, train_loader, test_loader)
    actuals, predictions = evaluate_model(model, test_loader)
    plot_predictions(actuals, predictions)
