In [None]:
# Mount Google Drive at /content/drive so the dataset archive stored there can be read
# by the following cells (this import only exists inside a Colab runtime).
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Extract the dataset archive from the mounted Drive into /content/airquality.
!unzip "/content/drive/MyDrive/DL/air+quality.zip" -d "/content/airquality"

In [None]:
import numpy as np
import pandas as pd
import os

# Read the extracted Excel workbook into the raw DataFrame `df` used by the cells below.
df = pd.read_excel('/content/airquality/AirQualityUCI.xlsx')

In [None]:
# Row count of the raw frame; each row is reported as one time point.
num_samples = len(df.index)
print(f"Number of samples (time points): {num_samples}")

In [None]:
# Column count of the raw frame.
num_features = len(df.columns)
print(f"Number of features: {num_features}")

In [None]:
# Summary statistics of the raw frame. This runs before the -200 placeholders are
# replaced further down, so those placeholder values are still part of the statistics here.
df.describe()

Dataset Description:
The Air Quality dataset contains hourly air pollution measurements collected from an air pollution monitoring station in Italy. It includes data on pollutants such as CO (carbon monoxide), NO2 (nitrogen dioxide), and O3 (ozone), along with temperature and humidity. The dataset originates from the UCI Machine Learning Repository and is commonly used for time series forecasting and environmental analysis.

Dataset Source: UCI Air Quality Dataset (https://archive.ics.uci.edu/dataset/360/air+quality)

Columns:

CO(GT): Hourly averaged concentration of carbon monoxide (mg/m³)
NO2(GT): Hourly averaged concentration of nitrogen dioxide (µg/m³)
PT08.S5(O3): Hourly averaged response of the ozone-targeted sensor (the dataset has no O3(GT) reference column)
T: Ambient temperature (°C)
RH: Relative humidity (%)
AH: Absolute humidity

In [None]:
import numpy as np
# -200 marks an invalid reading in this dataset; convert it to NaN so it is
# counted and handled as a missing value.
df = df.replace(-200, np.nan)
# Per-column count of missing entries
df.isna().sum()


In [None]:
# Linearly interpolate the numeric columns only: non-numeric columns cannot be
# interpolated, and pandas deprecates calling interpolate() on object-dtype data.
# limit_direction='both' also fills gaps at the very start of a column, which the
# default forward-only direction would leave as NaN (and which would later turn
# the training loss into NaN).
numeric_columns = df.select_dtypes(include='number').columns
df_interpolated = df.copy()
df_interpolated[numeric_columns] = df[numeric_columns].interpolate(
    method='linear', limit_direction='both'
)

In [None]:
# The interpolated frame is the cleaned dataset every later cell works from.
df_cleaned = df_interpolated
total_missing = df_cleaned.isna().sum().sum()
print("Missing values after imputation:", total_missing)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from pandas.plotting import autocorrelation_plot

# Column to forecast; change this name to target a different variable.
target_variable = 'CO(GT)'

# Visualization 1: the target variable plotted against the frame's index.
fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(df_cleaned.index, df_cleaned[target_variable])
ax.set_title(f'Time Series of {target_variable}')
ax.set_xlabel('Date')
ax.set_ylabel(target_variable)
ax.grid(True)
fig.tight_layout()
plt.show()

# Visualization 2: pairwise correlations between all numeric columns, drawn as
# an annotated heatmap.
fig, ax = plt.subplots(figsize=(10, 8))
numeric_df = df_cleaned.select_dtypes(include=['number'])
correlation = numeric_df.corr()
sns.heatmap(correlation, annot=True, cmap='coolwarm', linewidths=0.5, ax=ax)
ax.set_title('Feature Correlation Matrix')
fig.tight_layout()
plt.show()

# Visualization 3: autocorrelation of the target across lags.
# The plot is drawn on the implicit current axes on purpose: pandas only sets
# its own axis limits when no `ax` argument is supplied.
plt.figure(figsize=(12, 6))
acf_ax = autocorrelation_plot(df_cleaned[target_variable])
acf_ax.set_title(f'Autocorrelation Plot of {target_variable}')
acf_ax.grid(True)
plt.tight_layout()
plt.show()

In [None]:
# Gather the object-dtype columns and one-hot encode them (first level dropped).
# When there are none, the cleaned frame is passed through untouched.
categorical_features = [
    name for name, dtype in df_cleaned.dtypes.items() if dtype == 'object'
]
if not categorical_features:
    df_encoded = df_cleaned
    print("No categorical features to encode")
else:
    df_encoded = pd.get_dummies(df_cleaned, columns=categorical_features, drop_first=True)
    print("Shape after encoding:", df_encoded.shape)

In [None]:
# Chronological 70% / 15% / remainder split. Rows keep their original order
# (no shuffling), so validation and test data always come after the training data.
train_size = int(0.7 * len(df_encoded))
val_size = int(0.15 * len(df_encoded))
val_end = train_size + val_size

train_part = df_encoded.iloc[:train_size]
val_part = df_encoded.iloc[train_size:val_end]
test_part = df_encoded.iloc[val_end:]

# Separate the target column from the remaining (feature) columns in each split.
X_train, y_train = train_part.drop(columns=[target_variable]), train_part[target_variable]
X_val, y_val = val_part.drop(columns=[target_variable]), val_part[target_variable]
X_test, y_test = test_part.drop(columns=[target_variable]), test_part[target_variable]

for split_label, split_frame in (("Training", X_train), ("Validation", X_val), ("Test", X_test)):
    print(f"{split_label} set: {split_frame.shape[0]} samples")

In [None]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler
features = df_cleaned.drop(columns=[target_variable])

# Datetime/timedelta columns cannot be standardized, so remove them from every
# split (the column list is taken from the training split).
datetime_cols = X_train.select_dtypes(include=['datetime64', 'timedelta64']).columns
X_train = X_train.drop(columns=datetime_cols)
X_val = X_val.drop(columns=datetime_cols)
X_test = X_test.drop(columns=datetime_cols)

# Fit the scaler on the training split only, then apply the same statistics to
# all three splits.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)

# Wrap the scaled arrays back into DataFrames carrying the original index and
# column labels.
X_train_scaled_df, X_val_scaled_df, X_test_scaled_df = (
    pd.DataFrame(scaled_values, index=source_frame.index, columns=source_frame.columns)
    for scaled_values, source_frame in (
        (X_train_scaled, X_train),
        (X_val_scaled, X_val),
        (X_test_scaled, X_test),
    )
)

In [None]:
import numpy as np

def create_sequences(data, seq_length):
    """
    Slice a series into overlapping look-back windows and their next-step targets.

    Window ``i`` is ``data[i:i + seq_length]`` and its target is the element
    (or full row, for 2-D input) that immediately follows it,
    ``data[i + seq_length]``.

    Parameters:
    -----------
    data : array-like
        Sliceable time series, ordered in time along its first axis.
    seq_length : int
        Number of consecutive steps in each window (lookback period).

    Returns:
    --------
    X : numpy.ndarray
        Stacked windows; ``len(data) - seq_length`` of them.
    y : numpy.ndarray
        The entry following each window, in the same order as ``X``.
        Both arrays are empty when ``data`` has no more than ``seq_length`` entries.
    """
    window_starts = range(len(data) - seq_length)
    windows = [data[start:start + seq_length] for start in window_starts]
    following = [data[start + seq_length] for start in window_starts]
    return np.array(windows), np.array(following)


seq_length = 24  # look-back window: 24 rows, i.e. 24 hours if the data is hourly


def _build_split_sequences(feature_frame, target_series):
    """Window one split and return (feature windows, next-step target values)."""
    # Append the target as the last column so features and target are windowed together.
    combined = np.column_stack((feature_frame.values, target_series.values.reshape(-1, 1)))
    windows, following_rows = create_sequences(combined, seq_length)
    # The last column is the target: strip it from the inputs and keep only that
    # column of the row following each window as the label.
    return windows[:, :, :-1], following_rows[:, -1]


X_train_seq, y_train_seq = _build_split_sequences(X_train_scaled_df, y_train)
X_val_seq, y_val_seq = _build_split_sequences(X_val_scaled_df, y_val)
X_test_seq, y_test_seq = _build_split_sequences(X_test_scaled_df, y_test)

# Report the resulting shapes as a sanity check
for split_label, split_X, split_y in (
    ("Training", X_train_seq, y_train_seq),
    ("Validation", X_val_seq, y_val_seq),
    ("Test", X_test_seq, y_test_seq),
):
    print(f"{split_label} sequences shape: {split_X.shape}, {split_label} targets shape: {split_y.shape}")

In [None]:
#RNN

In [None]:
# torchinfo supplies the summary() helper imported below and used to print the model architecture.
!pip install torchinfo


In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from torchinfo import summary
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import time

# Seed NumPy and PyTorch so repeated runs start from the same random state
np.random.seed(42)
torch.manual_seed(42)

# Initial hyperparameters; hidden size, dropout and learning rate are replaced
# by the tuning results later on.
input_size = X_train_seq.shape[2]  # size of the last axis of the input windows
hidden_size = 64
num_layers = 3
dropout_rate = 0.2
learning_rate = 0.001
batch_size = 32
num_epochs = 100
seq_length = 24  # must match the window length used when building the sequences

# NumPy arrays -> float32 tensors
X_train_tensor, y_train_tensor = torch.FloatTensor(X_train_seq), torch.FloatTensor(y_train_seq)
X_val_tensor, y_val_tensor = torch.FloatTensor(X_val_seq), torch.FloatTensor(y_val_seq)
X_test_tensor, y_test_tensor = torch.FloatTensor(X_test_seq), torch.FloatTensor(y_test_seq)

# Pair inputs with targets for each split
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
val_dataset = TensorDataset(X_val_tensor, y_val_tensor)
test_dataset = TensorDataset(X_test_tensor, y_test_tensor)

# Only the training loader shuffles; validation and test batches keep their order.
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=batch_size)
val_loader = DataLoader(val_dataset, batch_size=batch_size)
test_loader = DataLoader(test_dataset, batch_size=batch_size)

In [None]:
class StackedRNNModel(nn.Module):
    """Stacked tanh RNN followed by a two-layer regression head.

    The hidden state of the top RNN layer at the final time step is passed
    through dropout, a ``hidden_size -> 32`` linear layer with ReLU, and a
    ``32 -> 1`` linear layer.

    Parameters
    ----------
    input_size : int
        Number of features per time step.
    hidden_size : int
        Width of every RNN layer.
    num_layers : int
        Number of stacked RNN layers.
    dropout_rate : float, default 0.2
        Dropout probability applied before the head, and inside ``nn.RNN``
        when ``num_layers > 1``.
    """

    def __init__(self, input_size, hidden_size, num_layers, dropout_rate=0.2):
        super().__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers

        # Stacked RNN layers with tanh activation (default for RNN).
        # nn.RNN warns when dropout is non-zero with a single layer, so it is
        # only enabled for stacks of two or more.
        self.rnn = nn.RNN(
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
            dropout=dropout_rate if num_layers > 1 else 0
        )

        # Dropout for regularization
        self.dropout = nn.Dropout(dropout_rate)

        # Fully connected layers with ReLU activation
        self.fc1 = nn.Linear(hidden_size, 32)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(32, 1)

    def forward(self, x):
        """Map ``x`` of shape (batch, seq_len, input_size) to predictions of shape (batch,)."""
        # nn.RNN starts from an all-zero hidden state when none is supplied,
        # created with the input's own dtype and device.
        out, _ = self.rnn(x)

        # Keep only the outputs from the last time step
        out = out[:, -1, :]

        # Apply dropout
        out = self.dropout(out)

        # Pass through fully connected layers
        out = self.fc1(out)
        out = self.relu(out)
        out = self.fc2(out)

        # Drop only the trailing size-1 feature axis. A bare squeeze() would
        # also remove the batch axis for a single-sample batch and return a
        # 0-d tensor, which no longer lines up with the (1,)-shaped targets.
        return out.squeeze(-1)

In [None]:
def train_model(model, train_loader, val_loader, criterion, optimizer, num_epochs, patience=10):
    """Train ``model`` with early stopping and restore its best validation weights.

    Parameters
    ----------
    model : torch.nn.Module
        Model to optimize; it is updated in place.
    train_loader, val_loader : iterable of (inputs, targets) batches
        Each must expose ``.dataset`` so losses can be averaged per sample.
    criterion : callable
        Loss function called as ``criterion(outputs, targets)``.
    optimizer : torch.optim.Optimizer
        Optimizer already bound to ``model``'s parameters.
    num_epochs : int
        Maximum number of epochs.
    patience : int, default 10
        Stop after this many consecutive epochs without a new best validation loss.

    Returns
    -------
    model : torch.nn.Module
        The same model, loaded with the weights of its best validation epoch.
    train_losses, val_losses : list of float
        Per-sample average loss for every epoch that ran.
    best_val_loss : float
        Lowest validation loss seen (``inf`` if no epoch ever improved on it).
    """
    train_losses = []
    val_losses = []
    best_val_loss = float('inf')
    counter = 0
    best_model_state = None

    for epoch in range(num_epochs):
        # Training phase
        model.train()
        train_loss = 0.0
        for inputs, targets in train_loader:
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()
            # Weight by batch size so a smaller final batch is not over-counted
            train_loss += loss.item() * inputs.size(0)

        train_loss = train_loss / len(train_loader.dataset)
        train_losses.append(train_loss)

        # Validation phase
        model.eval()
        val_loss = 0.0
        with torch.no_grad():
            for inputs, targets in val_loader:
                outputs = model(inputs)
                loss = criterion(outputs, targets)
                val_loss += loss.item() * inputs.size(0)

        val_loss = val_loss / len(val_loader.dataset)
        val_losses.append(val_loss)

        print(f'Epoch {epoch+1}/{num_epochs}: '
              f'Train Loss: {train_loss:.4f}, '
              f'Validation Loss: {val_loss:.4f}')

        # Early stopping
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            # Clone every tensor: state_dict() hands back tensors that share
            # storage with the live parameters, so a shallow dict copy would
            # keep changing as training continues and the "best" snapshot
            # would silently become the last epoch's weights.
            best_model_state = {
                name: tensor.detach().clone()
                for name, tensor in model.state_dict().items()
            }
            counter = 0
        else:
            counter += 1
            if counter >= patience:
                print(f'Early stopping triggered after {epoch+1} epochs')
                break

    # Restore the best snapshot. There is none when no epoch improved on inf
    # (for example the loss was NaN throughout, or num_epochs was 0); the
    # model then keeps its current weights.
    if best_model_state is not None:
        model.load_state_dict(best_model_state)
    return model, train_losses, val_losses, best_val_loss

In [None]:
def hyperparameter_tuning():
    """Grid-search hidden size, dropout rate and learning rate for the stacked RNN.

    Every combination is trained for at most 30 epochs (patience 5) on the
    notebook-level ``train_loader`` / ``val_loader``, using the notebook-level
    ``input_size`` and ``num_layers``. Combinations are ranked by their best
    validation loss, and the full ranking is printed.

    Returns
    -------
    dict
        The best-ranked entry, with keys ``hidden_size``, ``dropout_rate``,
        ``learning_rate``, ``validation_loss`` and ``training_time``.
    """
    # Candidate values for each tuned hyperparameter
    hidden_sizes = [32, 64, 128]
    dropout_rates = [0.1, 0.2, 0.3]
    learning_rates = [0.01, 0.001, 0.0001]

    # Flatten the grid up front; the order matches three nested loops.
    grid = [
        (hs, dr, rate)
        for hs in hidden_sizes
        for dr in dropout_rates
        for rate in learning_rates
    ]

    results = []
    for hidden_size, dropout_rate, lr in grid:
        print(f"\nTrying: hidden_size={hidden_size}, dropout_rate={dropout_rate}, lr={lr}")

        # Fresh model, loss and optimizer for every trial
        model = StackedRNNModel(input_size, hidden_size, num_layers, dropout_rate)
        criterion = nn.MSELoss()
        optimizer = optim.Adam(model.parameters(), lr=lr)

        # Short early-stopped run, timed end to end
        start_time = time.time()
        *_, best_val_loss = train_model(
            model, train_loader, val_loader, criterion, optimizer,
            num_epochs=30, patience=5
        )
        training_time = time.time() - start_time

        results.append({
            'hidden_size': hidden_size,
            'dropout_rate': dropout_rate,
            'learning_rate': lr,
            'validation_loss': best_val_loss,
            'training_time': training_time
        })

        print(f"Best validation loss: {best_val_loss:.4f}, Training time: {training_time:.2f}s")

    # Rank the trials, lowest validation loss first
    results = sorted(results, key=lambda trial: trial['validation_loss'])
    best_params = results[0]

    print("\nHyperparameter Tuning Results:")
    for rank, res in enumerate(results, start=1):
        print(f"{rank}. hidden_size={res['hidden_size']}, "
              f"dropout_rate={res['dropout_rate']}, "
              f"learning_rate={res['learning_rate']}, "
              f"val_loss={res['validation_loss']:.4f}, "
              f"time={res['training_time']:.2f}s")

    print(f"\nBest hyperparameters: hidden_size={best_params['hidden_size']}, "
          f"dropout_rate={best_params['dropout_rate']}, "
          f"learning_rate={best_params['learning_rate']}")

    return best_params

In [None]:
def evaluate_model(model, test_loader, criterion=torch.nn.MSELoss()):
    """Evaluate ``model`` on ``test_loader``, print regression metrics and plot results.

    Parameters
    ----------
    model : torch.nn.Module
        Trained model; it is switched to eval mode.
    test_loader : iterable of (inputs, targets) batches
        Held-out data to score.
    criterion : callable, default ``torch.nn.MSELoss()``
        Loss used only for the per-batch loss curve.

    Returns
    -------
    tuple
        ``(mse, rmse, mae, r2, predictions, actuals, batch_losses)`` where
        ``predictions`` and ``actuals`` are 1-D numpy arrays and
        ``batch_losses`` is a list with one loss value per batch.
    """
    model.eval()
    predictions = []
    actuals = []
    batch_losses = []

    with torch.no_grad():
        for inputs, targets in test_loader:
            outputs = model(inputs)
            loss = criterion(outputs, targets)

            batch_losses.append(loss.item())  # Store batch-wise loss
            # Flatten before extending: a single-sample batch can come back as
            # a 0-d tensor, and extend() cannot iterate over a 0-d array.
            predictions.extend(outputs.reshape(-1).cpu().numpy())
            actuals.extend(targets.reshape(-1).cpu().numpy())

    # Convert to numpy arrays
    predictions = np.array(predictions)
    actuals = np.array(actuals)

    # Calculate overall metrics
    mse = mean_squared_error(actuals, predictions)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(actuals, predictions)
    r2 = r2_score(actuals, predictions)

    print(f'Test Results:')
    print(f'MSE: {mse:.4f}')
    print(f'RMSE: {rmse:.4f}')
    print(f'MAE: {mae:.4f}')
    print(f'R²: {r2:.4f}')

    # Plot actual vs predicted values
    plt.figure(figsize=(10, 6))
    plt.plot(actuals, label='Actual')
    plt.plot(predictions, label='Predicted')
    plt.title('Actual vs Predicted Values')
    plt.xlabel('Sample Index')
    plt.ylabel('Value')
    plt.legend()
    plt.grid(True)
    plt.show()

    # Plot test loss over batches
    plt.figure(figsize=(8, 5))
    plt.plot(batch_losses, marker='o', linestyle='-', color='b', label='Test Loss (MSE)')
    plt.xlabel('Batch Index')
    plt.ylabel('Loss')
    plt.title('Test Loss Over Batches')
    plt.legend()
    plt.grid(True)
    plt.show()

    return mse, rmse, mae, r2, predictions, actuals, batch_losses

In [None]:
if __name__ == "__main__":
    # Architecture summary for the initial (untuned) configuration
    model = StackedRNNModel(input_size, hidden_size, num_layers, dropout_rate)
    sample_input = torch.zeros((batch_size, seq_length, input_size))
    summary(model, input_data=sample_input)

    # Grid search, then adopt the winning values as the notebook-level settings
    best_params = hyperparameter_tuning()
    hidden_size, dropout_rate, learning_rate = (
        best_params[key] for key in ('hidden_size', 'dropout_rate', 'learning_rate')
    )

    # Rebuild the model from scratch with the tuned settings
    model = StackedRNNModel(input_size, hidden_size, num_layers, dropout_rate)

    # Loss and optimizer for the final run
    criterion = nn.MSELoss()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    # Full-length training with early stopping
    model, train_losses, val_losses, _ = train_model(
        model,
        train_loader,
        val_loader,
        criterion,
        optimizer,
        num_epochs=num_epochs,
        patience=10,
    )

    # Persist the weights of the trained model
    torch.save(model.state_dict(), 'best_rnn_model.pth')


In [None]:
# Score the trained model on the test loader; evaluate_model also prints the metrics and draws its two plots.
mse, rmse, mae, r2, predictions, actuals, batch_losses = evaluate_model(model, test_loader)

## Discussion

1. Dataset Description
The Air Quality dataset from UCI contains 9358 instances of hourly averaged responses from an array of 5 metal oxide chemical sensors embedded in an Air Quality Chemical Multisensor Device. The device was located in a significantly polluted area at road level within an Italian city, and data were recorded from March 2004 to February 2005. There are 15 features in the dataset.

2. RNN Architecture

Input Layer - Input size: 35 features

RNN Layers- 3 stacked RNN layers

64 hidden units per layer

Dropout rate of 0.2 between layers

Fully Connected Layers: First FC layer: 64 -> 32 units, ReLU activation

Second FC layer: 32 -> 1 unit (output)

Training Parameters

Sequence Length: 24 time steps

Batch Size: 32

Learning Rate: 0.001

Number of Epochs: 100

Model Characteristics
The model processes sequences of 24 time steps, each with 35 features.

It uses a moderate hidden size (64) and number of layers (3), balancing complexity and computational efficiency.

The dropout rate of 0.2 helps prevent overfitting. The learning rate of 0.001 is a common starting point for Adam optimizer. With 100 epochs and a batch size of 32, the model has ample opportunity to learn from the data.

3. Results

The best hyperparameters and the corresponding validation result were: hidden_size=64, dropout_rate=0.2, learning_rate=0.0001, val_loss=0.7952, time=56.09s

I performed hyperparameter tuning over 3 hyperparameters with 3 candidate values each, giving 27 combinations in total. The R² value is respectable and shows that the model has learned to make reasonable predictions. Adding more hidden layers and decreasing the learning rate had a relatively negative impact on my stacked RNN model, as the validation loss increased.

4. Limitations

Stacked RNN architectures have certain limitations. The most obvious is the increase in compute time that comes with the added complexity. There is also some potential for overfitting, and a need for more data: the UCI Air Quality dataset has only roughly 9400 entries, which is usually insufficient to achieve world-class performance with complex architectures.

5. Future Improvements

Try out different architectures on the dataset, such as bidirectional RNNs, LSTMs, and GRUs, and increase the model complexity further. Adding attention mechanisms and trying ensemble architectures could also make the pipeline more robust.



STEP 5: REFERENCES

- https://www.geeksforgeeks.org/introduction-to-recurrent-neural-network/
- https://stanford.edu/~shervine/teaching/cs-230/cheatsheet-recurrent-neural-networks
- https://medium.com/@poudelsushmita878/recurrent-neural-network-rnn-architecture-explained-1d69560541ef
- https://machinelearningmastery.com/recurrent-neural-network-algorithms-for-deep-learning/
- https://pytorch.org/docs/stable/generated/torch.nn.RNN.html
