In [None]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, mean_squared_log_error
from torch.utils.data import DataLoader, TensorDataset
from torchinfo import summary
import copy
import os

In [None]:
# Location of the NAB "realAWSCloudwatch" CSV. Set the EC2_CPU_CSV environment
# variable to point at a local copy; the original hard-coded path is the fallback
# so existing setups keep working unchanged.
data_path = os.environ.get(
    "EC2_CPU_CSV",
    "/Users/hp/Downloads/archive/realAWSCloudwatch/realAWSCloudwatch/ec2_cpu_utilization_5f5533.csv",
)
df = pd.read_csv(data_path)

In [None]:
# Parse the timestamp column in place; the second column is the series being modelled.
df['timestamp'] = pd.to_datetime(df['timestamp'])
target_feature = df.columns[1]

# Line plot of the raw series over time.
fig, ax = plt.subplots(figsize=(12, 4))
ax.plot(df['timestamp'], df[target_feature], color='b')
ax.set_xlabel("Time")
ax.set_ylabel(target_feature)
ax.set_title(f"Time-Series Plot of {target_feature}")
plt.show()

In [None]:
# Box plot of every numeric column for a quick look at spread and outliers.
df_numeric = df.select_dtypes(include=[np.number])
ax = df_numeric.boxplot()
ax.set_title("Boxplot of Numeric Features")
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

In [None]:
# Histogram (30 bins) with a KDE overlay of the target series.
fig, ax = plt.subplots(figsize=(8, 4))
sns.histplot(df[target_feature], bins=30, kde=True, ax=ax)
ax.set_xlabel(target_feature)
ax.set_title(f"Distribution of {target_feature}")
plt.show()

In [None]:
# Gather the quantities first, then print the overview in one place.
n_numeric_features = df.select_dtypes(include=[np.number]).shape[1]
missing_per_column = df.isnull().sum()

print("Dataset Head:")
print(df.head())
print("\nDataset Shape:", df.shape)
print("Number of samples (time points):", len(df))
print("Number of features:", n_numeric_features)

print("\nMissing values in each column:\n", missing_per_column)

In [None]:
# Summary statistics, transposed so that each column of df becomes a row.
stats_table = df.describe().T
print("\nDescriptive Statistics:")
print(stats_table)

This dataset contains CPU utilization measurements of Amazon EC2 instances, collected via AWS CloudWatch. It provides time-series measurements with two key variables, a timestamp and a CPU utilization percentage, which can be used to monitor and analyze the performance of cloud resources.
Link to dataset: https://www.kaggle.com/datasets/boltzmannbrain/nab

In [None]:
# Standardize every numeric column (zero mean, unit variance).
# NOTE(review): the scaler is fit on the full dataset before the train/val/test
# split below, so held-out statistics influence the scaling - confirm this is acceptable.
df_numeric = df.select_dtypes(include=[np.number])

scaler = StandardScaler()
scaler.fit(df_numeric)
df_scaled = scaler.transform(df_numeric)

In [None]:
data_tensor = torch.tensor(df_scaled, dtype=torch.float32)

# Split chronologically (shuffle=False) into 64% train / 16% validation / 20% test.
# The rows are a time series: a shuffled split would scramble their order, so the
# sliding windows later built from these splits would no longer contain
# consecutive time steps, and future observations would leak into training.
train_data, test_data = train_test_split(data_tensor, test_size=0.2, shuffle=False)
train_data, val_data = train_test_split(train_data, test_size=0.2, shuffle=False)

In [None]:
batch_size = 32

# Each split becomes a single-tensor dataset; only the training loader shuffles.
train_dataset = TensorDataset(train_data)
val_dataset = TensorDataset(val_data)
test_dataset = TensorDataset(test_data)

train_loader = DataLoader(train_dataset, shuffle=True, batch_size=batch_size)
val_loader = DataLoader(val_dataset, shuffle=False, batch_size=batch_size)
test_loader = DataLoader(test_dataset, shuffle=False, batch_size=batch_size)

In [None]:
def compute_test_loss(model, test_loader, criterion):
    """Return the sample-weighted mean reconstruction loss over a loader.

    Each batch is fed through ``model`` and compared against itself with
    ``criterion``. The batch loss is weighted by the batch size so that a
    smaller final batch does not skew the average.

    Args:
        model: Module mapping a batch to a reconstruction of the same shape.
        test_loader: Iterable of batches whose first element is the input tensor.
        criterion: Callable ``criterion(output, target)`` returning a scalar tensor.

    Returns:
        Sum of (batch loss * batch size) divided by the total number of samples.
    """
    model.eval()
    running_loss, n_seen = 0, 0
    with torch.no_grad():
        for batch in test_loader:
            inputs = batch[0]
            n_batch = inputs.size(0)
            reconstruction = model(inputs)
            running_loss += criterion(reconstruction, inputs).item() * n_batch
            n_seen += n_batch
    return running_loss / n_seen

In [None]:
class DenseAutoencoder(nn.Module):
    """Fully connected autoencoder with a mirrored decoder.

    Encoder: one ``Linear -> BatchNorm1d -> LeakyReLU(0.1) -> Dropout`` stage per
    entry of ``hidden_sizes``. Decoder: walks the hidden widths back in reverse
    (skipping the bottleneck width itself) with ``Linear -> BatchNorm1d ->
    LeakyReLU(0.1)`` stages and no dropout, then a final ``Linear`` back to
    ``input_dim`` with no activation.

    Args:
        input_dim: Number of input (and output) features.
        hidden_sizes: Encoder layer widths, outermost first; the last entry is
            the bottleneck width.
        dropout_rate: Dropout probability used after every encoder stage.
    """

    def __init__(self, input_dim, hidden_sizes, dropout_rate):
        super().__init__()

        # Encoder: input_dim -> hidden_sizes[0] -> ... -> hidden_sizes[-1].
        enc_widths = [input_dim, *hidden_sizes]
        enc_modules = []
        for fan_in, fan_out in zip(enc_widths[:-1], enc_widths[1:]):
            enc_modules += [
                nn.Linear(fan_in, fan_out),
                nn.BatchNorm1d(fan_out),
                nn.LeakyReLU(0.1),
                nn.Dropout(dropout_rate),
            ]
        self.encoder = nn.Sequential(*enc_modules)

        # Decoder: start at the bottleneck and expand through the remaining
        # hidden widths in reverse order.
        dec_widths = [enc_widths[-1], *list(reversed(hidden_sizes))[1:]]
        dec_modules = []
        for fan_in, fan_out in zip(dec_widths[:-1], dec_widths[1:]):
            dec_modules += [
                nn.Linear(fan_in, fan_out),
                nn.BatchNorm1d(fan_out),
                nn.LeakyReLU(0.1),
            ]
        dec_modules.append(nn.Linear(dec_widths[-1], input_dim))
        self.decoder = nn.Sequential(*dec_modules)

    def forward(self, x):
        """Encode ``x`` and return its reconstruction."""
        return self.decoder(self.encoder(x))

In [None]:
class Conv1DAutoencoder(nn.Module):
    """1-D convolutional autoencoder for inputs shaped (batch, 1, length).

    The encoder is two ``Conv1d -> ReLU`` stages and the decoder two
    ``ConvTranspose1d`` stages (ReLU between them), all with stride 1.

    Changes from the previous version:
      * Padding is derived from ``kernel_size`` (``kernel_size // 2``) instead of
        being fixed at 1, so kernels larger than 3 no longer shrink short inputs
        to a non-positive length inside the encoder.
      * The output is linear by default. The earlier fixed ``Sigmoid`` confined
        reconstructions to (0, 1), which cannot match standardized targets that
        take negative values or values above 1. Pass
        ``output_activation=nn.Sigmoid()`` for data scaled to [0, 1].

    Parameter names in the ``state_dict`` are unchanged.

    Args:
        input_dim: Unused; kept so existing call sites keep working.
        channels: Two-element sequence, the channel counts after the first and
            second encoder convolution.
        kernel_size: Kernel width shared by all four convolutions.
        output_activation: Optional module appended after the last transposed
            convolution; ``None`` leaves the output unbounded.
    """

    def __init__(self, input_dim, channels, kernel_size=3, output_activation=None):
        super().__init__()
        # With stride 1, each Conv1d changes the length by (2*pad - (k - 1)) and
        # each ConvTranspose1d by the opposite amount, so the output length always
        # equals the input length; pad = k // 2 keeps every intermediate length >= input.
        pad = kernel_size // 2
        self.encoder = nn.Sequential(
            nn.Conv1d(in_channels=1, out_channels=channels[0], kernel_size=kernel_size, stride=1, padding=pad),
            nn.ReLU(),
            nn.Conv1d(channels[0], channels[1], kernel_size=kernel_size, stride=1, padding=pad),
            nn.ReLU()
        )
        decoder_layers = [
            nn.ConvTranspose1d(channels[1], channels[0], kernel_size=kernel_size, stride=1, padding=pad),
            nn.ReLU(),
            nn.ConvTranspose1d(channels[0], 1, kernel_size=kernel_size, stride=1, padding=pad),
        ]
        if output_activation is not None:
            decoder_layers.append(output_activation)
        self.decoder = nn.Sequential(*decoder_layers)

    def forward(self, x):
        """Return the reconstruction of ``x``; same shape as the input."""
        encoded = self.encoder(x)
        decoded = self.decoder(encoded)
        return decoded

In [None]:
def initialize_weights(model):
    """Re-initialize the convolutional layers of ``model`` in place.

    Every ``nn.Conv1d`` and ``nn.ConvTranspose1d`` submodule gets Kaiming-normal
    weights (ReLU gain) and, when it has a bias, a zero bias. All other
    submodules are left untouched.
    """
    conv_types = (nn.Conv1d, nn.ConvTranspose1d)
    for layer in model.modules():
        if not isinstance(layer, conv_types):
            continue
        nn.init.kaiming_normal_(layer.weight, nonlinearity='relu')
        if layer.bias is not None:
            nn.init.constant_(layer.bias, 0)

In [None]:
class LSTMAutoencoder(nn.Module):
    """Sequence autoencoder built from two stacked LSTMs.

    The encoder LSTM consumes the input sequence; its final hidden and cell
    states initialise the decoder LSTM, which is driven by an all-zero input
    sequence. A linear layer maps each decoder step back to ``input_dim``.

    Args:
        input_dim: Number of features per time step.
        hidden_dim: Hidden size of both LSTMs.
        num_layers: Number of stacked layers in both LSTMs.
        seq_length: Nominal window length, stored as ``self.seq_length``.
            ``forward`` takes the sequence length from the input itself, so the
            reconstruction always matches the input shape.
    """

    def __init__(self, input_dim, hidden_dim=64, num_layers=2, seq_length=10):
        super().__init__()
        self.seq_length = seq_length
        self.encoder = nn.LSTM(input_dim, hidden_dim, num_layers, batch_first=True)
        self.decoder = nn.LSTM(hidden_dim, hidden_dim, num_layers, batch_first=True)
        self.output_layer = nn.Linear(hidden_dim, input_dim)

    def forward(self, x):
        """Reconstruct ``x`` of shape (batch, steps, input_dim); output has the same shape."""
        _, (hidden, cell) = self.encoder(x)
        # One zero vector per input time step. new_zeros inherits the dtype and
        # device of x, so the model also works after .double() / .to(device).
        decoder_input = x.new_zeros((x.shape[0], x.shape[1], hidden.shape[2]))
        decoded, _ = self.decoder(decoder_input, (hidden, cell))
        return self.output_layer(decoded)

In [None]:
def create_sequences(data, seq_length):
    """Stack every length-``seq_length`` sliding window of ``data`` (stride 1).

    Args:
        data: Tensor indexed along its first dimension (one row per time step).
        seq_length: Number of consecutive rows per window; must be positive.

    Returns:
        Tensor of shape ``(len(data) - seq_length + 1, seq_length, *data.shape[1:])``.
        The final window ending at the last row is included (it was previously
        dropped by an off-by-one in the loop bound).

    Raises:
        ValueError: If ``seq_length`` is not positive or exceeds ``len(data)``.
    """
    n_windows = len(data) - seq_length + 1
    if seq_length <= 0 or n_windows <= 0:
        raise ValueError(
            f"seq_length must be between 1 and len(data)={len(data)}, got {seq_length}"
        )
    return torch.stack([data[start:start + seq_length] for start in range(n_windows)])

In [None]:
# Conv1d expects (batch, channels, length), so add a singleton channel axis.
# The splits are already standardized, so no second scaling pass is applied:
# re-fitting the shared `scaler` on scaled data would overwrite the statistics
# it learned from the raw values and give this model slightly different inputs
# from the other models.
train_data_conv1d = train_data.unsqueeze(1)
val_data_conv1d = val_data.unsqueeze(1)
test_data_conv1d = test_data.unsqueeze(1)

train_loader_conv1d = DataLoader(TensorDataset(train_data_conv1d), batch_size=batch_size, shuffle=True)
val_loader_conv1d = DataLoader(TensorDataset(val_data_conv1d), batch_size=batch_size, shuffle=False)
test_loader_conv1d = DataLoader(TensorDataset(test_data_conv1d), batch_size=batch_size, shuffle=False)

In [None]:
seq_length = 10

# Turn each split into overlapping windows of `seq_length` consecutive rows.
train_data_lstm, val_data_lstm, test_data_lstm = (
    create_sequences(split, seq_length) for split in (train_data, val_data, test_data)
)

# Window datasets; only the training loader shuffles.
train_loader_lstm = DataLoader(TensorDataset(train_data_lstm), shuffle=True, batch_size=batch_size)
val_loader_lstm = DataLoader(TensorDataset(val_data_lstm), shuffle=False, batch_size=batch_size)
test_loader_lstm = DataLoader(TensorDataset(test_data_lstm), shuffle=False, batch_size=batch_size)

In [None]:
def train_model(model, train_loader, val_loader, criterion, optimizer, epochs=50, model_name="model", save_dir="/Users/hp/Desktop"):
    """Train an autoencoder with best-weights checkpointing and early stopping.

    Every batch is passed through ``model`` and compared against itself with
    ``criterion``. After each epoch the validation loss is computed; whenever it
    improves, the weights are remembered and written to
    ``<save_dir>/best_<model_name>_weights.pt``. On return the model holds the
    best weights seen.

    Args:
        model: Module mapping a batch to a reconstruction of the same shape.
        train_loader: Iterable of batches whose first element is the input tensor.
        val_loader: Same format as ``train_loader``, used for validation.
        criterion: Callable ``criterion(output, target)``. The per-epoch averages
            weight each batch by its size, which assumes the criterion returns a
            per-batch mean (e.g. ``nn.MSELoss()``).
        optimizer: Optimizer already bound to ``model``'s parameters.
        epochs: Maximum number of epochs.
        model_name: Used in log lines and in the checkpoint file name.
        save_dir: Directory for the checkpoint; created if missing. Defaults to
            the location that was previously hard-coded.

    Returns:
        ``(train_losses, val_losses)``: per-epoch sample-weighted mean losses.
    """
    # Resolve (and create) the checkpoint location up front so a bad path fails
    # before any training time is spent.
    os.makedirs(save_dir, exist_ok=True)
    checkpoint_path = os.path.join(save_dir, f"best_{model_name}_weights.pt")

    best_val_loss = float('inf')
    best_model_wts = copy.deepcopy(model.state_dict())
    train_losses, val_losses = [], []
    for epoch in range(epochs):
        model.train()
        train_loss, train_count = 0.0, 0
        for batch in train_loader:
            batch_data = batch[0]
            optimizer.zero_grad()
            output = model(batch_data)
            loss = criterion(output, batch_data)
            loss.backward()
            optimizer.step()
            # Weight by batch size so a short final batch does not skew the mean.
            train_loss += loss.item() * batch_data.size(0)
            train_count += batch_data.size(0)

        model.eval()
        val_loss, val_count = 0.0, 0
        with torch.no_grad():
            for batch in val_loader:
                batch_data = batch[0]
                output = model(batch_data)
                loss = criterion(output, batch_data)
                val_loss += loss.item() * batch_data.size(0)
                val_count += batch_data.size(0)

        avg_train_loss = train_loss / train_count
        avg_val_loss = val_loss / val_count
        train_losses.append(avg_train_loss)
        val_losses.append(avg_val_loss)

        print(f"[{model_name}] Epoch [{epoch+1}/{epochs}], Train Loss: {avg_train_loss:.6f}, Val Loss: {avg_val_loss:.6f}")

        if avg_val_loss < best_val_loss:
            best_val_loss = avg_val_loss
            best_model_wts = copy.deepcopy(model.state_dict())
            # Save best model weights for this architecture to a dedicated file
            torch.save(best_model_wts, checkpoint_path)

        # Early stopping: once the epoch index exceeds 10, stop as soon as the
        # validation loss is higher than in the previous epoch.
        if epoch > 10 and avg_val_loss > val_losses[-2]:
            print(f"[{model_name}] Early stopping at epoch {epoch+1}")
            break

    model.load_state_dict(best_model_wts)
    return train_losses, val_losses

In [None]:
def train_model_with_scheduler(model, train_loader, val_loader, criterion, optimizer, scheduler, epochs=50, model_name="model", save_dir="/Users/hp/Desktop"):
    """Train an autoencoder with an LR scheduler, checkpointing and early stopping.

    Every batch is passed through ``model`` and compared against itself with
    ``criterion``. After each epoch the validation loss is computed and handed to
    ``scheduler.step``; whenever it improves, the weights are remembered and
    written to ``<save_dir>/best_<model_name>_weights.pt``. On return the model
    holds the best weights seen.

    Args:
        model: Module mapping a batch to a reconstruction of the same shape.
        train_loader: Iterable of batches whose first element is the input tensor.
        val_loader: Same format as ``train_loader``, used for validation.
        criterion: Callable ``criterion(output, target)``. The per-epoch averages
            weight each batch by its size, which assumes the criterion returns a
            per-batch mean (e.g. ``nn.MSELoss()``).
        optimizer: Optimizer already bound to ``model``'s parameters.
        scheduler: Scheduler whose ``step`` accepts the epoch's validation loss
            (e.g. ``ReduceLROnPlateau``).
        epochs: Maximum number of epochs.
        model_name: Used in log lines and in the checkpoint file name.
        save_dir: Directory for the checkpoint; created if missing. Defaults to
            the location that was previously hard-coded.

    Returns:
        ``(train_losses, val_losses)``: per-epoch sample-weighted mean losses.
    """
    # Resolve (and create) the checkpoint location up front so a bad path fails
    # before any training time is spent.
    os.makedirs(save_dir, exist_ok=True)
    checkpoint_path = os.path.join(save_dir, f"best_{model_name}_weights.pt")

    best_val_loss = float('inf')
    best_model_wts = copy.deepcopy(model.state_dict())
    train_losses, val_losses = [], []
    for epoch in range(epochs):
        model.train()
        train_loss, train_count = 0.0, 0
        for batch in train_loader:
            batch_data = batch[0]
            optimizer.zero_grad()
            output = model(batch_data)
            loss = criterion(output, batch_data)
            loss.backward()
            optimizer.step()
            # Weight by batch size so a short final batch does not skew the mean.
            train_loss += loss.item() * batch_data.size(0)
            train_count += batch_data.size(0)

        model.eval()
        val_loss, val_count = 0.0, 0
        with torch.no_grad():
            for batch in val_loader:
                batch_data = batch[0]
                output = model(batch_data)
                loss = criterion(output, batch_data)
                val_loss += loss.item() * batch_data.size(0)
                val_count += batch_data.size(0)

        avg_train_loss = train_loss / train_count
        avg_val_loss = val_loss / val_count
        train_losses.append(avg_train_loss)
        val_losses.append(avg_val_loss)

        # The scheduler monitors the validation loss to decide on LR changes.
        scheduler.step(avg_val_loss)
        print(f"[{model_name}] Epoch [{epoch+1}/{epochs}], Train Loss: {avg_train_loss:.6f}, Val Loss: {avg_val_loss:.6f}")

        if avg_val_loss < best_val_loss:
            best_val_loss = avg_val_loss
            best_model_wts = copy.deepcopy(model.state_dict())
            torch.save(best_model_wts, checkpoint_path)

        # Early stopping: once the epoch index exceeds 10, stop as soon as the
        # validation loss is higher than in the previous epoch.
        if epoch > 10 and avg_val_loss > val_losses[-2]:
            print(f"[{model_name}] Early stopping at epoch {epoch+1}")
            break

    model.load_state_dict(best_model_wts)
    return train_losses, val_losses


In [None]:
def compute_reconstruction_error(model, test_loader):
    """Return one mean-squared reconstruction error per sample.

    The squared error is averaged over every non-batch dimension, so the result
    is a 1-D array with one entry per sample whether the batches are shaped
    (batch, features), (batch, channels, length) or (batch, steps, features).
    Averaging over ``dim=1`` only, as before, left the trailing dimensions
    un-reduced for 3-D batches.

    Args:
        model: Module mapping a batch to a reconstruction of the same shape.
        test_loader: Iterable of batches whose first element is the input tensor.

    Returns:
        1-D NumPy array of per-sample MSE values, in loader order (empty if the
        loader yields no batches).
    """
    model.eval()
    errors = []
    with torch.no_grad():
        for batch in test_loader:
            batch_data = batch[0]
            output = model(batch_data)
            squared = (output - batch_data) ** 2
            # Flatten everything except the batch axis, then average per sample.
            per_sample = squared.reshape(squared.size(0), -1).mean(dim=1)
            errors.append(per_sample.cpu().numpy())
    return np.concatenate(errors) if errors else np.array([])


In [None]:
def plot_loss_curves(train_losses, val_losses, title):
    """Plot per-epoch training and validation loss on one set of axes.

    Args:
        train_losses: Sequence of training losses, one per epoch.
        val_losses: Sequence of validation losses, one per epoch.
        title: Figure title.
    """
    fig, ax = plt.subplots(figsize=(10, 5))
    ax.plot(train_losses, label='Train Loss')
    ax.plot(val_losses, label='Validation Loss')
    ax.set_xlabel("Epochs")
    ax.set_ylabel("Loss (MSE)")
    ax.legend()
    ax.set_title(title)
    plt.show()

In [None]:
def plot_reconstruction_error_distribution(errors, title):
    """Draw a 50-bin histogram with KDE overlay of the reconstruction errors.

    Args:
        errors: Array-like of reconstruction errors.
        title: Figure title.
    """
    fig, ax = plt.subplots(figsize=(10, 5))
    sns.histplot(errors, bins=50, kde=True, ax=ax)
    ax.set_xlabel("Reconstruction Error (MSE)")
    ax.set_ylabel("Frequency")
    ax.set_title(title)
    plt.show()

In [None]:
def plot_anomaly_detection(errors, threshold, title):
    """Plot reconstruction errors, the threshold line, and points above it.

    Args:
        errors: NumPy array of reconstruction errors, in sample order.
        threshold: Value above which an error is marked as an anomaly.
        title: Figure title.
    """
    is_anomaly = errors > threshold
    anomaly_positions = np.where(is_anomaly)[0]

    fig, ax = plt.subplots(figsize=(10, 5))
    ax.plot(errors, label="Reconstruction Error")
    ax.axhline(y=threshold, color='r', linestyle='--', label="Threshold (95th percentile)")
    ax.scatter(anomaly_positions, errors[is_anomaly], color='red', label="Anomalies")
    ax.set_xlabel("Test Data Points")
    ax.set_ylabel("Reconstruction Error (MSE)")
    ax.set_title(title)
    ax.legend()
    plt.show()

In [None]:
# Dense autoencoder variants to compare: layer widths (outermost first) and dropout rate.
dense_configs = [
    dict(name="dense_config1", hidden_sizes=[64, 32, 16], dropout_rate=0.2),
    dict(name="dense_config2", hidden_sizes=[128, 64, 32], dropout_rate=0.3),
]

In [None]:
best_dense_val_loss = float('inf')
best_dense_model = None
best_dense_config_name = None
dense_experiment_results = {}
dense_loss_histories = {}  # config name -> (train_losses, val_losses)

# Train every dense configuration and keep the one with the lowest validation loss.
for config in dense_configs:
    print(f"\nTraining Dense Autoencoder with config: {config}")
    model_dense = DenseAutoencoder(input_dim=df_scaled.shape[1],
                                   hidden_sizes=config["hidden_sizes"],
                                   dropout_rate=config["dropout_rate"])
    print("Dense Model Summary:")
    print(summary(model_dense, (1, df_scaled.shape[1])))
    optimizer_dense = optim.Adam(model_dense.parameters(), lr=0.001)
    train_losses_dense, val_losses_dense = train_model(model_dense, train_loader, val_loader, nn.MSELoss(), optimizer_dense, epochs=50, model_name=config["name"])
    dense_loss_histories[config["name"]] = (train_losses_dense, val_losses_dense)
    min_val_loss = min(val_losses_dense)
    dense_experiment_results[config["name"]] = min_val_loss

    if min_val_loss < best_dense_val_loss:
        best_dense_val_loss = min_val_loss
        best_dense_config_name = config["name"]
        best_dense_model = copy.deepcopy(model_dense)
        torch.save(best_dense_model.state_dict(), "/Users/hp/Desktop/best_dense_autoencoder_weights.pt")

# After the loop the history variables would hold the LAST configuration's
# curves. Point them at the selected model's curves so the loss plot further
# down describes the same model that is evaluated.
if best_dense_config_name is not None:
    train_losses_dense, val_losses_dense = dense_loss_histories[best_dense_config_name]

print("\nDense Autoencoder Experiment Results (Validation Loss):")
for cfg, loss in dense_experiment_results.items():
    print(f"Config {cfg}: Best Val Loss = {loss:.6f}")
print("Best Dense Autoencoder saved as 'best_dense_autoencoder_weights.pt'.")

In [None]:
# Conv1D autoencoder variants to compare: encoder channel counts and kernel width.
conv_configs = [
    dict(name="conv_config1", channels=[16, 8], kernel_size=3),
    dict(name="conv_config2", channels=[32, 16], kernel_size=3),
]

In [None]:
best_conv_val_loss = float('inf')
best_conv_model = None
best_conv_config_name = None
conv_experiment_results = {}
conv_loss_histories = {}  # config name -> (train_losses, val_losses)

# Train every Conv1D configuration and keep the one with the lowest validation loss.
for config in conv_configs:
    print(f"\nTraining Conv1D Autoencoder with config: {config}")
    model_conv = Conv1DAutoencoder(input_dim=df_scaled.shape[1],
                                   channels=config["channels"],
                                   kernel_size=config["kernel_size"])
    initialize_weights(model_conv)
    optimizer_conv = optim.Adam(model_conv.parameters(), lr=0.001)
    # `verbose` is deprecated on PyTorch LR schedulers (since 2.2), so it is not
    # passed here; the scheduler still halves the LR after 5 epochs without improvement.
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer_conv, mode='min', patience=5, factor=0.5)
    train_losses_conv, val_losses_conv = train_model_with_scheduler(model_conv, train_loader_conv1d, val_loader_conv1d, nn.MSELoss(), optimizer_conv, scheduler, epochs=50, model_name=config["name"])
    conv_loss_histories[config["name"]] = (train_losses_conv, val_losses_conv)
    min_val_loss = min(val_losses_conv)
    conv_experiment_results[config["name"]] = min_val_loss

    if min_val_loss < best_conv_val_loss:
        best_conv_val_loss = min_val_loss
        best_conv_config_name = config["name"]
        best_conv_model = copy.deepcopy(model_conv)
        torch.save(best_conv_model.state_dict(), "/Users/hp/Desktop/best_conv1d_autoencoder_weights.pt")

# After the loop the history variables would hold the LAST configuration's
# curves. Point them at the selected model's curves so the loss plot further
# down describes the same model that is evaluated.
if best_conv_config_name is not None:
    train_losses_conv, val_losses_conv = conv_loss_histories[best_conv_config_name]

print("\nConv1D Autoencoder Experiment Results (Validation Loss):")
for cfg, loss in conv_experiment_results.items():
    print(f"Config {cfg}: Best Val Loss = {loss:.6f}")
print("Best Conv1D Autoencoder saved as 'best_conv1d_autoencoder_weights.pt'.")

In [None]:
# LSTM autoencoder variants to compare, as (hidden_dim, num_layers) pairs.
lstm_configs = [
    dict(hidden_dim=width, num_layers=depth)
    for width, depth in ((32, 1), (64, 2), (128, 2))
]

In [None]:
best_lstm_val_loss = float('inf')
best_lstm_config = None
best_lstm_config_name = None
best_lstm_model = None
lstm_experiment_results = {}
lstm_loss_histories = {}  # config name -> (train_losses, val_losses)

# Train every LSTM configuration and keep the one with the lowest validation loss.
for config in lstm_configs:
    config_name = f"lstm_hd{config['hidden_dim']}_nl{config['num_layers']}"
    print(f"\nTraining LSTM Autoencoder with config: hidden_dim={config['hidden_dim']}, num_layers={config['num_layers']}")
    model_lstm = LSTMAutoencoder(input_dim=df_scaled.shape[1], hidden_dim=config['hidden_dim'], num_layers=config['num_layers'], seq_length=seq_length)
    print("LSTM Model Summary:")
    print(summary(model_lstm, (1, seq_length, df_scaled.shape[1])))
    optimizer_lstm = optim.Adam(model_lstm.parameters(), lr=0.001)
    train_losses_lstm, val_losses_lstm = train_model(model_lstm, train_loader_lstm, val_loader_lstm, nn.MSELoss(), optimizer_lstm, epochs=50, model_name=config_name)
    lstm_loss_histories[config_name] = (train_losses_lstm, val_losses_lstm)
    min_val_loss = min(val_losses_lstm)
    lstm_experiment_results[config_name] = min_val_loss

    if min_val_loss < best_lstm_val_loss:
        best_lstm_val_loss = min_val_loss
        best_lstm_config = config
        best_lstm_config_name = config_name
        best_lstm_model = copy.deepcopy(model_lstm)

# After the loop the history variables would hold the LAST configuration's
# curves. Point them at the selected model's curves so the loss plot further
# down describes the same model that is evaluated.
if best_lstm_config_name is not None:
    train_losses_lstm, val_losses_lstm = lstm_loss_histories[best_lstm_config_name]

print("\nLSTM Autoencoder Experiment Results (Validation Loss):")
for cfg, loss in lstm_experiment_results.items():
    print(f"Config {cfg}: Best Val Loss = {loss:.6f}")
print("Best LSTM Configuration:", best_lstm_config)

In [None]:
# Persist the weights of the best LSTM configuration found above.
lstm_weights_path = "/Users/hp/Desktop/best_lstm_autoencoder_weights.pt"
best_lstm_state = best_lstm_model.state_dict()
torch.save(best_lstm_state, lstm_weights_path)
print("Best LSTM Autoencoder weights saved as 'best_lstm_autoencoder_weights.pt'.")

In [None]:
criterion = nn.MSELoss()

# Aggregate test loss, per-sample errors, and the cut-off used to flag anomalies.
# NOTE(review): the threshold is the 95th percentile of the test errors themselves,
# so about 5% of test points are flagged by construction - confirm this is intended.
test_loss_dense = compute_test_loss(model=best_dense_model, test_loader=test_loader, criterion=criterion)
reconstruction_errors_dense = compute_reconstruction_error(model=best_dense_model, test_loader=test_loader)
threshold_dense = np.percentile(reconstruction_errors_dense, 95)

print("\n[Dense] Test Loss (MSE):", test_loss_dense)
print("[Dense] Anomaly Detection Threshold (95th percentile):", threshold_dense)

In [None]:
anomalies_dense = reconstruction_errors_dense > threshold_dense
print("[Dense] Number of anomalies detected:", np.sum(anomalies_dense))

# Switch to inference mode explicitly: the dense model contains Dropout and
# BatchNorm, so a forward pass in training mode would give noisy predictions and
# update the running statistics. Doing it here keeps the cell correct no matter
# which cell last touched the model. no_grad avoids building an autograd graph.
best_dense_model.eval()
with torch.no_grad():
    predictions_dense = best_dense_model(test_data).numpy()
test_data_np = test_data.numpy()
# MSLE needs non-negative inputs; shift both arrays so their minimum is at least 1.
shift_value = abs(min(test_data_np.min(), predictions_dense.min())) + 1
mae_dense = mean_absolute_error(test_data_np, predictions_dense)
rmse_dense = np.sqrt(mean_squared_error(test_data_np, predictions_dense))
r2_dense = r2_score(test_data_np, predictions_dense)
msle_dense = mean_squared_log_error(test_data_np + shift_value, predictions_dense + shift_value)
print(f"[Dense] MAE: {mae_dense:.6f}, RMSE: {rmse_dense:.6f}, R²: {r2_dense:.6f}, MSLE: {msle_dense:.6f}")

In [None]:
# Aggregate test loss, per-sample errors, and the cut-off used to flag anomalies.
# NOTE(review): the threshold is the 95th percentile of the test errors themselves,
# so about 5% of test points are flagged by construction - confirm this is intended.
test_loss_conv = compute_test_loss(model=best_conv_model, test_loader=test_loader_conv1d, criterion=criterion)
reconstruction_errors_conv = compute_reconstruction_error(model=best_conv_model, test_loader=test_loader_conv1d)
threshold_conv = np.percentile(reconstruction_errors_conv, 95)

print("\n[Conv1D] Test Loss (MSE):", test_loss_conv)
print("[Conv1D] Anomaly Detection Threshold (95th percentile):", threshold_conv)

In [None]:
anomalies_conv = reconstruction_errors_conv > threshold_conv
print("[Conv1D] Number of anomalies detected:", np.sum(anomalies_conv))

# Reconstruct the whole test set in one pass and drop the channel axis again.
test_data_np_conv = test_data_conv1d.squeeze(1).numpy()
with torch.no_grad():
    conv_reconstruction = best_conv_model(test_data_conv1d.clone())
predictions_conv = conv_reconstruction.squeeze(1).numpy()

# MSLE needs non-negative inputs; shift both arrays so their minimum is at least 1.
shift_value = abs(min(test_data_np_conv.min(), predictions_conv.min())) + 1
mae_conv = mean_absolute_error(test_data_np_conv, predictions_conv)
rmse_conv = np.sqrt(mean_squared_error(test_data_np_conv, predictions_conv))
r2_conv = r2_score(test_data_np_conv, predictions_conv)
msle_conv = mean_squared_log_error(test_data_np_conv + shift_value, predictions_conv + shift_value)
print(f"[Conv1D] MAE: {mae_conv:.6f}, RMSE: {rmse_conv:.6f}, R²: {r2_conv:.6f}, MSLE: {msle_conv:.6f}")

In [None]:
# Aggregate test loss, per-sample errors, and the cut-off used to flag anomalies.
# NOTE(review): the threshold is the 95th percentile of the test errors themselves,
# so about 5% of test windows are flagged by construction - confirm this is intended.
test_loss_lstm = compute_test_loss(model=best_lstm_model, test_loader=test_loader_lstm, criterion=criterion)
reconstruction_errors_lstm = compute_reconstruction_error(model=best_lstm_model, test_loader=test_loader_lstm)
threshold_lstm = np.percentile(reconstruction_errors_lstm, 95)

print("\n[LSTM] Test Loss (MSE):", test_loss_lstm)
print("[LSTM] Anomaly Detection Threshold (95th percentile):", threshold_lstm)

In [None]:
anomalies_lstm = reconstruction_errors_lstm > threshold_lstm
print("[LSTM] Number of anomalies detected:", np.sum(anomalies_lstm))

# Reconstruct every test window in one pass and drop the trailing feature axis.
test_data_np_lstm = test_data_lstm.numpy().squeeze(-1)
with torch.no_grad():
    lstm_reconstruction = best_lstm_model(test_data_lstm.clone())
predictions_lstm = lstm_reconstruction.numpy().squeeze(-1)

# MSLE needs non-negative inputs; shift both arrays so their minimum is at least 1.
shift_value = abs(min(test_data_np_lstm.min(), predictions_lstm.min())) + 1
mae_lstm = mean_absolute_error(test_data_np_lstm, predictions_lstm)
rmse_lstm = np.sqrt(mean_squared_error(test_data_np_lstm, predictions_lstm))
r2_lstm = r2_score(test_data_np_lstm, predictions_lstm)
msle_lstm = mean_squared_log_error(test_data_np_lstm + shift_value, predictions_lstm + shift_value)
print(f"[LSTM] MAE: {mae_lstm:.6f}, RMSE: {rmse_lstm:.6f}, R²: {r2_lstm:.6f}, MSLE: {msle_lstm:.6f}")

In [None]:
# Write the selected LSTM model's weights to their own file as the final artefact.
selected_weights_path = "/Users/hp/Desktop/best_selected_lstm_autoencoder_weights.pt"
selected_lstm_state = best_lstm_model.state_dict()
torch.save(selected_lstm_state, selected_weights_path)
print("Best selected LSTM Autoencoder weights saved as 'best_selected_lstm_autoencoder_weights.pt'.")

In [None]:
# Loss history currently held in train_losses_dense / val_losses_dense.
plot_loss_curves(
    train_losses=train_losses_dense,
    val_losses=val_losses_dense,
    title="Dense Autoencoder - Training & Validation Loss",
)

In [None]:
# Distribution of the dense model's reconstruction errors on the test set.
plot_reconstruction_error_distribution(
    errors=reconstruction_errors_dense,
    title="Dense Autoencoder - Reconstruction Error Distribution",
)

In [None]:
# Test-set errors of the dense model with points above the threshold highlighted.
plot_anomaly_detection(
    errors=reconstruction_errors_dense,
    threshold=threshold_dense,
    title="Dense Autoencoder - Anomaly Detection",
)

In [None]:
# Loss history currently held in train_losses_conv / val_losses_conv.
plot_loss_curves(
    train_losses=train_losses_conv,
    val_losses=val_losses_conv,
    title="Conv1D Autoencoder - Training & Validation Loss",
)

In [None]:
# Distribution of the Conv1D model's reconstruction errors on the test set.
plot_reconstruction_error_distribution(
    errors=reconstruction_errors_conv,
    title="Conv1D Autoencoder - Reconstruction Error Distribution",
)

In [None]:
# Test-set errors of the Conv1D model with points above the threshold highlighted.
plot_anomaly_detection(
    errors=reconstruction_errors_conv,
    threshold=threshold_conv,
    title="Conv1D Autoencoder - Anomaly Detection",
)

In [None]:
# Loss history currently held in train_losses_lstm / val_losses_lstm.
plot_loss_curves(
    train_losses=train_losses_lstm,
    val_losses=val_losses_lstm,
    title="LSTM Autoencoder - Training & Validation Loss",
)

In [None]:
# Distribution of the LSTM model's reconstruction errors on the test windows.
plot_reconstruction_error_distribution(
    errors=reconstruction_errors_lstm,
    title="LSTM Autoencoder - Reconstruction Error Distribution",
)

In [None]:
# Test-window errors of the LSTM model with points above the threshold highlighted.
plot_anomaly_detection(
    errors=reconstruction_errors_lstm,
    threshold=threshold_lstm,
    title="LSTM Autoencoder - Anomaly Detection",
)

Here I have chosen the 95th percentile as the threshold.

**Final Architecture**

- The final autoencoder we selected is the LSTM autoencoder. It uses the best configuration discovered during experimentation: 2 LSTM layers with a hidden dimension of 128. In this model, the encoder processes input sequences of 10 time steps and compresses the temporal features into a 128-dimensional hidden representation, which captures both short-term and long-term dependencies in the CPU utilization data.
- The decoder mirrors the encoder, using 2 LSTM layers to reconstruct the original sequence from the learned representation. A fully connected layer then maps the decoder output back to the original feature space. We have not included dropout in this configuration, but it can be added if regularization becomes necessary.

**Results and Analysis**

- We have evaluated the performance of the LSTM autoencoder using reconstruction metrics such as Mean Absolute Error (MAE), Root Mean Squared Error (RMSE), R2, and Mean Squared Log Error (MSLE).
- The training and validation loss curves show smooth convergence, which indicates that the model was able to learn the temporal dynamics effectively. We set the anomaly threshold at the 95th percentile of the reconstruction error distribution, which enables the detection of unusual patterns in CPU utilization.
- One issue was that the model occasionally under-predicted sudden spikes in CPU usage. Hyperparameter tuning is therefore important: increasing the hidden dimension to 128 and using 2 layers can improve the model’s capacity to capture complex temporal patterns.
- R² is very high, and I think that is fine: the 95th-percentile threshold already allows for some level of reconstruction error, and even so some points are not reconstructed close to their actual values. The good thing is that the overall reconstruction error is very low.

**Strength and Limitations**

- Autoencoders such as LSTM-based models are well suited to unsupervised anomaly detection: they learn a compressed representation of normal behavior without requiring labeled anomalies. This makes them highly effective when anomalies are rare or vary widely in appearance. Because they reconstruct the input data, instances with high reconstruction error can be flagged as potential anomalies.
- However, choosing an appropriate threshold for anomaly detection is challenging and can lead to false positives or false negatives. Additionally, the performance of autoencoders is sensitive to hyperparameter choices. Here, increasing capacity (a hidden dimension of 128 and multiple layers) enhances pattern recognition, but it also requires careful tuning to avoid overfitting and to ensure that subtle anomalies, such as short-lived spikes, do not cause issues.

**References:** <br>

https://matplotlib.org/stable/plot_types/index.html <br>
https://numpy.org/doc/stable/user/index.html#user <br>
https://seaborn.pydata.org/tutorial/introduction.html <br>
https://pytorch.org/docs/stable/index.html <br>
https://scikit-learn.org/stable/ <br>
https://pandas.pydata.org/docs/user_guide/index.html#user-guide <br>
https://arxiv.org/pdf/2003.05991