# Evaluation with Ottawa feature data

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
from sklearn.metrics import classification_report
import joblib

# Read the Ottawa feature table. The path is relative, so it is resolved
# against the current working directory of the kernel.
ottawa_data = pd.read_csv("Ottawa_features.csv")

# Split the table into the target vector (the "AnomalyLabel" column) and the
# feature matrix (every remaining column), both as NumPy arrays.
y_ottawa = ottawa_data["AnomalyLabel"].values
X_ottawa = ottawa_data.drop("AnomalyLabel", axis=1).values


In [3]:
#  Transformer Autoencoder 
class TransformerAutoencoder(nn.Module):
    """Autoencoder that reconstructs feature vectors via a Transformer encoder.

    Pipeline: linear projection to ``hidden_dim * num_heads`` features ->
    stack of pre-norm ``nn.TransformerEncoderLayer`` -> two-layer MLP decoder
    back to ``input_dim`` features.

    Args:
        input_dim: Number of features per sample.
        hidden_dim: Per-head width; the model width is ``hidden_dim * num_heads``.
        num_heads: Number of attention heads in every encoder layer.
        num_layers: Number of stacked encoder layers.
        dropout_rate: Dropout probability passed to every encoder layer.
    """

    def __init__(self, input_dim, hidden_dim, num_heads, num_layers, dropout_rate):
        super().__init__()
        # Multiplying by num_heads guarantees the width is divisible by it.
        model_width = hidden_dim * num_heads
        self.input_projection = nn.Linear(input_dim, model_width)

        encoder_layer = nn.TransformerEncoderLayer(
            d_model=model_width,
            nhead=num_heads,
            dropout=dropout_rate,
            norm_first=True,
        )
        self.encoder = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)

        decoder_hidden = input_dim * 2
        self.decoder = nn.Sequential(
            nn.Linear(model_width, decoder_hidden),
            nn.ReLU(),
            nn.Linear(decoder_hidden, input_dim),
        )

    def forward(self, x):
        """Reconstruct ``x``; the output has the same shape as the input.

        For a 2-D input ``(n_samples, input_dim)`` the tensor is given a
        leading axis of size 1 before the encoder and that axis is removed
        afterwards. ``batch_first`` is left at its default (False in PyTorch),
        so that leading axis is the sequence axis: every sample is a
        length-1 sequence and is encoded independently of the other rows.
        """
        projected = self.input_projection(x)
        encoded = self.encoder(projected.unsqueeze(0)).squeeze(0)
        return self.decoder(encoded)

In [4]:
# Diff transformer
class DiffTransformerEncoderLayer(nn.Module):
    """Encoder layer using differential attention: ``(A1 - lambda * A2) @ V``.

    Args:
        d_model: Feature width of the layer input and output.
        nhead: Stored on the instance; ``forward`` does not read it, so the
            attention computed here is single-head.
        dim_feedforward: Hidden width of the feed-forward sub-layer.
        dropout: Dropout probability used in the feed-forward sub-layer and
            on both residual branches.
        lambda_init: Initial value of the learnable scalar ``lambda_scalar``
            that weights the subtracted attention map.

    NOTE(review): the two halves of ``qk_proj`` are each used as BOTH the
    query and the key of their attention map (Q1 == K1, Q2 == K2). Changing
    this would invalidate existing checkpoints, so it is only flagged here.
    """

    def __init__(self, d_model, nhead, dim_feedforward=2048, dropout=0.1, lambda_init=0.8):
        super().__init__()
        self.d_model = d_model
        self.nhead = nhead
        self.lambda_scalar = nn.Parameter(torch.tensor(lambda_init, requires_grad=True))

        # Projection layers: one linear map yields both query/key halves.
        self.qk_proj = nn.Linear(d_model, 2 * d_model)
        self.v_proj = nn.Linear(d_model, d_model)
        self.out_proj = nn.Linear(d_model, d_model)

        # Feed-forward network and layer norms.
        self.ffn = nn.Sequential(
            nn.Linear(d_model, dim_feedforward),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(dim_feedforward, d_model),
        )
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, src):
        """Apply differential attention and the feed-forward sub-layer.

        Attention is taken over the second-to-last axis of ``src``. For a
        2-D input ``(rows, d_model)`` that means every row attends to every
        other row of the same tensor.
        """
        # Split the joint projection into two halves; each half serves as the
        # query and the key of its own attention map.
        first_half, second_half = self.qk_proj(src).chunk(2, dim=-1)
        values = self.v_proj(src)

        scale = self.d_model ** 0.5
        attn_primary = torch.softmax(
            first_half @ first_half.transpose(-2, -1) / scale, dim=-1
        )
        attn_secondary = torch.softmax(
            second_half @ second_half.transpose(-2, -1) / scale, dim=-1
        )

        # Differential attention: subtract the lambda-weighted second map.
        mixed = (attn_primary - self.lambda_scalar * attn_secondary) @ values
        mixed = self.out_proj(mixed)

        # LayerNorm is applied to each sub-layer's output before it is added
        # back onto the residual stream.
        src = src + self.dropout(self.norm1(mixed))
        src = src + self.dropout(self.norm2(self.ffn(src)))
        return src

# diff autoencoder
class DiffTransformerAutoencoder(nn.Module):
    """Autoencoder built from a stack of ``DiffTransformerEncoderLayer``.

    Pipeline: linear projection to ``hidden_dim * num_heads`` features ->
    ``num_layers`` differential-attention layers -> single linear decoder
    back to ``input_dim`` features.

    Args:
        input_dim: Number of features per sample.
        hidden_dim: Per-head width; the model width is ``hidden_dim * num_heads``.
        num_heads: Forwarded to every encoder layer as ``nhead``.
        num_layers: Number of stacked encoder layers.
    """

    def __init__(self, input_dim, hidden_dim, num_heads, num_layers):
        super().__init__()
        model_width = hidden_dim * num_heads
        self.input_projection = nn.Linear(input_dim, model_width)

        # Each layer keeps its default feed-forward width and dropout.
        layers = []
        for _ in range(num_layers):
            layers.append(DiffTransformerEncoderLayer(d_model=model_width, nhead=num_heads))
        self.encoder = nn.ModuleList(layers)

        self.decoder = nn.Linear(model_width, input_dim)

    def forward(self, x):
        """Reconstruct ``x``; the output has the same shape as the input.

        The projected tensor is passed to the layers as-is, with no extra
        axis added. NOTE(review): for a 2-D ``(n_samples, input_dim)`` input
        the layers therefore attend across samples, so one sample's
        reconstruction depends on the other rows passed in the same call.
        """
        hidden = self.input_projection(x)
        for block in self.encoder:
            hidden = block(hidden)
        return self.decoder(hidden)

In [5]:
# Load Transformer autoencoder.
# The hyper-parameters passed here have to match the ones the checkpoint was
# trained with, otherwise load_state_dict() fails on mismatched shapes.
#
# torch.load() options:
#   * map_location="cpu" lets a checkpoint that was saved on a GPU be read on
#     a CPU-only machine; load_state_dict() copies the values into the model,
#     which can be moved to another device afterwards.
#   * weights_only=True restricts unpickling to tensors and plain containers,
#     so a tampered .pth file cannot execute arbitrary code, and it removes
#     the FutureWarning torch.load() emits when the flag is left unset.
transformer_model = TransformerAutoencoder(
    input_dim=X_ottawa.shape[1],
    hidden_dim=128,
    num_heads=4,
    num_layers=2,
    dropout_rate=0.1,
)
transformer_model.load_state_dict(
    torch.load("Transformer.pth", map_location="cpu", weights_only=True)
)
transformer_model.eval()  # inference mode: disables dropout

# Load DiffTransformer autoencoder (same loading options as above).
diff_transformer_model = DiffTransformerAutoencoder(
    input_dim=X_ottawa.shape[1],
    hidden_dim=128,
    num_heads=4,
    num_layers=2,
)
diff_transformer_model.load_state_dict(
    torch.load("Diff_Transformer.pth", map_location="cpu", weights_only=True)
)
diff_transformer_model.eval()  # inference mode: disables dropout


  transformer_model.load_state_dict(torch.load("Transformer.pth"))
  diff_transformer_model.load_state_dict(torch.load("Diff_Transformer.pth"))


DiffTransformerAutoencoder(
  (input_projection): Linear(in_features=10, out_features=512, bias=True)
  (encoder): ModuleList(
    (0-1): 2 x DiffTransformerEncoderLayer(
      (qk_proj): Linear(in_features=512, out_features=1024, bias=True)
      (v_proj): Linear(in_features=512, out_features=512, bias=True)
      (out_proj): Linear(in_features=512, out_features=512, bias=True)
      (ffn): Sequential(
        (0): Linear(in_features=512, out_features=2048, bias=True)
        (1): ReLU()
        (2): Dropout(p=0.1, inplace=False)
        (3): Linear(in_features=2048, out_features=512, bias=True)
      )
      (norm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
      (norm2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
  )
  (decoder): Linear(in_features=512, out_features=10, bias=True)
)

In [6]:
# Choose the GPU when one is available, then place the data and both models
# on that device.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
X_ottawa_tensor = torch.tensor(X_ottawa, dtype=torch.float32, device=device)
transformer_model.to(device)
diff_transformer_model.to(device)

# Reconstruct the whole dataset in a single forward pass per model; no
# gradients are needed for evaluation.
# NOTE(review): DiffTransformerEncoderLayer attends across the rows of a 2-D
# input, so the DiffTransformer reconstructions depend on which samples are
# passed together in this one call.
with torch.no_grad():
    transformer_reconstructed = transformer_model(X_ottawa_tensor)
    diff_transformer_reconstructed = diff_transformer_model(X_ottawa_tensor)

# Per-sample reconstruction error: mean squared difference over the features.
transformer_errors = (
    (X_ottawa_tensor - transformer_reconstructed).pow(2).mean(dim=1).cpu().numpy()
)
diff_transformer_errors = (
    (X_ottawa_tensor - diff_transformer_reconstructed).pow(2).mean(dim=1).cpu().numpy()
)

# Standardise each error vector to zero mean and unit variance. This is a
# monotonic rescaling, so the ranking of samples by error is unchanged.
transformer_errors = (
    transformer_errors - transformer_errors.mean()
) / transformer_errors.std()
diff_transformer_errors = (
    diff_transformer_errors - diff_transformer_errors.mean()
) / diff_transformer_errors.std()


In [7]:
# Threshold selection via the ROC curve: for each model pick the threshold
# that maximises Youden's J statistic (TPR - FPR).
# NOTE(review): the thresholds are tuned on y_ottawa, the same labels the
# predictions are scored against afterwards, so the reported metrics are
# optimistic.
from sklearn.metrics import roc_curve

# Transformer
fpr, tpr, thresholds = roc_curve(y_ottawa, transformer_errors)
optimal_idx = (tpr - fpr).argmax()
optimal_threshold_transformer = thresholds[optimal_idx]

# DiffTransformer (fpr / tpr / thresholds / optimal_idx are overwritten)
fpr, tpr, thresholds = roc_curve(y_ottawa, diff_transformer_errors)
optimal_idx = (tpr - fpr).argmax()
optimal_threshold_diff_transformer = thresholds[optimal_idx]

In [8]:
# Convert errors into anomaly predictions (1 = anomaly) using the optimal
# threshold. roc_curve() defines each operating point as "score >= threshold",
# so the comparison must be inclusive; a strict ">" would label the samples
# sitting exactly on the selected threshold as healthy and no longer
# reproduce the TPR/FPR the threshold was chosen for.
transformer_preds = (transformer_errors >= optimal_threshold_transformer).astype(int)
diff_transformer_preds = (diff_transformer_errors >= optimal_threshold_diff_transformer).astype(int)


In [9]:
from sklearn.metrics import classification_report, confusion_matrix

# Print precision / recall / F1 per class for both models, Transformer first.
# target_names are assigned to the labels in sorted order; this presumably
# means 0 -> "Healthy" and 1 -> "Faulty" (verify against the label encoding).
report_inputs = (
    ("Transformer performance matrics:", transformer_preds),
    ("DiffTransformer performance matrics:", diff_transformer_preds),
)
for report_heading, report_preds in report_inputs:
    print(report_heading)
    print(classification_report(y_ottawa, report_preds, target_names=["Healthy", "Faulty"]))


Transformer performance matrics:
              precision    recall  f1-score   support

     Healthy       0.54      0.79      0.64       780
      Faulty       0.60      0.32      0.42       780

    accuracy                           0.55      1560
   macro avg       0.57      0.55      0.53      1560
weighted avg       0.57      0.55      0.53      1560

DiffTransformer performance matrics:
              precision    recall  f1-score   support

     Healthy       0.64      0.99      0.78       780
      Faulty       0.99      0.44      0.60       780

    accuracy                           0.71      1560
   macro avg       0.81      0.71      0.69      1560
weighted avg       0.81      0.71      0.69      1560



The DiffTransformer achieves a Healthy recall of 0.99, meaning it correctly identifies nearly all healthy samples, whereas the Transformer's Healthy recall is significantly lower (0.79). The DiffTransformer's Faulty precision is also 0.99, meaning that when it predicts a sample as faulty, the prediction is almost always correct. Faulty recall is lower for both models (0.44 vs. 0.32), but the DiffTransformer still performs better than the Transformer. The overall accuracy of the DiffTransformer is 0.71, compared with 0.55 for the Transformer, which indicates that the DiffTransformer outperforms the Transformer in this evaluation. 

As for the DiffTransformer's performance in general, its errors are still relatively large, mainly because the distribution of the Ottawa dataset differs significantly from that of the NASA dataset. Fine-tuning the model will be necessary to achieve better performance.  