In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
import numpy as np
import matplotlib.pyplot as plt
import torch
from torch.utils.data import DataLoader, TensorDataset
import torch.nn as nn
import torch.optim as optim
import logging
import os

# ==============================
# Logging Configuration
# ==============================
logging.basicConfig(
    level=logging.DEBUG,  # Set the logging level
    format='%(asctime)s - %(levelname)s - %(message)s',  # Define the format
    handlers=[
        logging.FileHandler("system_log.log"),  # Save logs to a file
        logging.StreamHandler()  # Also display logs on the console
    ]
)
data_path = '../data/'

# Load the datasets
dos_data = pd.read_csv(os.path.join(data_path, 'DoS_dataset.csv'), nrows=1000) 
fuzzy_data = pd.read_csv(os.path.join(data_path, 'Fuzzy_dataset.csv'),nrows=1000)
gear_data = pd.read_csv(os.path.join(data_path, 'gear_dataset.csv'), nrows=1000)
rpm_data = pd.read_csv(os.path.join(data_path, 'RPM_dataset.csv'), nrows=1000)

# Concatenate datasets
data = pd.concat([dos_data, fuzzy_data, gear_data, rpm_data], axis=0)
logging.info(f"Data loaded and concatenated. Total rows: {data.shape[0]}, Total columns: {data.shape[1]}")

# Separate numeric and categorical columns
numeric_features = data.select_dtypes(include=['float64', 'int64']).columns
categorical_features = data.select_dtypes(include=['object']).columns

# Define preprocessing for numeric features (impute missing values with mean, scale them)
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),  # Fill missing values with the mean
    ('scaler', StandardScaler())])  # Normalize the numeric features

# Define preprocessing for categorical features (impute missing values with mode, one-hot encode them)
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),  # Fill missing values with the most frequent value (mode)
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))])  # One-hot encode the categorical features

# Combine both transformers into a ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)])

# Apply transformations to the data
data_preprocessed = preprocessor.fit_transform(data)
logging.info(f"Data preprocessing complete. Shape after preprocessing: {data_preprocessed.shape}")

# Convert the processed data into a PyTorch tensor
data_tensor = torch.tensor(data_preprocessed, dtype=torch.float32)

# Create DataLoader
batch_size = 64
dataset = TensorDataset(data_tensor)
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)
logging.info(f"DataLoader created with batch size {batch_size}.")

# Define the Autoencoder model
class Autoencoder(nn.Module):
    def __init__(self, input_size):
        super(Autoencoder, self).__init__()
        # Encoder: Compressing input data
        self.encoder = nn.Sequential(
            nn.Linear(input_size, 64),
            nn.ReLU(),
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Linear(32, 16)
        )
        # Decoder: Reconstructing the original data
        self.decoder = nn.Sequential(
            nn.Linear(16, 32),
            nn.ReLU(),
            nn.Linear(32, 64),
            nn.ReLU(),
            nn.Linear(64, input_size),
            nn.Sigmoid()  # Using sigmoid to bring values between 0 and 1
        )

    def forward(self, x):
        x = self.encoder(x)
        x = self.decoder(x)
        return x

# Initialize the Autoencoder
input_size = data_preprocessed.shape[1]  # Number of features (columns) after preprocessing
model = Autoencoder(input_size)
logging.info(f"Autoencoder initialized with input size {input_size}.")

# Loss function and optimizer
criterion = nn.MSELoss()  # Mean Squared Error Loss for reconstruction error
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training the Autoencoder
num_epochs = 50
for epoch in range(num_epochs):
    for batch in dataloader:
        inputs = batch[0]  # DataLoader returns a tuple
        # Zero the parameter gradients
        optimizer.zero_grad()
        # Forward pass
        outputs = model(inputs)
        loss = criterion(outputs, inputs)
        # Backward pass and optimization
        loss.backward()
        optimizer.step()

    logging.info(f"Epoch {epoch+1}/{num_epochs}, Loss: {loss.item()}")

# Function to calculate reconstruction errors
def calculate_reconstruction_errors(data_loader, model):
    model.eval()  # Set the model to evaluation mode
    reconstruction_errors = []
    for batch in data_loader:
        inputs = batch[0]
        outputs = model(inputs)
        loss = criterion(outputs, inputs)
        reconstruction_errors.append(loss.item())  # Collect the reconstruction error for each batch
    logging.info("Reconstruction errors calculated.")
    return reconstruction_errors

# Step 2: Calculate reconstruction errors for the training data
reconstruction_errors = calculate_reconstruction_errors(dataloader, model)

# Step 3: Dynamically calculate the threshold based on the mean and standard deviation of reconstruction errors
mean_error = np.mean(reconstruction_errors)
std_error = np.std(reconstruction_errors)

# Set threshold as mean + 2 standard deviations
threshold = mean_error + 2 * std_error
logging.info(f"Dynamically calculated threshold based on data: {threshold}")

# Function to detect anomalies and get their corresponding row indices and reconstruction error values
def detect_anomalies_with_values(data_loader, model, threshold):
    model.eval()  # Set the model to evaluation mode
    anomalies = []
    anomaly_indices = []  # List to store indices of the anomalies
    anomaly_values = []  # List to store reconstruction errors (anomaly values)
    idx = 0  # Index counter for the data rows
    for batch in data_loader:
        inputs = batch[0]
        outputs = model(inputs)
        loss = criterion(outputs, inputs)
        reconstruction_error = loss.item()  # Reconstruction error for this batch
        
        # If the reconstruction error exceeds the threshold, flag as anomaly
        if reconstruction_error > threshold:
            anomalies.append(inputs)
            anomaly_indices.append(idx)  # Store the index of the anomaly
            anomaly_values.append(reconstruction_error)  # Store the value of the anomaly (reconstruction error)
        idx += 1  # Increment index counter
    
    logging.info(f"Anomalies detected at rows (indices): {anomaly_indices}")
    logging.info(f"Anomaly values (reconstruction errors): {anomaly_values}")
    return anomalies, anomaly_indices, anomaly_values

# Detect anomalies, their indices, and values
anomalies, anomaly_indices, anomaly_values = detect_anomalies_with_values(dataloader, model, threshold)

# Print the number of anomalies detected, their corresponding row indices, and their reconstruction error values
print(f"Number of anomalies detected: {len(anomalies)}")
print(f"Anomalies detected at rows (indices): {anomaly_indices}")
print(f"Anomaly values (reconstruction errors): {anomaly_values}")

# ==============================
# Mitigation Section with Logging
# ==============================

# A. Log Anomalies
def log_anomalies(anomaly_indices, anomaly_values, data):
    # Log anomalies with details
    with open("anomaly_log.txt", "w") as log_file:
        for i, idx in enumerate(anomaly_indices):
            log_file.write(f"Anomaly at row {idx}, Error: {anomaly_values[i]}, Data: {data[idx]}\n")
    logging.info("Anomalies logged.")

log_anomalies(anomaly_indices, anomaly_values, data_tensor)

# B. Alert Driver if critical anomaly is detected
def alert_driver(message):
    logging.warning(f"Driver Alert: {message}")

# If critical anomaly is detected (e.g., anomaly in sensor data, ECU data), alert driver
for idx in anomaly_indices:
    alert_driver(f"Critical anomaly detected at row {idx}")

# C. Isolate the ECU if compromised (pseudo-code)
def isolate_ecu(ecu_id):
    logging.error(f"ECU {ecu_id} is isolated due to suspicious activity.")

# Example: Isolate a specific ECU if anomaly is related to that ECU (replace 'ecu_id' with real data)
isolate_ecu('ECU_1')

# D. Trigger Safe Mode in case of critical failure
def trigger_safe_mode():
    logging.critical("Entering safe mode due to anomaly detection...")

# Trigger safe mode for severe anomalies
if len(anomaly_indices) > 5:  # Example condition: more than 5 anomalies trigger safe mode
    trigger_safe_mode()


2024-09-20 16:49:37,727 - INFO - Data loaded and concatenated. Total rows: 4000, Total columns: 32
2024-09-20 16:49:38,089 - INFO - Data preprocessing complete. Shape after preprocessing: (4000, 1665)
2024-09-20 16:49:38,147 - INFO - DataLoader created with batch size 64.
2024-09-20 16:49:38,160 - INFO - Autoencoder initialized with input size 1665.
2024-09-20 16:49:42,579 - INFO - Epoch 1/50, Loss: 0.008496733382344246
2024-09-20 16:49:44,489 - INFO - Epoch 2/50, Loss: 0.009959322400391102
2024-09-20 16:49:46,236 - INFO - Epoch 3/50, Loss: 0.008872109465301037
2024-09-20 16:49:48,614 - INFO - Epoch 4/50, Loss: 0.00819880049675703
2024-09-20 16:49:51,338 - INFO - Epoch 5/50, Loss: 0.007909242063760757
2024-09-20 16:49:53,030 - INFO - Epoch 6/50, Loss: 0.007878273725509644
2024-09-20 16:49:54,629 - INFO - Epoch 7/50, Loss: 0.006869259290397167
2024-09-20 16:49:56,276 - INFO - Epoch 8/50, Loss: 0.007425455842167139
2024-09-20 16:49:57,627 - INFO - Epoch 9/50, Loss: 0.010191013105213642
2

Number of anomalies detected: 5
Anomalies detected at rows (indices): [13, 19, 37, 43, 47]
Anomaly values (reconstruction errors): [0.007195169106125832, 0.007422026712447405, 0.00728395814076066, 0.007373359519988298, 0.00749355461448431]
