In [6]:
import pandas as pd
import os
from sklearn.preprocessing import StandardScaler, OneHotEncoder
import torch
from torch.utils.data import DataLoader, TensorDataset
import torch.nn as nn
import torch.optim as optim
from scipy import sparse
from sklearn.model_selection import train_test_split
import logging
import matplotlib.pyplot as plt
import numpy as np

# Function to set up logging
def setup_logging(debug=False):
    logger = logging.getLogger()
    logger.setLevel(logging.INFO)
    
    file_handler = logging.FileHandler('../../logs/data_preprocessing_anomaly_detection1.log')
    file_handler.setLevel(logging.INFO)
    
    formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
    file_handler.setFormatter(formatter)
    
    logger.addHandler(file_handler)

    if debug:
        console_handler = logging.StreamHandler()
        console_handler.setLevel(logging.INFO)
        console_handler.setFormatter(formatter)
        logger.addHandler(console_handler)
    else:
        logger.info("Console logging is disabled; only logging to file.")

setup_logging(debug=True)  # Set to True to enable console logging, False for file only

logging.info('Starting data preprocessing for anomaly detection.')

# Define the path to your data folder
data_path = "../../data/"

# Load only the first 1000 rows of each dataset
try:
    dos_data = pd.read_csv(os.path.join(data_path, 'DoS_dataset.csv'), nrows=1000)
    fuzzy_data = pd.read_csv(os.path.join(data_path, 'Fuzzy_dataset.csv'), nrows=1000)
    gear_data = pd.read_csv(os.path.join(data_path, 'gear_dataset.csv'), nrows=1000)
    rpm_data = pd.read_csv(os.path.join(data_path, 'RPM_dataset.csv'), nrows=1000)
    logging.info('Datasets loaded successfully with the first 1000 rows from each file.')
except Exception as e:
    logging.error(f"Error loading datasets: {e}")

# Combine datasets into a single DataFrame
combined_data = pd.concat([dos_data, fuzzy_data, gear_data, rpm_data], ignore_index=True)
logging.info('Datasets combined successfully.')

# Check for NaN or infinite values and handle them
if combined_data.isna().sum().sum() > 0:
    logging.warning('Dataset contains NaN values. Filling with mean.')
    combined_data.fillna(combined_data.mean(), inplace=True)

# Check for infinite values and replace them
combined_data.replace([np.inf, -np.inf], np.nan, inplace=True)
combined_data.fillna(combined_data.mean(), inplace=True)

logging.info("Checked for NaN and infinite values in the dataset.")

# Separate numeric and categorical columns
numeric_columns = combined_data.select_dtypes(include=['number']).columns
categorical_columns = combined_data.select_dtypes(exclude=['number']).columns

logging.info(f"Numeric columns: {numeric_columns}")
logging.info(f"Categorical columns: {categorical_columns}")

# Handle categorical data using One-Hot Encoding with sparse matrix output
if len(categorical_columns) > 0:
    encoder = OneHotEncoder(sparse_output=True, drop='first')  # Keep output sparse to reduce memory usage
    categorical_encoded = encoder.fit_transform(combined_data[categorical_columns])
    
    # Convert numeric data into a sparse format
    numeric_data = combined_data[numeric_columns]
    numeric_data_sparse = sparse.csr_matrix(numeric_data.values)
    
    # Concatenate sparse categorical and numeric data
    combined_data_sparse = sparse.hstack([numeric_data_sparse, categorical_encoded])
    logging.info('Categorical columns encoded using One-Hot Encoding (sparse format).')
else:
    combined_data_sparse = sparse.csr_matrix(combined_data[numeric_columns].values)
    logging.info('No categorical columns to encode.')

# Normalize the numeric data (can only apply normalization to the numeric columns)
scaler = StandardScaler(with_mean=False)  # with_mean=False is needed for sparse matrices
numeric_data_scaled = scaler.fit_transform(combined_data[numeric_columns])
logging.info('Numeric data normalized using StandardScaler.')

# Convert scaled numeric data to sparse matrix and concatenate with the sparse categorical data
numeric_data_scaled_sparse = sparse.csr_matrix(numeric_data_scaled)
combined_data_encoded = sparse.hstack([numeric_data_scaled_sparse, categorical_encoded])

# Convert the sparse matrix to a dense format for PyTorch tensor (for training)
X_dense = combined_data_encoded.toarray()
logging.info('Data concatenated and converted to dense format for PyTorch.')

# Convert to PyTorch tensors
X_tensor = torch.tensor(X_dense, dtype=torch.float32)
logging.info('Data converted to PyTorch tensors.')

# Step 1: Train-Test Split
X_train, X_test = train_test_split(X_tensor, test_size=0.2, random_state=42)
logging.info(f"Data split into training and test sets with a test size of 20%.")

# Create DataLoaders for both training and test sets
train_dataset = TensorDataset(X_train)
test_dataset = TensorDataset(X_test)

train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)
logging.info('DataLoader created for both training and test sets.')

# Step 2: Define the Autoencoder Model
class Autoencoder(nn.Module):
    def __init__(self, input_dim):
        super(Autoencoder, self).__init__()
        # Encoder
        self.encoder = nn.Sequential(
            nn.Linear(input_dim, 128),
            nn.ReLU(),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Linear(64, 32)
        )
        # Decoder
        self.decoder = nn.Sequential(
            nn.Linear(32, 64),
            nn.ReLU(),
            nn.Linear(64, 128),
            nn.ReLU(),
            nn.Linear(128, input_dim),
            nn.Sigmoid()  # Sigmoid to ensure outputs are between 0 and 1
        )
    
    def forward(self, x):
        encoded = self.encoder(x)
        decoded = self.decoder(encoded)
        return decoded

# Initialize the autoencoder model
input_dim = X_tensor.shape[1]  # Number of input features
autoencoder = Autoencoder(input_dim)
logging.info(f"Autoencoder model initialized with input dimension: {input_dim}.")

# Step 3: Define Loss Function and Optimizer
criterion = nn.MSELoss()  # Mean Squared Error for reconstruction loss
optimizer = optim.Adam(autoencoder.parameters(), lr=0.001)
logging.info('Loss function and optimizer set up.')

# Train the Autoencoder
n_epochs = 50
for epoch in range(n_epochs):
    running_loss = 0.0
    for data in train_loader:
        inputs = data[0]
        
        # Forward pass
        outputs = autoencoder(inputs)
        loss = criterion(outputs, inputs)  # Compare output with input
        
        # Check if loss is NaN
        if torch.isnan(loss):
            logging.error(f"NaN loss encountered at epoch {epoch + 1}. Stopping training.")
            break
        
        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        running_loss += loss.item()
    
    logging.info(f'Epoch [{epoch+1}/{n_epochs}], Loss: {running_loss/len(train_loader)}')
    print(f'Epoch [{epoch+1}/{n_epochs}], Loss: {running_loss/len(train_loader)}')

logging.info('Training complete.')

# Step 4: Anomaly Detection
def detect_anomalies(data_loader, model):
    model.eval()
    reconstruction_errors = []
    with torch.no_grad():
        for data in data_loader:
            inputs = data[0]
            outputs = model(inputs)
            reconstruction_error = torch.mean((outputs - inputs) ** 2, dim=1)
            reconstruction_errors += reconstruction_error.cpu().numpy().tolist()
    
    return np.array(reconstruction_errors)

# Step 5: Dynamic Threshold Setting using Percentile
def determine_threshold(reconstruction_errors, percentile=95):
    # Ensure we don't have NaN reconstruction errors
    reconstruction_errors = reconstruction_errors[~np.isnan(reconstruction_errors)]
    threshold = np.percentile(reconstruction_errors, percentile)
    logging.info(f"Dynamic threshold set at the {percentile}th percentile: {threshold}")
    return threshold

# Plot reconstruction errors and mark the threshold
def plot_reconstruction_errors(reconstruction_errors, threshold):
    # Ensure no NaN values
    clean_reconstruction_errors = reconstruction_errors[~np.isnan(reconstruction_errors)]
    
    plt.hist(clean_reconstruction_errors, bins=50, alpha=0.7, label='Reconstruction Error')
    plt.axvline(x=threshold, color='r', linestyle='--', label=f'Threshold ({threshold:.4f})')
    plt.xlabel('Reconstruction Error')
    plt.ylabel('Number of Data Points')
    plt.title('Reconstruction Errors and Anomaly Threshold')
    plt.legend()
    plt.show()

# Call the anomaly detection and threshold determination
reconstruction_errors = detect_anomalies(test_loader, autoencoder)
threshold = determine_threshold(reconstruction_errors, percentile=95)

# Plot the reconstruction errors with the dynamic threshold
plot_reconstruction_errors(reconstruction_errors, threshold)

# Detect anomalies using the dynamically determined threshold
anomalies = reconstruction_errors > threshold
num_anomalies = np.sum(anomalies)
print(f"Number of anomalies detected: {num_anomalies}")
logging.info(f"Number of anomalies detected: {num_anomalies}")


2024-09-19 12:12:23,352 - INFO - Starting data preprocessing for anomaly detection.
2024-09-19 12:12:23,352 - INFO - Starting data preprocessing for anomaly detection.
2024-09-19 12:12:23,394 - INFO - Datasets loaded successfully with the first 1000 rows from each file.
2024-09-19 12:12:23,394 - INFO - Datasets loaded successfully with the first 1000 rows from each file.
2024-09-19 12:12:23,406 - INFO - Datasets combined successfully.
2024-09-19 12:12:23,406 - INFO - Datasets combined successfully.


TypeError: can only concatenate str (not "int") to str