In [7]:
import pandas as pd
import os
from sklearn.preprocessing import StandardScaler, OneHotEncoder
import torch
from torch.utils.data import DataLoader, TensorDataset
import logging

# Set up logging configuration
logging.basicConfig(
    filename='../logs/data_preprocessing.log',  
    level=logging.INFO,                         
    format='%(asctime)s - %(levelname)s - %(message)s',  
)

logging.info('Starting unsupervised learning data preprocessing.')

# Define the path to your data folder
data_path = "../data/"

# Load the datasets
try:
    dos_data = pd.read_csv(os.path.join(data_path, 'DoS_dataset.csv'))
    fuzzy_data = pd.read_csv(os.path.join(data_path, 'Fuzzy_dataset.csv'))
    gear_data = pd.read_csv(os.path.join(data_path, 'gear_dataset.csv'))
    rpm_data = pd.read_csv(os.path.join(data_path, 'RPM_dataset.csv'))
    logging.info('Datasets loaded successfully.')
except Exception as e:
    logging.error(f"Error loading datasets: {e}")

# Combine datasets into a single DataFrame
combined_data = pd.concat([dos_data, fuzzy_data, gear_data, rpm_data], ignore_index=True)
logging.info('Datasets combined successfully.')

# Separate numeric and categorical columns
numeric_columns = combined_data.select_dtypes(include=['number']).columns
categorical_columns = combined_data.select_dtypes(exclude=['number']).columns

logging.info(f"Numeric columns: {numeric_columns}")
logging.info(f"Categorical columns: {categorical_columns}")

# Handle categorical data using One-Hot Encoding
if len(categorical_columns) > 0:
    encoder = OneHotEncoder(sparse_output=False, drop='first')  # Updated argument
    categorical_encoded = encoder.fit_transform(combined_data[categorical_columns])
    categorical_encoded_df = pd.DataFrame(categorical_encoded, columns=encoder.get_feature_names_out(categorical_columns))
    
    # Concatenate the encoded categorical data with numeric data
    combined_data_encoded = pd.concat([combined_data[numeric_columns], categorical_encoded_df], axis=1)
    logging.info('Categorical columns encoded using One-Hot Encoding.')
else:
    combined_data_encoded = combined_data[numeric_columns]
    logging.info('No categorical columns to encode.')

# Normalize the combined data
scaler = StandardScaler()
X_scaled = scaler.fit_transform(combined_data_encoded)
logging.info('Data normalized using StandardScaler.')

# Convert to PyTorch tensors
X_tensor = torch.tensor(X_scaled, dtype=torch.float32)
logging.info('Data converted to PyTorch tensors.')

# Create DataLoader
dataset = TensorDataset(X_tensor)
data_loader = DataLoader(dataset, batch_size=64, shuffle=True)
logging.info(f'DataLoader created with batch size of 64. Total batches: {len(data_loader)}')

# Print summary and log completion
print(f"Number of data points: {len(X_tensor)}")
print(f"Number of batches in DataLoader: {len(data_loader)}")
logging.info('Data preprocessing completed successfully.')


MemoryError: Unable to allocate 838. GiB for an array with shape (16569471, 6790) and data type float64

In [10]:
import pandas as pd
import os
from sklearn.preprocessing import StandardScaler, OneHotEncoder
import torch
from torch.utils.data import DataLoader, TensorDataset
import logging
from scipy import sparse

# Set up logging configuration
logging.basicConfig(
    filename='../logs/data_preprocessing1.log',  
    level=logging.INFO,                         
    format='%(asctime)s - %(levelname)s - %(message)s',  
)

logging.info('Starting unsupervised learning data preprocessing.')

# Define the path to your data folder
data_path = "../data/"

# Load the datasets
try:
    dos_data = pd.read_csv(os.path.join(data_path, 'DoS_dataset.csv'))
    fuzzy_data = pd.read_csv(os.path.join(data_path, 'Fuzzy_dataset.csv'))
    gear_data = pd.read_csv(os.path.join(data_path, 'gear_dataset.csv'))
    rpm_data = pd.read_csv(os.path.join(data_path, 'RPM_dataset.csv'))
    logging.info('Datasets loaded successfully.')
except Exception as e:
    logging.error(f"Error loading datasets: {e}")

# Combine datasets into a single DataFrame
combined_data = pd.concat([dos_data, fuzzy_data, gear_data, rpm_data], ignore_index=True)
logging.info('Datasets combined successfully.')

# Separate numeric and categorical columns
numeric_columns = combined_data.select_dtypes(include=['number']).columns
categorical_columns = combined_data.select_dtypes(exclude=['number']).columns

logging.info(f"Numeric columns: {numeric_columns}")
logging.info(f"Categorical columns: {categorical_columns}")

# Handle categorical data using One-Hot Encoding with sparse matrix output
if len(categorical_columns) > 0:
    encoder = OneHotEncoder(sparse_output=True, drop='first')  # Keep output sparse to reduce memory usage
    categorical_encoded = encoder.fit_transform(combined_data[categorical_columns])
    
    # Convert numeric data into a sparse format
    numeric_data = combined_data[numeric_columns]
    numeric_data_sparse = sparse.csr_matrix(numeric_data.values)
    
    # Concatenate sparse categorical and numeric data
    combined_data_sparse = sparse.hstack([numeric_data_sparse, categorical_encoded])
    logging.info('Categorical columns encoded using One-Hot Encoding (sparse format).')
else:
    combined_data_sparse = sparse.csr_matrix(combined_data[numeric_columns].values)
    logging.info('No categorical columns to encode.')

# Normalize the numeric data (can only apply normalization to the numeric columns)
scaler = StandardScaler(with_mean=False)  # with_mean=False is needed for sparse matrices
numeric_data_scaled = scaler.fit_transform(combined_data[numeric_columns])

# Convert scaled numeric data to sparse matrix and concatenate with the sparse categorical data
numeric_data_scaled_sparse = sparse.csr_matrix(numeric_data_scaled)
combined_data_encoded = sparse.hstack([numeric_data_scaled_sparse, categorical_encoded])

# Convert the sparse matrix to a dense format for PyTorch tensor (for training)
X_dense = combined_data_encoded.toarray()

# Convert to PyTorch tensors
X_tensor = torch.tensor(X_dense, dtype=torch.float32)
logging.info('Data converted to PyTorch tensors.')

# Create DataLoader for batch processing
dataset = TensorDataset(X_tensor)
data_loader = DataLoader(dataset, batch_size=64, shuffle=True)
logging.info(f'DataLoader created with batch size of 64. Total batches: {len(data_loader)}')

# Print summary and log completion
print(f"Number of data points: {len(X_tensor)}")
print(f"Number of batches in DataLoader: {len(data_loader)}")
logging.info('Data preprocessing completed successfully.')


MemoryError: Unable to allocate 3.33 GiB for an array with shape (27, 16569471) and data type object

In [14]:
import dask.dataframe as dd
from sklearn.preprocessing import StandardScaler, OneHotEncoder
import torch
from torch.utils.data import DataLoader, TensorDataset
from scipy import sparse
import logging

# Set up logging configuration
logging.basicConfig(
    filename='../logs/data_preprocessing.log',  
    level=logging.INFO,                         
    format='%(asctime)s - %(levelname)s - %(message)s',  
)

logging.info('Starting unsupervised learning data preprocessing.')

# Define the path to your data folder
data_path = "../data/"

# Load the datasets using Dask for efficient memory usage
try:
    dos_data = dd.read_csv(os.path.join(data_path, 'DoS_dataset.csv'))
    fuzzy_data = dd.read_csv(os.path.join(data_path, 'Fuzzy_dataset.csv'))
    gear_data = dd.read_csv(os.path.join(data_path, 'gear_dataset.csv'))
    rpm_data = dd.read_csv(os.path.join(data_path, 'RPM_dataset.csv'))
    logging.info('Datasets loaded successfully using Dask.')
except Exception as e:
    logging.error(f"Error loading datasets: {e}")

# Combine datasets into a single Dask DataFrame
combined_data = dd.concat([dos_data, fuzzy_data, gear_data, rpm_data])
logging.info('Datasets combined successfully.')

# Check column types to identify numeric and categorical columns
numeric_columns = combined_data.select_dtypes(include=['number']).columns
categorical_columns = combined_data.select_dtypes(exclude=['number']).columns

logging.info(f"Numeric columns: {numeric_columns}")
logging.info(f"Categorical columns: {categorical_columns}")

# Convert Dask DataFrame to a Pandas DataFrame for processing
combined_data_pandas = combined_data.compute()  # This will load the entire dataset into memory, but processed in chunks

# Handle categorical data using One-Hot Encoding with sparse matrix output
if len(categorical_columns) > 0:
    encoder = OneHotEncoder(sparse_output=True, drop='first')  # Keep output sparse to reduce memory usage
    categorical_encoded = encoder.fit_transform(combined_data_pandas[categorical_columns])
    
    # Convert numeric data into a sparse format
    numeric_data = combined_data_pandas[numeric_columns]
    numeric_data_sparse = sparse.csr_matrix(numeric_data.values)
    
    # Concatenate sparse categorical and numeric data
    combined_data_sparse = sparse.hstack([numeric_data_sparse, categorical_encoded])
    logging.info('Categorical columns encoded using One-Hot Encoding (sparse format).')
else:
    combined_data_sparse = sparse.csr_matrix(combined_data_pandas[numeric_columns].values)
    logging.info('No categorical columns to encode.')

# Normalize the numeric data (can only apply normalization to the numeric columns)
scaler = StandardScaler(with_mean=False)  # with_mean=False is needed for sparse matrices
numeric_data_scaled = scaler.fit_transform(combined_data_pandas[numeric_columns])

# Convert scaled numeric data to sparse matrix and concatenate with the sparse categorical data
numeric_data_scaled_sparse = sparse.csr_matrix(numeric_data_scaled)
combined_data_encoded = sparse.hstack([numeric_data_scaled_sparse, categorical_encoded])

# Convert the sparse matrix to a dense format for PyTorch tensor (for training)
X_dense = combined_data_encoded.toarray()

# Convert to PyTorch tensors
X_tensor = torch.tensor(X_dense, dtype=torch.float32)
logging.info('Data converted to PyTorch tensors.')

# Create DataLoader for batch processing
dataset = TensorDataset(X_tensor)
data_loader = DataLoader(dataset, batch_size=64, shuffle=True)
logging.info(f'DataLoader created with batch size of 64. Total batches: {len(data_loader)}')

# Print summary and log completion
print(f"Number of data points: {len(X_tensor)}")
print(f"Number of batches in DataLoader: {len(data_loader)}")
logging.info('Data preprocessing completed successfully.')


ValueError: Mismatched dtypes found in `pd.read_csv`/`pd.read_table`.

+--------+--------+----------+
| Column | Found  | Expected |
+--------+--------+----------+
| 00.1   | object | int64    |
| 00.4   | object | float64  |
+--------+--------+----------+

The following columns also raised exceptions on conversion:

- 00.1
  ValueError("invalid literal for int() with base 10: '7f'")
- 00.4
  ValueError("could not convert string to float: 'd1'")

Usually this is due to dask's dtype inference failing, and
*may* be fixed by specifying dtypes manually by adding:

dtype={'00.1': 'object',
       '00.4': 'object'}

to the call to `read_csv`/`read_table`.

In [16]:
import dask.dataframe as dd
from sklearn.preprocessing import StandardScaler, OneHotEncoder
import torch
from torch.utils.data import DataLoader, TensorDataset
from scipy import sparse
import logging

# Set up logging configuration
logging.basicConfig(
    filename='../logs/data_preprocessing.log',  
    level=logging.INFO,                         
    format='%(asctime)s - %(levelname)s - %(message)s',  
)

logging.info('Starting unsupervised learning data preprocessing.')

# Define the path to your data folder
data_path = "../data/"

# Specify the correct dtypes for problematic columns
dtype_spec = {
    '00.1': 'object',  # Treat this column as a string
    '00.4': 'object'   # Treat this column as a string
}

# Load the datasets using Dask for efficient memory usage
try:
    dos_data = dd.read_csv(os.path.join(data_path, 'DoS_dataset.csv'), dtype=dtype_spec)
    fuzzy_data = dd.read_csv(os.path.join(data_path, 'Fuzzy_dataset.csv'), dtype=dtype_spec)
    gear_data = dd.read_csv(os.path.join(data_path, 'gear_dataset.csv'), dtype=dtype_spec)
    rpm_data = dd.read_csv(os.path.join(data_path, 'RPM_dataset.csv'), dtype=dtype_spec)
    logging.info('Datasets loaded successfully using Dask with specified dtypes.')
except Exception as e:
    logging.error(f"Error loading datasets: {e}")

# Combine datasets into a single Dask DataFrame
combined_data = dd.concat([dos_data, fuzzy_data, gear_data, rpm_data])
logging.info('Datasets combined successfully.')

# Check column types to identify numeric and categorical columns
numeric_columns = combined_data.select_dtypes(include=['number']).columns
categorical_columns = combined_data.select_dtypes(exclude=['number']).columns

logging.info(f"Numeric columns: {numeric_columns}")
logging.info(f"Categorical columns: {categorical_columns}")

# Convert Dask DataFrame to a Pandas DataFrame for processing
combined_data_pandas = combined_data.compute()  # This will load the entire dataset into memory, but processed in chunks

# Handle categorical data using One-Hot Encoding with sparse matrix output
if len(categorical_columns) > 0:
    encoder = OneHotEncoder(sparse_output=True, drop='first')  # Keep output sparse to reduce memory usage
    categorical_encoded = encoder.fit_transform(combined_data_pandas[categorical_columns])
    
    # Convert numeric data into a sparse format
    numeric_data = combined_data_pandas[numeric_columns]
    numeric_data_sparse = sparse.csr_matrix(numeric_data.values)
    
    # Concatenate sparse categorical and numeric data
    combined_data_sparse = sparse.hstack([numeric_data_sparse, categorical_encoded])
    logging.info('Categorical columns encoded using One-Hot Encoding (sparse format).')
else:
    combined_data_sparse = sparse.csr_matrix(combined_data_pandas[numeric_columns].values)
    logging.info('No categorical columns to encode.')

# Normalize the numeric data (can only apply normalization to the numeric columns)
scaler = StandardScaler(with_mean=False)  # with_mean=False is needed for sparse matrices
numeric_data_scaled = scaler.fit_transform(combined_data_pandas[numeric_columns])

# Convert scaled numeric data to sparse matrix and concatenate with the sparse categorical data
numeric_data_scaled_sparse = sparse.csr_matrix(numeric_data_scaled)
combined_data_encoded = sparse.hstack([numeric_data_scaled_sparse, categorical_encoded])

# Convert the sparse matrix to a dense format for PyTorch tensor (for training)
X_dense = combined_data_encoded.toarray()

# Convert to PyTorch tensors
X_tensor = torch.tensor(X_dense, dtype=torch.float32)
logging.info('Data converted to PyTorch tensors.')

# Create DataLoader for batch processing
dataset = TensorDataset(X_tensor)
data_loader = DataLoader(dataset, batch_size=64, shuffle=True)
logging.info(f'DataLoader created with batch size of 64. Total batches: {len(data_loader)}')

# Print summary and log completion
print(f"Number of data points: {len(X_tensor)}")
print(f"Number of batches in DataLoader: {len(data_loader)}")
logging.info('Data preprocessing completed successfully.')


ValueError: Mismatched dtypes found in `pd.read_csv`/`pd.read_table`.

+--------+--------+----------+
| Column | Found  | Expected |
+--------+--------+----------+
| 00.2   | object | int64    |
+--------+--------+----------+

The following columns also raised exceptions on conversion:

- 00.2
  ValueError("invalid literal for int() with base 10: 'd0'")

Usually this is due to dask's dtype inference failing, and
*may* be fixed by specifying dtypes manually by adding:

dtype={'00.2': 'object'}

to the call to `read_csv`/`read_table`.

In [3]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# Step 1: Load the DoS dataset
# Assuming DoS_dataset.csv contains data with numeric columns related to network activity
data = pd.read_csv('../data/DoS_dataset.csv')

# Step 2: Preprocessing - Fill missing values and scale data
# Fill missing values (if any)
data.fillna(0, inplace=True)

# Scale the data
scaler = StandardScaler()
scaled_data = scaler.fit_transform(data)

# Step 3: Split the data into training and testing sets
X_train, X_test = train_test_split(scaled_data, test_size=0.2, random_state=42)

# Convert data to PyTorch tensors
X_train = torch.tensor(X_train, dtype=torch.float32)
X_test = torch.tensor(X_test, dtype=torch.float32)

# Step 4: Define the Autoencoder model in PyTorch
class Autoencoder(nn.Module):
    def __init__(self, input_dim, encoding_dim):
        super(Autoencoder, self).__init__()
        # Encoder: Compress the input data
        self.encoder = nn.Sequential(
            nn.Linear(input_dim, encoding_dim),
            nn.ReLU(True),
            nn.Linear(encoding_dim, 7),
            nn.ReLU(True)
        )
        # Decoder: Reconstruct the input data
        self.decoder = nn.Sequential(
            nn.Linear(7, encoding_dim),
            nn.ReLU(True),
            nn.Linear(encoding_dim, input_dim),
            nn.Sigmoid()  # Use sigmoid for reconstruction between 0 and 1
        )

    def forward(self, x):
        x = self.encoder(x)
        x = self.decoder(x)
        return x

# Step 5: Initialize model, loss function, and optimizer
input_dim = X_train.shape[1]
encoding_dim = 14  # Compression factor (adjust as needed)
model = Autoencoder(input_dim, encoding_dim)
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=1e-3)

# Step 6: Train the Autoencoder model
num_epochs = 50
batch_size = 32

for epoch in range(num_epochs):
    model.train()
    for i in range(0, X_train.shape[0], batch_size):
        batch = X_train[i:i + batch_size]
        
        # Forward pass
        outputs = model(batch)
        loss = criterion(outputs, batch)
        
        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    
    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')

# Step 7: Anomaly detection - Calculate reconstruction error on the test set
model.eval()  # Switch to evaluation mode
with torch.no_grad():
    reconstructions = model(X_test)
    mse = torch.mean((X_test - reconstructions) ** 2, dim=1)

# Step 8: Set a threshold for anomaly detection
threshold = torch.mean(mse) + 2 * torch.std(mse)

# Step 9: Detect anomalies (samples with errors exceeding the threshold)
anomalies = mse > threshold

print(f"Detected {torch.sum(anomalies).item()} anomalies out of {mse.shape[0]} samples.")


ValueError: could not convert string to float: '018f'

In [None]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# Load the dataset
data = pd.read_csv('../data/DoS_dataset.csv')

# Step 1: Remove non-numeric columns
# This will automatically drop any columns that contain non-numeric data
numeric_data = data.select_dtypes(include=[np.number])

# Step 2: Check if any rows or columns were dropped
print(f"Original dataset shape: {data.shape}")
print(f"Numeric dataset shape: {numeric_data.shape}")

# Step 3: Scale the numeric data
scaler = StandardScaler()
scaled_data = scaler.fit_transform(numeric_data)

# Step 4: Split the data into training and testing sets
X_train, X_test = train_test_split(scaled_data, test_size=0.2, random_state=42)

# Convert data to PyTorch tensors
X_train = torch.tensor(X_train, dtype=torch.float32)
X_test = torch.tensor(X_test, dtype=torch.float32)

# Define the Autoencoder model
class Autoencoder(nn.Module):
    def __init__(self, input_dim, encoding_dim):
        super(Autoencoder, self).__init__()
        self.encoder = nn.Sequential(
            nn.Linear(input_dim, encoding_dim),
            nn.ReLU(True),
            nn.Linear(encoding_dim, 7),
            nn.ReLU(True)
        )
        self.decoder = nn.Sequential(
            nn.Linear(7, encoding_dim),
            nn.ReLU(True),
            nn.Linear(encoding_dim, input_dim),
            nn.Sigmoid()
        )

    def forward(self, x):
        x = self.encoder(x)
        x = self.decoder(x)
        return x

# Initialize model, loss function, and optimizer
input_dim = X_train.shape[1]
encoding_dim = 14
model = Autoencoder(input_dim, encoding_dim)
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=1e-3)

# Train the Autoencoder
num_epochs = 50
batch_size = 32

for epoch in range(num_epochs):
    model.train()
    for i in range(0, X_train.shape[0], batch_size):
        batch = X_train[i:i + batch_size]
        outputs = model(batch)
        loss = criterion(outputs, batch)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    
    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')

# Anomaly detection
model.eval()
with torch.no_grad():
    reconstructions = model(X_test)
    mse = torch.mean((X_test - reconstructions) ** 2, dim=1)

# Set threshold for anomaly detection
threshold = torch.mean(mse) + 2 * torch.std(mse)
anomalies = mse > threshold

print(f"Detected {torch.sum(anomalies).item()} anomalies out of {mse.shape[0]} samples.")



Original dataset shape: (3665770, 12)
Numeric dataset shape: (3665770, 2)
Epoch [1/50], Loss: 0.2633
Epoch [2/50], Loss: 0.2634
Epoch [3/50], Loss: 0.2633
Epoch [4/50], Loss: 0.2633
Epoch [5/50], Loss: 0.2633
Epoch [6/50], Loss: 0.2633
Epoch [7/50], Loss: 0.2633
Epoch [8/50], Loss: 0.2633
Epoch [9/50], Loss: 0.2633
Epoch [10/50], Loss: 0.2633
Epoch [11/50], Loss: 0.2633
Epoch [12/50], Loss: 0.2633
Epoch [13/50], Loss: 0.2633
Epoch [14/50], Loss: 0.2633
Epoch [15/50], Loss: 0.2633
Epoch [16/50], Loss: 0.2633
Epoch [17/50], Loss: 0.2633
Epoch [18/50], Loss: 0.2633
Epoch [19/50], Loss: 0.2633
Epoch [20/50], Loss: 0.2633
Epoch [21/50], Loss: 0.2633
Epoch [22/50], Loss: 0.2633
Epoch [23/50], Loss: 0.2633
Epoch [24/50], Loss: 0.2633
Epoch [25/50], Loss: 0.2633
Epoch [26/50], Loss: 0.2633
Epoch [27/50], Loss: 0.2633
Epoch [28/50], Loss: 0.2633
Epoch [29/50], Loss: 0.2633
Epoch [30/50], Loss: 0.2633
Epoch [31/50], Loss: 0.2633
Epoch [32/50], Loss: 0.2633
Epoch [33/50], Loss: 0.2633
Epoch [34/5