# In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
import numpy as np
import logging
import os

# ------------------------------
# Logging setup
# ------------------------------
# Root-logger records at DEBUG level and above are sent to two handlers:
# a file named "system_log.log" and a stream handler (stderr by default).
_log_handlers = [
    logging.FileHandler("system_log.log"),
    logging.StreamHandler(),
]
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',  # timestamp - level - text
    handlers=_log_handlers,
    level=logging.DEBUG,
)

# Directory that holds the CSV files, relative to the working directory.
data_path = "../../data/"

# Only the first 1000 rows of each file are read.
dos_data = pd.read_csv(os.path.join(data_path, 'DoS_dataset.csv'), nrows=1000)
fuzzy_data = pd.read_csv(os.path.join(data_path, 'Fuzzy_dataset.csv'), nrows=1000)
gear_data = pd.read_csv(os.path.join(data_path, 'gear_dataset.csv'), nrows=1000)
rpm_data = pd.read_csv(os.path.join(data_path, 'RPM_dataset.csv'), nrows=1000)

# Stack the four frames row-wise into a single frame.
# (A preliminary concat of dos_data alone used to sit here; its result was
# overwritten immediately, so it has been removed.)
# NOTE: ignore_index is not passed, so each frame keeps its own row labels
# and the labels repeat in `data`; label-based lookups can match several rows.
# NOTE(review): a recorded run of this cell logged 32 columns for the combined
# frame. pd.concat aligns on column names and fills the gaps with NaN, so if
# the four files are meant to share one schema, confirm that they carry
# identical header rows.
data = pd.concat([dos_data, fuzzy_data, gear_data, rpm_data], axis=0)
logging.info(
    "Data loaded and concatenated. Total rows: %s, Total columns: %s",
    data.shape[0],
    data.shape[1],
)

# Split the columns by dtype so each group gets its own preprocessing.
# 'number' selects every numeric dtype (int32, float32, unsigned ints, ...),
# not just int64/float64. Columns matched by neither selector are not listed
# in the ColumnTransformer below and are therefore dropped by its default
# remainder='drop', so a too-narrow dtype list loses columns silently.
numeric_features = data.select_dtypes(include='number').columns
categorical_features = data.select_dtypes(include=['object']).columns

# Numeric columns: fill missing values with the column mean, then
# standardize with StandardScaler.
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

# Object columns: fill missing values with the most frequent value, then
# one-hot encode. Categories not seen during fit are ignored at transform
# time, and the encoder returns a dense array (sparse_output=False).
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
])

# Route each column group through its own pipeline.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features),
    ])

# Fit the preprocessing on `data` and transform it in one step.
data_preprocessed = preprocessor.fit_transform(data)
logging.info(f"Data preprocessing complete. Shape after preprocessing: {data_preprocessed.shape}")

# Output:
# 2024-09-20 16:36:51,989 - INFO - Data loaded and concatenated. Total rows: 4000, Total columns: 32
# 2024-09-20 16:36:52,335 - INFO - Data preprocessing complete. Shape after preprocessing: (4000, 1665)
