In [None]:
# Import libraries
import os
import sys

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import seaborn as sns
from plotly.subplots import make_subplots

# Put the parent directory on sys.path before the `src.*` imports below so they
# can be resolved (presumably the notebook lives one level below the project root).
sys.path.append('..')

from src.data.loader import DataLoader
from src.data.preprocessing import DataPreprocessor
from src.utils.helpers import load_config

sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (12, 6)

%matplotlib inline

## Load Data

In [None]:
# Read the project configuration from the YAML file.
config = load_config('../config/config.yaml')

# Read the raw attacks CSV through the project's DataLoader.
loader = DataLoader()
df = loader.load_dataset('../data/raw/cybersecurity_attacks.csv')
print(f"Original dataset shape: {df.shape}")

# Use 'attack_type' as the label when that column exists; otherwise fall back
# to the last column of the frame.
if 'attack_type' in df.columns:
    target_col = 'attack_type'
else:
    target_col = df.columns[-1]
print(f"Target column: {target_col}")

## Stage 1: Remove Redundant Columns

In [None]:
preprocessor = DataPreprocessor()

# Stage 1: run the redundant-column removal on a copy so that `df` itself is
# left untouched for the cells that reuse it later. The two thresholds are
# passed straight through to DataPreprocessor.remove_redundant_columns.
df_stage1 = preprocessor.remove_redundant_columns(
    df.copy(),
    correlation_threshold=0.95,
    variance_threshold=0.01
)

# Columns present in the input frame but absent from the stage-1 output.
removed_cols = set(df.columns) - set(df_stage1.columns)
print(f"Removed {len(removed_cols)} columns")
print(f"Remaining: {df_stage1.shape[1]} columns")

if removed_cols:
    # Show the removed columns in their original column order. Iterating the
    # set directly gives an arbitrary order (string hashing is randomised per
    # interpreter session), so the truncated preview used to change from one
    # kernel restart to the next.
    removed_in_order = list(
        dict.fromkeys(col for col in df.columns if col in removed_cols)
    )
    if len(removed_cols) > 10:
        print(f"\nRemoved columns: {removed_in_order[:10]}...")
    else:
        print(f"\nRemoved columns: {removed_in_order}")

## Stage 2: Encode Categorical Features

In [None]:
# Collect the object/category-typed columns of the stage-1 frame.
categorical_dtypes = ['object', 'category']
categorical_cols = df_stage1.select_dtypes(include=categorical_dtypes).columns.tolist()

# The target column is left out of the list handed to the encoder.
if target_col in categorical_cols:
    categorical_cols.remove(target_col)

# Preview at most five column names to keep the output short.
if len(categorical_cols) > 5:
    print(f"Categorical columns: {categorical_cols[:5]}...")
else:
    print(f"Categorical columns: {categorical_cols}")

# Encode on a copy; the call returns the encoded frame and the encoders.
df_stage2, encoders = preprocessor.encode_categorical(
    df_stage1.copy(), categorical_columns=categorical_cols, method='onehot'
)

# Net change in column count between the stage-1 and stage-2 frames.
n_added_features = df_stage2.shape[1] - df_stage1.shape[1]
print(f"\nShape after encoding: {df_stage2.shape}")
print(f"Added {n_added_features} encoded features")

## Stage 3: Handle Missing Values

In [None]:
# Total number of missing cells across the whole stage-2 frame.
missing_before = df_stage2.isnull().sum().sum()
print(f"Missing values before: {missing_before}")

# Impute on a copy, asking the preprocessor for the 'mean' strategy.
df_stage3 = preprocessor.handle_missing_values(
    df_stage2.copy(),
    strategy='mean'
)

# Recount after imputation; the difference is what was actually filled in.
missing_after = df_stage3.isnull().sum().sum()
print(f"Missing values after: {missing_after}")
print(f"✅ Imputed {missing_before - missing_after} missing values")

## Stage 4 & 5: Detect and Handle Outliers

In [None]:
# Ask the preprocessor to flag outlier rows with the 'isolation_forest' method;
# contamination=0.1 is passed straight through to the detector.
outlier_mask = preprocessor.detect_outliers(
    df_stage3.copy(), method='isolation_forest', contamination=0.1
)

# Number of flagged rows and their share of the stage-3 frame.
n_outliers = outlier_mask.sum()
outlier_pct = n_outliers / len(df_stage3) * 100
print(f"Detected {n_outliers} outlier samples ({outlier_pct:.2f}%)")

# Bar chart of normal vs. flagged rows; skipped when nothing was flagged.
if n_outliers > 0:
    fig, ax = plt.subplots(figsize=(10, 6))
    ax.bar(
        ['Normal', 'Outlier'],
        [len(df_stage3) - n_outliers, n_outliers],
        color=['green', 'red'],
        alpha=0.7,
    )
    ax.set_title('Outlier Detection Results')
    ax.set_ylabel('Count')
    plt.show()

In [None]:
# Stage 5: treat the rows flagged by the detection cell. The 'cap' method is
# chosen so that outliers are capped instead of being removed.
outlier_handling = {'outlier_mask': outlier_mask, 'method': 'cap'}
df_stage5 = preprocessor.handle_outliers(df_stage3.copy(), **outlier_handling)

print(f"Shape after handling outliers: {df_stage5.shape}")

## Stage 6: Standardize Features

In [None]:
# Split the stage-5 frame into the label series and the feature frame.
y = df_stage5[target_col]
X = df_stage5.drop(columns=[target_col])

# Summary statistics of the first three feature columns before scaling.
print("Before standardization:")
first_three_raw = X.iloc[:, :3]
print(first_three_raw.describe())

# Scale a copy of the features; the call also returns the fitted scaler object.
X_scaled, scaler = preprocessor.standardize_features(X.copy(), method='standard')

# Same three columns after scaling, re-wrapped with the original column names.
print("\nAfter standardization:")
scaled_preview = pd.DataFrame(X_scaled, columns=X.columns)
print(scaled_preview.iloc[:, :3].describe())

In [None]:
# Side-by-side boxplots of the first (up to) five features, before and after
# standardization.
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Scaled values wrapped in a frame carrying the original column names.
X_scaled_df = pd.DataFrame(X_scaled, columns=X.columns)

n_shown = min(5, X.shape[1])
panels = [
    (axes[0], X, 'Before Standardization'),
    (axes[1], X_scaled_df, 'After Standardization'),
]
for ax, frame, panel_title in panels:
    ax.boxplot(
        [frame.iloc[:, i] for i in range(n_shown)],
        labels=X.columns[:5],
    )
    ax.set_title(panel_title)
    ax.set_ylabel('Value')
    ax.tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.show()

## Stage 7: Handle Class Imbalance

In [None]:
# Per-class sample counts before any resampling.
class_counts = y.value_counts()
print("Class distribution before balancing:")
print(class_counts)
print(f"\nImbalance ratio: {class_counts.max() / class_counts.min():.2f}:1")

# Resample with the 'smote' method.
# NOTE(review): this runs on the full scaled data set, and the train/test split
# only happens in the later "Split Data" cell on X_balanced/y_balanced. If the
# method generates synthetic rows (as SMOTE normally does), some of them will
# end up in the test split - confirm this is intended, or split first and
# resample the training portion only.
X_balanced, y_balanced = preprocessor.handle_class_imbalance(
    X_scaled, y, method='smote'
)

print("\nClass distribution after balancing:")
balanced_counts = pd.Series(y_balanced).value_counts()
print(balanced_counts)

In [None]:
# Two bar charts side by side: class counts before and after resampling.
fig = make_subplots(
    rows=1,
    cols=2,
    subplot_titles=('Before SMOTE', 'After SMOTE'),
    specs=[[{'type': 'bar'}, {'type': 'bar'}]],
)

counts_before = y.value_counts()
counts_after = pd.Series(y_balanced).value_counts()

# One (counts, colour, trace name) entry per subplot column, left to right.
bar_specs = [
    (counts_before, 'lightcoral', 'Before'),
    (counts_after, 'lightgreen', 'After'),
]
for col_idx, (counts, bar_color, trace_name) in enumerate(bar_specs, start=1):
    fig.add_trace(
        go.Bar(
            x=counts.index.astype(str),  # class labels as strings on the x axis
            y=counts.values,
            marker_color=bar_color,
            name=trace_name,
        ),
        row=1,
        col=col_idx,
    )

fig.update_layout(title_text='Class Balance Comparison', showlegend=False)
fig.show()

## Stage 8: Split Data

In [None]:
# Hold out 20% of the balanced data, requesting a stratified split.
# NOTE(review): X_balanced/y_balanced come out of the resampling cell, so any
# synthetic rows created there can land in the test split - verify that this is
# acceptable for the evaluation.
split_options = {'test_size': 0.2, 'stratify': True}
X_train, X_test, y_train, y_test = preprocessor.split_data(
    X_balanced, y_balanced, **split_options
)

print(f"Training set: {X_train.shape}")
print(f"Test set: {X_test.shape}")

# Per-class counts in each split.
print(f"\nTraining class distribution:")
train_class_counts = pd.Series(y_train).value_counts()
print(train_class_counts)

print(f"\nTest class distribution:")
test_class_counts = pd.Series(y_test).value_counts()
print(test_class_counts)

## 🎯 Complete Pipeline Execution

In [None]:
# Run the whole preprocessing pipeline in a single call on a fresh
# DataPreprocessor, using the same settings as the step-by-step cells above.
print("Running complete 7-stage pipeline...\n")

pipeline_settings = {
    'target_column': target_col,
    'correlation_threshold': 0.95,
    'variance_threshold': 0.01,
    'encoding_method': 'onehot',
    'missing_strategy': 'mean',
    'outlier_detection_method': 'isolation_forest',
    'outlier_handling_method': 'cap',
    'scaling_method': 'standard',
    'balance_method': 'smote',
    'test_size': 0.2,
}

preprocessor_full = DataPreprocessor()
# Pass a copy so the raw `df` is not modified by the pipeline.
X_train_full, X_test_full, y_train_full, y_test_full = preprocessor_full.run_pipeline(
    df.copy(),
    **pipeline_settings
)

print("\n✅ Pipeline complete!")
print(f"Training set: {X_train_full.shape}")
print(f"Test set: {X_test_full.shape}")

## 📊 Preprocessing Summary

In [None]:
# Build a two-column summary table (stage name, what happened) from the values
# computed in the step-by-step cells above.

# Report the number of values actually filled in, matching the figure printed
# in the missing-values cell, rather than the raw pre-imputation count.
imputed_count = missing_before - missing_after

summary_rows = [
    ('1. Remove Redundant', f'Removed {len(removed_cols)} columns'),
    ('2. Encode Categorical', f'Encoded {len(categorical_cols)} features'),
    ('3. Handle Missing', f'Imputed {imputed_count} values'),
    ('4-5. Handle Outliers', f'Handled {n_outliers} outliers'),
    ('6. Standardize', f'Standardized {X.shape[1]} features'),
    ('7. Balance Classes', f'SMOTE: {len(y)} → {len(y_balanced)} samples'),
    ('8. Split Data', f'80/20 split: {len(y_train)}/{len(y_test)}'),
]

# Keep the column-oriented `summary` dict available under its original name.
summary = {
    'Stage': [stage for stage, _ in summary_rows],
    'Action': [action for _, action in summary_rows],
}

summary_df = pd.DataFrame(summary)
display(summary_df)

## 💾 Save Preprocessed Data

In [None]:
# Persist the splits produced by the full-pipeline run as .npy files.
# (`os` is imported in the notebook's import cell at the top.)
processed_dir = '../data/processed'
os.makedirs(processed_dir, exist_ok=True)

# File stem -> object to save; each is written to <processed_dir>/<stem>.npy.
arrays_to_save = {
    'X_train': X_train_full,
    'X_test': X_test_full,
    'y_train': y_train_full,
    'y_test': y_test_full,
}
for array_name, array in arrays_to_save.items():
    np.save(f'{processed_dir}/{array_name}.npy', array)

print("✅ Preprocessed data saved to data/processed/")

## 📝 Next Steps

1. ✅ Data is now ready for model training
2. Proceed to **03_model_training.ipynb** to train ML and DL models
3. Experiment with different preprocessing parameters
4. Try different balancing methods (ADASYN, undersampling)