# 02 - Data Preprocessing: CICIDS2017 Dataset

This notebook handles data cleaning, transformation, and preparation for model training.

## Objectives:
1. Load raw dataset
2. Clean data (remove duplicates, handle missing values)
3. Handle infinite and outlier values
4. Feature selection and engineering
5. Label encoding
6. Data normalization/standardization
7. Train-test split
8. Handle class imbalance
9. Save processed data

## 1. Setup and Imports

In [1]:
# Add parent directory to path
import sys
sys.path.append('..')

# Standard libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from pathlib import Path
import yaml
import joblib
import os

# Scikit-learn
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler, LabelEncoder
from sklearn.model_selection import train_test_split

# Imbalanced-learn
from imblearn.over_sampling import SMOTE, ADASYN, RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler

# Custom module
from src.data_processing import DataProcessor

# Configuration
warnings.filterwarnings('ignore')
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")

# Display settings
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 50)

print("‚úì Libraries imported successfully")

‚úì Libraries imported successfully


## 2. Initialize Data Processor

In [2]:
# Initialize processor
processor = DataProcessor(config_path='../config/config.yaml')

print("‚úì Data processor initialized")
print(f"\nConfiguration:")
print(f"  Raw data path: {processor.config['paths']['data_raw']}")
print(f"  Processed data path: {processor.config['paths']['data_processed']}")
print(f"  Normalization method: {processor.config['preprocessing']['normalization']}")
print(f"  Sampling method: {processor.config['preprocessing']['sampling']['method']}")

INFO:src.data_processing:Configuration loaded from ../config/config.yaml


‚úì Data processor initialized

Configuration:
  Raw data path: data/raw/
  Processed data path: data/processed/
  Normalization method: standard
  Sampling method: SMOTE


## 3. Load Dataset

For this notebook, we'll use a sample of the data for faster processing.

In [3]:
# Load dataset (using 10% sample for notebook demonstration)
SAMPLE_FRACTION = 0.1  # Use 10% of data

print(f"Loading {SAMPLE_FRACTION*100}% of dataset...")
df = processor.load_dataset(sample_frac=SAMPLE_FRACTION)

print(f"\n‚úì Dataset loaded successfully")
print(f"  Shape: {df.shape}")
print(f"  Memory usage: {df.memory_usage(deep=True).sum() / 1024**2:.2f} MB")

INFO:src.data_processing:Loading dataset...


Loading 10.0% of dataset...


ValueError: No objects to concatenate

## 4. Initial Data Quality Check

In [None]:
print("\n=== Initial Data Quality ===")
print(f"Total rows: {len(df):,}")
print(f"Total columns: {df.shape[1]}")
print(f"\nMissing values: {df.isnull().sum().sum():,}")
print(f"Duplicate rows: {df.duplicated().sum():,}")

# Check for infinite values
numeric_cols = df.select_dtypes(include=[np.number]).columns
inf_count = np.isinf(df[numeric_cols]).sum().sum()
print(f"Infinite values: {inf_count:,}")

# Display first few rows
print("\nFirst 3 rows:")
df.head(3)

## 5. Data Cleaning

In [None]:
print("\n=== Step 1: Data Cleaning ===")

# Clean data
df_clean = processor.clean_data(df)

print(f"\n‚úì Cleaning complete")
print(f"  Original rows: {len(df):,}")
print(f"  After cleaning: {len(df_clean):,}")
print(f"  Rows removed: {len(df) - len(df_clean):,} ({((len(df) - len(df_clean))/len(df)*100):.2f}%)")

In [None]:
# Verify cleaning
print("\nPost-cleaning verification:")
print(f"  Missing values: {df_clean.isnull().sum().sum()}")
print(f"  Duplicate rows: {df_clean.duplicated().sum()}")
print(f"  Infinite values: {np.isinf(df_clean.select_dtypes(include=[np.number])).sum().sum()}")

## 6. Feature Preparation

In [None]:
print("\n=== Step 2: Feature Preparation ===")

# Prepare features and labels
X, y = processor.prepare_features(df_clean)

print(f"\n‚úì Features prepared")
print(f"  Feature matrix shape: {X.shape}")
print(f"  Label vector shape: {y.shape}")
print(f"  Number of features: {X.shape[1]}")

# Save feature names
feature_names = X.columns.tolist()
print(f"\nFeature names (first 10):")
for i, name in enumerate(feature_names[:10], 1):
    print(f"  {i}. {name}")

In [None]:
# Label distribution before encoding
print("\nLabel distribution:")
label_dist = y.value_counts()
print(label_dist)

# Visualize
plt.figure(figsize=(12, 6))
label_dist.plot(kind='bar', color='steelblue')
plt.title('Label Distribution (Before Encoding)', fontsize=14, fontweight='bold')
plt.xlabel('Attack Type')
plt.ylabel('Count')
plt.xticks(rotation=45, ha='right')
plt.grid(axis='y', alpha=0.3)
plt.tight_layout()
plt.show()

## 7. Label Encoding

In [None]:
print("\n=== Step 3: Label Encoding ===")

# Choose encoding type
BINARY_CLASSIFICATION = False  # Set to True for binary, False for multi-class

print(f"Classification type: {'Binary' if BINARY_CLASSIFICATION else 'Multi-class'}")

# Encode labels
y_encoded = processor.encode_labels(y, binary=BINARY_CLASSIFICATION)

print(f"\n‚úì Labels encoded")
print(f"  Unique labels: {len(np.unique(y_encoded))}")
print(f"  Label range: [{y_encoded.min()}, {y_encoded.max()}]")

if not BINARY_CLASSIFICATION:
    print(f"\nLabel mapping:")
    for idx, label in enumerate(processor.label_encoder.classes_):
        print(f"  {idx}: {label}")

In [None]:
# Visualize encoded label distribution
plt.figure(figsize=(12, 6))
unique, counts = np.unique(y_encoded, return_counts=True)
plt.bar(unique, counts, color='coral')
plt.title('Encoded Label Distribution', fontsize=14, fontweight='bold')
plt.xlabel('Encoded Label')
plt.ylabel('Count')
plt.grid(axis='y', alpha=0.3)
plt.tight_layout()
plt.show()

# Class distribution
print("\nEncoded label distribution:")
for label, count in zip(unique, counts):
    percentage = (count / len(y_encoded)) * 100
    print(f"  Label {label}: {count:,} ({percentage:.2f}%)")

## 8. Train-Validation-Test Split

In [None]:
print("\n=== Step 4: Data Splitting ===")

# Split data
X_train, X_val, X_test, y_train, y_val, y_test = processor.split_data(X, y_encoded)

print(f"\n‚úì Data split complete")
print(f"\nSplit sizes:")
print(f"  Training:   {len(X_train):,} samples ({len(X_train)/len(X)*100:.1f}%)")
print(f"  Validation: {len(X_val):,} samples ({len(X_val)/len(X)*100:.1f}%)")
print(f"  Test:       {len(X_test):,} samples ({len(X_test)/len(X)*100:.1f}%)")

In [None]:
# Verify stratification
print("\nLabel distribution across splits:")
print(f"\nTraining set:")
print(pd.Series(y_train).value_counts().sort_index())
print(f"\nValidation set:")
print(pd.Series(y_val).value_counts().sort_index())
print(f"\nTest set:")
print(pd.Series(y_test).value_counts().sort_index())

## 9. Feature Normalization

In [None]:
print("\n=== Step 5: Feature Normalization ===")

# Before normalization - show statistics
print("\nBefore normalization (training set):")
print(f"  Mean: {X_train.mean().mean():.3f}")
print(f"  Std:  {X_train.std().mean():.3f}")
print(f"  Min:  {X_train.min().min():.3f}")
print(f"  Max:  {X_train.max().max():.3f}")

# Normalize
X_train_scaled, X_val_scaled = processor.normalize_features(X_train, X_val)
_, X_test_scaled = processor.normalize_features(X_train, X_test)

print(f"\n‚úì Normalization complete")
print(f"  Method: {processor.config['preprocessing']['normalization']}")

# After normalization
print("\nAfter normalization (training set):")
print(f"  Mean: {X_train_scaled.mean():.3f}")
print(f"  Std:  {X_train_scaled.std():.3f}")
print(f"  Min:  {X_train_scaled.min():.3f}")
print(f"  Max:  {X_train_scaled.max():.3f}")

In [None]:
# Visualize normalization effect
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Before normalization
sample_feature_idx = 0
axes[0].hist(X_train.iloc[:, sample_feature_idx], bins=50, color='skyblue', edgecolor='black')
axes[0].set_title(f'Before Normalization\n{feature_names[sample_feature_idx]}', fontweight='bold')
axes[0].set_xlabel('Value')
axes[0].set_ylabel('Frequency')
axes[0].grid(alpha=0.3)

# After normalization
axes[1].hist(X_train_scaled[:, sample_feature_idx], bins=50, color='coral', edgecolor='black')
axes[1].set_title(f'After Normalization\n{feature_names[sample_feature_idx]}', fontweight='bold')
axes[1].set_xlabel('Value')
axes[1].set_ylabel('Frequency')
axes[1].grid(alpha=0.3)

plt.tight_layout()
plt.show()

## 10. Handle Class Imbalance

In [None]:
print("\n=== Step 6: Handling Class Imbalance ===")

# Show imbalance before
print("\nClass distribution before resampling:")
unique_before, counts_before = np.unique(y_train, return_counts=True)
for label, count in zip(unique_before, counts_before):
    print(f"  Label {label}: {count:,}")

# Calculate imbalance ratio
imbalance_ratio = counts_before.max() / counts_before.min()
print(f"\nImbalance ratio: {imbalance_ratio:.2f}:1")

# Apply resampling
X_train_resampled, y_train_resampled = processor.handle_imbalance(X_train_scaled, y_train)

# Show distribution after
print("\nClass distribution after resampling:")
unique_after, counts_after = np.unique(y_train_resampled, return_counts=True)
for label, count in zip(unique_after, counts_after):
    print(f"  Label {label}: {count:,}")

print(f"\n‚úì Resampling complete")
print(f"  Original training samples: {len(y_train):,}")
print(f"  After resampling: {len(y_train_resampled):,}")
print(f"  Samples added: {len(y_train_resampled) - len(y_train):,}")

In [None]:
# Visualize resampling effect
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Before resampling
axes[0].bar(unique_before, counts_before, color='skyblue')
axes[0].set_title('Before Resampling', fontweight='bold', fontsize=12)
axes[0].set_xlabel('Class Label')
axes[0].set_ylabel('Count')
axes[0].grid(axis='y', alpha=0.3)

# After resampling
axes[1].bar(unique_after, counts_after, color='coral')
axes[1].set_title(f'After Resampling ({processor.config["preprocessing"]["sampling"]["method"]})', 
                  fontweight='bold', fontsize=12)
axes[1].set_xlabel('Class Label')
axes[1].set_ylabel('Count')
axes[1].grid(axis='y', alpha=0.3)

plt.tight_layout()
plt.show()

## 11. Save Processed Data

In [None]:
print("\n=== Step 7: Saving Processed Data ===")

# Create processed directory if it doesn't exist
processed_path = processor.config['paths']['data_processed']
os.makedirs(processed_path, exist_ok=True)

# Save all data
processor.save_processed_data(
    X_train_resampled, X_val_scaled, X_test_scaled,
    y_train_resampled, y_val, y_test,
    feature_names
)

print("\n‚úì All data saved successfully!")

In [None]:
# Verify saved files
print("\nSaved files:")
for file in Path(processed_path).glob('*'):
    size_mb = file.stat().st_size / (1024 * 1024)
    print(f"  ‚úì {file.name} ({size_mb:.2f} MB)")

## 12. Summary Statistics

In [None]:
print("\n" + "="*70)
print("DATA PREPROCESSING - SUMMARY")
print("="*70)

print(f"\nüìä Final Dataset Shapes:")
print(f"  Training:   X={X_train_resampled.shape}, y={y_train_resampled.shape}")
print(f"  Validation: X={X_val_scaled.shape}, y={y_val.shape}")
print(f"  Test:       X={X_test_scaled.shape}, y={y_test.shape}")

print(f"\nüîß Preprocessing Steps Applied:")
print(f"  ‚úì Data cleaning (duplicates, missing values, infinites)")
print(f"  ‚úì Feature selection ({len(feature_names)} features retained)")
print(f"  ‚úì Label encoding ({'Binary' if BINARY_CLASSIFICATION else 'Multi-class'})")
print(f"  ‚úì Train-validation-test split")
print(f"  ‚úì Feature normalization ({processor.config['preprocessing']['normalization']})")
print(f"  ‚úì Class imbalance handling ({processor.config['preprocessing']['sampling']['method']})")

print(f"\nüìÅ Files Saved to: {processed_path}")
print(f"  ‚úì X_train.npy, X_val.npy, X_test.npy")
print(f"  ‚úì y_train.npy, y_val.npy, y_test.npy")
print(f"  ‚úì scaler.pkl")
print(f"  ‚úì label_encoder.pkl")
print(f"  ‚úì feature_names.pkl")

print(f"\nüìà Class Distribution (Final Training Set):")
for label, count in zip(unique_after, counts_after):
    percentage = (count / len(y_train_resampled)) * 100
    if not BINARY_CLASSIFICATION and hasattr(processor.label_encoder, 'classes_'):
        label_name = processor.label_encoder.classes_[label]
        print(f"  {label_name}: {count:,} ({percentage:.2f}%)")
    else:
        print(f"  Label {label}: {count:,} ({percentage:.2f}%)")

print(f"\n‚úÖ Data preprocessing complete!")
print(f"\nüìù Next Steps:")
print(f"  1. Run 03_feature_engineering.ipynb for feature selection")
print(f"  2. Run 04_model_training.ipynb to train ML models")
print(f"  3. Run 05_model_evaluation.ipynb to evaluate performance")

print("\n" + "="*70)

## 13. Optional: Test Loading Processed Data

In [None]:
# Test loading the saved data
print("\n=== Testing Data Loading ===")

# Load processed data
X_train_loaded, X_val_loaded, X_test_loaded, y_train_loaded, y_val_loaded, y_test_loaded = processor.load_processed_data()

print("\n‚úì Data loaded successfully")
print(f"\nLoaded shapes:")
print(f"  X_train: {X_train_loaded.shape}")
print(f"  X_val: {X_val_loaded.shape}")
print(f"  X_test: {X_test_loaded.shape}")

# Verify data integrity
print(f"\nData integrity check:")
print(f"  Training data match: {np.array_equal(X_train_resampled, X_train_loaded)}")
print(f"  Validation data match: {np.array_equal(X_val_scaled, X_val_loaded)}")
print(f"  Test data match: {np.array_equal(X_test_scaled, X_test_loaded)}")

# Load feature names
loaded_features = joblib.load(os.path.join(processed_path, 'feature_names.pkl'))
print(f"\n‚úì Feature names loaded: {len(loaded_features)} features")