In [2]:
import pickle
import numpy as np
from pathlib import Path

## Method 1: Direct Load (Manual)

In [3]:
# Load the binary-classification artifacts directly from the pickle file.
# The path is relative, so the kernel's working directory must be the folder
# that contains `processed_artifacts/`.
#
# SECURITY NOTE: pickle.load can execute arbitrary code embedded in the file.
# Only unpickle artifacts that you produced yourself or otherwise trust.
with open('processed_artifacts/binary_preprocessed.pkl', 'rb') as f:
    binary_artifacts = pickle.load(f)

# Bind the train/test splits stored in the artifact to notebook-level names.
X_train_bin = binary_artifacts['X_train']
X_test_bin = binary_artifacts['X_test']
y_train_bin = binary_artifacts['y_train']
y_test_bin = binary_artifacts['y_test']

# Report the array shapes as a quick confirmation that the load succeeded.
# (The header is a plain string: it has no placeholders, so no f-prefix.)
print("Binary Classification Data Loaded:")
print(f"  X_train: {X_train_bin.shape}")
print(f"  X_test: {X_test_bin.shape}")
print(f"  y_train: {y_train_bin.shape}")
print(f"  y_test: {y_test_bin.shape}")

Binary Classification Data Loaded:
  X_train: (168834, 34)
  X_test: (42209, 34)
  y_train: (168834,)
  y_test: (42209,)


In [4]:
# Load the multi-class classification artifacts directly from the pickle file.
# The path is relative, so the kernel's working directory must be the folder
# that contains `processed_artifacts/`.
#
# SECURITY NOTE: pickle.load can execute arbitrary code embedded in the file.
# Only unpickle artifacts that you produced yourself or otherwise trust.
with open('processed_artifacts/multiclass_preprocessed.pkl', 'rb') as f:
    multi_artifacts = pickle.load(f)

# Bind the train/test splits stored in the artifact to notebook-level names.
X_train_multi = multi_artifacts['X_train']
X_test_multi = multi_artifacts['X_test']
y_train_multi = multi_artifacts['y_train']
y_test_multi = multi_artifacts['y_test']

# Class bookkeeping saved alongside the arrays: the class count and the
# list of class names.
num_classes = multi_artifacts['num_classes']
class_names = multi_artifacts['class_names']

# Report the array shapes and class list as a quick confirmation of the load.
# (The header is a plain string: it has no placeholders, so no f-prefix.)
print("Multi-class Classification Data Loaded:")
print(f"  X_train: {X_train_multi.shape}")
print(f"  X_test: {X_test_multi.shape}")
print(f"  y_train: {y_train_multi.shape}")
print(f"  y_test: {y_test_multi.shape}")
print(f"  Classes ({num_classes}): {class_names}")

Multi-class Classification Data Loaded:
  X_train: (168834, 34)
  X_test: (42209, 34)
  y_train: (168834,)
  y_test: (42209,)
  Classes (10): ['backdoor', 'ddos', 'dos', 'injection', 'mitm', 'normal', 'password', 'ransomware', 'scanning', 'xss']


## Method 2: Using Utility Function

In [5]:
# Import the loader helper from preprocessing_pipeline.py.
import sys

# Folder that contains preprocessing_pipeline.py.
# NOTE(review): this is a machine-specific absolute path; adjust it (or derive
# it from a project-level setting) before running the notebook elsewhere.
PIPELINE_DIR = '/home/elnoersan/Skripsi/Paper/NotebookTODO/EDA'

# Guard the insertion so that re-running this cell does not keep prepending
# duplicate entries to sys.path.
if PIPELINE_DIR not in sys.path:
    sys.path.insert(0, PIPELINE_DIR)

# Importing the module produced GPU/TensorFlow start-up messages in the
# recorded run of this cell.
from preprocessing_pipeline import load_preprocessed_data

🚀 GPU Detected: 1 device(s)


2025-12-10 03:45:44.095746: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


🚀 TensorFlow GPU: 1 device(s) available


In [6]:
# Fetch the binary-task artifacts through the shared pipeline helper, then
# bind the feature and label splits to notebook-level names.
binary_data = load_preprocessed_data('binary')

X_train_bin, X_test_bin = binary_data['X_train'], binary_data['X_test']
y_train_bin, y_test_bin = binary_data['y_train'], binary_data['y_test']

✓ Loaded binary artifacts from /home/elnoersan/Skripsi/Paper/NotebookTODO/EDA/processed_artifacts/binary_preprocessed.pkl
  Train samples: 168,834
  Test samples: 42,209
  Features: 34


In [7]:
# Fetch the multi-class artifacts through the shared pipeline helper, then
# bind the feature splits, label splits and class count to notebook names.
multi_data = load_preprocessed_data('multiclass')

X_train_multi, X_test_multi = multi_data['X_train'], multi_data['X_test']
y_train_multi, y_test_multi = multi_data['y_train'], multi_data['y_test']
num_classes = multi_data['num_classes']

✓ Loaded multiclass artifacts from /home/elnoersan/Skripsi/Paper/NotebookTODO/EDA/processed_artifacts/multiclass_preprocessed.pkl
  Train samples: 168,834
  Test samples: 42,209
  Features: 34


## Inspect Metadata

In [8]:
# Dump every entry of the metadata mapping stored with the binary artifacts,
# one "name: value" pair per line.
print("Binary Classification Metadata:")
binary_meta = binary_data['metadata']
for name, setting in binary_meta.items():
    print(f"  {name}: {setting}")

Binary Classification Metadata:
  n_samples_train: 168834
  n_samples_test: 42209
  n_features_original: 35
  n_features_encoded: 881
  n_features_final: 34
  test_size: 0.2
  random_state: 42
  feature_selection_method: VarianceThreshold


In [9]:
# Dump every entry of the metadata mapping stored with the multi-class
# artifacts, one "name: value" pair per line.
print("Multi-class Classification Metadata:")
multi_meta = multi_data['metadata']
for name, setting in multi_meta.items():
    print(f"  {name}: {setting}")

Multi-class Classification Metadata:
  n_samples_train: 168834
  n_samples_test: 42209
  n_features_original: 35
  n_features_encoded: 881
  n_features_final: 34
  test_size: 0.2
  random_state: 42
  num_classes: 10
  feature_selection_method: VarianceThreshold


## Verify Data Quality

In [10]:
# Sanity-check the binary split: count NaN and Inf entries in each feature
# array and list the distinct label values found in each target array.
print("Binary Data Quality:")

nan_count, inf_count = np.isnan(X_train_bin).sum(), np.isinf(X_train_bin).sum()
print(f"  X_train - NaN: {nan_count}, Inf: {inf_count}")

nan_count, inf_count = np.isnan(X_test_bin).sum(), np.isinf(X_test_bin).sum()
print(f"  X_test - NaN: {nan_count}, Inf: {inf_count}")

train_labels = np.unique(y_train_bin)
print(f"  y_train unique: {train_labels}")

test_labels = np.unique(y_test_bin)
print(f"  y_test unique: {test_labels}")

Binary Data Quality:
  X_train - NaN: 0, Inf: 0
  X_test - NaN: 0, Inf: 0
  y_train unique: [0 1]
  y_test unique: [0 1]


In [11]:
# Sanity-check the multi-class split: count NaN and Inf entries in each
# feature array and list the distinct label values in each target array.
print("Multi-class Data Quality:")

nan_count, inf_count = np.isnan(X_train_multi).sum(), np.isinf(X_train_multi).sum()
print(f"  X_train - NaN: {nan_count}, Inf: {inf_count}")

nan_count, inf_count = np.isnan(X_test_multi).sum(), np.isinf(X_test_multi).sum()
print(f"  X_test - NaN: {nan_count}, Inf: {inf_count}")

train_labels = np.unique(y_train_multi)
print(f"  y_train unique: {train_labels}")

test_labels = np.unique(y_test_multi)
print(f"  y_test unique: {test_labels}")

Multi-class Data Quality:
  X_train - NaN: 0, Inf: 0
  X_test - NaN: 0, Inf: 0
  y_train unique: [0 1 2 3 4 5 6 7 8 9]
  y_test unique: [0 1 2 3 4 5 6 7 8 9]


## Ready for Training!

Now you can use `X_train_bin`, `X_test_bin`, `y_train_bin`, `y_test_bin` (and their `*_multi` counterparts for the multi-class task) directly in your FL notebook.

**Time saved:** ~5-10 minutes per notebook run!