# 📅 Day 4: Level 2 — 8-Class Family Classification
## DDoS, DoS, Mirai, Recon, Spoofing, Web, BruteForce, Benign

---

**Steps:**
1. Load preprocessed data
2. Round 1: Train with class weights only
3. Round 2: Apply SMOTE for Web + BruteForce
4. Round 3: Undersample DDoS + SMOTE minorities
5. Hyperparameter Tuning with Optuna (best model)
6. Compare all rounds
7. Per-class analysis + Confusion Matrix (8×8)

---

In [None]:
import os

# Directory holding the CUDA toolkit DLLs on the workstation this notebook was written on.
_CUDA_BIN_DIR = r'C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.1\bin\x64'

# os.add_dll_directory only exists on Windows and raises if the directory is
# missing, so guard both cases: the notebook should still start on Linux/macOS
# or on a machine with a different CUDA toolkit version installed.
if hasattr(os, 'add_dll_directory'):
    if os.path.isdir(_CUDA_BIN_DIR):
        os.add_dll_directory(_CUDA_BIN_DIR)
    else:
        print(f"CUDA DLL directory not found, skipping: {_CUDA_BIN_DIR}")

import numpy as np
import pandas as pd
import xgboost as xgb
import lightgbm as lgb
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (accuracy_score, f1_score, precision_score, recall_score,
                             classification_report, confusion_matrix)
from sklearn.utils.class_weight import compute_sample_weight
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline as ImbPipeline
import optuna
import matplotlib.pyplot as plt
import seaborn as sns
import time
import gc
import json
import joblib
from datetime import datetime

# Plot styling shared by every figure produced in this notebook.
plt.style.use('dark_background')
plt.rcParams.update({'figure.figsize': (14, 6), 'font.size': 12})

# Output folders for saved models and figures; existing folders are left alone.
for output_dir in ('models', 'figures'):
    os.makedirs(output_dir, exist_ok=True)

ready_stamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
print(f"‚úÖ Ready | {ready_stamp}")

## 📥 Load Data

In [None]:
print("üì• Loading preprocessed data...")
load_start = time.time()

# Feature matrices and family-level labels written by the preprocessing step.
X_train = np.load('processed/X_train.npy')
X_test = np.load('processed/X_test.npy')
y_train = np.load('processed/y_family_train.npy')
y_test = np.load('processed/y_family_test.npy')

# The metadata file supplies the feature names and the family class names;
# a class's position in family_classes is compared against the integer labels below.
with open('processed/preprocessing_metadata.json', 'r') as meta_file:
    meta = json.load(meta_file)
feature_names, family_classes = meta['feature_names'], meta['family_classes']
n_classes = len(family_classes)

print(f"‚úÖ Loaded in {time.time()-load_start:.1f}s")
print(f"   X_train: {X_train.shape} | X_test: {X_test.shape}")
print(f"   Classes ({n_classes}): {family_classes}")

# Class distribution: row count and share of the training set for each label.
print(f"\nüìä Training class distribution:")
n_train_rows = len(y_train)
for class_idx, class_name in enumerate(family_classes):
    class_rows = (y_train == class_idx).sum()
    print(f"   {class_idx}: {class_name:<20s} ‚Üí {class_rows:>10,} ({class_rows/n_train_rows*100:.2f}%)")

## 🏋️ Round 1: Class Weights Only (No Data Manipulation)

In [None]:
# Per-model metrics and test-set predictions collected during Round 1.
round1_results = {}
round1_predictions = {}

# Balanced per-sample weights for the boosters that take row weights
# (the scikit-learn models below use class_weight='balanced' instead).
sample_weights = compute_sample_weight('balanced', y_train)

model_configs = {
    'Decision Tree': {
        'model': DecisionTreeClassifier(class_weight='balanced', max_depth=15, random_state=42)
    },
    'Random Forest': {
        'model': RandomForestClassifier(n_estimators=200, class_weight='balanced', max_depth=15, n_jobs=-1, random_state=42)
    }
}

# Fit, score and persist each scikit-learn model in turn.
rule = '=' * 60
for name, cfg in model_configs.items():
    estimator = cfg['model']

    print(f"\n{rule}")
    print(f"üèãÔ∏è Round 1 ‚Äî {name}")
    print(f"{rule}")

    fit_start = time.time()
    estimator.fit(X_train, y_train)
    train_time = time.time() - fit_start

    y_pred = estimator.predict(X_test)

    acc = accuracy_score(y_test, y_pred)
    f1_mac, f1_wtd = (f1_score(y_test, y_pred, average=avg) for avg in ('macro', 'weighted'))

    round1_results[name] = dict(accuracy=acc, f1_macro=f1_mac, f1_weighted=f1_wtd, train_time=train_time)
    round1_predictions[name] = y_pred

    print(f"   ‚è±Ô∏è Train: {train_time:.1f}s")
    print(f"   ‚úÖ Accuracy: {acc*100:.4f}% | F1-Macro: {f1_mac*100:.4f}% | F1-Weighted: {f1_wtd*100:.4f}%")

    # File name is the model label lower-cased with spaces turned into underscores.
    model_slug = name.lower().replace(" ", "_")
    joblib.dump(estimator, f'models/family_r1_{model_slug}.joblib')

In [None]:
# XGBoost GPU — Round 1
# Trains a multiclass XGBoost booster on the weighted training data, scores it
# on the test set, stores the metrics/predictions in the Round 1 dictionaries
# and saves the booster to disk.
print(f"\n{'='*60}")
print(f"üéÆ Round 1 ‚Äî XGBoost GPU")
print(f"{'='*60}")

# The training matrix carries the balanced per-sample weights; the test matrix
# is unweighted. dtest_xgb is kept alive because later cells evaluate on it too.
dtrain = xgb.DMatrix(X_train, label=y_train, weight=sample_weights, feature_names=feature_names)
dtest_xgb = xgb.DMatrix(X_test, label=y_test, feature_names=feature_names)

# Baseline booster configuration; the Round 2 and Round 3 cells reuse this dict.
xgb_params = {
    'tree_method': 'hist',
    'device': 'cuda',
    'objective': 'multi:softprob',  # predict() returns one probability per class
    'num_class': n_classes,
    'eval_metric': ['mlogloss', 'merror'],
    'max_depth': 8,
    'learning_rate': 0.1,
    'min_child_weight': 5,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'reg_alpha': 0.1,
    'reg_lambda': 1.0,
    'verbosity': 1,
    'seed': 42
}

# NOTE(review): the test set is one of the eval sets used for early stopping, and
# the same test set produces the reported metrics below. The stopping point is
# therefore chosen on test data — confirm this is intended, or hold out a
# validation split for early stopping.
evals_result_xgb = {}
t0 = time.time()
bst_xgb = xgb.train(
    xgb_params, dtrain,
    num_boost_round=300,
    evals=[(dtrain, 'train'), (dtest_xgb, 'test')],
    early_stopping_rounds=20,
    evals_result=evals_result_xgb,
    verbose_eval=50
)
train_time_xgb = time.time() - t0

# Predict with the trees up to and including the best iteration, then take the
# most probable class per row.
y_prob_xgb = bst_xgb.predict(dtest_xgb, iteration_range=(0, bst_xgb.best_iteration + 1))
y_pred_xgb = y_prob_xgb.argmax(axis=1)

acc = accuracy_score(y_test, y_pred_xgb)
f1_mac = f1_score(y_test, y_pred_xgb, average='macro')
f1_wtd = f1_score(y_test, y_pred_xgb, average='weighted')

round1_results['XGBoost GPU'] = {'accuracy': acc, 'f1_macro': f1_mac, 'f1_weighted': f1_wtd, 'train_time': train_time_xgb}
round1_predictions['XGBoost GPU'] = y_pred_xgb

print(f"\n   üéÆ GPU Training | ‚è±Ô∏è {train_time_xgb:.1f}s | Best iter: {bst_xgb.best_iteration}")
print(f"   ‚úÖ Accuracy: {acc*100:.4f}% | F1-Macro: {f1_mac*100:.4f}% | F1-Weighted: {f1_wtd*100:.4f}%")

bst_xgb.save_model('models/family_r1_xgb_gpu.json')
# Drop the training matrix before the next cell builds its own data structures.
del dtrain; gc.collect()

In [None]:
# LightGBM GPU — Round 1
# Trains a multiclass LightGBM booster on the balanced-weighted training data,
# scores it on the test set, stores the metrics/predictions in the Round 1
# dictionaries and saves the booster to disk.
print(f"\n{'='*60}")
print(f"üéÆ Round 1 ‚Äî LightGBM GPU")
print(f"{'='*60}")

# NOTE(review): 'subsample' (bagging_fraction) presumably has no effect unless a
# bagging frequency is also set — check the LightGBM parameter docs if row
# subsampling is wanted here.
lgb_params = {
    'objective': 'multiclass',
    'num_class': n_classes,
    'metric': ['multi_logloss', 'multi_error'],
    'device': 'gpu',
    'gpu_use_dp': False,
    'max_depth': 8,
    'learning_rate': 0.1,
    'num_leaves': 127,
    'min_child_samples': 50,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'reg_alpha': 0.1,
    'reg_lambda': 1.0,
    'verbosity': 1,
    'seed': 42,
    'n_jobs': -1
}

# Class balancing is applied through per-row weights on the training Dataset.
# 'class_weight' belongs to the scikit-learn wrapper (LGBMClassifier); the native
# lgb.train() API does not act on it, so passing it in the params dict would
# leave this "class weights" round effectively unweighted.
lgb_train = lgb.Dataset(X_train, label=y_train, weight=sample_weights, feature_name=feature_names, free_raw_data=False)
lgb_test = lgb.Dataset(X_test, label=y_test, feature_name=feature_names, reference=lgb_train, free_raw_data=False)

# NOTE(review): early stopping watches the test set, which is also the set the
# metrics below are reported on — confirm this is intended.
evals_result_lgb = {}
t0 = time.time()
bst_lgb = lgb.train(
    lgb_params, lgb_train,
    num_boost_round=300,
    valid_sets=[lgb_train, lgb_test],
    valid_names=['train', 'test'],
    callbacks=[
        lgb.log_evaluation(50),
        lgb.early_stopping(20),
        lgb.record_evaluation(evals_result_lgb)
    ]
)
train_time_lgb = time.time() - t0

# Per-class probabilities at the best iteration; the prediction is the arg-max class.
y_prob_lgb = bst_lgb.predict(X_test, num_iteration=bst_lgb.best_iteration)
y_pred_lgb = y_prob_lgb.argmax(axis=1)

acc = accuracy_score(y_test, y_pred_lgb)
f1_mac = f1_score(y_test, y_pred_lgb, average='macro')
f1_wtd = f1_score(y_test, y_pred_lgb, average='weighted')

round1_results['LightGBM GPU'] = {'accuracy': acc, 'f1_macro': f1_mac, 'f1_weighted': f1_wtd, 'train_time': train_time_lgb}
round1_predictions['LightGBM GPU'] = y_pred_lgb

print(f"\n   üéÆ GPU Training | ‚è±Ô∏è {train_time_lgb:.1f}s | Best iter: {bst_lgb.best_iteration}")
print(f"   ‚úÖ Accuracy: {acc*100:.4f}% | F1-Macro: {f1_mac*100:.4f}% | F1-Weighted: {f1_wtd*100:.4f}%")

bst_lgb.save_model('models/family_r1_lgb_gpu.txt')

In [None]:
# Round 1 Summary
summary_rule = "=" * 80
print("\n" + summary_rule)
print("üìä ROUND 1 RESULTS ‚Äî Class Weights Only")
print(summary_rule)

# One row per model; the three score columns are displayed as percentages,
# the training time stays in seconds.
r1_df = pd.DataFrame(round1_results).T
r1_df.columns = ['Accuracy', 'F1-Macro', 'F1-Weighted', 'Train Time']
pct_columns = ['Accuracy', 'F1-Macro', 'F1-Weighted']
r1_df[pct_columns] = r1_df[pct_columns] * 100
print(r1_df.to_string(float_format='{:.4f}'.format))

# The model with the highest macro F1 is reported as the Round 1 winner.
best_model_name = r1_df['F1-Macro'].idxmax()
print(f"\nüèÜ Best model (F1-Macro): {best_model_name}")

## 🧪 Round 2: SMOTE for Web + BruteForce

In [None]:
# Oversample the under-represented classes with SMOTE
print("üß™ Round 2: Applying SMOTE for minority classes...")
t0 = time.time()

# Per-class row counts in the original training labels
class_counts = pd.Series(y_train).value_counts()
print(f"   Before SMOTE: {dict(class_counts)}")

# Every class below the median count is raised to the median,
# but never to more than 10x its original size.
median_count = int(class_counts.median())
sampling_strategy = {
    cls_id: min(median_count, n_rows * 10)
    for cls_id, n_rows in class_counts.items()
    if n_rows < median_count
}

print(f"   SMOTE targets: {sampling_strategy}")

# NOTE(review): recent imbalanced-learn releases appear to deprecate SMOTE's
# n_jobs argument — verify against the installed version.
smote = SMOTE(sampling_strategy=sampling_strategy, random_state=42, n_jobs=-1)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

print(f"   After SMOTE: {X_train_smote.shape[0]:,} samples (was {X_train.shape[0]:,})")
print(f"   ‚è±Ô∏è SMOTE done in {time.time()-t0:.1f}s")

# Class counts after resampling
print(f"\n   New distribution:")
for class_idx, class_name in enumerate(family_classes):
    class_rows = (y_train_smote == class_idx).sum()
    print(f"   {class_idx}: {class_name:<20s} ‚Üí {class_rows:>10,}")

In [None]:
# Train XGBoost on the SMOTE-resampled data (GPU)
# Reuses xgb_params and dtest_xgb from the Round 1 XGBoost cell, so only the
# training data differs between the rounds.
print(f"\n{'='*60}")
print(f"üéÆ Round 2 ‚Äî XGBoost GPU (SMOTE data)")
print(f"{'='*60}")

# No sample weights here: this round relies on the resampled data alone.
dtrain_smote = xgb.DMatrix(X_train_smote, label=y_train_smote, feature_names=feature_names)

# NOTE(review): the test set is used as an early-stopping eval set and is also
# the set the metrics below are computed on — confirm this is intended.
t0 = time.time()
bst_xgb_r2 = xgb.train(
    xgb_params, dtrain_smote,
    num_boost_round=300,
    evals=[(dtrain_smote, 'train'), (dtest_xgb, 'test')],
    early_stopping_rounds=20,
    verbose_eval=50
)
train_time_r2 = time.time() - t0

# Class probabilities from the trees up to the best iteration -> arg-max class.
y_pred_r2 = bst_xgb_r2.predict(dtest_xgb, iteration_range=(0, bst_xgb_r2.best_iteration + 1)).argmax(axis=1)

acc_r2 = accuracy_score(y_test, y_pred_r2)
f1_mac_r2 = f1_score(y_test, y_pred_r2, average='macro')
f1_wtd_r2 = f1_score(y_test, y_pred_r2, average='weighted')

print(f"\n   üéÆ GPU Training | ‚è±Ô∏è {train_time_r2:.1f}s")
print(f"   ‚úÖ Accuracy: {acc_r2*100:.4f}% | F1-Macro: {f1_mac_r2*100:.4f}% | F1-Weighted: {f1_wtd_r2*100:.4f}%")

bst_xgb_r2.save_model('models/family_r2_xgb_gpu.json')
# The resampled arrays and their DMatrix are not needed after this round.
del dtrain_smote, X_train_smote, y_train_smote; gc.collect()

## 🧪 Round 3: Undersample Majority + SMOTE Minorities

In [None]:
print("üß™ Round 3: Undersample majority + SMOTE minorities...")
t0 = time.time()

class_counts = pd.Series(y_train).value_counts()

# Undersample: any class with more than 30K rows is cut down to 30K
under_strategy = {
    cls_id: 30000
    for cls_id, n_rows in class_counts.items()
    if n_rows > 30000
}

# SMOTE: any class that still has fewer than 5000 rows (after the cap above)
# is raised to 5000
over_strategy = {
    cls_id: 5000
    for cls_id, n_rows in class_counts.items()
    if min(n_rows, under_strategy.get(cls_id, n_rows)) < 5000
}

print(f"   Under-sampling: {under_strategy}")
print(f"   Over-sampling: {over_strategy}")

# The undersampler runs first, then SMOTE works on its output.
undersampler = RandomUnderSampler(sampling_strategy=under_strategy, random_state=42)
oversampler = SMOTE(sampling_strategy=over_strategy, random_state=42, n_jobs=-1)
pipeline = ImbPipeline([
    ('under', undersampler),
    ('over', oversampler)
])

X_train_r3, y_train_r3 = pipeline.fit_resample(X_train, y_train)
print(f"   Result: {X_train_r3.shape[0]:,} samples")
print(f"   ‚è±Ô∏è Done in {time.time()-t0:.1f}s")

# Class counts after the combined resampling
for class_idx, class_name in enumerate(family_classes):
    class_rows = (y_train_r3 == class_idx).sum()
    print(f"   {class_idx}: {class_name:<20s} ‚Üí {class_rows:>10,}")

In [None]:
# Train XGBoost GPU on Round 3 data
# Same booster settings (xgb_params) and test matrix (dtest_xgb) as the Round 1
# XGBoost cell; only the training data (undersampled + SMOTE) differs.
print(f"\n{'='*60}")
print(f"üéÆ Round 3 ‚Äî XGBoost GPU (Under+SMOTE data)")
print(f"{'='*60}")

# No sample weights here: this round relies on the resampled data alone.
dtrain_r3 = xgb.DMatrix(X_train_r3, label=y_train_r3, feature_names=feature_names)

# NOTE(review): the test set is used as an early-stopping eval set and is also
# the set the metrics below are computed on — confirm this is intended.
t0 = time.time()
bst_xgb_r3 = xgb.train(
    xgb_params, dtrain_r3,
    num_boost_round=300,
    evals=[(dtrain_r3, 'train'), (dtest_xgb, 'test')],
    early_stopping_rounds=20,
    verbose_eval=50
)
train_time_r3 = time.time() - t0

# Class probabilities from the trees up to the best iteration -> arg-max class.
y_pred_r3 = bst_xgb_r3.predict(dtest_xgb, iteration_range=(0, bst_xgb_r3.best_iteration + 1)).argmax(axis=1)

acc_r3 = accuracy_score(y_test, y_pred_r3)
f1_mac_r3 = f1_score(y_test, y_pred_r3, average='macro')
f1_wtd_r3 = f1_score(y_test, y_pred_r3, average='weighted')

print(f"\n   üéÆ GPU Training | ‚è±Ô∏è {train_time_r3:.1f}s")
print(f"   ‚úÖ Accuracy: {acc_r3*100:.4f}% | F1-Macro: {f1_mac_r3*100:.4f}% | F1-Weighted: {f1_wtd_r3*100:.4f}%")

bst_xgb_r3.save_model('models/family_r3_xgb_gpu.json')
# The resampled arrays and their DMatrix are not needed after this round.
del dtrain_r3, X_train_r3, y_train_r3; gc.collect()

## 🔧 Hyperparameter Tuning with Optuna (Best Model — XGBoost GPU)

In [None]:
# Number of Optuna trials; used for both the banner and study.optimize() so the
# printed value cannot drift from the value actually run.
n_optuna_trials = 30

print("üîß Optuna Hyperparameter Tuning ‚Äî XGBoost GPU")
print("   Optimizing: Macro F1-Score")
print(f"   Trials: {n_optuna_trials}")

# Weighted training matrix built once and shared by every trial; the
# final-model cell trains on it as well.
dtrain_opt = xgb.DMatrix(X_train, label=y_train, weight=sample_weights, feature_names=feature_names)

def objective(trial):
    """Train one XGBoost candidate and return its macro F1 score.

    Args:
        trial: Optuna trial used to sample the booster hyperparameters and the
            maximum number of boosting rounds ('n_estimators').

    Returns:
        Macro-averaged F1 of the candidate's predictions on the test set,
        using the trees up to the early-stopped best iteration.

    NOTE(review): the score is computed on the same test set that the final
    metrics are reported on, so the tuned result is presumably optimistic —
    consider tuning against a separate validation split.
    """
    params = {
        'tree_method': 'hist',
        'device': 'cuda',
        'objective': 'multi:softprob',
        'num_class': n_classes,
        'eval_metric': 'mlogloss',
        'max_depth': trial.suggest_int('max_depth', 4, 12),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 20),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-3, 10.0, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-3, 10.0, log=True),
        'verbosity': 0,
        'seed': 42
    }

    # Sampled under the name 'n_estimators' but passed as num_boost_round,
    # because xgb.train() takes the round count as an argument, not a param.
    n_rounds = trial.suggest_int('n_estimators', 100, 500)

    bst = xgb.train(
        params, dtrain_opt,
        num_boost_round=n_rounds,
        evals=[(dtest_xgb, 'test')],
        early_stopping_rounds=20,
        verbose_eval=False
    )

    y_prob = bst.predict(dtest_xgb, iteration_range=(0, bst.best_iteration + 1))
    y_pred = y_prob.argmax(axis=1)
    f1_mac = f1_score(y_test, y_pred, average='macro')

    return f1_mac

# Seed the sampler so the sequence of sampled hyperparameters is repeatable,
# in line with the fixed seed (42) used everywhere else in this notebook.
# TPE is Optuna's default sampler, so only the seeding changes.
study = optuna.create_study(
    direction='maximize',
    study_name='family_xgb_gpu',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=n_optuna_trials, show_progress_bar=True)

print(f"\nüèÜ Best trial:")
print(f"   F1-Macro: {study.best_value*100:.4f}%")
print(f"   Params: {study.best_params}")

In [None]:
# Train final model with best params
# Rebuilds the XGBoost configuration from the best Optuna trial, retrains on
# dtrain_opt, evaluates on the test set and saves the resulting booster.
print(f"\n{'='*60}")
print(f"üéÆ Training Final Model with Best Params (GPU)")
print(f"{'='*60}")

# Work on a copy so study.best_params itself is left untouched.
best_params = study.best_params.copy()
# 'n_estimators' is the tuned round count; it is passed as num_boost_round below
# rather than kept in the booster params.
n_rounds_best = best_params.pop('n_estimators')
# Add the fixed, non-tuned settings back in.
best_params.update({
    'tree_method': 'hist',
    'device': 'cuda',
    'objective': 'multi:softprob',
    'num_class': n_classes,
    'eval_metric': ['mlogloss', 'merror'],
    'verbosity': 1,
    'seed': 42
})

# NOTE(review): the tuning objective used 'mlogloss' alone as eval metric while
# this run lists ['mlogloss', 'merror']; if early stopping follows the last
# metric, the stopping point may differ from the tuned trial — confirm.
evals_result_best = {}
t0 = time.time()
bst_best = xgb.train(
    best_params, dtrain_opt,
    num_boost_round=n_rounds_best,
    evals=[(dtrain_opt, 'train'), (dtest_xgb, 'test')],
    early_stopping_rounds=20,
    evals_result=evals_result_best,
    verbose_eval=50
)
train_time_best = time.time() - t0

# Class probabilities from the trees up to the best iteration -> arg-max class.
y_pred_best = bst_best.predict(dtest_xgb, iteration_range=(0, bst_best.best_iteration + 1)).argmax(axis=1)

acc_best = accuracy_score(y_test, y_pred_best)
f1_mac_best = f1_score(y_test, y_pred_best, average='macro')
f1_wtd_best = f1_score(y_test, y_pred_best, average='weighted')

print(f"\n   üéÆ GPU | ‚è±Ô∏è {train_time_best:.1f}s | Best iter: {bst_best.best_iteration}")
print(f"   ‚úÖ Accuracy: {acc_best*100:.4f}% | F1-Macro: {f1_mac_best*100:.4f}% | F1-Weighted: {f1_wtd_best*100:.4f}%")

bst_best.save_model('models/family_best_xgb_gpu.json')
print("   üíæ Saved to models/family_best_xgb_gpu.json")
# The tuning/training matrix is no longer needed once the final model is saved.
del dtrain_opt; gc.collect()

## 📊 Evaluation & Visualization

In [None]:
# Per-class classification report for best model
print("="*60)
print("üìã Best Model ‚Äî Per-Class Classification Report")
print("="*60)
# Pin the label set to every known family so the report always has exactly one
# row per entry of family_classes. Without `labels`, scikit-learn infers the
# classes from y_test/y_pred_best, and a class missing from both makes the
# inferred set disagree with target_names.
print(classification_report(
    y_test, y_pred_best,
    labels=np.arange(len(family_classes)),
    target_names=family_classes,
    digits=4,
))

In [None]:
# 8x8 confusion matrix heatmaps: raw counts (left) and row-normalised percentages (right)

# Fix the label set so the matrix always has one row/column per entry of
# family_classes and stays aligned with the tick labels, even when a class
# never appears in y_test or y_pred_best.
cm = confusion_matrix(y_test, y_pred_best, labels=np.arange(len(family_classes)))

# Percentage of each actual class (row) assigned to each predicted class.
# A class with no true samples gets a row of zeros instead of NaN from 0/0.
row_totals = cm.sum(axis=1, keepdims=True)
cm_pct = np.divide(cm, row_totals, out=np.zeros(cm.shape, dtype=float), where=row_totals != 0) * 100

fig, axes = plt.subplots(1, 2, figsize=(22, 8))

sns.heatmap(cm, annot=True, fmt=',d', cmap='YlOrRd', xticklabels=family_classes, yticklabels=family_classes,
            ax=axes[0], linewidths=1, linecolor='white', annot_kws={'fontsize': 9})
axes[0].set_title('Confusion Matrix (Counts)', fontsize=14, fontweight='bold', color='white')
axes[0].set_xlabel('Predicted', fontsize=12)
axes[0].set_ylabel('Actual', fontsize=12)
axes[0].tick_params(axis='x', rotation=45)

sns.heatmap(cm_pct, annot=True, fmt='.1f', cmap='RdYlGn', xticklabels=family_classes, yticklabels=family_classes,
            ax=axes[1], linewidths=1, linecolor='white', annot_kws={'fontsize': 9})
axes[1].set_title('Confusion Matrix (% per class)', fontsize=14, fontweight='bold', color='white')
axes[1].set_xlabel('Predicted', fontsize=12)
axes[1].set_ylabel('Actual', fontsize=12)
axes[1].tick_params(axis='x', rotation=45)

plt.suptitle('üìä 8-Class Family Classification ‚Äî Confusion Matrix', fontsize=16, fontweight='bold', color='#00D4AA', y=1.02)
plt.tight_layout()
# Save before show() so the figure is written while it is still the active figure.
plt.savefig('figures/family_confusion_matrix.png', dpi=150, bbox_inches='tight', facecolor='#1a1a2e')
plt.show()
print("üíæ Saved to figures/family_confusion_matrix.png")

In [None]:
# Round comparison: XGBoost GPU scores from each round next to the tuned model
comparison_rule = "=" * 80
print("\n" + comparison_rule)
print("üìä ALL ROUNDS COMPARISON")
print(comparison_rule)

xgb_round1 = round1_results['XGBoost GPU']
round_labels = ['R1: Class Weights', 'R2: SMOTE', 'R3: Under+SMOTE', 'Optuna Tuned']
f1_macro_scores = (xgb_round1['f1_macro'], f1_mac_r2, f1_mac_r3, f1_mac_best)
accuracy_scores = (xgb_round1['accuracy'], acc_r2, acc_r3, acc_best)

# Scores are stored as fractions; the table shows them as percentages.
rounds_df = pd.DataFrame({
    'Round': round_labels,
    'F1-Macro (%)': [score*100 for score in f1_macro_scores],
    'Accuracy (%)': [score*100 for score in accuracy_scores]
})
print(rounds_df.to_string(index=False, float_format='{:.4f}'.format))

# Save all results
# Collect the metrics of every round plus the tuned hyperparameters into one
# JSON document under models/.
round2_scores = {'accuracy': acc_r2, 'f1_macro': f1_mac_r2, 'f1_weighted': f1_wtd_r2}
round3_scores = {'accuracy': acc_r3, 'f1_macro': f1_mac_r3, 'f1_weighted': f1_wtd_r3}
optuna_scores = {
    'accuracy': acc_best, 'f1_macro': f1_mac_best, 'f1_weighted': f1_wtd_best,
    'params': study.best_params
}

family_results = {
    'timestamp': datetime.now().isoformat(),
    'level': '8-Class Family',
    'device': 'GPU (CUDA)',
    'round1': round1_results,
    'round2_smote': round2_scores,
    'round3_under_smote': round3_scores,
    'optuna_best': optuna_scores
}

# default=str: any value the JSON encoder cannot handle is written as its string form.
with open('models/family_results.json', 'w') as results_file:
    json.dump(family_results, results_file, indent=2, default=str)

# Closing banner: a blank line, then the message framed by two identical rows
# of 20 trophies. Only the trophy is repeated — repeating "\n" together with it
# would print 20 one-trophy lines instead of a single top border.
trophy_row = "üèÜ" * 20
print("\n" + trophy_row)
print(f"  ‚úÖ FAMILY CLASSIFICATION COMPLETE!")
print(f"  üéÆ All GPU-accelerated")
print(trophy_row)