# 📅 Day 5: Level 3 — 34-Class Sub-Type Classification
## Full Granularity: All Attack Sub-Types + GPU Training

---

**Steps:**
1. Load data
2. Heavy imbalance handling (Undersample + SMOTE + Class weights)
3. Train models (XGBoost GPU as the primary model, LightGBM GPU for comparison)
4. Per-class F1 deep dive
5. Feature importance

---

In [None]:
import os

# Register the CUDA toolkit's bin directory on the DLL search path before the
# GPU libraries below are imported (presumably so they can locate the CUDA
# runtime DLLs -- confirm against the local install).
# os.add_dll_directory only exists on Windows and raises if the directory is
# missing, so both conditions are checked instead of crashing the notebook on
# other platforms or on machines with a different CUDA version.
cuda_bin_dir = r'C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.1\bin\x64'
if hasattr(os, 'add_dll_directory'):
    if os.path.isdir(cuda_bin_dir):
        os.add_dll_directory(cuda_bin_dir)
    else:
        print(f"WARNING: CUDA bin directory not found, skipping DLL registration: {cuda_bin_dir}")

import numpy as np
import pandas as pd
import xgboost as xgb
import lightgbm as lgb
from sklearn.metrics import (accuracy_score, f1_score, classification_report, confusion_matrix)
from sklearn.utils.class_weight import compute_sample_weight
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline as ImbPipeline
import matplotlib.pyplot as plt
import seaborn as sns
import time
import gc
import json
import joblib
from datetime import datetime

# Global matplotlib look: dark theme, wide default figure, slightly larger text.
plt.style.use('dark_background')
plt.rcParams.update({
    'figure.figsize': (14, 6),
    'font.size': 12,
})

# Output folders used by the save calls later in the notebook.
for out_dir in ('models', 'figures'):
    os.makedirs(out_dir, exist_ok=True)

ready_stamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
print(f"‚úÖ Ready | {ready_stamp}")

In [None]:
# Load the preprocessed feature matrices, the sub-type labels and the
# preprocessing metadata from the `processed/` directory, then print the
# per-class share of the training labels.
print("üì• Loading preprocessed data...")
load_start = time.time()

X_train, X_test = (
    np.load(path) for path in ('processed/X_train.npy', 'processed/X_test.npy')
)
y_train, y_test = (
    np.load(path)
    for path in ('processed/y_subtype_train.npy', 'processed/y_subtype_test.npy')
)

with open('processed/preprocessing_metadata.json', 'r') as meta_file:
    meta = json.load(meta_file)
feature_names, subtype_classes = meta['feature_names'], meta['subtype_classes']
n_classes = len(subtype_classes)

print(f"‚úÖ Loaded in {time.time()-load_start:.1f}s")
print(f"   X_train: {X_train.shape} | X_test: {X_test.shape}")
print(f"   Classes: {n_classes}")

# Class distribution: labels are used as positions into `subtype_classes`.
print(f"\nüìä Training class distribution:")
class_counts = pd.Series(y_train).value_counts().sort_index()
for cls_id, count in class_counts.items():
    print(f"   {cls_id:2d}: {subtype_classes[cls_id]:<35s} ‚Üí {count:>10,} ({count/len(y_train)*100:.3f}%)")

## ⚖️ Step 2: Heavy Imbalance Handling

In [None]:
# Strategy: cap every class above 20K samples at 20K (random under-sampling),
# then SMOTE every class below 500 samples up to 500.
from sklearn.neighbors import NearestNeighbors  # only needed to configure SMOTE

# Number of neighbours SMOTE interpolates between. A class needs at least
# SMOTE_K_NEIGHBORS + 1 samples (the sample itself plus its neighbours).
SMOTE_K_NEIGHBORS = 3

print("‚öñÔ∏è Applying imbalance handling pipeline...")
t0 = time.time()

class_counts_dict = pd.Series(y_train).value_counts().to_dict()

# Under-sampling: cap at 20K
under_strategy = {
    cls_id: 20000
    for cls_id, count in class_counts_dict.items()
    if count > 20000
}

# Over-sampling: bring classes <500 up to 500
over_strategy = {}
too_small_for_smote = []
for cls_id, count in class_counts_dict.items():
    # Size of the class after the under-sampling step has run.
    effective_count = min(count, under_strategy.get(cls_id, count))
    if effective_count >= 500:
        continue
    if effective_count > SMOTE_K_NEIGHBORS:
        over_strategy[cls_id] = 500
    else:
        # Not enough samples to find SMOTE_K_NEIGHBORS neighbours; the class
        # is left untouched and reported below instead of silently skipped.
        too_small_for_smote.append(cls_id)

print(f"   Under-sampling {len(under_strategy)} classes to 20K max")
print(f"   Over-sampling {len(over_strategy)} classes to 500")
if too_small_for_smote:
    print(f"   Skipping SMOTE for {len(too_small_for_smote)} classes with "
          f"<= {SMOTE_K_NEIGHBORS} samples: {sorted(too_small_for_smote)}")

steps = []
if under_strategy:
    steps.append(('under', RandomUnderSampler(sampling_strategy=under_strategy, random_state=42)))
if over_strategy:
    # SMOTE's own `n_jobs` argument was deprecated in imbalanced-learn 0.10 and
    # removed in 0.12; parallelism is configured on the neighbours estimator
    # instead. When an estimator is passed, n_neighbors must include the
    # sample itself, hence the + 1.
    smote_neighbors = NearestNeighbors(n_neighbors=SMOTE_K_NEIGHBORS + 1, n_jobs=-1)
    steps.append(('over', SMOTE(sampling_strategy=over_strategy, random_state=42, k_neighbors=smote_neighbors)))

if steps:
    pipeline = ImbPipeline(steps)
    X_train_balanced, y_train_balanced = pipeline.fit_resample(X_train, y_train)
    print(f"   ‚úÖ Before: {len(y_train):,} | After: {len(y_train_balanced):,}")
else:
    X_train_balanced, y_train_balanced = X_train, y_train
    print("   ‚ö†Ô∏è No resampling applied")

print(f"   ‚è±Ô∏è Done in {time.time()-t0:.1f}s")

## 🎮 Step 3: Train XGBoost GPU — 34-Class

In [None]:
# Train the XGBoost sub-type classifier on the resampled data and score it on
# the test set.
#
# Early stopping is driven by a validation slice held out from the balanced
# training data, not by the test set, so the test metrics reported below do
# not take part in choosing the number of boosting rounds.
VAL_FRACTION = 0.1  # share of each class held out for early stopping

# Per-class hold-out using NumPy only. Classes with fewer than
# 1 / VAL_FRACTION samples contribute nothing to the validation slice and stay
# entirely in the training split.
val_rng = np.random.default_rng(42)
val_mask = np.zeros(len(y_train_balanced), dtype=bool)
for cls_id in np.unique(y_train_balanced):
    cls_idx = np.flatnonzero(y_train_balanced == cls_id)
    n_val = int(len(cls_idx) * VAL_FRACTION)
    if n_val > 0:
        val_mask[val_rng.choice(cls_idx, size=n_val, replace=False)] = True
if not val_mask.any():
    raise ValueError(
        "No class has enough samples to hold out a validation slice for early stopping"
    )

X_fit, y_fit = X_train_balanced[~val_mask], y_train_balanced[~val_mask]
X_val, y_val = X_train_balanced[val_mask], y_train_balanced[val_mask]

# The balanced arrays are no longer needed once they have been split.
del X_train_balanced, y_train_balanced
gc.collect()

# Compute sample weights on the rows that are actually used for fitting
sample_weights = compute_sample_weight('balanced', y_fit)

print(f"{'='*60}")
print(f"üéÆ XGBoost GPU ‚Äî 34-Class Sub-Type Classification")
print(f"{'='*60}")

dtrain = xgb.DMatrix(X_fit, label=y_fit, weight=sample_weights, feature_names=feature_names)
dval = xgb.DMatrix(X_val, label=y_val, feature_names=feature_names)
dtest = xgb.DMatrix(X_test, label=y_test, feature_names=feature_names)

del X_fit, y_fit, X_val, y_val
gc.collect()

xgb_params = {
    'tree_method': 'hist',
    'device': 'cuda',
    'objective': 'multi:softprob',
    'num_class': n_classes,
    'eval_metric': ['mlogloss', 'merror'],
    'max_depth': 10,
    'learning_rate': 0.1,
    'min_child_weight': 3,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'reg_alpha': 0.1,
    'reg_lambda': 1.0,
    'verbosity': 1,
    'seed': 42
}

evals_result = {}
t0 = time.time()
bst = xgb.train(
    xgb_params, dtrain,
    num_boost_round=400,
    # XGBoost early-stops on the LAST entry of `evals` (and the last metric in
    # `eval_metric`), so 'val' must stay last. 'test' is still recorded in
    # evals_result for monitoring but does not influence stopping.
    evals=[(dtrain, 'train'), (dtest, 'test'), (dval, 'val')],
    early_stopping_rounds=25,
    evals_result=evals_result,
    verbose_eval=25
)
t_train = time.time() - t0

# Predict with the trees up to and including the best iteration only.
y_pred_prob = bst.predict(dtest, iteration_range=(0, bst.best_iteration + 1))
y_pred = y_pred_prob.argmax(axis=1)

acc = accuracy_score(y_test, y_pred)
f1_mac = f1_score(y_test, y_pred, average='macro')
f1_wtd = f1_score(y_test, y_pred, average='weighted')

print(f"\nüéÆ GPU Training Complete!")
print(f"   ‚è±Ô∏è Time: {t_train:.1f}s ({t_train/60:.1f} min)")
print(f"   üèÜ Best iteration: {bst.best_iteration}")
print(f"   ‚úÖ Accuracy: {acc*100:.4f}%")
print(f"   üéØ F1-Macro: {f1_mac*100:.4f}%")
print(f"   üìè F1-Weighted: {f1_wtd*100:.4f}%")

bst.save_model('models/subtype_xgb_gpu.json')
print("   üíæ Saved to models/subtype_xgb_gpu.json")

# dtest is kept; the training and validation matrices are released.
del dtrain, dval; gc.collect()

In [None]:
# Also train LightGBM GPU for comparison
print(f"\n{'='*60}")
print(f"üéÆ LightGBM GPU ‚Äî 34-Class")
print(f"{'='*60}")

# The resampled arrays were deleted after the XGBoost step, so LightGBM trains
# on the original (unbalanced) training data. Imbalance is handled with
# per-sample 'balanced' weights on the Dataset: 'class_weight' is an option of
# the scikit-learn wrapper and is not applied when passed to lgb.train().
lgb_sample_weights = compute_sample_weight('balanced', y_train)
lgb_train = lgb.Dataset(X_train, label=y_train, weight=lgb_sample_weights,
                        feature_name=feature_names, free_raw_data=False)
lgb_test_ds = lgb.Dataset(X_test, label=y_test, feature_name=feature_names, reference=lgb_train, free_raw_data=False)

lgb_params = {
    'objective': 'multiclass',
    'num_class': n_classes,
    'metric': ['multi_logloss', 'multi_error'],
    'device': 'gpu',
    'gpu_use_dp': False,
    'max_depth': 10,
    'learning_rate': 0.1,
    'num_leaves': 200,
    'min_child_samples': 20,
    'subsample': 0.8,
    # Row subsampling (bagging) is only active when the bagging frequency is
    # non-zero; without this, 'subsample' has no effect.
    'subsample_freq': 1,
    'colsample_bytree': 0.8,
    'reg_alpha': 0.1,
    'reg_lambda': 1.0,
    'verbosity': 1,
    'seed': 42,
    'n_jobs': -1
}

evals_lgb = {}
t0 = time.time()
# NOTE(review): early stopping here monitors the test set, so the test
# metrics below also influence the number of boosting rounds. Consider a
# held-out validation split if an unbiased test estimate is required.
bst_lgb = lgb.train(
    lgb_params, lgb_train,
    num_boost_round=400,
    valid_sets=[lgb_train, lgb_test_ds],
    valid_names=['train', 'test'],
    callbacks=[
        lgb.log_evaluation(50),
        lgb.early_stopping(25),
        lgb.record_evaluation(evals_lgb)
    ]
)
t_train_lgb = time.time() - t0

# predict() returns one probability column per class; take the most likely one.
y_pred_lgb = bst_lgb.predict(X_test, num_iteration=bst_lgb.best_iteration).argmax(axis=1)

acc_lgb = accuracy_score(y_test, y_pred_lgb)
f1_mac_lgb = f1_score(y_test, y_pred_lgb, average='macro')

print(f"\n   üéÆ GPU | ‚è±Ô∏è {t_train_lgb:.1f}s")
print(f"   ‚úÖ Accuracy: {acc_lgb*100:.4f}% | F1-Macro: {f1_mac_lgb*100:.4f}%")

bst_lgb.save_model('models/subtype_lgb_gpu.txt')

## 📊 Step 4: Per-Class F1 Deep Dive

In [None]:
# Per-class report for the XGBoost predictions.
print("="*80)
print("üìä 34-CLASS CLASSIFICATION REPORT")
print("="*80)
# Pass the full label range explicitly: without it the report only covers
# labels that occur in y_test/y_pred, and the call raises when that count
# differs from len(subtype_classes). Labels are the integer positions into
# subtype_classes, as used elsewhere in this notebook.
print(classification_report(y_test, y_pred, labels=np.arange(n_classes),
                            target_names=subtype_classes, digits=4, zero_division=0))

In [None]:
# Per-class F1 bar chart (XGBoost predictions), sorted from worst to best.
from sklearn.metrics import f1_score as f1  # alias kept for any later cells that use it

# labels=... guarantees exactly one score per entry of subtype_classes, in
# class-id order, even when a class never appears in y_test or y_pred
# (such classes score 0 because of zero_division=0).
per_class_f1 = f1_score(y_test, y_pred, labels=np.arange(n_classes), average=None, zero_division=0)
f1_df = pd.DataFrame({'Class': subtype_classes, 'F1-Score': per_class_f1})
f1_df = f1_df.sort_values('F1-Score', ascending=True)

fig, ax = plt.subplots(figsize=(14, 12))

# Colour each bar by its score (red = low, green = high).
colors = plt.cm.RdYlGn(f1_df['F1-Score'].values)
bars = ax.barh(f1_df['Class'], f1_df['F1-Score']*100, color=colors, edgecolor='white', linewidth=0.5)

# Value label just to the right of each bar.
for bar, val in zip(bars, f1_df['F1-Score'].values):
    ax.text(bar.get_width() + 0.5, bar.get_y() + bar.get_height()/2.,
            f'{val*100:.1f}%', va='center', fontsize=9, color='white')

ax.axvline(x=90, color='#FFD700', linestyle='--', alpha=0.5, label='90%')
ax.axvline(x=50, color='#FF4C61', linestyle='--', alpha=0.5, label='50%')
ax.set_xlim(0, 105)
ax.set_title('üìä Per-Class F1-Score ‚Äî 34 Sub-Types', fontsize=16, fontweight='bold', color='#00D4AA')
ax.set_xlabel('F1-Score (%)', fontsize=12)
ax.legend(fontsize=10)
ax.grid(True, axis='x', alpha=0.2)

plt.tight_layout()
plt.savefig('figures/subtype_per_class_f1.png', dpi=150, bbox_inches='tight', facecolor='#1a1a2e')
plt.show()
print("üíæ Saved to figures/subtype_per_class_f1.png")

In [None]:
# Confusion Matrix (34x34), row-normalised to percentages of each actual class.
# labels=... keeps the matrix aligned with the subtype_classes tick labels even
# when a class is missing from y_test/y_pred.
cm = confusion_matrix(y_test, y_pred, labels=np.arange(n_classes))
row_totals = cm.sum(axis=1)[:, np.newaxis]
# Rows with no test samples stay at 0 instead of producing NaN from 0/0.
cm_pct = np.divide(cm.astype('float'), row_totals,
                   out=np.zeros(cm.shape, dtype=float), where=row_totals > 0) * 100

fig, ax = plt.subplots(figsize=(22, 18))

sns.heatmap(cm_pct, annot=True, fmt='.0f', cmap='RdYlGn',
            xticklabels=subtype_classes, yticklabels=subtype_classes,
            ax=ax, linewidths=0.5, linecolor='gray',
            annot_kws={'fontsize': 6}, cbar_kws={'label': 'Accuracy %'})
ax.set_title('üìä 34-Class Confusion Matrix (% per class)', fontsize=16, fontweight='bold', color='#00D4AA')
ax.set_xlabel('Predicted', fontsize=12)
ax.set_ylabel('Actual', fontsize=12)
ax.tick_params(axis='x', rotation=90, labelsize=8)
ax.tick_params(axis='y', labelsize=8)

plt.tight_layout()
plt.savefig('figures/subtype_confusion_matrix.png', dpi=150, bbox_inches='tight', facecolor='#1a1a2e')
plt.show()
print("üíæ Saved to figures/subtype_confusion_matrix.png")

## 🏆 Step 5: Feature Importance

In [None]:
# Top-15 XGBoost features ranked by 'weight' importance, drawn as a
# horizontal bar chart with the highest-scoring feature at the top.


def _display_name(raw_name):
    """Translate an 'f<index>' identifier into the matching feature name.

    Names that do not start with 'f', whose remainder is not an integer, or
    whose index is not below len(feature_names) are returned unchanged.
    """
    if not raw_name.startswith('f'):
        return raw_name
    try:
        position = int(raw_name[1:])
        return feature_names[position] if position < len(feature_names) else raw_name
    except ValueError:
        return raw_name


importance = bst.get_score(importance_type='weight')
sorted_imp = sorted(importance.items(), key=lambda item: item[1], reverse=True)[:15]

imp_names = [_display_name(raw_name) for raw_name, _ in sorted_imp]
imp_scores = [raw_score for _, raw_score in sorted_imp]

fig, ax = plt.subplots(figsize=(12, 8))

colors = plt.cm.plasma(np.linspace(0.2, 0.9, len(imp_names)))
# Reversed positions so the first (largest) entry ends up at the top.
y_pos = range(len(imp_names))[::-1]

bars = ax.barh(y_pos, imp_scores, color=colors, edgecolor='white', linewidth=0.5, height=0.7)
ax.set_yticks(y_pos)
ax.set_yticklabels(imp_names, fontsize=11)

# Small gap between the bar end and its value label, relative to the top score.
label_gap = max(imp_scores)*0.01 if imp_scores else 0
for rect, value in zip(bars, imp_scores):
    label_x = rect.get_width() + label_gap
    label_y = rect.get_y() + rect.get_height()/2.
    ax.text(label_x, label_y, f'{value:.0f}', ha='left', va='center', fontsize=10, color='white')

ax.set_title('üèÜ Top 15 Features ‚Äî 34-Class Classification', fontsize=16, fontweight='bold', color='#00D4AA')
ax.set_xlabel('Importance Score (Weight)', fontsize=12)
ax.grid(True, axis='x', alpha=0.2)

plt.tight_layout()
plt.savefig('figures/subtype_feature_importance.png', dpi=150, bbox_inches='tight', facecolor='#1a1a2e')
plt.show()
print("üíæ Saved to figures/subtype_feature_importance.png")

In [None]:
# Save results: collect the headline metrics of both models, the top-15
# feature importances and the per-class F1 scores into one JSON file.
subtype_results = {
    'timestamp': datetime.now().isoformat(),
    'level': '34-Class SubType',
    'device': 'GPU (CUDA)',
    'xgboost': {'accuracy': float(acc), 'f1_macro': float(f1_mac), 'f1_weighted': float(f1_wtd), 'train_time': t_train},
    'lightgbm': {'accuracy': float(acc_lgb), 'f1_macro': float(f1_mac_lgb), 'train_time': t_train_lgb},
    'feature_importance_top15': [{'feature': n, 'score': float(s)} for n, s in zip(imp_names, imp_scores)],
    # per_class_f1 is indexed by class id, i.e. by position in subtype_classes.
    'per_class_f1': {name: float(per_class_f1[i]) for i, name in enumerate(subtype_classes)}
}
with open('models/subtype_results.json', 'w') as f:
    json.dump(subtype_results, f, indent=2)

# One blank line, then a single banner row. Multiplying the string that
# contains the newline would print one symbol per line, 20 times.
print("\n" + "üèÜ" * 20)
print(f"  ‚úÖ 34-CLASS CLASSIFICATION COMPLETE!")
print(f"  üéÆ GPU Training | Accuracy: {acc*100:.2f}% | F1-Macro: {f1_mac*100:.2f}%")
print("üèÜ" * 20)