# Anomaly Detection Baselines for Bot-IoT

This notebook implements unsupervised anomaly detection models (Isolation Forest and Local Outlier Factor) on the Bot-IoT dataset, using a stratified train/test split and preprocessing fitted on the training set only, to establish baselines for comparison with LLM-based clustering approaches.

## 1. Import Libraries and Load Data

In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor
from sklearn.metrics import confusion_matrix, classification_report, roc_auc_score, roc_curve
import warnings
# Silence all library warnings so the notebook output stays readable.
warnings.filterwarnings('ignore')

# Seed NumPy's global RNG so any NumPy-driven randomness is reproducible.
np.random.seed(42)

# Read the Bot-IoT subset from its (machine-specific) absolute path.
file_path = "/Users/nawara/Desktop/LLM-Clustering-Paper/Bot-IoT-Dataset/bot_iot_balanced_subset_300k.csv"
df = pd.read_csv(file_path, low_memory=False)

# Report what was loaded: row/column counts and the full column list.
banner = "=" * 80
print(banner)
print("ANOMALY DETECTION BASELINES - DATA LOADING")
print(banner)
print(f"\nDataset loaded: {df.shape[0]:,} rows × {df.shape[1]} columns")
print(f"Features: {list(df.columns)}")

ANOMALY DETECTION BASELINES - DATA LOADING

Dataset loaded: 150,477 rows × 46 columns
Features: ['pkSeqID', 'stime', 'flgs', 'flgs_number', 'proto', 'proto_number', 'saddr', 'sport', 'daddr', 'dport', 'pkts', 'bytes', 'state', 'state_number', 'ltime', 'seq', 'dur', 'mean', 'stddev', 'sum', 'min', 'max', 'spkts', 'dpkts', 'sbytes', 'dbytes', 'rate', 'srate', 'drate', 'TnBPSrcIP', 'TnBPDstIP', 'TnP_PSrcIP', 'TnP_PDstIP', 'TnP_PerProto', 'TnP_Per_Dport', 'AR_P_Proto_P_SrcIP', 'AR_P_Proto_P_DstIP', 'N_IN_Conn_P_DstIP', 'N_IN_Conn_P_SrcIP', 'AR_P_Proto_P_Sport', 'AR_P_Proto_P_Dport', 'Pkts_P_State_P_Protocol_P_DestIP', 'Pkts_P_State_P_Protocol_P_SrcIP', 'attack', 'category', 'subcategory']


## 2. Train/Test Split (No Leakage)

In [2]:
# Build the feature matrix by removing the three label columns and pkSeqID
# (presumably a row identifier - confirm). Labels are used for evaluation only.
non_feature_columns = ['attack', 'category', 'subcategory', 'pkSeqID']
X = df.drop(columns=non_feature_columns)
y = df['attack']  # 0=Normal, 1=Attack

banner = "=" * 80
print("\n" + banner)
print("TRAIN/TEST SPLIT")
print(banner)

# Stratified 70/30 split: both partitions keep the class ratio of `y`.
split_options = {'test_size': 0.3, 'random_state': 42, 'stratify': y}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

# Partition sizes as absolute counts and as a share of the full dataset.
print(f"\nTrain set size: {X_train.shape[0]:,} samples ({X_train.shape[0]/len(df)*100:.1f}%)")
print(f"Test set size: {X_test.shape[0]:,} samples ({X_test.shape[0]/len(df)*100:.1f}%)")

# Per-class counts in each partition.
print(f"\nTrain label distribution:")
print(y_train.value_counts())
print(f"\nTest label distribution:")
print(y_test.value_counts())


TRAIN/TEST SPLIT

Train set size: 105,333 samples (70.0%)
Test set size: 45,144 samples (30.0%)

Train label distribution:
attack
1    104999
0       334
Name: count, dtype: int64

Test label distribution:
attack
1    45001
0      143
Name: count, dtype: int64


## 3. Feature Preprocessing (Fit on Train Only)

In [6]:
print("\n" + "="*80)
print("FEATURE PREPROCESSING (FAST)")
print("="*80)

# Defensive re-drop of the label columns. errors='ignore' removes whichever of
# them are present, instead of keying the whole drop off 'attack' alone (which
# would raise if 'attack' were present but another label column were not).
label_columns = ['attack', 'category', 'subcategory']
X_train = X_train.drop(columns=label_columns, errors='ignore')
X_test = X_test.drop(columns=label_columns, errors='ignore')

# Identify columns by dtype: numeric columns are scaled, object columns encoded.
numeric_features = X_train.select_dtypes(include=[np.number]).columns.tolist()
categorical_features = X_train.select_dtypes(include=['object']).columns.tolist()

print(f"\nNumeric features: {len(numeric_features)}")
print(f"Categorical features: {len(categorical_features)}")

# ────────────────────────────────────────
# Encode categoricals (FAST with OrdinalEncoder)
# ────────────────────────────────────────
print("\n>>> Encoding categorical features (OrdinalEncoder)...")
if len(categorical_features) > 0:
    # Categories that appear only in TEST are mapped to -1 instead of raising.
    encoder = OrdinalEncoder(
        handle_unknown='use_encoded_value',
        unknown_value=-1,
        dtype=np.float64
    )
    X_train_cat = encoder.fit_transform(X_train[categorical_features])
    X_test_cat = encoder.transform(X_test[categorical_features])

    # Standardise the integer codes as well (statistics from TRAIN only).
    # Left raw, a high-cardinality column yields codes in the hundreds or
    # thousands, which would swamp the unit-variance numeric features in the
    # distance computations LOF relies on.
    cat_scaler = StandardScaler()
    X_train_cat = cat_scaler.fit_transform(X_train_cat)
    X_test_cat = cat_scaler.transform(X_test_cat)
    print(f"    Encoded {len(categorical_features)} categorical features")
else:
    # Zero-width placeholders keep the combine step below uniform.
    X_train_cat = np.empty((len(X_train), 0))
    X_test_cat = np.empty((len(X_test), 0))

# ────────────────────────────────────────
# Scale numeric features
# ────────────────────────────────────────
print(">>> Scaling numeric features (StandardScaler)...")
scaler = StandardScaler()
X_train_num = scaler.fit_transform(X_train[numeric_features])  # fit on TRAIN only
X_test_num = scaler.transform(X_test[numeric_features])
print(f"    Scaled {len(numeric_features)} numeric features")

# ────────────────────────────────────────
# Combine (FAST with numpy)
# ────────────────────────────────────────
# Column order of the final matrices: numeric features first, then categoricals.
print(">>> Combining features...")
X_train_scaled = np.hstack([X_train_num, X_train_cat]) if X_train_cat.shape[1] > 0 else X_train_num
X_test_scaled = np.hstack([X_test_num, X_test_cat]) if X_test_cat.shape[1] > 0 else X_test_num

print(f"\n✓ Preprocessing complete!")
print(f"  Train matrix: {X_train_scaled.shape}")
print(f"  Test matrix:  {X_test_scaled.shape}")


FEATURE PREPROCESSING (FAST)

Numeric features: 35
Categorical features: 7

>>> Encoding categorical features (OrdinalEncoder)...
    Encoded 7 categorical features
>>> Scaling numeric features (StandardScaler)...
    Scaled 35 numeric features
>>> Combining features...

✓ Preprocessing complete!
  Train matrix: (105333, 42)
  Test matrix:  (45144, 42)


## 4. Isolation Forest - Train & Predict

In [None]:
banner = "=" * 80
print("\n" + banner)
print("ISOLATION FOREST")
print(banner)

# Fit on the TRAIN matrix only; labels are never passed to the model.
# NOTE(review): contamination=0.1 assumes ~10% anomalies, but the recorded
# train label counts are 104,999 attack vs 334 normal. Confirm that treating
# "anomaly" as "attack" in the evaluation is intended for this class balance.
print("\n>>> Training Isolation Forest on TRAIN...")
iso_params = {
    'contamination': 0.1,  # Assume 10% anomalies
    'n_estimators': 100,
    'random_state': 42,
    'n_jobs': -1,
}
iso_forest = IsolationForest(**iso_params)
iso_forest.fit(X_train_scaled)
print("✓ Isolation Forest trained successfully!")

print("\n>>> Scoring on TRAIN and TEST...")

# Hard decisions from predict(): -1 = anomaly, 1 = normal.
iso_train_pred = iso_forest.predict(X_train_scaled)
iso_test_pred = iso_forest.predict(X_test_scaled)

# Raw score_samples() output: the lower the value, the more abnormal the sample.
iso_train_scores = iso_forest.score_samples(X_train_scaled)
iso_test_scores = iso_forest.score_samples(X_test_scaled)

# Flip the sign so that a higher value means "more anomalous".
iso_train_anomaly_scores = np.negative(iso_train_scores)
iso_test_anomaly_scores = np.negative(iso_test_scores)

# Score ranges, followed by the number of samples flagged in each partition.
print(f"\nTrain anomaly scores - Min: {iso_train_anomaly_scores.min():.4f}, Max: {iso_train_anomaly_scores.max():.4f}, Mean: {iso_train_anomaly_scores.mean():.4f}")
print(f"Test anomaly scores - Min: {iso_test_anomaly_scores.min():.4f}, Max: {iso_test_anomaly_scores.max():.4f}, Mean: {iso_test_anomaly_scores.mean():.4f}")
print(f"\nTrain - Detected anomalies: {(iso_train_pred == -1).sum():,} / {len(iso_train_pred):,}")
print(f"Test - Detected anomalies: {(iso_test_pred == -1).sum():,} / {len(iso_test_pred):,}")


ISOLATION FOREST

>>> Training Isolation Forest on TRAIN...
✓ Isolation Forest trained successfully!

>>> Scoring on TRAIN and TEST...


## 5. Local Outlier Factor (LOF) - Train & Predict

In [None]:
print("\n" + "="*80)
print("LOCAL OUTLIER FACTOR (LOF)")
print("="*80)

# Train on TRAIN only
print("\n>>> Training LOF on TRAIN...")
lof = LocalOutlierFactor(
    n_neighbors=20,
    contamination=0.1,  # Assume 10% anomalies
    novelty=True,  # Enable scoring on new data
    n_jobs=-1
)

lof.fit(X_train_scaled)
print("✓ LOF trained successfully!")

# Score on both TRAIN and TEST
print("\n>>> Scoring on TRAIN and TEST...")
# With novelty=True, predict()/score_samples() are only valid on unseen data:
# applied to the training matrix, every sample would count itself among its
# own neighbours and the scores would be wrong. The scores computed during
# fit() are therefore used for TRAIN. Lower = more abnormal.
lof_train_scores = lof.negative_outlier_factor_
lof_test_scores = lof.score_samples(X_test_scaled)

# TRAIN decisions use the same rule predict() applies (score below the fitted
# offset_ => anomaly), evaluated on the fit-time scores. -1 = anomaly, 1 = normal.
lof_train_pred = np.where(lof_train_scores < lof.offset_, -1, 1)
lof_test_pred = lof.predict(X_test_scaled)

# Convert to anomaly scores (higher = more anomalous)
lof_train_anomaly_scores = -lof_train_scores  # Invert so higher is more anomalous
lof_test_anomaly_scores = -lof_test_scores

print(f"\nTrain anomaly scores - Min: {lof_train_anomaly_scores.min():.4f}, Max: {lof_train_anomaly_scores.max():.4f}, Mean: {lof_train_anomaly_scores.mean():.4f}")
print(f"Test anomaly scores - Min: {lof_test_anomaly_scores.min():.4f}, Max: {lof_test_anomaly_scores.max():.4f}, Mean: {lof_test_anomaly_scores.mean():.4f}")
print(f"\nTrain - Detected anomalies: {(lof_train_pred == -1).sum():,} / {len(lof_train_pred):,}")
print(f"Test - Detected anomalies: {(lof_test_pred == -1).sum():,} / {len(lof_test_pred):,}")

## 6. Evaluation on Test Set (Using Labels)

In [None]:
banner = "=" * 80
print("\n" + banner)
print("EVALUATION ON TEST SET")
print(banner)

# Re-encode model decisions to match the label convention used by `y`:
# the models emit -1 for anomalies and 1 for normal points; here anomaly -> 1
# and normal -> 0, so predictions can be compared directly with the labels.
iso_test_binary = np.where(iso_test_pred == -1, 1, 0)
lof_test_binary = np.where(lof_test_pred == -1, 1, 0)
y_test_binary = y_test.to_numpy()  # 0=Normal, 1=Attack

def evaluate_model(y_true, y_pred, anomaly_scores, model_name):
    """Print and return binary-classification metrics for an anomaly detector.

    Args:
        y_true: Ground-truth labels (0 = normal, 1 = attack).
        y_pred: Binary predictions in the same encoding (1 = flagged anomaly).
        anomaly_scores: Continuous scores where higher means more anomalous;
            used only for ROC-AUC.
        model_name: Display name, printed and stored under the 'Model' key.

    Returns:
        dict with keys 'Model', 'Accuracy', 'Precision', 'Recall', 'F1-Score',
        'ROC-AUC', 'TP', 'FP', 'TN', 'FN'. 'ROC-AUC' is NaN when ``y_true``
        contains a single class, because the metric is undefined there.
    """
    # Local import: these four metrics are not part of the file-level imports.
    from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

    print(f"\n>>> {model_name}")
    print(f"{'─'*60}")

    # Threshold-based metrics. zero_division=0 makes the "nothing predicted /
    # nothing to find" case an explicit 0.0 rather than a warning-backed default.
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred, zero_division=0)
    recall = recall_score(y_true, y_pred, zero_division=0)
    f1 = f1_score(y_true, y_pred, zero_division=0)

    # ROC-AUC uses the continuous scores and needs both classes in y_true.
    if np.unique(y_true).size < 2:
        roc_auc = float('nan')
    else:
        roc_auc = roc_auc_score(y_true, anomaly_scores)

    print(f"Accuracy:  {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall:    {recall:.4f}")
    print(f"F1-Score:  {f1:.4f}")
    print(f"ROC-AUC:   {roc_auc:.4f}")

    # Pin labels so the matrix is always 2x2; without this a single-class
    # input yields a 1x1 matrix and the four-way unpacking below fails.
    cm = confusion_matrix(y_true, y_pred, labels=[0, 1])
    tn, fp, fn, tp = cm.ravel()
    print(f"\nConfusion Matrix:")
    print(f"  True Negatives:  {tn:,}")
    print(f"  False Positives: {fp:,}")
    print(f"  False Negatives: {fn:,}")
    print(f"  True Positives:  {tp:,}")

    return {
        'Model': model_name,
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1-Score': f1,
        'ROC-AUC': roc_auc,
        'TP': tp,
        'FP': fp,
        'TN': tn,
        'FN': fn
    }

# Score each detector against the same test labels; the returned dicts feed
# the comparison plots, the saved metrics table and the summary.
iso_results = evaluate_model(
    y_true=y_test_binary,
    y_pred=iso_test_binary,
    anomaly_scores=iso_test_anomaly_scores,
    model_name="Isolation Forest",
)
lof_results = evaluate_model(
    y_true=y_test_binary,
    y_pred=lof_test_binary,
    anomaly_scores=lof_test_anomaly_scores,
    model_name="Local Outlier Factor",
)

## 7. Results Visualization

In [None]:
# 2x2 dashboard: metric bars, ROC curves, score histogram, confusion matrix.
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(15, 10))
label_font = {'fontsize': 11, 'fontweight': 'bold'}
title_font = {'fontsize': 12, 'fontweight': 'bold'}

# 1. Metrics Comparison - grouped bars, one pair per metric.
metrics_df = pd.DataFrame([iso_results, lof_results])
metrics_to_plot = ['Accuracy', 'Precision', 'Recall', 'F1-Score', 'ROC-AUC']
x_pos = np.arange(len(metrics_to_plot))
width = 0.35

ax = axes[0, 0]
# (row in metrics_df, horizontal offset, legend label, bar colour)
bar_specs = [
    (0, -width/2, 'Isolation Forest', 'steelblue'),
    (1, width/2, 'LOF', 'coral'),
]
for _row, _offset, _label, _color in bar_specs:
    ax.bar(x_pos + _offset, metrics_df.loc[_row, metrics_to_plot], width, label=_label, alpha=0.8, color=_color)
ax.set_ylabel('Score', **label_font)
ax.set_title('Model Performance Comparison', **title_font)
ax.set_xticks(x_pos)
ax.set_xticklabels(metrics_to_plot, rotation=45, ha='right')
ax.legend()
ax.grid(axis='y', alpha=0.3)
ax.set_ylim([0, 1.05])

# 2. ROC Curves - built from the continuous anomaly scores.
ax = axes[0, 1]
fpr_iso, tpr_iso, _ = roc_curve(y_test_binary, iso_test_anomaly_scores)
fpr_lof, tpr_lof, _ = roc_curve(y_test_binary, lof_test_anomaly_scores)

roc_specs = [
    (fpr_iso, tpr_iso, f'Isolation Forest (AUC={iso_results["ROC-AUC"]:.3f})', 'steelblue'),
    (fpr_lof, tpr_lof, f'LOF (AUC={lof_results["ROC-AUC"]:.3f})', 'coral'),
]
for _fpr, _tpr, _label, _color in roc_specs:
    ax.plot(_fpr, _tpr, label=_label, linewidth=2, color=_color)
# Diagonal reference line for a random classifier.
ax.plot([0, 1], [0, 1], 'k--', label='Random Classifier', linewidth=1)
ax.set_xlabel('False Positive Rate', **label_font)
ax.set_ylabel('True Positive Rate', **label_font)
ax.set_title('ROC Curves', **title_font)
ax.legend(loc='lower right')
ax.grid(alpha=0.3)

# 3. Anomaly Score Distributions - Isolation Forest scores split by true label.
ax = axes[1, 0]
for _cls, _label, _color in ((0, 'Normal', 'green'), (1, 'Attack', 'red')):
    ax.hist(iso_test_anomaly_scores[y_test_binary == _cls], bins=50, alpha=0.6, label=_label, color=_color)
ax.set_xlabel('Anomaly Score', **label_font)
ax.set_ylabel('Frequency', **label_font)
ax.set_title('Isolation Forest - Score Distribution', **title_font)
ax.legend()
ax.grid(alpha=0.3, axis='y')

# 4. Confusion Matrix Heatmap (Isolation Forest)
ax = axes[1, 1]
cm_iso = confusion_matrix(y_test_binary, iso_test_binary)
sns.heatmap(cm_iso, annot=True, fmt='d', cmap='Blues', ax=ax, cbar_kws={'label': 'Count'})
ax.set_xlabel('Predicted', **label_font)
ax.set_ylabel('Actual', **label_font)
ax.set_title('Isolation Forest - Confusion Matrix', **title_font)
# Tick labels are applied after the heatmap has created its ticks.
class_names = ['Normal', 'Attack']
ax.set_xticklabels(class_names)
ax.set_yticklabels(class_names)

# Write the figure to disk before showing it.
plt.tight_layout()
plt.savefig('/Users/nawara/Desktop/LLM-Clustering-Paper/Anomaly_Baselines_Results.png', dpi=300, bbox_inches='tight')
plt.show()

print("✓ Visualization saved!")

## 8. Save Results and Anomaly Scores

In [None]:
banner = "=" * 80
print("\n" + banner)
print("SAVING RESULTS")
print(banner)

# 1. Metrics table: one row per model, written without the index column.
metrics_csv = "/Users/nawara/Desktop/LLM-Clustering-Paper/baseline_metrics.csv"
metrics_df = pd.DataFrame([iso_results, lof_results])
metrics_df.to_csv(metrics_csv, index=False)
print(f"\n✓ Metrics saved to: {metrics_csv}")
print(metrics_df.to_string(index=False))

# 2. Per-sample test output: true label plus each model's score and decision.
prediction_columns = {
    'actual_label': y_test.values,
    'iso_forest_score': iso_test_anomaly_scores,
    'iso_forest_pred': iso_test_binary,
    'lof_score': lof_test_anomaly_scores,
    'lof_pred': lof_test_binary,
}
results_df = pd.DataFrame(prediction_columns)

results_csv = "/Users/nawara/Desktop/LLM-Clustering-Paper/baseline_test_predictions.csv"
results_df.to_csv(results_csv, index=False)
print(f"\n✓ Test predictions saved to: {results_csv}")
print(f"  Shape: {results_df.shape}")
print(f"\n  Sample (first 5 rows):")
print(results_df.head(5))

## 9. Summary and Next Steps

In [None]:
print("\n" + "="*80)
print("SUMMARY")
print("="*80)

# Human-readable recap of the pipeline and the test-set metrics of both models.
# The encoder named here must match the preprocessing cell, which uses
# OrdinalEncoder (the earlier text incorrectly said LabelEncoder).
summary = f"""
BASELINE ANOMALY DETECTION MODELS
{'='*80}

✓ DATA PREPROCESSING:
  • Train/Test Split: 70% / 30% (stratified)
  • Categorical Encoding: OrdinalEncoder (fit on train only)
  • Numeric Scaling: StandardScaler (fit on train only)
  • No data leakage ensured

✓ MODELS TRAINED:
  1. Isolation Forest (contamination=0.1)
  2. Local Outlier Factor (n_neighbors=20, contamination=0.1)

✓ TEST SET EVALUATION:
  
  Isolation Forest:
    • Accuracy:  {iso_results['Accuracy']:.4f}
    • Precision: {iso_results['Precision']:.4f}
    • Recall:    {iso_results['Recall']:.4f}
    • F1-Score:  {iso_results['F1-Score']:.4f}
    • ROC-AUC:   {iso_results['ROC-AUC']:.4f}
  
  Local Outlier Factor:
    • Accuracy:  {lof_results['Accuracy']:.4f}
    • Precision: {lof_results['Precision']:.4f}
    • Recall:    {lof_results['Recall']:.4f}
    • F1-Score:  {lof_results['F1-Score']:.4f}
    • ROC-AUC:   {lof_results['ROC-AUC']:.4f}

✓ OUTPUTS GENERATED:
  1. baseline_metrics.csv - Model evaluation metrics
  2. baseline_test_predictions.csv - Predictions + anomaly scores on test
  3. Anomaly_Baselines_Results.png - Visualization

{'='*80}
NEXT STEPS:
  1. Implement LLM-based clustering approach
  2. Use same preprocessing and train/test split
  3. Compare LLM-Clustering results against these baselines
  4. Analyze differences in detected anomalies
  5. Write comparison analysis for paper
{'='*80}
"""

print(summary)