# Wi-Fi Network Security Threat Detection Using Machine Learning

In [1]:
# Define the path to the pcap files and the output directory
pcap_directory = "../datasets/wifi"
output_directory = "./model_output/wifi"

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scapy.utils import rdpcap
from scapy.layers.inet import IP
import joblib
import warnings
from datetime import datetime
import os
from scipy import stats

from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import RobustScaler
from sklearn.metrics import (accuracy_score, precision_score, recall_score,
                           f1_score, roc_auc_score, precision_recall_curve,
                           average_precision_score, confusion_matrix,
                           roc_curve, auc, silhouette_score)
from sklearn.covariance import EllipticEnvelope
from sklearn.svm import OneClassSVM
from sklearn.neighbors import LocalOutlierFactor
from sklearn.decomposition import PCA

warnings.filterwarnings('ignore')

## Load Scaled Data

In [3]:
from typing import Tuple, List, Dict

def load_processed_data(input_dir: str = 'processed_data') -> Tuple[pd.DataFrame, pd.DataFrame]:
  """Load processed data from CSV files."""
  scaled_df = pd.read_csv(f'{input_dir}/scaled_features.csv', index_col=0)
  raw_df = pd.read_csv(f'{input_dir}/raw_features.csv', index_col=0)
  
  print(f"Loaded processed data from {input_dir}/")
  print(f"Scaled features shape: {scaled_df.shape}")
  print(f"Raw features shape: {raw_df.shape}")
  
  return scaled_df, raw_df

In [4]:
X_scaled, raw_features = load_processed_data()

Loaded processed data from processed_data/
Scaled features shape: (39053, 403)
Raw features shape: (39053, 411)


## Model Training

In [5]:
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Dropout
from sklearn.model_selection import GridSearchCV
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.optimizers import Adam

def hyperparameter_tuning_iso_forest(X):
    """
    Manually tune Isolation Forest hyperparameters using unsupervised metrics.
    """
    param_grid = {
        'n_estimators': [100, 200],
        'max_samples': ['auto', 0.5, 0.75],
        'contamination': [0.01, 0.05, 0.1],
        'max_features': [1.0, 0.5, 0.75]
    }
    
    best_score = -np.inf
    best_params = None
    best_model = None
    
    for n_estimators in param_grid['n_estimators']:
        for max_samples in param_grid['max_samples']:
            for contamination in param_grid['contamination']:
                for max_features in param_grid['max_features']:
                    model = IsolationForest(
                        n_estimators=n_estimators,
                        max_samples=max_samples,
                        contamination=contamination,
                        max_features=max_features,
                        random_state=42
                    )
                    model.fit(X)
                    scores = -model.decision_function(X)
                    
                    # Evaluate using Silhouette Score
                    threshold = np.percentile(scores, 100 * (1 - contamination))
                    labels = (scores >= threshold).astype(int)
                    
                    if len(np.unique(labels)) > 1:
                        silhouette = silhouette_score(X, labels)
                    else:
                        silhouette = -1  # Assign a poor score if only one cluster
                    
                    if silhouette > best_score:
                        best_score = silhouette
                        best_params = {
                            'n_estimators': n_estimators,
                            'max_samples': max_samples,
                            'contamination': contamination,
                            'max_features': max_features
                        }
                        best_model = model
                        
    print("Best parameters for Isolation Forest:")
    print(best_params)
    print(f"Best Silhouette Score: {best_score:.4f}")
    
    # Get final scores with best model
    final_scores = -best_model.decision_function(X)
    
    return best_model, final_scores

def hyperparameter_tuning_lof(X):
    """
    Manually tune LOF hyperparameters using unsupervised metrics.
    """
    param_grid = {
        'n_neighbors': [10, 20, 30],
        'leaf_size': [30, 50],
        'metric': ['euclidean', 'manhattan']
    }
    
    best_score = -np.inf
    best_params = None
    best_model = None
    
    for n_neighbors in param_grid['n_neighbors']:
        for leaf_size in param_grid['leaf_size']:
            for metric in param_grid['metric']:
                model = LocalOutlierFactor(
                    n_neighbors=n_neighbors,
                    leaf_size=leaf_size,
                    metric=metric,
                    novelty=True
                )
                model.fit(X)
                scores = -model.decision_function(X)
                
                # Evaluate using Silhouette Score
                contamination = 0.05  # Assume 5% anomalies
                threshold = np.percentile(scores, 100 * (1 - contamination))
                labels = (scores >= threshold).astype(int)
                
                if len(np.unique(labels)) > 1:
                    silhouette = silhouette_score(X, labels)
                else:
                    silhouette = -1  # Assign a poor score if only one cluster
                
                if silhouette > best_score:
                    best_score = silhouette
                    best_params = {
                        'n_neighbors': n_neighbors,
                        'leaf_size': leaf_size,
                        'metric': metric
                    }
                    best_model = model
                        
    print("Best parameters for LOF:")
    print(best_params)
    print(f"Best Silhouette Score: {best_score:.4f}")
    
    # Get final scores with best model
    final_scores = -best_model.decision_function(X)
    
    return best_model, final_scores

import kerastuner as kt
import json
def build_autoencoder_model(hp):
    """
    Build autoencoder with dynamic input dimension handling
    """
    input_dim = X_scaled.shape[1]
    
    # Ensure consistent layer sizes
    encoding_dims = [
        hp.Int('encoding_' + str(i), 
               min_value=32,
               max_value=input_dim,
               step=32)
        for i in range(hp.Int('num_encoding_layers', 1, 3))
    ]
    
    # Build model
    input_layer = Input(shape=(input_dim,))
    encoded = input_layer
    
    # Encoder
    for dim in encoding_dims:
        encoded = Dense(dim, activation='relu')(encoded)
        if hp.Boolean('use_dropout'):
            encoded = Dropout(hp.Float('dropout_rate', 0.1, 0.5))(encoded)
            
    # Decoder (mirror encoder architecture)
    decoded = encoded
    for dim in reversed(encoding_dims[:-1]):
        decoded = Dense(dim, activation='relu')(decoded)
    
    # Final output layer must match input dimension
    decoded = Dense(input_dim, activation='linear')(decoded)
    
    # Create and compile model
    autoencoder = Model(inputs=input_layer, outputs=decoded)
    autoencoder.compile(
        optimizer=Adam(learning_rate=hp.Choice('learning_rate', [1e-2, 1e-3, 1e-4])),
        loss='mse'
    )
    
    return autoencoder

def train_autoencoder_tuned(X):
    """
    Train autoencoder with improved dimension handling
    """
    # Create tuner
    tuner = kt.RandomSearch(
        build_autoencoder_model,
        objective='val_loss',
        max_trials=5,
        directory='autoencoder_tuning',
        project_name='wifi_anomaly_detection',
        overwrite=True  # Add this to avoid conflicts
    )
    
    # Add early stopping
    early_stopping = EarlyStopping(
        monitor='val_loss',
        patience=5,
        restore_best_weights=True
    )
    
    # Fit tuner
    try:
        tuner.search(
            X, X,  # Input = Output for autoencoder
            epochs=20,
            batch_size=32,  # Reduced batch size
            validation_split=0.2,
            callbacks=[early_stopping],
            verbose=1
        )
        
        # Get best model
        best_params = tuner.get_best_hyperparameters(1)[0]
        best_model = tuner.hypermodel.build(best_params)
        
        # Train best model
        best_model.fit(
            X, X,
            epochs=30,  # Increase epochs for final training
            batch_size=32,
            validation_split=0.2,
            callbacks=[early_stopping],
            verbose=1
        )
        
        # Save hyperparameters
        with open("best_autoencoder_params.json", "w") as f:
            json.dump(best_params.values, f)
            
        # Calculate reconstruction errors
        reconstructed_X = best_model.predict(X)
        mse = np.mean(np.power(X - reconstructed_X, 2), axis=1)
        
        return best_model, mse
        
    except Exception as e:
        print(f"Error during autoencoder training: {str(e)}")
        raise

2024-12-01 23:28:17.043124: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-12-01 23:28:17.103565: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1733124497.109332   29322 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1733124497.111009   29322 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-12-01 23:28:17.118665: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instr

### 1. Isolation Forest

In [6]:
features_scaled_df = X_scaled.copy()

# Train Isolation Forest
iso_forest_model, iso_scores = hyperparameter_tuning_iso_forest(X_scaled)

# Add scores to the data
features_scaled_df['iso_scores'] = iso_scores

Best parameters for Isolation Forest:
{'n_estimators': 100, 'max_samples': 0.5, 'contamination': 0.01, 'max_features': 0.5}
Best Silhouette Score: 0.9863


### 2. Local Outlier Factor

In [7]:
# Train Local Outlier Factor
lof_model, lof_scores = hyperparameter_tuning_lof(X_scaled)

# Add scores to the data
features_scaled_df['lof_scores'] = lof_scores

Best parameters for LOF:
{'n_neighbors': 10, 'leaf_size': 30, 'metric': 'manhattan'}
Best Silhouette Score: 0.9855


### 3. Autoencoder

In [8]:
# Train Autoencoder
autoencoder_model, ae_scores = train_autoencoder_tuned(X_scaled)

# Add reconstruction errors to the data
features_scaled_df['ae_scores '] = ae_scores 

Trial 5 Complete [00h 00m 06s]
val_loss: 908835.6875

Best val_loss So Far: 708745.25
Total elapsed time: 00h 00m 56s
Epoch 1/30
[1m977/977[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - loss: 208006240.0000 - val_loss: 1043362624.0000
Epoch 2/30
[1m977/977[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - loss: 43972780.0000 - val_loss: 2206575616.0000
Epoch 3/30
[1m977/977[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - loss: 134839488.0000 - val_loss: 92033608.0000
Epoch 4/30
[1m977/977[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - loss: 132504696.0000 - val_loss: 15877395.0000
Epoch 5/30
[1m977/977[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - loss: 9922196.0000 - val_loss: 79376032.0000
Epoch 6/30
[1m977/977[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - loss: 23715012.0000 - val_loss: 967909.0000
Epoch 7/30
[1m977/977[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[

## Evaluation

In [12]:
import numpy as np
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score
from sklearn.preprocessing import StandardScaler

def evaluate_model_unsupervised(X, scores, n_clusters=3, method='quantile'):
    """
    Evaluate the model using unsupervised metrics with improved robustness.
    
    Parameters:
    -----------
    X : array-like of shape (n_samples, n_features)
        The input samples to evaluate
    scores : array-like of shape (n_samples,)
        The anomaly scores from the model
    n_clusters : int, default=3
        Number of clusters to create from scores
    method : str, default='quantile'
        Method to create clusters: 'quantile' or 'kmeans'
    
    Returns:
    --------
    dict : Evaluation metrics and diagnostic information
    """
    # Input validation
    X = np.array(X)
    scores = np.array(scores)
    
    if len(X) != len(scores):
        raise ValueError(f"Length mismatch: X ({len(X)}) and scores ({len(scores)})")
        
    # Scale the features for better metric calculation
    X_scaled = StandardScaler().fit_transform(X)
    
    # Create clusters using quantiles
    if method == 'quantile':
        quantiles = np.linspace(0, 100, n_clusters+1)
        thresholds = np.percentile(scores, quantiles[1:-1])
        labels = np.zeros(len(scores), dtype=int)
        
        for i, threshold in enumerate(thresholds, 1):
            labels[scores >= threshold] = i
    
    # Validate clusters
    unique_labels = np.unique(labels)
    n_unique_clusters = len(unique_labels)
    
    if n_unique_clusters < 2:
        return {
            'error': 'Insufficient distinct clusters formed',
            'n_clusters_formed': n_unique_clusters,
            'cluster_sizes': {label: np.sum(labels == label) for label in unique_labels},
            'score_range': {'min': scores.min(), 'max': scores.max()}
        }
    
    # Calculate metrics with error handling
    metrics = {}
    try:
        metrics['silhouette_score'] = silhouette_score(X_scaled, labels)
    except Exception as e:
        metrics['silhouette_score'] = {'error': str(e)}
        
    try:
        metrics['calinski_harabasz_score'] = calinski_harabasz_score(X_scaled, labels)
    except Exception as e:
        metrics['calinski_harabasz_score'] = {'error': str(e)}
        
    try:
        metrics['davies_bouldin_score'] = davies_bouldin_score(X_scaled, labels)
    except Exception as e:
        metrics['davies_bouldin_score'] = {'error': str(e)}
    
    # Add diagnostic information
    metrics['diagnostics'] = {
        'n_clusters': n_unique_clusters,
        'cluster_sizes': {label: np.sum(labels == label) for label in unique_labels},
        'cluster_proportions': {label: np.mean(labels == label) for label in unique_labels},
        'score_distribution': {
            'min': scores.min(),
            'max': scores.max(),
            'mean': scores.mean(),
            'std': scores.std()
        }
    }
    
    return metrics

In [14]:
# Evaluate Isolation Forest
iso_metrics = evaluate_model_unsupervised(X_scaled, iso_scores)
print("Isolation Forest Metrics:")
for k, v in iso_metrics.items():
    print(f"{k}: {v}")

# Evaluate LOF
lof_metrics = evaluate_model_unsupervised(X_scaled, lof_scores)
print("\nLocal Outlier Factor Metrics:")
for k, v in lof_metrics.items():
    print(f"{k}: {v}")

# Evaluate Autoencoder
ae_metrics = evaluate_model_unsupervised(X_scaled, ae_scores)
print("\nAutoencoder Metrics:")
for k, v in ae_metrics.items():
    print(f"{k}: {v}")

Isolation Forest Metrics:
error: Insufficient distinct clusters formed
n_clusters_formed: 1
cluster_sizes: {np.int64(2): np.int64(39053)}
score_range: {'min': np.float64(-0.013055160215452055), 'max': np.float64(0.588711862057856)}

Local Outlier Factor Metrics:
silhouette_score: 0.985477222911455
calinski_harabasz_score: 639.5620796425093
davies_bouldin_score: 2.528040757502514
diagnostics: {'n_clusters': 2, 'cluster_sizes': {np.int64(0): np.int64(170), np.int64(2): np.int64(38883)}, 'cluster_proportions': {np.int64(0): np.float64(0.004353058663867052), np.int64(2): np.float64(0.995646941336133)}, 'score_distribution': {'min': np.float64(-0.5719205514496825), 'max': np.float64(90639999999.46509), 'mean': np.float64(6835838.003138249), 'std': np.float64(779972270.9915607)}}

Autoencoder Metrics:
error: Insufficient distinct clusters formed
n_clusters_formed: 1
cluster_sizes: {np.int64(2): np.int64(39053)}
score_range: {'min': np.float64(0.06985746443447098), 'max': np.float64(132905209

## Save Model

In [11]:
# Save all three models
joblib.dump(iso_forest_model, f"{output_directory}/iso_forest_model.pkl")
joblib.dump(lof_model, f"{output_directory}/lof_model.pkl")
autoencoder_model.save(f"{output_directory}/autoencoder_model.keras")