# Wi-Fi Network Security Threat Detection Using Machine Learning

In [1]:
# Define the path to the pcap files and the output directory
pcap_directory = "../datasets/wifi"
output_directory = "./model_output/wifi"

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scapy.utils import rdpcap
from scapy.layers.inet import IP
import joblib
import warnings
from datetime import datetime
import os
from scipy import stats

from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import RobustScaler
from sklearn.metrics import (accuracy_score, precision_score, recall_score,
                           f1_score, roc_auc_score, precision_recall_curve,
                           average_precision_score, confusion_matrix,
                           roc_curve, auc, silhouette_score)
from sklearn.covariance import EllipticEnvelope
from sklearn.svm import OneClassSVM
from sklearn.neighbors import LocalOutlierFactor
from sklearn.decomposition import PCA

warnings.filterwarnings('ignore')

## Load Data

In [3]:
# Load csv file
df = pd.read_csv('../datasets/wifi/output_1201_new.csv')

In [4]:
print(df.shape)
print(df.head())
print(df.columns)
print(df.isna().sum())

(10551408, 24)
   frame.time_epoch  frame.len              frame.protocols  wlan.fc.type  \
0      1.729577e+09       1363  eth:ethertype:ipv6:udp:mdns           NaN   
1      1.729577e+09         66         eth:ethertype:ip:tcp           NaN   
2      1.729577e+09         54         eth:ethertype:ip:tcp           NaN   
3      1.729577e+09         42            eth:ethertype:arp           NaN   
4      1.729577e+09         83    eth:ethertype:ip:udp:data           NaN   

   wlan.fc.subtype wlan.sa wlan.da wlan.bssid  radiotap.channel.freq  \
0              NaN     NaN     NaN        NaN                    NaN   
1              NaN     NaN     NaN        NaN                    NaN   
2              NaN     NaN     NaN        NaN                    NaN   
3              NaN     NaN     NaN        NaN                    NaN   
4              NaN     NaN     NaN        NaN                    NaN   

   radiotap.dbm_antsignal  ...  tcp.dstport udp.srcport udp.dstport  \
0                 

## Feature Engineering

In [None]:
from typing import Tuple, List, Dict

class NetworkFeatureExtractor:
    def __init__(self, window: str = '1Min', chunk_size: int = 100000):
        self.window = window
        self.chunk_size = chunk_size
        self.scaler = None
        
    @staticmethod
    def optimize_dtypes(df: pd.DataFrame) -> pd.DataFrame:
        """Optimize DataFrame memory usage by adjusting data types."""
        df = df.copy()
        
        # Ensure protocol column is string and non-empty
        if '_ws.col.protocol' in df.columns:
            df['_ws.col.protocol'] = df['_ws.col.protocol'].fillna('unknown').astype(str)
            df.loc[df['_ws.col.protocol'].str.len() == 0, '_ws.col.protocol'] = 'unknown'
        
        # Convert numeric columns
        int_columns = ['frame.len', 'tcp.srcport', 'tcp.dstport', 'udp.srcport', 'udp.dstport']
        for col in int_columns:
            if col in df.columns:
                df[col] = pd.to_numeric(df[col], downcast='integer', errors='ignore')
        
        return df
    
    def extract_flags(self, flag_str) -> Dict[str, int]:
        """Extract TCP flags into individual boolean features."""
        flags = {
            'SYN': 0, 'ACK': 0, 'PSH': 0, 'RST': 0, 'FIN': 0, 'URG': 0
        }
        if pd.isna(flag_str):
            return flags
        for flag in flags.keys():
            flags[flag] = 1 if flag in str(flag_str) else 0
        return flags

    def create_basic_features(self, chunk: pd.DataFrame) -> pd.DataFrame:
        """Create basic traffic features with minimal memory usage."""
        features = chunk.resample(self.window, on='datetime').agg({
            'frame.len': ['count', 'mean', 'sum'],
            'ip.src': 'nunique',
            'ip.dst': 'nunique',
            '_ws.col.protocol': 'nunique'
        })
        
        # Flatten column names
        features.columns = ['_'.join(col).strip() for col in features.columns.values]
        return features
    
    def add_protocol_features(self, chunk: pd.DataFrame, base_features: pd.DataFrame) -> pd.DataFrame:
        """Add protocol-specific features using a more direct approach."""
        # Ensure working with a copy and proper types
        chunk = chunk.copy()
        chunk['_ws.col.protocol'] = chunk['_ws.col.protocol'].fillna('unknown').astype(str)
        
        # Create time bins first
        chunk['time_bin'] = chunk['datetime'].dt.floor(self.window)
        
        # Get unique protocols
        protocols = chunk['_ws.col.protocol'].unique()
        
        # Calculate protocol counts for each time bin
        for protocol in protocols:
            mask = chunk['_ws.col.protocol'] == protocol
            counts = chunk[mask].groupby('time_bin').size()
            base_features[f'protocol_{protocol}_count'] = counts
        
        # Fill missing values
        base_features = base_features.fillna(0)
        
        # Calculate ratios
        protocol_columns = [col for col in base_features.columns if col.startswith('protocol_') and col.endswith('_count')]
        total_counts = base_features[protocol_columns].sum(axis=1)
        
        for protocol in protocols:
            count_col = f'protocol_{protocol}_count'
            if count_col in base_features.columns:
                base_features[f'protocol_{protocol}_ratio'] = base_features[count_col] / (total_counts + 1e-10)
        
        return base_features
    
    def add_flag_features(self, chunk: pd.DataFrame, base_features: pd.DataFrame) -> pd.DataFrame:
        """Add TCP flag-based features."""
        if 'tcp.flags' in chunk.columns:
            # Instead of creating a dictionary, process flags directly
            flags = ['SYN', 'ACK', 'PSH', 'RST', 'FIN', 'URG']
            
            for flag in flags:
                # Create boolean column for each flag
                flag_present = chunk['tcp.flags'].fillna('').str.contains(flag).astype(int)
                
                # Resample and sum for each flag
                flag_counts = chunk.assign(**{f'flag_{flag}': flag_present})\
                    .resample(self.window, on='datetime')[f'flag_{flag}'].sum()
                
                base_features[f'tcp_flag_{flag}_count'] = flag_counts
            
            # Add flag ratios
            base_features['syn_ack_ratio'] = base_features['tcp_flag_SYN_count'] / \
                (base_features['tcp_flag_ACK_count'] + 1)
            base_features['reset_syn_ratio'] = base_features['tcp_flag_RST_count'] / \
                (base_features['tcp_flag_SYN_count'] + 1)
        
        return base_features
    
    def process_chunk(self, chunk: pd.DataFrame) -> pd.DataFrame:
        """Process a single chunk of data."""
        try:
            # Create basic features
            features = self.create_basic_features(chunk)
            
            # Add additional feature sets
            features = self.add_protocol_features(chunk, features)
            features = self.add_flag_features(chunk, features)
            features = self.add_port_features(chunk, features)
            features = self.add_statistical_features(features)
            
            return features
            
        except Exception as e:
            print(f"Error in process_chunk: {str(e)}")
            raise
    
    def add_port_features(self, chunk: pd.DataFrame, base_features: pd.DataFrame) -> pd.DataFrame:
        """Add port-related features."""
        # Unique ports accessed
        base_features['unique_ports_accessed'] = chunk.resample(self.window, on='datetime').apply(
            lambda x: len(set(x['tcp.dstport'].dropna()) | set(x['udp.dstport'].dropna()))
        )
        
        return base_features
    
    def add_statistical_features(self, base_features: pd.DataFrame) -> pd.DataFrame:
        """Add statistical and rate-based features."""
        for col in ['frame.len_count', 'ip.src_nunique', 'ip.dst_nunique']:
            if col in base_features.columns:
                roll = base_features[col].rolling(window=5, min_periods=1)
                base_features[f'{col}_rolling_mean'] = roll.mean().fillna(0)
                base_features[f'{col}_rate'] = base_features[col].diff().fillna(0)
        
        # Traffic ratios
        base_features['bytes_per_packet'] = base_features['frame.len_sum'] / \
            (base_features['frame.len_count'] + 1)
        base_features['packets_per_src'] = base_features['frame.len_count'] / \
            (base_features['ip.src_nunique'] + 1)
        
        return base_features
    
    def preprocess_features(self, features: pd.DataFrame) -> Tuple[np.ndarray, List[str]]:
        """Preprocess features for anomaly detection."""
        features_prep = features.copy()
        
        # Handle infinite values and NaN
        features_prep = features_prep.replace([np.inf, -np.inf], np.nan)
        features_prep = features_prep.fillna(0)
        
        # Remove constant and zero variance columns
        constant_cols = [col for col in features_prep.columns if features_prep[col].nunique() == 1]
        zero_var_cols = features_prep.columns[features_prep.std() == 0]
        cols_to_drop = list(set(constant_cols + list(zero_var_cols)))
        
        if cols_to_drop:
            features_prep = features_prep.drop(columns=cols_to_drop)
        
        # Scale features
        self.scaler = RobustScaler()
        scaled_features = self.scaler.fit_transform(features_prep)
        
        # Final NaN check
        scaled_features = np.nan_to_num(scaled_features, 0)
        
        return scaled_features, features_prep.columns.tolist()
    
    def save_processed_data(self, scaled_features: np.ndarray, feature_columns: List[str], 
                          raw_features: pd.DataFrame, output_dir: str = output_directory):
        """Save processed data to CSV files."""
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)
        
        # Save scaled features
        scaled_df = pd.DataFrame(scaled_features, columns=feature_columns, 
                               index=raw_features.index)
        scaled_df.to_csv(f'{output_dir}/scaled_features.csv')
        
        # Save raw features
        raw_features.to_csv(f'{output_dir}/raw_features.csv')

        # Save scaler
        joblib.dump(self.scaler, f'{output_dir}/scaler.joblib')
        
        print(f"Saved processed data to {output_dir}/")
        print(f"Scaled features shape: {scaled_df.shape}")
        print(f"Raw features shape: {raw_features.shape}")
        print(f"Scaler saved to {output_dir}/scaler.joblib")

    def convert_categorical_to_string(self, df: pd.DataFrame) -> pd.DataFrame:
        """Convert all categorical columns to string type."""
        df = df.copy()
        categorical_columns = df.select_dtypes(include=['category']).columns
        
        for col in categorical_columns:
            df[col] = df[col].astype(str)
        
        return df
    
    def process_data(self, df: pd.DataFrame, save_output: bool = True) -> Tuple[np.ndarray, List[str], pd.DataFrame]:
        """Main processing function."""
        try:
            print("Starting feature extraction...")

            # Convert categorical columns to string first
            df = self.convert_categorical_to_string(df)
            
            # Convert timestamp and optimize dtypes
            df['datetime'] = pd.to_datetime(df['frame.time_epoch'], unit='s')
            df = self.optimize_dtypes(df)
            
            # Process data in chunks
            chunks = [df[i:i + self.chunk_size] for i in range(0, len(df), self.chunk_size)]
            all_features = []
            
            for i, chunk in enumerate(chunks):
                print(f"Processing chunk {i+1}/{len(chunks)}...")
                chunk_features = self.process_chunk(chunk)
                all_features.append(chunk_features)
            
            # Combine all features
            combined_features = pd.concat(all_features)
            
            # Preprocess features
            scaled_features, feature_columns = self.preprocess_features(combined_features)
            
            if save_output:
                self.save_processed_data(scaled_features, feature_columns, combined_features)
            
            return scaled_features, feature_columns, combined_features
            
        except Exception as e:
            print(f"Error during processing: {e}")
            raise
    
    @staticmethod
    def load_processed_data(input_dir: str = 'processed_data') -> Tuple[pd.DataFrame, pd.DataFrame]:
        """Load processed data from CSV files."""
        scaled_df = pd.read_csv(f'{input_dir}/scaled_features.csv', index_col=0)
        raw_df = pd.read_csv(f'{input_dir}/raw_features.csv', index_col=0)
        
        print(f"Loaded processed data from {input_dir}/")
        print(f"Scaled features shape: {scaled_df.shape}")
        print(f"Raw features shape: {raw_df.shape}")
        
        return scaled_df, raw_df

In [8]:
extractor = NetworkFeatureExtractor(window='1Min', chunk_size=100000)

try:
    # Process data
    extractor.process_data(df)
    
    # Or load previously processed data
    # scaled_df, raw_df = NetworkFeatureExtractor.load_processed_data()
    
except Exception as e:
    print(f"Error: {e}")

Starting feature extraction...
Processing chunk 1/106...
Error during processing: '<lambda_0>'
Error: '<lambda_0>'
