# Wi-Fi Network Security Threat Detection Using Machine Learning

In [1]:
# Define the path to the pcap files and the output directory
pcap_directory = "../datasets/wifi"
output_directory = "./model_output/wifi"

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scapy.utils import rdpcap
from scapy.layers.inet import IP
import joblib
import warnings
from datetime import datetime
import os
from scipy import stats

from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import RobustScaler
from sklearn.metrics import (accuracy_score, precision_score, recall_score,
                           f1_score, roc_auc_score, precision_recall_curve,
                           average_precision_score, confusion_matrix,
                           roc_curve, auc, silhouette_score)
from sklearn.covariance import EllipticEnvelope
from sklearn.svm import OneClassSVM
from sklearn.neighbors import LocalOutlierFactor
from sklearn.decomposition import PCA

warnings.filterwarnings('ignore')

## 1. Load PCAP Files

In [3]:
# Load csv file
raw_df = pd.read_csv('../datasets/wifi/output_1201_new.csv')
synthetic_df = pd.read_csv('../datasets/synthetic_wifi_new.csv')

# Merge two dataframes
df = pd.concat([raw_df, synthetic_df], ignore_index=True)

print(df.columns)

Index(['frame.time_epoch', 'frame.len', 'frame.protocols', 'wlan.fc.type',
       'wlan.fc.subtype', 'wlan.sa', 'wlan.da', 'wlan.bssid',
       'radiotap.channel.freq', 'radiotap.dbm_antsignal', 'radiotap.datarate',
       'ip.src', 'ip.dst', 'tcp.srcport', 'tcp.dstport', 'udp.srcport',
       'udp.dstport', 'tcp.flags', 'tcp.len', 'udp.length', 'tcp.stream',
       'tcp.seq', 'tcp.ack', '_ws.col.protocol', 'label'],
      dtype='object')


In [4]:
print(df.shape)
print(df.head())
print(df.columns)

(14427149, 25)
   frame.time_epoch  frame.len              frame.protocols  wlan.fc.type  \
0      1.729577e+09       1363  eth:ethertype:ipv6:udp:mdns           NaN   
1      1.729577e+09         66         eth:ethertype:ip:tcp           NaN   
2      1.729577e+09         54         eth:ethertype:ip:tcp           NaN   
3      1.729577e+09         42            eth:ethertype:arp           NaN   
4      1.729577e+09         83    eth:ethertype:ip:udp:data           NaN   

   wlan.fc.subtype wlan.sa wlan.da wlan.bssid  radiotap.channel.freq  \
0              NaN     NaN     NaN        NaN                    NaN   
1              NaN     NaN     NaN        NaN                    NaN   
2              NaN     NaN     NaN        NaN                    NaN   
3              NaN     NaN     NaN        NaN                    NaN   
4              NaN     NaN     NaN        NaN                    NaN   

   radiotap.dbm_antsignal  ...  udp.srcport udp.dstport tcp.flags  tcp.len  \
0          

In [5]:
# Display NaN amount of each column
print(df.isna().sum())

frame.time_epoch                 0
frame.len                        0
frame.protocols                  0
wlan.fc.type              14427144
wlan.fc.subtype           14427144
wlan.sa                   14427144
wlan.da                   14427144
wlan.bssid                14427144
radiotap.channel.freq     14427149
radiotap.dbm_antsignal    14427149
radiotap.datarate         14427149
ip.src                      381880
ip.dst                      381880
tcp.srcport                1076691
tcp.dstport                1076691
udp.srcport               13396535
udp.dstport               13396535
tcp.flags                  1076691
tcp.len                    1077317
udp.length                13396535
tcp.stream                 1076691
tcp.seq                    1076691
tcp.ack                    1076691
_ws.col.protocol                 0
label                     10551408
dtype: int64


In [6]:
# Drop NaN columns including 99% NaN columns
columns_to_drop = [
    'wlan.fc.type', 'wlan.fc.subtype', 'wlan.sa', 
    'wlan.da', 'wlan.bssid', 'radiotap.channel.freq', 
    'radiotap.dbm_antsignal', 'radiotap.datarate'
]
df = df.drop(columns=columns_to_drop)


In [7]:
missing_values = df.isnull().sum()
print(missing_values[missing_values > 0])

ip.src           381880
ip.dst           381880
tcp.srcport     1076691
tcp.dstport     1076691
udp.srcport    13396535
udp.dstport    13396535
tcp.flags       1076691
tcp.len         1077317
udp.length     13396535
tcp.stream      1076691
tcp.seq         1076691
tcp.ack         1076691
label          10551408
dtype: int64


In [8]:
# df = df.dropna()

In [9]:
df.columns

Index(['frame.time_epoch', 'frame.len', 'frame.protocols', 'ip.src', 'ip.dst',
       'tcp.srcport', 'tcp.dstport', 'udp.srcport', 'udp.dstport', 'tcp.flags',
       'tcp.len', 'udp.length', 'tcp.stream', 'tcp.seq', 'tcp.ack',
       '_ws.col.protocol', 'label'],
      dtype='object')

In [10]:
from sklearn.preprocessing import OneHotEncoder, LabelEncoder

# Example for low cardinality categorical features
categorical_features = ['frame.protocols', '_ws.col.protocol']
df = pd.get_dummies(df, columns=categorical_features, drop_first=True)

# Alternatively, for high cardinality, use label encoding
label_encoder = LabelEncoder()
df['ip.src'] = label_encoder.fit_transform(df['ip.src'])
df['ip.dst'] = label_encoder.fit_transform(df['ip.dst'])

: 

In [15]:
df.shape

(0, 15)

## Feature Engineering

## Exploratory Data Analysis (EDA)

## Model Training

## 6. Model Evaluation

## 7. Noise and Drift Tests
