In [1]:
import pandas as pd
import glob
import matplotlib.pyplot as plt
#import lightgbm as lgb

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import accuracy_score, classification_report

In [2]:
main_folder_path = "/Users/egkubo/AWID/AWID3_Dataset_CSV/CSV"
columns_to_read = ['frame.len', 'radiotap.length', 'radiotap.dbm_antsignal', 'wlan.duration', 'radiotap.present.tsft', 'radiotap.channel.freq', 'radiotap.channel.flags.cck', 'radiotap.channel.flags.ofdm', 'wlan.fc.type', 'wlan.fc.subtype', 'wlan.fc.ds', 'wlan.fc.frag', 'wlan.fc.retry', 'wlan.fc.pwrmgt', 'wlan.fc.moredata', 'wlan.fc.protected', 'Label']

csv_files = glob.glob(f"{main_folder_path}/**/*.csv", recursive=True)

dfs = []

# Loop through each CSV file, read it into a DataFrame, and append it to the list
for csv_file in csv_files:
    df = pd.read_csv(csv_file, usecols=columns_to_read, low_memory=False)
    dfs.append(df)

# Concatenate all DataFrames in the list into a single DataFrame
data = pd.concat(dfs, ignore_index=True)
data

Unnamed: 0,frame.len,radiotap.channel.flags.cck,radiotap.channel.flags.ofdm,radiotap.channel.freq,radiotap.dbm_antsignal,radiotap.length,radiotap.present.tsft,wlan.duration,wlan.fc.ds,wlan.fc.frag,wlan.fc.moredata,wlan.fc.protected,wlan.fc.pwrmgt,wlan.fc.type,wlan.fc.retry,wlan.fc.subtype,Label
0,76.0,0.0,1.0,2472.0,-32-35-32,56.0,1-0-0,154.0,0x00000000,0.0,0.0,0.0,0.0,1.0,0.0,11,Normal
1,70.0,0.0,1.0,2472.0,-63-63-65,56.0,1-0-0,112.0,0x00000000,0.0,0.0,0.0,0.0,1.0,0.0,12,Normal
2,170.0,0.0,0.0,2472.0,-32-35-32,64.0,0-0-0,48.0,0x00000002,0.0,0.0,1.0,0.0,2.0,0.0,8,Normal
3,88.0,0.0,1.0,2472.0,-63-63-65,56.0,1-0-0,2.0,0x00000000,0.0,0.0,0.0,0.0,1.0,0.0,9,Normal
4,76.0,0.0,1.0,2472.0,-62-65-62,56.0,1-0-0,246.0,0x00000000,0.0,0.0,0.0,0.0,1.0,0.0,11,Normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
36962894,1618.0,0.0,1.0,5180.0,-96.0,64.0,0-0-0,48.0,0x00000002,0.0,0.0,1.0,0.0,2.0,0.0,8,Normal
36962895,1618.0,0.0,1.0,5180.0,-96.0,64.0,0-0-0,48.0,0x00000002,0.0,0.0,1.0,0.0,2.0,0.0,8,Normal
36962896,1618.0,0.0,1.0,5180.0,-96.0,64.0,0-0-0,48.0,0x00000002,0.0,0.0,1.0,0.0,2.0,0.0,8,Normal
36962897,1618.0,0.0,1.0,5180.0,-96.0,64.0,0-0-0,48.0,0x00000002,0.0,0.0,1.0,0.0,2.0,0.0,8,Normal


In [3]:
data_modified = data.loc[:, ['frame.len', 'radiotap.length', 'radiotap.dbm_antsignal', 'wlan.duration', 'radiotap.present.tsft', 'radiotap.channel.freq', 'radiotap.channel.flags.cck', 'radiotap.channel.flags.ofdm', 'wlan.fc.type', 'wlan.fc.subtype', 'wlan.fc.ds', 'wlan.fc.frag', 'wlan.fc.retry', 'wlan.fc.pwrmgt', 'wlan.fc.moredata', 'wlan.fc.protected', 'Label']]

data_modified = data_modified.replace(r'^\s*$', pd.NA, regex=True)
data_cleaned = data_modified.dropna()

In [4]:
X = data_cleaned.loc[:, ['frame.len', 'radiotap.length', 'radiotap.dbm_antsignal', 'wlan.duration', 'radiotap.present.tsft', 'radiotap.channel.freq', 'radiotap.channel.flags.cck', 'radiotap.channel.flags.ofdm', 'wlan.fc.type', 'wlan.fc.subtype', 'wlan.fc.ds', 'wlan.fc.frag', 'wlan.fc.retry', 'wlan.fc.pwrmgt', 'wlan.fc.moredata', 'wlan.fc.protected']]
y = data_cleaned[['Label']]
X
# radiotap.present.tsft geri eklemeyi unutma

Unnamed: 0,frame.len,radiotap.length,radiotap.dbm_antsignal,wlan.duration,radiotap.present.tsft,radiotap.channel.freq,radiotap.channel.flags.cck,radiotap.channel.flags.ofdm,wlan.fc.type,wlan.fc.subtype,wlan.fc.ds,wlan.fc.frag,wlan.fc.retry,wlan.fc.pwrmgt,wlan.fc.moredata,wlan.fc.protected
0,76.0,56.0,-32-35-32,154.0,1-0-0,2472.0,0.0,1.0,1.0,11,0x00000000,0.0,0.0,0.0,0.0,0.0
1,70.0,56.0,-63-63-65,112.0,1-0-0,2472.0,0.0,1.0,1.0,12,0x00000000,0.0,0.0,0.0,0.0,0.0
2,170.0,64.0,-32-35-32,48.0,0-0-0,2472.0,0.0,0.0,2.0,8,0x00000002,0.0,0.0,0.0,0.0,1.0
3,88.0,56.0,-63-63-65,2.0,1-0-0,2472.0,0.0,1.0,1.0,9,0x00000000,0.0,0.0,0.0,0.0,0.0
4,76.0,56.0,-62-65-62,246.0,1-0-0,2472.0,0.0,1.0,1.0,11,0x00000000,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
36962894,1618.0,64.0,-96.0,48.0,0-0-0,5180.0,0.0,1.0,2.0,8,0x00000002,0.0,0.0,0.0,0.0,1.0
36962895,1618.0,64.0,-96.0,48.0,0-0-0,5180.0,0.0,1.0,2.0,8,0x00000002,0.0,0.0,0.0,0.0,1.0
36962896,1618.0,64.0,-96.0,48.0,0-0-0,5180.0,0.0,1.0,2.0,8,0x00000002,0.0,0.0,0.0,0.0,1.0
36962897,1618.0,64.0,-96.0,48.0,0-0-0,5180.0,0.0,1.0,2.0,8,0x00000002,0.0,0.0,0.0,0.0,1.0


In [5]:
def convert_to_integer(value): # Function to cover the cases where there are multiple dbm_antsignal values.
    if pd.isna(value) or value == '':
        return None 
    elif isinstance(value, int) or isinstance(value, float):
        return value
    else:
        signal_strengths = [int(v) for v in value.split('-') if v]
        if signal_strengths:
            average_strength = round(sum(signal_strengths) / len(signal_strengths))
            return -average_strength
        else:
            return None

In [6]:
X['radiotap.dbm_antsignal'] = X['radiotap.dbm_antsignal'].apply(convert_to_integer)
X

Unnamed: 0,frame.len,radiotap.length,radiotap.dbm_antsignal,wlan.duration,radiotap.present.tsft,radiotap.channel.freq,radiotap.channel.flags.cck,radiotap.channel.flags.ofdm,wlan.fc.type,wlan.fc.subtype,wlan.fc.ds,wlan.fc.frag,wlan.fc.retry,wlan.fc.pwrmgt,wlan.fc.moredata,wlan.fc.protected
0,76.0,56.0,-33.0,154.0,1-0-0,2472.0,0.0,1.0,1.0,11,0x00000000,0.0,0.0,0.0,0.0,0.0
1,70.0,56.0,-64.0,112.0,1-0-0,2472.0,0.0,1.0,1.0,12,0x00000000,0.0,0.0,0.0,0.0,0.0
2,170.0,64.0,-33.0,48.0,0-0-0,2472.0,0.0,0.0,2.0,8,0x00000002,0.0,0.0,0.0,0.0,1.0
3,88.0,56.0,-64.0,2.0,1-0-0,2472.0,0.0,1.0,1.0,9,0x00000000,0.0,0.0,0.0,0.0,0.0
4,76.0,56.0,-63.0,246.0,1-0-0,2472.0,0.0,1.0,1.0,11,0x00000000,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
36962894,1618.0,64.0,-96.0,48.0,0-0-0,5180.0,0.0,1.0,2.0,8,0x00000002,0.0,0.0,0.0,0.0,1.0
36962895,1618.0,64.0,-96.0,48.0,0-0-0,5180.0,0.0,1.0,2.0,8,0x00000002,0.0,0.0,0.0,0.0,1.0
36962896,1618.0,64.0,-96.0,48.0,0-0-0,5180.0,0.0,1.0,2.0,8,0x00000002,0.0,0.0,0.0,0.0,1.0
36962897,1618.0,64.0,-96.0,48.0,0-0-0,5180.0,0.0,1.0,2.0,8,0x00000002,0.0,0.0,0.0,0.0,1.0


In [7]:
# Columns for min-max scaling
columns_to_scale = ['frame.len', 'radiotap.length', 'radiotap.dbm_antsignal', 'wlan.duration']

# Columns for one-hot encoding (excluding the ones to be scaled)
columns_to_one_hot_encode = [col for col in X.columns if col not in columns_to_scale]

# Min-max scaling
scaler = MinMaxScaler()
X_scaled = X  # Create a copy to avoid modifying the original DataFrame
X_scaled[columns_to_scale] = scaler.fit_transform(X_scaled[columns_to_scale])

# One-hot encoding
X_encoded = pd.get_dummies(X_scaled, columns=columns_to_one_hot_encode)

# Display the preprocessed DataFrame
X_encoded

Unnamed: 0,frame.len,radiotap.length,radiotap.dbm_antsignal,wlan.duration,radiotap.present.tsft_0-0-0,radiotap.present.tsft_1-0-0,radiotap.channel.freq_2417.0,radiotap.channel.freq_2472.0,radiotap.channel.freq_5180.0,radiotap.channel.flags.cck_0.0,...,wlan.fc.frag_0.0,wlan.fc.frag_1.0,wlan.fc.retry_0.0,wlan.fc.retry_1.0,wlan.fc.pwrmgt_0.0,wlan.fc.pwrmgt_1.0,wlan.fc.moredata_0.0,wlan.fc.moredata_1.0,wlan.fc.protected_0.0,wlan.fc.protected_1.0
0,0.001532,0.5,0.965812,0.004727,False,True,False,True,False,True,...,True,False,True,False,True,False,True,False,True,False
1,0.000000,0.5,0.833333,0.003438,False,True,False,True,False,True,...,True,False,True,False,True,False,True,False,True,False
2,0.025536,1.0,0.965812,0.001473,True,False,False,True,False,True,...,True,False,True,False,True,False,True,False,False,True
3,0.004597,0.5,0.833333,0.000061,False,True,False,True,False,True,...,True,False,True,False,True,False,True,False,True,False
4,0.001532,0.5,0.837607,0.007550,False,True,False,True,False,True,...,True,False,True,False,True,False,True,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
36962894,0.395301,1.0,0.696581,0.001473,True,False,False,False,True,True,...,True,False,True,False,True,False,True,False,False,True
36962895,0.395301,1.0,0.696581,0.001473,True,False,False,False,True,True,...,True,False,True,False,True,False,True,False,False,True
36962896,0.395301,1.0,0.696581,0.001473,True,False,False,False,True,True,...,True,False,True,False,True,False,True,False,False,True
36962897,0.395301,1.0,0.696581,0.001473,True,False,False,False,True,True,...,True,False,True,False,True,False,True,False,False,True


In [8]:
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.2, random_state=41)

In [9]:
# Initialize the Gaussian Naive Bayes classifier
nb_classifier = GaussianNB()

# Train the model
nb_classifier.fit(X_train, y_train)

# Make predictions on the test set
y_pred = nb_classifier.predict(X_test)

# Evaluate the model
nb_accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {nb_accuracy:.2f}")

# Display additional evaluation metrics
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

  y = column_or_1d(y, warn=True)


: 

: 