In [1]:
from __future__ import annotations

import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
from matplotlib.colors import ListedColormap
from sklearn import svm, neighbors, tree, ensemble, model_selection, metrics
from sklearn.inspection import DecisionBoundaryDisplay
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import NearMiss, RandomUnderSampler

sns.set_theme()

In [2]:
# Path to the folder containing the trace CSVs. Keep the trailing slash: the readers
# below build file paths by plain string concatenation.
data_folder = "data/traces/"

def read_date(dates: list[str], max_parts: int = 50) -> pd.DataFrame:
    """Load and concatenate all part files of the given capture dates.

    For every date the files ``<data_folder><date>-<i>.csv`` with ``i`` in
    ``range(max_parts)`` are read. Parts that cannot be opened are skipped.

    Args:
        dates: Capture date prefixes, e.g. ``["16-09", "16-10"]``.
        max_parts: Number of part indices probed per date (``0 .. max_parts - 1``).

    Returns:
        All parts stacked row-wise in the order they were read. The row labels of
        the individual files are kept, so the index may contain duplicates.
        An empty frame is returned when no part file could be read.
    """
    frames = []

    for date in dates:
        for part in range(max_parts):
            try:
                frames.append(pd.read_csv(data_folder + date + "-{}.csv".format(part)))
            except OSError:
                # Best effort: a part index without a readable file is skipped.
                continue

    if not frames:
        return pd.DataFrame()

    # Concatenate once; growing a frame inside the loop copies all rows every time.
    return pd.concat(frames, axis=0)


def read_date_part(date_part: str):
    """Load a single part file ``<data_folder><date_part>.csv``.

    Returns an empty frame when the file cannot be opened.
    """
    empty = pd.DataFrame()
    csv_path = data_folder + date_part + ".csv"

    try:
        part = pd.read_csv(csv_path)
    except OSError:
        # Unreadable part: hand back the empty frame
        return empty

    return pd.concat([empty, part], axis=0)

# Load both capture days. `timestamp` deliberately stays a regular column: the
# windowing helpers read it via `frame['timestamp']`. (A bare
# `data.set_index("timestamp")` returns a new frame and leaves `data` untouched, so
# calling it without using the result only costs a full copy.)
data = read_date(["16-09", "16-10"])

data

Unnamed: 0,packet_id,timestamp,packet_size,eth_src,device_name,protocol,iot
0,0,1474552802257569024,70,18:b7:9e:02:20:44,,Raw,False
1,1,1474552802257692160,66,18:b7:9e:02:20:44,,TCP,False
2,2,1474552802323085056,66,14:cc:20:51:33:ea,,TCP,False
3,3,1474552802383173888,60,d0:52:a8:00:67:5e,,Padding,False
4,4,1474552802396326144,98,14:cc:20:51:33:ea,,Raw,False
...,...,...,...,...,...,...,...
4918278,4948801,1476242930915371008,1434,14:cc:20:51:33:ea,,Raw,False
4918279,4948802,1476242930915453952,1434,14:cc:20:51:33:ea,,Raw,False
4918280,4948803,1476242930915572992,1434,14:cc:20:51:33:ea,,Raw,False
4918281,4948804,1476242930915685888,1434,14:cc:20:51:33:ea,,Raw,False


In [3]:
def devices(df: pd.DataFrame, incl_camera: bool) -> pd.DataFrame:
    """Return the device table without its helper column ``incl``.

    Args:
        df: Device table with a boolean ``incl`` column.
        incl_camera: If true, every row is kept; otherwise only the rows whose
            ``incl`` flag is set are kept.

    Returns:
        A new frame without the ``incl`` column; ``df`` itself is not modified.
    """
    selected = df if incl_camera else df[df['incl']]
    return selected.drop('incl', axis=1)

# NOTE: this rebinds the name `devices` from the loader function defined in this cell
# to the resulting DataFrame; from here on `devices` is the device table, and the
# function is only available again after re-running the cell.
devices = devices(pd.read_csv("./data/list_of_devices.csv"), True)

devices

Unnamed: 0,device_name,eth_src,connection_type,iot
0,Smart Things,d0:52:a8:00:67:5e,Wired,True
1,Amazon Echo,44:65:0d:56:cc:d3,Wireless,True
2,Netatmo Welcome,70:ee:50:18:34:43,Wireless,True
3,TP-Link Day Night Cloud camera,f4:f2:6d:93:51:f1,Wireless,True
4,Samsung SmartCam,00:16:6c:ab:6b:88,Wireless,True
5,Dropcam,30:8c:fb:2f:e4:b2,Wireless,True
6,Insteon Camera,00:62:6e:51:27:2e,Wired,True
7,Unknown,e8:ab:fa:19:de:4f,Wireless,False
8,Withings Smart Baby Monitor,00:24:e4:11:18:a8,Wired,True
9,Belkin Wemo switch,ec:1a:59:79:f4:89,Wireless,True


In [9]:
def mode_mean_med(df: pd.DataFrame, devices: pd.DataFrame):
    """Per-device packet-size statistics (mean, median, mode) for the IoT devices.

    `devices` supplies the MAC address -> device name mapping; only rows flagged
    `iot` are used. Packets in `df` whose `eth_src` matches none of these devices
    are dropped by the inner merge.
    """
    # MAC address and name of every IoT device
    iot_devices = devices.loc[devices['iot'], ['device_name', 'eth_src']]

    # Source MAC address and size of every packet
    packet_sizes = df[['eth_src', 'packet_size']]

    # Label each packet with the name of the device that sent it
    named_packets = iot_devices.merge(packet_sizes, on='eth_src')

    # `mode()` can return several values; the first one is reported
    aggregations = ['mean', 'median', lambda sizes: sizes.mode().iat[0]]
    return named_packets.groupby(['device_name']).agg({'packet_size': aggregations})

# Packet-size statistics per IoT device over the complete capture
statistics = mode_mean_med(data, devices)

statistics

Unnamed: 0_level_0,packet_size,packet_size,packet_size
Unnamed: 0_level_1,mean,median,<lambda_0>
device_name,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
Amazon Echo,111.107594,66.0,66
Belkin Wemo switch,401.417551,118.0,66
Belkin wemo motion sensor,120.807015,66.0,66
Blipcare Blood Pressure meter,108.03125,59.0,54
Dropcam,185.323517,156.0,156
HP Printer,119.924045,86.0,140
Insteon Camera,114.641222,73.0,60
Light Bulbs LiFX Smart Bulb,96.533507,92.0,123
NEST Protect smoke alarm,303.680796,350.0,509
Netatmo Welcome,561.266181,74.0,1510


In [None]:
def attach_window_id(timestamped_frame: pd.DataFrame) -> pd.DataFrame:
    """Assign every packet to a 1-second window and index the frame by that window.

    Window ``k`` covers the timestamps ``[first + k s, first + (k + 1) s)``, where
    ``first`` is the timestamp of the first row. Timestamps are treated as integer
    nanoseconds and are expected to be sorted ascending (the first row anchors
    window 0).

    Args:
        timestamped_frame: Packets with an integer ``timestamp`` column. The frame
            is not modified; an existing ``window_id`` column is replaced.

    Returns:
        A copy of the input indexed by the integer ``window_id``. ``timestamp``
        stays available as a column. An empty input yields an empty result.
    """
    ns_in_sec = 1_000_000_000
    timestamps = timestamped_frame['timestamp']

    # Guard the empty case: there is no first packet to anchor the windows on.
    first_packet_time = timestamps.iloc[0] if len(timestamps) else 0

    # Integer division yields the window number directly, which replaces one
    # label-slice assignment per window.
    window_ids = ((timestamps - first_packet_time) // ns_in_sec).astype(np.int64)

    windowed_frame = timestamped_frame.drop(columns='window_id', errors='ignore')
    return windowed_frame.set_index(pd.Index(window_ids.to_numpy(), name='window_id'))


def attach_window_id_fast(timestamped_frame: pd.DataFrame) -> pd.DataFrame:
    """Bin the packets into 1-second windows with a single `pd.cut` call.

    The bin edges start one nanosecond before the first timestamp, so the
    right-closed bins produced by `pd.cut` still contain the first packet.
    Returns the input frame indexed by the resulting (categorical) `window_id`.
    """
    ns_in_sec = int(1e9)
    start = timestamped_frame.iloc[0]['timestamp']
    stop = timestamped_frame.iloc[-1]['timestamp']

    # One edge per second; the last edge lies at or beyond the final timestamp
    bin_edges = range(start - 1, stop + ns_in_sec, ns_in_sec)
    window_labels = range(len(bin_edges) - 1)

    window_ids = pd.cut(timestamped_frame['timestamp'], bin_edges, labels=window_labels)
    labeled = pd.concat([timestamped_frame, window_ids.rename("window_id")], axis=1)
    return labeled.set_index('window_id')

# Tag every packet with its 1-second window, using the `pd.cut`-based variant
windowed_frame = attach_window_id_fast(data)

windowed_frame

In [None]:
def generate_features_labeled(windowed_frame: pd.DataFrame, device_frame: pd.DataFrame):
    """Calculate the mean, std, and number of bytes of a window for each device.

    Args:
        windowed_frame: Packets indexed by ``window_id`` with the columns
            ``eth_src`` and ``packet_size``.
        device_frame: Device table with the columns ``eth_src``, ``device_name``
            and ``iot``. Its index may have any labels.

    Returns:
        One row per (device, window) in which the device sent packets, with the
        columns ``mean``, ``std``, ``n_bytes``, ``device_name`` and ``iot``.
        The frame is empty (same columns) when ``device_frame`` has no rows.
    """
    columns = ["mean", "std", "n_bytes", "device_name", "iot"]
    per_device = []

    # Iterate over the rows themselves: looking index *labels* up positionally
    # breaks as soon as the device table is filtered or otherwise not 0..n-1.
    for _, device in device_frame.iterrows():
        device_packets = windowed_frame.loc[windowed_frame['eth_src'] == device['eth_src']]
        device_features = generate_features(device_packets)

        device_features.insert(len(device_features.columns), "device_name", device['device_name'])
        device_features.insert(len(device_features.columns), "iot", device['iot'])

        per_device.append(device_features)

    if not per_device:
        return pd.DataFrame(columns=columns)

    # Single concat instead of growing the result inside the loop
    features = pd.concat(per_device)
    features.columns = columns
    return features

def generate_features(windowed_frame: pd.DataFrame) -> pd.DataFrame:
    """Compute the per-window packet-size features.

    Args:
        windowed_frame: Packets with a ``packet_size`` column that can be grouped
            by ``window_id`` (index level or column).

    Returns:
        One row per observed window with the columns ``mean``, ``std`` and
        ``n_bytes`` (sum of the packet sizes in the window).
    """
    packet_sizes = windowed_frame.groupby("window_id", observed=True)['packet_size']

    # Named aggregation gives each feature its own column name; concatenating the
    # three aggregates produced three columns that were all called `packet_size`.
    features = packet_sizes.agg(mean='mean', std='std', n_bytes='sum')

    # The sample std of a single-packet window is NaN and is reported as 0 here.
    # TODO Not sure if fillna is a good idea
    return features.fillna(0)

def get_devices_in_window(windowed_frame: pd.DataFrame) -> pd.DataFrame:
    """List the distinct device names seen in each window.

    Args:
        windowed_frame: Packets indexed by ``window_id`` with a ``device_name`` column.

    Returns:
        A frame with the columns ``window_id`` and ``devices`` holding one row per
        distinct (window, device name) pair, in order of first appearance. A
        missing device name (NaN) is kept as a value of its own.
    """
    device_names = windowed_frame['device_name']

    pairs = pd.DataFrame({
        # Plain values rather than the index object, so a categorical index
        # does not carry over into the result column
        "window_id": device_names.index.to_numpy(),
        "devices": device_names.to_numpy(),
    })

    # One vectorised de-duplication replaces the per-window lookup + concat loop.
    # It also covers single-packet windows, where `.loc[window_id]` yields a scalar
    # that cannot be turned into a frame.
    return pairs.drop_duplicates(ignore_index=True)

# Per-device, per-window features, labeled with the device name and its IoT flag
labeled_features = generate_features_labeled(windowed_frame, devices)

labeled_features

In [None]:
from enum import StrEnum

class Balancing(StrEnum):
    """Class-balancing strategies selectable in `train_and_test_classifiers`.

    The members are strings; keep their values in sync with the comparisons made
    in `train_and_test_classifiers`.
    """
    # No balancing
    NONE = "none"
    # Stratified k-fold splitting
    STRATIFIED = "stratified"
    # Intended: oversampling followed by NearMiss undersampling (see `_balance_over_under`)
    OVER_UNDER = "over_under"
    # Intended: oversampling followed by random undersampling (see `_balance_over_under`)
    OVER_UNDER_RUS = "over_under_rus"

def _balance_over_under(X, y, rus=False, random_state: int | None = None):
    oversampling = SMOTE(random_state=random_state)

    # Resource [32] is actually really old (2009), NearMiss is a better alternative than RUS
    # but I left it as an option
    if rus:
        undersampling = RandomUnderSampler(random_state=random_state)
    else:
        undersampling = NearMiss()

    # first oversample, then undersample
    X, y = oversampling.fit_resample(X, y)
    X, y = undersampling.fit_resample(X, y)

    return X, y

def _get_models():
    models = {
        "Knn" : neighbors.KNeighborsClassifier(n_neighbors=5, weights="distance"),
        # "SVM" : svm.SVC(),
        "DT"  : tree.DecisionTreeClassifier(),
        "RF"  : ensemble.RandomForestClassifier(n_estimators=10, max_depth=None, min_samples_split=2, random_state=0)
    }
    # I don't really see the advantage of using an ensemble balancing method in conjuction with the
    # under- and oversample techniques. I can't find any indication in the paper the referenced [32]
    # that it actually works. So, I'm just keeping this here.
    models["Majority Voting"] = ensemble.VotingClassifier(list(models.items()), voting='hard')

    return models

def train_and_test_classifiers(labeled_features, target_label, balancing=Balancing.NONE, n_splits=10, shuffle=False, random_state=None, feature_columns=("mean", "std", "n_bytes")):
    """
    Trains and tests a series of models on the given, labeled data
    (Knn, DT, RF, Majority Voting).

    When n_splits <= 1, KFold is not used and the test set has a size of 25%.

    Args:
        labeled_features: Frame holding the feature columns and the target column.
        target_label: Name of the column to predict.
        balancing: A `Balancing` member or its string value.
        n_splits: Number of cross-validation folds.
        shuffle: Whether the k-fold splitter shuffles the samples.
        random_state: Seed for the resampling, the hold-out split and, when
            `shuffle` is set, the k-fold splitter.
        feature_columns: Names of the columns used as model input.

    Returns:
        `(trained, scores)`: `trained` maps each model name to the estimator as
        fitted on the last fold; `scores` has one row per model and fold with the
        columns `model`, `accuracy`, `label` and `balancing`.

    Raises:
        ValueError: If `balancing` is not a known strategy, or if the stratified
            strategy is requested with fewer than two folds.
    """
    # Normalise once so that a misspelt strategy fails loudly instead of silently
    # training without balancing.
    balancing = Balancing(balancing)
    use_kfold = n_splits > 1

    # KFold cannot be disabled if using the stratified balancing strategy
    if balancing == Balancing.STRATIFIED and not use_kfold:
        raise ValueError("The 'stratified' balancing strategy requires n_splits >= 2")

    # Select features by name; a positional slice is easy to get off by one and
    # silently drops a feature.
    X = labeled_features.loc[:, list(feature_columns)]
    y = labeled_features[target_label]

    # Balancing
    # NOTE(review): resampling happens before the train/test split, so the test data
    # is resampled as well and synthetic samples may derive from test points. Consider
    # resampling the training part of each fold only.
    if balancing in (Balancing.OVER_UNDER, Balancing.OVER_UNDER_RUS):
        X, y = _balance_over_under(X, y, rus=(balancing == Balancing.OVER_UNDER_RUS), random_state=random_state)

    # kf cross validation is still a good idea for testing the iot performance, not actually done in the paper tho
    if use_kfold:
        if balancing == Balancing.STRATIFIED:
            splitter_type = model_selection.StratifiedKFold
        else:
            splitter_type = model_selection.KFold
        # scikit-learn rejects a random_state when shuffling is disabled
        kf = splitter_type(n_splits=n_splits, shuffle=shuffle, random_state=random_state if shuffle else None)
        folds = [(X.iloc[train], X.iloc[test], y.iloc[train], y.iloc[test]) for (train, test) in kf.split(X, y)]
    else:
        folds = [model_selection.train_test_split(X, y, test_size=.25, random_state=random_state)]

    models = _get_models()
    scores = {"model": [], "accuracy": [], "label": [], "balancing": []}
    trained = {}

    print("Fitting on target {} using '{}' balancing and {} folds".format(target_label, balancing, n_splits if use_kfold else "no"))
    for (name, model) in models.items():
        for (X_train, X_test, y_train, y_test) in folds:
            # The same estimator object is refitted per fold; the last fit is kept
            trained[name] = model.fit(X_train, y_train)

            scores["model"].append(name)
            scores["accuracy"].append(metrics.accuracy_score(y_test, model.predict(X_test)))
            scores["label"].append(target_label)
            scores["balancing"].append(balancing)

    return trained, pd.DataFrame(scores)

# Each call below returns (dict of fitted models keyed by name, frame of accuracy scores).

# For the device classifier, only rows flagged as IoT are considered
device_features = labeled_features.loc[labeled_features['iot'] == True]

# Balancing as done in the paper
iot_classifier, iot_scores = train_and_test_classifiers(labeled_features, 'iot', balancing=Balancing.OVER_UNDER_RUS, n_splits=1)
device_classifier, device_scores = train_and_test_classifiers(device_features, 'device_name', balancing=Balancing.STRATIFIED)

# Balancing strategies reversed
iot_classifier_diff, iot_scores_diff = train_and_test_classifiers(labeled_features, 'iot', balancing=Balancing.STRATIFIED)
device_classifier_diff, device_scores_diff = train_and_test_classifiers(device_features, 'device_name', balancing=Balancing.OVER_UNDER_RUS, n_splits=1)

# Balancing without RUS (with NearMiss)
iot_classifier_nm, iot_scores_nm = train_and_test_classifiers(labeled_features, 'iot', balancing=Balancing.OVER_UNDER, n_splits=1)

# No balancing for either
iot_classifier_unb, iot_scores_unb = train_and_test_classifiers(labeled_features, 'iot', balancing=Balancing.NONE)
device_classifier_unb, device_scores_unb = train_and_test_classifiers(device_features, 'device_name', balancing=Balancing.NONE)

In [None]:
def draw_performance_plot(scores):
    """Bar chart of the accuracy per model, with the y-axis limited to [0, 1]."""
    bar_options = {"x": "model", "y": "accuracy", "kind": "bar"}
    sns.catplot(scores, **bar_options)
    plt.ylim(0, 1)

def draw_multiple_perf(scores):
    """Grid of accuracy bar charts: one column per `balancing` value, one row per `label`."""
    assert "balancing" in scores.columns

    facet_options = dict(col="balancing", row="label")
    sns.catplot(scores, x="model", y="accuracy", kind="bar", **facet_options)
    plt.ylim(0, 1)

# Stack the score frames of all runs; the `label` and `balancing` columns tell them apart
all_scores = pd.concat([iot_scores, device_scores, iot_scores_unb, device_scores_unb, iot_scores_diff, device_scores_diff, iot_scores_nm])

draw_multiple_perf(all_scores)

In [None]:
from sklearn import metrics

def test_realistic(iot_classifier, device_classifier, unlabeled_features, devices_in_window):
    """Score the device classifier on unlabeled per-window features and print the accuracy.

    NOTE(review): this looks unfinished; please confirm the intended contract:
      * `iot_classifier` is accepted but unused (the line using it is commented out).
      * `device_classifier.predict(...)` is called directly, so an object with a
        `predict` method is required, whereas `train_and_test_classifiers` returns
        a dict of models keyed by name.
      * `devices_in_window` is passed to `accuracy_score` as the ground truth and
        must be as long as `unlabeled_features` -- TODO confirm it holds exactly one
        label per feature row.
    """
    assert len(unlabeled_features) == len(devices_in_window)

    # accuracy = metrics.accuracy_score(devices_in_window, iot_classifier.predict(unlabeled_features))
    accuracy = metrics.accuracy_score(devices_in_window, device_classifier.predict(unlabeled_features))
    print("Accuracy:", accuracy)


# Features per window over all packets (not split per device), plus the device names per window
unlabeled_features = generate_features(windowed_frame)
devices_in_window = get_devices_in_window(windowed_frame)

# NOTE(review): `iot_classifier` / `device_classifier` are the dicts returned by
# `train_and_test_classifiers`; confirm that `test_realistic` is meant to receive them.
test_realistic(iot_classifier, device_classifier, unlabeled_features, devices_in_window)