In [139]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
from matplotlib.colors import ListedColormap
from sklearn import neighbors, model_selection, metrics
from sklearn.inspection import DecisionBoundaryDisplay

In [153]:
data_folder = "../data/traces/"

def read_date(date: str):
    """Aggregate data of a single from parts"""
    data = pd.DataFrame()

    for i in range(50):
        try:
            frame = pd.read_csv(data_folder + date + "-{}.csv".format(i))
            data = pd.concat([data, frame], axis=0)
        except OSError:
            pass

    return data


def read_date_part(date_part: str):
    """Read data of a part of a capture date"""
    data = pd.DataFrame()

    try:
        frame = pd.read_csv(data_folder + date_part + ".csv")
        data = pd.concat([data, frame], axis=0)
    except OSError:
        pass

    return data

data = read_date_part("16-09-23")
print(data)

        packet_id            timestamp  packet_size            eth_src  \
0               0  1474552802257569024           70  18:b7:9e:02:20:44   
1               1  1474552802257692160           66  18:b7:9e:02:20:44   
2               2  1474552802323085056           66  14:cc:20:51:33:ea   
3               3  1474552802383173888           60  d0:52:a8:00:67:5e   
4               5  1474552802447910912           88  70:ee:50:18:34:43   
...           ...                  ...          ...                ...   
902933     947067  1474639199059330816           54  14:cc:20:51:33:ea   
902934     947068  1474639199620186112           60  00:24:e4:11:18:a8   
902935     947069  1474639199620225024           42  14:cc:20:51:33:ea   
902936     947070  1474639199771064064          156  30:8c:fb:2f:e4:b2   
902937     947071  1474639199986316032           66  14:cc:20:51:33:ea   

                               device_name protocol    iot  
0                            Triby Speaker      Ra

In [154]:
def attach_window_id(timestamped_frame: pd.DataFrame) -> list[pd.DataFrame]:
    """Slice the dataframe into 1-second intervals"""
    timestamped_frame.set_index('timestamp')

    first_packet_time = timestamped_frame.iloc[0].timestamp
    last_packet_time = timestamped_frame.iloc[-1].timestamp

    frames = []

    ns_in_sec = int(1e9)
    window_id = 0

    while first_packet_time + window_id * ns_in_sec <= last_packet_time:
        from_time = first_packet_time + window_id * ns_in_sec
        to_time = first_packet_time + (window_id  + 1) * ns_in_sec

        window_mask = (from_time <= timestamped_frame.timestamp) \
            & (timestamped_frame.timestamp <= to_time)

        frames.append(timestamped_frame[window_mask])
        window_id += 1 

    return frames

devices = pd.read_csv("../data/list_of_devices.csv")
windowed_frame = attach_window_id(data)


In [158]:
def mode_mean_med(traffic_frame: pd.DataFrame, device_frame: pd.DataFrame, window_id: int):
    """Calculate the mode, mean, and median of a window"""
    mmm = pd.DataFrame()

    for i in device_frame.index:
        mode = traffic_frame.loc[traffic_frame['eth_src'] ==
                                 device_frame['eth_src'][i], 'packet_size'].mode()
        mean = traffic_frame.loc[traffic_frame['eth_src'] ==
                                 device_frame['eth_src'][i], 'packet_size'].mean()
        median = traffic_frame.loc[traffic_frame['eth_src'] ==
                                   device_frame['eth_src'][i], 'packet_size'].median()

        if len(mode) == 0:
            continue
        
        dev_mmm = pd.DataFrame([[window_id, device_frame['device_name'][i], mode.iloc[-1], mean, median, device_frame['iot'][i]]])
        mmm = pd.concat([mmm, dev_mmm], axis=0)

    return mmm

mmm_frame = pd.DataFrame()

for window_id, window in enumerate(windowed_frame):
    if window_id > 5000:
        break
    
    mmm_frame = pd.concat([mmm_frame, mode_mean_med(window, devices, window_id)], axis=0)

mmm_frame.columns=['window_id', 'device_name', 'mode', 'mean', 'median', 'iot']
print(mmm_frame)


    window_id                         device_name  mode        mean  median  \
0           0                        Smart Things    60   60.000000    60.0   
0           0                     Netatmo Welcome    88   88.000000    88.0   
0           0                             Dropcam   156  156.000000   156.0   
0           0                  Belkin Wemo switch    42   42.000000    42.0   
0           0                       Triby Speaker    66   67.333333    66.0   
..        ...                                 ...   ...         ...     ...   
0        4999                             Dropcam   156  156.000000   156.0   
0        4999         Withings Smart Baby Monitor    66   66.000000    66.0   
0        4999  TPLink Router Bridge LAN (Gateway)    66   66.000000    66.0   
0        5000                        Smart Things    60   60.000000    60.0   
0        5000  TPLink Router Bridge LAN (Gateway)    66   62.000000    66.0   

      iot  
0    True  
0    True  
0    True  
0  

In [159]:
n_neighbors = 5

for metric in mmm_frame.columns[2:5]:
    X = mmm_frame[metric].values
    y = mmm_frame.iloc[:, -1].values
    X = X.reshape(-1, 1)

    X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y, test_size=0.25)

    for weights in ["uniform", "distance"]:
        # we create an instance of Neighbours Classifier and fit the data.
        clf = neighbors.KNeighborsClassifier(n_neighbors, weights=weights)
        clf.fit(X_train, y_train)

        accuracy = metrics.accuracy_score(y_test, clf.predict(X_test))
        print("Accuracy of {} with {}: {}".format(metric, weights, accuracy))

Accuracy of mode with uniform: 0.8299065420560747
Accuracy of mode with distance: 0.8304405874499332
Accuracy of mean with uniform: 0.841388518024032
Accuracy of mean with distance: 0.8477970627503337
Accuracy of median with uniform: 0.8309746328437917
Accuracy of median with distance: 0.8331108144192256
