In [204]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
from matplotlib.colors import ListedColormap
from sklearn import neighbors, model_selection, metrics
from sklearn.inspection import DecisionBoundaryDisplay

In [205]:
data_folder = "data/traces/"

def read_date(date: str):
    """Aggregate data of a single from parts"""
    data = pd.DataFrame()

    for i in range(50):
        try:
            frame = pd.read_csv(data_folder + date + "-{}.csv".format(i))
            data = pd.concat([data, frame], axis=0)
        except OSError:
            pass

    return data


def read_date_part(date_part: str):
    """Read data of a part of a capture date"""
    data = pd.DataFrame()

    try:
        frame = pd.read_csv(data_folder + date_part + ".csv")
        data = pd.concat([data, frame], axis=0)
    except OSError:
        pass

    return data

data = read_date_part("16-09-23")
data.set_index("timestamp")
print(data)

      packet_id            timestamp  packet_size            eth_src  \
0             0  1474552802257569024           70  18:b7:9e:02:20:44   
1             1  1474552802257692160           66  18:b7:9e:02:20:44   
2             2  1474552802323085056           66  14:cc:20:51:33:ea   
3             3  1474552802383173888           60  d0:52:a8:00:67:5e   
4             5  1474552802447910912           88  70:ee:50:18:34:43   
...         ...                  ...          ...                ...   
1863       1996  1474553105698045952           66  ec:1a:59:83:28:11   
1864       1997  1474553105698468864           66  ec:1a:59:83:28:11   
1865       1998  1474553105698507008           66  ec:1a:59:83:28:11   
1866       1999  1474553105698558976           66  ec:1a:59:83:28:11   
1867       2000  1474553105700568064           66  ec:1a:59:79:f4:89   

                             device_name protocol    iot  
0                          Triby Speaker      Raw   True  
1                

In [206]:
def attach_window_id(timestamped_frame: pd.DataFrame) -> list[pd.DataFrame]:
    """Slice the dataframe into 1-second intervals"""
    first_packet_time = timestamped_frame.iloc[0]['timestamp']
    last_packet_time = timestamped_frame.iloc[-1]['timestamp']
    windowed_frame = timestamped_frame.set_index('timestamp', drop=False) 

    ns_in_sec = int(1e9)
    number_of_windows = int((last_packet_time - first_packet_time) / ns_in_sec) + 1

    try:
        windowed_frame.insert(0, value=0, column="window_id")
    except Exception as e:
        print("It seems like the column 'window_id' already exists. Just updating values...")
        print(e)
    finally:
        windowed_frame.window_id.astype(np.int64)

    for window_id in range(0, number_of_windows):
        from_time = first_packet_time + window_id * ns_in_sec
        to_time = first_packet_time + (window_id  + 1) * ns_in_sec
        windowed_frame.loc[from_time:to_time, 'window_id'] = window_id
    
    windowed_frame.set_index('window_id', inplace=True)
    return windowed_frame


devices = pd.read_csv("data/list_of_devices.csv")
windowed_frame = attach_window_id(data)
print(windowed_frame)

           packet_id            timestamp  packet_size            eth_src  \
window_id                                                                   
0                  0  1474552802257569024           70  18:b7:9e:02:20:44   
0                  1  1474552802257692160           66  18:b7:9e:02:20:44   
0                  2  1474552802323085056           66  14:cc:20:51:33:ea   
0                  3  1474552802383173888           60  d0:52:a8:00:67:5e   
0                  5  1474552802447910912           88  70:ee:50:18:34:43   
...              ...                  ...          ...                ...   
303             1996  1474553105698045952           66  ec:1a:59:83:28:11   
303             1997  1474553105698468864           66  ec:1a:59:83:28:11   
303             1998  1474553105698507008           66  ec:1a:59:83:28:11   
303             1999  1474553105698558976           66  ec:1a:59:83:28:11   
303             2000  1474553105700568064           66  ec:1a:59:79:f4:89   

In [225]:
def generate_features(windowed_frame: pd.DataFrame, device_frame: pd.DataFrame):
    """Calculate the mode, mean, and median of a window"""
    features = pd.DataFrame()

    for device_index in device_frame.index:
        device = device_frame.iloc[device_index]
        device_packets = windowed_frame.loc[windowed_frame['eth_src'] == device['eth_src']]

        for window_id in device_packets.index.unique():
            device_packet_sizes = device_packets.loc[window_id, 'packet_size']

            mean = device_packet_sizes.mean()
            std = device_packet_sizes.std()
            n_bytes = device_packet_sizes.sum()

            dev_features = pd.DataFrame([[window_id, device['device_name'], mean, std, n_bytes, device['iot']]])
            features = pd.concat([features, dev_features], axis=0)

    features.columns = ["window_id", "device_name", "mean", "std", "n_bytes", "iot"]
    features.set_index("window_id", inplace=True)
    return features

features = generate_features(windowed_frame, devices)
print(features)

                                  device_name        mean         std  \
window_id                                                               
0                                Smart Things   60.000000    0.000000   
1                                Smart Things   99.500000   55.861436   
11                               Smart Things   60.000000    0.000000   
16                               Smart Things   60.000000    0.000000   
21                               Smart Things   60.000000    0.000000   
...                                       ...         ...         ...   
299        TPLink Router Bridge LAN (Gateway)  118.777778  122.151318   
300        TPLink Router Bridge LAN (Gateway)  175.181818  130.261904   
301        TPLink Router Bridge LAN (Gateway)   66.000000    0.000000   
302        TPLink Router Bridge LAN (Gateway)   66.000000    0.000000   
303        TPLink Router Bridge LAN (Gateway)   58.000000   13.856406   

           n_bytes    iot  
window_id             

In [234]:
n_neighbors = 5

X = features.iloc[:, 2:4]
y = features.iloc[:, -1]

X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y, test_size=0.25)

for weights in ["uniform", "distance"]:
    # we create an instance of Neighbours Classifier and fit the data.
    clf = neighbors.KNeighborsClassifier(n_neighbors, weights=weights)
    clf.fit(X_train, y_train)

    accuracy = metrics.accuracy_score(y_test, clf.predict(X_test))
    print("Accuracy of {} with {}: {}".format(metric, weights, accuracy))

Accuracy of mean with uniform: 0.8
Accuracy of mean with distance: 0.8222222222222222
