In [204]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
from matplotlib.colors import ListedColormap
from sklearn import neighbors, model_selection, metrics
from sklearn.inspection import DecisionBoundaryDisplay

In [205]:
data_folder = "data/traces/"

def read_date(date: str):
    """Aggregate data of a single from parts"""
    data = pd.DataFrame()

    for i in range(50):
        try:
            frame = pd.read_csv(data_folder + date + "-{}.csv".format(i))
            data = pd.concat([data, frame], axis=0)
        except OSError:
            pass

    return data


def read_date_part(date_part: str):
    """Read data of a part of a capture date"""
    data = pd.DataFrame()

    try:
        frame = pd.read_csv(data_folder + date_part + ".csv")
        data = pd.concat([data, frame], axis=0)
    except OSError:
        pass

    return data

data = read_date_part("16-09-23")
data.set_index("timestamp")
print(data)

      packet_id            timestamp  packet_size            eth_src  \
0             0  1474552802257569024           70  18:b7:9e:02:20:44   
1             1  1474552802257692160           66  18:b7:9e:02:20:44   
2             2  1474552802323085056           66  14:cc:20:51:33:ea   
3             3  1474552802383173888           60  d0:52:a8:00:67:5e   
4             5  1474552802447910912           88  70:ee:50:18:34:43   
...         ...                  ...          ...                ...   
1863       1996  1474553105698045952           66  ec:1a:59:83:28:11   
1864       1997  1474553105698468864           66  ec:1a:59:83:28:11   
1865       1998  1474553105698507008           66  ec:1a:59:83:28:11   
1866       1999  1474553105698558976           66  ec:1a:59:83:28:11   
1867       2000  1474553105700568064           66  ec:1a:59:79:f4:89   

                             device_name protocol    iot  
0                          Triby Speaker      Raw   True  
1                

In [206]:
def attach_window_id(timestamped_frame: pd.DataFrame) -> list[pd.DataFrame]:
    """Slice the dataframe into 1-second intervals"""
    first_packet_time = timestamped_frame.iloc[0]['timestamp']
    last_packet_time = timestamped_frame.iloc[-1]['timestamp']
    windowed_frame = timestamped_frame.set_index('timestamp', drop=False) 

    ns_in_sec = int(1e9)
    number_of_windows = int((last_packet_time - first_packet_time) / ns_in_sec) + 1

    try:
        windowed_frame.insert(0, value=0, column="window_id")
    except Exception as e:
        print("It seems like the column 'window_id' already exists. Just updating values...")
        print(e)
    finally:
        windowed_frame.window_id.astype(np.int64)

    for window_id in range(0, number_of_windows):
        from_time = first_packet_time + window_id * ns_in_sec
        to_time = first_packet_time + (window_id  + 1) * ns_in_sec
        windowed_frame.loc[from_time:to_time, 'window_id'] = window_id
    
    windowed_frame.set_index('window_id', inplace=True)
    return windowed_frame


devices = pd.read_csv("data/list_of_devices.csv")
windowed_frame = attach_window_id(data)
print(windowed_frame)

           packet_id            timestamp  packet_size            eth_src  \
window_id                                                                   
0                  0  1474552802257569024           70  18:b7:9e:02:20:44   
0                  1  1474552802257692160           66  18:b7:9e:02:20:44   
0                  2  1474552802323085056           66  14:cc:20:51:33:ea   
0                  3  1474552802383173888           60  d0:52:a8:00:67:5e   
0                  5  1474552802447910912           88  70:ee:50:18:34:43   
...              ...                  ...          ...                ...   
303             1996  1474553105698045952           66  ec:1a:59:83:28:11   
303             1997  1474553105698468864           66  ec:1a:59:83:28:11   
303             1998  1474553105698507008           66  ec:1a:59:83:28:11   
303             1999  1474553105698558976           66  ec:1a:59:83:28:11   
303             2000  1474553105700568064           66  ec:1a:59:79:f4:89   

In [291]:
def generate_features_labeled(windowed_frame: pd.DataFrame, device_frame: pd.DataFrame):
    """Calculate the mode, mean, and median of a window"""
    features = pd.DataFrame()

    for device_index in device_frame.index:
        device = device_frame.iloc[device_index]
        device_packets = windowed_frame.loc[windowed_frame['eth_src'] == device['eth_src']]
        device_features = generate_features(device_packets)

        device_features.insert(len(device_features.columns), "device_name", device['device_name'])
        device_features.insert(len(device_features.columns), "iot", device['iot'])

        features = pd.concat([features, device_features])

    features.columns = ["window_id", "mean", "std", "n_bytes", "device_name", "iot"]
    features.set_index("window_id", inplace=True)

    return features

def generate_features(windowed_frame: pd.DataFrame):
    features =  pd.DataFrame()

    for window_id in windowed_frame.index.unique():
        packet_sizes = windowed_frame.loc[window_id, 'packet_size']

        mean = packet_sizes.mean()
        std = packet_sizes.std()
        n_bytes = packet_sizes.sum()

        features = pd.concat([features, pd.DataFrame([[window_id, mean, std, n_bytes]])], axis=0)

    return features

def get_devices_in_window(windowed_frame: pd.DataFrame):
    devices_in_window = pd.DataFrame()
    
    for window_id in windowed_frame.index.unique():
        devices = windowed_frame.loc[window_id]['device_name']
        # iot = windowed_frame.loc[window_id]['iot']

        if not isinstance(devices, str):
            devices = devices.unique() 
            
        devices_in_window = pd.concat([devices_in_window, pd.DataFrame({"window_id": window_id, "devices": devices})])

    return devices_in_window

labeled_features = generate_features_labeled(windowed_frame, devices)

In [295]:
def train_and_test_iot_classifier(labeled_features):
    n_neighbors = 5

    X = labeled_features.iloc[:, 1:3]
    y = labeled_features.iloc[:]['iot']

    X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y, test_size=0.25)

    for weights in ["uniform", "distance"]:
        # we create an instance of Neighbours Classifier and fit the data.
        clf = neighbors.KNeighborsClassifier(n_neighbors, weights=weights)
        clf.fit(X_train, y_train)

        accuracy = metrics.accuracy_score(y_test, clf.predict(X_test))
        print("Accuracy with weight {}: {}".format(weights, accuracy))

    return clf

iot_classfier = train_and_test_iot_classifier(labeled_features)

Accuracy with weight uniform: 0.84
Accuracy with weight distance: 0.8533333333333334


In [296]:
def train_and_test_device_classifier(labeled_features):
    iot_features = labeled_features.loc[labeled_features['iot'] == True]
    n_neighbors = 5

    X = iot_features.iloc[:, 1:3]
    y = iot_features.iloc[:]['device_name']

    X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y, test_size=0.25)

    for weights in ["uniform", "distance"]:
        # we create an instance of Neighbours Classifier and fit the data.
        clf = neighbors.KNeighborsClassifier(n_neighbors, weights=weights)
        clf.fit(X_train, y_train)

        accuracy = metrics.accuracy_score(y_test, clf.predict(X_test))
        print("Accuracy with weight {}: {}".format(weights, accuracy))

    return clf

device_classifier = train_and_test_device_classifier(labeled_features)

Accuracy with weight uniform: 0.7483443708609272
Accuracy with weight distance: 0.7748344370860927


In [297]:
def test_realistic(iot_classifier, device_classifier, unlabeled_features, devices_in_window):
    assert len(unlabeled_features) == len(devices_in_window)

    # accuracy = metrics.accuracy_score(devices_in_window, iot_classifier.predict(unlabeled_features))
    accuracy = metrics.accuracy_score(devices_in_window, device_classifier.predict(unlabeled_features))
    print("Accuracy:", accuracy)

def custom_accuracy(y_true: pd.[str], y_predict: str):
 

unlabeled_features = generate_features(windowed_frame)
devices_in_window = get_devices_in_window(windowed_frame)

test_realistic(iot_classifier, device_classifier, unlabeled_features, devices_in_window)

ValueError: If using all scalar values, you must pass an index