In [2]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
from matplotlib.colors import ListedColormap
from sklearn import neighbors, model_selection, metrics
from sklearn.inspection import DecisionBoundaryDisplay

In [3]:
data_folder = "data/traces/"

def read_date(date: str):
    """Aggregate data of a single from parts"""
    data = pd.DataFrame()

    for i in range(50):
        try:
            frame = pd.read_csv(data_folder + date + "-{}.csv".format(i))
            data = pd.concat([data, frame], axis=0)
        except OSError:
            pass

    return data


def read_date_part(date_part: str):
    """Read data of a part of a capture date"""
    data = pd.DataFrame()

    try:
        frame = pd.read_csv(data_folder + date_part + ".csv")
        data = pd.concat([data, frame], axis=0)
    except OSError:
        pass

    return data

data = read_date_part("16-09-23")
print(data)

Empty DataFrame
Columns: []
Index: []


In [5]:
def attach_window_id(timestamped_frame: pd.DataFrame) -> list[pd.DataFrame]:
    """Slice the dataframe into 1-second intervals"""
    timestamped_frame.set_index('timestamp')

    first_packet_time = timestamped_frame.iloc[0].timestamp
    last_packet_time = timestamped_frame.iloc[-1].timestamp

    frames = []

    ns_in_sec = int(1e9)
    window_id = 0

    while first_packet_time + window_id * ns_in_sec <= last_packet_time:
        from_time = first_packet_time + window_id * ns_in_sec
        to_time = first_packet_time + (window_id  + 1) * ns_in_sec

        window_mask = (from_time <= timestamped_frame.timestamp) \
            & (timestamped_frame.timestamp <= to_time)

        frames.append(timestamped_frame[window_mask])
        window_id += 1 

    return frames

devices = pd.read_csv("data/list_of_devices.csv")
windowed_frame = attach_window_id(data)


KeyError: "None of ['timestamp'] are in the columns"

In [None]:
def generate_features(traffic_frame: pd.DataFrame, device_frame: pd.DataFrame, window_id: int):
    """Calculate the mode, mean, and median of a window"""
    features = pd.DataFrame()

    for i in device_frame.index:
        mean = traffic_frame.loc[traffic_frame['eth_src'] ==
                                 device_frame['eth_src'][i], 'packet_size'].mean()
        std = traffic_frame.loc[traffic_frame['eth_src'] == 
                                 device_frame['eth_src'][i] , 'packet_size'].std()
        n_bytes = traffic_frame.loc[traffic_frame['eth_src'] == 
                                 device_frame['eth_src'][i] , 'packet_size'].sum()

        if len(mode) == 0:
            continue
        
        dev_features = pd.DataFrame([[window_id, device_frame['device_name'][i], mean, std, n_bytes, device_frame['iot'][i]]])
        features = pd.concat([features, dev_mmm], axis=0)

    return features

def generate_features_for_all(windowed_data):
    features = pd.DataFrame()

    for window_id, window in enumerate(windowed_data):
        if window_id > 5000:
            break
        
        features = pd.concat([features, generate_features(window, devices, window_id)], axis=0)

    features.columns=['window_id', 'device_name', 'mode', 'mean', 'median', 'iot']
    return features

features = generate_features_for_all(windowed_frame)


    window_id                         device_name  mode        mean  median  \
0           0                        Smart Things    60   60.000000    60.0   
0           0                     Netatmo Welcome    88   88.000000    88.0   
0           0                             Dropcam   156  156.000000   156.0   
0           0                  Belkin Wemo switch    42   42.000000    42.0   
0           0                       Triby Speaker    66   67.333333    66.0   
..        ...                                 ...   ...         ...     ...   
0        4999                             Dropcam   156  156.000000   156.0   
0        4999         Withings Smart Baby Monitor    66   66.000000    66.0   
0        4999  TPLink Router Bridge LAN (Gateway)    66   66.000000    66.0   
0        5000                        Smart Things    60   60.000000    60.0   
0        5000  TPLink Router Bridge LAN (Gateway)    66   62.000000    66.0   

      iot  
0    True  
0    True  
0    True  
0  

In [159]:
n_neighbors = 5

for metric in features.columns[2:5]:
    X = features[metric].values
    y = features.iloc[:, -1].values
    X = X.reshape(-1, 1)

    X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y, test_size=0.25)

    for weights in ["uniform", "distance"]:
        # we create an instance of Neighbours Classifier and fit the data.
        clf = neighbors.KNeighborsClassifier(n_neighbors, weights=weights)
        clf.fit(X_train, y_train)

        accuracy = metrics.accuracy_score(y_test, clf.predict(X_test))
        print("Accuracy of {} with {}: {}".format(metric, weights, accuracy))

Accuracy of mode with uniform: 0.8299065420560747
Accuracy of mode with distance: 0.8304405874499332
Accuracy of mean with uniform: 0.841388518024032
Accuracy of mean with distance: 0.8477970627503337
Accuracy of median with uniform: 0.8309746328437917
Accuracy of median with distance: 0.8331108144192256
