In [1]:
import pandas as pd
import glob
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
from matplotlib.colors import ListedColormap
from sklearn import neighbors, model_selection, metrics
from sklearn.inspection import DecisionBoundaryDisplay

In [12]:
# Folder holding the captured trace CSVs. The trailing slash matters: the
# reader functions below build file paths by plain string concatenation.
data_folder = "data/traces/"

def read_date(dates: list[str], folder: str = None, max_parts: int = 50) -> pd.DataFrame:
    """Load and stack all available part files of the given capture dates.

    Part files are looked up as ``<folder><date>-<i>.csv`` for
    ``i = 0 .. max_parts - 1``. Parts that cannot be opened are skipped, so
    dates with fewer parts (or gaps in the numbering) are fine.

    Parameters
    ----------
    dates : list[str]
        Date prefixes of the part files, e.g. ``["16-09", "16-10"]``.
    folder : str, optional
        Folder prefix (including the trailing separator). Defaults to the
        notebook-level ``data_folder``.
    max_parts : int, optional
        Number of part indices probed per date.

    Returns
    -------
    pd.DataFrame
        All parts stacked row-wise in (date, part) order. Each part keeps its
        own row index. Empty frame if no part could be read.
    """
    if folder is None:
        folder = data_folder

    # Collect first, concatenate once: concatenating inside the loop copies
    # the accumulated frame on every iteration.
    frames = []
    for date in dates:
        for part in range(max_parts):
            try:
                frames.append(pd.read_csv(folder + date + "-{}.csv".format(part)))
            except OSError:
                # Best effort by design: a missing/unreadable part is skipped.
                continue

    if not frames:
        return pd.DataFrame()
    return pd.concat(frames, axis=0)


def read_date_part(date_part: str, folder: str = None) -> pd.DataFrame:
    """Read a single part file of a capture date.

    Parameters
    ----------
    date_part : str
        File name without extension, e.g. ``"16-09-3"``.
    folder : str, optional
        Folder prefix (including the trailing separator). Defaults to the
        notebook-level ``data_folder``.

    Returns
    -------
    pd.DataFrame
        Content of ``<folder><date_part>.csv``, or an empty frame when the
        file cannot be opened.
    """
    if folder is None:
        folder = data_folder

    try:
        # The parsed frame is returned directly; concatenating it with an
        # empty frame first adds nothing.
        return pd.read_csv(folder + date_part + ".csv")
    except OSError:
        # Best effort by design: an unreadable part yields an empty frame.
        return pd.DataFrame()

# Load every part of the two capture dates into one frame.
data = read_date(["16-09","16-10"])

# 'timestamp' stays a regular column on purpose: the windowing functions read
# it as data['timestamp']. (A bare data.set_index("timestamp") would only
# build a throw-away copy, because set_index returns a new frame.)
data

Unnamed: 0,packet_id,timestamp,packet_size,eth_src,device_name,protocol,iot
0,0,1474552802257569024,70,18:b7:9e:02:20:44,,Raw,False
1,1,1474552802257692160,66,18:b7:9e:02:20:44,,TCP,False
2,2,1474552802323085056,66,14:cc:20:51:33:ea,,TCP,False
3,3,1474552802383173888,60,d0:52:a8:00:67:5e,,Padding,False
4,4,1474552802396326144,98,14:cc:20:51:33:ea,,Raw,False
...,...,...,...,...,...,...,...
4918278,4948801,1476242930915371008,1434,14:cc:20:51:33:ea,,Raw,False
4918279,4948802,1476242930915453952,1434,14:cc:20:51:33:ea,,Raw,False
4918280,4948803,1476242930915572992,1434,14:cc:20:51:33:ea,,Raw,False
4918281,4948804,1476242930915685888,1434,14:cc:20:51:33:ea,,Raw,False


In [18]:
def devices(df : pd.DataFrame, all : bool) -> pd.DataFrame:
    """Return the device list without its 'incl' flag column.

    Parameters
    ----------
    df : pd.DataFrame
        Device list containing a boolean 'incl' column.
    all : bool
        If true, every row is returned; otherwise only the rows whose 'incl'
        value is True. (The name shadows the builtin inside this function.)

    Returns
    -------
    pd.DataFrame
        The selected rows with the 'incl' column removed. Row labels are kept
        as they are, so they may have gaps after filtering.
    """
    selected = df if all else df[df['incl']]
    return selected.drop('incl', axis=1)

# Keep only the devices flagged for inclusion.
# NOTE(review): this assignment rebinds `devices` from the function defined
# above to the DataFrame it returns, so the function is no longer reachable
# under that name until its definition is executed again.
devices = devices(pd.read_csv("./data/list_of_devices.csv"), False)

devices

Unnamed: 0,device_name,eth_src,connection_type,iot
0,Smart Things,d0:52:a8:00:67:5e,Wired,True
1,Amazon Echo,44:65:0d:56:cc:d3,Wireless,True
2,Netatmo Welcome,70:ee:50:18:34:43,Wireless,True
3,TP-Link Day Night Cloud camera,f4:f2:6d:93:51:f1,Wireless,True
4,Samsung SmartCam,00:16:6c:ab:6b:88,Wireless,True
5,Dropcam,30:8c:fb:2f:e4:b2,Wireless,True
6,Insteon Camera,00:62:6e:51:27:2e,Wired,True
8,Withings Smart Baby Monitor,00:24:e4:11:18:a8,Wired,True
9,Belkin Wemo switch,ec:1a:59:79:f4:89,Wireless,True
10,TP-Link Smart plug,50:c7:bf:00:56:39,Wireless,True


In [19]:
def mode_mean_med(df: pd.DataFrame, devices: pd.DataFrame):
    """Summarise packet sizes per device: mean, median and mode.

    Parameters
    ----------
    df : pd.DataFrame
        Packet records with at least 'eth_src' and 'packet_size' columns.
    devices : pd.DataFrame
        Device list with at least 'device_name' and 'eth_src' columns.

    Returns
    -------
    pd.DataFrame
        One row per device name; columns are the mean, median and mode of
        'packet_size' (two-level column header under 'packet_size').
    """
    # Label every packet with its device name through the MAC address.
    # merge() defaults to an inner join, so packets from MACs that are not in
    # the device list drop out here.
    named_sizes = devices[['device_name', 'eth_src']].merge(
        df[['eth_src', 'packet_size']], on='eth_src'
    )

    # Series.mode() returns its values sorted, so .iat[0] picks the smallest
    # value when several sizes are equally frequent.
    size_stats = ['mean', 'median', lambda x: x.mode().iat[0]]
    return named_sizes.groupby(['device_name']).agg({'packet_size': size_stats})

# Per-device packet-size statistics over all loaded packets
statistics = mode_mean_med(data, devices)

statistics

Unnamed: 0_level_0,packet_size,packet_size,packet_size
Unnamed: 0_level_1,mean,median,<lambda_0>
device_name,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
Amazon Echo,111.107594,66.0,66
Belkin Wemo switch,401.417551,118.0,66
Belkin wemo motion sensor,120.807015,66.0,66
Blipcare Blood Pressure meter,108.03125,59.0,54
Dropcam,185.323517,156.0,156
HP Printer,119.924045,86.0,140
Insteon Camera,114.641222,73.0,60
Light Bulbs LiFX Smart Bulb,96.533507,92.0,123
NEST Protect smoke alarm,303.680796,350.0,509
Netatmo Welcome,561.266181,74.0,1510


In [23]:
def attach_window_id(timestamped_frame: pd.DataFrame) -> pd.DataFrame:
    """Index every packet by the 1-second window it falls into.

    Window 0 starts at the timestamp of the first row. A packet with
    timestamp ``t`` (nanoseconds) gets
    ``window_id = (t - first_timestamp) // 1e9``, i.e. windows are half-open
    intervals ``[start, start + 1s)``.

    Parameters
    ----------
    timestamped_frame : pd.DataFrame
        Packets with an integer nanosecond 'timestamp' column, ordered by
        time so that the first row carries the earliest timestamp.

    Returns
    -------
    pd.DataFrame
        A new frame with the same columns, indexed by an int64 'window_id'.
        The input frame is not modified.
    """
    ns_in_sec = int(1e9)
    timestamps = timestamped_frame['timestamp']

    # Read the start time from the column rather than from a whole row, so
    # the integer timestamp is never upcast together with other columns.
    first_packet_time = timestamps.iloc[0] if len(timestamps) else 0

    # One vectorised floor division replaces one .loc slice assignment per
    # window (millions of them for a multi-day capture).
    window_ids = ((timestamps - first_packet_time) // ns_in_sec).astype(np.int64)

    # Assign by position: the frame may carry repeated row labels.
    return timestamped_frame.assign(window_id=window_ids.to_numpy()).set_index('window_id')


def attach_window_id_fast(timestamped_frame: pd.DataFrame) -> pd.DataFrame:
    """Index every packet by its 1-second window, using ``pd.cut``.

    The bin edges start one nanosecond before the first row's timestamp and
    advance in 1-second steps until the last row's timestamp is covered.
    ``pd.cut`` bins are right-inclusive, so for integer timestamps window
    ``k`` holds ``[first + k s, first + (k + 1) s)``.

    Parameters
    ----------
    timestamped_frame : pd.DataFrame
        Packets with an integer nanosecond 'timestamp' column; the first and
        last rows are taken as the start and end of the capture.

    Returns
    -------
    pd.DataFrame
        The input columns, indexed by a categorical 'window_id' whose
        categories are 0 .. number_of_windows - 1.
    """
    window_ns = int(1e9)
    capture_start = timestamped_frame.iloc[0]['timestamp']
    capture_end = timestamped_frame.iloc[-1]['timestamp']

    # Shift the first edge by -1 ns so the first packet lands inside bin 0.
    edges = range(capture_start - 1, capture_end + window_ns, window_ns)
    bin_labels = range(len(edges) - 1)

    window_column = pd.cut(
        timestamped_frame['timestamp'], edges, labels=bin_labels
    ).to_frame(name="window_id")

    combined = pd.concat([timestamped_frame, window_column], axis=1)
    return combined.set_index('window_id')

# Use the pd.cut-based variant; 'window_id' becomes the index of the result
windowed_frame = attach_window_id_fast(data)

windowed_frame

Unnamed: 0_level_0,packet_id,timestamp,packet_size,eth_src,device_name,protocol,iot
window_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0,1474552802257569024,70,18:b7:9e:02:20:44,,Raw,False
0,1,1474552802257692160,66,18:b7:9e:02:20:44,,TCP,False
0,2,1474552802323085056,66,14:cc:20:51:33:ea,,TCP,False
0,3,1474552802383173888,60,d0:52:a8:00:67:5e,,Padding,False
0,4,1474552802396326144,98,14:cc:20:51:33:ea,,Raw,False
...,...,...,...,...,...,...,...
1690128,4948801,1476242930915371008,1434,14:cc:20:51:33:ea,,Raw,False
1690128,4948802,1476242930915453952,1434,14:cc:20:51:33:ea,,Raw,False
1690128,4948803,1476242930915572992,1434,14:cc:20:51:33:ea,,Raw,False
1690128,4948804,1476242930915685888,1434,14:cc:20:51:33:ea,,Raw,False


In [24]:
def generate_features_labeled(windowed_frame: pd.DataFrame, device_frame: pd.DataFrame):
    """Build per-window traffic features for every listed device, with labels.

    For each row of `device_frame` the packets sent from that MAC address are
    selected and passed to `generate_features`; the device name and IoT flag
    are appended as label columns.

    Parameters
    ----------
    windowed_frame : pd.DataFrame
        Packets indexed by 'window_id' with 'eth_src' and 'packet_size'
        columns.
    device_frame : pd.DataFrame
        Device list with 'eth_src', 'device_name' and 'iot' columns. Its row
        labels may be arbitrary (e.g. have gaps after filtering).

    Returns
    -------
    pd.DataFrame
        Columns ["mean", "std", "n_bytes", "device_name", "iot"], one row per
        (device, window) in which the device sent at least one packet. Empty
        (with those columns) when `device_frame` has no rows.
    """
    feature_columns = ["mean", "std", "n_bytes", "device_name", "iot"]
    per_device = []

    # Visit the rows themselves. Feeding the index *labels* to iloc only works
    # while the labels happen to equal the positions 0..n-1, which is not the
    # case once rows have been filtered out of the device list.
    for _, device in device_frame.iterrows():
        device_packets = windowed_frame.loc[windowed_frame['eth_src'] == device['eth_src']]
        device_features = generate_features(device_packets)

        device_features.insert(len(device_features.columns), "device_name", device['device_name'])
        device_features.insert(len(device_features.columns), "iot", device['iot'])
        device_features.columns = feature_columns

        per_device.append(device_features)

    if not per_device:
        return pd.DataFrame(columns=feature_columns)

    # Concatenate once instead of re-copying the accumulated frame per device.
    return pd.concat(per_device)

def generate_features(windowed_frame: pd.DataFrame):
    """Compute packet-size features for every window.

    Parameters
    ----------
    windowed_frame : pd.DataFrame
        Packets indexed by 'window_id' (or carrying it as a column) with a
        'packet_size' column.

    Returns
    -------
    pd.DataFrame
        Indexed by 'window_id', one row per window that contains packets,
        with the columns, in this order:
        'mean'    - mean packet size in the window,
        'std'     - sample standard deviation of the packet size,
        'n_bytes' - sum of the packet sizes.
    """
    # observed=True: when window_id is categorical, only windows that
    # actually occur in this frame produce a row.
    packet_sizes = windowed_frame.groupby("window_id", observed=True)['packet_size']

    # Named aggregation gives each feature its own column label instead of
    # three columns that are all called 'packet_size'.
    features = packet_sizes.agg(mean='mean', std='std', n_bytes='sum')

    # The sample standard deviation of a single-packet window is NaN; it is
    # replaced by 0 here.
    # TODO Not sure if fillna is a good idea
    return features.fillna(0)

def get_devices_in_window(windowed_frame: pd.DataFrame):
    """List the distinct device names seen in each window.

    Parameters
    ----------
    windowed_frame : pd.DataFrame
        Packets indexed by window id, with a 'device_name' column.

    Returns
    -------
    pd.DataFrame
        Columns 'window_id' and 'devices', one row per distinct
        (window, device name) pair. Windows appear in order of first
        occurrence, device names in order of first occurrence inside their
        window (a missing name counts as one value). The row labels restart
        at 0 for every window.
    """
    # Pair every packet's window with its device name and drop repeats. This
    # also covers windows with a single packet, where a per-window .loc lookup
    # yields a scalar instead of a Series.
    pairs = pd.DataFrame({
        "window_id": windowed_frame.index.to_numpy(),
        "devices": windowed_frame["device_name"].to_numpy(),
    }).drop_duplicates()

    # Keep the rows of one window together, windows in first-seen order; the
    # stable sort preserves the device order inside each window.
    window_codes = pd.factorize(pairs["window_id"])[0]
    order = np.argsort(window_codes, kind="stable")
    pairs = pairs.iloc[order]

    # Number the devices of each window 0..k-1 and use that as row label.
    sorted_codes = window_codes[order]
    pairs.index = pd.Series(sorted_codes).groupby(sorted_codes).cumcount().to_numpy()

    return pairs

# One feature row per (device, window) in which the device sent packets
labeled_features = generate_features_labeled(windowed_frame, devices)

labeled_features

Unnamed: 0_level_0,mean,std,n_bytes,device_name,iot
window_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,60.000000,0.000000,60,Smart Things,True
1,99.500000,55.861436,199,Smart Things,True
11,60.000000,0.000000,60,Smart Things,True
16,60.000000,0.000000,60,Smart Things,True
21,60.000000,0.000000,60,Smart Things,True
...,...,...,...,...,...
1690124,1379.388771,239.297328,1302143,TPLink Router Bridge LAN (Gateway),False
1690125,1384.212121,226.137487,1370370,TPLink Router Bridge LAN (Gateway),False
1690126,1386.777328,219.127197,1370136,TPLink Router Bridge LAN (Gateway),False
1690127,1350.658847,301.655916,1381724,TPLink Router Bridge LAN (Gateway),False


In [None]:
import sklearn.svm as svm
import sklearn.neighbors as neighbors
import sklearn.tree as tree
import sklearn.ensemble as ensemble

def train_and_test_classifiers(labeled_features, target_label, n_splits=10, shuffle=False, random_state=None,
                               feature_columns=None):
    """
    Trains and tests a series of models on the given, labeled data
    (Knn, RF, DT, SVM, Majority Voting)

    Every model is scored with stratified k-fold cross-validation (mean
    accuracy over the folds is printed) and afterwards fitted once on the
    complete data, so the returned models have seen every sample rather than
    only the training part of the last fold.

    Parameters
    ----------
    labeled_features : pd.DataFrame
        Feature columns plus the label column `target_label`.
    target_label : str
        Name of the column to predict.
    n_splits, shuffle, random_state
        Passed to StratifiedKFold. `random_state` is only forwarded when
        `shuffle` is true, because StratifiedKFold rejects a random state
        without shuffling.
    feature_columns : sequence of str, optional
        Names of the feature columns to train on. When omitted, the columns
        at positions 1 and 2 are used.

    Returns
    -------
    dict
        Model name -> estimator fitted on all rows of `labeled_features`.
    """

    if feature_columns is None:
        # NOTE(review): for a frame laid out as
        # ["mean", "std", "n_bytes", ...] positions 1:3 select 'std' and
        # 'n_bytes' and leave 'mean' out - confirm this is intended, or pass
        # feature_columns explicitly.
        X = labeled_features.iloc[:, 1:3]
    else:
        X = labeled_features[list(feature_columns)]
    y = labeled_features[target_label]

    kf = model_selection.StratifiedKFold(
        n_splits=n_splits,
        shuffle=shuffle,
        random_state=random_state if shuffle else None,
    )

    models = {
        "Knn" : neighbors.KNeighborsClassifier(n_neighbors=5, weights="distance"),
        "SVM" : svm.SVC(),
        "DT"  : tree.DecisionTreeClassifier(),
        "RF"  : ensemble.RandomForestClassifier(n_estimators=10, max_depth=None, min_samples_split=2, random_state=0)
    }
    # The voting ensemble works on its own clones of the four base models.
    models["Majority Voting"] = ensemble.VotingClassifier(list(models.items()), voting='hard')

    print("Training and testing models on {}".format(target_label))
    for (name, model) in models.items():
        # cross_val_score fits a fresh clone per fold and leaves `model`
        # itself untouched.
        fold_accuracies = model_selection.cross_val_score(model, X, y, cv=kf, scoring="accuracy")

        print("{} mean accuracy over {} splits: {}".format(name, n_splits, fold_accuracies.mean()))

        # Final fit on everything for the model that is handed back.
        model.fit(X, y)

    return models

# Classifiers for the IoT flag, trained on all feature rows
iot_classifiers = train_and_test_classifiers(labeled_features, 'iot')
# Classifiers for the device name, trained on the rows flagged as IoT only
device_classifiers = train_and_test_classifiers(labeled_features.loc[labeled_features['iot'] == True], 'device_name')

In [None]:
def test_realistic(iot_classifier, device_classifier, unlabeled_features, devices_in_window):
    assert len(unlabeled_features) == len(devices_in_window)

    # accuracy = metrics.accuracy_score(devices_in_window, iot_classifier.predict(unlabeled_features))
    accuracy = metrics.accuracy_score(devices_in_window, device_classifier.predict(unlabeled_features))
    print("Accuracy:", accuracy)


# Features per window over all traffic (not split by device), and the device
# names that occur in each window
unlabeled_features = generate_features(windowed_frame)
devices_in_window = get_devices_in_window(windowed_frame)

test_realistic(iot_classifiers, device_classifiers, unlabeled_features, devices_in_window)

In [82]:
def mode_mean_med(traffic_frame: pd.DataFrame, output_path: str = 'stats.csv'):
    """Calculate the mean, median and mode of the packet size per window and MAC address.

    Parameters
    ----------
    traffic_frame : pd.DataFrame
        Packets with 'eth_src' and 'packet_size' columns and the window id
        either as a 'window_id' column or as the index named 'window_id'.
    output_path : str or None, optional
        Where the result is saved as CSV; pass None to skip writing.
        The default uses the '.csv' extension (previously misspelt '.cvs').

    Returns
    -------
    pd.DataFrame
        One row per (window_id, eth_src) pair that occurs in the data, with
        'window_id' and 'eth_src' as columns followed by the three statistics.
    """
    # The windowing functions return the window id as the index; turn it into
    # a column so it can be selected and grouped like the other fields.
    if 'window_id' not in traffic_frame.columns:
        traffic_frame = traffic_frame.reset_index()

    # Keep only the window id, mac address and packet size
    df = traffic_frame.loc[:, ['window_id', 'eth_src', 'packet_size']]

    # observed=True: with a categorical window id, only combinations that are
    # present in the data are grouped. Without it every window category is
    # paired with every mac address, including empty groups on which
    # x.mode().iat[0] cannot work.
    grouped = (
        df.groupby(['window_id', 'eth_src'], observed=True)
          .agg({'packet_size': ['mean', 'median', lambda x: x.mode().iat[0]]})
          .reset_index()  # make window_id and eth_src columns again
    )

    # Save the results
    if output_path is not None:
        grouped.to_csv(output_path, index=False)

    return grouped

# NOTE(review): the saved output of this cell is a KeyboardInterrupt, i.e. the
# call below was aborted by hand in the recorded run.
result = mode_mean_med(windowed_frame)


KeyboardInterrupt: 

In [81]:

def mode_mean_med_without_window(df: pd.DataFrame, devices: pd.DataFrame, output_path: str = 'stats_no_window.csv'):
    """Calculate the mean, median and mode of the packet size per device over all packets.

    Parameters
    ----------
    df : pd.DataFrame
        Packet records with at least 'eth_src' and 'packet_size' columns.
    devices : pd.DataFrame
        Device list with at least 'device_name' and 'eth_src' columns.
    output_path : str or None, optional
        Where the result is saved as CSV; pass None to skip writing.
        The default uses the '.csv' extension (previously misspelt '.cvs').

    Returns
    -------
    pd.DataFrame
        One row per device name (the index) with the three statistics of
        'packet_size'. The frame is also printed.
    """

    # Keep only the device name and mac address
    device_names = devices.loc[:, ['device_name', 'eth_src']]

    # Keep only the mac address and packet size
    df_size = df.loc[:, ['eth_src', 'packet_size']]

    # Attach the device names to the packets (inner join on the mac address)
    merge_df = device_names.merge(df_size, on='eth_src')

    # Group the merged frame and calculate the stats
    stats = merge_df.groupby(['device_name']).agg({'packet_size': ['mean', 'median', lambda x: x.mode().iat[0]]})

    # Save the results. The index is written on purpose: it holds the device
    # names, and without it the rows of the file cannot be told apart.
    if output_path is not None:
        stats.to_csv(output_path)

    print(stats)

    return stats

# NOTE(review): this reloads the device list and rebinds `devices`. Unlike the
# frame created earlier through devices(..., False), no filtering on 'incl' is
# applied here and the 'incl' column is kept.
devices = pd.read_csv("./data/list_of_devices.csv")
result_no_window = mode_mean_med_without_window(data, devices)


                                    packet_size                   
                                           mean  median <lambda_0>
device_name                                                       
Amazon Echo                          118.734701    75.0         66
Android Phone                        167.706915    66.0         66
Belkin Wemo switch                   415.264227   118.0         66
Belkin wemo motion sensor            113.394691    66.0         66
Blipcare Blood Pressure meter        108.393701    59.0         54
Dropcam                              206.533368   156.0        156
HP Printer                           168.309073    60.0        140
IPhone                               109.954947    66.0         54
Insteon Camera                       104.441343    90.0        102
Laptop                               111.325552    54.0         54
Light Bulbs LiFX Smart Bulb           96.446889    92.0        123
MacBook                              128.950355    66.0       

In [78]:
# n_neighbors = 5
#
# for metric in mmm_frame.columns[2:5]:
#     X = mmm_frame[metric].values
#     y = mmm_frame.iloc[:, -1].values
#     X = X.reshape(-1, 1)
#
#     X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y, test_size=0.25)
#
#     for weights in ["uniform", "distance"]:
#         # we create an instance of Neighbours Classifier and fit the data.
#         clf = neighbors.KNeighborsClassifier(n_neighbors, weights=weights)
#         clf.fit(X_train, y_train)
#
#         accuracy = metrics.accuracy_score(y_test, clf.predict(X_test))
#         print("Accuracy of {} with {}: {}".format(metric, weights, accuracy))