In [1]:
import json
import math
import glob
import os
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
from collections import OrderedDict

In [2]:
BSM_SIZE = 49471102 # 1242779 # 45748391
N_FEATURES = 22
root_dir = '/home/jovyan/work/VeReMi/'  # DoSRandom_1416/'
filenames = glob.glob(root_dir + '**/traceJSON*.json', recursive=True)

remove_datasets = [
    'MixAll',
    #'EventualStop',
    'ConstPos',
    'ConstPosOffset',
    'RandomPos',
    'RandomPosOffset',
    'ConstSpeed',
    'ConstSpeedOffset',
    'RandomSpeed',
    'RandomSpeedOffset',
    'DelayedMessages',
]

filenames = list(filter(lambda filename: all(dataset not in filename for dataset in remove_datasets), filenames))

In [3]:
# # Count the number of BSM messages in the dataset
# line_num = 0
# for filepath in tqdm(filenames):
#     f = open(filepath)
#     for line in f:
#         if '"type":3' in line:
#             line_num += 1
    
# line_num

In [4]:
attacker_sender_map = {}

datasets = OrderedDict()

for filepath in filenames:
    path_parts = filepath.split('/')
    filename = path_parts[-1]
    dataset_name = path_parts[-3]
    _, vehicle, _, attacker = filename.split('-')[:4]
    attacker_type = int(attacker[1:])
    vehicle_id = int(vehicle)
    attacker_sender_map[f'{dataset_name}:{vehicle_id}'] = attacker_type
    datasets[dataset_name] = 1

for i, k in enumerate(datasets.keys()):
    datasets[k] = i

datasets

OrderedDict([('EventualStop_1416', 0),
             ('DataReplaySybil_0709', 1),
             ('DoSRandomSybil_1416', 2),
             ('DoSRandom_0709', 3),
             ('DataReplay_0709', 4),
             ('DataReplay_1416', 5),
             ('GridSybil_0709', 6),
             ('DoS_0709', 7),
             ('Disruptive_1416', 8),
             ('DoS_1416', 9),
             ('DoSDisruptiveSybil_0709', 10),
             ('DoSRandom_1416', 11),
             ('DoSRandomSybil_0709', 12),
             ('EventualStop_0709', 13),
             ('Disruptive_0709', 14),
             ('GridSybil_1416', 15),
             ('DoSDisruptive_0709', 16),
             ('DoSDisruptive_1416', 17),
             ('DataReplaySybil_1416', 18),
             ('DoSDisruptiveSybil_1416', 19)])

In [25]:
# veremi = np.memmap('veremi_atk.memmap', dtype='float32', mode='w+', shape=(BSM_SIZE, N_FEATURES))
# # filenames = filenames[:1000]

# veremi_pos = 0
# for filepath in (pbar := tqdm(filenames)):
#     f = open(filepath)
#     path_parts = filepath.split('/')
#     filename = path_parts[-1]
#     dataset_name = path_parts[-3]
#     receiver = int(filename.split('-')[1])
#     dataset_id = datasets[dataset_name]
#     pbar.set_description("Vehicle ID {}".format(receiver))
#     gps = None
#     for line in f:
#         data = json.loads(line)
#         if data['type'] == 2:
#             gps = data
#         elif data['type'] == 3:
#             attack_type = attacker_sender_map.get(f'{dataset_name}:{data["sender"]}', None)
#             if attack_type is not None:
#                 out = (
#                     dataset_id,
#                     data['messageID'],
#                     data['rcvTime'],
#                     receiver,
#                     data['sender'],
#                     attack_type,
#                     data['pos'][0],
#                     data['pos'][1],
#                     data['spd'][0],
#                     data['spd'][1],
#                     data['acl'][0],
#                     data['acl'][1],
#                     data['hed'][0],
#                     data['hed'][1],
#                     gps['pos'][0],
#                     gps['pos'][1],
#                     gps['spd'][0],
#                     gps['spd'][1],
#                     gps['acl'][0],
#                     gps['acl'][1],
#                     gps['hed'][0],
#                     gps['hed'][1],
#                 )
#                 veremi[veremi_pos] = out
#                 veremi_pos += 1
#             else:
#                 print(f'{dataset_name}:{data["sender"]} is none')

  0%|          | 0/57590 [00:00<?, ?it/s]

In [44]:
veremi = np.memmap('veremi_atk.memmap', dtype='float32', mode='r', shape=(BSM_SIZE, N_FEATURES))

df = pd.DataFrame(veremi, columns=[
    'dataset_id',
    'message_id',
    'bsm_rcv_time',
    'receiver',
    'sender',
    'attack_type',
    'snd_pos_x',
    'snd_pos_y',
    'snd_spd_x',
    'snd_spd_y',
    'snd_acl_x',
    'snd_acl_y',
    'snd_hed_x',
    'snd_hed_y',
    'rcv_pos_x',
    'rcv_pos_y',
    'rcv_spd_x',
    'rcv_spd_y',
    'rcv_acl_x',
    'rcv_acl_y',
    'rcv_hed_x',
    'rcv_hed_y',
])

# df.drop(df[df.attack_type.between(1, 9)].index, inplace=True) # drop faults (1 - 9)
# df.drop(df[df.receiver == df.sender].index, inplace=True) # drop bad registers

# remove message duplicates
df = df.groupby(['dataset_id', 'message_id']).first().reset_index()

# Correcão vide ITA
df.loc[(df.attack_type == 9) & ((df.snd_spd_x != 0) | (df.snd_spd_y != 0)), 'attack_type'] = 0

df.sort_values(['dataset_id', 'sender', 'bsm_rcv_time', 'receiver'], inplace=True)

df.reset_index(inplace=True, drop=True)

# Calc delta_time based on the diff between the messages
def calc_delta(x):
    return pd.Series(np.insert(np.diff(x.bsm_rcv_time), 0, 0))

grouped = df[['dataset_id',
              'sender',
              'bsm_rcv_time']].groupby(['dataset_id',
                                        'sender'])

indexes = grouped.size().index

for k in tqdm(indexes):
    t = grouped.get_group(k)
    t = t.sort_values('bsm_rcv_time')
    time_id = np.insert((np.diff(t.bsm_rcv_time) > 1).cumsum(), 0, 0)
    df.loc[t.index, 'time_id'] = time_id
    # delta_time = calc_delta(t)
    # df.loc[t.index, 'delta_time'] = delta_time.values


df.to_feather('out_veremi/veremi-atk.feather')
df.head(50)

  0%|          | 0/57550 [00:00<?, ?it/s]

Unnamed: 0,dataset_id,message_id,bsm_rcv_time,receiver,sender,attack_type,snd_pos_x,snd_pos_y,snd_spd_x,snd_spd_y,...,snd_hed_y,rcv_pos_x,rcv_pos_y,rcv_spd_x,rcv_spd_y,rcv_acl_x,rcv_acl_y,rcv_hed_x,rcv_hed_y,time_id
0,0.0,8488.0,50404.601562,15.0,9.0,0.0,1317.712891,1013.469055,-0.90591,-1.427407,...,-0.839461,1370.78833,1273.317383,0.751052,-1.693244,0.821176,-1.851237,0.396755,-0.917924,0.0
1,0.0,8597.0,50405.601562,15.0,9.0,0.0,1316.451538,1011.180115,-1.902765,-2.998083,...,-0.836759,1372.011353,1270.615723,1.601819,-3.611294,1.022133,-2.30425,0.396755,-0.917924,0.0
2,0.0,8626.0,50406.601562,15.0,9.0,0.0,1314.070435,1007.3797,-2.831654,-4.461668,...,-0.834376,1374.130737,1266.041382,2.528235,-5.406475,0.743783,-1.590493,0.412991,-0.910735,0.0
3,0.0,8655.0,50407.601562,15.0,9.0,0.0,1310.508789,1002.046814,-3.771291,-5.942189,...,-0.833539,1377.653687,1259.814087,3.789588,-7.009244,0.788273,-1.45763,0.464821,-0.885405,0.0
4,0.0,17105.0,50408.601562,15.0,9.0,0.0,1306.484131,995.269653,-4.688379,-7.387179,...,-0.833539,1382.099121,1251.881226,4.706913,-8.705941,0.769331,-1.422368,0.464821,-0.885405,0.0
5,0.0,17214.0,50409.601562,15.0,9.0,0.0,1299.229492,988.436829,-7.568216,-7.320236,...,-0.674166,1386.068726,1241.537598,4.203956,-11.308191,0.861143,-2.316029,0.335991,-0.941865,0.0
6,0.0,17283.0,50410.601562,15.0,9.0,0.0,1290.474976,981.036743,-9.275353,-7.838631,...,-0.622457,1390.302002,1229.096069,4.730819,-13.372369,0.641116,-1.811916,0.320985,-0.947084,0.0
7,0.0,17352.0,50411.601562,15.0,9.0,0.0,1278.56897,976.230103,-13.275533,-4.239544,...,-0.275733,1392.919189,1214.041992,2.742379,-15.801229,0.29019,-1.671921,0.1564,-0.987694,0.0
8,0.0,17421.0,50412.601562,15.0,9.0,0.0,1264.393433,975.198364,-14.455855,0.724145,...,0.079725,1394.478271,1197.050537,1.264351,-17.553642,0.051925,-0.718282,0.057078,-0.99837,0.0
9,0.0,43094.0,50413.601562,45.0,9.0,0.0,1249.652466,976.428284,-14.471983,1.223206,...,0.147544,1391.741821,1221.377808,0.168931,-0.973328,0.386942,-2.229773,0.161888,-0.986809,0.0


In [9]:
datasets

OrderedDict([('EventualStop_1416', 0),
             ('DataReplaySybil_0709', 1),
             ('DoSRandomSybil_1416', 2),
             ('DoSRandom_0709', 3),
             ('DataReplay_0709', 4),
             ('DataReplay_1416', 5),
             ('GridSybil_0709', 6),
             ('DoS_0709', 7),
             ('Disruptive_1416', 8),
             ('DoS_1416', 9),
             ('DoSDisruptiveSybil_0709', 10),
             ('DoSRandom_1416', 11),
             ('DoSRandomSybil_0709', 12),
             ('EventualStop_0709', 13),
             ('Disruptive_0709', 14),
             ('GridSybil_1416', 15),
             ('DoSDisruptive_0709', 16),
             ('DoSDisruptive_1416', 17),
             ('DataReplaySybil_1416', 18),
             ('DoSDisruptiveSybil_1416', 19)])

In [8]:
df = pd.read_feather('out_veremi/veremi-atk.feather')
df.groupby('dataset_id').attack_type.unique()

dataset_id
0.0      [0.0, 9.0]
1.0     [0.0, 17.0]
2.0     [0.0, 18.0]
3.0     [0.0, 14.0]
4.0     [0.0, 11.0]
5.0     [0.0, 11.0]
6.0     [0.0, 16.0]
7.0     [0.0, 13.0]
8.0     [0.0, 10.0]
9.0     [0.0, 13.0]
10.0    [0.0, 19.0]
11.0    [0.0, 14.0]
12.0    [0.0, 18.0]
13.0     [0.0, 9.0]
14.0    [0.0, 10.0]
15.0    [0.0, 16.0]
16.0    [0.0, 15.0]
17.0    [0.0, 15.0]
18.0    [0.0, 17.0]
19.0    [0.0, 19.0]
Name: attack_type, dtype: object

In [27]:
df = pd.read_feather('out_veremi/veremi-atk.feather')

random_state = 42

# group by sender and receiver
#grouped = df.groupby(['dataset_id', 'sender', 'receiver', 'attack_type'])
grouped = df.groupby(['dataset_id', 'sender', 'attack_type', 'time_id'])
interactions = grouped.size().rename('seq_len').reset_index()

train = interactions.sample(frac=0.8, random_state=random_state)
test = interactions.drop(train.index)

train_full_genuine = train[train.attack_type == 0]

sample_size_5_pctg = round(len(train_full_genuine) / 0.95 - len(train_full_genuine))
train_5_pctg_atk = train[train.attack_type != 0].sample(sample_size_5_pctg, random_state=random_state)
train_95_genuine = pd.concat([train_full_genuine, train_5_pctg_atk]).sample(frac=1, random_state=random_state) # concat and shuffle

sample_size_10_pctg = round(len(train_full_genuine) / 0.9 - len(train_full_genuine))
train_10_pctg_atk = train[train.attack_type != 0].sample(sample_size_10_pctg, random_state=random_state)
train_90_genuine = pd.concat([train_full_genuine, train_10_pctg_atk]).sample(frac=1, random_state=random_state) # concat and shuffle

del train_10_pctg_atk
del train_5_pctg_atk
del interactions
del grouped

print(f'''
          Train: {len(train)}, atk_pctg: {len(train[train.attack_type > 0]) / len(train)},
          Train 0%: {len(train_full_genuine)}, atk_pctg: {len(train_full_genuine[train_full_genuine.attack_type > 0]), len(train_full_genuine)},
          Train 5%: {len(train_95_genuine)}, atk_pctg: {len(train_95_genuine[train_95_genuine.attack_type > 0]), len(train_95_genuine)},
          Train 10%: {len(train_90_genuine)}, atk_pctg: {len(train_90_genuine[train_90_genuine.attack_type > 0]), len(train_90_genuine)},
          Test: {len(test)}, atk_pctg: {len(test[test.attack_type > 0]), len(test)}''')


          Train: 105434, atk_pctg: 0.2790940303886792,
          Train 0%: 76008, atk_pctg: (0, 76008),
          Train 5%: 80008, atk_pctg: (4000, 80008),
          Train 10%: 84453, atk_pctg: (8445, 84453),
          Test: 26359, atk_pctg: (7330, 26359)


In [28]:
# cols = ['dataset_id', 'sender', 'receiver', 'attack_type']
cols = ['dataset_id', 'sender', 'attack_type', 'time_id']

test_df = df.join(test.set_index(cols), how='inner', on=cols)
test_df.reset_index(drop=True).to_feather('out_veremi/test-atk.feather')
del test_df
del test

train_df = df.join(train.set_index(cols), how='inner', on=cols)
train_df.reset_index(drop=True).to_feather('out_veremi/train-atk.feather')
del train_df
del train

train_full_genuine_df = df.join(train_full_genuine.set_index(cols), how='inner', on=cols)
train_full_genuine_df.reset_index(drop=True).to_feather('out_veremi/train_full_genuine-atk.feather')
del train_full_genuine_df
del train_full_genuine

train_95_genuine_df = df.join(train_95_genuine.set_index(cols), how='inner', on=cols)
train_95_genuine_df.reset_index(drop=True).to_feather('out_veremi/train_95_genuine-atk.feather')
del train_95_genuine_df
del train_95_genuine

train_90_genuine_df = df.join(train_90_genuine.set_index(cols), how='inner', on=cols)
train_90_genuine_df.reset_index(drop=True).to_feather('out_veremi/train_90_genuine-atk.feather')
del train_90_genuine_df
del train_90_genuine

3500679
