In [43]:
import pandas as pd
# import vaex
import numpy as np
import glob
import dask.dataframe as dd
import json
from sklearn.model_selection import train_test_split
import math
import os
import csv
import torch.nn.functional as F
import torch
from sklearn.metrics import accuracy_score, recall_score, f1_score, precision_score, classification_report, confusion_matrix
import time
from torch.utils.data import Dataset, DataLoader
import warnings
import torch.nn as nn
import tqdm
import sys
import swifter


In [2]:
def hex_to_int(hex_value):
    return int(hex_value, base=16)

In [3]:
def hex_string_to_array(hex_string):
    if hex_string == 'z':
        return []
    else:
        return list(map(hex_to_int, hex_string))

In [4]:
hex_string_to_array('0D0')

[0, 13, 0]

In [5]:
def fill_flag(sample):
    if not isinstance(sample['Flag'], str):
        col = 'Data' + str(sample['DLC'])
        sample['Flag'] = sample[col]
        sample[col] = '00'
    return sample

In [6]:
# Read by dask first
attributes = ['Timestamp', 'canID', 'DLC',
   'Data0', 'Data1', 'Data2',
           'Data3', 'Data4', 'Data5',
                           'Data6', 'Data7', 'Flag']
dataset_path  = '../data/car-hacking/'
attack_types = ['DoS', 'Fuzzy', 'gear', 'RPM', 'Test']
attack = attack_types[4]
file_name = '{}{}_dataset.csv'.format(dataset_path, attack)
print(file_name)

../data/car-hacking/Test_dataset.csv


In [44]:
def get_time(pe, time_position):
    # 根据时间位置切分出对应的位置编码
    pe = torch.index_select(pe, 0, time_position)
    return pe

In [79]:
df = dd.read_csv(file_name, header=None, names=attributes , dtype={5: 'object',
       9: 'object', 7: 'object', 6: 'object', 11: 'object'})
df = df.apply(fill_flag, axis=1, meta={'Timestamp': 'float64', 'canID': 'object', 'DLC': 'int64', 'Data0': 'object', 'Data1': 'object', 'Data2': 'int64', 'Data3': 'object', 'Data4': 'object', 'Data5': 'object', 'Data6': 'float64', 'Data7': 'object', 'Flag': 'object'})

df[['Data0', 'Data1', 'Data2', 'Data3', 'Data4', 'Data5', 'Data6', 'Data7']] = df[['Data0', 'Data1', 'Data2', 'Data3', 'Data4', 'Data5', 'Data6', 'Data7']].fillna('00')

df['Payload'] = df[['Data0', 'Data1', 'Data2', 'Data3', 'Data4', 'Data5', 'Data6', 'Data7']].apply(lambda x: ''.join(x.astype(str)), axis=1, meta=(None, 'object'))

pd_df = df.compute()
pd_df = pd_df[['Timestamp', 'canID', 'DLC', 'Payload', 'Flag']].sort_values('Timestamp',  ascending=True)
pd_df['Flag'] = pd_df['Flag'].apply(lambda x: True if x == 'T' else False)
print(pd_df)

header = np.array(list(map(hex_string_to_array, list(pd_df['canID']))))[:100]
header = torch.from_numpy(header)
print("HEADER BEFORE: ", header)

sl_sum = np.array(list(map(hex_string_to_array, list(pd_df['Payload']))))
sl_sum = torch.from_numpy(sl_sum)
print("PAYLOAD BEFORE: ",sl_sum)

ori_seq_len = header.shape[0]
pad_len = 100 - ori_seq_len
print(pad_len)

header = F.pad(header.T, (0, pad_len)).T.numpy()
sl_sum = F.pad(sl_sum.T, (0, pad_len)).T.numpy()

if pad_len == 0:
       mask = np.array([False] * ori_seq_len)
else:
       mask = np.concatenate((np.array([False] * ori_seq_len), np.array([True] * pad_len)))
       
time_record = pd_df['Timestamp']
len_time_record = len(time_record)

for i in range(len_time_record):
       value = round(math.log(round(time_record[i] / 1e-6) + 1, 2))
       time_record[i] = value
for j in range(100 - len_time_record):
       time_record = np.append(time_record, time_record[len_time_record - 1])

pe = torch.tensor([[pos / (10000.0 ** (i // 2 * 2.0 / 40)) for i in range(40)] for pos in range(10000)])
pe[:, 0::2] = np.sin(pe[:, 0::2])  # 偶数列用sin
pe[:, 1::2] = np.cos(pe[:, 1::2])  # 奇数列用cos

time_feature = get_time(pe, torch.IntTensor(time_record))
sample = {'header': header, 'sl_sum': sl_sum, 'mask': mask, 'time': time_feature}

print("HEADER FEATURE: ", header, " AND LENGTH: ", len(header))
print("PAYLOAD FEATURE: ", sl_sum, " AND LENGTH: ", len(sl_sum))
print("MASK FEATURE: ", mask, " AND LENGTH: ", len(mask))
print("TIME FEATURE: ", time_feature)
# print("LABEL: ", label)
# print("INDEX: ", idx)


#print(sample)


       Timestamp canID  DLC           Payload   Flag
0   1.478196e+09  0545    8  d800008a00000000  False
1   1.478196e+09  02b0    5  ff7f000549000000  False
2   1.478196e+09  0002    8  0000000000010715  False
3   1.478196e+09  0153    8  002110ff00ff0000  False
4   1.478196e+09  0130    8  198000fffe7f0760  False
..           ...   ...  ...               ...    ...
95  1.478196e+09  0260    8  18212130088f7006  False
96  1.478196e+09  02a0    8  0400991d9702bd00  False
97  1.478196e+09  0329    8  40b87f1411200014  False
98  1.478196e+09  0545    8  d855008b00000000  False
99  1.478196e+09  02b0    5  ff7f00053e000000  False

[100 rows x 5 columns]
HEADER BEFORE:  tensor([[ 0,  5,  4,  5],
        [ 0,  2, 11,  0],
        [ 0,  0,  0,  2],
        [ 0,  1,  5,  3],
        [ 0,  1,  3,  0],
        [ 0,  1,  3,  1],
        [ 0,  1,  4,  0],
        [ 0,  3,  5,  0],
        [ 0,  2, 12,  0],
        [ 0,  3,  7,  0],
        [ 0,  4,  3, 15],
        [ 0,  4,  4,  0],
        [ 0,

In [55]:
warnings.filterwarnings("ignore")

def draw_confusion(label_y, pre_y, path):
    confusion = confusion_matrix(label_y, pre_y)
    print(confusion)


def write_result(fin, label_y, pre_y, classes_num):
    if classes_num > 2:
        accuracy = accuracy_score(label_y, pre_y)
        macro_precision = precision_score(label_y, pre_y, average='macro')
        macro_recall = recall_score(label_y, pre_y, average='macro')
        macro_f1 = f1_score(label_y, pre_y, average='macro')
        micro_precision = precision_score(label_y, pre_y, average='micro')
        micro_recall = recall_score(label_y, pre_y, average='micro')
        micro_f1 = f1_score(label_y, pre_y, average='micro')
        print('  -- test result: ')
        fin.write('  -- test result: \n')
        print('    -- accuracy: ', accuracy)
        fin.write('    -- accuracy: ' + str(accuracy) + '\n')
        print('    -- macro precision: ', macro_precision)
        fin.write('    -- macro precision: ' + str(macro_precision) + '\n')
        print('    -- macro recall: ', macro_recall)
        fin.write('    -- macro recall: ' + str(macro_recall) + '\n')
        print('    -- macro f1 score: ', macro_f1)
        fin.write('    -- macro f1 score: ' + str(macro_f1) + '\n')
        print('    -- micro precision: ', micro_precision)
        fin.write('    -- micro precision: ' + str(micro_precision) + '\n')
        print('    -- micro recall: ', micro_recall)
        fin.write('    -- micro recall: ' + str(micro_recall) + '\n')
        print('    -- micro f1 score: ', micro_f1)
        fin.write('    -- micro f1 score: ' + str(micro_f1) + '\n\n')
        report = classification_report(label_y, pre_y)
        fin.write(report)
        fin.write('\n\n')
    else:
        accuracy = accuracy_score(label_y, pre_y)
        precision = precision_score(label_y, pre_y)
        recall = recall_score(label_y, pre_y)
        f1 = f1_score(label_y, pre_y)
        print('  -- test result: ')
        print('    -- accuracy: ', accuracy)
        fin.write('    -- accuracy: ' + str(accuracy) + '\n')
        print('    -- recall: ', recall)
        fin.write('    -- recall: ' + str(recall) + '\n')
        print('    -- precision: ', precision)
        fin.write('    -- precision: ' + str(precision) + '\n')
        print('    -- f1 score: ', f1)
        fin.write('    -- f1 score: ' + str(f1) + '\n\n')
        report = classification_report(label_y, pre_y)
        fin.write(report)
        fin.write('\n\n')

In [160]:
class DNN(nn.Module):
    def __init__(self, d_in, d_out):  # config.slsum_count, config.dnn_out_d
        super(DNN, self).__init__()
        self.l1 = nn.Linear(d_in, 128)
        self.l2 = nn.Linear(128, 64)
        self.l3 = nn.Linear(64, d_out)

    def forward(self, x):
        # print('x: ', x.numpy()[0])
        out = F.relu(self.l1(x))
        out = F.relu(self.l2(out))
        out = F.relu(self.l3(out))
        # print('dnn out: ', out.detach().numpy()[0])
        return out

In [161]:
class Time_Positional_Encoding(nn.Module):
    def __init__(self, embed, max_time_position, device):
        super(Time_Positional_Encoding, self).__init__()
        self.device = device

    def forward(self, x, time_position):
        out = x.permute(1, 0, 2)
        out = out + nn.Parameter(time_position, requires_grad=False).to(self.device)
        out = out.permute(1, 0, 2)
        return out

In [23]:
class MyTrans(nn.Module):
    def __init__(self, config):
        super(MyTrans, self).__init__()
        self.dnn = DNN(config.slsum_count, config.dnn_out_d)
        self.head_dnn = DNN(60, config.head_dnn_out_d)
        self.position_embedding = Time_Positional_Encoding(config.d_model, config.max_time_position, config.device).to(
            config.device)
        self.encoder_layer = nn.TransformerEncoderLayer(d_model=config.d_model, nhead=config.nhead).to(config.device)
        self.transformer_encoder = nn.TransformerEncoder(self.encoder_layer, num_layers=config.num_layers).to(
            config.device)
        self.fc = nn.Linear(config.d_model, config.classes_num).to(config.device)
        self.pad_size = config.pad_size
        self.dnn_out_d = config.dnn_out_d
        self.head_dnn_out_d = config.head_dnn_out_d

    def forward(self, header, sl_sum, mask, time_position):
        dnn_out = torch.empty((sl_sum.shape[0], self.dnn_out_d, 0))

        for i in range(self.pad_size):
            tmp = self.dnn(sl_sum[:, i, :]).unsqueeze(2)
            dnn_out = torch.concat((dnn_out, tmp), dim=2)
        dnn_out = dnn_out.permute(0, 2, 1)

        head_dnn_out = torch.empty((header.shape[0], self.head_dnn_out_d, 0))
        for i in range(self.pad_size):
            tmp = self.head_dnn(header[:, i, :]).unsqueeze(2)
            head_dnn_out = torch.concat((head_dnn_out, tmp), dim=2)
        head_dnn_out = head_dnn_out.permute(0, 2, 1)

        x = torch.concat((head_dnn_out, dnn_out), dim=2).permute(1, 0, 2)

        out = self.position_embedding(x, time_position)
        out = self.transformer_encoder(out, src_key_padding_mask=mask)
        out = out.permute(1, 0, 2)
        out = torch.sum(out, 1)
        out = self.fc(out)
        return out

In [169]:
class Config:
    def __init__(self):
        self.model_name = 'Transformer'
        self.slide_window = 2
        self.slsum_count = int(math.pow(4, self.slide_window))  # 滑动窗口计数的特征的长度 n-gram?
        self.dnn_out_d = 8  # 经过DNN后的滑动窗口计数特征的维度 Dimensions of sliding window count features after DNN
        self.head_dnn_out_d = 2
        self.d_model = self.dnn_out_d + self.head_dnn_out_d  # transformer的输入的特征的维度, dnn_out_d + 包头长度 The dimension of the input feature of the transformer, dnn_out_d + header length
        self.pad_size = 100
        self.max_time_position = 10000
        self.nhead = 5
        self.num_layers = 3
        self.gran = 1e-6
        self.log_e = 2
        self.device = torch.device('cuda' if torch.backends.mps.is_available() else 'cpu')
        self.classes_num = 2
        self.batch_size = 10
        self.epoch_num = 5
        self.lr = 0.001
        self.train_pro = 0.8  # 训练集比例 Ratio of training set

        self.data_root_dir = '../data/car-hacking'
        self.sl_sum_dir = '../data/car_hacking-data_slide_count_' + str(
            self.slide_window) + '_arr'
        self.time_dir = '../data/car_hacking-data_time'
        self.names_file = '../data/name_class_CICIDS_3.csv'
        self.model_save_path = '../model/' + self.model_name + '/'
        if not os.path.exists(self.model_save_path):
            os.mkdir(self.model_save_path)
        self.result_file = '/Users/d41sy/Desktop/sch/coding/ml-ids/result/trans8_performance.txt'

        self.isload_model = False  # 是否加载模型继续训练 Whether to load the model and continue training
        self.start_epoch = 24  # 加载的模型的epoch The epoch of the loaded model
        self.model_path = 'model/' + self.model_name + '/' + self.model_name + '_model_' + str(self.start_epoch) + '.pth'  # 要使用的模型的路径 path to the model to use


In [170]:
config = Config()

fin = open(config.result_file, 'a')
fin.write('-------------------------------------\n')
fin.write(config.model_name + '\n')
fin.write('begin time: ' + str(time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())) + '\n')
fin.write('data root dir: ' + config.data_root_dir + '\n')
fin.write('sl_sum_dir: ' + config.sl_sum_dir + '\n')
fin.write('names_file: ' + config.names_file + '\n')
fin.write('d_model: ' + str(config.d_model) + '\t pad_size: ' + str(config.pad_size) + '\t nhead: ' + str(config.nhead)
          + '\t num_layers: ' + str(config.num_layers) + '\t head_dnn_out_d: '+ str(config.head_dnn_out_d) +'\n')
fin.write(
    'batch_size: ' + str(config.batch_size) + '\t train pro: ' + str(config.train_pro) + '\t learning rate: ' + str(
        config.lr) + '\n\n')
fin.close()
seed = 1
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
torch.cuda.manual_seed_all(seed)

In [171]:
class DatasetPreprocess(Dataset):
    def __init__(self, root_path, payload_path, timestamp_path, names_file, pad_size, embed, max_time_position, gran, log_e, transform=None):
        self.root_path = root_path
        self.payload_path = payload_path
        self.timestamp_path = timestamp_path
        self.names_file = names_file # file contain list filenames of data
        self.transform = transform
        self.size = 0
        self.name_list = []
        self.pad_size = pad_size
        self.embed = embed
        self.max_time_position = max_time_position
        self.gran = gran
        self.log_e = log_e
        
        
        if not os.path.isfile(self.names_file):
            print(self.names_file + 'does not exist!')
        f = open(self.names_file, 'r')
        reader = csv.reader(f)
        for line in reader:
            self.name_list.append(line)
            self.size += 1 # size of data files

        self.pe = torch.tensor(
            [[pos / (10000.0 ** (i // 2 * 2.0 / self.embed)) for i in range(self.embed)] for pos in
             range(self.max_time_position)])
        self.pe[:, 0::2] = np.sin(self.pe[:, 0::2])  # Use sin for even columns
        self.pe[:, 1::2] = np.cos(self.pe[:, 1::2])  # Use cos for odd columns
        
    def __len__(self):
        return self.size
    
    def get_time(self, time_position):
        # Segment the corresponding position code according to the time position
        pe = torch.index_select(self.pe, 0, time_position)
        return pe
    
    def __getitem__(self, idx):
        # print("NAMELIST: ", self.name_list)
        
        # READ FILE CAR-HACKING | file_name {Fuzzy, DDos, RPM, GEAR}
        df = dd.read_csv(file_name, header=None, names=attributes , dtype={5: 'object', 9: 'object', 7: 'object', 6: 'object', 11: 'object'})
        df = df.apply(fill_flag, axis=1, meta={'Timestamp': 'float64', 'canID': 'object', 'DLC': 'int64', 'Data0': 'object', 'Data1': 'object', 'Data2': 'int64', 'Data3': 'object', 'Data4': 'object', 'Data5': 'object', 'Data6': 'float64', 'Data7': 'object', 'Flag': 'object'})

        # EXTRACTION
        df[['Data0', 'Data1', 'Data2', 'Data3', 'Data4', 'Data5', 'Data6', 'Data7']] = df[['Data0', 'Data1', 'Data2', 'Data3', 'Data4', 'Data5', 'Data6', 'Data7']].fillna('00')
        df['Payload'] = df[['Data0', 'Data1', 'Data2', 'Data3', 'Data4', 'Data5', 'Data6', 'Data7']].apply(lambda x: ''.join(x.astype(str)), axis=1, meta=(None, 'object'))
        pd_df = df.compute()
        pd_df = pd_df[['Timestamp', 'canID', 'DLC', 'Payload', 'Flag']].sort_values('Timestamp',  ascending=True)
        pd_df['Flag'] = pd_df['Flag'].apply(lambda x: True if x == 'T' else False)
        false_data = pd_df[pd_df['Flag'] == False]
        # pd_df.head(30)

        # PREPROCESS
        ## GET HEADER 
        header = np.array(list(map(hex_string_to_array, list(false_data['canID']))))[:100]
        header = torch.from_numpy(header)
        # print("HEADER BEFORE: ", header)
        ## GET PAYLOAD 
        payload = np.array(list(map(hex_string_to_array, list(false_data['Payload']))))
        payload = torch.from_numpy(payload)
        # print("PAYLOAD BEFORE: ",payload)

        ori_seq_len = header.shape[0]
        pad_len = 100 - ori_seq_len
        # print(pad_len)

        ## PAD WITH MAX SIZE = 100
        header = F.pad(header.T, (0, pad_len)).T.numpy()
        payload = F.pad(payload.T, (0, pad_len)).T.numpy()

        if pad_len == 0:
            mask = np.array([False] * ori_seq_len)
        else:
            mask = np.concatenate((np.array([False] * ori_seq_len), np.array([True] * pad_len)))
        
        ## GET TIMESTAMP 
        time_record = false_data['Timestamp']
        len_time_record = len(time_record)

        for i in range(len_time_record):
            value = round(math.log(round(time_record[i] / self.gran) + 1, self.log_e))
            time_record[i] = value
        for j in range(self.pad_size - len_time_record):
            time_record = np.append(time_record, time_record[len_time_record - 1])

        time_feature = self.get_time(torch.IntTensor(time_record))
        sample = {'header': header, 'sl_sum': payload, 'mask': mask, 'time': time_feature, 'label': 1, 'idx': idx}

        # print("HEADER FEATURE: ", header, " AND LENGTH: ", len(header))
        print("PAYLOAD FEATURE: ", payload, " AND LENGTH: ", len(payload[0]))
        # print("MASK FEATURE: ", mask, " AND LENGTH: ", len(mask))
        # print("TIME FEATURE: ", time_feature)
        # print("LABEL: ", label)
        # print("INDEX: ", idx)

        if self.transform:
            sample = self.transform(sample)

        # print("SAMPLE: ", sample)
        return sample

        

In [172]:
dataset = DatasetPreprocess(config.data_root_dir, config.sl_sum_dir, config.time_dir, config.names_file, config.pad_size, config.d_model, config.max_time_position, config.gran, config.log_e)
size = len(dataset)

In [173]:
train_size = int(config.train_pro * size)
test_size = size - train_size
print("TRAIN SIZE: ", train_size, " TEST SIZE: ", test_size, " SIZE: ", size)

train_dataset, test_dataset = torch.utils.data.random_split(dataset, [train_size, test_size])
train_loader = DataLoader(dataset=train_dataset, batch_size=config.batch_size, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=config.batch_size)
print('finish load data')

TRAIN SIZE:  16  TEST SIZE:  5  SIZE:  21
finish load data


In [174]:
if config.isload_model:
    print("Case loaded")
    fin = open(config.result_file, 'a')
    fin.write('load trained model :    model_path: ' + config.model_path)
    model = torch.load(config.model_path)
    start_epoch = config.start_epoch
    fin.close()
else:
    print("Case trained")
    model = MyTrans(config)
    start_epoch = -1
loss_func = nn.CrossEntropyLoss().to(config.device)
opt = torch.optim.Adam(model.parameters(), lr=config.lr)

for epoch in range(start_epoch + 1, config.epoch_num):
    fin = open(config.result_file, 'a')
    print('--- epoch ', epoch)
    fin.write('-- epoch ' + str(epoch) + '\n')
    for i, sample_batch in enumerate(train_loader):
        batch_header = sample_batch['header'].type(torch.FloatTensor).to(config.device)
        batch_sl_sum = sample_batch['sl_sum'].type(torch.FloatTensor).to(config.device)
        batch_mask = sample_batch['mask'].to(config.device)
        batch_label = sample_batch['label'].to(config.device)
        batch_time_position = sample_batch['time'].to(config.device)
        print("BATCH HEADER: ", len(batch_header[0]), " \nBATCH PAYLOAD: ", batch_sl_sum[0][0]," \nBATCH MASK: ", len(batch_mask)," \nBATCH LABEL: ", batch_label," \nBATCH TIME POSITION: ", batch_time_position[0])
        out = model(batch_header, batch_sl_sum, batch_mask, batch_time_position)
        loss = loss_func(out, batch_label)
        opt.zero_grad()
        loss.backward()
        opt.step()
        if i % 20 == 0:
            print('iter {} loss: '.format(i), loss.item())
    torch.save(model, (config.model_save_path + config.model_name + '_model_{}.pth').format(epoch))

    # test
    label_y = []
    pre_y = []
    with torch.no_grad():
        for j, test_sample_batch in enumerate(test_loader):
            test_header = test_sample_batch['header'].type(torch.FloatTensor).to(config.device)
            test_sl_sum = test_sample_batch['sl_sum'].type(torch.FloatTensor).to(config.device)
            test_mask = test_sample_batch['mask'].to(config.device)
            test_label = test_sample_batch['label'].to(config.device)
            test_time_position = test_sample_batch['time'].to(config.device)
            
            test_out = model(test_header, test_sl_sum, test_mask, test_time_position)

            pre = torch.max(test_out, 1)[1].cpu().numpy()
            pre_y = np.concatenate([pre_y, pre], 0)
            label_y = np.concatenate([label_y, test_label.cpu().numpy()], 0)
        write_result(fin, label_y, pre_y, config.classes_num)
    fin.close()

fin = open(config.result_file, 'a')
fin.write('\n\n\n')
fin.close()

Case trained


AssertionError: Torch not compiled with CUDA enabled

In [None]:

def serialize_example(x, y):
    """converts x, y to tf.train.Example and serialize"""
    # Need to pay attention to whether it needs to be converted to numpy() form
    input_features = tf.train.Int64List(value=np.array(x).flatten())
    label = tf.train.Int64List(value=np.array([y]))
    features = tf.train.Features(
        feature={
            "input_features": tf.train.Feature(int64_list=input_features),
            "label": tf.train.Feature(int64_list=label)
        }
    )
    example = tf.train.Example(features=features)
    return example.SerializeToString()

def write_tfrecord(data, filename):
    tfrecord_writer = tf.io.TFRecordWriter(filename)
    for _, row in tqdm(data.iterrows()):
        tfrecord_writer.write(serialize_example(row['features'], row['label']))
    tfrecord_writer.close()

In [111]:
def split_data(attack, dataset_path, window_size = 100, strided_size = 100):
    file_name = '{}{}_dataset.csv'.format(dataset_path, attack)
    if not os.path.exists(file_name):
            print(file_name, ' does not exist!')
            return None
    
    splited_list = list()
    
    # for idx in range(0, )
    
    df = pd.read_csv(file_name, header=None, names=attributes)
    print("Reading {}: done".format(file_name))
    df = df.sort_values('Timestamp', ascending=True)
    df = df.swifter.apply(fill_flag, axis=1) 
    
    num_data_bytes = 8
    for x in range(num_data_bytes):
        df['Data'+str(x)] = df['Data'+str(x)].map(lambda x: int(x, 16), na_action='ignore')
    # print("HEADER BEFORE: ", header)
    ## GET PAYLOAD 
    df['canID'] = df['canID'].apply(lambda x: hex_string_to_array(x))
    
    df = df.fillna(0)
    data_cols = ['Data{}'.format(x) for x in range(num_data_bytes)]
    df[data_cols] = df[data_cols].astype(int) 
    df['Data'] = df[data_cols].values.tolist()
    df['Flag'] = df['Flag'].apply(lambda x: 1 if x=='T' else 0)
    print("Pre-processing: Done")
    
    print("BYTEs: ", df['Flag'][0].nbytes)
    
    as_strided = np.lib.stride_tricks.as_strided
    output_shape = ((len(df) - window_size) // strided_size + 1, window_size)
    timestamp = as_strided(df.Timestamp, output_shape, (8*strided_size, 8))
    canid = as_strided(df.canID, output_shape, (8*strided_size, 8))
    data = as_strided(df.Data, output_shape, (8*strided_size, 8)) #Stride is counted by bytes
    label = as_strided(df.Flag, output_shape, (1*strided_size, 1))
    
    print("Data output", timestamp[0])
    print("Data output", canid[0])
    print("Data output", data[0])
    print("Data output", label[0])
    
    df = pd.DataFrame({
        'timestamp': pd.Series(timestamp.tolist()), 
        'header': pd.Series(canid.tolist()), 
        'payload': pd.Series(data.tolist()),
        'label': pd.Series(label.tolist())
    }, index= range(len(canid)))
    
    #df['label'] = df['label'].apply(lambda x: attack_id if any(x) else 0)
    print("Aggregating data: Done")
    print('#Normal: ', df[df['label'] == 0].shape[0])
    print('#Attack: ', df[df['label'] != 0].shape[0])
    return df[['timestamp', 'header', 'payload', 'label']].reset_index().drop(['index'], axis=1)


attributes = ['Timestamp', 'canID', 'DLC',
                           'Data0', 'Data1', 'Data2',
                           'Data3', 'Data4', 'Data5',
                           'Data6', 'Data7', 'Flag']
dataset_path  = '../data/car-hacking/'
attack_types = ['DoS', 'Fuzzy', 'gear', 'RPM', 'Test']

split_data(attack_types[4], dataset_path, 100, 100)

Reading ../data/car-hacking/Test_dataset.csv: done


Pandas Apply:   0%|          | 0/600 [00:00<?, ?it/s]

Pre-processing: Done
BYTEs:  8
Data output [1.47819572e+09 1.47819572e+09 1.47819572e+09 1.47819572e+09
 1.47819572e+09 1.47819572e+09 1.47819572e+09 1.47819572e+09
 1.47819572e+09 1.47819572e+09 1.47819572e+09 1.47819572e+09
 1.47819572e+09 1.47819572e+09 1.47819572e+09 1.47819572e+09
 1.47819572e+09 1.47819572e+09 1.47819572e+09 1.47819572e+09
 1.47819572e+09 1.47819572e+09 1.47819572e+09 1.47819572e+09
 1.47819572e+09 1.47819572e+09 1.47819572e+09 1.47819572e+09
 1.47819572e+09 1.47819572e+09 1.47819572e+09 1.47819572e+09
 1.47819572e+09 1.47819572e+09 1.47819572e+09 1.47819572e+09
 1.47819572e+09 1.47819572e+09 1.47819572e+09 1.47819572e+09
 1.47819572e+09 1.47819572e+09 1.47819572e+09 1.47819572e+09
 1.47819572e+09 1.47819572e+09 1.47819572e+09 1.47819572e+09
 1.47819572e+09 1.47819572e+09 1.47819572e+09 1.47819572e+09
 1.47819572e+09 1.47819572e+09 1.47819572e+09 1.47819572e+09
 1.47819572e+09 1.47819572e+09 1.47819572e+09 1.47819572e+09
 1.47819572e+09 1.47819572e+09 1.47819572e

Unnamed: 0,timestamp,header,payload,label
0,"[1478195721.903877, 1478195721.903877, 1478195...","[[0, 5, 4, 5], [0, 5, 4, 5], [0, 5, 4, 5], [0,...","[[216, 0, 0, 138, 0, 0, 0, 0], [216, 0, 0, 138...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,"[1478195721.91337, 1478195721.91337, 147819572...","[[0, 2, 10, 0], [0, 2, 10, 0], [0, 3, 2, 9], [...","[[4, 0, 153, 29, 151, 2, 189, 0], [4, 0, 153, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,"[1478195721.921601, 1478195721.921601, 1478195...","[[0, 4, 3, 15], [0, 4, 3, 15], [0, 4, 3, 15], ...","[[16, 64, 96, 255, 125, 139, 9, 0], [16, 64, 9...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,"[1478195721.93064, 1478195721.93064, 147819572...","[[0, 3, 7, 0], [0, 3, 7, 0], [0, 3, 7, 0], [0,...","[[0, 32, 0, 0, 0, 0, 0, 0], [0, 32, 0, 0, 0, 0...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,"[1478195721.939658, 1478195721.939658, 1478195...","[[0, 3, 5, 0], [0, 3, 5, 0], [0, 3, 7, 0], [0,...","[[5, 32, 68, 104, 120, 0, 0, 113], [5, 32, 68,...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
5,"[1478195721.948674, 1478195721.948674, 1478195...","[[0, 1, 5, 3], [0, 1, 5, 3], [0, 1, 5, 3], [0,...","[[0, 33, 16, 255, 0, 255, 0, 0], [0, 33, 16, 2...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [112]:
def write_tfrecord(data, filename):
    tfrecord_writer = tf.io.TFRecordWriter(filename)
    for _, row in tqdm(data.iterrows()):
        X = (row['timestamp'], row['header'], row['payload'])
        Y = row['label']
        tfrecord_writer.write(serialize_example(X, Y))
    tfrecord_writer.close() 

In [113]:
write_tfrecord()

TypeError: write_tfrecord() missing 2 required positional arguments: 'data' and 'filename'

ModuleNotFoundError: No module named 'dataPreprocess'