In [7]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import os
import time
import warnings
import csv
import pandas as pd
import numpy as np
import math
import torch.nn.functional as F
from sklearn.metrics import accuracy_score, recall_score, f1_score, precision_score, classification_report, confusion_matrix

In [8]:
def hex_to_int(hex_value):
    """Parse a hexadecimal string (e.g. ``'f'`` or ``'1a'``) into an integer."""
    return int(hex_value, 16)

In [9]:
def hex_string_to_array(hex_string):
    """Convert a string of hex digits into a list with one integer per character.

    The literal string ``'z'`` is treated as a marker and yields an empty list.
    """
    if hex_string != 'z':
        return [hex_to_int(digit) for digit in hex_string]
    return []

In [10]:
class DNN(nn.Module):
    """Fully connected network ``d_in -> 128 -> 64 -> d_out`` with ReLU after every layer.

    Because the last layer is also followed by ReLU, every output value is >= 0.
    In this file it is built as ``DNN(config.slsum_count, config.dnn_out_d)`` and
    ``DNN(60, config.head_dnn_out_d)``.
    """

    def __init__(self, d_in, d_out):
        super().__init__()
        # Attribute names are part of the state_dict keys; keep them stable.
        self.l1 = nn.Linear(d_in, 128)
        self.l2 = nn.Linear(128, 64)
        self.l3 = nn.Linear(64, d_out)

    def forward(self, x):
        """Apply the three Linear+ReLU stages to the last dimension of ``x``."""
        hidden = x
        for layer in (self.l1, self.l2, self.l3):
            hidden = F.relu(layer(hidden))
        return hidden

In [11]:
class Config:
    """Hyper-parameters, device selection and file locations for the experiment.

    Side effect: constructing a ``Config`` creates the ``model_save_path``
    directory (including missing parent directories) if it does not exist.
    """

    def __init__(self):
        self.model_name = 'Transformer'
        self.slide_window = 2
        # Length of the sliding-window count feature (n-gram?): 16 ** slide_window.
        self.slsum_count = 16 ** self.slide_window
        # Dimension of the sliding-window count feature after it passes through the DNN.
        self.dnn_out_d = 8
        self.head_dnn_out_d = 32
        # Dimension of the transformer input feature: dnn_out_d + header feature width.
        self.d_model = self.dnn_out_d + self.head_dnn_out_d
        self.pad_size = 100
        self.max_time_position = 10000
        # Multi-head attention requires d_model to be divisible by nhead (40 / 5 here).
        self.nhead = 5
        self.num_layers = 3
        self.gran = 1e-6
        self.log_e = 2
        self.device = torch.device('mps' if torch.backends.mps.is_available() else 'cpu')
        self.classes_num = 3
        self.batch_size = 10
        self.epoch_num = 5
        self.lr = 0.001
        self.train_pro = 0.8  # Ratio of the data used for training.

        self.data_root_dir = '../data/car-hacking'
        self.sl_sum_dir = '../data/car_hacking-data_slide_count_' + str(
            self.slide_window) + '_arr'
        self.time_dir = '../data/car_hacking-data_time'
        self.names_file = '../data/name_class_CICIDS_3.csv'
        self.model_save_path = '../model/' + self.model_name + '/'
        # makedirs (not mkdir) so a missing parent '../model' does not raise, and
        # exist_ok so an already-present directory is not an error.
        os.makedirs(self.model_save_path, exist_ok=True)
        # NOTE(review): absolute, machine-specific path; the other locations are
        # relative ('../...'). Consider making this relative as well.
        self.result_file = '/Users/d41sy/Desktop/sch/coding/ml-ids/result/trans8_performance.txt'

        self.isload_model = False  # Whether to load a saved model and continue training.
        self.start_epoch = 24  # Epoch of the model to load.
        # Path of the model to use.
        # NOTE(review): this starts with 'model/' while model_save_path starts with
        # '../model/' - confirm which directory checkpoints are actually read from.
        self.model_path = 'model/' + self.model_name + '/' + self.model_name + '_model_' + str(self.start_epoch) + '.pth'


In [12]:
class Time_Positional_Encoding(nn.Module):
    """Add an externally computed time encoding to a sequence-first tensor.

    The module has no learnable parameters. ``embed`` and ``max_time_position``
    are accepted by the constructor but not used; the encoding values are
    supplied on every call to ``forward``.
    """

    def __init__(self, embed, max_time_position, device):
        super(Time_Positional_Encoding, self).__init__()
        # Kept as an attribute for callers that read it; forward() follows x's device.
        self.device = device

    def forward(self, x, time_position):
        """Return ``x`` plus ``time_position``.

        ``x`` is permuted with (1, 0, 2) before the addition and permuted back
        afterwards, so ``time_position`` must match ``x`` with its first two
        axes swapped.
        """
        out = x.permute(1, 0, 2)
        # The encoding is a constant input: detach it instead of wrapping it in a
        # new nn.Parameter per call, and place it on the same device as x so the
        # addition cannot fail with a device mismatch.
        out = out + time_position.detach().to(out.device)
        return out.permute(1, 0, 2)

In [8]:
class MyTrans(nn.Module):
    """Transformer-encoder classifier over per-step header and sliding-window features.

    Each sequence step is embedded by two small DNNs (one for the header vector,
    one for the sliding-window count vector). The two embeddings are concatenated
    to ``config.d_model`` features, a time encoding is added, the sequence goes
    through a Transformer encoder, is summed over the sequence axis and projected
    to ``config.classes_num`` logits.
    """

    def __init__(self, config):
        super(MyTrans, self).__init__()
        # All sub-modules are placed on config.device, consistently.
        self.dnn = DNN(config.slsum_count, config.dnn_out_d).to(config.device)
        # 60 is the width of one header vector - presumably the number of hex
        # digits produced per packet by the dataset; TODO confirm against the data.
        self.head_dnn = DNN(60, config.head_dnn_out_d).to(config.device)
        self.position_embedding = Time_Positional_Encoding(config.d_model, config.max_time_position, config.device).to(
            config.device)
        # Kept as an attribute so existing state_dict keys ('encoder_layer.*') stay valid.
        self.encoder_layer = nn.TransformerEncoderLayer(d_model=config.d_model, nhead=config.nhead).to(config.device)
        self.transformer_encoder = nn.TransformerEncoder(self.encoder_layer, num_layers=config.num_layers).to(
            config.device)
        self.fc = nn.Linear(config.d_model, config.classes_num).to(config.device)
        self.pad_size = config.pad_size
        self.dnn_out_d = config.dnn_out_d
        self.head_dnn_out_d = config.head_dnn_out_d

    def forward(self, header, sl_sum, mask, time_position):
        """Compute class logits for a batch.

        Args:
            header: tensor indexed as ``[batch, step, header_feature]``.
            sl_sum: tensor indexed as ``[batch, step, count_feature]``.
            mask: passed to the encoder as ``src_key_padding_mask``.
            time_position: time encoding added to the embedded sequence; indexed
                as ``[batch, step, d_model]``.

        Returns:
            Tensor of shape ``(batch, classes_num)``.
        """
        # nn.Linear acts on the last dimension, so the DNNs can embed every step of
        # every sequence in one call. This replaces a per-step loop that grew a
        # tensor with repeated concatenation, starting from a torch.empty created
        # on the default device (a device mismatch once the model is off-CPU).
        dnn_out = self.dnn(sl_sum[:, :self.pad_size, :])            # (batch, step, dnn_out_d)
        head_dnn_out = self.head_dnn(header[:, :self.pad_size, :])  # (batch, step, head_dnn_out_d)

        # Concatenate features, then switch to (step, batch, d_model) for the encoder.
        x = torch.cat((head_dnn_out, dnn_out), dim=2).permute(1, 0, 2)

        out = self.position_embedding(x, time_position)
        out = self.transformer_encoder(out, src_key_padding_mask=mask)
        out = out.permute(1, 0, 2)
        # NOTE(review): the sum runs over all steps, including padded ones.
        out = torch.sum(out, 1)
        out = self.fc(out)
        return out

../data/car-hacking/Fuzzy_dataset.csv


In [5]:
def draw_confusion(label_y, pre_y, path):
    """Print the confusion matrix of true labels ``label_y`` versus predictions ``pre_y``.

    ``path`` is accepted but not used by this function.
    """
    matrix = confusion_matrix(label_y, pre_y)
    print(matrix)

In [6]:
def _emit_metric(fin, label, value, end='\n'):
    """Print one labelled metric and append the same line to the open file ``fin``."""
    print(label, value)
    fin.write(label + str(value) + end)


def write_result(fin, label_y, pre_y, classes_num):
    """Print evaluation metrics and append them to an open, writable text file.

    Args:
        fin: open text file object the results are written to.
        label_y: ground-truth labels.
        pre_y: predicted labels.
        classes_num: number of classes. More than 2 reports accuracy plus
            macro- and micro-averaged precision/recall/F1; otherwise accuracy,
            recall, precision and F1 are computed with the scorers' default
            averaging.

    In both cases a ``-- test result:`` header precedes the metrics and the
    scikit-learn classification report follows them in the file.
    """
    accuracy = accuracy_score(label_y, pre_y)
    if classes_num > 2:
        metrics = [
            ('    -- accuracy: ', accuracy),
            ('    -- macro precision: ', precision_score(label_y, pre_y, average='macro')),
            ('    -- macro recall: ', recall_score(label_y, pre_y, average='macro')),
            ('    -- macro f1 score: ', f1_score(label_y, pre_y, average='macro')),
            ('    -- micro precision: ', precision_score(label_y, pre_y, average='micro')),
            ('    -- micro recall: ', recall_score(label_y, pre_y, average='micro')),
            ('    -- micro f1 score: ', f1_score(label_y, pre_y, average='micro')),
        ]
    else:
        metrics = [
            ('    -- accuracy: ', accuracy),
            ('    -- recall: ', recall_score(label_y, pre_y)),
            ('    -- precision: ', precision_score(label_y, pre_y)),
            ('    -- f1 score: ', f1_score(label_y, pre_y)),
        ]

    # The header is written to the file for both branches (previously the binary
    # branch only printed it, leaving the file sections inconsistent).
    print('  -- test result: ')
    fin.write('  -- test result: \n')

    last = len(metrics) - 1
    for position, (label, value) in enumerate(metrics):
        # The final metric is followed by a blank line before the report.
        _emit_metric(fin, label, value, end='\n\n' if position == last else '\n')

    fin.write(classification_report(label_y, pre_y))
    fin.write('\n\n')

In [35]:
# Shared experiment configuration used by the cells below. Constructing Config
# also creates the model save directory when it does not exist yet.
config = Config()

In [36]:
# Append a header describing this run (model, start time, data locations and
# hyper-parameters) to the result file. The context manager guarantees the
# handle is closed even if one of the writes raises.
with open(config.result_file, 'a') as fin:
    fin.write('-------------------------------------\n')
    fin.write(config.model_name + '\n')
    fin.write('begin time: ' + time.strftime('%Y-%m-%d %H:%M:%S', time.localtime()) + '\n')
    fin.write('data root dir: ' + config.data_root_dir + '\n')
    fin.write('sl_sum_dir: ' + config.sl_sum_dir + '\n')
    fin.write('names_file: ' + config.names_file + '\n')
    fin.write(
        f'd_model: {config.d_model}\t pad_size: {config.pad_size}\t nhead: {config.nhead}'
        f'\t num_layers: {config.num_layers}\t head_dnn_out_d: {config.head_dnn_out_d}\n'
    )
    fin.write(
        f'batch_size: {config.batch_size}\t train pro: {config.train_pro}'
        f'\t learning rate: {config.lr}\n\n'
    )

In [37]:
# Fix the random seed so that weight initialisation, the random train/test
# split and DataLoader shuffling are repeatable between runs.
seed = 1
torch.manual_seed(seed)
# Explicit CUDA seeding. Config in this notebook only selects 'mps' or 'cpu',
# so these two calls presumably matter only if a CUDA device is used - TODO confirm.
torch.cuda.manual_seed(seed)
torch.cuda.manual_seed_all(seed)

In [52]:
class MyDatasetSLForTransDNNT(Dataset):
    """Dataset yielding header, sliding-window-count and time-encoding features per sample.

    Every non-empty row of ``names_file`` is ``[relative_csv_name, label_text]``.
    The same relative name is looked up under ``root_dir`` (header CSV),
    ``sl_sum_dir`` (sliding-window counts) and ``time_dir`` (time values).

    Args:
        root_dir: directory holding the header CSV files.
        sl_sum_dir: directory holding the sliding-window count CSV files.
        time_dir: directory holding the time CSV files.
        names_file: CSV file listing sample names and labels.
        pad_size: sequence length every sample is truncated / padded to.
        embed: width of one time-encoding vector.
        max_time_position: number of rows in the time-encoding table.
        gran: divisor applied to a raw time value before taking the logarithm.
        log_e: base of that logarithm.
        transform: optional callable applied to the sample dict.

    Raises:
        FileNotFoundError: if ``names_file`` does not exist.
    """

    def __init__(self, root_dir, sl_sum_dir, time_dir, names_file, pad_size, embed, max_time_position, gran, log_e,
                 transform=None):
        self.root_dir = root_dir
        self.sl_sum_dir = sl_sum_dir
        self.time_dir = time_dir
        self.names_file = names_file
        self.transform = transform
        self.size = 0
        self.name_list = []
        self.pad_size = pad_size
        self.embed = embed
        self.max_time_position = max_time_position
        self.gran = gran
        self.log_e = log_e

        # Fail with a clear message instead of printing and then crashing in open().
        if not os.path.isfile(self.names_file):
            raise FileNotFoundError(self.names_file + ' does not exist!')
        with open(self.names_file, 'r', newline='') as names_f:
            # Blank rows carry no sample and would break __getitem__, so skip them.
            self.name_list = [row for row in csv.reader(names_f) if row]
        self.size = len(self.name_list)

        self.pe = self._build_time_encoding(self.embed, self.max_time_position)

    @staticmethod
    def _build_time_encoding(embed, max_time_position):
        """Build the ``(max_time_position, embed)`` sinusoidal lookup table.

        Entry ``[pos, i]`` is ``sin(angle)`` for even ``i`` and ``cos(angle)`` for
        odd ``i``, where ``angle = pos / 10000 ** ((i // 2) * 2 / embed)``.
        """
        positions = torch.arange(max_time_position, dtype=torch.float64).unsqueeze(1)
        pair_index = torch.div(torch.arange(embed), 2, rounding_mode='floor').to(torch.float64)
        divisors = torch.pow(10000.0, pair_index * 2.0 / embed)
        # Angles are computed in float64 and only then cast to the default dtype,
        # mirroring a table built from Python floats.
        angles = (positions / divisors).to(torch.get_default_dtype())
        table = torch.empty_like(angles)
        table[:, 0::2] = torch.sin(angles[:, 0::2])  # even columns use sin
        table[:, 1::2] = torch.cos(angles[:, 1::2])  # odd columns use cos
        return table

    def __len__(self):
        """Return the number of samples listed in the names file."""
        return self.size

    def get_time(self, time_position):
        """Select the encoding rows for the given integer time positions."""
        return torch.index_select(self.pe, 0, time_position)

    def __getitem__(self, idx):
        """Load one sample.

        Returns a dict with keys ``header``, ``sl_sum``, ``mask``, ``time``,
        ``label`` and ``idx`` (passed through ``transform`` when one is set), or
        ``None`` when the header or sliding-window CSV file is missing.
        """
        item = self.name_list[idx]
        feature_csv_path = os.path.join(self.root_dir, item[0])
        # SECURITY NOTE(review): eval() executes arbitrary text from the names file.
        # If labels are plain literals, ast.literal_eval (or int) would be safer.
        label = eval(item[1])
        if not os.path.exists(feature_csv_path):
            print(feature_csv_path, ' does not exist!')
            return None

        # The first two rows hold the two header parts; column 0 of each is skipped.
        with open(feature_csv_path, 'r') as feature_f:
            feature_reader = csv.reader(feature_f)
            ip_fields = next(feature_reader)[1:]
            tcp_fields = next(feature_reader)[1:]
        ip_header = np.array([hex_string_to_array(field) for field in ip_fields])[:self.pad_size]
        tcp_header = np.array([hex_string_to_array(field) for field in tcp_fields])[:self.pad_size]
        header = torch.from_numpy(np.hstack((ip_header, tcp_header)))

        slsum_csv_path = os.path.join(self.sl_sum_dir, item[0])
        if not os.path.exists(slsum_csv_path):
            print(slsum_csv_path, 'does not exist!')
            return None
        sl_sum = pd.read_csv(slsum_csv_path, header=None, index_col=None).values[:self.pad_size]
        sl_sum = torch.from_numpy(np.array(sl_sum))

        ori_seq_len = header.shape[0]
        pad_len = self.pad_size - ori_seq_len

        # Zero-pad along the step axis: transpose so steps are last, pad, transpose back.
        header = F.pad(header.T, (0, pad_len)).T.numpy()
        sl_sum = F.pad(sl_sum.T, (0, pad_len)).T.numpy()

        # Padding mask: False for real steps, True for padded steps.
        mask = np.arange(self.pad_size) >= ori_seq_len

        # Time: map each raw value to round(log_base(round(t / gran) + 1)).
        time_csv_path = os.path.join(self.time_dir, item[0])
        time_record = pd.read_csv(time_csv_path, header=None, index_col=None).values[0][:self.pad_size]
        time_steps = [int(round(math.log(round(raw / self.gran) + 1, self.log_e))) for raw in time_record]
        # Short sequences repeat their last time step up to pad_size.
        missing = self.pad_size - len(time_steps)
        if missing > 0:
            time_steps.extend([time_steps[-1]] * missing)

        time_feature = self.get_time(torch.tensor(time_steps, dtype=torch.long))

        sample = {'header': header, 'sl_sum': sl_sum, 'mask': mask, 'time': time_feature, 'label': label, 'idx': idx}

        if self.transform:
            sample = self.transform(sample)

        return sample

In [57]:
# Build the full dataset from the locations and sizes held in `config`;
# the time-encoding width equals the transformer input width (d_model).
dataset = MyDatasetSLForTransDNNT(
    root_dir=config.data_root_dir,
    sl_sum_dir=config.sl_sum_dir,
    time_dir=config.time_dir,
    names_file=config.names_file,
    pad_size=config.pad_size,
    embed=config.d_model,
    max_time_position=config.max_time_position,
    gran=config.gran,
    log_e=config.log_e,
)
size = len(dataset)
print(dataset.name_list)

In [6]:
# Randomly split the dataset into train/test parts according to config.train_pro
# and wrap each part in a DataLoader; only the training loader shuffles.
train_size = int(config.train_pro * size)
test_size = size - train_size
train_dataset, test_dataset = torch.utils.data.random_split(
    dataset,
    lengths=[train_size, test_size],
)
train_loader = DataLoader(train_dataset, batch_size=config.batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=config.batch_size)
print('finish load data')

NameError: name 'config' is not defined