In [1]:
import numpy as np
import pandas as pd 
import torch

import torch.optim as optim

from torch import nn
from pyarrow import csv
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from torch.utils.data import IterableDataset, Dataset, DataLoader
from torch.utils.data.dataset import T_co

In [2]:
# Use the first CUDA GPU when one is available; otherwise fall back to the CPU
# so that the later `.to(device)` calls still work on machines without a GPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [3]:
def read_transform_indicator(filename):
    """Load a labelled flow CSV and split it into features, labels and attack names.

    The file is read with ``pd.read_csv``. The ``Unnamed: 0``, ``Label``,
    ``Attack`` and ``SimillarHTTP`` columns are removed from the feature
    frame, and missing or infinite values (either sign) in the two rate
    columns ``'Flow Bytes/s'`` and ``' Flow Packets/s'`` are replaced by 0.

    Args:
        filename: Path (or anything else ``pd.read_csv`` accepts) of the CSV.

    Returns:
        tuple: ``(X, Y, attack_type)`` where ``X`` is the feature DataFrame,
        ``Y`` is the ``Label`` column and ``attack_type`` is the ``Attack``
        column converted to pandas' ``string`` dtype.
    """
    df = pd.read_csv(filename)
    Y = df.Label
    X = df.drop(['Unnamed: 0', 'Label'], axis=1)
    del df

    attack_type = X['Attack'].astype("string")
    X = X.drop(['Attack', 'SimillarHTTP'], axis=1)

    # Column names are used verbatim; note the leading space in ' Flow Packets/s'.
    for column in ('Flow Bytes/s', ' Flow Packets/s'):
        rates = X[column]
        # NaN, +inf and -inf are all unusable downstream, so zero every one of them.
        invalid = rates.isnull() | rates.isin([np.inf, -np.inf])
        X.loc[invalid, column] = 0

    return X, Y, attack_type

def scale(X):
    """Placeholder for feature scaling: ignores ``X`` and returns ``None``."""
    return None

In [1]:
def default_transform(df):
    """Identity transform: hand the given object back untouched."""
    unchanged = df
    return unchanged

def read_chunk(name, chunk_id, opts=csv.ConvertOptions(), transform=default_transform, extra=1):
    """Return one block of a CSV file, converted to pandas and transformed.

    The file is opened with pyarrow's streaming CSV reader using a
    ``block_size`` of ``extra * 10 ** 6`` and read from the start until the
    block at position ``chunk_id`` (0-based) is reached.

    Args:
        name: Path of the CSV file.
        chunk_id: Zero-based position of the block to return.
        opts: ``csv.ConvertOptions`` forwarded to ``csv.open_csv``.
        transform: Callable applied to the block's DataFrame before returning.
        extra: Multiplier applied to the base block size of ``10 ** 6``.

    Returns:
        ``transform(block.to_pandas())`` for the requested block, or ``None``
        when the reader is exhausted before reaching ``chunk_id``.
    """
    read_opts = csv.ReadOptions(use_threads=True, block_size=extra * 10 ** 6)
    with csv.open_csv(name, convert_options=opts, read_options=read_opts) as reader:
        for position, batch in enumerate(reader):
            if batch is None:
                break
            if position == chunk_id:
                return transform(batch.to_pandas())

    return None

def prep_NF_UQ(df):
    """Split a chunk into a feature matrix and a label vector.

    The port, address, ``Attack`` and ``Dataset`` columns are discarded, and
    the ``Label`` column is separated from the remaining feature columns.
    The input frame is left unmodified.

    Args:
        df: DataFrame containing the columns listed in ``features_to_remove``
            plus a ``Label`` column.

    Returns:
        tuple: ``(X, Y)`` as NumPy arrays, where ``X`` holds the remaining
        feature columns and ``Y`` holds the ``Label`` values.
    """
    features_to_remove = ['L4_SRC_PORT', 'L4_DST_PORT', 'Attack', 'Dataset', 'IPV4_SRC_ADDR', 'IPV4_DST_ADDR']
    # drop() without inplace returns a new frame, so the caller's DataFrame keeps
    # all of its columns and the function can safely be called again on it.
    features = df.drop(columns=features_to_remove + ['Label'])

    return features.to_numpy(), df['Label'].to_numpy()

def prep_ddos(df):
    """Reshape a DDoS flow chunk into the NF-style column layout.

    Identifier columns are dropped, a fixed set of columns is renamed, the
    packet/byte/protocol columns are cast to ``int64`` and the two IAT totals
    are divided by 1,000,000 before being truncated to ``int64``. Constant
    ``Label`` (1) and ``mask`` (2) columns are appended. The input frame is
    left unmodified.

    NOTE(review): every row receives ``Label = 1`` regardless of the value in
    the renamed ``Attack`` column — confirm that this is intended for any
    benign rows the file may contain.

    Args:
        df: DataFrame containing all columns named in ``features_to_remove``
            and in the rename mapping.

    Returns:
        A new DataFrame with the renamed, cast and appended columns.

    Raises:
        KeyError: If a column to drop or rename is missing
            (``rename`` is called with ``errors="raise"``).
    """
    features_to_remove = ['Unnamed: 0', 'Flow ID', ' Source IP', ' Source Port', ' Destination IP', ' Destination Port', ' Timestamp']
    renamed_columns = {
        ' Protocol': 'PROTOCOL',
        ' Total Fwd Packets': 'IN_PKTS',
        ' Total Backward Packets': 'OUT_PKTS',
        'Total Length of Fwd Packets': 'IN_BYTES',
        ' Total Length of Bwd Packets': 'OUT_BYTES',
        'Fwd IAT Total': 'DURATION_IN',
        'Bwd IAT Total': 'DURATION_OUT',
        ' Label': 'Attack',
        ' Flow Duration': 'FLOW_DURATION_MILLISECONDS',
    }

    # Non in-place drop/rename build new frames, leaving the caller's data intact.
    out = df.drop(columns=features_to_remove).rename(columns=renamed_columns, errors="raise")

    for column in ('PROTOCOL', 'IN_PKTS', 'OUT_PKTS', 'IN_BYTES', 'OUT_BYTES'):
        out[column] = out[column].astype(np.int64)
    for column in ('DURATION_IN', 'DURATION_OUT'):
        # Scale down by 1e6 first; the int64 cast then truncates the fraction.
        out[column] = (out[column] / 1000000).astype(np.int64)

    out['Label'] = 1
    out['mask'] = 2
    return out

NameError: name 'csv' is not defined

In [12]:
class ReadBigCSV(IterableDataset):
    """Iterable dataset that streams a CSV file block by block.

    Each iteration opens the file with pyarrow's streaming CSV reader using a
    ``block_size`` of ``chunkscale_linear ** chunkscale_exp`` and yields
    ``transform(block.to_pandas())`` for every block read.

    Args:
        filename: Path of the CSV file.
        opts: ``csv.ConvertOptions`` forwarded to ``csv.open_csv``.
        transform: Callable applied to each block's DataFrame.
        chunkscale_linear: Base of the block-size power.
        chunkscale_exp: Exponent of the block-size power.
    """

    def __init__(self, filename, opts=csv.ConvertOptions(), transform=default_transform, chunkscale_linear=10, chunkscale_exp=6):
        self.filename = filename
        self.opts = opts
        self.transform = transform
        self.chunkscale_linear = chunkscale_linear
        self.chunkscale_exp = chunkscale_exp

    def __getitem__(self, index) -> T_co:
        # Random access is not supported; the data can only be iterated.
        raise NotImplementedError

    def __iter__(self):
        block_size = self.chunkscale_linear ** self.chunkscale_exp
        read_opts = csv.ReadOptions(use_threads=True, block_size=block_size)
        with csv.open_csv(self.filename, convert_options=self.opts, read_options=read_opts) as reader:
            for batch in reader:
                if batch is None:
                    break
                yield self.transform(batch.to_pandas())

In [22]:
class modelDNN(nn.Module):
    """Fully connected binary classifier: in_features -> 4096 -> 2048 -> 1024 -> 64 -> 1.

    Every hidden stage applies Linear -> ReLU -> BatchNorm1d -> Dropout(p=0.1).
    The last layer has no activation, so ``forward`` returns one raw logit
    per input row.

    Args:
        in_features: Width of each input row. Defaults to 39, the width that
            was previously hard-coded.
    """

    def __init__(self, in_features=39):
        super().__init__()
        self.layer_1 = nn.Linear(in_features, 4096)
        self.layer_2 = nn.Linear(4096, 2048)
        self.layer_3 = nn.Linear(2048, 1024)
        self.layer_4 = nn.Linear(1024, 64)
        self.layer_5 = nn.Linear(64, 1)

        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(p=0.1)
        self.batchnorm1 = nn.BatchNorm1d(4096)
        self.batchnorm2 = nn.BatchNorm1d(2048)
        self.batchnorm3 = nn.BatchNorm1d(1024)
        self.batchnorm4 = nn.BatchNorm1d(64)

    def forward(self, inputs):
        """Map a ``(batch, in_features)`` tensor to ``(batch, 1)`` logits."""
        hidden_stages = (
            (self.layer_1, self.batchnorm1),
            (self.layer_2, self.batchnorm2),
            (self.layer_3, self.batchnorm3),
            (self.layer_4, self.batchnorm4),
        )
        x = inputs
        for linear, norm in hidden_stages:
            # Same order in every stage: activation, then normalisation, then dropout.
            x = self.dropout(norm(self.relu(linear(x))))

        return self.layer_5(x)
    
class tabularCNN(nn.Module):
    """Binary classifier that lifts tabular rows into a 64x64 "image" for a small CNN.

    Two dense stages expand each row to 4096 values, which are reshaped to
    ``(1, 64, 64)``. Three Conv2d(3x3) + ReLU + MaxPool2d(2) stages reduce
    the spatial size 64 -> 62 -> 31 -> 29 -> 14 -> 12 -> 6, leaving
    32 * 6 * 6 = 32 * 36 features for the dense head. The last layer has no
    activation, so ``forward`` returns one raw logit per input row.

    Args:
        in_features: Width of each input row. Defaults to 39, the width that
            was previously hard-coded.
    """

    def __init__(self, in_features=39):
        super().__init__()
        self.layer_1 = nn.Linear(in_features, 1024)
        self.layer_2 = nn.Linear(1024, 4096)
        self.layer_3 = nn.Linear(32*36, 1024)
        self.layer_4 = nn.Linear(1024, 1)

        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(p=0.1)
        self.batchnorm1 = nn.BatchNorm1d(1024)
        self.batchnorm2 = nn.BatchNorm1d(4096)
        self.batchnorm3 = nn.BatchNorm1d(1024)

        self.pool = nn.MaxPool2d(2, 2)
        self.conv_1 = nn.Conv2d(1, 32, 3)
        self.conv_2 = nn.Conv2d(32, 32, 3)
        self.conv_3 = nn.Conv2d(32, 32, 3)

    def forward(self, inputs):
        """Map a ``(batch, in_features)`` tensor to ``(batch, 1)`` logits."""
        # Dense expansion: in_features -> 1024 -> 4096.
        x = self.dropout(self.batchnorm1(self.relu(self.layer_1(inputs))))
        x = self.dropout(self.batchnorm2(self.relu(self.layer_2(x))))

        # 4096 values per row become a single-channel 64x64 map.
        batch = len(x)
        x = x.reshape(batch, 1, 64, 64)
        for conv in (self.conv_1, self.conv_2, self.conv_3):
            x = self.pool(self.relu(conv(x)))
        # 32 channels of 6x6 remain after the third pooling step.
        x = x.reshape(batch, 32*36)

        x = self.dropout(self.batchnorm3(self.relu(self.layer_3(x))))
        return self.layer_4(x)

In [7]:
# Load only the first block (chunk_id=0) of the CSV and split it into features X and labels Y.
X, Y = prep_NF_UQ(read_chunk('data/NFUQ/NF-UQ-NIDS-v2.csv', chunk_id=0))

In [8]:
# Fit a MinMaxScaler on the chunk loaded above; only the fitted `scaler` is kept,
# the scaled features and the labels are deleted right away.
scaler = MinMaxScaler()
X = scaler.fit_transform(X)
del X, Y

In [14]:
def binary_acc(y_pred, y_test):
    """Percentage of rows whose thresholded prediction equals the target.

    Args:
        y_pred: Tensor of raw logits; sigmoid and rounding are applied here.
        y_test: Tensor of target values, compared element-wise after both
            tensors are flattened.

    Returns:
        NumPy float in the range 0-100: matches divided by ``y_test.shape[0]``,
        times 100.
    """
    predicted = torch.sigmoid(y_pred).round().detach().cpu().numpy().ravel()
    expected = y_test.detach().cpu().numpy().ravel()

    matches = (predicted == expected).sum()
    return matches / y_test.shape[0] * 100

In [26]:
# Stream the CSV with a block size of 10 ** 5; every block is turned into an (X, Y) pair by prep_NF_UQ.
data = ReadBigCSV('data/NFUQ/NF-UQ-NIDS-v2.csv', transform=prep_NF_UQ, chunkscale_linear=10, chunkscale_exp=5)
# batch_size=1 wraps each yielded chunk in an extra leading dimension of size 1
# (which is why the training loop indexes X[0] / Y[0]).
loader = DataLoader(dataset=data, batch_size=1)

model = tabularCNN()
model.to(device)
# The model returns raw logits, so the loss applies the sigmoid itself.
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model.parameters(), lr=0.0005)
# NOTE(review): MultiplicativeLR multiplies the *current* learning rate by the
# lambda's value on every scheduler step, so a factor of 0.65 ** epoch compounds
# across epochs — confirm this faster-than-exponential decay is intended.
lmbda = lambda epoch: 0.65 ** epoch
scheduler = torch.optim.lr_scheduler.MultiplicativeLR(optimizer, lr_lambda=lmbda)
# Running sum of per-chunk accuracies, accumulated by the training loop.
epoch_acc = 0
# NOTE(review): `total` is not used by the training loop visible in this notebook.
total = 0
model.train()

tabularCNN(
  (layer_1): Linear(in_features=39, out_features=1024, bias=True)
  (layer_2): Linear(in_features=1024, out_features=4096, bias=True)
  (layer_3): Linear(in_features=1152, out_features=1024, bias=True)
  (layer_4): Linear(in_features=1024, out_features=1, bias=True)
  (relu): ReLU()
  (dropout): Dropout(p=0.1, inplace=False)
  (batchnorm1): BatchNorm1d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (batchnorm2): BatchNorm1d(4096, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (batchnorm3): BatchNorm1d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (pool): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (conv_1): Conv2d(1, 32, kernel_size=(3, 3), stride=(1, 1))
  (conv_2): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1))
  (conv_3): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1))
)

In [16]:
def convertData32(X, Y, limit=10):
    """Cast features to float32 and drop every row holding an unusable value.

    A row is removed when any of its values is NaN, infinite, greater than
    ``limit`` or smaller than ``-limit``. The same rows are removed from ``Y``.

    Args:
        X: 2-D NumPy array of features (one row per sample).
        Y: Labels indexable by a boolean NumPy mask of length ``len(X)``.
        limit: Largest absolute value a feature may have for its row to be
            kept. Defaults to 10.

    Returns:
        tuple: ``(X_kept, Y_kept)`` with ``X_kept`` as float32.
    """
    X = X.astype('float32')
    # One pass instead of four: NaN fails the `<=` comparison on its own,
    # and isfinite additionally rejects +/-inf even for an infinite limit.
    keep = (np.isfinite(X) & (np.abs(X) <= limit)).all(axis=1)
    return X[keep], Y[keep]

In [17]:
def convertData32Scale(X, Y, limit=100000000):
    """Cast raw (unscaled) features to float32 and drop rows with unusable values.

    A row is removed when any of its values is NaN, infinite, greater than
    ``limit`` or smaller than ``-limit``. The same rows are removed from ``Y``.

    Args:
        X: 2-D NumPy array of features (one row per sample).
        Y: Labels indexable by a boolean NumPy mask of length ``len(X)``.
        limit: Largest absolute value a feature may have for its row to be
            kept. Defaults to 100000000.

    Returns:
        tuple: ``(X_kept, Y_kept)`` with ``X_kept`` as float32.
    """
    X = X.astype('float32')
    # One pass instead of four: NaN fails the `<=` comparison on its own,
    # and isfinite additionally rejects +/-inf even for an infinite limit.
    keep = (np.isfinite(X) & (np.abs(X) <= limit)).all(axis=1)
    return X[keep], Y[keep]

In [18]:
def getScalerStandart(filename, transform=default_transform):
    """Fit a ``StandardScaler`` incrementally over a whole CSV file.

    The file is streamed with a ``block_size`` of ``10 ** 6``. Each block is
    converted to pandas, passed through ``transform`` (which must return an
    ``(X, Y)`` pair), filtered with ``convertData32Scale`` and fed to
    ``partial_fit``.

    Args:
        filename: Path of the CSV file.
        transform: Callable turning a block's DataFrame into ``(X, Y)``.

    Returns:
        The ``StandardScaler`` after all blocks have been seen.
    """
    fitted = StandardScaler()
    read_opts = csv.ReadOptions(use_threads=True, block_size=10 ** 6)
    with csv.open_csv(filename, convert_options=csv.ConvertOptions(), read_options=read_opts) as reader:
        for batch in reader:
            if batch is None:
                break
            features, labels = transform(batch.to_pandas())
            features, labels = convertData32Scale(features, labels)
            fitted.partial_fit(features)
    return fitted

In [19]:
# Build a scaler from the full CSV rather than from a single chunk.
# NOTE(review): getScalerMinMax is not defined anywhere in the visible notebook
# (only getScalerStandart is) — confirm where it comes from before a fresh run.
fullscaler = getScalerMinMax('data/NFUQ/NF-UQ-NIDS-v2.csv', transform=prep_NF_UQ)

In [20]:
# Rebind `scaler` to the full-file scaler; the training loop reads `scaler`.
scaler = fullscaler

In [27]:
%%time

i = 0
for epoch in range(10):
    for X, Y in loader:
        i+=1
        X, Y = convertData32(scaler.transform(X[0]), Y[0])
        X = torch.tensor(X).to(torch.float32).detach().requires_grad_(True).to(device)
        Y = Y.to(torch.float32).detach().requires_grad_(True).to(device)
        optimizer.zero_grad()

        y_pred = model(X)
        loss = criterion(y_pred, Y.unsqueeze(1))
        acc = binary_acc(y_pred, Y.unsqueeze(1))
        if i % 100 == 0:
            print(i)
            print(acc)
        epoch_acc += acc.item()
        loss.backward()
        optimizer.step()
    scheduler.step()

100
94.0952380952381
200
93.5361216730038
300
93.3085501858736
400
95.66854990583803
500
95.38461538461539
600
94.91525423728814
700
95.61904761904762
800
95.43726235741445
900
94.17293233082707
1000
95.76923076923077
1100
95.84905660377359
1200
96.12403100775194
1300
95.27410207939508
1400
95.93345656192237
1500
96.41509433962264
1600
96.2406015037594
1700
96.4774951076321
1800
96.42857142857143
1900
96.40151515151516
2000
97.91666666666666
2100
96.2406015037594
2200
95.62737642585552
2300
96.7741935483871
2400
97.52380952380952
2500
97.33840304182509
2600
96.55172413793103
2700
97.32824427480917
2800
97.52380952380952
2900
96.73704414587331
3000
96.42184557438794
3100
97.91271347248576
3200
97.02048417132217
3300
97.91666666666666
3400
96.53846153846153
3500
96.03024574669186
3600
96.98681732580037
3700
97.270955165692
3800
96.95238095238096
3900
97.33840304182509
4000
96.26865671641791
4100
97.02048417132217
4200
97.32824427480917
4300
94.7069943289225
4400
96.58444022770398
4500
95

KeyboardInterrupt: 

In [29]:
# Pickles the whole module object (not only its state_dict), so loading the
# file later requires the tabularCNN class definition to be importable.
torch.save(model, "CNN.torch")

In [None]:
# NOTE(review): X_batch is not assigned anywhere in the visible notebook and this
# cell has no execution count — presumably a leftover; confirm before running.
del X_batch

In [30]:
# Removes only the name `data`; `loader` was built with dataset=data and still references the dataset.
del data

In [276]:
# Hard 0/1 predictions for the current batch: sigmoid of the logits, rounded.
torch.round(torch.sigmoid(model(X)))

tensor([[1.],
        [1.],
        [0.],
        ...,
        [1.],
        [0.],
        [0.]], device='cuda:0', grad_fn=<RoundBackward0>)

In [237]:
# Debug check: number of NaN entries in the current input tensor X.
np.isnan(X.cpu().detach().numpy()).sum()

0

In [249]:
# Debug check: largest value in `sus1`.
# NOTE(review): sus1 is not defined anywhere in the visible notebook.
np.amax(sus1.cpu().detach().numpy())

4.056338

In [274]:
# Raw logits the model produces for the current batch X.
model(X)

tensor([[ 10.1656],
        [  1.8402],
        [ -5.5686],
        ...,
        [ 12.5682],
        [-12.2079],
        [ -4.4148]], device='cuda:0', grad_fn=<AddmmBackward0>)

In [246]:
# Raw model output for `sus2` (all NaN in the recorded output).
# NOTE(review): sus2 is not defined anywhere in the visible notebook.
model(sus2)

tensor([[nan],
        [nan],
        [nan],
        ...,
        [nan],
        [nan],
        [nan]], device='cuda:0', grad_fn=<AddmmBackward0>)

In [None]:
def get_metrics(clf, X, Y, verbal=False):
    """Compute the accuracy of ``clf`` on ``(X, Y)`` and optionally plot a confusion matrix.

    NOTE(review): ``accuracy_score``, ``confusion_matrix``, ``plt`` and ``sns``
    are not imported in the visible part of the notebook — they must be in
    scope before this function is called.

    Args:
        clf: Object whose ``transform(X)`` returns the predicted labels.
        X: Inputs passed to ``clf.transform``.
        Y: True labels.
        verbal: When True, print the accuracy and show a confusion-matrix
            heatmap with 'benign' / 'malicious' tick labels.

    Returns:
        The value returned by ``accuracy_score(Y, predictions)``.
    """
    # Predict once and reuse the result for both the accuracy and the matrix.
    predicted = clf.transform(X)
    acc = accuracy_score(Y, predicted)

    if verbal:
        print(f"accuracy: {acc}")

        # Unpacking into four values requires a 2x2 confusion matrix.
        tn, fp, fn, tp = confusion_matrix(Y, predicted).ravel()

        ax= plt.subplot()
        sns.heatmap([[tn, fp],[fn, tp]], annot=True, fmt='g', ax=ax)

        ax.set_xlabel('Predicted labels')
        ax.set_ylabel('True labels')
        ax.set_title('Confusion Matrix')
        ax.xaxis.set_ticklabels(['benign', 'malicious'])
        ax.yaxis.set_ticklabels(['benign', 'malicious'])
        plt.show()
    return acc

In [200]:
# Debug check: infinities remaining after scaling, cleaning and the float32 cast.
# NOTE(review): cleanData is not defined anywhere in the visible notebook.
np.isinf(torch.tensor(cleanData(scaler.transform(X[0]))).to(torch.float32).numpy()).sum()

0

In [142]:
# Scaled copy of the current chunk (leading batch dimension stripped), cast to float32.
t = scaler.transform(X[0]).astype('float32')

In [198]:
# Debug check: shape of the scaled chunk after cleaning.
# NOTE(review): cleanData is not defined anywhere in the visible notebook.
cleanData(scaler.transform(X[0])).shape

(5527, 39)

In [182]:
# Same shape check on the float32 copy `t`.
# NOTE(review): cleanData is not defined anywhere in the visible notebook.
cleanData(t).shape

(5527, 39)

In [57]:
# Raw logits for X (all NaN in the recorded output of this run).
model(X)

tensor([[nan],
        [nan],
        [nan],
        ...,
        [nan],
        [nan],
        [nan]], device='cuda:0', grad_fn=<AddmmBackward0>)

In [196]:
# Debug check: number of infinities in the scaled chunk, before any cleaning.
np.isinf(scaler.transform(X[0])).sum()

0

In [109]:
# Running sum of per-chunk accuracy percentages accumulated by the training loop (not an average).
epoch_acc

77913.5812638262

In [70]:
# Fails with the recorded ValueError: X here is 3-D, presumably because it still
# carries the leading batch dimension added by the DataLoader; X[0] is used elsewhere.
scaler.transform(X)

ValueError: Found array with dim 3. Estimator expected <= 2.

In [77]:
# Eyeball the magnitude of the scaled features (divided by 1000 for display only).
scaler.transform(X[0]) / 1000

array([[5.68181818e-05, 3.09734513e-05, 5.42905251e-07, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [5.68181818e-05, 3.09734513e-05, 8.18291973e-07, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [5.68181818e-05, 0.00000000e+00, 8.65501125e-08, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       ...,
       [5.68181818e-05, 0.00000000e+00, 7.86819205e-08, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [1.81818182e-04, 0.00000000e+00, 1.04253545e-07, ...,
        3.92156863e-06, 9.92063492e-08, 0.00000000e+00],
       [5.68181818e-05, 3.09734513e-05, 5.42905251e-07, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00]])