In [1]:
import pandas as pd 
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt   

from catboost import CatBoostClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.base import TransformerMixin, BaseEstimator
from sklearn.preprocessing import StandardScaler,MinMaxScaler
from sklearn.model_selection import cross_val_score
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix, accuracy_score

from pyarrow import csv
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torch.utils.data import IterableDataset, Dataset, DataLoader
from torch.utils.data.dataset import T_co

In [2]:
def prep_UNSW(df):
    features_to_remove = ['L4_SRC_PORT', 'L4_DST_PORT', 'Attack']
    df.drop(columns=features_to_remove, axis=1, inplace=True)
    Y = df.Label

    return df.drop('Label', axis=1).to_numpy(), Y.to_numpy()

In [3]:
device = torch.device("cuda:0")

In [4]:
class modelDNN(nn.Module):
    def __init__(self):
        super(modelDNN, self).__init__()        
        self.layer_1 = nn.Linear(39, 4096) 
        self.layer_2 = nn.Linear(4096, 2048)
        self.layer_3 = nn.Linear(2048, 1024) 
        self.layer_4 = nn.Linear(1024, 64) 
        self.layer_5 = nn.Linear(64, 1)
        
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(p=0.1)
        self.batchnorm1 = nn.BatchNorm1d(4096)
        self.batchnorm2 = nn.BatchNorm1d(2048)
        self.batchnorm3 = nn.BatchNorm1d(1024)
        self.batchnorm4 = nn.BatchNorm1d(64)
        
    def forward(self, inputs):
        x = self.relu(self.layer_1(inputs))
        x = self.batchnorm1(x)
        x = self.dropout(x)
        x = self.relu(self.layer_2(x))
        x = self.batchnorm2(x)
        x = self.dropout(x)
        x = self.relu(self.layer_3(x))
        x = self.batchnorm3(x)
        x = self.dropout(x)
        x = self.relu(self.layer_4(x))
        x = self.batchnorm4(x)
        x = self.dropout(x)
        x = self.layer_5(x)
        
        return x

In [31]:
def default_transform(df):
    return df

def read_chunk(name, chunk_id, opts=csv.ConvertOptions(), transform=default_transform, extra=1):
    chunk=None
    chunksize = extra*10 ** 6
    with csv.open_csv(
        name,
        convert_options = opts,
        read_options=csv.ReadOptions(
            use_threads=True,
            block_size=chunksize
        )) as reader:

        i=0
        for next_chunk in reader:
            if next_chunk is None:
                break
            chunk=next_chunk
            if i == chunk_id:
                return transform(chunk.to_pandas())
            i+=1
            
    return None

def prep_NF_UQ(df):
    features_to_remove = ['L4_SRC_PORT', 'L4_DST_PORT', 'Attack']
    df.drop(columns=features_to_remove, axis=1, inplace=True)
    Y = df.Label

    return df.drop('Label', axis=1).to_numpy(), Y.to_numpy()

def prep_merged(df):
    features_to_remove = ['L4_SRC_PORT', 'L4_DST_PORT', 'Attack']
    df.drop(columns=features_to_remove, axis=1, inplace=True)
    Y = df.Label
    df = df.drop('Label', axis=1)
    for name in df.columns:
        print(name)
        print(df[name])
    print(df.columns)
    return df.to_numpy(), Y.to_numpy()

def prep_ddos(df):
    features_to_remove = ['Unnamed: 0', 'Flow ID', ' Source IP', ' Source Port', ' Destination IP', ' Destination Port', ' Timestamp']
    df.drop(columns=features_to_remove, axis=1, inplace=True) 
    
    df = df.rename(columns=
                   {' Protocol': 'PROTOCOL', 
                   ' Total Fwd Packets': 'IN_PKTS', 
                   ' Total Backward Packets': 'OUT_PKTS',
                   'Total Length of Fwd Packets': 'IN_BYTES',
                   ' Total Length of Bwd Packets': 'OUT_BYTES',
                   'Fwd IAT Total': 'DURATION_IN',
                   'Bwd IAT Total': 'DURATION_OUT',
                   ' Label': 'Attack',
                   ' Flow Duration': 'FLOW_DURATION_MILLISECONDS',
                   }, errors="raise")
    
    df['PROTOCOL'] = df['PROTOCOL'].astype(np.int64)
    df['IN_PKTS'] = df['IN_PKTS'].astype(np.int64)
    df['OUT_PKTS'] = df['OUT_PKTS'].astype(np.int64)
    df['IN_BYTES'] = df['IN_BYTES'].astype(np.int64)
    df['OUT_BYTES'] = df['OUT_BYTES'].astype(np.int64)
    df['DURATION_IN']= (df['DURATION_IN']/1000000).astype(np.int64)
    df['DURATION_OUT']= (df['DURATION_OUT']/1000000).astype(np.int64)
    df['Label'] = 1
    df['mask'] = 2
    return df


class ReadBigCSV(IterableDataset):

    def __getitem__(self, index) -> T_co:
        raise NotImplementedError

    def __init__(self, filename, opts=csv.ConvertOptions(), transform=default_transform, chunkscale_linear=10, chunkscale_exp=6, start=0, end=None, skip_n=0):
        self.filename = filename
        self.opts = opts
        self.transform = transform
        self.chunkscale_linear = chunkscale_linear
        self.chunkscale_exp = chunkscale_exp
        self.start = start
        self.end = end
        self.skip_n = skip_n

    def __iter__(self):
        chunksize = self.chunkscale_linear ** self.chunkscale_exp
        with csv.open_csv(
            self.filename,
            convert_options = self.opts,
            read_options=csv.ReadOptions(
                use_threads=True,
                block_size=chunksize
            )) as reader:
            i = 0
            for chunk in reader:
                if chunk is None:
                    break
                chunk = chunk.to_pandas()
                chunk = chunk[chunk.columns[self.skip_n:]]
                if i < self.start:
                    continue
                if i == self.end:
                    break
                i+=1
                yield self.transform(chunk)

In [9]:
def convertData32(X, Y):
    X = X.astype('float32')
    mask = np.logical_not((X > 10).sum(axis=1).astype('bool'))
    X = X[mask]
    Y = Y[mask]
    mask = np.logical_not((X < -10).sum(axis=1).astype('bool'))
    X = X[mask]
    Y = Y[mask]
    mask = np.logical_not(np.isinf(X).sum(axis=1).astype('bool'))
    X = X[mask]
    Y = Y[mask]
    mask = np.logical_not(np.isnan(X).sum(axis=1).astype('bool'))
    X = X[mask]
    Y = Y[mask]
    return X, Y

def convertData32Scale(X, Y):
    print(X)
    X = X.astype('float32')
    mask = np.logical_not((X > 100000000).sum(axis=1).astype('bool'))
    X = X[mask]
    Y = Y[mask]
    mask = np.logical_not((X < -100000000).sum(axis=1).astype('bool'))
    X = X[mask]
    Y = Y[mask]
    mask = np.logical_not(np.isinf(X).sum(axis=1).astype('bool'))
    X = X[mask]
    Y = Y[mask]
    mask = np.logical_not(np.isnan(X).sum(axis=1).astype('bool'))
    X = X[mask]
    Y = Y[mask]
    return X, Y

In [29]:
def getScalerStandart(filename, transform=default_transform, skip_n=0):
    
    scaler = StandardScaler()
    chunksize = 10 ** 6
    with csv.open_csv(
        filename,
        convert_options = csv.ConvertOptions(),
        read_options=csv.ReadOptions(
             use_threads=True,
            block_size=chunksize
        )) as reader:

        for chunk in reader:
            if chunk is None:
                break
            chunk = chunk.to_pandas()
            chunk = chunk[chunk.columns[skip_n:]]
            X, Y = transform(chunk)
            break
            X, Y = convertData32Scale(X, Y)
            scaler.partial_fit(X)
    return scaler

In [32]:
scaler = getScalerStandart('data/split/MERGED_DDOS_UNSW-TRAIN.csv', transform=prep_merged, skip_n=1)

 ACK Flag Count
0        1305
1        1305
2        1305
3        1305
4        1305
        ...  
3240    13253
3241    48933
3242     8935
3243    38223
3244    12619
Name:  ACK Flag Count, Length: 3245, dtype: int64
 Active Max
0         21
1         21
2         21
3         21
4         21
        ... 
3240      22
3241    3833
3242      53
3243      80
3244    6881
Name:  Active Max, Length: 3245, dtype: int64
 Active Min
0        6
1        6
2        6
3        6
4        6
        ..
3240     6
3241     6
3242    17
3243     6
3244     6
Name:  Active Min, Length: 3245, dtype: int64
 Active Std
0        1.0
1        1.0
2        1.0
3        1.0
4        1.0
        ... 
3240    92.0
3241     0.0
3242     0.0
3243     7.0
3244     0.0
Name:  Active Std, Length: 3245, dtype: float64
 Average Packet Size
0           9
1         261
2         481
3         701
4        1031
        ...  
3240     3728
3241     4776
3242      132
3243     1044
3244    11168
Name:  Average Packet 

In [65]:
dataTrain = ReadBigCSV('data/split/MERGED_DDOS_UNSW-TRAIN.csv', transform=prep_NF_UQ, chunkscale_linear=10, chunkscale_exp=5, skip_n=1)
loaderTrain = DataLoader(dataset=dataTrain, batch_size=1)

dataTest = ReadBigCSV('data/split/NF-UNSW-NB15-V2-TEST.csv', transform=prep_NF_UQ, chunkscale_linear=10, chunkscale_exp=5, skip_n=1)
loaderTest = DataLoader(dataset=dataTest, batch_size=1)

model = modelDNN()
model.to(device)
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model.parameters(), lr=0.0005)
lmbda = lambda epoch: 0.65 ** epoch
scheduler = torch.optim.lr_scheduler.MultiplicativeLR(optimizer, lr_lambda=lmbda)
epoch_acc = 0
total = 0

In [61]:
def binary_acc(y_pred, y_test):

    y_pred_tag = torch.round(torch.sigmoid(y_pred))

    correct_results_sum = (y_pred_tag.detach().cpu().numpy().reshape(-1) == y_test.detach().cpu().numpy().reshape(-1)).sum()

    acc = correct_results_sum/y_test.shape[0]

    acc *= 100
    
    return acc

In [66]:
%%time

for epoch in range(10):
    model.train()
    i = 0
    epoch_acc = 0
    for X, Y in loaderTrain:
        i+=1
        X = X[0]
        Y = Y[0]
        
        X, Y = convertData32(scaler.transform(X), Y)
        X = torch.tensor(X).to(torch.float32).detach().requires_grad_(True).to(device)
        Y = Y.to(torch.float32).detach().requires_grad_(True).to(device)
        optimizer.zero_grad()
        y_pred = model(X)
        loss = criterion(y_pred, Y.unsqueeze(1))
        acc = binary_acc(y_pred, Y.unsqueeze(1))
        epoch_acc += acc.item()
        loss.backward()
        optimizer.step()
    print(f"epoch acc: {epoch_acc/i}")
    scheduler.step()
    
    model.eval()
    i = 0
    epoch_acc = 0
    for X, Y in loaderTest:
        i+=1
        X, Y = convertData32(scaler.transform(X[0]), Y[0])
        X = torch.tensor(X).to(torch.float32).detach().requires_grad_(True).to(device)
        Y = Y.to(torch.float32).detach().requires_grad_(True).to(device)
        y_pred = model(X)
        acc = binary_acc(y_pred, Y.unsqueeze(1))
        epoch_acc += acc
    print(f"test acc: {epoch_acc/i}")

epoch acc: 98.51919840337992
test acc: 99.37521918285503
epoch acc: 99.3504868855843
test acc: 99.52414368637773
epoch acc: 99.45393109775358
test acc: 99.53939066736437
epoch acc: 99.52166228389221
test acc: 99.52701491591144
epoch acc: 99.54667927708024
test acc: 99.53406201869598


KeyboardInterrupt: 

In [14]:
for X, Y in loaderTrain:
    print(X)
    break

tensor([[[  0.,   6.,   1.,  ...,   0.,   0., 331.],
         [  1.,   6.,   1.,  ...,   0.,   0., 230.],
         [  2.,   6.,   1.,  ...,   0.,   0., 229.],
         ...,
         [608.,   6.,   1.,  ...,   0.,   0., 125.],
         [609.,   6.,   1.,  ...,   0.,   0., 229.],
         [610.,   6.,   1.,  ...,   0.,   0., 125.]]], dtype=torch.float64)


In [32]:
X[0, :, 1:]

tensor([[6.0000e+00, 1.0000e+00, 9.0000e+00,  ..., 0.0000e+00, 0.0000e+00,
         3.3100e+02],
        [6.0000e+00, 1.0000e+00, 2.6100e+02,  ..., 0.0000e+00, 0.0000e+00,
         2.3000e+02],
        [6.0000e+00, 1.0000e+00, 4.8100e+02,  ..., 0.0000e+00, 0.0000e+00,
         2.2900e+02],
        ...,
        [6.0000e+00, 1.0000e+00, 1.4930e+03,  ..., 0.0000e+00, 0.0000e+00,
         1.2500e+02],
        [6.0000e+00, 1.0000e+00, 1.8170e+03,  ..., 0.0000e+00, 0.0000e+00,
         2.2900e+02],
        [6.0000e+00, 1.0000e+00, 2.0590e+03,  ..., 0.0000e+00, 0.0000e+00,
         1.2500e+02]], dtype=torch.float64)