In [1]:
# from google.colab import drive
# drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd

In [3]:
# Load the preprocessed CSV, using its DATE column as the row index.
# The path is relative to the working directory (the Drive mount cell above is commented out).
df = pd.read_csv('data/netflow/CIDDS-001/traffic/preprocessed.csv', index_col='DATE')

# Preview the first rows of the loaded frame.
df.head()

Unnamed: 0_level_0,DURATION,PROTOCOL,SRC_IP_ADDR,SRC_PORT,DST_IP_ADDR,DST_PORT,PACKETS,BYTES,TOS,ATTACK_TYPE,...,FLAG_0xdf,FLAG_0xda,FLAG_0x52,FLAG_0xd3,FLAG_0x5b,FLAG_0x5a,FLAG_0xd2,FLAG_0xde,FLAG_0x53,FLAG_0xc6
DATE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2017-03-14 17:43:26.135,81504.787,3,43440,8,44022,8,8639,9318.4,0,0,...,0,0,0,0,0,0,0,0,0,0
2017-03-14 17:43:26.135,81504.787,3,43441,8,44021,8,12024,10547.2,0,0,...,0,0,0,0,0,0,0,0,0,0
2017-03-14 17:43:26.135,81504.787,3,43441,8,44021,8,12024,10547.2,0,0,...,0,0,0,0,0,0,0,0,0,0
2017-03-14 17:43:26.135,81504.787,3,43440,8,44022,8,8639,9318.4,0,0,...,0,0,0,0,0,0,0,0,0,0
2017-03-14 17:43:39.011,183418.493,3,43441,8,44021,8,20751,5939.2,0,0,...,0,0,0,0,0,0,0,0,0,0


In [4]:
# Share of each ATTACK_TYPE value, expressed as a percentage of all rows.
df['ATTACK_TYPE'].value_counts(normalize=True) * 100

0    84.229777
2    14.647460
4     1.028429
1     0.070899
3     0.023436
Name: ATTACK_TYPE, dtype: float64

In [5]:
"""
model.py
Class model - Model model
"""
from abc import ABC, abstractmethod

class Model(ABC):
    """Abstract base class for the models defined in this notebook.

    Stores descriptive metadata and declares the three steps a concrete
    model has to implement: preprocessing, training and prediction.

    :param name: (str) model name
    :param type: (str) type of algorithm (ex: arima, linear_regression...)
    :param hyperparameters_list: (list) list of hyperparameters to input
    :param description: (str) model description
    """
    def __init__(self, name=None, type=None, hyperparameters_list=None, description=None):
        # `type` shadows the builtin inside this method only; the parameter
        # name is kept because callers pass it by keyword.
        self.name = name
        self.type = type
        self.hyperparameters_list = hyperparameters_list
        self.description = description

    # The abstract signatures accept arbitrary arguments so that concrete
    # implementations (e.g. ``predict(self, X)``) stay compatible with the
    # declared contract and ``super()`` calls reach NotImplementedError
    # instead of failing earlier with a TypeError.

    @abstractmethod
    def apply_preprocessing(self, *args, **kwargs):
        """ Apply preprocessing

        :return: data preprocessed
        :raises NotImplementedError: always, unless overridden
        """
        raise NotImplementedError

    @abstractmethod
    def train(self, *args, **kwargs):
        """ Train the model for the given data

        Must fit the `model` property and return it.

        :raises NotImplementedError: always, unless overridden
        """
        raise NotImplementedError

    @abstractmethod
    def predict(self, *args, **kwargs):
        """ Predict

        :return: predictions
        :raises NotImplementedError: always, unless overridden
        """
        raise NotImplementedError

MODEL 1

Baseline definition: random model

In [6]:
from random import randint

class RandomBaseline(Model):
    """Baseline that ignores the input features and draws each label at random.

    Every prediction is an independent ``randint(0, 4)`` draw, i.e. an integer
    between 0 and 4 inclusive. No seed is set here, so results differ between
    runs unless the caller seeds the ``random`` module first.
    """
    def __init__(self):
        super().__init__(
            name="Random Baseline", type="Random"
        )

    def apply_preprocessing(self, **kwargs):
        """No-op: this baseline needs no preprocessing. Returns None."""
        # `self` was missing from the original signature, so calling this on
        # an instance raised TypeError.
        pass

    def train(self, **kwargs):
        """No-op: there is nothing to fit. Returns None."""
        pass

    def predict(self, X):
        """Return one random label per row of ``X``.

        :param X: any sized collection (only ``len(X)`` is used)
        :return: list of ``len(X)`` integers, each in [0, 4]
        """
        return [randint(0, 4) for _ in range(len(X))]

In [7]:
# Feature sample: drop the ATTACK_TYPE column and keep the first 100 rows.
X = df.drop('ATTACK_TYPE', axis=1).iloc[:100]

In [8]:
# Predictions of the random baseline for X (no seed is set in the visible cells, so values vary per run).
preds = RandomBaseline().predict(X)

In [9]:
# First five random predictions.
preds[:5]

[1, 3, 0, 4, 0]

MODEL 2

Baseline definition: procedural model based on source IP address and protocol


In [10]:
class ProceduralBaseline1(Model):
    """Rule-based baseline using the PROTOCOL and SRC_IP_ADDR columns.

    A row is labelled with a random class in [1, 4] only when its PROTOCOL is
    one of ``SUSPECT_PROTOCOLS`` AND its SRC_IP_ADDR is one of
    ``SUSPECT_SRC_IPS``; every other row is labelled 0.
    """

    # Encoded column values used by the rule.
    # Presumably 0/1 are the encoded 'TCP'/'UDP' protocols and the two
    # addresses are encoded 192.168.* hosts, as hinted by the original inline
    # comment -- TODO confirm against the preprocessing step.
    SUSPECT_PROTOCOLS = (0, 1)
    SUSPECT_SRC_IPS = (21284, 21265)

    def __init__(self):
        super().__init__(
            name="Procedural Baseline based on IP addresses and protocols", type="Procedural"
        )

    def apply_preprocessing(self, **kwargs):
        """No-op: the rule works on the already preprocessed columns. Returns None."""
        # `self` was missing from the original signature, so calling this on
        # an instance raised TypeError.
        pass

    def train(self, **kwargs):
        """No-op: there is nothing to fit. Returns None."""
        pass

    def predict(self, X):
        """Label each row of ``X`` with 0 or a random class in [1, 4].

        :param X: DataFrame with 'PROTOCOL' and 'SRC_IP_ADDR' columns
        :return: list with one integer label per row, in row order
        :raises KeyError: if one of the two columns is missing
        """
        # NOTE(review): the original comment read
        #   "0 if PROTOCOL NOT IN ('TCP', 'UDP') AND SRC_IP != '192.168' else random"
        # but the code has always combined the two tests with OR (a row is 0
        # as soon as either test fails). The code's behaviour is kept; confirm
        # which one was intended.
        # Only two columns are needed, so iterate over them directly instead
        # of materialising a Series per row with iterrows().
        return [
            randint(1, 4)
            if protocol in self.SUSPECT_PROTOCOLS and src_ip in self.SUSPECT_SRC_IPS
            else 0
            for protocol, src_ip in zip(X['PROTOCOL'], X['SRC_IP_ADDR'])
        ]

In [11]:
# Preview the first rows of the feature sample.
X.head()

Unnamed: 0_level_0,DURATION,PROTOCOL,SRC_IP_ADDR,SRC_PORT,DST_IP_ADDR,DST_PORT,PACKETS,BYTES,TOS,TYPE,...,FLAG_0xdf,FLAG_0xda,FLAG_0x52,FLAG_0xd3,FLAG_0x5b,FLAG_0x5a,FLAG_0xd2,FLAG_0xde,FLAG_0x53,FLAG_0xc6
DATE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2017-03-14 17:43:26.135,81504.787,3,43440,8,44022,8,8639,9318.4,0,0,...,0,0,0,0,0,0,0,0,0,0
2017-03-14 17:43:26.135,81504.787,3,43441,8,44021,8,12024,10547.2,0,0,...,0,0,0,0,0,0,0,0,0,0
2017-03-14 17:43:26.135,81504.787,3,43441,8,44021,8,12024,10547.2,0,1,...,0,0,0,0,0,0,0,0,0,0
2017-03-14 17:43:26.135,81504.787,3,43440,8,44022,8,8639,9318.4,0,1,...,0,0,0,0,0,0,0,0,0,0
2017-03-14 17:43:39.011,183418.493,3,43441,8,44021,8,20751,5939.2,0,1,...,0,0,0,0,0,0,0,0,0,0


In [12]:
# First five predictions of ProceduralBaseline1 on X.
ProceduralBaseline1().predict(X)[:5]

[0, 0, 0, 0, 0]

MODEL 3

Baseline definition: procedural model based on destination IP address and TOS

In [13]:
class ProceduralBaseline2(Model):
    """Rule-based baseline using the TOS and DST_IP_ADDR columns.

    A row is labelled 0 when its TOS is one of ``EXCLUDED_TOS`` OR its
    DST_IP_ADDR differs from ``WATCHED_DST_IP``; every other row gets a
    random class in [1, 4].
    """

    # Encoded column values used by the rule.
    # Presumably 2/1 are the encoded TOS values '32'/'16' and 21265 is the
    # encoded address 192.168.100.6, as hinted by the original inline
    # comment -- TODO confirm against the preprocessing step.
    EXCLUDED_TOS = (2, 1)
    WATCHED_DST_IP = 21265

    def __init__(self):
        super().__init__(
            name="Procedural Baseline based on IP addresses and TOS", type="Procedural"
        )

    def apply_preprocessing(self, **kwargs):
        """No-op: the rule works on the already preprocessed columns. Returns None."""
        # `self` was missing from the original signature, so calling this on
        # an instance raised TypeError.
        pass

    def train(self, **kwargs):
        """No-op: there is nothing to fit. Returns None."""
        pass

    def predict(self, X):
        """Label each row of ``X`` with 0 or a random class in [1, 4].

        :param X: DataFrame with 'TOS' and 'DST_IP_ADDR' columns
        :return: list with one integer label per row, in row order
        :raises KeyError: if one of the two columns is missing
        """
        # NOTE(review): the original comment read
        #   "0 if TOS NOT IN ('32', '16') AND DST_IP != 192.168.100.6 else random"
        # whereas the code tests `TOS in (2, 1)` and joins the tests with OR.
        # The code's behaviour is kept unchanged; confirm which was intended.
        # Only two columns are needed, so iterate over them directly instead
        # of materialising a Series per row with iterrows().
        return [
            0
            if tos in self.EXCLUDED_TOS or dst_ip != self.WATCHED_DST_IP
            else randint(1, 4)
            for tos, dst_ip in zip(X['TOS'], X['DST_IP_ADDR'])
        ]

In [14]:
# First ten predictions of ProceduralBaseline2 on X.
ProceduralBaseline2().predict(X)[:10]

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

MODEL 4

Baseline definition: constant model that always predicts class 0

In [15]:
from random import randint

class FullBegninBaseline(Model):
    """Constant baseline that predicts class 0 for every row.

    The class name (sic, "Begnin") is kept unchanged so existing references
    keep working.
    """
    def __init__(self):
        # The metadata used to be copy-pasted from RandomBaseline
        # ("Random Baseline" / "Random"), which mislabelled this model.
        super().__init__(
            name="Full Benign Baseline", type="Constant"
        )

    def apply_preprocessing(self, **kwargs):
        """No-op: this baseline needs no preprocessing. Returns None."""
        # `self` was missing from the original signature, so calling this on
        # an instance raised TypeError.
        pass

    def train(self, **kwargs):
        """No-op: there is nothing to fit. Returns None."""
        pass

    def predict(self, X):
        """Return a 0 label for each row of ``X``.

        :param X: any sized collection (only ``len(X)`` is used)
        :return: list of ``len(X)`` zeros
        """
        return [0] * len(X)

In [16]:
# First ten predictions of the constant baseline on X.
FullBegninBaseline().predict(X)[:10]

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]