In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

import warnings

%matplotlib inline

In [2]:
# Silence FutureWarning messages for the rest of the session.
warnings.simplefilter(action='ignore', category=FutureWarning)

In [3]:
# Remove the cap on displayed columns so wide frames are shown in full.
pd.set_option('display.max_columns', None)

In [None]:
# Locations of the input CSV files used by this notebook: the two raw dataset
# parts and the previously saved intermediate feature table.
paths = dict(
    part1='../input/simargl2021-network-intrusion-detection-dataset/dataset-part1.csv',
    part2='../input/simargl2021-network-intrusion-detection-dataset/dataset-part2.csv',
    initial_features='../input/clean-simargl/initial_features.csv',
)

In [None]:
# Read both raw dataset parts and stack them row-wise (part1 first, then
# part2). Each part keeps its own row index, as pd.concat does by default.
df = pd.concat([pd.read_csv(paths[part]) for part in ('part1', 'part2')])

### EXPLORACIÓN INICIAL

In [None]:
# (rows, columns) of the combined raw dataset.
df.shape

In [None]:
# Summary of the combined raw dataset as reported by DataFrame.info().
df.info()

In [None]:
## Look at some entries to see what kind of data we have
df.head(50)

In [None]:
# Preview of the column with values equal to ',' replaced by 0. The result is
# only displayed: it is not assigned back, so df itself is left unchanged.
df.DST_TO_SRC_SECOND_BYTES.replace(',', 0)

In [None]:
# Frequency of each distinct FIREWALL_EVENT value (the column is listed in
# TO_DELETE further down).
df.FIREWALL_EVENT.value_counts()

In [None]:
# Frequency of each distinct FLOW_ACTIVE_TIMEOUT value (the column is listed
# in TO_DELETE further down).
df.FLOW_ACTIVE_TIMEOUT.value_counts()

In [None]:
# True when no FLOW_ID value is repeated across the combined dataset.
df.FLOW_ID.is_unique

In [None]:
# Frequency of each distinct FLOW_INACTIVE_TIMEOUT value (the column is listed
# in TO_DELETE further down).
df.FLOW_INACTIVE_TIMEOUT.value_counts()

In [None]:
# Frequency of each distinct FRAME_LENGTH value (the column is listed in
# TO_DELETE further down).
df.FRAME_LENGTH.value_counts()

In [None]:
# Frequency of each distinct MIN_IP_PKT_LEN value (the column is listed in
# TO_DELETE further down).
df.MIN_IP_PKT_LEN.value_counts()

In [None]:
# Frequency of each distinct MAX_IP_PKT_LEN value (the column is listed in
# TO_DELETE further down).
df.MAX_IP_PKT_LEN.value_counts()

In [None]:
# Share of rows whose OOORDER_IN_PKTS value is exactly 0. The count uses a
# vectorised comparison instead of a Python-level loop over every row, which
# gives the same number much faster on a large frame.
zeros = int((df.OOORDER_IN_PKTS == 0).sum())

print("Porcentaje de valores 0 en columna: %.2f%%" % (100 * zeros / len(df)))

In [None]:
# Share of rows whose OOORDER_OUT_PKTS value is exactly 0. The count uses a
# vectorised comparison instead of a Python-level loop over every row, which
# gives the same number much faster on a large frame.
zeros = int((df.OOORDER_OUT_PKTS == 0).sum())

print("Porcentaje de valores 0 en columna: %.2f%%" % (100 * zeros / len(df)))

In [None]:
# Frequency of each distinct SAMPLING_INTERVAL value (the column is listed in
# TO_DELETE further down).
df.SAMPLING_INTERVAL.value_counts()

In [None]:
# True when no TOTAL_FLOWS_EXP value is repeated across the combined dataset.
df.TOTAL_FLOWS_EXP.is_unique

In [None]:
# Frequency of each distinct BIFLOW_DIRECTION value (the column is listed in
# TO_DELETE further down).
df.BIFLOW_DIRECTION.value_counts()

In [None]:
# Names of the columns that are removed from the raw dataset before the
# intermediate feature table is saved.
TO_DELETE = [
    'DST_TO_SRC_SECOND_BYTES', 'FLOW_ID', 'BIFLOW_DIRECTION', 'FIREWALL_EVENT',
    'FLOW_ACTIVE_TIMEOUT', 'FLOW_INACTIVE_TIMEOUT',
    'FRAME_LENGTH', 'MAX_IP_PKT_LEN', 'MIN_IP_PKT_LEN',
    'PROTOCOL_MAP', 'SAMPLING_INTERVAL', 'TOTAL_FLOWS_EXP',
    'OOORDER_OUT_PKTS', 'OOORDER_IN_PKTS',
    'IPV4_SRC_ADDR', 'IPV4_DST_ADDR',
]

In [None]:
# Build a new frame without the columns listed in TO_DELETE.
# drop() must not be called with inplace=True here: in that mode it returns
# None, so clean_df would be None and the following cells (clean_df.columns,
# clean_df.to_csv) would fail. Returning a new frame also leaves df untouched,
# which keeps this cell safe to re-run.
clean_df = df.drop(columns=TO_DELETE)

In [None]:
# Columns of clean_df after the drop in the previous cell.
clean_df.columns

In [None]:
# Save the reduced table as CSV; the row index is written as the first column.
# NOTE(review): this writes under 'datasets/', whereas paths['initial_features']
# reads from '../input/clean-simargl/' -- confirm how the file gets from one
# location to the other.
clean_df.to_csv('datasets/initial_features.csv')

In [None]:
## In case memory usage needs to be optimised
# Column -> dtype mapping intended for the `dtype=` argument of pd.read_csv
# (that argument is currently commented out where the CSV is loaded).
dtype = {
    'PROTOCOL': 'int16',
    'DIRECTION': 'int16',
    # int16 tops out at 32767, i.e. about 32.8 s when the unit is milliseconds;
    # int32 avoids overflow for longer flows.
    'FLOW_DURATION_MILLISECONDS': 'int32',
    'IN_PKTS': 'int32',
    'OUT_PKTS': 'int32',
    'L4_DST_PORT': 'int32',
    'L4_SRC_PORT': 'int32',
    'RETRANSMITTED_IN_PKTS': 'int32',
    # The comma after this entry was missing, which made the cell a SyntaxError.
    'RETRANSMITTED_OUT_PKTS': 'int32',
    'RETRANSMITTED_IN_BYTES': 'int32',
    'RETRANSMITTED_OUT_BYTES': 'int32',
}

In [4]:
## SKIP THIS CELL IF THE WHOLE DOCUMENT IS BEING EXECUTED
## (clean_df is then assigned by the earlier cells instead)

# Load the previously saved intermediate feature table, using the first CSV
# column as the row index.
clean_df = pd.read_csv(
    paths['initial_features'], 
    index_col=0,
    #dtype=dtype
)

In [5]:
# Summary of the loaded feature table as reported by DataFrame.info().
clean_df.info()

In [6]:
# First rows of the loaded feature table.
clean_df.head()

In [7]:
# Remove the remaining columns that are not used as features.
# errors='ignore' skips names that are already absent. That matters in two
# cases: re-running this cell (the columns are gone after the first run), and
# running the notebook from the top, where DST_TO_SRC_SECOND_BYTES is already
# part of TO_DELETE. Without it either case raises KeyError.
clean_df = clean_df.drop(
    columns=[
        'DST_TO_SRC_SECOND_BYTES',
        'FLOW_START_SEC',
        'FLOW_END_SEC',
        'FLOW_DURATION_MICROSECONDS',
    ],
    errors='ignore',
)

In [8]:
# First rows again, to check the table after the drop in the previous cell.
clean_df.head()

In [9]:
## when the flow direction is == 1, the flow is very likely to be normal
# Number of rows per LABEL among the flows whose DIRECTION equals 1.
clean_df[clean_df['DIRECTION'] == 1].groupby(['LABEL']).size()

In [10]:
# The LABEL column is the prediction target.
target = clean_df['LABEL']

In [11]:
# Every column except LABEL is used as a feature.
features = clean_df.drop('LABEL', axis=1)

In [12]:
from sklearn.model_selection import train_test_split

In [13]:
# Fraction of rows that train_test_split puts in its "test" part; that part is
# what the notebook keeps as its working sample (X_sample / y_sample).
test_ratio = 0.25

In [14]:
# Draw a random subsample of the data: only the "test" part of the split is
# kept and the "train" part is discarded. The seed is fixed (same value as the
# RandomUnderSampler used later) so re-running the notebook gives the same
# sample and therefore the same downstream counts.
_, X_sample, _, y_sample = train_test_split(
    features, target, test_size=test_ratio, random_state=0
)

In [15]:
# Report the size of the sample as a percentage of the rows in clean_df.
print("Se trabajará con una muestra del %.0f%% de los datos originales" % (len(X_sample) / len(clean_df) * 100))

In [22]:
## Check the class balance of the data; the balance may be good enough
## to train the models without over- or under-sampling
y_sample.value_counts()

In [26]:
## Encoding de la columna de features ('LABEL')
# Integer code for each value of the target column ('LABEL'); a label's code
# is its position in the tuple below.
encoding = {
    label: code
    for code, label in enumerate((
        'Normal flow',
        'SYN Scan - aggressive',
        'Denial of Service R-U-Dead-Yet',
        'Denial of Service Slowloris',
    ))
}

In [34]:
# Translate the textual labels into their integer codes with a vectorised
# Series.map instead of a per-row lambda. Labels missing from `encoding` are
# rejected up front with KeyError (the same exception type the dictionary
# lookup used to raise), so they can never slip through as NaN. This also
# gives a readable error if the cell is re-run on already-encoded labels.
unknown_labels = set(y_sample.unique()) - set(encoding)
if unknown_labels:
    raise KeyError("Labels without an entry in `encoding`: %r" % (unknown_labels,))
y_sample = y_sample.map(encoding)

In [35]:
# Class counts again, now keyed by the integer codes from `encoding`.
y_sample.value_counts()

In [36]:
# Rebind the full table and its feature/target splits to None; the cells below
# work on X_sample / y_sample only. Presumably done so the large objects can be
# garbage-collected. Note that earlier cells which use clean_df, features or
# target cannot be re-run after this one without reloading the data.
clean_df = features = target = None

In [37]:
from imblearn.under_sampling import RandomUnderSampler

In [38]:
# Resample the working sample with imbalanced-learn's RandomUnderSampler
# (fixed seed, default sampling strategy) and keep the resampled features and
# labels under new names.
sampler = RandomUnderSampler(random_state=0)
X_undersampled, y_undersampled = sampler.fit_resample(X_sample, y_sample)

In [39]:
# Class counts after under-sampling.
y_undersampled.value_counts()

In [40]:
# Total number of rows left after under-sampling.
len(y_undersampled)

### SCALING

### MODELOS