In [2]:
import os
import re
import shutil
import urllib.request
import zipfile

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from tqdm.notebook import tqdm

In [3]:
# Seed NumPy's global random generator so repeated runs start from the same state.
RANDOM_SEED = 99
np.random.seed(RANDOM_SEED)

DATASET_NAME = "CICIDS_2017"

# The project root is taken to be the parent of the current working directory.
BASE_DIR = os.path.dirname(os.getcwd())

# Data folder layout: <BASE_DIR>/data/{data_raw, data_preprocessed}
DATA_DIR = os.path.join(BASE_DIR, 'data')
RAW_DATA_DIR, PROCESSED_DATA_DIR = (
    os.path.join(DATA_DIR, sub_dir) for sub_dir in ('data_raw', 'data_preprocessed')
)

# Create both folders up front; exist_ok makes re-running this cell harmless.
for _data_dir in (RAW_DATA_DIR, PROCESSED_DATA_DIR):
    os.makedirs(_data_dir, exist_ok=True)

In [14]:
# Download the dataset archive, unless a complete copy is already cached.
TEMP_DIR = '/tmp'
ZIP_FILE_PATH = os.path.join(TEMP_DIR, 'CIC_IDS_2017.zip')
if not os.path.exists(ZIP_FILE_PATH):
    print("Baixando o dataset...")
    # Download to a side file and rename only on success, so an interrupted
    # transfer never leaves a truncated archive that the guard above would
    # mistake for a finished download.
    partial_zip_path = ZIP_FILE_PATH + '.part'
    urllib.request.urlretrieve(
        'http://205.174.165.80/CICDataset/CIC-IDS-2017/Dataset/CIC-IDS-2017/CSVs/MachineLearningCSV.zip',
        partial_zip_path,
    )
    os.replace(partial_zip_path, ZIP_FILE_PATH)

# Extract into a scratch folder, then flatten every extracted file into
# <RAW_DATA_DIR>/<DATASET_NAME>.
TMP_FOLDER = os.path.join(RAW_DATA_DIR, 'tmp')
if not os.path.exists(os.path.join(RAW_DATA_DIR, DATASET_NAME)):
    print("Descompactando o dataset...")

    from_dir = TMP_FOLDER
    to_dir = os.path.join(RAW_DATA_DIR, DATASET_NAME)
    try:
        with zipfile.ZipFile(ZIP_FILE_PATH) as archive:
            archive.extractall(from_dir)

        os.makedirs(to_dir, exist_ok=True)

        # The archive nests its files in sub-folders; move them all to one level.
        for root, dirs, files in os.walk(from_dir):
            for file_name in files:
                src_path = os.path.join(root, file_name)
                dest_path = os.path.join(to_dir, file_name)
                shutil.move(src_path, dest_path)
    except BaseException:
        # to_dir did not exist before this branch (see the guard), so removing
        # it is safe and lets the next run retry instead of finding a
        # half-populated dataset folder.
        shutil.rmtree(to_dir, ignore_errors=True)
        raise
    finally:
        # Always discard the scratch folder, including after a failure.
        shutil.rmtree(from_dir, ignore_errors=True)

In [6]:
# Load every CSV of the raw dataset into one DataFrame.
DATASET_DIR = os.path.join(RAW_DATA_DIR, DATASET_NAME)
df_list = []
# os.listdir returns entries in arbitrary order; sorting fixes the row order of
# the concatenated frame, which the later duplicate removal and seeded
# sampling depend on.
for file in sorted(os.listdir(DATASET_DIR)):
    # Skip anything that is not a CSV (hidden files, checkpoint folders, ...).
    if not file.lower().endswith('.csv'):
        continue
    df_aux = pd.read_csv(os.path.join(DATASET_DIR, file))
    # Strip header whitespace per file so the columns align during concat.
    df_aux.columns = df_aux.columns.str.strip()
    df_list.append(df_aux)
df = pd.concat(df_list, ignore_index=True)
df.columns

Index(['Destination Port', 'Flow Duration', 'Total Fwd Packets',
       'Total Backward Packets', 'Total Length of Fwd Packets',
       'Total Length of Bwd Packets', 'Fwd Packet Length Max',
       'Fwd Packet Length Min', 'Fwd Packet Length Mean',
       'Fwd Packet Length Std', 'Bwd Packet Length Max',
       'Bwd Packet Length Min', 'Bwd Packet Length Mean',
       'Bwd Packet Length Std', 'Flow Bytes/s', 'Flow Packets/s',
       'Flow IAT Mean', 'Flow IAT Std', 'Flow IAT Max', 'Flow IAT Min',
       'Fwd IAT Total', 'Fwd IAT Mean', 'Fwd IAT Std', 'Fwd IAT Max',
       'Fwd IAT Min', 'Bwd IAT Total', 'Bwd IAT Mean', 'Bwd IAT Std',
       'Bwd IAT Max', 'Bwd IAT Min', 'Fwd PSH Flags', 'Bwd PSH Flags',
       'Fwd URG Flags', 'Bwd URG Flags', 'Fwd Header Length',
       'Bwd Header Length', 'Fwd Packets/s', 'Bwd Packets/s',
       'Min Packet Length', 'Max Packet Length', 'Packet Length Mean',
       'Packet Length Std', 'Packet Length Variance', 'FIN Flag Count',
       'SYN Flag Co

In [7]:
# Remove exact duplicate rows and report how many were discarded.
initial_len = df.shape[0]
df = df.drop_duplicates()
print(f'Tamanho inicial: {initial_len}, tamanho final {df.shape[0]} | Descartadas {initial_len - df.shape[0]} duplicadas')

# Remove every row holding at least one NaN/Null/NA value, then renumber the
# index (renumbering does not change the row count that is reported below).
initial_len = df.shape[0]
df = df.dropna().reset_index(drop=True)
print(f'Tamanho inicial: {initial_len}, tamanho final {df.shape[0]} | Descartados {initial_len - df.shape[0]} registros com valores NA')

Tamanho inicial: 2830743, tamanho final 2522362 | Descartadas 308381 duplicadas
Tamanho inicial: 2522362, tamanho final 2522009 | Descartados 353 registros com valores NA


In [8]:
# Replace +inf / -inf with the largest / smallest finite value of the same column.
# `changes` counts how many cells were rewritten.
changes = 0
# Only floating-point columns can hold infinities, so integer columns are skipped.
for column in df.select_dtypes(include=[np.floating]).columns:
    values = df[column]
    pos_inf_mask = values == np.inf
    neg_inf_mask = values == -np.inf
    if not (pos_inf_mask.any() or neg_inf_mask.any()):
        continue

    # Work on the single column instead of boolean-indexing the whole frame,
    # which would copy every column just to read one of them.
    finite_values = values[np.isfinite(values)]
    df.loc[pos_inf_mask, column] = finite_values.max()
    df.loc[neg_inf_mask, column] = finite_values.min()
    changes += int(pos_inf_mask.sum() + neg_inf_mask.sum())

In [9]:
# Training set: a seeded 60% sample drawn from the rows labelled BENIGN only.
df_train = df.loc[df['Label'] == 'BENIGN'].sample(frac=0.6, random_state=RANDOM_SEED)
# Everything that was not sampled for training (the remaining BENIGN rows plus
# every row with another label) feeds validation and test.
df_val_test = df.drop(index=df_train.index)

df_train, df_val_test = (part.reset_index(drop=True) for part in (df_train, df_val_test))

X_train = df_train.drop(columns='Label')

# Stratified split of the remainder: 35% validation / 65% test.
X_val, X_test, classes_val, classes_test = train_test_split(
    df_val_test.drop(columns='Label'),
    df_val_test['Label'],
    test_size=0.65,
    stratify=df_val_test['Label'],
    random_state=RANDOM_SEED,
)

X_val = X_val.reset_index(drop=True)
X_test = X_test.reset_index(drop=True)
classes_val = classes_val.reset_index(drop=True)
classes_test = classes_test.reset_index(drop=True)

# Binary targets: 0 for BENIGN, 1 for any other label.
y_val = classes_val.ne('BENIGN').astype('int64')
y_test = classes_test.ne('BENIGN').astype('int64')

del df_train, df_val_test

In [10]:
def get_highly_correlated_features(correlation_matrix, threshold):
    """List the column pairs whose correlation magnitude exceeds ``threshold``.

    Only the strict lower triangle of the matrix is visited, so each pair is
    reported once and the diagonal is ignored.

    Parameters
    ----------
    correlation_matrix : pandas.DataFrame
        Square matrix of pairwise coefficients, with the same labels on both axes.
    threshold : float
        A pair is kept when ``abs(coefficient) > threshold``.

    Returns
    -------
    list of ((str, str), float)
        ``((column_i, column_j), coefficient)`` tuples ordered from the
        strongest to the weakest correlation. Ordering uses the magnitude of
        the coefficient, consistent with the filter, so a strong negative
        correlation is ranked next to an equally strong positive one.
    """
    columns = correlation_matrix.columns
    # Read from a plain array: one conversion instead of one .iloc call per cell.
    coefficients = correlation_matrix.to_numpy()

    correlated_pairs = [
        ((columns[i], columns[j]), coefficients[i, j])
        for i in range(len(columns))
        for j in range(i)
        if abs(coefficients[i, j]) > threshold
    ]
    return sorted(correlated_pairs, key=lambda pair: abs(pair[1]), reverse=True)

# Absolute pairwise correlations between the training features.
corr_matrix = X_train.corr().abs()
correlation_list = get_highly_correlated_features(corr_matrix, 0.95)

# Peek at the ten strongest pairs.
correlation_list[:10]

# Greedy selection of features to drop: walk the pairs in the order returned
# above and, when neither member has been marked yet, mark the second one.
f2drop = []
for (first_feature, second_feature), _ in correlation_list:
    already_handled = first_feature in f2drop or second_feature in f2drop
    if not already_handled:
        f2drop.append(second_feature)

f2drop

['Fwd PSH Flags',
 'Fwd URG Flags',
 'Fwd Packet Length Mean',
 'Fwd Header Length',
 'Total Fwd Packets',
 'Total Length of Fwd Packets',
 'Total Backward Packets',
 'Bwd Packet Length Mean',
 'Total Length of Bwd Packets',
 'Subflow Fwd Packets',
 'Flow Duration',
 'Subflow Bwd Packets',
 'RST Flag Count',
 'Packet Length Mean',
 'Flow IAT Max',
 'Idle Mean',
 'Fwd IAT Total',
 'Max Packet Length',
 'Fwd Packet Length Max',
 'Bwd IAT Max',
 'Bwd IAT Mean',
 'Fwd IAT Max',
 'Fwd IAT Mean',
 'Idle Max']

In [11]:
# 'Destination Port' is removed in addition to the highly correlated features.
# The membership check keeps the list free of duplicates when the cell is re-run.
if 'Destination Port' not in f2drop:
    f2drop = f2drop + ['Destination Port']

# errors='ignore' makes the cell re-runnable: on a second execution the columns
# are already gone and a plain drop would raise KeyError.
X_train = X_train.drop(columns=f2drop, errors='ignore')
X_val = X_val.drop(columns=f2drop, errors='ignore')
X_test = X_test.drop(columns=f2drop, errors='ignore')

In [12]:
# Fit the scaler on the training features only, then apply the same
# transformation to the validation and test features.
std_scaler = StandardScaler()
std_scaler = std_scaler.fit(X_train)

norm_X_train = std_scaler.transform(X_train)
norm_X_val = std_scaler.transform(X_val)
norm_X_test = std_scaler.transform(X_test)

# transform() returns plain arrays; wrap them back into DataFrames so the
# column names are written to the CSV headers.
X_train = pd.DataFrame(norm_X_train, columns=X_train.columns)
X_val = pd.DataFrame(norm_X_val, columns=X_val.columns)
X_test = pd.DataFrame(norm_X_test, columns=X_test.columns)

# Save the processed files under <PROCESSED_DATA_DIR>/<DATASET_NAME>.
RESULT_DIR = os.path.join(PROCESSED_DATA_DIR, DATASET_NAME)
# to_csv does not create missing folders, so make sure the target exists.
os.makedirs(RESULT_DIR, exist_ok=True)

X_train.to_csv(os.path.join(RESULT_DIR, 'X_train.csv'), index=False)
X_val.to_csv(os.path.join(RESULT_DIR, 'X_val.csv'), index=False)
X_test.to_csv(os.path.join(RESULT_DIR, 'X_test.csv'), index=False)
y_val.to_csv(os.path.join(RESULT_DIR, 'y_val.csv'), index=False)
y_test.to_csv(os.path.join(RESULT_DIR, 'y_test.csv'), index=False)

# Release the large frames once they are on disk.
del X_train, X_val, X_test