In [35]:
import numpy as np
import pandas as pd
import os
import re
import shutil
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from tqdm.notebook import tqdm
from sklearn.decomposition import PCA

In [36]:
RANDOM_SEED = 99
np.random.seed(RANDOM_SEED)

DATASET_NAME = "CIDDS-001"
# Definindo o BASE_DIR
BASE_DIR = os.path.dirname(os.getcwd())

# Caminho para a pasta de dados
DATA_DIR = os.path.join(BASE_DIR, 'data')
RAW_DATA_DIR = os.path.join(DATA_DIR, 'data_raw')
PROCESSED_DATA_DIR = os.path.join(DATA_DIR, 'data_preprocessed')

os.makedirs(RAW_DATA_DIR, exist_ok=True)
os.makedirs(PROCESSED_DATA_DIR, exist_ok=True)

In [37]:
# Baixando o dataset
TEMP_DIR = '/tmp'
ZIP_FILE_PATH = os.path.join(TEMP_DIR, 'CIDDS-001.zip')
if not os.path.exists(ZIP_FILE_PATH):
    print("Baixando o dataset...")
    !wget 'https://www.hs-coburg.de/wp-content/uploads/2024/11/CIDDS-001.zip' -O {ZIP_FILE_PATH}

TMP_FOLDER = os.path.join(RAW_DATA_DIR, 'tmp')
if not os.path.exists(os.path.join(RAW_DATA_DIR, DATASET_NAME)):
    print("Descompactando o dataset...")
    !unzip {ZIP_FILE_PATH} -d {TMP_FOLDER}

    from_dir = TMP_FOLDER
    to_dir = os.path.join(RAW_DATA_DIR, DATASET_NAME)
    os.makedirs(to_dir, exist_ok=True)
    
    for root, dirs, files in os.walk(from_dir):
        for file_name in files:
            src_path = os.path.join(root, file_name)
            dest_path = os.path.join(to_dir, file_name)
            shutil.move(src_path, dest_path)

    shutil.rmtree(TMP_FOLDER)

In [38]:
for root, dirs, files in os.walk(os.path.join(RAW_DATA_DIR, DATASET_NAME)):
    for file_name in files:
        if (file_name.endswith('.log') or file_name.endswith('.conf') or 
            file_name.endswith('.pdf') or file_name.startswith('attack_logs')):
            os.remove(os.path.join(root, file_name))

In [39]:
DATASET_DIR = os.path.join(RAW_DATA_DIR, DATASET_NAME)
df_list = []
for file in os.listdir(DATASET_DIR):
    df_aux = pd.read_csv(os.path.join(DATASET_DIR, file))
    df_list.append(df_aux)
df = pd.concat(df_list, ignore_index=True)
df.columns = df.columns.str.strip()
df.columns

  df_aux = pd.read_csv(os.path.join(DATASET_DIR, file))
  df_aux = pd.read_csv(os.path.join(DATASET_DIR, file))
  df_aux = pd.read_csv(os.path.join(DATASET_DIR, file))
  df_aux = pd.read_csv(os.path.join(DATASET_DIR, file))
  df_aux = pd.read_csv(os.path.join(DATASET_DIR, file))


Index(['Date first seen', 'Duration', 'Proto', 'Src IP Addr', 'Src Pt',
       'Dst IP Addr', 'Dst Pt', 'Packets', 'Bytes', 'Flows', 'Flags', 'Tos',
       'class', 'attackType', 'attackID', 'attackDescription'],
      dtype='object')

## Removendo colunas indesejadas

In [40]:
columns_to_remove = [
    "Date first seen",
    "Src IP Addr",
    "Dst IP Addr",
    'Src Pt',
    'Dst Pt'
]
df.drop(columns=columns_to_remove, inplace=True, errors='ignore')


## Removendo dados nulos e duplicados

In [41]:
df.dropna(inplace=True)
df.drop_duplicates(inplace=True)
df.reset_index(drop=True, inplace=True)

In [42]:
df.dtypes

Duration             float64
Proto                 object
Packets                int64
Bytes                 object
Flows                  int64
Flags                 object
Tos                    int64
class                 object
attackType            object
attackID              object
attackDescription     object
dtype: object

## Tratando as colunas
As colunas restantes são:
```
Duration             float64
Proto                 object
Packets                int64
Bytes                 object
Flows                  int64
Flags                 object
Tos                    int64
class                 object
attackType            object
attackID              object
attackDescription     object
Bytes Numeric        float64
```

Tratamento: 
- Proto: 
    One Hot Encoding
- Bytes
    Converter para float
- Flags: 
    Converter para binário + One Hot Encoding com os bits
- Tos: 
    One Hot Encoding
- class: 
    Remover as linhas que não sejam "normal, suspicious, attacker"
- attackType, attackID, attackDescription: 
    Após a divisão treino/teste/validação, remover as colunas dos conjuntos de treino e validação. 
    Remover do conjunto de teste, mas salvar os valores em test_attacks.csv

### 1. Proto: One Hot Encoding

In [43]:
df = pd.get_dummies(df, columns=["Proto"], prefix="Proto")

### 2. Bytes: Converter para float

In [45]:
def convert_bytes(val):
    if isinstance(val, str) and 'M' in val:
        return float(val.replace('M', '')) * 1024 * 1024
    else:
        return float(val)

df["Bytes"] = df["Bytes"].apply(convert_bytes)
df["Bytes Numeric"] = pd.to_numeric(df["Bytes"], errors='coerce')
df.drop(columns=["Bytes"], inplace=True, errors='ignore')

### 3. Flags: Converter para binário + One Hot Encoding com os bits
Alguns valores de Flags estão no formato:
..X.X. (string com 6 caracteres. Cada letra representa um bit e cada . representa 0)
0xNN onde NN é um número hexadecimal.

- Converter para um binário de 8 bits
- Criar uma nova coluna para cada bit
- Setar o valor 1 ou 0 de acordo com o valor do bit na coluna original

In [57]:
def convert_flag_to_binary(flag):
    flag = flag.strip()
    if flag.startswith("0x"):
        try:
            num = int(flag, 16)
            return format(num, '08b')
        except:
            return "00000000"
    else:
        # Assume flag is a string like "..X.X." (6 characters)
        if len(flag) == 6:
            bits = ''.join('1' if ch != '.' else '0' for ch in flag)
            return "00" + bits  # pad with two zeros to form 8 bits
        elif len(flag) == 8:
            return ''.join('1' if ch != '.' else '0' for ch in flag)
        else:
            return ''.join('1' if ch != '.' else '0' for ch in flag)

df["flag_binary"] = df["Flags"].apply(convert_flag_to_binary)
for i in range(8):
    df[f'flag_bit_{i}'] = df["flag_binary"].str[i].astype(int)
df.drop(columns=["Flags", "flag_binary"], inplace=True, errors='ignore')

### 4. Tos: One Hot Encoding

In [59]:
df = pd.get_dummies(df, columns=["Tos"], prefix="Tos")

### 5. class: Remover as linhas que não sejam "normal, suspicious, attacker"

In [61]:
df = df[df["class"].isin(["normal", "suspicious", "attacker"])].copy()

In [None]:
# Conver normal to 0 and suspicious/attacker to 1
df["class"] = df["class"].replace({"normal": 0, "suspicious": 1, "attacker": 1}).infer_objects(copy=False)

class
0    4222389
1     197047
Name: count, dtype: int64

In [69]:
old_len = len(df)
df.drop_duplicates(inplace=True)
df.reset_index(drop=True, inplace=True)
new_len = len(df)
print(f"Removed {old_len - new_len} duplicate rows.")

Removed 0 duplicate rows.


In [76]:
df.dtypes

Duration             float64
Packets                int64
Flows                  int64
class                  int64
attackType            object
attackID              object
attackDescription     object
Proto_GRE               bool
Proto_ICMP              bool
Proto_IGMP              bool
Proto_TCP               bool
Proto_UDP               bool
Bytes Numeric        float64
flag_bit_0             int64
flag_bit_1             int64
flag_bit_2             int64
flag_bit_3             int64
flag_bit_4             int64
flag_bit_5             int64
flag_bit_6             int64
flag_bit_7             int64
Tos_0                   bool
Tos_12                  bool
Tos_16                  bool
Tos_32                  bool
Tos_192                 bool
dtype: object

## Removendo colunas com alta correlação

In [None]:
def get_highly_correlated_features(correlation_matrix, threshold):
    correlated_pairs = []
    for i in range(len(correlation_matrix.columns)):
        for j in range(i):
            if abs(correlation_matrix.iloc[i, j]) > threshold:
                pair = (correlation_matrix.columns[i], correlation_matrix.columns[j])
                coefficient = correlation_matrix.iloc[i, j]
                correlated_pairs.append((pair, coefficient))
    return sorted(correlated_pairs, key= lambda pair: pair[1], reverse=True)

df_without_attacks = df.drop(["attackType", "attackID", "attackDescription"], axis=1, errors='ignore')
df_without_attacks = df_without_attacks[df_without_attacks["class"] == 0].copy()
corr_matrix = df_without_attacks.corr().abs()
correlation_list = get_highly_correlated_features(corr_matrix, 0.95)

# Drop high correlated features in correlation list

f2drop = []
for feature_pair, _ in correlation_list:
    if feature_pair[0] not in f2drop and feature_pair[1] not in f2drop:
        f2drop.append(feature_pair[1])

f2drop

['Tos_0', 'Proto_TCP  ', 'Proto_UDP  ']

In [81]:
df.drop(columns=f2drop, inplace=True, errors='ignore')
df.reset_index(drop=True, inplace=True)

In [84]:
df_train = df[df["class"] == 0].sample(frac=0.6, random_state=RANDOM_SEED).reset_index(drop=True)
df_val_test = df.drop(df_train.index).reset_index(drop=True)

df_val, df_test = train_test_split(df_val_test, test_size=0.65, stratify=df_val_test['class'], random_state=RANDOM_SEED)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

drop_cols = ['class', 'attackType', 'attackID', 'attackDescription']

X_train = df_train.drop(columns=drop_cols)
X_val = df_val.drop(columns=drop_cols)
y_train = df_train['class']   # should be all 0
y_val = df_val['class']

test_attacks = df_test[['attackType', 'attackID', 'attackDescription']].copy()
os.makedirs(os.path.join(PROCESSED_DATA_DIR, DATASET_NAME), exist_ok=True)
test_attacks.to_csv(os.path.join(PROCESSED_DATA_DIR, DATASET_NAME, 'test_attacks.csv'), index=False)
X_test = df_test.drop(columns=drop_cols)
y_test = df_test['class']

del df_train, df_val, df_test, df_val_test

In [87]:
std_scaler = StandardScaler()
std_scaler = std_scaler.fit(X_train)

norm_X_train = std_scaler.transform(X_train)
norm_X_val = std_scaler.transform(X_val)
norm_X_test = std_scaler.transform(X_test)

# Salvando os dados processados em "data_preprocessed"
X_train = pd.DataFrame(norm_X_train, columns=X_train.columns)
X_val = pd.DataFrame(norm_X_val, columns=X_val.columns)
X_test = pd.DataFrame(norm_X_test, columns=X_test.columns)

# Salvando os arquivos processados

RESULT_DIR = os.path.join(PROCESSED_DATA_DIR, DATASET_NAME)
X_train.to_csv(os.path.join(RESULT_DIR, 'X_train.csv'), index=False)
X_val.to_csv(os.path.join(RESULT_DIR, 'X_val.csv'), index=False)
X_test.to_csv(os.path.join(RESULT_DIR, 'X_test.csv'), index=False)
y_val.to_csv(os.path.join(RESULT_DIR, 'y_val.csv'), index=False)
y_test.to_csv(os.path.join(RESULT_DIR, 'y_test.csv'), index=False)

del X_train, X_val, X_test