In [31]:
import numpy as np
import pandas as pd
import os
import re
import shutil
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from tqdm.notebook import tqdm
from sklearn.decomposition import PCA

In [32]:
# Seed NumPy's global generator so NumPy-based randomness is repeatable across runs.
RANDOM_SEED = 99
np.random.seed(RANDOM_SEED)

DATASET_NAME = "Kitsune"

# BASE_DIR is the parent of the current working directory.
BASE_DIR = os.path.dirname(os.getcwd())

# Data folder layout: raw files and preprocessed outputs live side by side under data/.
DATA_DIR = os.path.join(BASE_DIR, 'data')
RAW_DATA_DIR = os.path.join(DATA_DIR, 'data_raw')
PROCESSED_DATA_DIR = os.path.join(DATA_DIR, 'data_preprocessed')

# Create both folders up front; folders that already exist are left as they are.
for _data_folder in (RAW_DATA_DIR, PROCESSED_DATA_DIR):
    os.makedirs(_data_folder, exist_ok=True)

In [33]:
# Baixando o dataset
TEMP_DIR = '/tmp'
ZIP_FILE_PATH = os.path.join(TEMP_DIR, 'Kitsune.zip')
if not os.path.exists(ZIP_FILE_PATH):
    print("Baixando o dataset...")
    !wget 'https://www.kaggle.com/api/v1/datasets/download/ymirsky/network-attack-dataset-kitsune' -O {ZIP_FILE_PATH}

TMP_FOLDER = os.path.join(RAW_DATA_DIR, 'tmp')
if not os.path.exists(os.path.join(RAW_DATA_DIR, DATASET_NAME)):
    print("Descompactando o dataset...")
    !unzip {ZIP_FILE_PATH} -d {TMP_FOLDER}

    from_dir = TMP_FOLDER
    to_dir = os.path.join(RAW_DATA_DIR, DATASET_NAME)
    os.makedirs(to_dir, exist_ok=True)
    
    for root, dirs, files in os.walk(from_dir):
        for file_name in files:
            src_path = os.path.join(root, file_name)
            dest_path = os.path.join(to_dir, file_name)
            shutil.move(src_path, dest_path)

    shutil.rmtree(TMP_FOLDER)

    

Descompactando o dataset...
Archive:  /tmp/Kitsune.zip
  inflating: /mnt/F02E8D3A2E8CFABC/SharedDocs/DETEC-INVASAO/Projeto/DiFF-RF-Plus/tests/data/data_raw/tmp/ARP MitM/ARP_MitM_dataset.csv  
  inflating: /mnt/F02E8D3A2E8CFABC/SharedDocs/DETEC-INVASAO/Projeto/DiFF-RF-Plus/tests/data/data_raw/tmp/ARP MitM/ARP_MitM_labels.csv  
  inflating: /mnt/F02E8D3A2E8CFABC/SharedDocs/DETEC-INVASAO/Projeto/DiFF-RF-Plus/tests/data/data_raw/tmp/ARP MitM/ARP_MitM_pcap.pcapng  
  inflating: /mnt/F02E8D3A2E8CFABC/SharedDocs/DETEC-INVASAO/Projeto/DiFF-RF-Plus/tests/data/data_raw/tmp/Active Wiretap/Active_Wiretap_dataset.csv  
  inflating: /mnt/F02E8D3A2E8CFABC/SharedDocs/DETEC-INVASAO/Projeto/DiFF-RF-Plus/tests/data/data_raw/tmp/Active Wiretap/Active_Wiretap_labels.csv  
  inflating: /mnt/F02E8D3A2E8CFABC/SharedDocs/DETEC-INVASAO/Projeto/DiFF-RF-Plus/tests/data/data_raw/tmp/Active Wiretap/Active_Wiretap_pcap.pcapng  
  inflating: /mnt/F02E8D3A2E8CFABC/SharedDocs/DETEC-INVASAO/Projeto/DiFF-RF-Plus/tests/da

Os arquivos desse dataset são muito grandes. Então, realizaremos alguns processamentos adicionais para remover arquivos desnecessários e diminuir consideravelmente o tamanho do dataset.

In [34]:
# Remove every packet-capture file (.pcap / .pcapng) found under the raw dataset folder.
_PCAP_SUFFIXES = ('.pcapng', '.pcap')
for folder, _subfolders, names in os.walk(os.path.join(RAW_DATA_DIR, DATASET_NAME)):
    for name in names:
        if not name.endswith(_PCAP_SUFFIXES):
            continue
        os.remove(os.path.join(folder, name))

In [36]:
# Merge every "<NAME>_dataset.csv" with its labels file into a single "<NAME>.csv"
# (features feat1..featN plus an "Anomaly" column), down-sample it so the pair fits a
# size budget, and delete the two original files.
MAX_PAIR_BYTES = 2000 * 1024 * 1024  # on-disk budget (2000 MiB) for features + labels
DATASET_SUFFIX = '_dataset.csv'

raw_dataset_dir = os.path.join(RAW_DATA_DIR, DATASET_NAME)

# os.listdir order is arbitrary; sorting keeps the processing order and the log stable.
for file_name in sorted(os.listdir(raw_dataset_dir)):
    if not file_name.endswith(DATASET_SUFFIX):
        continue

    # Derive sibling paths from the file stem rather than string-replacing in the full path.
    stem = file_name[:-len(DATASET_SUFFIX)]
    dataset_file = os.path.join(raw_dataset_dir, file_name)

    # The Mirai labels file does not follow the naming scheme: it is "mirai_labels.csv".
    is_mirai = file_name.startswith('Mirai_dataset')
    if is_mirai:
        labels_file = os.path.join(raw_dataset_dir, 'mirai_labels.csv')
    else:
        labels_file = os.path.join(raw_dataset_dir, stem + '_labels.csv')

    if not os.path.exists(labels_file):
        print(f"Labels file {labels_file} not found for {file_name}. Skipping...")
        continue

    print(f"Processing {file_name} and {os.path.basename(labels_file)}...")

    # Feature files are read without a header row; columns are named feat1..featN.
    df_dataset = pd.read_csv(dataset_file, header=None)
    df_dataset.columns = [f"feat{i+1}" for i in range(df_dataset.shape[1])]

    if is_mirai:
        # Mirai labels are read without a header row; the single column is the label.
        df_labels = pd.read_csv(labels_file, header=None)
    else:
        # Other label files are read with a header row; the second column is kept as the label.
        df_labels = pd.read_csv(labels_file).iloc[:, [1]]
    df_labels.columns = ["Anomaly"]

    # An outer join would pad a row-count mismatch with NaN and write those rows to disk;
    # keep only the rows present in both files and make the mismatch visible.
    if len(df_dataset) != len(df_labels):
        print(
            f"Warning: {file_name} has {len(df_dataset)} rows but "
            f"{os.path.basename(labels_file)} has {len(df_labels)}; "
            "keeping only rows present in both."
        )
    df_merged = pd.concat([df_dataset, df_labels], axis=1, join="inner")

    # Fraction of rows to keep so that the original pair would fit in MAX_PAIR_BYTES (capped at 1).
    total_size = os.path.getsize(dataset_file) + os.path.getsize(labels_file)
    factor = min(MAX_PAIR_BYTES / total_size, 1.0)
    n_rows_to_keep = int(len(df_merged) * factor)
    df_merged = df_merged.sample(n=n_rows_to_keep, random_state=RANDOM_SEED)

    # Save the merged dataframe as NAME.csv
    merged_file_path = os.path.join(raw_dataset_dir, stem + '.csv')
    df_merged.to_csv(merged_file_path, index=False)

    # Delete the original dataset and labels files
    os.remove(dataset_file)
    os.remove(labels_file)

    # Release the frames before the next file is read; otherwise the previous
    # iteration's data stays alive while the next one loads.
    del df_dataset, df_labels, df_merged


Processing Mirai_dataset.csv and mirai_labels.csv...
Processing OS_Scan_dataset.csv and OS_Scan_labels.csv...
Processing SSDP_Flood_dataset.csv and SSDP_Flood_labels.csv...
Processing SSL_Renegotiation_dataset.csv and SSL_Renegotiation_labels.csv...
Processing SYN_DoS_dataset.csv and SYN_DoS_labels.csv...
Processing Video_Injection_dataset.csv and Video_Injection_labels.csv...


In [37]:
TRAIN_FRACTION = 0.6          # share of benign rows sampled into the training set
TEST_FRACTION = 0.65          # share of the remaining rows assigned to the test split
CORRELATION_THRESHOLD = 0.95  # above this correlation, one feature of a pair is dropped


def get_highly_correlated_features(corr_matrix, threshold):
    """Return the feature pairs whose absolute correlation exceeds ``threshold``.

    Only the strictly lower triangle of ``corr_matrix`` is scanned, so each pair
    is reported once and the diagonal is ignored. NaN entries never pass the
    comparison and are therefore not reported.

    Args:
        corr_matrix: square DataFrame of pairwise correlations.
        threshold: cut-off applied to the absolute value of each coefficient.

    Returns:
        A list of ``((row_feature, column_feature), coefficient)`` tuples sorted
        by ``coefficient`` in descending order. The stored coefficient is the
        matrix value itself, so pass an absolute-valued matrix to rank pairs by
        correlation strength.
    """
    correlated_pairs = []
    for i in range(len(corr_matrix.columns)):
        for j in range(i):
            coefficient = corr_matrix.iloc[i, j]
            if abs(coefficient) > threshold:
                pair = (corr_matrix.columns[i], corr_matrix.columns[j])
                correlated_pairs.append((pair, coefficient))
    return sorted(correlated_pairs, key=lambda item: item[1], reverse=True)


# Build train/val/test splits for every merged "<NAME>.csv" in the raw dataset folder.
raw_dataset_dir = os.path.join(RAW_DATA_DIR, DATASET_NAME)

for file_name in sorted(os.listdir(raw_dataset_dir)):
    if not file_name.endswith('.csv'):
        continue
    # Unmerged "<NAME>_dataset.csv" / "*_labels.csv" files (left behind when the merge step
    # skipped them) have no "Anomaly" column and cannot be split.
    if file_name.endswith(('_dataset.csv', '_labels.csv')):
        print(f"Skipping unmerged file {file_name}...")
        continue

    DATA_NAME = file_name.replace('.csv', '')
    dataset_file = os.path.join(raw_dataset_dir, file_name)
    df_dataset = pd.read_csv(dataset_file)
    if 'Anomaly' not in df_dataset.columns:
        print(f"No 'Anomaly' column in {file_name}. Skipping...")
        del df_dataset
        continue
    df_dataset = df_dataset.dropna().drop_duplicates()

    # Create the output folder only once the input is known to be usable.
    processed_folder = os.path.join(PROCESSED_DATA_DIR, f"Kitsune_{DATA_NAME}")
    os.makedirs(processed_folder, exist_ok=True)

    # Test/train split (assuming benign samples are labeled as 0 in the "Anomaly" column):
    # training uses benign rows only; all other rows go to validation/test.
    df_train = df_dataset.query('Anomaly == 0').sample(frac=TRAIN_FRACTION, random_state=RANDOM_SEED)
    df_val_test = df_dataset.drop(df_train.index)
    df_train = df_train.reset_index(drop=True)
    df_val_test = df_val_test.reset_index(drop=True)

    X_train = df_train.drop('Anomaly', axis=1)
    # Stratify so validation and test keep the same benign/anomalous ratio.
    X_val, X_test, y_val, y_test = train_test_split(
        df_val_test.drop('Anomaly', axis=1),
        df_val_test['Anomaly'],
        test_size=TEST_FRACTION,
        stratify=df_val_test['Anomaly'],
        random_state=RANDOM_SEED
    )
    X_val, X_test = X_val.reset_index(drop=True), X_test.reset_index(drop=True)
    y_val, y_test = y_val.reset_index(drop=True), y_test.reset_index(drop=True)
    # The full frame and the intermediate splits are no longer needed; free them early.
    del df_dataset, df_train, df_val_test

    # Correlation analysis on the training set only: for each highly correlated pair
    # (strongest first), drop the second feature unless one of the two is already dropped.
    corr_matrix = X_train.corr().abs()
    correlation_list = get_highly_correlated_features(corr_matrix, CORRELATION_THRESHOLD)
    features_to_drop = []
    for (feature_a, feature_b), _ in correlation_list:
        if feature_a not in features_to_drop and feature_b not in features_to_drop:
            features_to_drop.append(feature_b)
    X_train = X_train.drop(features_to_drop, axis=1)
    X_val = X_val.drop(features_to_drop, axis=1)
    X_test = X_test.drop(features_to_drop, axis=1)

    # Normalization: the scaler is fitted on the training set and applied to all three splits.
    std_scaler = StandardScaler().fit(X_train)
    X_train = pd.DataFrame(std_scaler.transform(X_train), columns=X_train.columns)
    X_val = pd.DataFrame(std_scaler.transform(X_val), columns=X_val.columns)
    X_test = pd.DataFrame(std_scaler.transform(X_test), columns=X_test.columns)

    # Save processed files
    X_train.to_csv(os.path.join(processed_folder, 'X_train.csv'), index=False)
    X_val.to_csv(os.path.join(processed_folder, 'X_val.csv'), index=False)
    X_test.to_csv(os.path.join(processed_folder, 'X_test.csv'), index=False)
    y_val.to_csv(os.path.join(processed_folder, 'y_val.csv'), index=False)
    y_test.to_csv(os.path.join(processed_folder, 'y_test.csv'), index=False)

    del X_train, X_val, X_test, y_val, y_test
