In [1]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from scipy.stats import norm

# Obtener la lista de rutas de los archivos CSV
dspaths = []
for dirname, _, filenames in os.walk('/TFM/DATASET-1/v2/final_csv'):
    for filename in filenames:
        if filename.endswith('.csv'):
            pds = os.path.join(dirname, filename)
            dspaths.append(pds)
            print(pds)

# Cargar los archivos CSV en un diccionario de DataFrames
data_frames = {}
for file in dspaths:
    file_name = os.path.basename(file)
    df = pd.read_csv(file)
    print(f"File: {file_name}, Shape: {df.shape}")
    data_frames[file_name] = df

/TFM/DATASET-1/v2/final_csv\data.csv
File: data.csv, Shape: (8656767, 77)


In [2]:
import pandas as pd

# Function to normalize column names
def normalize_column_names(df):
    column_mapping = {
        'Fwd Bytes/Bulk Avg': 'Fwd Avg Bytes/Bulk',
        'Fwd Packet/Bulk Avg': 'Fwd Avg Packets/Bulk',
        'Fwd Packets/Bulk Avg': 'Fwd Avg Packets/Bulk', # Duplicate column name not really present
        'Fwd Bulk Rate Avg': 'Fwd Avg Bulk Rate',
        'Bwd Bytes/Bulk Avg': 'Bwd Avg Bytes/Bulk',
        'Bwd Packet/Bulk Avg': 'Bwd Avg Packets/Bulk',
        'Bwd Packets/Bulk Avg': 'Bwd Avg Packets/Bulk', # Duplicate column name not really present
        'Bwd Bulk Rate Avg': 'Bwd Avg Bulk Rate',
        'FWD Init Win Bytes': 'Init_Win_bytes_forward', # Column changes 2
        'Bwd Init Win Bytes': 'Init_Win_bytes_backward',
        'Fwd Act Data Pkts': 'act_data_pkt_fwd',
        'Fwd Seg Size Min': 'min_seg_size_forward' ,
        'Total Fwd Packet': 'Total Fwd Packets',
        'Total Bwd packets': 'Total Backward Packets',
        'Total Length of Fwd Packet': 'Total Length of Fwd Packets',
        'Total Length of Bwd Packet': 'Total Length of Bwd Packets',
        'Fwd Segment Size Avg': 'Avg Fwd Segment Size',
        'Bwd Segment Size Avg': 'Avg Bwd Segment Size',
        'Packet Length Min': 'Min Packet Length',
        'Packet Length Max': 'Max Packet Length'
    }
    df.rename(columns=column_mapping, inplace=True)
    return df

# Function to get the index of the label column
def get_label_column_index(df):
    if 'Label' in df.columns:
        return df.columns.get_loc('Label')
    return df.shape[1] - 1

# Crear un DataFrame con todas las muestras benignas
benign_samples = pd.concat([normalize_column_names(df[df.iloc[:, get_label_column_index(df)] == 'BENIGN']) for df in data_frames.values()])
print(f"Label: BENIGN, Number of samples: {len(benign_samples)}")

# Gaussian distribution parameters
mu = 0.5
sigma = 0.2

# Initialize dictionary for malignant samples by label
malign_samples_by_label = {}

# Define the category mapping
category_mapping = {
    'BENIGN': 'BENIGN',
    'Bruteforce DNS': 'Bruteforce',
    'Bruteforce FTP': 'Bruteforce',
    'Bruteforce HTTP': 'Bruteforce',
    'Bruteforce SSH': 'Bruteforce',
    'Bruteforce Telnet': 'Bruteforce',
    'DoS ACK': 'DoS',
    'DoS CWR': 'DoS',
    'DoS ECN': 'DoS',
    'DoS FIN': 'DoS',
    'DoS HTTP': 'DoS',
    'DoS ICMP': 'DoS',
    'DoS MAC': 'DoS',
    'DoS PSH': 'DoS',
    'DoS RST': 'DoS',
    'DoS SYN': 'DoS',
    'DoS UDP': 'DoS',
    'DoS URG': 'DoS',
    'Information Gathering': 'Information Gathering',
    'Mirai DDoS ACK': 'Mirai',
    'Mirai DDoS DNS': 'Mirai',
    'Mirai DDoS GREETH': 'Mirai',
    'Mirai DDoS GREIP': 'Mirai',
    'Mirai DDoS HTTP': 'Mirai',
    'Mirai DDoS SYN': 'Mirai',
    'Mirai DDoS UDP': 'Mirai',
    'Mirai Scan Bruteforce': 'Mirai'
}

# Process each DataFrame
for file, df in data_frames.items():
    # Normalize column names
    df = normalize_column_names(df)
    
    label_col_index = get_label_column_index(df)
    
    # Standardize the label column by replacing 'UDP-lag' and 'UDP-Lag' with 'UDPLag'
    df.iloc[:, label_col_index] = df.iloc[:, label_col_index].replace(['UDP-lag', 'UDP-Lag'], 'UDPLag')
    
    # Apply category mapping
    df.iloc[:, label_col_index] = df.iloc[:, label_col_index].map(category_mapping)
    
    # Get only malignant samples and drop of WEBDDOS
    malign_samples = df[(df.iloc[:, label_col_index] != 'BENIGN') & (df.iloc[:, label_col_index] != 'WebDDoS')]
    
    # Group by label and collect samples
    for label in malign_samples.iloc[:, label_col_index].unique():
        if label not in malign_samples_by_label:
            malign_samples_by_label[label] = pd.DataFrame()
        
        malign_samples_by_label[label] = pd.concat([malign_samples_by_label[label], malign_samples[malign_samples.iloc[:, label_col_index] == label]])

# Display the number of malignant samples per label
for label, samples in malign_samples_by_label.items():
    malign_count = len(samples)
    print(f"Label: {label}, Number of malignant samples: {malign_count}")

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.rename(columns=column_mapping, inplace=True)


Label: BENIGN, Number of samples: 1301
Label: Bruteforce, Number of malignant samples: 35172
Label: DoS, Number of malignant samples: 7490929
Label: Information Gathering, Number of malignant samples: 1038363
Label: Mirai, Number of malignant samples: 91002


Reducción de la base de datos

In [3]:
"""
from scipy.stats import norm

# Gaussian distribution parameters
mu = 0.5
sigma = 0.2

# Initialize dictionary for reduced malignant samples by label
malign_reduced_samples_by_label = {}

# Process each DataFrame
for label, samples in malign_samples_by_label.items():
    # Calculate the probability density function of each sample
    samples['prob'] = norm.pdf(np.linspace(0, 1, len(samples)), mu, sigma)
    
    # Reduce samples to 113828 using the probability weights
    if len(samples) > 113828:
        malign_reduced_samples_by_label[label] = samples.sample(n=113828, weights='prob', random_state=42)
    #else:
        malign_reduced_samples_by_label[label] = samples
    
    print(f"Label: {label}, Original samples: {len(samples)}, Reduced samples: {len(malign_reduced_samples_by_label[label])}")

# Concatenate all reduced malignant samples into one DataFrame
malign_reduced_data = pd.concat(malign_reduced_samples_by_label.values())

# Drop the 'prob' column
malign_reduced_data = malign_reduced_data.drop('prob', axis=1)

# Concatenate all benign samples into one DataFrame
benign_data = benign_samples

# Concatenate all benign and reduced malignant samples into one DataFrame
all_data = pd.concat([benign_data, malign_reduced_data])

print(f"Total samples in final dataset: {len(all_data)}")
print(f"Number of benign samples: {len(benign_data)}")
print(f"Number of malignant samples: {len(malign_reduced_data)}")
print("\nSample distribution:")
print(all_data.iloc[:, get_label_column_index(all_data)].value_counts())
"""

'\nfrom scipy.stats import norm\n\n# Gaussian distribution parameters\nmu = 0.5\nsigma = 0.2\n\n# Initialize dictionary for reduced malignant samples by label\nmalign_reduced_samples_by_label = {}\n\n# Process each DataFrame\nfor label, samples in malign_samples_by_label.items():\n    # Calculate the probability density function of each sample\n    samples[\'prob\'] = norm.pdf(np.linspace(0, 1, len(samples)), mu, sigma)\n    \n    # Reduce samples to 113828 using the probability weights\n    if len(samples) > 113828:\n        malign_reduced_samples_by_label[label] = samples.sample(n=113828, weights=\'prob\', random_state=42)\n    #else:\n        malign_reduced_samples_by_label[label] = samples\n    \n    print(f"Label: {label}, Original samples: {len(samples)}, Reduced samples: {len(malign_reduced_samples_by_label[label])}")\n\n# Concatenate all reduced malignant samples into one DataFrame\nmalign_reduced_data = pd.concat(malign_reduced_samples_by_label.values())\n\n# Drop the \'prob\'

In [4]:
### PRUEBA REDUCIENDO DATASET MALIGNO
import pandas as pd
import numpy as np
from scipy.stats import norm

# Asumimos que data_frames, benign_samples, y get_label_column_index están definidos previamente

# Gaussian distribution parameters
mu = 0.5
sigma = 0.2

# Initialize dictionary for malignant samples by label
malign_samples_by_label = {}

# Process each DataFrame
for file, df in data_frames.items():
    label_col_index = get_label_column_index(df)
    
    # Standardize the label column by replacing 'UDP-lag' and 'UDP-Lag' with 'UDPLag'
    df.iloc[:, label_col_index] = df.iloc[:, label_col_index].replace(['UDP-lag', 'UDP-Lag'], 'UDPLag')
    
    # Get only malignant samples and drop of WEBDDOS
    malign_samples = df[(df.iloc[:, label_col_index] != 'BENIGN') & (df.iloc[:, label_col_index] != 'WebDDoS')]
    
    # Group by label and collect samples
    for label in malign_samples.iloc[:, label_col_index].unique():
        if label not in malign_samples_by_label:
            malign_samples_by_label[label] = pd.DataFrame()
        
        malign_samples_by_label[label] = pd.concat([malign_samples_by_label[label], malign_samples[malign_samples.iloc[:, label_col_index] == label]])



# Display the number of malignant samples per label
for label, samples in malign_samples_by_label.items():
    malign_count = len(samples)
    print(f"Label: {label}, Number of malignant samples: {malign_count}")

# Reduce or increase samples as specified
target_samples = {
    'DoS': 200000,
    'Information Gathering': 200000,
    'Mirai': 100000,
    'Bruteforce': 100000
}

for label, target in target_samples.items():
    if label in malign_samples_by_label:
        current_samples = len(malign_samples_by_label[label])
        if current_samples > target:
            # Reduce samples
            malign_samples_by_label[label] = malign_samples_by_label[label].sample(n=target, random_state=42)
        elif current_samples < target:
            # Increase samples
            additional_samples = target - current_samples
            malign_samples_by_label[label] = pd.concat([malign_samples_by_label[label], malign_samples_by_label[label].sample(n=additional_samples, replace=True, random_state=42)])



# Concatenate all malignant samples into one DataFrame
malign_data = pd.concat(malign_samples_by_label.values())

# Concatenate all benign and malignant samples into one DataFrame
all_data = pd.concat([benign_samples, malign_data])

print(f"Total samples in final dataset: {len(all_data)}")
print(f"Number of benign samples: {len(benign_samples)}")
print(f"Number of malignant samples: {len(malign_data)}")
print("\nSample distribution:")
print(all_data.iloc[:, get_label_column_index(all_data)].value_counts())

# Now you can use all_data for further processing or analysis

Label: Bruteforce, Number of malignant samples: 35172
Label: DoS, Number of malignant samples: 7490929
Label: Information Gathering, Number of malignant samples: 1038363
Label: Mirai, Number of malignant samples: 91002
Total samples in final dataset: 601301
Number of benign samples: 1301
Number of malignant samples: 600000

Sample distribution:
Label
DoS                      200000
Information Gathering    200000
Bruteforce               100000
Mirai                    100000
BENIGN                     1301
Name: count, dtype: int64


In [5]:
"""
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import os

# Definir el mapeo de categorías
category_mapping = {
    'BENIGN': 'BENIGN',
    'Bruteforce DNS': 'Bruteforce',
    'Bruteforce FTP': 'Bruteforce',
    'Bruteforce HTTP': 'Bruteforce',
    'Bruteforce SSH': 'Bruteforce',
    'Bruteforce Telnet': 'Bruteforce',
    'DoS ACK': 'DoS',
    'DoS CWR': 'DoS',
    'DoS ECN': 'DoS',
    'DoS FIN': 'DoS',
    'DoS HTTP': 'DoS',
    'DoS ICMP': 'DoS',
    'DoS MAC': 'DoS',
    'DoS PSH': 'DoS',
    'DoS RST': 'DoS',
    'DoS SYN': 'DoS',
    'DoS UDP': 'DoS',
    'DoS URG': 'DoS',
    'Information Gathering': 'Information Gathering',
    'Mirai DDoS ACK': 'Mirai',
    'Mirai DDoS DNS': 'Mirai',
    'Mirai DDoS GREETH': 'Mirai',
    'Mirai DDoS GREIP': 'Mirai',
    'Mirai DDoS HTTP': 'Mirai',
    'Mirai DDoS SYN': 'Mirai',
    'Mirai DDoS UDP': 'Mirai',
    'Mirai Scan Bruteforce': 'Mirai'
}

# Función para procesar el dataset en lotes
def process_data_in_batches(input_file, output_file, batch_size=100000):
    # Inicializar contadores y almacenamiento temporal
    category_counts = {}
    temp_files = []
    
    # Procesar el archivo de entrada en lotes
    for chunk in pd.read_csv(input_file, chunksize=batch_size):
        # Aplicar el mapeo de categorías
        label_column_index = get_label_column_index(chunk)
        chunk.iloc[:, label_column_index] = chunk.iloc[:, label_column_index].map(category_mapping)
        
        # Actualizar conteos de categorías
        chunk_counts = chunk.iloc[:, label_column_index].value_counts()
        for category, count in chunk_counts.items():
            if category not in category_counts:
                category_counts[category] = 0
            category_counts[category] += count
        
        # Guardar el chunk procesado en un archivo temporal
        temp_file = f'temp_chunk_{len(temp_files)}.csv'
        chunk.to_csv(temp_file, index=False)
        temp_files.append(temp_file)
    
    # Encontrar la categoría con más muestras (excluyendo BENIGN)
    max_samples = max(count for category, count in category_counts.items() if category != 'BENIGN')
    
    # Procesar y balancear los datos
    with open(output_file, 'w') as outfile:
        first_chunk = True
        for temp_file in temp_files:
            for chunk in pd.read_csv(temp_file, chunksize=batch_size):
                # Separar BENIGN del resto
                benign = chunk[chunk.iloc[:, label_column_index] == 'BENIGN']
                malicious = chunk[chunk.iloc[:, label_column_index] != 'BENIGN']
                
                # Sobremuestrear categorías maliciosas
                balanced_malicious = pd.DataFrame()
                for category in malicious.iloc[:, label_column_index].unique():
                    category_data = malicious[malicious.iloc[:, label_column_index] == category]
                    samples_needed = max_samples - category_counts[category]
                    if samples_needed > 0:
                        oversampled = category_data.sample(n=samples_needed, replace=True, random_state=42)
                        balanced_malicious = pd.concat([balanced_malicious, category_data, oversampled])
                        category_counts[category] = max_samples
                    else:
                        balanced_malicious = pd.concat([balanced_malicious, category_data])
                
                # Combinar BENIGN y datos maliciosos balanceados
                balanced_chunk = pd.concat([benign, balanced_malicious])
                
                # Escribir en el archivo de salida
                balanced_chunk.to_csv(outfile, mode='a', header=first_chunk, index=False)
                first_chunk = False
            
            # Eliminar el archivo temporal después de procesarlo
            os.remove(temp_file)
    
    return category_counts

# Procesar y balancear los datos
input_file = 'data.csv'  # Reemplazar con el nombre de tu archivo de entrada
output_file = 'balanced_reduced_categories_dataset.csv'
final_category_counts = process_data_in_batches(input_file, output_file)

print("Distribución final de categorías:")
for category, count in final_category_counts.items():
    print(f"{category}: {count}")

# Dividir los datos en conjuntos de entrenamiento y prueba
def split_data(input_file, train_file, test_file, test_size=0.3, batch_size=100000):
    # Leer el archivo de entrada en lotes y escribir en archivos de entrenamiento y prueba
    for chunk in pd.read_csv(input_file, chunksize=batch_size):
        train, test = train_test_split(chunk, test_size=test_size, stratify=chunk.iloc[:, get_label_column_index(chunk)])
        train.to_csv(train_file, mode='a', header=not os.path.exists(train_file), index=False)
        test.to_csv(test_file, mode='a', header=not os.path.exists(test_file), index=False)

# Dividir los datos
train_file = 'train_balanced_reduced_categories.csv'
test_file = 'test_balanced_reduced_categories.csv'
split_data(output_file, train_file, test_file)

print("\nLos conjuntos de datos han sido procesados, balanceados y divididos en archivos de entrenamiento y prueba.")
"""

'\nimport pandas as pd\nimport numpy as np\nfrom sklearn.model_selection import train_test_split\nimport os\n\n# Definir el mapeo de categorías\ncategory_mapping = {\n    \'BENIGN\': \'BENIGN\',\n    \'Bruteforce DNS\': \'Bruteforce\',\n    \'Bruteforce FTP\': \'Bruteforce\',\n    \'Bruteforce HTTP\': \'Bruteforce\',\n    \'Bruteforce SSH\': \'Bruteforce\',\n    \'Bruteforce Telnet\': \'Bruteforce\',\n    \'DoS ACK\': \'DoS\',\n    \'DoS CWR\': \'DoS\',\n    \'DoS ECN\': \'DoS\',\n    \'DoS FIN\': \'DoS\',\n    \'DoS HTTP\': \'DoS\',\n    \'DoS ICMP\': \'DoS\',\n    \'DoS MAC\': \'DoS\',\n    \'DoS PSH\': \'DoS\',\n    \'DoS RST\': \'DoS\',\n    \'DoS SYN\': \'DoS\',\n    \'DoS UDP\': \'DoS\',\n    \'DoS URG\': \'DoS\',\n    \'Information Gathering\': \'Information Gathering\',\n    \'Mirai DDoS ACK\': \'Mirai\',\n    \'Mirai DDoS DNS\': \'Mirai\',\n    \'Mirai DDoS GREETH\': \'Mirai\',\n    \'Mirai DDoS GREIP\': \'Mirai\',\n    \'Mirai DDoS HTTP\': \'Mirai\',\n    \'Mirai DDoS SYN\': 

In [6]:
from sklearn.model_selection import train_test_split

# Dividir los datos en conjuntos de entrenamiento y prueba (70% entrenamiento, 30% prueba)
train_data, test_val_data = train_test_split(all_data, test_size=0.3, random_state=42)

# Dividir el conjunto de prueba en conjuntos de prueba y validación (50% prueba, 50% validación)
test_data, val_data = train_test_split(test_val_data, test_size=0.5, random_state=42)

# Guardar los conjuntos de entrenamiento en archivos CSV
train_data.to_csv('train_1.csv', index=False)

# Guardar los conjuntos de prueba en archivos CSV
test_val_data.to_csv('test_1.csv', index=False)

# Guardar los conjuntos de validación en archivos CSV
val_data.to_csv('validation_1.csv', index=False)