In [16]:
import ast
import os
import re
import shutil

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Seed NumPy's global random generator so repeated runs use the same stream.
RANDOM_SEED = 99
np.random.seed(RANDOM_SEED)

# All paths are anchored at the current working directory; the per-run
# result folders are read from its 'results' subdirectory.
BASE_DIR = os.getcwd()
RESULT_DIR = os.path.join(BASE_DIR, 'results')

# Function to safely parse dictionary-like strings
def parse_dict_string(dict_str):
    """Turn the string form of a dict back into a Python object.

    Parameters
    ----------
    dict_str : Any
        Value to parse. Anything that is not a ``str``, or a string that does
        not start with ``'{'``, is returned unchanged.

    Returns
    -------
    Any
        The object produced by ``ast.literal_eval`` after NumPy scalar
        wrappers such as ``np.float64(0.5)`` have been reduced to their plain
        literal (``0.5``). If the cleaned string cannot be evaluated, the
        error and the offending string are printed and ``{}`` is returned.
    """
    if not isinstance(dict_str, str):
        return dict_str

    if not dict_str.startswith('{'):
        return dict_str

    try:
        # Unwrap only the NumPy scalar wrapper itself. Removing every
        # parenthesis in the string would also flatten tuples such as
        # ``(2, 3)`` and corrupt the structure being parsed.
        cleaned_str = re.sub(
            r'\bnp\.(?:float|int|uint)\d+\(([^()]*)\)', r'\1', dict_str
        )
        return ast.literal_eval(cleaned_str)
    except (SyntaxError, ValueError, TypeError) as e:
        # literal_eval raises TypeError for e.g. unhashable dict keys; treat
        # it like any other unparsable input instead of letting it escape.
        print(f"Error parsing dictionary string: {e}")
        print(f"Problematic string: {dict_str}")
        return {}

In [17]:
# Collect results from subfolders
results_dfs = []
# os.listdir returns entries in arbitrary order; sorting makes the order of
# results_dfs (and of everything printed from it) the same on every run.
for folder_name in sorted(os.listdir(RESULT_DIR)):
    folder_path = os.path.join(RESULT_DIR, folder_name)
    if not os.path.isdir(folder_path):
        continue

    # The last two '_'-separated pieces of the folder name are dropped (a date
    # and an id, per the original comment); the rest is the dataset name and
    # may itself contain underscores. At least the first piece is always kept,
    # so short folder names still yield a non-empty dataset name.
    parts = folder_name.split('_')
    dataset_name = '_'.join(parts[:max(1, len(parts) - 2)])

    results_csv_path = os.path.join(folder_path, 'results.csv')
    if not os.path.exists(results_csv_path):
        print(f"Warning: 'results.csv' not found in '{folder_path}'")
        continue

    try:
        df = pd.read_csv(results_csv_path)
    except pd.errors.EmptyDataError:
        print(f"Warning: 'results.csv' in '{folder_path}' is empty.")
    except pd.errors.ParserError:
        print(f"Warning: Could not parse 'results.csv' in '{folder_path}'. Check file format.")
    else:
        # Tag every row with the dataset it came from before collecting it.
        df['dataset_name'] = dataset_name
        results_dfs.append(df)



In [19]:
# Process the data
# Split every result row into one of two lists of
# [dataset_name, metric_type, auc, precision] records.
data_collective = []
data_pointwise = []

for df in results_dfs:
    for _, row in df.iterrows():
        # Missing columns fall back to '' / None instead of raising.
        metric_type = row.get('metric_type', '')
        record = [
            row['dataset_name'],
            metric_type,
            row.get('auc', None),
            # NOTE(review): this value is read from the 'precision' column,
            # but the DataFrames built from these lists label it 'ap'.
            # Confirm that results.csv stores average precision there.
            row.get('precision', None),
        ]

        # Anything that is not explicitly 'collective' is treated as pointwise.
        if metric_type == 'collective':
            data_collective.append(record)
        else:
            data_pointwise.append(record)
print(data_collective)
print(data_pointwise)

[['BNA', 'collective', 0.9983606557377048, 1.0], ['CICIDS_2017', 'collective', 0.5249125, 0.2084031664716609], ['CIDDS-001_ICMP', 'collective', 0.9910161742542246, 0.9863013698630136], ['CIDDS-001_TCP', 'collective', 0.5, 0.0], ['CIDDS-001_UDP', 'collective', 0.5, 0.0], ['CTG', 'collective', 0.8788237407072438, 0.895397489539749], ['DCCC', 'collective', 0.6576493578141884, 0.7306924101198402], ['HTRU2', 'collective', 0.9402514395796796, 0.9435637285986048], ['Kitsune_Active_Wiretap', 'collective', 0.952575, 0.8911850331694745], ['Kitsune_ARP_MitM', 'collective', 0.8333825791779926, 0.552682513760358], ['Kitsune_Fuzzing', 'collective', 0.89705, 0.5521312565822294], ['Kitsune_Mirai', 'collective', 0.942695761100437, 0.99998507618607], ['Kitsune_OS_Scan', 'collective', 0.99181875, 0.9905068059671506], ['Kitsune_SSDP_Flood', 'collective', 0.999675, 0.9986015383078614], ['Kitsune_SSL_Renegotiation', 'collective', 0.97575625, 0.8426695934822069], ['Kitsune_SYN_DoS', 'collective', 0.905853123

In [20]:
# Build one frame per metric type; both lists share the same four-field layout.
# NOTE(review): the 'ap' column is filled from the value read out of the
# 'precision' column of results.csv — confirm that is the intended metric.
result_columns = ['dataset_name', 'metric_type', 'auc', 'ap']
df_collective = pd.DataFrame(data_collective, columns=result_columns)
df_pointwise = pd.DataFrame(data_pointwise, columns=result_columns)

In [21]:
# Suffix the metric columns with the metric type they belong to, so both sets
# can sit side by side in a single frame.
df_collective = df_collective.rename(
    columns={'auc': 'auc_collective', 'ap': 'ap_collective'}
)
df_pointwise = df_pointwise.rename(
    columns={'auc': 'auc_pointwise', 'ap': 'ap_pointwise'}
)

# Outer-join on dataset_name so a dataset present in only one of the two
# frames is kept (its missing metrics become NaN).
# NOTE(review): if a dataset has more than one row in either frame, the join
# produces every pairing of those rows — confirm one row per dataset is expected.
pointwise_view = df_pointwise[['dataset_name', 'auc_pointwise', 'ap_pointwise']]
collective_view = df_collective[['dataset_name', 'auc_collective', 'ap_collective']]
df_merged = pointwise_view.merge(collective_view, on='dataset_name', how='outer')

# Show the first rows as a quick sanity check.
print(df_merged.head())

# Persist the merged table next to the per-run result folders.
csv_file_path = os.path.join(RESULT_DIR, 'merged_results.csv')
df_merged.to_csv(csv_file_path, index=False)

     dataset_name  auc_pointwise  ap_pointwise  auc_collective  ap_collective
0             BNA       0.952566      0.983389        0.998361       1.000000
1     CICIDS_2017       0.519156      0.206435        0.524913       0.208403
2  CIDDS-001_ICMP       0.850064      0.796149        0.991016       0.986301
3   CIDDS-001_TCP       0.500000      0.000000        0.500000       0.000000
4   CIDDS-001_UDP       0.500000      0.000000        0.500000       0.000000
