In [7]:
import pandas as pd

Load the cleaned multiclass dataset

In [8]:
# Read the cleaned multiclass dataset from the working directory.
df = pd.read_csv('CICEVSE2024_Cleaned.csv')

# Show how many rows carry each original label; the trailing blank line
# separates this summary from whatever is printed next.
original_label_counts = df['Label'].value_counts()
print("Original label distribution:", original_label_counts, sep="\n", end="\n\n")

Original label distribution:
Label
SynonymousIP_Flood           256730
TCP_Flood                    256315
PSHACK_Flood                 195952
SYN_Flood                    195002
SYN_Stealth_Scan              77278
TCP_Port_Scan                 64455
Service_Version_Detection     46334
Vulnerability_Scan            38023
UDP_Flood                     32475
OS_Fingerprinting             26080
Aggressive_Scan               21762
Slowloris_Scan                 2340
Benign                           82
ICMP_Flood                       32
ICMP_Fragmentation               28
Name: count, dtype: int64



Convert the multiclass labels to binary labels
Rows labelled 'Benign' stay 'Benign'; every other label becomes 'Attack'

In [9]:
# Collapse the multiclass labels into two classes: a row whose label is exactly
# 'Benign' keeps it, and every other value (including missing ones) becomes
# 'Attack'.
# Series.where keeps the entries where the mask is True and substitutes the
# replacement elsewhere, so the relabelling is vectorized instead of invoking a
# Python lambda once per row.
# Re-running this cell is harmless: 'Benign' and 'Attack' both map to themselves.
is_benign = df['Label'] == 'Benign'
df['Label'] = df['Label'].where(is_benign, 'Attack')

print("Binary label distribution:")
print(df['Label'].value_counts())
print()

Binary label distribution:
Label
Attack    1212806
Benign         82
Name: count, dtype: int64



Calculate the percentage distribution

In [10]:
# Report each class's share of the full dataset.
label_counts = df['Label'].value_counts()
total_samples = len(df)  # denominator is every row in the frame

print("Percentage distribution:")
for class_name, n_rows in zip(label_counts.index, label_counts.tolist()):
    # Share is rounded to two decimals, so very small classes show as ~0.01%.
    share_pct = (n_rows / total_samples) * 100
    print(f"{class_name}: {n_rows} samples ({share_pct:.2f}%)")

Percentage distribution:
Attack: 1212806 samples (99.99%)
Benign: 82 samples (0.01%)


Shuffle the dataset to mix Attack and Benign samples randomly

In [11]:
# frac=1 draws every row once (sampling is without replacement by default),
# which amounts to a random permutation; the fixed seed makes it repeatable.
shuffled_rows = df.sample(frac=1, random_state=42)
# Discard the original row positions so the index runs 0..n-1 again.
df_shuffled = shuffled_rows.reset_index(drop=True)

print("Dataset shuffled successfully!")
print("First 10 labels after shuffling:")
first_labels = df_shuffled['Label'].iloc[:10].tolist()
print(first_labels)

Dataset shuffled successfully!
First 10 labels after shuffling:
['Attack', 'Attack', 'Attack', 'Attack', 'Attack', 'Attack', 'Attack', 'Attack', 'Attack', 'Attack']


Save the binary classification dataset to a new CSV file

In [13]:
output_filename = 'CICEVSE2024_Binary_Classification.csv'

# Shuffling only reorders rows, so the per-class counts must be identical to
# those of the unshuffled frame. Check this before anything is written to disk
# rather than relying on a visual comparison of printed output.
counts_before = df['Label'].value_counts().sort_index()
counts_after = df_shuffled['Label'].value_counts().sort_index()
assert counts_after.equals(counts_before), "Label distribution changed during shuffling"

# index=False keeps the positional index out of the CSV.
df_shuffled.to_csv(output_filename, index=False)

print(f"\nBinary classification dataset saved as '{output_filename}'")
print(f"Dataset shape: {df_shuffled.shape}")
print(f"Total samples: {len(df_shuffled)}")

# Show the final class counts for the reader (already asserted equal above).
print("\nFinal label distribution (after shuffling):")
print(df_shuffled['Label'].value_counts())


Binary classification dataset saved as 'CICEVSE2024_Binary_Classification.csv'
Dataset shape: (1212888, 42)
Total samples: 1212888

Final label distribution (after shuffling):
Label
Attack    1212806
Benign         82
Name: count, dtype: int64
