In [37]:
import numpy as np
import pandas as pd
# Load the raw flow records from the CSV export.
# NOTE(review): the output echoed under this cell looks like a pandas warning
# pointing at this read_csv call (possibly mixed dtypes) — confirm, and consider
# passing explicit dtypes or low_memory=False if so.
df = pd.read_csv('dataset.csv')

# Header names are stripped of surrounding whitespace; the original note says
# lookups of the label column raised an error before this was added.
df.columns = df.columns.str.strip()

# Preview goes last so the cell actually renders it, with the cleaned headers.
df.head()

  df = pd.read_csv('dataset.csv')


In [38]:
# Show (rows, columns) of the loaded frame.
df.shape

(2214469, 79)

In [39]:
# Display the column labels of the frame (whitespace already stripped at load time).
df.columns

Index(['Destination Port', 'Flow Duration', 'Total Fwd Packets',
       'Total Backward Packets', 'Total Length of Fwd Packets',
       'Total Length of Bwd Packets', 'Fwd Packet Length Max',
       'Fwd Packet Length Min', 'Fwd Packet Length Mean',
       'Fwd Packet Length Std', 'Bwd Packet Length Max',
       'Bwd Packet Length Min', 'Bwd Packet Length Mean',
       'Bwd Packet Length Std', 'Flow Bytes/s', 'Flow Packets/s',
       'Flow IAT Mean', 'Flow IAT Std', 'Flow IAT Max', 'Flow IAT Min',
       'Fwd IAT Total', 'Fwd IAT Mean', 'Fwd IAT Std', 'Fwd IAT Max',
       'Fwd IAT Min', 'Bwd IAT Total', 'Bwd IAT Mean', 'Bwd IAT Std',
       'Bwd IAT Max', 'Bwd IAT Min', 'Fwd PSH Flags', 'Bwd PSH Flags',
       'Fwd URG Flags', 'Bwd URG Flags', 'Fwd Header Length',
       'Bwd Header Length', 'Fwd Packets/s', 'Bwd Packets/s',
       'Min Packet Length', 'Max Packet Length', 'Packet Length Mean',
       'Packet Length Std', 'Packet Length Variance', 'FIN Flag Count',
       'SYN Flag Co

In [55]:
# Sanity check: the target column must be reachable under the exact name 'Label'.
label_present = 'Label' in df.columns
print(label_present)

True


In [59]:
# Columns considered irrelevant or harmful for C2 beaconing detection.
# The list is assembled group by group; the final order is significant only
# for readability, the contents are what the drop step consumes.
drop_cols = []

# TCP flag counters that are rare / mostly zero.
drop_cols += [
    'Fwd PSH Flags',
    'Bwd PSH Flags',
    'Fwd URG Flags',
    'Bwd URG Flags',
    'CWE Flag Count',
    'ECE Flag Count',
]

# Bulk-transfer features (beaconing is low-volume).
drop_cols += [
    'Fwd Avg Bytes/Bulk',
    'Fwd Avg Packets/Bulk',
    'Fwd Avg Bulk Rate',
    'Bwd Avg Bytes/Bulk',
    'Bwd Avg Packets/Bulk',
    'Bwd Avg Bulk Rate',
]

# Duplicate of an existing header-length column.
drop_cols.append('Fwd Header Length.1')

In [60]:
# Remove the unwanted columns. errors='ignore' keeps the cell re-runnable, but
# on its own it would also hide a misspelled column name, so report anything
# in drop_cols that the frame does not contain.
missing_drop_cols = [col for col in drop_cols if col not in df.columns]
if missing_drop_cols:
    print(f"drop_cols entries not present in df (skipped): {missing_drop_cols}")

df = df.drop(columns=drop_cols, errors='ignore')

In [61]:
# Show (rows, columns) after the column drop.
# NOTE(review): the recorded output has fewer rows than the shape printed right
# after loading, yet no visible cell in between removes rows, and the execution
# counters are not consecutive. A row-filtering step seems to be missing from
# the notebook — confirm with Restart & Run All.
df.shape

(2212030, 66)

In [62]:
# The destination port is removed so it cannot leak into the features.
df = df.drop('Destination Port', axis=1)

In [63]:
# Show (rows, columns) after removing 'Destination Port'.
df.shape

(2212030, 65)

## Features Used

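### Timing
- Flow Duration  
- Flow Inter-Arrival Time (IAT): Mean, Standard Deviation, Minimum, Maximum  
- Forward / Backward IAT: Mean, Standard Deviation  
- Active Time Statistics (Mean, Standard Deviation)  
- Idle Time Statistics (Mean, Standard Deviation)  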

### Packet Behavior
- Total Forward / Backward Packet Counts  
- Packet Length Statistics (Mean / Min / Max / Std / Variance)  
- Average Packet Size  

### Directional Asymmetry
- Forward (Fwd) vs Backward (Bwd) Packet Counts  
- Downstream / Upstream Packet Ratio  


In [65]:
# Columns retained for modelling: the target plus the C2-relevant features,
# written as one sub-list per feature family and joined into a single list.
keep_cols = (
    # Target
    ['Label']
    # Flow-level timing
    + [
        'Flow Duration',
        'Flow IAT Mean',
        'Flow IAT Std',
        'Flow IAT Max',
        'Flow IAT Min',
    ]
    # Per-direction inter-arrival timing
    + [
        'Fwd IAT Mean',
        'Fwd IAT Std',
        'Bwd IAT Mean',
        'Bwd IAT Std',
    ]
    # Packet counts per direction
    + [
        'Total Fwd Packets',
        'Total Backward Packets',
    ]
    # Packet length behaviour
    + [
        'Min Packet Length',
        'Max Packet Length',
        'Packet Length Mean',
        'Packet Length Std',
        'Packet Length Variance',
        'Average Packet Size',
    ]
    # Directional behaviour
    + ['Down/Up Ratio']
    # Active / idle timing (beacon periodicity)
    + [
        'Active Mean',
        'Active Std',
        'Idle Mean',
        'Idle Std',
    ]
)

In [66]:
# Keep only the C2-relevant columns, in the order given by keep_cols.
df = df.loc[:, keep_cols]

In [67]:
# Show (rows, columns) after restricting the frame to keep_cols.
df.shape

(2212030, 23)

In [68]:
# Turn +/- infinity into NaN so the following dropna removes those rows too.
df = df.replace(to_replace=[np.inf, -np.inf], value=np.nan)

In [69]:
# Discard every row that still contains a missing value.
df = df.dropna()

In [None]:
# Reduce dataset size while preserving class ratio: keep 25% of every class.
# Each class is sampled with its own random_state=42, and the per-class samples
# are concatenated in group-key order.
#
# The groups are iterated directly instead of going through GroupBy.apply:
# apply operating on the grouping column is deprecated (pandas >= 2.2) and
# newer versions exclude that column from the sub-frames, which would drop
# 'Label' from the result and break the label-encoding step that follows.
df = pd.concat(
    [
        class_rows.sample(frac=0.25, random_state=42)
        for _, class_rows in df.groupby('Label')
    ]
)

In [71]:
# Binarise the target: 0 = BENIGN, 1 = any attack.
# The guard makes the cell idempotent: once 'Label' is numeric, re-running an
# unguarded version would compare 0/1 against 'BENIGN' and set every row to 1.
# The comparison is vectorised rather than applied row by row.
if not pd.api.types.is_numeric_dtype(df['Label']):
    df['Label'] = (df['Label'] != 'BENIGN').astype('int64')

In [None]:
!pip install pyarrow

In [73]:
# Persist the cleaned frame as Parquet (fast to load, compressed); the index is not written.
parquet_path = "cicids2017_c2_clean.parquet"
df.to_parquet(parquet_path, index=False)

In [74]:
# Show (rows, columns) of the frame that was written to Parquet.
df.shape

(553008, 23)