# Analysis of CSECICIDS2018

In [1]:
# Data Loading
import pandas as pd
import os
import numpy as np
import pyarrow as pa
import pyarrow.parquet as pq
from collections import defaultdict
from tqdm import tqdm

In [2]:
filenames = [f for f in os.listdir('data/CSECICIDS2018_improved') if f.endswith('.csv')]

In [None]:
all_filenames = [
    'Thursday-22-02-2018.csv',
    'Friday-16-02-2018.csv',
    # 'Thursday-15-02-2018.csv',
    'Wednesday-14-02-2018.csv',
    'Tuesday-20-02-2018.csv',
    'Wednesday-21-02-2018.csv',
    # 'Friday-23-02-2018.csv',
    # 'Wednesday-28-02-2018.csv',
    # 'Friday-02-03-2018.csv',
    # 'Thursday-01-03-2018.csv'
]

In [28]:
active_filenames = [
    'Thursday-22-02-2018.csv',
    'Friday-16-02-2018.csv',
    'Wednesday-14-02-2018.csv',
    'Tuesday-20-02-2018.csv',
    'Wednesday-21-02-2018.csv',
]

In [11]:
EDGE_COLS = [
    'Bwd Packet Length Min', 'Protocol', 'Bwd Packets/s', 'FWD Init Win Bytes',
    'Packet Length Std', 'FIN Flag Count',
    'Packet Length Min', 'Fwd Seg Size Min',
    'Bwd IAT Total', 'SYN Flag Count', 'Bwd Packet Length Std'
]
LABEL_COL = "Label"
ID_COLS = ['Src IP', 'Dst IP', 'Timestamp']
COLS_TO_KEEP = EDGE_COLS + [LABEL_COL] + ID_COLS

In [12]:
data_dir = 'data/CSECICIDS2018_improved'
output_dir = 'data/processed_chunks'
final_path = 'data/combined_active.parquet'
os.makedirs(output_dir, exist_ok=True)

In [44]:
# Initialize global statistics
malicious_ips_src = defaultdict(lambda: {
    'first_ts': None, 
    'last_ts': None, 
    'files': set(), 
    'connection_count': 0
})

malicious_ips_dst = defaultdict(lambda: {
    'first_ts': None, 
    'last_ts': None, 
    'files': set(), 
    'connection_count': 0
})

per_file_stats = {}
chunksize = 200_000

for i, filename in enumerate(filenames, 1):
    file_path = os.path.join(data_dir, filename)
    print(f'[{i}/{len(filenames)}] Scanning: {filename}')
    
    # Initialize per-file tracking
    file_malicious_src_ips = set()
    file_malicious_dst_ips = set()
    num_malicious_edges = 0
    num_benign_edges = 0

    for chunk in pd.read_csv(file_path, usecols=COLS_TO_KEEP, chunksize=chunksize):
        # Parse Timestamp column to datetime
        chunk['Timestamp'] = pd.to_datetime(chunk['Timestamp'], errors='coerce')
        
        # Separate malicious and benign rows
        malicious_mask = chunk[LABEL_COL] != 'BENIGN'
        malicious_chunk = chunk[malicious_mask]
        benign_chunk = chunk[~malicious_mask]
        
        # Count edges
        num_malicious_edges += len(malicious_chunk)
        num_benign_edges += len(benign_chunk)
        
        # Process malicious connections
        if not malicious_chunk.empty:
            # Track unique malicious IPs in this file
            mal_src_ips = malicious_chunk['Src IP'].unique()
            mal_dst_ips = malicious_chunk['Dst IP'].unique()
            file_malicious_src_ips.update(mal_src_ips)
            file_malicious_dst_ips.update(mal_dst_ips)
            
            # Update global malicious IP statistics (src)
            for src_ip in mal_src_ips:
                src_rows = malicious_chunk[malicious_chunk['Src IP'] == src_ip]
                
                malicious_ips_src[src_ip]['files'].add(filename)
                malicious_ips_src[src_ip]['connection_count'] += len(src_rows)
                
                # Update timestamps
                timestamps = src_rows['Timestamp'].dropna()
                if not timestamps.empty:
                    min_ts = timestamps.min()
                    max_ts = timestamps.max()
                    
                    if malicious_ips_src[src_ip]['first_ts'] is None or min_ts < malicious_ips_src[src_ip]['first_ts']:
                        malicious_ips_src[src_ip]['first_ts'] = min_ts
                    if malicious_ips_src[src_ip]['last_ts'] is None or max_ts > malicious_ips_src[src_ip]['last_ts']:
                        malicious_ips_src[src_ip]['last_ts'] = max_ts
            
            # Update global malicious IP statistics (dst)
            for dst_ip in mal_dst_ips:
                dst_rows = malicious_chunk[malicious_chunk['Dst IP'] == dst_ip]
                
                malicious_ips_dst[dst_ip]['files'].add(filename)
                malicious_ips_dst[dst_ip]['connection_count'] += len(dst_rows)
                
                # Update timestamps
                timestamps = dst_rows['Timestamp'].dropna()
                if not timestamps.empty:
                    min_ts = timestamps.min()
                    max_ts = timestamps.max()
                    
                    if malicious_ips_dst[dst_ip]['first_ts'] is None or min_ts < malicious_ips_dst[dst_ip]['first_ts']:
                        malicious_ips_dst[dst_ip]['first_ts'] = min_ts
                    if malicious_ips_dst[dst_ip]['last_ts'] is None or max_ts > malicious_ips_dst[dst_ip]['last_ts']:
                        malicious_ips_dst[dst_ip]['last_ts'] = max_ts
    
    # Calculate per-file statistics
    malicious_ips_both = file_malicious_src_ips.intersection(file_malicious_dst_ips)
    
    per_file_stats[filename] = {
        'num_malicious_edges': num_malicious_edges,
        'num_benign_edges': num_benign_edges,
        'total_edges': num_malicious_edges + num_benign_edges,
        'malicious_edge_ratio': num_malicious_edges / (num_malicious_edges + num_benign_edges) if (num_malicious_edges + num_benign_edges) > 0 else 0,
        'num_malicious_src_ips': len(file_malicious_src_ips),
        'num_malicious_dst_ips': len(file_malicious_dst_ips),
        'num_malicious_ips_both': len(malicious_ips_both),
        'malicious_src_ips': file_malicious_src_ips,
        'malicious_dst_ips': file_malicious_dst_ips
    }
    
    print(f"  Malicious edges: {num_malicious_edges:,} | Benign edges: {num_benign_edges:,}")
    print(f"  Malicious IPs - Src: {len(file_malicious_src_ips)}, Dst: {len(file_malicious_dst_ips)}, Both: {len(malicious_ips_both)}")

# After processing all files, calculate IP role statistics
ip_role_stats = {
    'attacker_only': set(malicious_ips_src.keys()) - set(malicious_ips_dst.keys()),
    'victim_only': set(malicious_ips_dst.keys()) - set(malicious_ips_src.keys()),
    'both_roles': set(malicious_ips_src.keys()).intersection(set(malicious_ips_dst.keys())),
}

print("\n" + "="*60)
print("GLOBAL STATISTICS")
print("="*60)
print(f"Total unique malicious source IPs: {len(malicious_ips_src)}")
print(f"Total unique malicious destination IPs: {len(malicious_ips_dst)}")
print(f"IPs acting as attackers only: {len(ip_role_stats['attacker_only'])}")
print(f"IPs acting as victims only: {len(ip_role_stats['victim_only'])}")
print(f"IPs acting in both roles: {len(ip_role_stats['both_roles'])}")

[1/10] Scanning: Thursday-22-02-2018.csv
  Malicious edges: 208 | Benign edges: 6,070,945
  Malicious IPs - Src: 1, Dst: 1, Both: 0
[2/10] Scanning: Friday-16-02-2018.csv
  Malicious edges: 1,908,766 | Benign edges: 5,481,500
  Malicious IPs - Src: 2, Dst: 1, Both: 0
[3/10] Scanning: Thursday-15-02-2018.csv
  Malicious edges: 37,631 | Benign edges: 5,372,471
  Malicious IPs - Src: 2, Dst: 1, Both: 0
[4/10] Scanning: Wednesday-14-02-2018.csv
  Malicious edges: 287,551 | Benign edges: 5,610,799
  Malicious IPs - Src: 2, Dst: 1, Both: 0
[5/10] Scanning: Tuesday-20-02-2018.csv
  Malicious edges: 290,205 | Benign edges: 5,764,497
  Malicious IPs - Src: 11, Dst: 11, Both: 11
[6/10] Scanning: Wednesday-21-02-2018.csv
  Malicious edges: 1,084,194 | Benign edges: 5,878,399
  Malicious IPs - Src: 11, Dst: 11, Both: 11
[7/10] Scanning: Friday-23-02-2018.csv
  Malicious edges: 230 | Benign edges: 5,976,251
  Malicious IPs - Src: 1, Dst: 1, Both: 0
[8/10] Scanning: Wednesday-28-02-2018.csv
  Malici

In [45]:
print("\n" + "="*60)
print("IP-LEVEL STATISTICS")
print("="*60)
print(f"Total unique malicious SRC IPs: {len(malicious_ips_src)}")
print(f"Total unique malicious DST IPs: {len(malicious_ips_dst)}")
print(f"Total unique malicious IPs: {len(set(ip_role_stats['attacker_only']).union(set(ip_role_stats['victim_only'])).union(set(ip_role_stats['both_roles'])))}")


IP-LEVEL STATISTICS
Total unique malicious SRC IPs: 30
Total unique malicious DST IPs: 49
Total unique malicious IPs: 57


In [32]:
for filename, stats in per_file_stats.items():
    print("\n" + "="*60)
    print(f"FILE: {filename}")
    print("="*60)
    for stat_name, stat_value in stats.items():
        print(f"{stat_name}: {stat_value}")


FILE: Thursday-22-02-2018.csv
num_malicious_edges: 208
num_benign_edges: 6070945
total_edges: 6071153
malicious_edge_ratio: 3.4260378547534546e-05
num_malicious_src_ips: 1
num_malicious_dst_ips: 1
num_malicious_ips_both: 0

FILE: Friday-16-02-2018.csv
num_malicious_edges: 1908766
num_benign_edges: 5481500
total_edges: 7390266
malicious_edge_ratio: 0.25828109570075014
num_malicious_src_ips: 2
num_malicious_dst_ips: 1
num_malicious_ips_both: 0

FILE: Thursday-15-02-2018.csv
num_malicious_edges: 37631
num_benign_edges: 5372471
total_edges: 5410102
malicious_edge_ratio: 0.006955691408405978
num_malicious_src_ips: 2
num_malicious_dst_ips: 1
num_malicious_ips_both: 0

FILE: Wednesday-14-02-2018.csv
num_malicious_edges: 287551
num_benign_edges: 5610799
total_edges: 5898350
malicious_edge_ratio: 0.048751091406918884
num_malicious_src_ips: 2
num_malicious_dst_ips: 1
num_malicious_ips_both: 0

FILE: Tuesday-20-02-2018.csv
num_malicious_edges: 290205
num_benign_edges: 5764497
total_edges: 605470

In [101]:
# overlap between 23-02 and 01-03
mal_inter_scr_ip = per_file_stats['Friday-23-02-2018.csv']['malicious_src_ips'].intersection(per_file_stats['Thursday-01-03-2018.csv']['malicious_src_ips'])
mal_inter_dst_ip = per_file_stats['Friday-23-02-2018.csv']['malicious_dst_ips'].intersection(per_file_stats['Thursday-01-03-2018.csv']['malicious_dst_ips'])
mal_inter_ip = mal_inter_scr_ip.union(mal_inter_dst_ip)
print(f"Overlap between 23-02 and 01-03: {len(mal_inter_ip)}")

Overlap between 23-02 and 01-03: 0


In [102]:
# overlap between 28-03 and 01-03
mal_inter_scr_ip = per_file_stats['Wednesday-28-02-2018.csv']['malicious_src_ips'].intersection(per_file_stats['Thursday-01-03-2018.csv']['malicious_src_ips'])
mal_inter_dst_ip = per_file_stats['Wednesday-28-02-2018.csv']['malicious_dst_ips'].intersection(per_file_stats['Thursday-01-03-2018.csv']['malicious_dst_ips'])
mal_inter_ip = mal_inter_scr_ip.union(mal_inter_dst_ip)
print(f"Overlap between 28-02 and 01-03: {len(mal_inter_ip)}")

Overlap between 28-02 and 01-03: 26


In [103]:
# overlap between 01-03 and 02-03
mal_inter_scr_ip = per_file_stats['Friday-02-03-2018.csv']['malicious_src_ips'].intersection(per_file_stats['Thursday-01-03-2018.csv']['malicious_src_ips'])
mal_inter_dst_ip = per_file_stats['Friday-02-03-2018.csv']['malicious_dst_ips'].intersection(per_file_stats['Thursday-01-03-2018.csv']['malicious_dst_ips'])
mal_inter_ip = mal_inter_scr_ip.union(mal_inter_dst_ip)
print(f"Overlap between 01-03 and 02-03: {len(mal_inter_ip)}")

Overlap between 01-03 and 02-03: 0


In [105]:
# overlap between 23-02and 02-03
mal_inter_scr_ip = per_file_stats['Friday-23-02-2018.csv']['malicious_src_ips'].intersection(per_file_stats['Friday-02-03-2018.csv']['malicious_src_ips'])
mal_inter_dst_ip = per_file_stats['Friday-23-02-2018.csv']['malicious_dst_ips'].intersection(per_file_stats['Friday-02-03-2018.csv']['malicious_dst_ips'])
mal_inter_ip = mal_inter_scr_ip.union(mal_inter_dst_ip)
print(f"Overlap between 23-02 and 02-03: {len(mal_inter_ip)}")

Overlap between 23-02 and 02-03: 0


In [107]:
train_unique_malicious_src_ips = set()
train_unique_malicious_dst_ips = set()
train_unique_malicious_ips = set()

train_total_malicious_edge_count = 0
train_total_benign_edge_count = 0
for filename, stats in per_file_stats.items():
    if filename in active_filenames:
        train_total_malicious_edge_count += stats.get("num_malicious_edges", 0)
        train_total_benign_edge_count += stats.get("num_benign_edges", 0)
        train_unique_malicious_src_ips.update(stats.get("malicious_src_ips", set()))
        train_unique_malicious_dst_ips.update(stats.get("malicious_dst_ips", set()))
        train_unique_malicious_ips.update(stats.get("malicious_src_ips", set()).union(stats.get("malicious_dst_ips", set())))

print("TOTAL TRAINING RATIO:", train_total_malicious_edge_count/(train_total_malicious_edge_count + train_total_benign_edge_count))
print("TRAINING SET STATS")
print(f"Unique Malicious Source IPs: {len(train_unique_malicious_src_ips)}")
print(f"Unique Malicious Destination IPs: {len(train_unique_malicious_dst_ips)}")
print(f"Unique Malicious IPs: {len(train_unique_malicious_ips)}")
print(f"Total malicious connections in training set: {train_total_malicious_edge_count}")
print(f"Total benign connections in training set: {train_total_benign_edge_count}")

TOTAL TRAINING RATIO: 0.11029177939049693
TRAINING SET STATS
Unique Malicious Source IPs: 16
Unique Malicious Destination IPs: 12
Unique Malicious IPs: 16
Total malicious connections in training set: 3570924
Total benign connections in training set: 28806140


In [69]:
potential_test_filenames = [
    'Friday-23-02-2018.csv',
    'Wednesday-28-02-2018.csv',
    'Thursday-01-03-2018.csv',
    'Friday-02-03-2018.csv'
]

In [74]:
for i in range(1, len(potential_test_filenames)+1):
    test_files = potential_test_filenames[:i]
    print("="*60)
    print(f"TEST FILES: {test_files}")
    print("="*60)

    test_total_malicious_edge_count = 0
    test_total_benign_edge_count = 0
    test_unique_malicious_src_ips = set()
    test_unique_malicious_dst_ips = set()
    test_unique_malicious_ips = set()

    for filename, stats in per_file_stats.items():
        if filename in test_files:
            test_total_malicious_edge_count += stats.get("num_malicious_edges", 0)
            test_total_benign_edge_count += stats.get("num_benign_edges", 0)
            test_unique_malicious_src_ips.update(stats.get("malicious_src_ips", set()))
            test_unique_malicious_dst_ips.update(stats.get("malicious_dst_ips", set()))
            test_unique_malicious_ips.update(stats.get("malicious_src_ips", set()).union(stats.get("malicious_dst_ips", set())))

    print("TOTAL TEST RATIO:", test_total_malicious_edge_count/(test_total_malicious_edge_count + test_total_benign_edge_count))
    print("MALICIOUS EDGES IN TEST SET:", test_total_malicious_edge_count)
    print("TRAIN-TEST OVERLAP STATS")
    print(f"Unique Malicious Source IPs: {len(train_unique_malicious_src_ips & test_unique_malicious_src_ips)} | {len(test_unique_malicious_src_ips)}")
    print(f"Unique Malicious Destination IPs: {len(train_unique_malicious_dst_ips & test_unique_malicious_dst_ips)} | {len(test_unique_malicious_dst_ips)}")
    print(f"Unique Malicious IPs: {len(train_unique_malicious_ips & test_unique_malicious_ips)} | {len(test_unique_malicious_ips)}")

TEST FILES: ['Friday-23-02-2018.csv']
TOTAL TEST RATIO: 3.84841849242054e-05
MALICIOUS EDGES IN TEST SET: 230
TRAIN-TEST OVERLAP STATS
Unique Malicious Source IPs: 1 | 1
Unique Malicious Destination IPs: 1 | 1
Unique Malicious IPs: 2 | 2
TEST FILES: ['Friday-23-02-2018.csv', 'Wednesday-28-02-2018.csv']
TOTAL TEST RATIO: 0.00399148455661194
MALICIOUS EDGES IN TEST SET: 50074
TRAIN-TEST OVERLAP STATS
Unique Malicious Source IPs: 1 | 2
Unique Malicious Destination IPs: 1 | 34
Unique Malicious IPs: 2 | 36
TEST FILES: ['Friday-23-02-2018.csv', 'Wednesday-28-02-2018.csv', 'Thursday-01-03-2018.csv']
TOTAL TEST RATIO: 0.004708741992295176
MALICIOUS EDGES IN TEST SET: 89921
TRAIN-TEST OVERLAP STATS
Unique Malicious Source IPs: 1 | 3
Unique Malicious Destination IPs: 1 | 37
Unique Malicious IPs: 2 | 38
TEST FILES: ['Friday-23-02-2018.csv', 'Wednesday-28-02-2018.csv', 'Thursday-01-03-2018.csv', 'Friday-02-03-2018.csv']
TOTAL TEST RATIO: 0.009174440832149617
MALICIOUS EDGES IN TEST SET: 233104
TRA

In [106]:
potential_test_filenames = [
    # 'Friday-23-02-2018.csv',
    # 'Wednesday-28-02-2018.csv',
    'Thursday-01-03-2018.csv',
    'Friday-02-03-2018.csv'
]
for i in range(1, len(potential_test_filenames)+1):
    test_files = potential_test_filenames[:i]
    print("="*60)
    print(f"TEST FILES: {test_files}")
    print("="*60)

    test_total_malicious_edge_count = 0
    test_total_benign_edge_count = 0
    test_unique_malicious_src_ips = set()
    test_unique_malicious_dst_ips = set()
    test_unique_malicious_ips = set()

    for filename, stats in per_file_stats.items():
        if filename in test_files:
            test_total_malicious_edge_count += stats.get("num_malicious_edges", 0)
            test_total_benign_edge_count += stats.get("num_benign_edges", 0)
            test_unique_malicious_src_ips.update(stats.get("malicious_src_ips", set()))
            test_unique_malicious_dst_ips.update(stats.get("malicious_dst_ips", set()))
            test_unique_malicious_ips.update(stats.get("malicious_src_ips", set()).union(stats.get("malicious_dst_ips", set())))

    print("TOTAL TEST RATIO:", test_total_malicious_edge_count/(test_total_malicious_edge_count + test_total_benign_edge_count))
    print("MALICIOUS EDGES IN TEST SET:", test_total_malicious_edge_count)
    print("TRAIN-TEST OVERLAP STATS")
    print(f"Unique Malicious Source IPs: {len(train_unique_malicious_src_ips & test_unique_malicious_src_ips)} | {len(test_unique_malicious_src_ips)}")
    print(f"Unique Malicious Destination IPs: {len(train_unique_malicious_dst_ips & test_unique_malicious_dst_ips)} | {len(test_unique_malicious_dst_ips)}")
    print(f"Unique Malicious IPs: {len(train_unique_malicious_ips & test_unique_malicious_ips)} | {len(test_unique_malicious_ips)}")

TEST FILES: ['Thursday-01-03-2018.csv']
TOTAL TEST RATIO: 0.006082210507340338
MALICIOUS EDGES IN TEST SET: 39847
TRAIN-TEST OVERLAP STATS
Unique Malicious Source IPs: 0 | 1
Unique Malicious Destination IPs: 0 | 29
Unique Malicious IPs: 0 | 30
TEST FILES: ['Thursday-01-03-2018.csv', 'Friday-02-03-2018.csv']
TOTAL TEST RATIO: 0.014229436703068359
MALICIOUS EDGES IN TEST SET: 183030
TRAIN-TEST OVERLAP STATS
Unique Malicious Source IPs: 0 | 11
Unique Malicious Destination IPs: 0 | 30
Unique Malicious IPs: 0 | 34


In [95]:
print(test_total_malicious_edge_count/ (train_total_malicious_edge_count + test_total_malicious_edge_count), test_total_benign_edge_count / (train_total_benign_edge_count + test_total_benign_edge_count))
print((test_total_malicious_edge_count + test_total_benign_edge_count) / (train_total_malicious_edge_count + train_total_benign_edge_count + test_total_malicious_edge_count + test_total_benign_edge_count))

0.024501620629320483 0.3114603833736298
0.288372242585262


# Creating a test set


In [24]:
test_ftrs = set(['Timestamp', 'Src IP', 'Dst IP', 'Bwd Packet Length Min', 'Protocol_6',
       'Bwd Packets/s', 'FWD Init Win Bytes', 'Packet Length Std',
       'FIN Flag Count', 'SrcPortRange_registered', 'Packet Length Min',
       'Fwd Seg Size Min', 'DstPortRange_well_known', 'Bwd IAT Total',
       'SYN Flag Count', 'Bwd Packet Length Std', 'target'])

In [27]:
len(test_ftrs)

17

In [None]:
import pandas as pd

def prepare_test_set(df: pd.DataFrame, filename: str, save: bool = True, filter_features: list = None) -> pd.DataFrame:
    target_col = "Label"
    df_raw = df.copy()
    y = df[target_col].copy() if target_col in df.columns else None
    drop_cols = [
        "Flow ID", "Timestamp", "Src IP", "Dst IP",
        "Attempted Category", "Total TCP Flow Time",  # potential leakage/post-facto
        target_col
    ]
    data_to_include = ['Timestamp', 'Src IP', 'Dst IP']
    # df = df.drop(columns=[c for c in drop_cols if c in df.columns], errors="ignore")
    def port_range(v):
        try:
            v = int(v)
        except Exception:
            return "unknown"
        if v <= 1023: return "well_known"      # standard services
        if v <= 49151: return "registered"
        return "ephemeral"
    if "Src Port" in df.columns:
        df["SrcPortRange"] = df["Src Port"].apply(port_range)
        df = df.drop(columns=["Src Port"])
    if "Dst Port" in df.columns:
        df["DstPortRange"] = df["Dst Port"].apply(port_range)
        df = df.drop(columns=["Dst Port"])

    keep_cols = [
        # Flow/throughput
        "Flow Duration","Flow Bytes/s","Flow Packets/s","Fwd Packets/s","Bwd Packets/s",
        # Packet length stats
        "Packet Length Min","Packet Length Max","Packet Length Mean","Packet Length Std","Packet Length Variance",
        "Fwd Packet Length Max","Fwd Packet Length Min","Fwd Packet Length Mean","Fwd Packet Length Std",
        "Bwd Packet Length Max","Bwd Packet Length Min","Bwd Packet Length Mean","Bwd Packet Length Std",
        "Average Packet Size","Fwd Segment Size Avg","Bwd Segment Size Avg",
        # IAT
        "Flow IAT Mean","Flow IAT Std","Flow IAT Max","Flow IAT Min",
        "Fwd IAT Total","Fwd IAT Mean","Fwd IAT Std","Fwd IAT Max","Fwd IAT Min",
        "Bwd IAT Total","Bwd IAT Mean","Bwd IAT Std","Bwd IAT Max","Bwd IAT Min",
        # TCP header/flags
        "Fwd Header Length","Bwd Header Length",
        "FIN Flag Count","SYN Flag Count","RST Flag Count","PSH Flag Count","ACK Flag Count","URG Flag Count","CWR Flag Count","ECE Flag Count",
        "Fwd PSH Flags","Bwd PSH Flags","Fwd URG Flags","Bwd URG Flags","Fwd RST Flags","Bwd RST Flags",
        # Bulk/Subflow/Window
        "Fwd Bytes/Bulk Avg","Fwd Packet/Bulk Avg","Fwd Bulk Rate Avg",
        "Bwd Bytes/Bulk Avg","Bwd Packet/Bulk Avg","Bwd Bulk Rate Avg",
        "Subflow Fwd Packets","Subflow Fwd Bytes","Subflow Bwd Packets","Subflow Bwd Bytes",
        "FWD Init Win Bytes","Bwd Init Win Bytes","Fwd Act Data Pkts","Fwd Seg Size Min","Down/Up Ratio",
        # Active/Idle
        "Active Mean","Active Std","Active Max","Active Min",
        "Idle Mean","Idle Std","Idle Max","Idle Min",
        # Small categoricals
        "Protocol","ICMP Type","ICMP Code",
        # Port ranges we just added
        "SrcPortRange","DstPortRange",
    ]
    keep_cols = [c for c in keep_cols if c in df.columns]
    df = df[keep_cols].copy()
    small_cats = [c for c in ["Protocol","ICMP Type","ICMP Code","SrcPortRange","DstPortRange"] if c in df.columns]
    for c in small_cats:
        df[c] = df[c].astype("category")
    X = pd.get_dummies(df, columns=small_cats, drop_first=False)
    y_binary = (y != "BENIGN").astype(int)

    print("\n Top 20 features with sign:")
    # Select the top 20 most correlated features you printed above
    # top20_features = [
    #     "FWD Init Win Bytes",
    #     "Fwd Seg Size Min",
    #     "FIN Flag Count",
    #     "Bwd Packet Length Std",
    #     "SrcPortRange_registered",
    #     "Protocol_6",
    #     "Protocol_17",
    #     "SrcPortRange_ephemeral",
    #     "SYN Flag Count",
    #     "Bwd Packet Length Min",
    #     "Packet Length Min",
    #     "Fwd Packet Length Min",
    #     "Bwd Packets/s",
    #     "Flow Packets/s",
    #     "Fwd Packets/s",
    #     "Bwd Packet Length Max",
    #     "DstPortRange_well_known",
    #     "DstPortRange_registered",
    #     "Packet Length Std",
    #     "Bwd IAT Total"
    # ]
    # corr_df = X[top20_features].copy()
    # corr_df['target'] = y_binary
    # corr_matrix = corr_df.corr()

    # feature_corr = corr_matrix.drop('target', axis=0).drop('target', axis=1)
    # upper_triangle = feature_corr.where(np.triu(np.ones(feature_corr.shape), k=1).astype(bool))
    # to_drop = [column for column in upper_triangle.columns if any(upper_triangle[column] > 0.9)]
    # reduced_features = [f for f in top20_features if f not in to_drop]

    # print("Removed due to >0.9 correlation:", to_drop)
    # print("Final reduced feature set:", reduced_features)
    # print("Final number of features:", len(reduced_features))
    # final_dataset = X[reduced_features].copy()
    final_dataset = X[[f for f in filter_features if f in X.columns]].copy()
    final_dataset['target'] = y_binary
    final_dataset[data_to_include] = df_raw[data_to_include]
    print("Data shape before filtering:", final_dataset.shape)
    if filter_features is not None:
        features_not_included = [f for f in filter_features if f not in final_dataset.columns]
        print("Features missing in final dataset:", features_not_included)
        final_dataset = final_dataset[[f for f in filter_features if f in final_dataset.columns]]
        print("After filtering, final number of features:", len(final_dataset.columns))
    if save:
        final_dataset.to_csv(f'data/{filename}.csv', index=False)
    return final_dataset

In [36]:
test_filenames = ['Thursday-01-03-2018']

In [6]:
test_df = pd.read_csv('data/CSECICIDS2018_improved/Friday-23-02-2018.csv')

In [53]:
processed_test_df = prepare_test_set(test_df.sample(1000), 'Friday-23-02-2018', save=False, filter_features=list(test_ftrs))


 Top 20 features with sign:
Removed due to >0.9 correlation: ['Protocol_6', 'Flow Packets/s', 'Fwd Packets/s', 'Bwd Packet Length Max']
Final reduced feature set: ['FWD Init Win Bytes', 'Fwd Seg Size Min', 'FIN Flag Count', 'Bwd Packet Length Std', 'SrcPortRange_registered', 'Protocol_17', 'SrcPortRange_ephemeral', 'SYN Flag Count', 'Bwd Packet Length Min', 'Packet Length Min', 'Fwd Packet Length Min', 'Bwd Packets/s', 'DstPortRange_well_known', 'DstPortRange_registered', 'Packet Length Std', 'Bwd IAT Total']
Final number of features: 16
Data shape before filtering: (1000, 20)
After filtering, final number of features: 16


In [44]:
len(processed_test_df.columns)

18

In [45]:
processed_test_df.columns

Index(['FWD Init Win Bytes', 'Fwd Seg Size Min', 'FIN Flag Count',
       'Bwd Packet Length Std', 'SrcPortRange_registered', 'Protocol_17',
       'SrcPortRange_ephemeral', 'SYN Flag Count', 'Bwd Packet Length Min',
       'Bwd Packets/s', 'DstPortRange_well_known', 'DstPortRange_registered',
       'Packet Length Std', 'Bwd IAT Total', 'target', 'Timestamp', 'Src IP',
       'Dst IP'],
      dtype='object')

In [56]:
ftrs_to_include = test_ftrs - set(processed_test_df.columns)
print("Features missing in processed test set:", ftrs_to_include)

Features missing in processed test set: {'Protocol_6'}


In [57]:
ftrs_to_include = set(processed_test_df.columns) - test_ftrs
print("Features missing in test features:", ftrs_to_include)

Features missing in test features: set()


In [60]:
print("Loading Thursday-01-03-2018.csv...")
test_df1 = pd.read_csv('data/CSECICIDS2018_improved/Thursday-01-03-2018.csv')
print("Loading Friday-02-03-2018.csv...")
test_df2 = pd.read_csv('data/CSECICIDS2018_improved/Friday-02-03-2018.csv')


Loading Thursday-01-03-2018.csv...
Loading Friday-02-03-2018.csv...


In [66]:
print("Preparing test set...")
raw_test_df = pd.concat([test_df1, test_df2], ignore_index=True)
test_dataset = prepare_test_set(raw_test_df, 'test', save=False, filter_features=list(test_ftrs))

Preparing test set...

 Top 20 features with sign:
Data shape before filtering: (12862772, 17)
Features missing in final dataset: []
After filtering, final number of features: 17


In [67]:
test_dataset.columns 

Index(['FWD Init Win Bytes', 'Bwd IAT Total', 'Src IP',
       'Bwd Packet Length Min', 'DstPortRange_well_known', 'target', 'Dst IP',
       'Packet Length Std', 'Packet Length Min', 'Timestamp', 'Bwd Packets/s',
       'Bwd Packet Length Std', 'FIN Flag Count', 'Protocol_6',
       'SrcPortRange_registered', 'Fwd Seg Size Min', 'SYN Flag Count'],
      dtype='object')

In [None]:
test_dataset.to_csv('data/test.csv', index=False)

In [64]:
raw_test_df['Protocol'].unique()

array([ 6, 17,  0,  1])

In [117]:
test_dataset['target'].value_counts(normalize=True)

target
0    0.985771
1    0.014229
Name: proportion, dtype: float64

In [118]:
test_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12862772 entries, 0 to 12862771
Data columns (total 13 columns):
 #   Column                   Dtype  
---  ------                   -----  
 0   FWD Init Win Bytes       int64  
 1   Fwd Seg Size Min         int64  
 2   FIN Flag Count           int64  
 3   Bwd Packet Length Std    float64
 4   SrcPortRange_registered  bool   
 5   SYN Flag Count           int64  
 6   Bwd Packet Length Min    int64  
 7   Packet Length Min        int64  
 8   Bwd Packets/s            float64
 9   DstPortRange_well_known  bool   
 10  Packet Length Std        float64
 11  Bwd IAT Total            int64  
 12  target                   int64  
dtypes: bool(2), float64(3), int64(8)
memory usage: 1.1 GB
