In [None]:
# dpkt is imported by the pcap splitter script in this notebook; pandas and numpy by the flow-feature extractor.
# %pip (rather than a bare pip) installs into the environment of the running kernel.
%pip install scapy dpkt pandas numpy

Collecting scapy
  Downloading scapy-2.6.1-py3-none-any.whl.metadata (5.6 kB)
Downloading scapy-2.6.1-py3-none-any.whl (2.4 MB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/2.4 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.6/2.4 MB[0m [31m19.0 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m2.4/2.4 MB[0m [31m38.5 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.4/2.4 MB[0m [31m28.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: scapy
Successfully installed scapy-2.6.1


http://cicresearch.ca/CICDataset/CIC-IDS-2017/Dataset/CIC-IDS-2017/PCAPs/ <br>
http://cicresearch.ca/CICDataset/CIC-IDS-2017/Dataset/CIC-IDS-2017/PCAPs/Friday-WorkingHours.pcap <br>
http://cicresearch.ca/CICDataset/CIC-IDS-2017/Dataset/CIC-IDS-2017/CSVs/GeneratedLabelledFlows.zip : see `Friday-WorkingHours-Afternoon-DDos.pcap_ISCX.csv` in the `TrafficLabelling` folder

In [None]:
# Save the benign capture as BenignTraffic.pcap: that is the file name the splitter commands below expect
# (the remote file is named BenignTraffic.pcap.pcap).
!wget -O BenignTraffic.pcap http://cicresearch.ca/IOTDataset/CIC_IOT_Dataset2023/Dataset/PCAP/Benign_Final/BenignTraffic.pcap.pcap
!wget http://cicresearch.ca/IOTDataset/CIC_IOT_Dataset2023/Dataset/PCAP/DDoS-HTTP_Flood/DDoS-HTTP_Flood-.pcap

BenignTraffic.pcap

DDoS-HTTP_Flood-.pcap

The script below (`pcap_splitter_dpkt.py`) splits each downloaded pcap file into a training pcap (70% of the selected flows) and a testing pcap (the remaining 30%):



```
python3 pcap_splitter_dpkt.py BenignTraffic.pcap ./pcaps_splits_10k --max-flows 50000 --train-flows 35000 --seed 42 --include-remaining none --verbose
python3 pcap_splitter_dpkt.py DDoS-HTTP_Flood-.pcap ./pcaps_splits_10k --max-flows 50000 --train-flows 35000 --seed 42 --include-remaining none --verbose
```



In [None]:
# Save this cell as pcap_splitter_dpkt.py; it is meant to be run from the command line as shown above.
#!/usr/bin/env python3
"""
Fast two-pass PCAP splitter using dpkt (optimized).

Behavior:
 - By default: collect first `--max-flows` unique flows (overall),
   then second-pass scans entire pcap to write ALL packets belonging to those flows
   (ensures complete flows).
 - If --prefix-only is set: collect flows and only write packets seen up to the point
   where the N-th unique flow was discovered (NO full second pass). This is much faster,
   but you won't get packets for those flows that appear later in the file.

Usage:
  python3 pcap_splitter_dpkt.py DDoS-SYN_Flood.pcap ./pcaps_splits \
      --max-flows 10000 --train-flows 7000 --seed 42 --include-remaining none --verbose

  # Fast prefix mode (stop at the packet where the 10k-th flow was first seen):
  python3 pcap_splitter_dpkt.py DDoS-SYN_Flood.pcap ./pcaps_splits \
      --max-flows 10000 --train-flows 7000 --prefix-only --verbose
"""
import os, argparse, hashlib, struct
import dpkt, socket

def flow_key_from_eth(eth, pkt_index):
    """Build the flow key (bytes) for one decoded Ethernet frame.

    Frames whose payload is not a ``dpkt.ip.IP`` instance get a per-packet
    sentinel: ``b'__nonip__'`` followed by ``pkt_index`` packed as a 32-bit
    big-endian integer, so each such packet forms its own "flow".

    For IP frames the key is ``src + dst + pack("!HHB", sport, dport, proto)``.
    Ports are read from the payload for TCP and UDP; they are 0 for every
    other protocol or when the payload does not expose ``sport``/``dport``.
    The key is directional: the two directions of a conversation get
    different keys.
    """
    try:
        payload = eth.data
    except Exception:
        payload = None
    if not isinstance(payload, dpkt.ip.IP):
        return b'__nonip__' + struct.pack("!I", pkt_index)

    src_addr = payload.src  # raw address bytes
    dst_addr = payload.dst
    ip_proto = payload.p

    src_port = dst_port = 0
    if ip_proto in (dpkt.ip.IP_PROTO_TCP, dpkt.ip.IP_PROTO_UDP):
        try:
            segment = payload.data
            # getattr with a default: the payload may be raw bytes rather than
            # a parsed TCP/UDP object, in which case the ports stay 0.
            src_port = getattr(segment, 'sport', 0)
            dst_port = getattr(segment, 'dport', 0)
        except Exception:
            src_port = dst_port = 0

    # layout: src(4) dst(4) sport(2) dport(2) proto(1)
    return src_addr + dst_addr + struct.pack("!HHB", src_port, dst_port, ip_proto)

def collect_flows_dpkt(path, max_flows=None, verbose=False, prefix_only=False):
    """Scan a pcap and list its unique flow keys in order of first appearance.

    Args:
      path: pcap file to read with ``dpkt.pcap.Reader``.
      max_flows: stop scanning as soon as this many unique flows have been
        seen; ``None`` scans the whole file.
      verbose: print progress every 500000 packets plus a final summary.
      prefix_only: when true, every packet read (including the one that
        triggers the stop) is kept in memory and returned.

    Returns:
      (flow_order, packets_seen) where ``flow_order`` is the list of flow keys
      and ``packets_seen`` is a list of ``(ts, buf, key)`` tuples when
      ``prefix_only`` is set, otherwise ``None``.
    """
    known_flows = set()
    flow_order = []
    packets_seen = [] if prefix_only else None
    pkt_count = 0

    with open(path, 'rb') as fh:
        for ts, buf in dpkt.pcap.Reader(fh):
            pkt_count += 1
            try:
                key = flow_key_from_eth(dpkt.ethernet.Ethernet(buf), pkt_count)
            except Exception:
                # Undecodable frame: give it its own one-packet sentinel flow.
                key = b'__nonip__' + struct.pack("!I", pkt_count)

            # Prefix mode keeps the raw packets so no second pass is needed.
            if packets_seen is not None:
                packets_seen.append((ts, buf, key))

            if key not in known_flows:
                known_flows.add(key)
                flow_order.append(key)
                limit_reached = (max_flows is not None) and (len(flow_order) >= max_flows)
                if limit_reached:
                    if verbose:
                        print(f"  reached max_flows={max_flows} at packet #{pkt_count}")
                    break

            if verbose and pkt_count % 500000 == 0:
                print(f"  scanned {pkt_count} pkts, collected {len(flow_order)} flows")

    if verbose:
        print(f"collect_flows: scanned {pkt_count} pkts, collected {len(flow_order)} flows")
    return flow_order, packets_seen

def flow_score(flow_key, seed):
    """Return a deterministic integer score for a flow.

    The score is the MD5 digest of ``str(seed)`` (UTF-8) followed by the flow
    key bytes, read as a big-endian integer. The same (flow_key, seed) pair
    always yields the same score.
    """
    hasher = hashlib.md5(str(seed).encode('utf-8'))
    hasher.update(flow_key)
    return int.from_bytes(hasher.digest(), 'big')

def choose_train(flow_keys, k, seed):
    """Pick the set of flow keys that go into the training split.

    Flows are ranked by ``flow_score(key, seed)`` and the ``k`` lowest-scoring
    ones are selected, so the choice is reproducible for a given seed.

    Args:
      flow_keys: sequence of flow keys (bytes).
      k: number of flows to select. Values <= 0 select nothing; values >= the
        number of flows select everything.
      seed: seed mixed into the per-flow score.

    Returns:
      A set with the selected flow keys.
    """
    # A negative k used to reach the slice below and select len(flow_keys) - |k|
    # flows (slice counted from the end); treat any non-positive k as "none".
    if k <= 0:
        return set()
    if k >= len(flow_keys):
        return set(flow_keys)
    # sorted() is stable, so ties keep their original file order.
    ranked = sorted(flow_keys, key=lambda kf: flow_score(kf, seed))
    return set(ranked[:k])

def write_from_prefix(packets_seen, train_set, collected_set, out_folder, base, include_remaining="none", verbose=False):
    """Write train/test pcaps using only the packets held in ``packets_seen``.

    Args:
      packets_seen: iterable of ``(ts, buf, key)`` tuples.
      train_set: flow keys whose packets go to ``<base>_train.pcap``.
      collected_set: all collected flow keys; packets of collected flows that
        are not in ``train_set`` go to ``<base>_test.pcap``.
      out_folder: output directory (created if missing).
      base: file-name stem for the two outputs.
      include_remaining: where to send packets of flows outside
        ``collected_set``: "train", "test", or anything else to drop them.
      verbose: print a summary line when done.

    Returns:
      (train_out, test_out, ntrain, ntest): the two output paths and the
      number of packets written to each.
    """
    os.makedirs(out_folder, exist_ok=True)
    train_out = os.path.join(out_folder, base + "_train.pcap")
    test_out = os.path.join(out_folder, base + "_test.pcap")

    ntrain = 0
    ntest = 0
    total = 0
    # Context managers guarantee both outputs are flushed and closed even if a
    # write fails part-way through.
    with open(train_out, 'wb') as train_f, open(test_out, 'wb') as test_f:
        train_writer = dpkt.pcap.Writer(train_f)
        test_writer = dpkt.pcap.Writer(test_f)
        for ts, buf, key in packets_seen:
            total += 1
            if key in train_set:
                to_train = True
            elif key in collected_set:
                to_train = False
            elif include_remaining == "train":
                to_train = True
            elif include_remaining == "test":
                to_train = False
            else:
                continue  # packet of an uncollected flow: dropped

            if to_train:
                train_writer.writepkt(buf, ts=ts)
                ntrain += 1
            else:
                test_writer.writepkt(buf, ts=ts)
                ntest += 1

    if verbose:
        print(f"Prefix write -> total pkts considered: {total}, train:{ntrain}, test:{ntest}")
    return train_out, test_out, ntrain, ntest

def write_split_dpkt_fullscan(path, out_folder, train_set, collected_keys_set, include_remaining="none", verbose=False):
    """Second pass: re-read the whole pcap and write the train/test pcaps.

    Every packet of ``path`` is re-keyed with ``flow_key_from_eth`` (using the
    1-based packet position as index, so per-packet sentinel keys match the
    ones produced by the first pass) and routed by its flow key.

    Args:
      path: input pcap file.
      out_folder: output directory (created if missing).
      train_set: flow keys whose packets go to ``<base>_train.pcap``.
      collected_keys_set: all collected flow keys; packets of collected flows
        not in ``train_set`` go to ``<base>_test.pcap``.
      include_remaining: where to send packets of flows outside
        ``collected_keys_set``: "train", "test", or anything else to drop them.
      verbose: print progress every 500000 packets plus a final summary.

    Returns:
      (train_out, test_out, ntrain, ntest): the two output paths and the
      number of packets written to each.
    """
    base = os.path.splitext(os.path.basename(path))[0]
    os.makedirs(out_folder, exist_ok=True)
    train_out = os.path.join(out_folder, base + "_train.pcap")
    test_out = os.path.join(out_folder, base + "_test.pcap")

    total = 0
    ntrain = 0
    ntest = 0
    # The input is opened first so an unreadable input does not leave empty
    # output files behind; the context managers close all three handles even
    # when reading or writing raises.
    with open(path, 'rb') as src_f, open(train_out, 'wb') as train_f, open(test_out, 'wb') as test_f:
        reader = dpkt.pcap.Reader(src_f)
        train_writer = dpkt.pcap.Writer(train_f)
        test_writer = dpkt.pcap.Writer(test_f)
        for ts, buf in reader:
            total += 1
            try:
                eth = dpkt.ethernet.Ethernet(buf)
                key = flow_key_from_eth(eth, total)
            except Exception:
                key = b'__nonip__' + struct.pack("!I", total)

            if key in train_set:
                to_train = True
            elif key in collected_keys_set:
                to_train = False
            elif include_remaining == "train":
                to_train = True
            elif include_remaining == "test":
                to_train = False
            else:
                to_train = None  # packet of an uncollected flow: dropped

            if to_train is True:
                train_writer.writepkt(buf, ts=ts)
                ntrain += 1
            elif to_train is False:
                test_writer.writepkt(buf, ts=ts)
                ntest += 1

            if verbose and (total % 500000 == 0):
                print(f"  scanned {total} pkts, ntrain={ntrain}, ntest={ntest}")

    if verbose:
        print(f"Full-scan write -> scanned {total}, train:{ntrain}, test:{ntest}")
    return train_out, test_out, ntrain, ntest

def main():
    """Command-line entry point: collect flows, pick the train subset, write both pcaps.

    Steps: parse arguments, collect up to ``--max-flows`` unique flows
    (0 or less means no limit), choose ``--train-flows`` of them for training
    with the seeded score, then write the train/test pcaps either from the
    in-memory prefix (``--prefix-only``) or with a full second scan.
    """
    parser = argparse.ArgumentParser(description="Fast dpkt-based pcap splitter (prefix-only option)")
    parser.add_argument("pcap", help="input pcap file")
    parser.add_argument("out_folder", help="output folder")
    parser.add_argument("--max-flows", type=int, default=10000, help="total flows to collect (use 0 for all)")
    parser.add_argument("--train-flows", type=int, default=7000, help="train flows within collected")
    parser.add_argument("--seed", type=int, default=42)
    parser.add_argument("--include-remaining", choices=["none","train","test"], default="none")
    parser.add_argument("--prefix-only", action="store_true", help="Only use packets up to the point where max-flows was reached (fast).")
    parser.add_argument("--verbose", action="store_true")
    opts = parser.parse_args()

    def log(message):
        # Progress messages are only shown with --verbose.
        if opts.verbose:
            print(message)

    flow_limit = None if opts.max_flows <= 0 else opts.max_flows

    log("Collecting flows (this stops when first max_flows unique flows are found)...")
    flow_keys, packets_seen = collect_flows_dpkt(
        opts.pcap, max_flows=flow_limit, verbose=opts.verbose, prefix_only=opts.prefix_only
    )

    if not flow_keys:
        print("No flows collected; exiting.")
        return

    n_collected = len(flow_keys)
    if opts.train_flows > n_collected:
        print(f"Warning: requested train {opts.train_flows} > collected {n_collected}. Selecting all available flows for train.")
    train_set = choose_train(flow_keys, opts.train_flows, opts.seed)
    collected_set = set(flow_keys)

    if opts.prefix_only:
        log("Writing split using only the prefix (packets captured up to the break point)...")
        base = os.path.splitext(os.path.basename(opts.pcap))[0]
        outcome = write_from_prefix(
            packets_seen, train_set, collected_set, opts.out_folder, base,
            include_remaining=opts.include_remaining, verbose=opts.verbose,
        )
    else:
        log("Writing split by full-scan (this will scan the entire pcap to include complete flows)...")
        outcome = write_split_dpkt_fullscan(
            opts.pcap, opts.out_folder, train_set, collected_set,
            include_remaining=opts.include_remaining, verbose=opts.verbose,
        )
    train_out, test_out, ntrain, ntest = outcome

    log(f"Done. Outputs:\n  {train_out} ({ntrain} pkts)\n  {test_out} ({ntest} pkts)")


if __name__ == "__main__":
    main()


DDoS-HTTP_Flood-_test.pcap

BenignTraffic_test.pcap

DDoS-HTTP_Flood-_train.pcap

BenignTraffic_train.pcap

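This bash script (`extract_pkts.sh`) runs tshark on every pcap file in the current directory and writes the extracted per-packet fields to a `.txt` file next to it: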


```
chmod +x extract_pkts.sh
./extract_pkts.sh
```



In [None]:
#!/bin/bash
# For every pcap in the current directory, dump one '|'-separated line per
# TCP/UDP packet into <pcap name>.txt. Field order (read by the flow
# extractor): time_relative|ip.src|ip.dst|tcp.srcport|tcp.dstport|ip.proto|ip.len|udp.srcport|udp.dstport
# Run it separately inside the train and the test directory.
for f in *.pcap
do
    # When no pcap matches, the glob stays literal; skip it instead of calling tshark on "*.pcap".
    [ -e "$f" ] || continue
    echo "$f"
    # Quoting "$f" keeps file names with spaces or glob characters intact.
    tshark -r "$f" -Y 'ip.proto == 6 or ip.proto == 17' -T fields \
        -e frame.time_relative -e ip.src -e ip.dst -e tcp.srcport -e tcp.dstport \
        -e ip.proto -e ip.len -e udp.srcport -e udp.dstport \
        -E separator='|' > "$f.txt"
done

DDoS-HTTP_Flood-_test.pcap.txt

BenignTraffic_test.pcap.txt

DDoS-HTTP_Flood-_train.pcap.txt

BenignTraffic_train.pcap.txt

The Python script below (`extract_flows_from_txt.py`) is called by the shell script `extract_flows.sh`.

In [None]:
# Save this cell as extract_flows_from_txt.py; usage: python3 extract_flows_from_txt.py <packets.txt> <features.csv> <npkts>
import pandas as pd
import numpy as np
import sys
import os

# ---------------------------------------------------------------------------
# Per-flow feature extraction from a tshark field dump.
#
# Usage: python3 extract_flows_from_txt.py <packets.txt> <features.csv> <npkts>
#
#   packets.txt  '|'-separated file with nine fields per packet (see RAW_COLUMNS)
#   features.csv output CSV; rows are appended, the header is written only when
#                the file does not exist yet or is empty
#   npkts        only the first <npkts> packets of each flow are used, and only
#                flows that reach <npkts> packets are written
# ---------------------------------------------------------------------------

# Column names for the nine input fields, in file order.
RAW_COLUMNS = [
    'timestamp', 'ip_src', 'ip_dst',
    'tcp_srcport', 'tcp_dstport',
    'ip_proto', 'ip_len',
    'udp_srcport', 'udp_dstport',
]
PORT_COLUMNS = ['tcp_srcport', 'tcp_dstport', 'udp_srcport', 'udp_dstport']

CSV_HEADER = "Flow ID,Src Port,Dst Port,Min Packet Length,Max Packet Length,Packet Length Total,Min differential Packet Length,Max differential Packet Length,IAT min,IAT max,Flow Duration,Label"

# Substring of the (lower-cased) input file name -> label written to the CSV.
FILENAME_LABELS = {"ddos": "DDoS", "benign": "BENIGN"}


def label_from_filename(filename_in):
    """Return the label implied by the input file name, or "UNKNOWN".

    If several patterns match, the last one in FILENAME_LABELS wins.
    """
    lowered = filename_in.lower()
    label = "UNKNOWN"
    for pattern, label_in_filename in FILENAME_LABELS.items():
        if pattern in lowered:
            label = label_in_filename
    return label


def prepare_packets(packet_data):
    """Clean the raw nine-column string frame and add srcport/dstport/flow_id.

    Returns a new DataFrame; the input frame is not modified.
    """
    packets = packet_data.copy()
    packets.columns = RAW_COLUMNS  # raises if the file does not have nine fields

    # Drop rows without a protocol BEFORE converting to str: astype(str) would
    # turn the missing value into the literal string 'nan', which dropna()
    # can no longer detect.
    packets = packets.dropna(subset=['ip_proto']).copy()
    packets['ip_proto'] = packets['ip_proto'].astype(str).str.strip()

    # Rows whose ip_proto holds several comma-separated values (e.g. "1,17")
    # are discarded.
    packets = packets[~packets['ip_proto'].str.contains(',', na=False)].reset_index(drop=True)

    # Unparseable lengths count as 0.
    packets['ip_len'] = pd.to_numeric(packets['ip_len'], errors='coerce').fillna(0).astype(int)

    # Ports become nullable integers; missing/unparseable ports become <NA>.
    for col in PORT_COLUMNS:
        packets[col] = pd.to_numeric(packets[col], errors='coerce').astype('Int64')

    # TCP rows (ip_proto == '6') use the tcp.* ports, every other row the udp.* ports.
    is_tcp = packets['ip_proto'] == '6'
    packets['srcport'] = packets['tcp_srcport'].where(is_tcp, packets['udp_srcport']).astype('Int64')
    packets['dstport'] = packets['tcp_dstport'].where(is_tcp, packets['udp_dstport']).astype('Int64')

    # Space-separated 5-tuple: "<src ip> <dst ip> <src port> <dst port> <proto>".
    packets['flow_id'] = (
        packets['ip_src'].astype(str) + " " +
        packets['ip_dst'].astype(str) + " " +
        packets['srcport'].astype(str) + " " +
        packets['dstport'].astype(str) + " " +
        packets['ip_proto'].astype(str)
    )
    return packets


def collect_flows(packet_data, npkts):
    """Group packets by flow_id, keeping at most ``npkts`` packets per flow.

    Returns a dict (in order of first appearance) mapping flow_id to a dict
    with:
      'sizes'      packet lengths of the kept packets,
      'size_diffs' absolute length difference between consecutive kept packets,
      'iats'       inter-arrival times between consecutive kept packets
                   (rounded to 9 decimals),
      'last_time'  timestamp of the last kept packet.
    The first packet of a flow is always kept, even when ``npkts`` is 0.
    """
    flows = {}
    # A dict lookup per packet replaces the former list-membership test,
    # which scanned every known flow for each packet.
    rows = zip(packet_data['flow_id'], packet_data['timestamp'], packet_data['ip_len'])
    for key, raw_time, raw_len in rows:
        pkt_time = float(raw_time)
        pktsize = int(raw_len)

        flow = flows.get(key)
        if flow is None:
            flows[key] = {
                'sizes': [pktsize],
                'size_diffs': [],
                'iats': [],
                'last_time': pkt_time,
            }
            continue

        sizes = flow['sizes']
        if len(sizes) >= npkts:
            continue  # flow already holds its first npkts packets

        flow['size_diffs'].append(abs(pktsize - sizes[-1]))
        sizes.append(pktsize)
        flow['iats'].append(round(pkt_time - flow['last_time'], 9))
        flow['last_time'] = pkt_time
    return flows


def format_flow_row(key, flow, label):
    """Return the CSV line (with trailing newline) for one collected flow."""
    sizes = flow['sizes']
    diffs = flow['size_diffs']
    iats = flow['iats']

    if iats:
        # IAT min/max are scaled by 1e6 and the duration (sum of IATs) by 1e9,
        # exactly as before, so the output values are unchanged.
        # NOTE(review): the two scale factors differ; if timestamps are in
        # seconds these are microseconds and nanoseconds - confirm the
        # intended units with the consumers of the CSV.
        iat_min_scaled = round(1000000 * min(iats), 9)
        iat_max_scaled = round(1000000 * max(iats), 9)
        duration_scaled = round(1000000000 * sum(iats), 9)
    else:
        iat_min_scaled = iat_max_scaled = duration_scaled = 0

    min_diff = min(diffs) if diffs else 0
    max_diff = max(diffs) if diffs else 0

    # flow_id is "<srcip> <dstip> <srcport> <dstport> <proto>".
    parts = key.split(" ")
    if len(parts) > 3:
        srcport_str, dstport_str = parts[2], parts[3]
    else:
        srcport_str = dstport_str = ""

    return (
        f"{key},{srcport_str},{dstport_str},"
        f"{min(sizes)},{max(sizes)},{sum(sizes)},"
        f"{min_diff},{max_diff},"
        f"{iat_min_scaled},{iat_max_scaled},{duration_scaled},{label}\n"
    )


def write_flow_features(flows, filename_out, label, min_number_of_packets):
    """Append one CSV row per flow that has at least ``min_number_of_packets`` packets.

    The header is written first when the output file is missing or empty.
    """
    needs_header = (not os.path.isfile(filename_out)) or os.path.getsize(filename_out) == 0
    with open(filename_out, "a") as text_file:
        if needs_header:
            text_file.write(CSV_HEADER + "\n")
        for key, flow in flows.items():
            if len(flow['sizes']) >= min_number_of_packets:
                text_file.write(format_flow_row(key, flow, label))


def extract_flows_main(argv=None):
    """Run the extraction for ``argv`` (defaults to sys.argv); returns an exit status."""
    argv = sys.argv if argv is None else argv
    if len(argv) < 4:
        print("Usage: extract_flows_from_txt.py <packets.txt> <features.csv> <npkts>", file=sys.stderr)
        return 2

    filename_in = argv[1]
    filename_out = argv[2]
    npkts = int(argv[3])

    try:
        raw_packets = pd.read_csv(
            filename_in,
            sep='|',
            header=None,
            dtype=str,
            low_memory=False
        )
    except pd.errors.EmptyDataError:
        # An empty dump (no TCP/UDP packet in the pcap) is not an error.
        print(f"No packets found in {filename_in}; nothing to extract.")
        return 0

    packet_data = prepare_packets(raw_packets)
    label = label_from_filename(filename_in)

    print("NOW: COLLECTING PACKETS INTO FLOWS...")
    flows = collect_flows(packet_data, npkts)

    print("NOW: WRITING FLOW FEATURES INTO CSV...")
    write_flow_features(flows, filename_out, label, npkts)
    return 0


if __name__ == "__main__":
    _status = extract_flows_main()
    if _status:
        sys.exit(_status)


This shell script takes the txt files as input and generates a CSV file with all computed features per flow:

```
chmod +x extract_flows.sh
./extract_flows.sh
```

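Note: the file referencing here is wrong. The script feeds every `*.txt` file in the current directory into the same output CSV, so the train and test txt files need to be processed in separate directories.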

In [None]:
#!/bin/bash
# Build the per-flow feature CSV from every tshark dump (*.txt) in the current directory.
#
# Usage: ./extract_flows.sh [output_csv] [npkts]
#   output_csv  defaults to train_data_CIC.csv (pass test_data_CIC.csv in the test directory)
#   npkts       packets used per flow, defaults to 4
#
# Train and test dumps must live in separate directories: every *.txt found
# here ends up in the same output file.

output_file="${1:-train_data_CIC.csv}"
npkts="${2:-4}"

# The header must list the same 12 columns that extract_flows_from_txt.py
# writes per row (it includes "Src Port" and "Dst Port"). Because the Python
# script only writes its own header when the file does not exist yet, a
# shorter header here would leave the CSV with misaligned columns.
if [ ! -f "$output_file" ]; then
    echo "Flow ID,Src Port,Dst Port,Min Packet Length,Max Packet Length,Packet Length Total,Min differential Packet Length,Max differential Packet Length,IAT min,IAT max,Flow Duration,Label" > "$output_file"
fi

for f in *.txt
do
    # When no txt file matches, the glob stays literal; skip it.
    [ -e "$f" ] || continue
    echo "Processing $f"
    python3 extract_flows_from_txt.py "$f" "$output_file" "$npkts"
done

echo "Feature extraction completed for all files."

train_data_CIC.csv <br>
test_data_CIC.csv <br>
These files are available in the Google Drive folder.