# ML Systems Final Project - Traffic Classification

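This notebook loads network traffic from a gzip-compressed pcapng capture (`./data/traffic.pcapng.gz`) with embedded pcapML labels, extracts simple per-sample flow features, saves them to `application_id_dataset.csv`, and trains a baseline random-forest classifier on the labels.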

In [2]:
# Install all required dependencies for pcapML → flows → features → ML
!pip install --upgrade pip

# Core pcapML tooling
!pip install pcapml-fe

# Data handling + ML
!pip install pandas scikit-learn numpy

# (Optional) AutoML — only if you want to compare to the leaderboard
!pip install autogluon

# Utility: progress bars (optional but recommended)
!pip install tqdm


Collecting pip
  Using cached pip-25.3-py3-none-any.whl.metadata (4.7 kB)
Using cached pip-25.3-py3-none-any.whl (1.8 MB)
Installing collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 25.0.1
    Uninstalling pip-25.0.1:
      Successfully uninstalled pip-25.0.1
Successfully installed pip-25.3
Collecting pcapml-fe
  Downloading pcapml_fe-0.0.3.tar.gz (15 kB)
  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
Collecting dpkt (from pcapml-fe)
  Using cached dpkt-1.9.8-py3-none-any.whl.metadata (1.7 kB)
Downloading dpkt-1.9.8-py3-none-any.whl (194 kB)
Building wheels for collected packages: pcapml-fe
  Building wheel for pcapml-fe (pyproject.toml) ... [?25ldone
[?25h  Created wheel for pcapml-fe: filename=pcapml_fe-0.0.3-cp310-cp310-macosx_11_0_arm64.whl size=24814 sha256=4829e2ad050d85283eb6f71aceb56019e79936b3b109d1c76530d95c35d0cf51
  

In [6]:
#!/usr/bin/env python3
"""
End-to-end script for pcapML Application Identification (non-vpn2016).

Assumes:
    - You are running in the same project directory that contains ./data/traffic.pcapng.gz
    - You have installed: pcapml-fe, pandas, scikit-learn, numpy

Input:
    ./data/traffic.pcapng.gz

Outputs:
    ./application_id_dataset.csv   - features + labels
    Prints Balanced Accuracy for a simple RandomForest on easy_label
"""

import os
import gzip
import tempfile
from pathlib import Path

import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import balanced_accuracy_score
from sklearn.preprocessing import LabelEncoder

import pcapml_fe   # pip install pcapml-fe


# ---------------------------------------------------------------------
# 1. Paths and decompression
# ---------------------------------------------------------------------

# For notebooks / scripts: use current working directory
BASE_DIR = Path(os.getcwd())
INPUT_GZ = BASE_DIR / "data" / "traffic.pcapng.gz"

# Explicit check instead of `assert`: assertions are stripped when Python runs
# with -O, which would let a missing input slip through to gzip.open().
if not INPUT_GZ.exists():
    raise FileNotFoundError(f"Input file not found: {INPUT_GZ}")

# Decompress to a temporary .pcapng file (pcapml_fe wants plain pcapng).
# NOTE: tempfile.mkdtemp() leaves deletion of the directory to the caller.
tmp_dir = tempfile.mkdtemp(prefix="pcapml_")
PCAP_PATH = Path(tmp_dir) / "traffic.pcapng"

# Stream the archive in fixed-size chunks so the whole decompressed capture is
# never held in memory at once.
COPY_CHUNK_BYTES = 1 << 20  # 1 MiB per read
with gzip.open(INPUT_GZ, "rb") as f_in, open(PCAP_PATH, "wb") as f_out:
    # iter(callable, sentinel) keeps reading until read() returns b"" (EOF).
    for chunk in iter(lambda: f_in.read(COPY_CHUNK_BYTES), b""):
        f_out.write(chunk)

print(f"Decompressed input to: {PCAP_PATH}")
print("Iterating over pcapML samples and extracting features + labels...")


# ---------------------------------------------------------------------
# 2. Feature extraction from each pcapML sample
# ---------------------------------------------------------------------

def extract_features_from_sample(tsample):
    """
    Compute simple per-sample flow features for one pcapML traffic sample.

    Args:
        tsample: object exposing ``.packets``, an iterable of packet-like
            objects that each provide ``.raw_bytes`` (sized, e.g. bytes) and
            ``.ts`` (a numeric timestamp).

    Returns:
        dict with the keys:
            - "num_pkts" (int): number of packets that had both attributes
            - "total_bytes" (float): sum of ``len(pkt.raw_bytes)``
            - "mean_pkt_size" (float): total_bytes / num_pkts
            - "duration" (float): max(ts) - min(ts), in whatever unit
              ``pkt.ts`` uses; 0.0 when fewer than two packets were usable
        All values are zero when no usable packet is found.
    """
    sizes = []
    times = []

    for pkt in tsample.packets:
        # Defensive: some implementations might not have these attributes.
        # Read BOTH attributes before touching either list, so a packet that
        # lacks one of them is skipped as a whole and `sizes` / `times` always
        # describe the same set of packets.
        try:
            pkt_size = len(pkt.raw_bytes)
            pkt_time = pkt.ts
        except AttributeError:
            continue
        sizes.append(pkt_size)
        times.append(pkt_time)

    if not sizes:
        return {
            "num_pkts": 0,
            "total_bytes": 0.0,
            "mean_pkt_size": 0.0,
            "duration": 0.0,
        }

    total_bytes = sum(sizes)
    # A single packet has no time span.
    duration = (max(times) - min(times)) if len(times) > 1 else 0.0

    return {
        "num_pkts": len(sizes),
        "total_bytes": float(total_bytes),
        "mean_pkt_size": float(total_bytes / len(sizes)),
        "duration": float(duration),
    }


# One dict per sample in each list; both lists are filled in lockstep and
# share the "sampleID" key.
feature_rows, label_rows = [], []

for tsample in pcapml_fe.sampler(str(PCAP_PATH)):
    # --- identifier -----------------------------------------------------
    # The sampler's ID, stored as text, serves as the sampleID.
    sid = str(tsample.sid)

    # --- labels ---------------------------------------------------------
    # Metadata is expected to look like "p2p_torrent_torrent":
    # exactly three "_"-separated levels (top / mid / fine-grained).
    meta = str(tsample.metadata).strip()
    parts = meta.split("_")
    if len(parts) != 3:
        raise ValueError(f"Unexpected metadata format for sample {sid}: {meta}")
    easy_lbl, med_lbl, hard_lbl = parts

    # --- features -------------------------------------------------------
    feats = extract_features_from_sample(tsample)
    feats["sampleID"] = sid
    feature_rows.append(feats)

    label_rows.append(
        dict(
            sampleID=sid,
            easy_label=easy_lbl,
            medium_label=med_lbl,
            hard_label=hard_lbl,
        )
    )

# Build each frame once from the accumulated records.
features_df = pd.DataFrame(feature_rows)
labels_df = pd.DataFrame(label_rows)

print("Feature frame shape:", features_df.shape)
print("Label frame shape:  ", labels_df.shape)


# ---------------------------------------------------------------------
# 3. Merge features + labels into a single table
# ---------------------------------------------------------------------

# Join features and labels on the shared sample identifier.
# validate="one_to_one" makes pandas raise a MergeError if a sampleID occurs
# more than once on either side, instead of silently multiplying rows.
full_df = features_df.merge(
    labels_df,
    on="sampleID",
    how="inner",
    validate="one_to_one",
)
print("Merged frame shape:", full_df.shape)

# Save to disk for later experiments
OUTPUT_CSV = BASE_DIR / "application_id_dataset.csv"
full_df.to_csv(OUTPUT_CSV, index=False)
print(f"Saved merged dataset to: {OUTPUT_CSV}")


# ---------------------------------------------------------------------
# 4. Train a simple classifier on the easy_label (7-class problem)
# ---------------------------------------------------------------------

TARGET_COL = "easy_label"   # change to medium_label / hard_label if desired
feature_cols = ["num_pkts", "total_bytes", "mean_pkt_size", "duration"]

# Feature matrix and string-typed target vector taken from the merged table.
X = full_df[feature_cols].values
y_raw = full_df[TARGET_COL].astype(str).values

# Fit the encoder on the text labels, then map them to integer codes.
le = LabelEncoder().fit(y_raw)
y = le.transform(y_raw)

# Hold out 30% of the rows for testing, stratified on the encoded target.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=42,
)

# Random forest with fixed seed; n_jobs=-1 is passed to use all cores.
clf = RandomForestClassifier(
    n_estimators=200, max_depth=None, n_jobs=-1, random_state=42
)
clf.fit(X_train, y_train)

# Score the held-out rows with balanced accuracy.
y_pred = clf.predict(X_test)
bal_acc = balanced_accuracy_score(y_test, y_pred)
print(f"Balanced Accuracy on {TARGET_COL}: {bal_acc:.4f}")

print("Done.")


Decompressed input to: /var/folders/2p/btp12d5s1c9fmnnlwdd9r2vh0000gp/T/pcapml_mgetn0ce/traffic.pcapng
Iterating over pcapML samples and extracting features + labels...
Feature frame shape: (158355, 5)
Label frame shape:   (158355, 4)
Merged frame shape: (158355, 8)
Saved merged dataset to: /Users/school/Documents/UChicago/ml_systems_final/application_id_dataset.csv
Balanced Accuracy on easy_label: 0.4272
Done.


In [9]:
import pandas as pd

# Reload the merged dataset written by the feature-extraction cell.
# sampleID is an identifier, not a quantity: read it back as text, the same
# way it was written, rather than letting pandas coerce it to a number.
df = pd.read_csv("application_id_dataset.csv", dtype={"sampleID": str})
df.head(50)  # show the first 50 rows


Unnamed: 0,num_pkts,total_bytes,mean_pkt_size,duration,sampleID,easy_label,medium_label,hard_label
0,1,145.0,145.0,0.0,9868669216672554899,p2p,torrent,torrent
1,1,60.0,60.0,0.0,15379293250252091038,p2p,torrent,torrent
2,100,91031.0,910.31,430872.0,8149511148527902631,p2p,torrent,torrent
3,2,509.0,254.5,286177.0,16847835362422566935,p2p,torrent,torrent
4,2,128.0,64.0,426045.0,13772236344740749544,p2p,torrent,torrent
5,1,148.0,148.0,0.0,2875513026698916686,p2p,torrent,torrent
6,2,448.0,224.0,99582.0,11930790100041994536,p2p,torrent,torrent
7,2,464.0,232.0,253184.0,15382778996981125454,p2p,torrent,torrent
8,55,44992.0,818.036364,529793.0,10619935944206136580,p2p,torrent,torrent
9,2,472.0,236.0,382.0,5029945778292188394,p2p,torrent,torrent


In [10]:
import pandas as pd

# Re-read the dataset produced earlier so this cell can run on its own.
df = pd.read_csv("application_id_dataset.csv")

# The three label granularities to summarise.
label_cols = ["easy_label", "medium_label", "hard_label"]

for col in label_cols:
    column = df[col]
    # Per-class frequencies, ordered alphabetically by class name.
    per_class = column.value_counts().sort_index()

    print(f"\n=== Class Counts for {col} ===")
    print(per_class)
    print(f"Total unique classes: {column.nunique()}")
    print(f"Total samples: {column.count()}")



=== Class Counts for easy_label ===
easy_label
audio            113150
chat               5070
email              2898
file-transfer     32114
p2p                1045
tor                 109
video              3969
Name: count, dtype: int64
Total unique classes: 7
Total samples: 158355

=== Class Counts for medium_label ===
medium_label
aim             409
email          2898
facebook      44366
ftps            750
gmail           446
google            5
hangouts      47433
icq             434
netflix         255
scp             170
sftp            188
skype         55618
spotify         204
torrent        1045
twitter           6
vimeo           422
voipbuster     2773
youtube         933
Name: count, dtype: int64
Total unique classes: 18
Total samples: 158355

=== Class Counts for hard_label ===
hard_label
aim-chat            409
email              2898
facebook-audio    43454
facebook-chat       505
facebook-video      402
ftps-down           606
ftps-up             144
gmail-chat 