# ML Systems Final Project - Traffic Classification

This notebook loads and processes network traffic data from a pcapng file with embedded pcapML labels.

In [5]:
import pandas as pd
import numpy as np
from pcapng import FileScanner
from scapy.all import *
from collections import defaultdict
import warnings
# Suppress every warning for the rest of the session.
# NOTE(review): this blanket filter also hides deprecation and runtime warnings
# raised by the imported libraries — consider narrowing it to specific categories.
warnings.filterwarnings('ignore')

## Load pcapng data with pcapML labels

The labels are embedded in the pcapng file as frame comments in the format:
`sampleID,easylabel_mediumlabel_hardlabel`

Packets with the same sampleID belong to the same flow/sample.

In [6]:
def load_pcapng_with_labels(filepath):
    """
    Load a pcapng file and extract per-packet data together with its pcapML labels.

    Every scanned block that exposes both ``packet_data`` and ``options`` is
    inspected. Its ``opt_comment`` option is expected to have the form
    ``sampleID,easylabel_mediumlabel_hardlabel``. Blocks without a comment, or
    whose comment contains no comma, are skipped.

    Args:
        filepath: Path of the pcapng capture; the file is opened in binary mode.

    Returns:
        pd.DataFrame with one row per labelled packet and the columns
        sample_id, easy_label, medium_label, hard_label, packet_length,
        timestamp and packet_data (the raw packet data). Label levels missing
        from the comment are stored as None. The columns are present even
        when no labelled packet is found, so downstream column lookups do not
        raise KeyError on an empty capture.
    """
    columns = ['sample_id', 'easy_label', 'medium_label', 'hard_label',
               'packet_length', 'timestamp', 'packet_data']
    packets_data = []

    with open(filepath, 'rb') as f:
        for block in FileScanner(f):
            # Only blocks carrying packet bytes and options can hold a pcapML comment
            if not (hasattr(block, 'packet_data') and hasattr(block, 'options')):
                continue
            if 'opt_comment' not in block.options:
                continue

            # The option value is already a string, not a list
            comment = block.options['opt_comment']
            if ',' not in comment:
                continue

            # Comment layout: sampleID,easylabel_mediumlabel_hardlabel
            sample_id, labels_str = comment.split(',', 1)

            # Split into at most three fields so that a hard label which itself
            # contains underscores is kept whole instead of being truncated.
            labels = labels_str.split('_', 2)
            # Pad with None when fewer than three label levels are present
            labels += [None] * (3 - len(labels))
            easy_label, medium_label, hard_label = labels

            packets_data.append({
                'sample_id': sample_id,
                'easy_label': easy_label,
                'medium_label': medium_label,
                'hard_label': hard_label,
                'packet_length': len(block.packet_data),
                'timestamp': block.timestamp,
                'packet_data': block.packet_data
            })

    # Passing the column list fixes the schema (and its order) even for zero rows
    return pd.DataFrame(packets_data, columns=columns)

In [7]:
# Parse the labelled capture into a per-packet DataFrame
capture_file = 'data/traffic.pcapng'
print("Loading pcapng file...")
df = load_pcapng_with_labels(capture_file)
print(f"Loaded {len(df)} packets")

Loading pcapng file...
Loaded 529019 packets


## Display the loaded data

In [8]:
# Overview of the per-packet table: shape, schema and number of distinct samples
print(f"Dataset shape: {df.shape}")
print(f"\nColumns: {df.columns.tolist()}")
print(f"\nNumber of unique samples: {df['sample_id'].nunique()}")

# One line per label granularity; value_counts() lists labels by descending frequency.
# The printed text is identical to three separate print calls.
print("\nLabel distribution:")
for level in ('easy', 'medium', 'hard'):
    counts = df[f'{level}_label'].value_counts().to_dict()
    print(f"  {level.capitalize()} labels: {counts}")

Dataset shape: (529019, 7)

Columns: ['sample_id', 'easy_label', 'medium_label', 'hard_label', 'packet_length', 'timestamp', 'packet_data']

Number of unique samples: 158355

Label distribution:
  Easy labels: {'audio': 321981, 'file-transfer': 97866, 'video': 47842, 'chat': 26680, 'email': 18876, 'p2p': 14927, 'tor': 847}
  Medium labels: {'skype': 169674, 'hangouts': 135227, 'facebook': 130253, 'email': 18876, 'torrent': 14927, 'youtube': 12052, 'voipbuster': 11355, 'ftps': 8585, 'vimeo': 6689, 'gmail': 4924, 'spotify': 3891, 'netflix': 3588, 'sftp': 2755, 'aim': 2268, 'icq': 2223, 'scp': 1491, 'twitter': 126, 'google': 115}
  Hard labels: {'hangouts-audio': 126174, 'facebook-audio': 122003, 'skype-file': 85035, 'skype-audio': 58558, 'email': 18876, 'torrent': 14927, 'skype-video': 13539, 'skype-chat': 12542, 'youtube': 11571, 'voipbuster': 11355, 'hangouts-video': 7548, 'vimeo': 6521, 'ftps-down': 5814, 'gmail-chat': 4924, 'facebook-video': 4907, 'spotify': 3891, 'netflix': 3588, 'f

In [9]:
# Preview the first 20 packets; the raw packet bytes are left out to keep the table readable
display_df = df.drop(columns='packet_data')
display_df.head(20)

Unnamed: 0,sample_id,easy_label,medium_label,hard_label,packet_length,timestamp
0,9868669216672554899,p2p,torrent,torrent,145,6132778000000.0
1,15379293250252091038,p2p,torrent,torrent,60,6132776000000.0
2,8149511148527902631,p2p,torrent,torrent,66,6132777000000.0
3,8149511148527902631,p2p,torrent,torrent,66,6132777000000.0
4,8149511148527902631,p2p,torrent,torrent,54,6132777000000.0
5,8149511148527902631,p2p,torrent,torrent,240,6132777000000.0
6,8149511148527902631,p2p,torrent,torrent,60,6132777000000.0
7,8149511148527902631,p2p,torrent,torrent,419,6132777000000.0
8,8149511148527902631,p2p,torrent,torrent,1404,6132777000000.0
9,8149511148527902631,p2p,torrent,torrent,54,6132777000000.0


In [10]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric
# columns only: packet_length and timestamp
df.describe()

Unnamed: 0,packet_length,timestamp
count,529019.0,529019.0
mean,166.879902,6137960000000.0
std,309.994372,4734748000.0
min,44.0,6132008000000.0
25%,64.0,6137605000000.0
50%,64.0,6137647000000.0
75%,92.0,6137941000000.0
max,1500.0,6156980000000.0


## Sample analysis

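Group packets by `sample_id` to compute flow-level statistics: packet count, total bytes, and the mean and standard deviation of packet size. Flows that contain a single packet have no defined standard deviation, so `std_packet_size` is NaN for them.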

In [11]:
# Collapse the per-packet rows into one row per sample_id (flow).
# Named aggregation yields the final column names directly, so no positional
# renaming of the columns is needed afterwards.
flow_stats = (
    df.groupby('sample_id')
    .agg(
        num_packets=('packet_length', 'count'),
        total_bytes=('packet_length', 'sum'),
        avg_packet_size=('packet_length', 'mean'),
        # sample standard deviation: NaN for flows that contain a single packet
        std_packet_size=('packet_length', 'std'),
        # labels are taken from the first packet of each flow
        easy_label=('easy_label', 'first'),
        medium_label=('medium_label', 'first'),
        hard_label=('hard_label', 'first'),
    )
    .reset_index()
)

print(f"Number of flows: {len(flow_stats)}")
flow_stats.head()

Number of flows: 158355


Unnamed: 0,sample_id,num_packets,total_bytes,avg_packet_size,std_packet_size,easy_label,medium_label,hard_label
0,10000037044577594660,2,462,231.0,220.617316,audio,hangouts,hangouts-audio
1,10000404577091291935,2,128,64.0,0.0,audio,hangouts,hangouts-audio
2,10000589167870106281,2,128,64.0,0.0,audio,skype,skype-audio
3,10000627971628610129,1,131,131.0,,chat,skype,skype-chat
4,10000663751554192124,2,214,107.0,35.355339,audio,facebook,facebook-audio
