# ML Systems Final Project - Traffic Classification

This notebook loads and processes network traffic data from a pcapng file with embedded pcapML labels.

In [None]:
import pandas as pd
from scapy.all import *

## Load the pcapng file and create a Wireshark-like DataFrame

In [None]:
def load_pcapng(filepath):
    """Load a pcap/pcapng capture into a Wireshark-like DataFrame.

    The whole capture is read with scapy's ``rdpcap`` and one row is
    produced per packet, in capture order.

    Parameters
    ----------
    filepath : str
        Path of the capture file; passed unchanged to ``rdpcap``.

    Returns
    -------
    pandas.DataFrame
        One row per packet with the columns:

        - ``No.``: 1-based packet index.
        - ``Time``: seconds elapsed since the first packet, as a float.
        - ``Source`` / ``Destination``: ``src`` / ``dst`` of the packet's
          ``IP`` layer, or ``''`` when the packet has no ``IP`` layer.
        - ``Protocol``: scapy's rendering of ``%IP.proto%`` for packets with
          an ``IP`` layer, otherwise of ``%Ether.type%``.
        - ``Length``: ``len(pkt)``.
        - ``Info``: ``pkt.summary()``.

        An empty capture yields an empty frame that still carries these
        columns, so downstream column lookups do not raise ``KeyError``.
    """
    columns = ['No.', 'Time', 'Source', 'Destination', 'Protocol', 'Length', 'Info']
    packets = rdpcap(filepath)

    data = []
    start_time = None

    for number, pkt in enumerate(packets, start=1):
        # The first packet defines time zero for the relative 'Time' column.
        if start_time is None:
            start_time = pkt.time

        # Look the IP layer up once instead of once per derived field.
        has_ip = IP in pkt
        ip_layer = pkt[IP] if has_ip else None

        data.append({
            'No.': number,
            # scapy may hand back timestamps as a Decimal-like object; cast to
            # float so the column is numeric (float64) rather than object.
            'Time': float(pkt.time - start_time),
            'Source': ip_layer.src if has_ip else '',
            'Destination': ip_layer.dst if has_ip else '',
            'Protocol': pkt.sprintf("%IP.proto%") if has_ip else pkt.sprintf("%Ether.type%"),
            'Length': len(pkt),
            'Info': pkt.summary(),
        })

    # Explicit columns keep the schema stable even when no packets were read.
    return pd.DataFrame(data, columns=columns)

In [None]:
# Parse the capture into `df`, report the packet count, and preview the
# first 20 rows as the cell's output.
print("Loading pcapng file...")
df = load_pcapng(filepath='data/traffic.pcapng.gz')
print(f"Loaded {len(df)} packets")
df.head(n=20)