# Create the Fetures Group from PCAPs

In [1]:
# Install scapy
# !pip install scapy

import re

# Utilities for Text and Regular Expressions
import string

# Datetime for Timestamp Processing
from datetime import datetime

# Standard and Data Analysis Libraries
import numpy as np
import pandas as pd
import hashlib

# Scapy for Packet Analysis
from scapy.all import IP, TCP, UDP, Ether, PcapReader, Raw, sniff



In [2]:
from functools import wraps
from time import time


def timing(f):
    @wraps(f)
    def wrap(*args, **kw):
        ts = time()
        result = f(*args, **kw)
        te = time()
        print("func:%r took: %2.4f sec" % (f.__name__, te - ts))
        return result

    return wrap

**Load PCAP Data:**

In [3]:
"""
def packets_from_pcap(filepath, chunk_size=10):
    packets = []
    with PcapReader(filepath) as pcap_reader:
        for packet in pcap_reader:
            packets.append(packet)
            if len(packets) == chunk_size:
                yield packets
                packets = []
        if packets:  # if there are any packets left in the buffer
            yield packets
"""


@timing
def packets_from_pcap(filepath, chunk_size=100, max_chunks=20):
    """
    Generator function to yield chunks of packets from a pcap file up to a specified number of chunks.
    """
    packets = []
    chunk_count = 0  # Initialize a counter for the number of yielded chunks
    with PcapReader(filepath) as pcap_reader:
        for packet in pcap_reader:
            packets.append(packet)
            if len(packets) == chunk_size:
                yield packets
                packets = []
                chunk_count += 1  # Increment the chunk counter
                if chunk_count >= max_chunks:
                    break  # Stop after reaching the specified number of chunks
        if packets:  # if there are any packets left in the buffer
            yield packets

**Extract Network Traffic Features:**

In [4]:
@timing
def extract_network_traffic_features(packets):
    # List to collect packet data
    network_traffic_features = []

    # Protocol number to name mapping (only adding common ones, expand as needed)
    protocol_mapping = {6: "TCP", 17: "UDP"}

    for packet in packets:
        # Initialize variables for each feature
        eth_src = eth_dst = src_ip = dst_ip = src_port = dst_port = protocol = None
        timestamp = packet.time

        # Extract Ethernet frame information
        if Ether in packet:
            eth_src = packet[Ether].src
            eth_dst = packet[Ether].dst

        # Extract IP packet information
        if IP in packet:
            src_ip = packet[IP].src
            dst_ip = packet[IP].dst

            if TCP in packet:
                src_port = packet[TCP].sport
                dst_port = packet[TCP].dport
                protocol = "TCP"
            elif UDP in packet:
                src_port = packet[UDP].sport
                dst_port = packet[UDP].dport
                protocol = "UDP"
            else:
                # Handle other IP protocols
                protocol = protocol_mapping.get(packet[IP].proto, packet[IP].proto)

        # Append extracted features to the list
        network_traffic_features.append(
            {
                "eth_src": eth_src,
                "eth_dst": eth_dst,
                "src_ip": src_ip,
                "dst_ip": dst_ip,
                "src_port": src_port,
                "dst_port": dst_port,
                "protocol": protocol,
                "timestamp": timestamp,
            }
        )

    # Convert the list to a DataFrame
    return pd.DataFrame(network_traffic_features)

 **Extract Session-related Information Features:**

In [5]:
@timing
def extract_session_behavior_features(packets):
    session_behaviour_features = []

    # Given that session identification is non-trivial, let's focus on other metrics
    # such as inter-arrival time for packets from the same source

    src_times = {}

    for packet in packets:
        if IP in packet:
            src_ip = packet[IP].src
            timestamp = packet.time

            if src_ip in src_times:
                inter_arrival_time = timestamp - src_times[src_ip]
            else:
                inter_arrival_time = None

            src_times[src_ip] = timestamp
            session_behaviour_features.append(
                {"src_ip": src_ip, "inter_arrival_time": inter_arrival_time}
            )

    return pd.DataFrame(session_behaviour_features)

**Extract Network Flow Features:**

In [6]:
@timing
def extract_network_flow_features(packets):
    network_flow_list = []

    flow_packet_counts = {}
    flow_total_bytes = {}
    flow_first_timestamp = {}
    flow_last_timestamp = {}

    for packet in packets:
        timestamp = datetime.utcfromtimestamp(float(packet.time)).strftime(
            "%Y-%m-%d %H:%M:%S"
        )

        # Extract Ethernet frame information
        eth_src, eth_dst = None, None
        if Ether in packet:
            eth_src = packet[Ether].src
            eth_dst = packet[Ether].dst

        # Extract IP packet information
        if IP in packet:
            src_ip = packet[IP].src
            dst_ip = packet[IP].dst

            src_port, dst_port = None, None
            if TCP in packet:
                src_port = packet[TCP].sport
                dst_port = packet[TCP].dport
            elif UDP in packet:
                src_port = packet[UDP].sport
                dst_port = packet[UDP].dport

            flow_identifier = (src_ip, dst_ip, src_port, dst_port)

            if flow_identifier not in flow_packet_counts:
                flow_packet_counts[flow_identifier] = 0
                flow_total_bytes[flow_identifier] = 0
                flow_first_timestamp[flow_identifier] = timestamp

            flow_packet_counts[flow_identifier] += 1
            flow_total_bytes[flow_identifier] += len(packet)
            flow_last_timestamp[flow_identifier] = timestamp

    for flow_identifier, packet_count in flow_packet_counts.items():
        src_ip, dst_ip, src_port, dst_port = flow_identifier
        total_bytes = flow_total_bytes[flow_identifier]
        first_timestamp = flow_first_timestamp[flow_identifier]
        last_timestamp = flow_last_timestamp[flow_identifier]

        if src_ip.startswith("192.168.1"):
            traffic_direction = "Outbound"
        elif dst_ip.startswith("192.168.1"):
            traffic_direction = "Inbound"
        else:
            traffic_direction = "External"

        network_flow_list.append(
            [
                src_ip,
                dst_ip,
                src_port,
                dst_port,
                packet_count,
                total_bytes,
                traffic_direction,
                first_timestamp,
                last_timestamp,
            ]
        )

    headers = [
        "src_ip",
        "dst_ip",
        "src_port",
        "dst_port",
        "packet_count",
        "total_bytes",
        "traffic_direction",
        "first_timestamp",
        "last_timestamp",
    ]

    return pd.DataFrame(network_flow_list, columns=headers)

**Extract Protocol-specific Features:**

In [7]:
@timing
def extract_protocol_specific_features(packets):
    protocol_specific_features = []
    
    # Simpler pre-compiled regex for HTTP header extraction
    http_header_regex = re.compile(r'^([a-zA-Z\-]+): (.+)$', re.MULTILINE)
    
    timestamps = []

    for packet in packets:
        packet_features = {}
        
        timestamps.append(float(packet.time))

        if Ether in packet:
            eth_layer = packet[Ether]
            packet_features['eth_src'] = eth_layer.src
            packet_features['eth_dst'] = eth_layer.dst

        if IP in packet:
            ip_layer = packet[IP]
            packet_features['src_ip'] = ip_layer.src
            packet_features['dst_ip'] = ip_layer.dst

            if TCP in packet:
                tcp_layer = packet[TCP]
                packet_features['src_port'] = tcp_layer.sport
                packet_features['dst_port'] = tcp_layer.dport

                if Raw in packet:
                    raw_payload = str(packet[Raw].load)[:4096]  # only consider first 4KB
                    if (packet_features['src_port'] == 80 or packet_features['dst_port'] == 80) and "HTTP" in raw_payload[:4]:
                        http_headers = dict(http_header_regex.findall(raw_payload))
                        packet_features.update(http_headers)

                    elif packet_features['src_port'] == 22 or packet_features['dst_port'] == 22:
                        packet_features['ssh_data_sample'] = raw_payload[:10]

        protocol_specific_features.append(packet_features)
    
    df = pd.DataFrame(protocol_specific_features)
    df['timestamp'] = pd.to_datetime(pd.Series(timestamps), unit='s').dt.strftime('%Y-%m-%d %H:%M:%S')

    return df




**Extract Payload Characteristics Features:**

In [8]:
@timing
def extract_payload_characteristics_features(packets):
    payload_characteristics_features = []

    for packet in packets:
        packet_features = {}

        packet_features["timestamp"] = datetime.utcfromtimestamp(
            float(packet.time)
        ).strftime("%Y-%m-%d %H:%M:%S")

        if Ether in packet:
            packet_features["eth_src"] = packet[Ether].src
            packet_features["eth_dst"] = packet[Ether].dst

        if IP in packet:
            packet_features["src_ip"] = packet[IP].src
            packet_features["dst_ip"] = packet[IP].dst

            if TCP in packet:
                packet_features["src_port"] = packet[TCP].sport
                packet_features["dst_port"] = packet[TCP].dport

                if Raw in packet:
                    raw_payload = packet[Raw].load
                    packet_features["payload_size"] = len(raw_payload)
                    #payload_hash = hashlib.sha256(raw_payload).hexdigest()
                    #packet_features["payload_hash"] = payload_hash

        payload_characteristics_features.append(packet_features)

    return pd.DataFrame(payload_characteristics_features)

**Main Execution and Saving DataFrames:**

In [9]:
import argparse
import logging
import os
import os.path

if __name__ == "__main__":
    filepath = "Tuesday-WorkingHours.pcap"

    # Function to write to CSV
    def save_to_csv(df, filename):
        # Check if file exists to decide on writing headers
        write_header = not os.path.exists(filename)
        df.to_csv(
            filename,
            mode="a",
            header=write_header,
            index=False,
        )

    for packets_chunk in packets_from_pcap(filepath):
        network_traffic_df = extract_network_traffic_features(packets_chunk)
        save_to_csv(network_traffic_df, "network_traffic_features.csv")

        session_behavior_df = extract_session_behavior_features(packets_chunk)
        save_to_csv(session_behavior_df, "session_behaviour_features.csv")

        network_flow_df = extract_network_flow_features(packets_chunk)
        save_to_csv(network_flow_df, "network_flow_features.csv")

        protocol_specific_df = extract_protocol_specific_features(packets_chunk)
        save_to_csv(protocol_specific_df, "protocol_specific_features.csv")

        payload_characteristics_df = extract_payload_characteristics_features(packets_chunk)
        save_to_csv(payload_characteristics_df, "payload_characteristics_features.csv")

        print(f"Received a chunk of {len(packets_chunk)} packets")


func:'packets_from_pcap' took: 0.0000 sec
func:'extract_network_traffic_features' took: 0.0064 sec
func:'extract_session_behavior_features' took: 0.0020 sec
func:'extract_network_flow_features' took: 0.0193 sec
func:'extract_protocol_specific_features' took: 0.0061 sec
func:'extract_payload_characteristics_features' took: 0.0052 sec
Received a chunk of 100 packets
func:'extract_network_traffic_features' took: 0.0045 sec
func:'extract_session_behavior_features' took: 0.0012 sec
func:'extract_network_flow_features' took: 0.0312 sec
func:'extract_protocol_specific_features' took: 0.0058 sec
func:'extract_payload_characteristics_features' took: 0.0064 sec
Received a chunk of 100 packets
func:'extract_network_traffic_features' took: 0.0044 sec
func:'extract_session_behavior_features' took: 0.0012 sec
func:'extract_network_flow_features' took: 0.0164 sec
func:'extract_protocol_specific_features' took: 0.0068 sec
func:'extract_payload_characteristics_features' took: 0.0072 sec
Received a chun