# Create the Feature Groups from PCAPs

In [1]:
# Install scapy
!pip install scapy

# Standard and Data Analysis Libraries
import numpy as np
import pandas as pd

# Datetime for Timestamp Processing
from datetime import datetime

# Scapy for Packet Analysis
from scapy.all import Ether, IP, TCP, UDP, Raw, PcapReader, sniff

# Utilities for Text and Regular Expressions
import string
import re



Collecting scapy
  Downloading scapy-2.5.0.tar.gz (1.3 MB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.3 MB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.2/1.3 MB[0m [31m7.8 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━[0m [32m1.0/1.3 MB[0m [31m14.3 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m13.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: scapy
  Building wheel for scapy (setup.py) ... [?25l[?25hdone
  Created wheel for scapy: filename=scapy-2.5.0-py2.py3-none-any.whl size=1444330 sha256=a2e0d74d1f045792c2369871a1070243fdb13660e11bab897abd870662dc9196
  Stored in directory: /root/.cache/pip/wheels/82/b7/03/8344d8cf6695624746311bc0d389e9d05535ca83c35f90241d
Successfull

**Load PCAP Data:**

In [2]:
def packets_from_pcap(filepath, chunk_size=10000):
    """
    Stream a pcap file as successive lists of packets.

    Packets are read one at a time through ``PcapReader`` and collected into a
    list; each time the list reaches ``chunk_size`` entries it is yielded and a
    fresh list is started. Whatever remains once the file is exhausted is
    yielded as a final, shorter list (nothing is yielded for an empty file).

    Args:
        filepath: Path of the pcap file, passed straight to ``PcapReader``.
        chunk_size: Number of packets per yielded list.

    Yields:
        list: The next batch of packets, in file order.
    """
    batch = []
    with PcapReader(filepath) as reader:
        for pkt in reader:
            batch.append(pkt)
            if len(batch) != chunk_size:
                continue
            # Batch is full: hand it out and start a new list (the yielded
            # list object is never reused, so callers may keep it).
            yield batch
            batch = []
        # Flush the partially filled last batch, if any.
        if len(batch) > 0:
            yield batch


**Extract Network Traffic Features:**

In [3]:
def extract_network_traffic_features(packets):
    """
    Build one row of Ethernet/IP/transport addressing features per packet.

    Args:
        packets: Iterable of scapy packets.

    Returns:
        pandas.DataFrame with the columns eth_src, eth_dst, src_ip, dst_ip,
        src_port, dst_port, protocol, timestamp -- always present and always
        in that order, including when ``packets`` is empty. A field whose
        layer is missing from the packet is left empty (None/NaN).
        ``timestamp`` is the unformatted ``packet.time`` value.
    """
    # Fixed schema: guarantees identical columns for every call, so rows
    # written without a header line up across chunks and an empty input
    # still produces a correctly shaped frame.
    columns = ['eth_src', 'eth_dst', 'src_ip', 'dst_ip',
               'src_port', 'dst_port', 'protocol', 'timestamp']

    # Protocol number to name mapping (only adding common ones, expand as needed).
    # It is only consulted when the packet has neither a TCP nor a UDP layer;
    # unknown numbers are passed through unchanged.
    protocol_mapping = {6: 'TCP', 17: 'UDP'}

    network_traffic_features = []

    for packet in packets:
        # Start every row with all fields empty, then fill in what is present.
        features = dict.fromkeys(columns)
        features['timestamp'] = packet.time

        # Extract Ethernet frame information
        if Ether in packet:
            features['eth_src'] = packet[Ether].src
            features['eth_dst'] = packet[Ether].dst

        # Extract IP packet information
        if IP in packet:
            features['src_ip'] = packet[IP].src
            features['dst_ip'] = packet[IP].dst

            if TCP in packet:
                features['src_port'] = packet[TCP].sport
                features['dst_port'] = packet[TCP].dport
                features['protocol'] = 'TCP'
            elif UDP in packet:
                features['src_port'] = packet[UDP].sport
                features['dst_port'] = packet[UDP].dport
                features['protocol'] = 'UDP'
            else:
                # Handle other IP protocols
                proto = packet[IP].proto
                features['protocol'] = protocol_mapping.get(proto, proto)

        network_traffic_features.append(features)

    return pd.DataFrame(network_traffic_features, columns=columns)


**Extract Session Behavior Features:**

In [4]:
def extract_session_behavior_features(packets, session_timeout=300):
    """
    Summarise authentication-failure "sessions" found in a packet sequence.

    Only packets carrying an IP layer are considered. A session starts at the
    first IP packet seen after the previous session closed. Every packet whose
    payload bytes contain ``authentication_failed`` counts as an auth event.
    A session is closed -- and a row emitted -- when a packet arrives more than
    ``session_timeout`` seconds after the most recent auth event.

    Args:
        packets: Iterable of scapy packets, expected in capture order.
        session_timeout: Seconds since the last auth event after which the
            session is closed. Defaults to 300 (5 minutes).

    Returns:
        pandas.DataFrame with columns session_duration, auth_event_count,
        auth_attempt_interval; one row per closed session.
    """
    # Compiled once; matched against the raw payload bytes of each packet.
    auth_event_pattern = re.compile(b"authentication_failed")

    session_behaviour_list = []

    current_session_start_time = None
    failed_login_count = 0
    last_auth_event_time = None

    for packet in packets:
        if IP not in packet:
            continue  # We only care about IP packets

        # The first IP packet after a reset opens a new session.
        if current_session_start_time is None:
            current_session_start_time = packet.time

        # Check if the packet contains the authentication event pattern
        if auth_event_pattern.search(bytes(packet.payload)):
            failed_login_count += 1
            last_auth_event_time = packet.time

        # Compare against None explicitly: a timestamp of 0 is a valid time
        # but is falsy, and would otherwise keep the session open forever.
        if last_auth_event_time is not None and (packet.time - last_auth_event_time > session_timeout):
            session_duration = last_auth_event_time - current_session_start_time

            # NOTE(review): as written this is the same value as
            # session_duration (only reported when there was more than one
            # auth event) -- confirm whether a per-attempt interval was meant.
            auth_attempt_interval = session_duration if failed_login_count > 1 else None

            # Append session behavior features to the list
            session_behaviour_list.append([session_duration, failed_login_count, auth_attempt_interval])

            # Reset session-related variables for the next session
            current_session_start_time = None
            failed_login_count = 0
            last_auth_event_time = None

    # NOTE(review): a session that is still open when the packets run out is
    # not emitted; with chunked input that includes sessions spanning chunks.
    return pd.DataFrame(session_behaviour_list, columns=['session_duration', 'auth_event_count', 'auth_attempt_interval'])


**Extract Network Flow Features:**

In [5]:
def extract_network_flow_features(packets, local_prefix='192.168.1'):
    """
    Aggregate packets into unidirectional flows and compute per-flow totals.

    A flow is identified by ``(src_ip, dst_ip, src_port, dst_port)``; ports are
    None for IP packets without a TCP or UDP layer. Packets without an IP
    layer are ignored.

    Args:
        packets: Iterable of scapy packets.
        local_prefix: String prefix identifying local addresses. It is a plain
            ``str.startswith`` test, not a subnet match, so the default
            '192.168.1' also matches e.g. 192.168.10.x and 192.168.100.x.

    Returns:
        pandas.DataFrame with one row per flow (in order of first appearance)
        and columns src_ip, dst_ip, src_port, dst_port, packet_count,
        total_bytes, traffic_direction, first_timestamp, last_timestamp.
        Timestamps are 'YYYY-MM-DD HH:MM:SS' strings in UTC.
        traffic_direction is 'Outbound' when the source matches
        ``local_prefix``, else 'Inbound' when the destination matches,
        else 'External'.
    """
    headers = ['src_ip', 'dst_ip', 'src_port', 'dst_port', 'packet_count',
               'total_bytes', 'traffic_direction', 'first_timestamp', 'last_timestamp']

    # flow identifier -> running statistics; dict order gives first-seen order.
    flows = {}

    for packet in packets:
        timestamp = datetime.utcfromtimestamp(float(packet.time)).strftime('%Y-%m-%d %H:%M:%S')

        # Only IP packets belong to a flow.
        if IP not in packet:
            continue

        src_ip = packet[IP].src
        dst_ip = packet[IP].dst

        src_port, dst_port = None, None
        if TCP in packet:
            src_port = packet[TCP].sport
            dst_port = packet[TCP].dport
        elif UDP in packet:
            src_port = packet[UDP].sport
            dst_port = packet[UDP].dport

        flow_identifier = (src_ip, dst_ip, src_port, dst_port)

        stats = flows.get(flow_identifier)
        if stats is None:
            stats = {'packet_count': 0, 'total_bytes': 0, 'first_timestamp': timestamp}
            flows[flow_identifier] = stats

        stats['packet_count'] += 1
        stats['total_bytes'] += len(packet)
        stats['last_timestamp'] = timestamp

    network_flow_list = []
    for (src_ip, dst_ip, src_port, dst_port), stats in flows.items():
        # Source is checked first, so local-to-local traffic counts as Outbound.
        if src_ip.startswith(local_prefix):
            traffic_direction = 'Outbound'
        elif dst_ip.startswith(local_prefix):
            traffic_direction = 'Inbound'
        else:
            traffic_direction = 'External'

        network_flow_list.append([
            src_ip, dst_ip, src_port, dst_port,
            stats['packet_count'], stats['total_bytes'], traffic_direction,
            stats['first_timestamp'], stats['last_timestamp'],
        ])

    return pd.DataFrame(network_flow_list, columns=headers)


**Extract Protocol-specific Features:**

In [6]:
def extract_protocol_specific_features(packets):
    """
    Extract HTTP (port 80) and SSH (port 22) details, one row per packet.

    Every packet produces a row with its timestamp and whatever Ethernet, IP
    and TCP addressing it carries. For TCP packets with a Raw payload:
      * port 80 on either side: the text before the first blank line is stored
        as ``http_headers`` and, if an ``HTTP/1.x NNN`` status line is found in
        it, the code is stored as ``http_status_code``;
      * port 22 on either side: ``str()`` of the payload bytes (the ``b'...'``
        representation) is stored as ``ssh_protocol_details``.

    Args:
        packets: Iterable of scapy packets.

    Returns:
        pandas.DataFrame with the columns timestamp, eth_src, eth_dst, src_ip,
        dst_ip, src_port, dst_port, http_headers, http_status_code,
        ssh_protocol_details -- always all present and in that order; fields
        that do not apply to a packet are empty (NaN/None).
    """
    # Fixed schema: without it the set and order of columns depend on which
    # fields happen to occur in this particular batch, which misaligns rows
    # when several batches are appended to one header-less CSV.
    columns = ['timestamp', 'eth_src', 'eth_dst', 'src_ip', 'dst_ip',
               'src_port', 'dst_port', 'http_headers', 'http_status_code',
               'ssh_protocol_details']

    # Dot escaped so only a literal "HTTP/1.0" / "HTTP/1.1" matches.
    status_line_pattern = re.compile(r'HTTP/1\.[01] (\d{3})')

    protocol_specific_list = []

    for packet in packets:
        packet_features = {
            'timestamp': datetime.utcfromtimestamp(float(packet.time)).strftime('%Y-%m-%d %H:%M:%S')
        }

        # Extract Ethernet frame information
        if Ether in packet:
            packet_features['eth_src'] = packet[Ether].src
            packet_features['eth_dst'] = packet[Ether].dst

        # Extract IP packet information
        if IP in packet:
            packet_features['src_ip'] = packet[IP].src
            packet_features['dst_ip'] = packet[IP].dst

            if TCP in packet:
                src_port = packet[TCP].sport
                dst_port = packet[TCP].dport
                packet_features['src_port'] = src_port
                packet_features['dst_port'] = dst_port

                if Raw in packet:
                    raw_load = packet[Raw].load

                    # Check if the packet is HTTP (port 80)
                    if 80 in (src_port, dst_port):
                        # Decode the bytes instead of taking str(bytes): the
                        # latter yields the "b'...'" repr with *escaped* CR/LF,
                        # so the header/body separator could never be found.
                        if isinstance(raw_load, (bytes, bytearray)):
                            http_data = raw_load.decode('utf-8', errors='ignore')
                        else:
                            http_data = str(raw_load)
                        http_headers = http_data.split('\r\n\r\n')[0]
                        packet_features['http_headers'] = http_headers

                        status_code_match = status_line_pattern.search(http_headers)
                        if status_code_match:
                            packet_features['http_status_code'] = int(status_code_match.group(1))

                    # Check if the packet is SSH (port 22)
                    if 22 in (src_port, dst_port):
                        # Kept as the bytes repr: the payload may be binary.
                        packet_features['ssh_protocol_details'] = str(raw_load)

        # Append the dictionary of features to the list
        protocol_specific_list.append(packet_features)

    return pd.DataFrame(protocol_specific_list, columns=columns)



**Extract Payload Characteristics Features:**

In [7]:
def extract_payload_characteristics_features(packets):
    """
    Extract payload size and printable payload text, one row per packet.

    Every packet produces a row with its timestamp and whatever Ethernet, IP
    and TCP addressing it carries. For TCP packets with a Raw payload the row
    also gets ``payload_size`` (length of the raw bytes) and
    ``payload_contents`` (the payload decoded as UTF-8 with undecodable bytes
    dropped, then reduced to characters in ``string.printable``).

    Args:
        packets: Iterable of scapy packets.

    Returns:
        pandas.DataFrame with the columns timestamp, eth_src, eth_dst, src_ip,
        dst_ip, src_port, dst_port, payload_size, payload_contents -- always
        all present and in that order; fields that do not apply to a packet
        are empty (NaN/None).
    """
    # Fixed schema: without it the set and order of columns depend on which
    # fields happen to occur in this particular batch, which misaligns rows
    # when several batches are appended to one header-less CSV.
    columns = ['timestamp', 'eth_src', 'eth_dst', 'src_ip', 'dst_ip',
               'src_port', 'dst_port', 'payload_size', 'payload_contents']

    # Built once instead of once per packet.
    printable = set(string.printable)

    payload_characteristics_list = []

    for packet in packets:
        packet_features = {
            'timestamp': datetime.utcfromtimestamp(float(packet.time)).strftime('%Y-%m-%d %H:%M:%S')
        }

        # Extract Ethernet frame information
        if Ether in packet:
            packet_features['eth_src'] = packet[Ether].src
            packet_features['eth_dst'] = packet[Ether].dst

        # Extract IP packet information
        if IP in packet:
            packet_features['src_ip'] = packet[IP].src
            packet_features['dst_ip'] = packet[IP].dst

            if TCP in packet:
                packet_features['src_port'] = packet[TCP].sport
                packet_features['dst_port'] = packet[TCP].dport

                # Check if the packet contains payload (Raw layer)
                if Raw in packet:
                    raw_payload = packet[Raw].load
                    packet_features['payload_size'] = len(raw_payload)

                    # Clean the payload contents: keep printable characters only.
                    decoded = raw_payload.decode('utf-8', errors='ignore')
                    packet_features['payload_contents'] = ''.join(
                        ch for ch in decoded if ch in printable
                    )

        # Append the dictionary of features to the list
        payload_characteristics_list.append(packet_features)

    return pd.DataFrame(payload_characteristics_list, columns=columns)



**Main Execution and Saving DataFrames:**

In [None]:
import os
import argparse
import logging

if __name__ == "__main__":
    # Capture to process.
    filepath = '/content/drive/MyDrive/Tuesday-WorkingHours.pcap'

    # One output CSV per feature group.
    network_traffic_csv = '/content/drive/MyDrive/network_traffic_features.csv'
    session_behaviour_csv = '/content/drive/MyDrive/session_behaviour_features.csv'
    network_flow_csv = '/content/drive/MyDrive/network_flow_features.csv'
    protocol_specific_csv = '/content/drive/MyDrive/protocol_specific_features.csv'
    payload_characteristics_csv = '/content/drive/MyDrive/payload_characteristics_features.csv'

    # Rows are appended with no header line and no index column, so each
    # chunk simply extends the same file. Note that re-running this cell
    # appends the same rows again to any file that already exists.
    append_rows = dict(mode='a', header=False, index=False)

    # The pcap is processed chunk by chunk; every extractor sees one chunk at
    # a time, so its output (including per-flow and per-session aggregates)
    # is computed per chunk.
    for packets_chunk in packets_from_pcap(filepath):
        network_traffic_df = extract_network_traffic_features(packets_chunk)
        network_traffic_df.to_csv(network_traffic_csv, **append_rows)

        session_behavior_df = extract_session_behavior_features(packets_chunk)
        session_behavior_df.to_csv(session_behaviour_csv, **append_rows)

        network_flow_df = extract_network_flow_features(packets_chunk)
        network_flow_df.to_csv(network_flow_csv, **append_rows)

        protocol_specific_df = extract_protocol_specific_features(packets_chunk)
        protocol_specific_df.to_csv(protocol_specific_csv, **append_rows)

        payload_characteristics_df = extract_payload_characteristics_features(packets_chunk)
        payload_characteristics_df.to_csv(payload_characteristics_csv, **append_rows)

