# Create the Feature Groups from PCAPs

In [None]:
!pip install scapy


Collecting scapy
  Downloading scapy-2.5.0.tar.gz (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m14.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: scapy
  Building wheel for scapy (setup.py) ... [?25l[?25hdone
  Created wheel for scapy: filename=scapy-2.5.0-py2.py3-none-any.whl size=1444330 sha256=03ab9860615424b874fbc96185237a8ca590575a3ee74141e03d1bab56937307
  Stored in directory: /root/.cache/pip/wheels/82/b7/03/8344d8cf6695624746311bc0d389e9d05535ca83c35f90241d
Successfully built scapy
Installing collected packages: scapy
Successfully installed scapy-2.5.0


In [None]:
# Read the capture file into memory and run a quick sanity check on it.
from scapy.all import *

pcap_file_path = '/content/drive/MyDrive/output_part_1.pcap'
packets = rdpcap(pcap_file_path)

# Report how many packets were read, then show a one-line summary of the first one (if any).
packet_count = len(packets)
print(f"Number of packets in test load: {packet_count}")
if packet_count:
    print(packets[0].summary())



Number of packets in test load: 783877
802.3 70:6e:6d:1d:bb:0f > 01:00:0c:cc:cc:cc / LLC / SNAP / Raw


In [None]:
# Print a one-line summary for each of the first ten packets.
first_packets = packets[:10]
for packet in first_packets:
    print(packet.summary())


802.3 70:6e:6d:1d:bb:0f > 01:00:0c:cc:cc:cc / LLC / SNAP / Raw
802.3 70:6e:6d:1d:bb:06 > 01:00:0c:cc:cc:cc / LLC / SNAP / Raw
802.3 70:6e:6d:1d:bb:11 > 01:00:0c:cc:cc:cc / LLC / SNAP / Raw
Ether / ARP who has 192.168.10.3 says 192.168.10.50 / Padding
Ether / ARP who has 192.168.10.3 says 192.168.10.50 / Padding
Ether / ARP is at 18:66:da:9b:e3:7d says 192.168.10.3 / Padding
Ether / ARP is at 18:66:da:9b:e3:7d says 192.168.10.3 / Padding
802.3 70:6e:6d:1d:bb:02 > 01:00:0c:cc:cc:cc / LLC / SNAP / Raw
802.3 70:6e:6d:1d:bb:05 > 01:00:0c:cc:cc:cc / LLC / SNAP / Raw
Ether / IP / UDP / DNS Qry "b'v10.vortex-win.data.microsoft.com.'" 


In [None]:
# Define the feature groups: eight names, each bound to its own empty list.
(
    network_traffic_features,
    authentication_features,
    session_behaviour_features,
    frequency_features,
    network_anomalies_features,
    network_flow_features,
    protocol_specific_features,
    payload_characteristics_features,
) = ([] for _ in range(8))


In [None]:

# Features for network traffic:
# One record per packet: Ethernet addresses, IP addresses, transport ports, protocol and
# capture timestamp. Fields whose layer is absent from a packet stay None.
import numpy as np
import pandas as pd
from scapy.all import Ether, IP, TCP, UDP

# Start from an empty list so that re-running this cell does not append a second copy of
# every packet on top of the records left over from a previous run.
network_traffic_features = []

for packet in packets:
    # Defaults for packets that lack the corresponding layer
    eth_src = None
    eth_dst = None
    src_ip = None
    dst_ip = None
    src_port = None
    dst_port = None
    protocol = None
    timestamp = packet.time  # stored as provided by scapy, not formatted

    # Extract Ethernet frame information
    if Ether in packet:
        eth_src = packet[Ether].src
        eth_dst = packet[Ether].dst

    # Extract IP packet information
    if IP in packet:
        src_ip = packet[IP].src
        dst_ip = packet[IP].dst

        if TCP in packet:
            src_port = packet[TCP].sport
            dst_port = packet[TCP].dport
            protocol = 'TCP'
        elif UDP in packet:
            src_port = packet[UDP].sport
            dst_port = packet[UDP].dport
            protocol = 'UDP'
        else:
            # Neither TCP nor UDP: record the IP layer's `proto` field value instead of a name
            protocol = packet[IP].proto

    # Append extracted features as a dictionary to the network_traffic_features list
    network_traffic_features.append({
        'eth_src': eth_src,
        'eth_dst': eth_dst,
        'src_ip': src_ip,
        'dst_ip': dst_ip,
        'src_port': src_port,
        'dst_port': dst_port,
        'protocol': protocol,
        'timestamp': timestamp
    })

# Save the features:

# Convert the list of dictionaries to a Pandas DataFrame
df = pd.DataFrame(network_traffic_features)

# Save the DataFrame as a CSV file
df.to_csv('/content/drive/MyDrive/network_traffic_features.csv', index=False)



In [None]:
# Authentication features:
# For every packet that carries a TCP, UDP or Raw payload, search the payload text for
# authentication-related markers and record one row with what was found.

import re

import pandas as pd
from scapy.all import TCP, UDP, Raw, rdpcap
from datetime import datetime

# Numeric values embedded in the payload as "<key>=<digits>". Matching all consecutive
# digits avoids truncating multi-digit values and avoids int() failing on a non-digit.
FAILED_ATTEMPTS_RE = re.compile(r'failed_attempts=(\d+)')
SESSION_DURATION_RE = re.compile(r'session_duration=(\d+)')


def _payload_text(data):
    """Return the bytes of `data` decoded as UTF-8 text, dropping undecodable bytes.

    `data` may be a scapy layer or a bytes object; both are converted with bytes().
    NOTE(review): str() on a scapy layer does not give the decoded payload (it is a
    summary or a bytes repr depending on the scapy version - confirm), which is why the
    raw bytes are decoded explicitly before pattern matching.
    """
    return bytes(data).decode('utf-8', errors='ignore')


authentication_features = []  # rebuilt from scratch on every run of this cell

for packet in packets:
    authentication_method = None
    payload = None

    # Pick the payload from the first matching layer: TCP, then UDP, then Raw
    if packet.haslayer(TCP) and packet[TCP].payload and len(packet[TCP].payload) > 0:
        payload = _payload_text(packet[TCP].payload)
        authentication_method = 'TCP'

    elif packet.haslayer(UDP) and packet[UDP].payload and len(packet[UDP].payload) > 0:
        payload = _payload_text(packet[UDP].payload)
        authentication_method = 'UDP'

    # The data of a Raw layer is in its `load` field, not in a nested `.payload`
    elif packet.haslayer(Raw) and len(packet[Raw].load) > 0:
        payload = _payload_text(packet[Raw].load)
        authentication_method = 'Raw'

    # Packets without any payload produce no row
    if payload is None:
        continue

    # Extract timestamp from the packet (UTC, second resolution)
    packet_timestamp = datetime.utcfromtimestamp(float(packet.time)).strftime('%Y-%m-%d %H:%M:%S')

    # Check for login patterns ('login' takes precedence over 'auth')
    if 'login' in payload:
        authentication_method += '-Login'
    elif 'auth' in payload:
        authentication_method += '-Auth'

    # Failed login attempts - assumes a "failed_attempts=<n>" marker in the payload
    failed_match = FAILED_ATTEMPTS_RE.search(payload)
    failed_login_attempts = int(failed_match.group(1)) if failed_match else None

    # Successful authentication - assumes an "auth=success" marker in the payload
    successful_authentication = 'auth=success' in payload

    # Session duration is only available when the payload itself reports it as
    # "session_duration=<n>"; real start/end tracking is not done here.
    duration_match = SESSION_DURATION_RE.search(payload)
    session_duration = int(duration_match.group(1)) if duration_match else None

    # Add to our features list
    authentication_features.append({
        'timestamp': packet_timestamp,
        'authentication_method': authentication_method,
        'failed_login_attempts': failed_login_attempts,
        'successful_authentication': successful_authentication,
        'session_duration': session_duration
    })

df = pd.DataFrame(authentication_features)
print(df.head())

df.to_csv('/content/drive/MyDrive/authentication_features.csv', index=False)




             timestamp authentication_method failed_login_attempts  \
0  2017-07-06 11:59:17                   UDP                  None   
1  2017-07-06 11:59:17                   UDP                  None   
2  2017-07-06 11:59:17                   UDP                  None   
3  2017-07-06 11:59:17                   UDP                  None   
4  2017-07-06 11:59:55                   UDP                  None   

   successful_authentication session_duration  
0                      False             None  
1                      False             None  
2                      False             None  
3                      False             None  
4                      False             None  


In [None]:
# Session-related information features:
# A "session" here is a run of consecutive IP packets; the first non-IP packet after such
# a run closes it. One row is produced per session.


import numpy as np
import pandas as pd
from datetime import datetime
from scapy.all import Ether, IP

# Define a pattern to identify authentication-related events
auth_event_pattern = "authentication_failed"


def _session_row(start_time, end_time, failed_logins, previous_boundary_time):
    """Build the feature row for one finished session.

    Returns [session_duration, auth_event_count, failed_login_count, auth_attempt_interval].
    auth_event_count is always 1. auth_attempt_interval is the gap between this session's
    start and `previous_boundary_time` (the packet that closed the previous session), or
    None when there is no previous session.
    """
    interval = None
    if previous_boundary_time is not None:
        interval = start_time - previous_boundary_time
    return [end_time - start_time, 1, failed_logins, interval]


current_session_start_time = None
current_session_end_time = None
session_start_time = None  # time of the non-IP packet that closed the previous session
failed_login_count = 0
session_behaviour_features = []
session_timestamps = []

# Iterate through the packets and extract the features
for packet in packets:
    if IP in packet:
        # First IP packet of a run opens a session and records its start timestamp
        if current_session_start_time is None:
            current_session_start_time = packet.time
            # Convert packet.time to float
            timestamp = datetime.utcfromtimestamp(float(packet.time)).strftime('%Y-%m-%d %H:%M:%S')
            session_timestamps.append(timestamp)
        current_session_end_time = packet.time

        # Search the decoded payload bytes for the authentication event pattern.
        # NOTE(review): str() on a scapy layer is not the decoded payload - confirm.
        payload_text = bytes(packet.payload).decode('utf-8', errors='ignore')
        if auth_event_pattern in payload_text:
            failed_login_count += 1
    elif current_session_start_time is not None:
        # A non-IP packet closes the session that is currently open
        session_behaviour_features.append(
            _session_row(current_session_start_time, current_session_end_time,
                         failed_login_count, session_start_time)
        )

        # Reset session-related variables for the next session
        current_session_start_time = None
        current_session_end_time = None
        session_start_time = packet.time
        failed_login_count = 0

# The capture may end while a session is still open. Emit that session too; otherwise
# session_timestamps has one more entry than there are rows and the column assignment
# below fails with a length mismatch.
if current_session_start_time is not None:
    session_behaviour_features.append(
        _session_row(current_session_start_time, current_session_end_time,
                     failed_login_count, session_start_time)
    )

# Save the features:
# Convert the list of lists to a Pandas DataFrame
df = pd.DataFrame(session_behaviour_features, columns=['session_duration', 'auth_event_count', 'failed_login_count', 'auth_attempt_interval'])
df['session_start_timestamp'] = session_timestamps

# Save the DataFrame as a CSV file
df.to_csv('/content/drive/MyDrive/session_behaviour_features.csv', index=False)

In [None]:
# Rate and frequency features:
# One row per packet holding the running counts of login attempts / successful logins
# seen so far, each divided by the total number of packets in the capture.

import numpy as np
import pandas as pd
from datetime import datetime  # imported here so the cell does not depend on an earlier cell

login_attempts = 0
successful_logins = 0

# Initialize the frequency_features list
frequency_features = []

# Define a pattern to identify authentication-related events
auth_attempt_pattern = "authentication_attempt"
successful_login_pattern = "successful_login"

# The denominator is the same for every packet, so compute it once
total_packets = len(packets)

# Iterate through the packets and extract the features
for packet in packets:
    # Decode the payload bytes once and search the text.
    # NOTE(review): str() on a scapy layer is not the decoded payload - confirm.
    payload_text = bytes(packet.payload).decode('utf-8', errors='ignore')

    # A packet counts as an attempt or, failing that, as a successful login - never both
    if auth_attempt_pattern in payload_text:
        login_attempts += 1
    elif successful_login_pattern in payload_text:
        successful_logins += 1

    # Extract timestamp from the packet
    packet_timestamp = datetime.utcfromtimestamp(float(packet.time)).strftime('%Y-%m-%d %H:%M:%S')

    # Calculate login attempt rate and successful login rate
    login_attempt_rate = login_attempts / total_packets
    successful_login_rate = successful_logins / total_packets

    # Append frequency features and timestamp to the list
    frequency_features.append([packet_timestamp, login_attempt_rate, successful_login_rate])

# Define the column names for the DataFrame
column_names = ["timestamp", "login_attempt_rate", "successful_login_rate"]

# Save the features:

df = pd.DataFrame(frequency_features, columns=column_names)

# Save the DataFrame as a CSV file
df.to_csv('/content/drive/MyDrive/frequency_features.csv', index=False)


In [None]:
# Network anomaly feature:
# Label every packet by comparing its length with a fixed threshold.

import numpy as np
import pandas as pd
from datetime import datetime
from scapy.all import Ether, IP

packet_length_threshold = 1500  # Example threshold, modify as needed
network_anomalies_features = []

for packet in packets:
    # Length of the packet and its capture time formatted as a UTC string
    packet_length = len(packet)
    captured_at = datetime.utcfromtimestamp(float(packet.time))
    packet_timestamp = captured_at.strftime('%Y-%m-%d %H:%M:%S')

    # Packets longer than the threshold are flagged, everything else is normal
    anomaly_status = 'anomaly_detected' if packet_length > packet_length_threshold else 'normal'

    # One [timestamp, length, status] row per packet
    network_anomalies_features.append([packet_timestamp, packet_length, anomaly_status])

# Save the features:
# Build the DataFrame from the collected rows and write it out as CSV
anomaly_columns = ['timestamp', 'packet_length', 'status']
df = pd.DataFrame(network_anomalies_features, columns=anomaly_columns)
df.to_csv('/content/drive/MyDrive/network_anomalies_features.csv', index=False)


In [None]:
# Network flow features:
# Aggregate IP packets into flows keyed by (src_ip, dst_ip, src_port, dst_port) and
# report per-flow packet count, byte total, direction and first/last timestamp.

import numpy as np
import pandas as pd
from scapy.all import Ether, IP, TCP, UDP
from datetime import datetime

# Per-flow statistics, each keyed by the flow identifier tuple
flow_packet_counts = {}
flow_total_bytes = {}
flow_first_timestamp = {}
flow_last_timestamp = {}

for packet in packets:
    timestamp = datetime.utcfromtimestamp(float(packet.time)).strftime('%Y-%m-%d %H:%M:%S')

    # Ethernet addresses when the layer is present, otherwise None
    eth_src, eth_dst = (packet[Ether].src, packet[Ether].dst) if Ether in packet else (None, None)

    # Only IP packets belong to a flow
    if IP not in packet:
        continue

    src_ip, dst_ip = packet[IP].src, packet[IP].dst

    # Ports come from TCP or UDP; other IP traffic has none
    if TCP in packet:
        src_port, dst_port = packet[TCP].sport, packet[TCP].dport
    elif UDP in packet:
        src_port, dst_port = packet[UDP].sport, packet[UDP].dport
    else:
        src_port, dst_port = None, None

    flow_identifier = (src_ip, dst_ip, src_port, dst_port)

    # The first packet of a flow fixes its first timestamp; every packet updates the rest
    flow_first_timestamp.setdefault(flow_identifier, timestamp)
    flow_packet_counts[flow_identifier] = flow_packet_counts.get(flow_identifier, 0) + 1
    flow_total_bytes[flow_identifier] = flow_total_bytes.get(flow_identifier, 0) + len(packet)
    flow_last_timestamp[flow_identifier] = timestamp

# Extracted network flow features
network_flow_features = []

for flow_identifier, packet_count in flow_packet_counts.items():
    src_ip, dst_ip, src_port, dst_port = flow_identifier

    # Direction is decided by a plain string-prefix test, source first.
    # NOTE(review): the prefix '192.168.1' also matches e.g. 192.168.10.x and 192.168.100.x -
    # confirm that this is the intended definition of the local network.
    traffic_direction = (
        'Outbound' if src_ip.startswith('192.168.1')
        else 'Inbound' if dst_ip.startswith('192.168.1')
        else 'External'
    )

    network_flow_features.append([
        src_ip,
        dst_ip,
        src_port,
        dst_port,
        packet_count,
        flow_total_bytes[flow_identifier],
        traffic_direction,
        flow_first_timestamp[flow_identifier],
        flow_last_timestamp[flow_identifier],
    ])

# Column headers, in the same order as the row values above
headers = ['src_ip', 'dst_ip', 'src_port', 'dst_port', 'packet_count', 'total_bytes', 'traffic_direction', 'first_timestamp', 'last_timestamp']

# Save the features:
df = pd.DataFrame(network_flow_features, columns=headers)
df.to_csv('/content/drive/MyDrive/network_flow_features.csv', index=False)



In [None]:
# Protocol-specific features:
# One record per packet with addressing information plus, for TCP traffic on port 80 or 22
# that carries a Raw payload, HTTP header / status-code or SSH payload details.

import numpy as np
import pandas as pd
from scapy.all import Ether, IP, TCP, Raw
from datetime import datetime
import re

# "HTTP/1.0 <code>" or "HTTP/1.1 <code>"; the dot is escaped so it only matches a literal '.'
HTTP_STATUS_RE = re.compile(r'HTTP/1\.[01] (\d{3})')

protocol_specific_features = []

for packet in packets:
    # Define a dictionary to store protocol-specific features for this packet
    packet_features = {}

    # Extract the timestamp of the packet
    packet_features['timestamp'] = datetime.utcfromtimestamp(float(packet.time)).strftime('%Y-%m-%d %H:%M:%S')

    # Extract Ethernet frame information
    if Ether in packet:
        packet_features['eth_src'] = packet[Ether].src
        packet_features['eth_dst'] = packet[Ether].dst

    # Extract IP packet information
    if IP in packet:
        packet_features['src_ip'] = packet[IP].src
        packet_features['dst_ip'] = packet[IP].dst
        if TCP in packet:
            src_port = packet[TCP].sport
            dst_port = packet[TCP].dport
            packet_features['src_port'] = src_port
            packet_features['dst_port'] = dst_port

            # Check if the packet is HTTP (port 80)
            if 80 in (src_port, dst_port) and Raw in packet:
                # Decode the payload bytes to text. str() on a bytes object yields its
                # "b'...'" repr with CR/LF escaped, so the '\r\n\r\n' split below could
                # never separate headers from body.
                http_data = packet[Raw].load.decode('utf-8', errors='ignore')

                # Extract HTTP headers (assuming headers are in the first part of the payload)
                http_headers = http_data.split('\r\n\r\n')[0]
                packet_features['http_headers'] = http_headers

                # Extract HTTP status code (if available)
                status_code_match = HTTP_STATUS_RE.search(http_headers)
                if status_code_match:
                    packet_features['http_status_code'] = int(status_code_match.group(1))

            # Check if the packet is SSH (port 22)
            if 22 in (src_port, dst_port) and Raw in packet:
                # Stored as the textual repr of the payload bytes ("b'...'")
                ssh_payload = str(packet[Raw].load)
                packet_features['ssh_protocol_details'] = ssh_payload

    # Append the dictionary of features to the list
    protocol_specific_features.append(packet_features)

# Save the features:
df = pd.DataFrame(protocol_specific_features)

# Save the DataFrame as a CSV file
df.to_csv('/content/drive/MyDrive/protocol_specific_features.csv', index=False)


In [None]:
# Payload characteristics features
# One record per packet with addressing information plus, for TCP packets that carry a Raw
# payload, the payload size and its printable text.

import numpy as np
import pandas as pd
from scapy.all import Ether, IP, TCP, Raw
from datetime import datetime
import string

# Characters kept when cleaning payload text; built once instead of once per packet
PRINTABLE_CHARS = frozenset(string.printable)

payload_characteristics_features = []

for packet in packets:
    # Define a dictionary to store payload characteristics features for this packet
    packet_features = {}

    # Extract the timestamp of the packet
    packet_features['timestamp'] = datetime.utcfromtimestamp(float(packet.time)).strftime('%Y-%m-%d %H:%M:%S')

    # Extract Ethernet frame information
    if Ether in packet:
        packet_features['eth_src'] = packet[Ether].src
        packet_features['eth_dst'] = packet[Ether].dst

    # Extract IP packet information
    if IP in packet:
        packet_features['src_ip'] = packet[IP].src
        packet_features['dst_ip'] = packet[IP].dst
        if TCP in packet:
            packet_features['src_port'] = packet[TCP].sport
            packet_features['dst_port'] = packet[TCP].dport

            # Check if the packet contains payload (Raw layer)
            if Raw in packet:
                raw_payload = packet[Raw].load
                packet_features['payload_size'] = len(raw_payload)

                # Clean the payload contents: decode as UTF-8 (undecodable bytes are
                # dropped) and remove every character that is not in string.printable
                decoded_payload = raw_payload.decode('utf-8', errors='ignore')
                cleaned_payload = ''.join(ch for ch in decoded_payload if ch in PRINTABLE_CHARS)
                packet_features['payload_contents'] = cleaned_payload

    # Append the dictionary of features to the list
    payload_characteristics_features.append(packet_features)

# Save the features:
df = pd.DataFrame(payload_characteristics_features)

# Save the DataFrame as a CSV file
df.to_csv('/content/drive/MyDrive/payload_characteristics_features.csv', index=False)

