<a href="https://colab.research.google.com/github/alirezashirmarz/XR-AR_NTC/blob/main/3_ReadPCAP_ExtractFeatures.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Title:** ***Extract Features (IPI, FS, IFI) from the PCAP File***
## Pay attention to the configuration in each cell

In [None]:
!pip install scapy

# 1- **Find the files in the root directory, e.g. pcap or pcapng files**

In [None]:
# Root directory that is searched for capture files.
# Replace the placeholder string below with the path to your own directory.
root_directory = r'Directory includes PCAP or PCAPnj Files'

In [None]:
# Find all pcap files in  a directory
import os
import glob

def find_files_with_extension(root_dir, extension):
    """Collect every file under ``root_dir`` whose name ends with ``extension``.

    The directory tree is walked recursively with ``os.walk``.

    Args:
        root_dir: Directory where the search starts.
        extension: Suffix the file name must end with, e.g. ``'.pcapng'``.
            The comparison is case-sensitive.

    Returns:
        A list with the path (``root_dir`` joined with the relative location)
        of every matching file, in the order ``os.walk`` yields them.
    """
    return [
        os.path.join(folder, name)
        for folder, _subdirs, names in os.walk(root_dir)
        for name in names
        if name.endswith(extension)
    ]

# Set the extension to search for under the root directory
extension = '.pcapng'  # '.pcap' or '.pcapng'
files_with_extension = find_files_with_extension(root_directory, extension)

# Print the paths of the files that were found
print("Files found:")
for file_path in files_with_extension:
    print(file_path)

# 2- **Extract the Features (IPI, IFI, FS, & PS) of each flow from PCAP(NG)**

In [None]:
import csv
import pandas as pd
import scapy.all as scapy
from collections import defaultdict

def process_pcap(pcap_file, csv_file):
    """Extract per-flow features from a PCAP/PCAPNG capture into a CSV file.

    Packets are grouped into flows keyed by (source IP, destination IP,
    source port, destination port, IP layer name, transport protocol), so
    the two directions of a conversation end up in separate flows.  One CSV
    row is written per flow.

    Args:
        pcap_file: Path of the capture file to read.
        csv_file: Path of the CSV file to create (overwritten if it exists).

    Output columns:
        ID, SrcIP, DstIP, SrcPort, DstPort, IPVersion, Protocol: flow identity.
        PS: mean ``len(packet)`` over the flow.
        IPI: mean difference between consecutive ``packet.time`` values.
        FlowSizeBytes, FlowSizePackets: totals over the flow.
        FS, FS(PKT): mean frame size in bytes and in packets.
        NumFrames: number of frames reported by ``analyze_frames``.
        IFI: mean of the frame intervals reported by ``analyze_frames``.
    """
    flows = defaultdict(list)

    # Organize the packets into TCP/UDP flows.  QUIC is carried in UDP
    # datagrams, so it is reported as UDP; a string heuristic that only runs
    # for packets without a UDP layer cannot provide the ports a flow key
    # needs.
    for packet in scapy.rdpcap(pcap_file):
        if packet.haslayer(scapy.TCP):
            protocol = 'TCP'
            transport_layer = packet[scapy.TCP]
        elif packet.haslayer(scapy.UDP):
            protocol = 'UDP'
            transport_layer = packet[scapy.UDP]
        else:
            continue

        if packet.haslayer(scapy.IP):
            ip_layer = packet[scapy.IP]
        elif packet.haslayer(scapy.IPv6):
            ip_layer = packet[scapy.IPv6]
        else:
            # No IP header to take the addresses from: not part of any flow.
            continue

        flow_key = (
            ip_layer.src, ip_layer.dst,
            transport_layer.sport, transport_layer.dport, ip_layer.name, protocol
        )
        flows[flow_key].append(packet)

    with open(csv_file, 'w', newline='') as file:
        writer = csv.writer(file)
        writer.writerow([
            'ID', 'SrcIP', 'DstIP', 'SrcPort', 'DstPort',
            'IPVersion', 'Protocol', 'PS', 'IPI', 'FlowSizeBytes',
            'FlowSizePackets', 'FS', 'FS(PKT)',
            'NumFrames', 'IFI'
        ])

        for flow_id, (flow_key, flow_packets) in enumerate(flows.items(), start=1):
            src_ip, dst_ip, src_port, dst_port, ip_version, protocol = flow_key

            timestamps = [packet.time for packet in flow_packets]
            ip_intervals = [
                later - earlier
                for earlier, later in zip(timestamps, timestamps[1:])
            ]
            packet_sizes = [len(packet) for packet in flow_packets]
            total_flow_bytes = sum(packet_sizes)
            total_flow_packets = len(flow_packets)

            avg_packet_size = total_flow_bytes / total_flow_packets if total_flow_packets else 0
            # A single-packet flow has no interval; report 0 in that case.
            avg_ipi = sum(ip_intervals) / len(ip_intervals) if ip_intervals else 0

            # Frame analysis of the flow's packets
            frame_sizes_bytes, frame_sizes_packets, frame_intervals = analyze_frames(
                flow_packets, protocol
            )

            avg_frame_size_bytes = sum(frame_sizes_bytes) / len(frame_sizes_bytes) if frame_sizes_bytes else 0
            avg_frame_size_packets = sum(frame_sizes_packets) / len(frame_sizes_packets) if frame_sizes_packets else 0
            num_frames = len(frame_sizes_bytes)
            avg_ifi = sum(frame_intervals) / len(frame_intervals) if frame_intervals else 0

            writer.writerow([
                flow_id, src_ip, dst_ip, src_port, dst_port, ip_version, protocol,
                avg_packet_size, avg_ipi, total_flow_bytes, total_flow_packets,
                avg_frame_size_bytes, avg_frame_size_packets, num_frames, avg_ifi
            ])

def analyze_frames(packets, protocol):
    """Split the packets of one flow into frames and measure every frame.

    A frame is a run of consecutive packets that share the same length
    value; a packet whose length differs from the previous packet's closes
    the running frame and opens a new one.  The length value is the ``Raw``
    payload size when ``protocol`` is ``'TCP'`` (0 without a ``Raw`` layer)
    and the UDP header ``len`` field for any other protocol.

    Args:
        packets: Packets of a single flow, in the order they should be read.
        protocol: Protocol label of the flow (``'TCP'`` selects the TCP rule).

    Returns:
        A tuple ``(frame_sizes_bytes, frame_sizes_packets, frame_intervals)``:
        the summed ``len(packet)`` of every frame, the packet count of every
        frame, and, for every frame that was closed by a following frame, the
        time from its first packet to the first packet of that next frame.
        The last frame therefore contributes no interval.
    """
    frame_sizes_bytes = []
    frame_sizes_packets = []
    frame_intervals = []
    current_frame_size_bytes = 0
    current_frame_size_packets = 0
    current_frame_start_time = None
    last_length = None
    is_tcp = protocol == 'TCP'

    for packet in packets:
        if not (packet.haslayer(scapy.TCP) or packet.haslayer(scapy.UDP)):
            continue

        if is_tcp:
            # For TCP, use the payload length (0 when there is no payload)
            length = len(packet[scapy.Raw].load) if packet.haslayer(scapy.Raw) else 0
        else:
            # For UDP, use the length field of the UDP header
            length = packet[scapy.UDP].len

        if current_frame_start_time is None:
            current_frame_start_time = packet.time

        # A change of length marks the boundary between two frames.
        if last_length is not None and length != last_length:
            frame_sizes_bytes.append(current_frame_size_bytes)
            frame_sizes_packets.append(current_frame_size_packets)
            frame_intervals.append(packet.time - current_frame_start_time)
            current_frame_size_bytes = 0
            current_frame_size_packets = 0
            current_frame_start_time = packet.time

        current_frame_size_bytes += len(packet)
        current_frame_size_packets += 1
        last_length = length

    # Flush the frame that was still open when the packets ran out.
    if current_frame_size_bytes > 0:
        frame_sizes_bytes.append(current_frame_size_bytes)
        frame_sizes_packets.append(current_frame_size_packets)

    return frame_sizes_bytes, frame_sizes_packets, frame_intervals

# Main function to run the feature extraction and store the result in CSV format
def main(pcap_file, csv_file):
    """Extract the flow features of one capture file into a CSV file.

    Args:
        pcap_file: Path of the PCAP/PCAPNG file to read.
        csv_file: Path of the CSV file to write.  The '.csv' suffix is
            appended only when the name does not already end with it, so
            passing 'out' and passing 'out.csv' both produce 'out.csv'
            instead of 'out.csv.csv'.
    """
    if not csv_file.lower().endswith('.csv'):
        csv_file += '.csv'
    process_pcap(pcap_file, csv_file)

# Execute for all files found!
if __name__ == "__main__":
    # enumerate supplies the 1-based dataset counter shown in the message.
    for i, addr in enumerate(files_with_extension, start=1):
        csv_path = addr + '.csv'
        # main() puts the '.csv' suffix on the name it is given, so it gets
        # the capture path and the features end up in csv_path.
        main(addr, addr)
        print(f'The dataset {i}: {csv_path} was stored!')


# 3- **Read the CSV and Merge Them**

In [None]:
# Read and merge the per-capture CSV files into one dataset.
#
# Configuration
#   Input:
#     1- root_directory --> the directory that holds the CSV files (the root
#        directory configured earlier in this notebook)
#     2- ds_name --> the name of the merged CSV file (without extension)
#   Output:
#     1- <root_directory>/<ds_name>.csv with the rows of every CSV file found
import os

# The name of the merged dataset (default is DS)
ds_name = 'DS'
print(root_directory)

merged_path = root_directory + '/' + ds_name + '.csv'

# Files this notebook writes into the root directory itself: the merged
# dataset and the '2'/'3' variants the pre-processing section derives from
# it.  They must not be read back in, otherwise every re-run would merge the
# previous results into the new dataset.
generated_files = {
    os.path.abspath(root_directory + '/' + ds_name + suffix + '.csv')
    for suffix in ('', '2', '3')
}
csvfiles = [
    path
    for path in find_files_with_extension(root_directory, '.csv')
    if os.path.abspath(path) not in generated_files
]
if not csvfiles:
    # pd.concat raises an opaque error on an empty list; fail with the cause.
    raise FileNotFoundError(f"No CSV files to merge were found under {root_directory!r}")

dff = [pd.read_csv(path) for path in csvfiles]
df = pd.concat(dff, ignore_index=True, sort=False)
df.to_csv(merged_path, index=False)
print(f"The df with {df.shape[0]} rows and {df.shape[1]} columns was stored as csv file!")

# 4- **Pre-Processing**
## (4-1) Add a direction to each flow & (4-2) remove the NaN values

In [None]:
import os

# Merged dataset produced by the previous section
mycsvfile = root_directory + '/DS.csv'
# Strip only the trailing '.csv'.  Cutting at the first '.' of the whole path
# would truncate directories such as './data' or 'C:/Users/j.doe/captures'.
base_path = os.path.splitext(mycsvfile)[0]
# Output of step 4-1 (direction added) and of step 4-2 (rows filtered)
Directionfiles = base_path + '2' + '.csv'
Removedfile = base_path + '3' + '.csv'

### **4-1- add the direction to the CSV file**

In [None]:
import csv

def add_direction_feature(input_file, output_file):
    """Copy a flow CSV to ``output_file`` with an extra 'Direction' column.

    Rows are visited in file order.  A flow whose (SrcIP, SrcPort) pair was
    already recorded as the destination of an earlier 'Uplink' flow is
    labelled 'Downlink'; any other flow is labelled 'Uplink' and its
    (DstIP, DstPort) pair is recorded.

    Args:
        input_file: Path of the CSV file to read; it needs the columns
            SrcIP, SrcPort, DstIP and DstPort.
        output_file: Path of the CSV file to write (overwritten if present).
    """
    with open(input_file, mode='r', newline='') as source:
        reader = csv.DictReader(source)
        records = [record for record in reader]

    # Endpoints that some earlier uplink flow was sent to
    uplink_targets = set()

    for record in records:
        sender = (record['SrcIP'], record['SrcPort'])
        receiver = (record['DstIP'], record['DstPort'])

        if sender in uplink_targets:
            record['Direction'] = 'Downlink'
            continue

        record['Direction'] = 'Uplink'
        uplink_targets.add(receiver)

    # Store the labelled rows, with the new column placed last
    with open(output_file, mode='w', newline='') as target:
        columns = reader.fieldnames + ['Direction']
        writer = csv.DictWriter(target, fieldnames=columns)
        writer.writeheader()
        writer.writerows(records)

# Label every flow of the merged dataset with a direction and save the result as a new CSV
add_direction_feature(mycsvfile,Directionfiles)

### **4-2- Remove the flows without Frames**

In [None]:
import csv

def filter_csv(input_file, output_file):
    """Copy to ``output_file`` only the flows that have usable frame data.

    A row is kept when its 'NumFrames' value is at least 1 and its 'IFI'
    value is a non-zero number.  Rows whose 'NumFrames' or 'IFI' cell is
    blank, NaN or not numeric are dropped.

    Args:
        input_file: Path of the CSV file to read; it needs the columns
            NumFrames and IFI.
        output_file: Path of the CSV file to write (overwritten if present).
            It stays empty when the input has no header row.
    """
    import math

    with open(input_file, mode='r', newline='') as infile, \
            open(output_file, mode='w', newline='') as outfile:
        reader = csv.DictReader(infile)
        if not reader.fieldnames:
            # Empty input: there is no header to copy and no row to filter.
            return

        writer = csv.DictWriter(outfile, fieldnames=reader.fieldnames)
        writer.writeheader()

        for row in reader:
            try:
                # float() also accepts counts that were stored as '2.0'
                num_frames = float(row['NumFrames'])
                ifi = float(row['IFI'])
            except (TypeError, ValueError):
                # Blank or non-numeric cell: treat the flow as unusable.
                continue

            # NaN compares unequal to 0, so it has to be rejected explicitly.
            if math.isnan(num_frames) or math.isnan(ifi):
                continue

            # Keep the row if it has at least one frame and IFI is not 0
            if num_frames >= 1 and ifi != 0:
                writer.writerow(row)

# Filter the direction-labelled CSV and store the kept flows in a new CSV
filter_csv(Directionfiles, Removedfile)
