<a href="https://colab.research.google.com/github/alirezashirmarz/XR-AR_NTC/blob/main/3_ReadPCAP_ExtractFeatures.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Title**: Extract IPI, FS, and IFI Features from a PCAP(NG) File

> Indented block
    ## Pay attention to configuration in each cell

In [None]:
# Install Scapy package

!pip install scapy

# 1- Find the files in the root directory (e.g. pcap or pcapng files)

In [None]:
# Set the root directory that contains the PCAP(NG) files

root_directory = r'/home/alireza/myframes/Uplink'

In [None]:
# Find all pcap files in  a directory

import os
import glob

def find_files_with_extension(root_dir, extension):
    """Recursively collect the paths of files whose names end with ``extension``.

    Args:
        root_dir: Directory whose tree is traversed with ``os.walk``.
        extension: Suffix tested with ``str.endswith`` (e.g. ``'.pcap'``).

    Returns:
        A list of paths, each the containing directory joined with the file
        name, in the order ``os.walk`` yields them.
    """
    return [
        os.path.join(folder, name)
        for folder, _subdirs, names in os.walk(root_dir)
        for name in names
        if name.endswith(extension)
    ]

# Example usage: collect every capture file below root_directory and list it
extension = '.pcap'  # Possible values --> '.pcap' or  '.pcapng'
files_with_extension = find_files_with_extension(root_directory, extension)

print("Files found:")
for file_path in files_with_extension:
    print(file_path)

Files found:
/home/alireza/myframes/5-stream2-3840-1920-72-60/Stream2-3840-72-6.pcap
/home/alireza/myframes/5-stream2-3840-1920-72-60/Stream2-3840-72-5.pcap
/home/alireza/myframes/5-stream2-3840-1920-72-60/Stream2-3840-72-7.pcap
/home/alireza/myframes/5-stream2-3840-1920-72-60/Stream2-3840-72-4.pcap
/home/alireza/myframes/5-stream2-3840-1920-72-60/Stream2-3840-72-10.pcap
/home/alireza/myframes/5-stream2-3840-1920-72-60/Stream2-3840-72-8.pcap
/home/alireza/myframes/5-stream2-3840-1920-72-60/Stream2-3840-72-3.pcap
/home/alireza/myframes/5-stream2-3840-1920-72-60/Stream2-3840-72-1.pcap
/home/alireza/myframes/5-stream2-3840-1920-72-60/Stream2-3840-72-2.pcap
/home/alireza/myframes/5-stream2-3840-1920-72-60/Stream2-3840-72-9.pcap


# 2- Extract the features (IPI, IFI, FS, & PS) of each flow from the PCAP(NG) files

In [None]:
import csv
import scapy.all as scapy
from collections import defaultdict

def _mean(values):
    """Return the arithmetic mean of ``values``, or 0 when the sequence is empty."""
    return sum(values) / len(values) if values else 0


def process_pcap(pcap_file, csv_file):
    """Summarise every UDP/QUIC flow of a capture file as one CSV row.

    Packets are grouped by (source IP, destination IP, source port,
    destination port, IP layer name, protocol). For each flow the row holds
    the average packet size (PS), the average gap between consecutive packet
    timestamps (IPI), the total bytes and packets, and the frame statistics
    returned by ``analyze_frames`` (FS, FS(PKT), NumFrames, IFI).

    Args:
        pcap_file: Path of the capture file read with ``scapy.rdpcap``.
        csv_file: Path of the CSV file to write (overwritten if it exists).
    """
    capture = scapy.rdpcap(pcap_file)
    flows = defaultdict(list)

    # Organise packets into UDP/QUIC flows
    for packet in capture:
        # The flow key needs the UDP ports, so a packet without a UDP layer
        # cannot be keyed even if its bytes happen to contain "quic".
        if not packet.haslayer(scapy.UDP):
            continue
        if packet.haslayer(scapy.IP):
            ip_layer = packet[scapy.IP]
        elif packet.haslayer(scapy.IPv6):
            ip_layer = packet[scapy.IPv6]
        else:
            # No IP header of either version: no addresses to key the flow on.
            continue

        udp_layer = packet[scapy.UDP]
        # Heuristic for QUIC: the text "quic" appears in the packet's string form
        protocol = 'QUIC' if "quic" in str(packet).lower() else 'UDP'
        flow_key = (
            ip_layer.src, ip_layer.dst,
            udp_layer.sport, udp_layer.dport, ip_layer.name, protocol
        )
        flows[flow_key].append(packet)

    with open(csv_file, 'w', newline='') as file:
        writer = csv.writer(file)
        writer.writerow([
            'ID', 'SrcIP', 'DstIP', 'SrcPort', 'DstPort',
            'IPVersion', 'Protocol', 'PS', 'IPI', 'FlowSizeBytes',
            'FlowSizePackets', 'FS', 'FS(PKT)',
            'NumFrames', 'IFI'
        ])

        for flow_id, (flow_key, flow_packets) in enumerate(flows.items(), start=1):
            src_ip, dst_ip, src_port, dst_port, ip_version, protocol = flow_key

            timestamps = [pkt.time for pkt in flow_packets]
            # Gap between each packet and the one that follows it
            ip_intervals = [later - earlier for earlier, later in zip(timestamps[:-1], timestamps[1:])]
            packet_sizes = [len(pkt) for pkt in flow_packets]
            total_flow_bytes = sum(packet_sizes)
            total_flow_packets = len(flow_packets)

            # Frame analysis (specific to UDP, might not be accurate for QUIC)
            frame_sizes_bytes, frame_sizes_packets, frame_intervals = analyze_frames(flow_packets)

            writer.writerow([
                flow_id, src_ip, dst_ip, src_port, dst_port, ip_version, protocol,
                _mean(packet_sizes), _mean(ip_intervals), total_flow_bytes, total_flow_packets,
                _mean(frame_sizes_bytes), _mean(frame_sizes_packets),
                len(frame_sizes_bytes), _mean(frame_intervals)
            ])

def analyze_frames(packets):
    """Split a packet sequence into frames and measure them.

    Only packets with a UDP layer are considered. A new frame starts whenever
    the UDP length field of a packet differs from that of the previous UDP
    packet.

    Args:
        packets: Iterable of scapy packets belonging to one flow.

    Returns:
        A tuple ``(frame_sizes_bytes, frame_sizes_packets, frame_intervals)``:
        the byte size of each frame, the packet count of each frame, and the
        time between the start of each closed frame and the start of the
        frame that follows it. The trailing frame contributes sizes but no
        interval.
    """
    sizes_in_bytes, sizes_in_packets, start_gaps = [], [], []
    byte_total = 0
    packet_total = 0
    frame_start = None
    previous_length = None

    for pkt in packets:
        if not pkt.haslayer(scapy.UDP):
            continue

        length = pkt[scapy.UDP].len
        if frame_start is None:
            frame_start = pkt.time

        length_changed = previous_length is not None and length != previous_length
        if length_changed:
            # Close the running frame and open a new one at this packet
            sizes_in_bytes.append(byte_total)
            sizes_in_packets.append(packet_total)
            start_gaps.append(pkt.time - frame_start)
            byte_total, packet_total = 0, 0
            frame_start = pkt.time

        byte_total += len(pkt)
        packet_total += 1
        previous_length = length

    # Flush the frame that was still open when the packets ran out
    if byte_total > 0:
        sizes_in_bytes.append(byte_total)
        sizes_in_packets.append(packet_total)

    return sizes_in_bytes, sizes_in_packets, start_gaps




# Example usage

def main(pcap_file, csv_file):
    """Extract the flow features of one capture file into a CSV file.

    Args:
        pcap_file: Path of the capture file passed to ``process_pcap``.
        csv_file: Path of the CSV file to write. ``'.csv'`` is appended only
            when the path does not already end with it, so passing
            ``name.pcap.csv`` writes exactly that file instead of
            ``name.pcap.csv.csv``.
    """
    if not csv_file.lower().endswith('.csv'):
        csv_file += '.csv'
    process_pcap(pcap_file, csv_file)
i = 0

if __name__ == "__main__":
    # Convert every discovered capture; the CSV path passed on is the capture
    # path with '.csv' appended, and i counts the captures handled so far.
    for i, addr in enumerate(files_with_extension, start=1):
        csv_path = addr + '.csv'
        main(addr, csv_path)
        print(f'The dataset {i}: {csv_path} was stored!')



The dataset 1: /home/alireza/myframes/5-stream2-3840-1920-72-60/Stream2-3840-72-6.pcap.csv was stored!
The dataset 2: /home/alireza/myframes/5-stream2-3840-1920-72-60/Stream2-3840-72-5.pcap.csv was stored!
The dataset 3: /home/alireza/myframes/5-stream2-3840-1920-72-60/Stream2-3840-72-7.pcap.csv was stored!
The dataset 4: /home/alireza/myframes/5-stream2-3840-1920-72-60/Stream2-3840-72-4.pcap.csv was stored!
The dataset 5: /home/alireza/myframes/5-stream2-3840-1920-72-60/Stream2-3840-72-10.pcap.csv was stored!
The dataset 6: /home/alireza/myframes/5-stream2-3840-1920-72-60/Stream2-3840-72-8.pcap.csv was stored!
The dataset 7: /home/alireza/myframes/5-stream2-3840-1920-72-60/Stream2-3840-72-3.pcap.csv was stored!
The dataset 8: /home/alireza/myframes/5-stream2-3840-1920-72-60/Stream2-3840-72-1.pcap.csv was stored!
The dataset 9: /home/alireza/myframes/5-stream2-3840-1920-72-60/Stream2-3840-72-2.pcap.csv was stored!
The dataset 10: /home/alireza/myframes/5-stream2-3840-1920-72-60/Stream2

# 3- Read the CSV and Merge Them

In [None]:
# Read and merge the per-capture CSV files into one dataset
#
# Configuration
#   Input:
#     1- root_directory --> the directory that holds the CSV files (here the
#        root directory set in an earlier cell is reused)
#     2- ds_name --> the name of the merged CSV file (without extension)
#   Output:
#     1- the merged table is saved as <root_directory>/<ds_name>.csv

import os

import pandas as pd

ds_name = 'DS'
print(root_directory)

merged_csv_path = root_directory + '/' + ds_name + '.csv'

# Files produced by this notebook itself: the merged dataset and the two
# pre-processing outputs named <ds_name>2.csv and <ds_name>3.csv. They must not
# be fed back into the merge, otherwise re-running this cell duplicates rows
# and adds the extra 'Direction' column to the merged table.
generated_paths = {
    os.path.abspath(root_directory + '/' + ds_name + suffix + '.csv')
    for suffix in ('', '2', '3')
}
csvfiles = [
    path for path in find_files_with_extension(root_directory, '.csv')
    if os.path.abspath(path) not in generated_paths
]
if not csvfiles:
    # pd.concat fails with an opaque message on an empty list; fail clearly instead.
    raise FileNotFoundError(f"No per-capture CSV files found under {root_directory}")

dff = []
for mycsv in csvfiles:
    mydf = pd.read_csv(mycsv)
    dff.append(mydf)
df = pd.concat(dff, ignore_index=True, sort=False)
df.to_csv(merged_csv_path, index=False)
print(f"The df with {df.shape[0]} rows and {df.shape[1]} columns was stored as csv file!")

/home/alireza/myframes/Uplink
The df with 30 rows and 16 columns was stored as csv file!


# 4- Pre-Processing
## Add a direction for each flow & remove the NaN values

In [None]:
# Set the addresses of the files used by the pre-processing steps

import os

mycsvfile = root_directory + '/DS.csv'
# splitext strips only the trailing '.csv'. Splitting on the first '.' would
# truncate the path whenever a directory name contains a dot (e.g. './data').
Directionfiles = os.path.splitext(mycsvfile)[0] + '2' + '.csv'
Removedfile = os.path.splitext(mycsvfile)[0] + '3' + '.csv'

In [None]:
# Add the direction to CSV

In [None]:
import csv

def add_direction_feature(input_file, output_file):
    """Copy a flow CSV to ``output_file`` with a ``Direction`` column filled in.

    Rows are visited in file order. A row whose (SrcIP, SrcPort) pair was
    already recorded as the destination of an earlier 'Uplink' row is labelled
    'Downlink'. Any other row is labelled 'Uplink' and its (DstIP, DstPort)
    pair is recorded.

    Args:
        input_file: Path of the CSV to read; it must provide the columns
            SrcIP, SrcPort, DstIP and DstPort.
        output_file: Path of the CSV to write (overwritten if it exists).

    Raises:
        KeyError: If a row lacks one of the four address columns.
    """
    with open(input_file, mode='r', newline='') as infile:
        reader = csv.DictReader(infile)
        rows = list(reader)
        # Read the header while the file is still open: for an empty file the
        # reader would otherwise try to read from the closed handle.
        fieldnames = list(reader.fieldnames or [])

    # An input that already carries the column must not get a duplicate header.
    if 'Direction' not in fieldnames:
        fieldnames.append('Direction')

    # Keep track of seen destination IPs and ports
    seen_destinations = set()

    for row in rows:
        src_tuple = (row['SrcIP'], row['SrcPort'])
        dst_tuple = (row['DstIP'], row['DstPort'])

        # Determine direction
        if src_tuple in seen_destinations:
            row['Direction'] = 'Downlink'
        else:
            row['Direction'] = 'Uplink'
            seen_destinations.add(dst_tuple)

    # Write to new CSV file
    with open(output_file, mode='w', newline='') as outfile:
        writer = csv.DictWriter(outfile, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(rows)

# Example usage: read mycsvfile and write the copy with the Direction column to Directionfiles
add_direction_feature(mycsvfile,Directionfiles)


In [None]:
# Remove the flows

In [None]:
import csv

def _to_float(text):
    """Return ``text`` parsed as a float, or ``None`` if it is missing, unparsable or NaN."""
    try:
        value = float(text)
    except (TypeError, ValueError):
        return None
    # NaN is the only float that is not equal to itself
    return value if value == value else None


def filter_csv(input_file, output_file):
    """Copy the rows of a flow CSV that have usable frame statistics.

    A row is kept when NumFrames is at least 1 and IFI is a non-zero number.
    Rows whose NumFrames or IFI cell is empty, unparsable or NaN are dropped
    instead of aborting the run. NumFrames is parsed as a float so values
    such as ``'3.0'`` are accepted.

    Args:
        input_file: Path of the CSV to read; it must provide the columns
            NumFrames and IFI.
        output_file: Path of the CSV to write (overwritten if it exists).

    Raises:
        KeyError: If the input has no NumFrames or IFI column.
    """
    with open(input_file, mode='r', newline='') as infile, open(output_file, mode='w', newline='') as outfile:
        reader = csv.DictReader(infile)
        writer = csv.DictWriter(outfile, fieldnames=reader.fieldnames)

        writer.writeheader()

        for row in reader:
            num_frames = _to_float(row['NumFrames'])
            ifi = _to_float(row['IFI'])

            # Drop rows with missing/NaN values
            if num_frames is None or ifi is None:
                continue

            # Keep the row if NumFrames is at least 1 and IFI is not 0
            if num_frames >= 1 and ifi != 0:
                writer.writerow(row)

# Example usage: read Directionfiles and write the rows that pass the filter to Removedfile
filter_csv(Directionfiles, Removedfile)
