# Description
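Process ET-BERT data for fine-tuning.
label.pcap (tsv format) -> train/validation/test.tsv
Note: the code below writes Parquet files (`.parquet`) rather than `.tsv`: `test.parquet` for the `test` split and one `<type>.parquet` per subfolder of every other split.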

In [None]:
import os
import logging
import scapy.all as scapy
import random
import binascii
import csv
import json
import numpy as np
import pandas as pd
from multiprocessing import Pool, cpu_count

In [None]:
# Name of the dataset to convert. It is used as the input folder name, as the
# last component of both output folders, and (further down) as the stem of the
# label-mapping JSON file.
dataset = 'vpn-app'

# The input folder is simply the dataset name, relative to the working directory.
dataset_path = dataset

# Output roots: the plain one and the one holding the anonymised ("without IP") data.
output_path = f'code/PCAP_encoder/1.Datasets/Classification/{dataset}'
output_withoutIP_path = f'code/PCAP_encoder/1.Datasets/Classification/without_IP/{dataset}'

# Make sure both output folders exist before anything is written into them.
for _out_dir in (output_path, output_withoutIP_path):
    os.makedirs(_out_dir, exist_ok=True)

In [None]:
def clean_packet(packet):
    """Strip the Ethernet header and blank out addresses and ports.

    The steps, in order:

    1. If the packet has an ``Ether`` layer, continue with that layer's payload.
    2. If it has an ``IP`` layer, set ``src`` and ``dst`` to ``"0.0.0.0"``;
       otherwise, if it has an ``IPv6`` layer, set them to ``"::"``.
    3. If it has a ``UDP`` layer, set ``sport`` and ``dport`` to ``0``;
       otherwise, if it has a ``TCP`` layer, do the same there.

    The fields are overwritten in place on the layers of the packet that was
    passed in; the (possibly Ethernet-stripped) packet is returned.
    """
    # Drop the link layer so only the network layer and above remain.
    if packet.haslayer(scapy.Ether):
        packet = packet[scapy.Ether].payload

    # Network layer: IPv4 is checked first, IPv6 only when there is no IPv4.
    for net_layer, blank_addr in ((scapy.IP, "0.0.0.0"), ('IPv6', "::")):
        if packet.haslayer(net_layer):
            header = packet[net_layer]
            header.src = blank_addr
            header.dst = blank_addr
            break

    # Transport layer: UDP is checked first, TCP only when there is no UDP.
    for transport_layer in (scapy.UDP, scapy.TCP):
        if packet.haslayer(transport_layer):
            header = packet[transport_layer]
            header.sport = 0
            header.dport = 0
            break

    return packet

In [None]:
# Mapping read from "<dataset>.json" in the working directory. It is indexed
# below by a pcap file name without its ".pcap" suffix, and the value found is
# stored in the "class" column of the output tables.
mapping_json = f'./{dataset}.json'

# Open the file in a `with` block so the handle is closed as soon as the JSON
# has been parsed, instead of being left open for the rest of the session.
with open(mapping_json, 'r') as mapping_file:
    class_indexs = json.load(mapping_file)

def group_string_by_n(pkt, n=4):
    """Return the bytes of *pkt* as lowercase hex, split into space-separated groups.

    *pkt* is converted with ``bytes(pkt)``; the resulting hex string is cut
    into consecutive pieces of *n* characters (the last piece may be shorter)
    and the pieces are joined with single spaces.
    """
    hex_digits = bytes(pkt).hex()
    pieces = []
    for start in range(0, len(hex_digits), n):
        pieces.append(hex_digits[start:start + n])
    return ' '.join(pieces)

# Column names of every output table, and the fixed question stored in each row.
_COLUMNS = ['question', 'class', 'type_q', 'context']
_QUESTION = 'What is the representation of this packet?'
_PCAP_SUFFIX = '.pcap'


def _pcap_dir_to_dataframe(pcap_dir):
    """Build one table row per packet for every ``.pcap`` file directly in *pcap_dir*.

    Each row holds the fixed question, the ``class_indexs`` entry for the file
    name without its ``.pcap`` suffix, that file-name stem itself, and the
    grouped hex dump of the cleaned packet. Files without the ``.pcap`` suffix
    are ignored.

    Raises:
        KeyError: if a file-name stem is missing from ``class_indexs``.
    """
    rows = []
    # os.listdir() returns entries in arbitrary order; sorting keeps the row
    # order of the output reproducible from run to run.
    for pcap_name in sorted(os.listdir(pcap_dir)):
        if not pcap_name.endswith(_PCAP_SUFFIX):
            continue
        print(f"Processing file: {pcap_name}")
        label = pcap_name[:-len(_PCAP_SUFFIX)]
        # Looked up once per file, so an unmapped label fails before the
        # capture is parsed rather than on its first packet.
        class_id = class_indexs[label]
        with scapy.PcapReader(f"{pcap_dir}/{pcap_name}") as pkt_reader:
            for pkt in pkt_reader:
                context = group_string_by_n(clean_packet(pkt))
                rows.append([_QUESTION, class_id, label, context])
    return pd.DataFrame(rows, columns=_COLUMNS)


# Convert every split folder found under dataset_path:
#   * "test"  -> <output_withoutIP_path>/test.parquet, built from the pcaps
#                lying directly in the split folder;
#   * others  -> <output_withoutIP_path>/<split>/<type>.parquet, one file per
#                subfolder of the split.
for split_folder in sorted(os.listdir(dataset_path)):
    split_dir = f"{dataset_path}/{split_folder}"
    # Stray files next to the split folders would make os.listdir() below
    # raise NotADirectoryError, so they are skipped.
    if not os.path.isdir(split_dir):
        continue
    print(f"Processing file: {split_folder}")

    if split_folder == 'test':
        output_dataframe = _pcap_dir_to_dataframe(split_dir)
        os.makedirs(f"{output_withoutIP_path}", exist_ok=True)
        output_dataframe.to_parquet(f"{output_withoutIP_path}/{split_folder}.parquet", index=False)
    else:
        # `type_folder` rather than `type`, to avoid shadowing the builtin.
        for type_folder in sorted(os.listdir(split_dir)):
            type_dir = f"{split_dir}/{type_folder}"
            if not os.path.isdir(type_dir):
                continue
            output_dataframe = _pcap_dir_to_dataframe(type_dir)
            os.makedirs(f"{output_withoutIP_path}/{split_folder}", exist_ok=True)
            output_dataframe.to_parquet(f"{output_withoutIP_path}/{split_folder}/{type_folder}.parquet", index=False)
