# Description
Process pcap data to build the inputs for the traditional baseline models.
label.pcap -> raw hex code

In [1]:
import os
import logging
import scapy.all as scapy
from multiprocessing import Pool, cpu_count
import csv
import pandas as pd

# Work from the baseline's directory so the relative log path below resolves there.
os.chdir('LLM4Traffic/code/Traditional')

# Records go both to a log file (truncated on every run) and to the console.
log_handlers = [
    logging.FileHandler('logs/raw.log', mode='w'),
    logging.StreamHandler(),
]
log_format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'

# force=True removes handlers already attached to the root logger, so
# re-running this cell does not duplicate every log line.
logging.basicConfig(
    level=logging.INFO,
    format=log_format,
    handlers=log_handlers,
    force=True,
)

# Root logger used by the cells below.
logger = logging.getLogger()

In [2]:
# Directory that is listed with os.listdir() to find the input pcap files.
# NOTE(review): both paths are relative and the first cell calls os.chdir();
# confirm they resolve to the intended locations from that working directory.
dataset_path = 'LLM4Traffic/polishedns/ustc-binary'
# Root directory for generated files; the CSVs are written below '{output_path}/raw/'.
output_path = 'LLM4Traffic/code/Traditional/datasets/polishedns/ustc-binary'

In [3]:
def clean_packet(packet):
    """Reduce a packet to anonymized network and transport headers.

    The Ethernet layer (if present) is dropped, IP/IPv6 source and
    destination addresses are overwritten with the all-zero address, and
    for UDP or TCP packets both ports are set to 0 and the transport
    payload is removed. The layers are modified in place.

    Args:
        packet: a scapy packet.

    Returns:
        The cleaned packet (starting at the layer above Ethernet when an
        Ethernet layer was present), or the integer 0 when the packet
        carries neither an IP nor an IPv6 layer.
    """
    # Drop the link layer; only what sits above Ethernet is kept.
    if packet.haslayer(scapy.Ether):
        packet = packet[scapy.Ether].payload

    # Anonymize the addresses; packets without a network layer are rejected.
    if packet.haslayer(scapy.IP):
        ip_layer = packet[scapy.IP]
        ip_layer.src = "0.0.0.0"
        ip_layer.dst = "0.0.0.0"
    elif packet.haslayer('IPv6'):
        ipv6_layer = packet['IPv6']
        ipv6_layer.src = "::"
        ipv6_layer.dst = "::"
    else:
        return 0

    # UDP is checked before TCP; only the first matching layer is touched.
    for transport in (scapy.UDP, scapy.TCP):
        if packet.haslayer(transport):
            segment = packet[transport]
            segment.sport = 0  # set the source port to 0
            segment.dport = 0  # set the destination port to 0
            segment.remove_payload()  # drop the transport payload
            break

    return packet

In [4]:
# Find the longest cleaned packet in the dataset, measured in hex characters.
# Expected layout: file/{test.pcap} or file/{train_val_split_0}/{train.pcap}
def _longest_hex_length(pcap_path):
    """Return the largest hex-string length of any cleaned packet in ``pcap_path``.

    Packets for which ``clean_packet`` returns 0 (no IP/IPv6 layer) are
    skipped. Returns 0 when no packet qualifies.
    """
    longest = 0
    # The context manager closes the capture file even if parsing fails midway.
    with scapy.PcapReader(pcap_path) as reader:
        for raw_packet in reader:
            # Only the IP header and the transport-layer header are kept.
            cleaned = clean_packet(raw_packet)
            if cleaned == 0:
                continue
            longest = max(longest, len(bytes(cleaned).hex()))
    return longest


max_payload_length = 0

for entry in os.listdir(dataset_path):
    logger.info(f'Processing {entry}')

    # Top-level entries whose name contains 'test' are handled here: the pcap
    # is scanned, anything else (e.g. a sibling non-pcap file) is skipped.
    if 'test' in entry:
        if 'pcap' in entry:
            max_payload_length = max(
                max_payload_length,
                _longest_hex_length(f'{dataset_path}/{entry}'),
            )
        continue

    # Every other entry is treated as a split directory holding the pcap files.
    for file_name in os.listdir(f'{dataset_path}/{entry}'):
        logger.info(f'Processing {entry} {file_name}')
        if 'pcap' not in file_name:
            continue

        max_payload_length = max(
            max_payload_length,
            _longest_hex_length(f'{dataset_path}/{entry}/{file_name}'),
        )

print(max_payload_length)


2024-12-15 21:37:45,221 - root - INFO - Processing test.pcap
2024-12-15 21:38:20,236 - root - INFO - Processing train_val_split_2
2024-12-15 21:38:20,884 - root - INFO - Processing train_val_split_2 train.parquet
2024-12-15 21:38:21,021 - root - INFO - Processing train_val_split_2 val.pcap
2024-12-15 21:38:29,900 - root - INFO - Processing train_val_split_2 train.pcap
2024-12-15 21:39:13,601 - root - INFO - Processing train_val_split_2 val.parquet
2024-12-15 21:39:14,031 - root - INFO - Processing train_val_split_0
2024-12-15 21:39:14,044 - root - INFO - Processing train_val_split_0 train.parquet
2024-12-15 21:39:14,092 - root - INFO - Processing train_val_split_0 val.pcap
2024-12-15 21:39:22,871 - root - INFO - Processing train_val_split_0 train.pcap
2024-12-15 21:40:06,478 - root - INFO - Processing train_val_split_0 val.parquet
2024-12-15 21:40:07,118 - root - INFO - Processing train_val_split_1
2024-12-15 21:40:07,335 - root - INFO - Processing train_val_split_1 train.parquet
2024-

160


In [4]:
# Find the longest cleaned packet in the dataset, measured in hex characters.
# Expected layout: file/test/{test.pcap} or file/train_val_split_0/{train/val}/.pcap
def _longest_hex_length(pcap_path):
    """Return the largest hex-string length of any cleaned packet in ``pcap_path``.

    Packets for which ``clean_packet`` returns 0 (no IP/IPv6 layer) are
    skipped. Returns 0 when no packet qualifies.
    """
    longest = 0
    # The context manager closes the capture file even if parsing fails midway.
    with scapy.PcapReader(pcap_path) as reader:
        for raw_packet in reader:
            # Only the IP header and the transport-layer header are kept.
            cleaned = clean_packet(raw_packet)
            if cleaned == 0:
                continue
            longest = max(longest, len(bytes(cleaned).hex()))
    return longest


max_payload_length = 0

for entry in os.listdir(dataset_path):
    logger.info(f'Processing {entry}')
    entry_dir = f'{dataset_path}/{entry}'

    if entry == 'test':
        # test/ holds the pcap files directly.
        for file_name in os.listdir(entry_dir):
            if 'pcap' in file_name:
                logger.info(f'Processing {entry} {file_name}')
                max_payload_length = max(
                    max_payload_length,
                    _longest_hex_length(f'{entry_dir}/{file_name}'),
                )
    else:
        # Split directories hold one sub-folder per subset, each with pcap files.
        for folder in os.listdir(entry_dir):
            logger.info(f'Processing {entry} {folder}')

            for file_name in os.listdir(f'{entry_dir}/{folder}'):
                if 'pcap' in file_name:
                    max_payload_length = max(
                        max_payload_length,
                        _longest_hex_length(f'{entry_dir}/{folder}/{file_name}'),
                    )

print(max_payload_length)

2025-01-06 09:23:55,224 - root - INFO - Processing train_val_split_2
2025-01-06 09:23:55,225 - root - INFO - Processing train_val_split_2 train
2025-01-06 09:24:37,064 - root - INFO - Processing train_val_split_2 val
2025-01-06 09:24:42,053 - root - INFO - Processing train_val_split_0
2025-01-06 09:24:42,055 - root - INFO - Processing train_val_split_0 train
2025-01-06 09:25:23,823 - root - INFO - Processing train_val_split_0 val
2025-01-06 09:25:28,906 - root - INFO - Processing test
2025-01-06 09:25:28,908 - root - INFO - Processing test malware.pcap
2025-01-06 09:27:05,395 - root - INFO - Processing test benign.pcap
2025-01-06 09:29:46,850 - root - INFO - Processing train_val_split_1
2025-01-06 09:29:46,854 - root - INFO - Processing train_val_split_1 train
2025-01-06 09:30:28,056 - root - INFO - Processing train_val_split_1 val


136


In [5]:
# Collect the vocabulary of 4-hex-character tokens that occur in the cleaned packets.
# Expected layout: file/{test.pcap} or file/{train_val_split_0}/{train.pcap}
def _update_vocab(pcap_path, vocab):
    """Add every 4-hex-character token of each cleaned packet in ``pcap_path`` to ``vocab``.

    Packets for which ``clean_packet`` returns 0 are skipped. ``vocab`` is
    modified in place.
    """
    # The context manager closes the capture file even if parsing fails midway.
    with scapy.PcapReader(pcap_path) as reader:
        for raw_packet in reader:
            # Only the IP header and the transport-layer header are kept.
            cleaned = clean_packet(raw_packet)
            if cleaned == 0:
                continue

            # Right-pad with '0' so shorter packets reach max_payload_length characters.
            hex_str = bytes(cleaned).hex().ljust(max_payload_length, '0')
            vocab.update(hex_str[i:i + 4] for i in range(0, len(hex_str), 4))


byte_vocab = set()

for entry in os.listdir(dataset_path):
    logger.info(f'Processing {entry}')

    # Top-level entries whose name contains 'test' are handled here: the pcap
    # is scanned, anything else is skipped.
    if 'test' in entry:
        if 'pcap' in entry:
            _update_vocab(f'{dataset_path}/{entry}', byte_vocab)
        continue

    # Every other entry is treated as a split directory holding the pcap files.
    for file_name in os.listdir(f'{dataset_path}/{entry}'):
        logger.info(f'Processing {entry} {file_name}')
        if 'pcap' not in file_name:
            continue

        _update_vocab(f'{dataset_path}/{entry}/{file_name}', byte_vocab)

# Sort so that token indices are identical on every run; iterating a set of
# strings follows hash order, which is randomized per interpreter process.
byte_list = sorted(byte_vocab)
print(len(byte_list))

2025-01-02 07:20:13,746 - root - INFO - Processing train_val_split_2
2025-01-02 07:20:13,748 - root - INFO - Processing train_val_split_2 train
2025-01-02 07:20:13,748 - root - INFO - Processing train_val_split_2 val
2025-01-02 07:20:13,749 - root - INFO - Processing train_val_split_0
2025-01-02 07:20:13,750 - root - INFO - Processing train_val_split_0 train
2025-01-02 07:20:13,751 - root - INFO - Processing train_val_split_0 val
2025-01-02 07:20:13,751 - root - INFO - Processing test
2025-01-02 07:20:13,752 - root - INFO - Processing train_val_split_1
2025-01-02 07:20:13,753 - root - INFO - Processing train_val_split_1 train
2025-01-02 07:20:13,753 - root - INFO - Processing train_val_split_1 val


0


In [5]:
# Collect the vocabulary of 4-hex-character tokens that occur in the cleaned packets.
# Expected layout: file/test/{test.pcap} or file/train_val_split_0/{train/val}/.pcap
def _update_vocab(pcap_path, vocab):
    """Add every 4-hex-character token of each cleaned packet in ``pcap_path`` to ``vocab``.

    Packets for which ``clean_packet`` returns 0 are skipped. ``vocab`` is
    modified in place.
    """
    # The context manager closes the capture file even if parsing fails midway.
    with scapy.PcapReader(pcap_path) as reader:
        for raw_packet in reader:
            # Only the IP header and the transport-layer header are kept.
            cleaned = clean_packet(raw_packet)
            if cleaned == 0:
                continue

            # Right-pad with '0' so shorter packets reach max_payload_length characters.
            hex_str = bytes(cleaned).hex().ljust(max_payload_length, '0')
            vocab.update(hex_str[i:i + 4] for i in range(0, len(hex_str), 4))


byte_vocab = set()

for entry in os.listdir(dataset_path):
    logger.info(f'Processing {entry}')
    entry_dir = f'{dataset_path}/{entry}'

    if entry == 'test':
        # test/ holds the pcap files directly.
        for file_name in os.listdir(entry_dir):
            if 'pcap' in file_name:
                logger.info(f'Processing {entry} {file_name}')
                _update_vocab(f'{entry_dir}/{file_name}', byte_vocab)
    else:
        # Split directories hold one sub-folder per subset, each with pcap files.
        for folder in os.listdir(entry_dir):
            logger.info(f'Processing {entry} {folder}')

            for file_name in os.listdir(f'{entry_dir}/{folder}'):
                if 'pcap' in file_name:
                    _update_vocab(f'{entry_dir}/{folder}/{file_name}', byte_vocab)

# Sort so that token indices are identical on every run; iterating a set of
# strings follows hash order, which is randomized per interpreter process.
byte_list = sorted(byte_vocab)
print(len(byte_list))

2025-01-06 09:30:59,612 - root - INFO - Processing train_val_split_2
2025-01-06 09:30:59,614 - root - INFO - Processing train_val_split_2 train
2025-01-06 09:31:41,884 - root - INFO - Processing train_val_split_2 val
2025-01-06 09:31:46,931 - root - INFO - Processing train_val_split_0
2025-01-06 09:31:46,935 - root - INFO - Processing train_val_split_0 train
2025-01-06 09:32:28,891 - root - INFO - Processing train_val_split_0 val
2025-01-06 09:32:34,092 - root - INFO - Processing test
2025-01-06 09:32:34,094 - root - INFO - Processing test malware.pcap
2025-01-06 09:34:12,872 - root - INFO - Processing test benign.pcap
2025-01-06 09:36:57,296 - root - INFO - Processing train_val_split_1
2025-01-06 09:36:57,298 - root - INFO - Processing train_val_split_1 train
2025-01-06 09:37:39,427 - root - INFO - Processing train_val_split_1 val


65536


In [11]:
# # file/{test.pcap} or file/{train_val_split_0}/{train.pcap}
def write2csv(type, data, label):
    """Append one encoded packet as a row of ``{output_path}/raw/{type}.csv``.

    When the file does not exist yet it is created with a header row: one
    numbered column per 4-hex-character token, followed by ``class``.

    Args:
        type: target name relative to ``{output_path}/raw`` without the
            ``.csv`` suffix; may contain a sub-directory
            (e.g. ``'train_val_split_0/train'``).
        data: sequence of token indices for one packet.
        label: class label written as the last column.
    """
    file_path = f'{output_path}/raw/{type}.csv'

    # Nested targets need their parent directory; open() does not create it.
    os.makedirs(os.path.dirname(file_path), exist_ok=True)

    needs_header = not os.path.isfile(file_path)
    with open(file_path, mode='a', newline='') as file:
        writer = csv.writer(file)
        if needs_header:
            # Round up: a packet of max_payload_length hex characters is split
            # into ceil(max_payload_length / 4) tokens, so truncating here
            # would leave the header one column short of the data rows.
            column_count = -(-max_payload_length // 4)
            writer.writerow([str(i) for i in range(column_count)] + ['class'])
        writer.writerow([*data, label])


# Encode every packet as a row of vocabulary indices and append it, with its
# label from the matching parquet file, to the per-split CSV files.

# Token -> index lookup built once; byte_list.index() rescans the whole
# vocabulary for every token of every packet.
token_index = {token: position for position, token in enumerate(byte_list)}


def process_packet(packet, id):
    """Encode one packet; runs in a pool worker.

    Args:
        packet: raw scapy packet as read from the pcap file.
        id: position of the packet in the pcap file, used as the row
            position in ``data_frame`` to look up the label.

    Returns:
        ``(token_indices, label)``, or None when ``clean_packet`` rejects
        the packet.
    """
    packet = clean_packet(packet)
    if packet == 0:
        return None

    # Right-pad with '0' so shorter packets reach max_payload_length characters.
    hex_str = bytes(packet).hex().ljust(max_payload_length, '0')
    hex_array = [token_index[hex_str[i:i + 4]] for i in range(0, len(hex_str), 4)]
    # NOTE(review): reads the module-level data_frame set by the loop below;
    # presumably the workers inherit it when the pool is created - confirm on
    # platforms that do not fork.
    return (hex_array, data_frame.iloc[id]['class_str'])


def _encode_pcap(pcap_path):
    """Return ``process_packet`` results for every packet of ``pcap_path``, in file order."""
    # Read all packets first so the capture file is closed before the pool starts.
    with scapy.PcapReader(pcap_path) as reader:
        tasks = [(raw_packet, index) for index, raw_packet in enumerate(reader)]

    with Pool(cpu_count()) as pool:
        return pool.starmap(process_packet, tasks)


for entry in os.listdir(dataset_path):
    logger.info(f'Processing {entry}')
    if 'test' in entry:
        if 'pcap' in entry:
            stem = entry[:-5]  # drop the '.pcap' suffix
            data_frame = pd.read_parquet(f'{dataset_path}/{stem}.parquet')

            for result in _encode_pcap(f'{dataset_path}/{entry}'):
                if result is not None:
                    write2csv(stem, result[0], result[1])
    else:
        for file_name in os.listdir(f'{dataset_path}/{entry}'):
            logger.info(f'Processing {entry} {file_name}')
            if 'pcap' in file_name:
                stem = file_name[:-5]  # drop the '.pcap' suffix
                data_frame = pd.read_parquet(f'{dataset_path}/{entry}/{stem}.parquet')

                for result in _encode_pcap(f'{dataset_path}/{entry}/{file_name}'):
                    if result is not None:
                        write2csv(f'{entry}/{stem}', result[0], result[1])

logger.info('Finished')

2024-12-17 16:03:44,584 - root - INFO - Processing test.pcap
2024-12-17 16:08:57,656 - root - INFO - Processing train_val_split_2
2024-12-17 16:08:57,661 - root - INFO - Processing train_val_split_2 train.parquet
2024-12-17 16:08:57,662 - root - INFO - Processing train_val_split_2 val.pcap
2024-12-17 16:10:18,305 - root - INFO - Processing train_val_split_2 train.pcap
2024-12-17 16:18:06,987 - root - INFO - Processing train_val_split_2 val.parquet
2024-12-17 16:18:06,990 - root - INFO - Processing train_val_split_0
2024-12-17 16:18:06,992 - root - INFO - Processing train_val_split_0 train.parquet
2024-12-17 16:18:06,992 - root - INFO - Processing train_val_split_0 val.pcap
2024-12-17 16:19:29,644 - root - INFO - Processing train_val_split_0 train.pcap
2024-12-17 16:27:37,387 - root - INFO - Processing train_val_split_0 val.parquet
2024-12-17 16:27:37,390 - root - INFO - Processing train_val_split_1
2024-12-17 16:27:37,393 - root - INFO - Processing train_val_split_1 train.parquet
2024-

In [6]:
# # file/test/{test.pcap} or file/train_val_split_0/{train/val}/.pcap
def write2csv(type, data, label):
    """Append one encoded packet as a row of ``{output_path}/raw/{type}.csv``.

    When the file does not exist yet it is created with a header row: one
    numbered column per 4-hex-character token, followed by ``class``.

    Args:
        type: target name relative to ``{output_path}/raw`` without the
            ``.csv`` suffix; may contain a sub-directory
            (e.g. ``'train_val_split_0/train'``).
        data: sequence of token indices for one packet.
        label: class label written as the last column.
    """
    file_path = f'{output_path}/raw/{type}.csv'

    # Nested targets need their parent directory; open() does not create it.
    os.makedirs(os.path.dirname(file_path), exist_ok=True)

    needs_header = not os.path.isfile(file_path)
    with open(file_path, mode='a', newline='') as file:
        writer = csv.writer(file)
        if needs_header:
            # Round up: a packet of max_payload_length hex characters is split
            # into ceil(max_payload_length / 4) tokens, so truncating here
            # would leave the header one column short of the data rows.
            column_count = -(-max_payload_length // 4)
            writer.writerow([str(i) for i in range(column_count)] + ['class'])
        writer.writerow([*data, label])


# Encode every packet as a row of vocabulary indices and append it, labelled
# with the name of the pcap file it came from, to the per-split CSV files.

# Token -> index lookup built once; byte_list.index() rescans the whole
# vocabulary for every token of every packet.
token_index = {token: position for position, token in enumerate(byte_list)}


def process_packet(packet, id):
    """Encode one packet; runs in a pool worker.

    Args:
        packet: raw scapy packet as read from the pcap file.
        id: position of the packet in the pcap file (not used here; kept
            so the call signature stays the same).

    Returns:
        ``(token_indices, file_name)``, or None when ``clean_packet``
        rejects the packet.
    """
    packet = clean_packet(packet)
    if packet == 0:
        return None

    # Right-pad with '0' so shorter packets reach max_payload_length characters.
    hex_str = bytes(packet).hex().ljust(max_payload_length, '0')
    hex_array = [token_index[hex_str[i:i + 4]] for i in range(0, len(hex_str), 4)]
    # The label is the pcap file name, extension included (e.g. 'malware.pcap').
    # NOTE(review): reads the module-level file_name set by the loop below;
    # presumably the workers inherit it when the pool is created - confirm on
    # platforms that do not fork.
    return (hex_array, file_name)


def _encode_pcap(pcap_path):
    """Return ``process_packet`` results for every packet of ``pcap_path``, in file order."""
    # Read all packets first so the capture file is closed before the pool starts.
    with scapy.PcapReader(pcap_path) as reader:
        tasks = [(raw_packet, index) for index, raw_packet in enumerate(reader)]

    with Pool(cpu_count()) as pool:
        return pool.starmap(process_packet, tasks)


for entry in os.listdir(dataset_path):
    logger.info(f'Processing {entry}')
    entry_dir = f'{dataset_path}/{entry}'

    if entry == 'test':
        # test/ holds one pcap per class; all rows go to raw/test.csv.
        for file_name in os.listdir(entry_dir):
            if 'pcap' in file_name:
                logger.info(f'Processing {entry} {file_name}')

                for result in _encode_pcap(f'{entry_dir}/{file_name}'):
                    if result is not None:
                        write2csv(entry, result[0], result[1])
    else:
        # Split directories hold one sub-folder per subset; each sub-folder
        # gets its own CSV at raw/{entry}/{folder}.csv.
        for folder in os.listdir(entry_dir):
            logger.info(f'Processing {entry} {folder}')

            for file_name in os.listdir(f'{entry_dir}/{folder}'):
                if 'pcap' in file_name:
                    for result in _encode_pcap(f'{entry_dir}/{folder}/{file_name}'):
                        if result is not None:
                            write2csv(f'{entry}/{folder}', result[0], result[1])

logger.info('Finished')

2025-01-06 09:38:25,036 - root - INFO - Processing train_val_split_2
2025-01-06 09:38:25,038 - root - INFO - Processing train_val_split_2 train
2025-01-06 09:40:50,212 - root - INFO - Processing train_val_split_2 val
2025-01-06 09:41:17,693 - root - INFO - Processing train_val_split_0
2025-01-06 09:41:17,695 - root - INFO - Processing train_val_split_0 train
2025-01-06 09:43:44,559 - root - INFO - Processing train_val_split_0 val
2025-01-06 09:44:13,135 - root - INFO - Processing test
2025-01-06 09:44:13,137 - root - INFO - Processing test malware.pcap
2025-01-06 09:50:06,155 - root - INFO - Processing test benign.pcap
2025-01-06 10:02:07,116 - root - INFO - Processing train_val_split_1
2025-01-06 10:02:07,119 - root - INFO - Processing train_val_split_1 train
2025-01-06 10:04:50,090 - root - INFO - Processing train_val_split_1 val
2025-01-06 10:05:16,259 - root - INFO - Finished
