# Description
process pcap data for get traditional baseline
label.pcap -> feature alignment Matrix/csv

In [1]:
import os
import logging
import scapy.all as scapy
from multiprocessing import Pool, cpu_count
import csv
import pandas as pd

os.chdir('LLM4Traffic/code/Traditional')

logging.basicConfig(       
    level=logging.INFO,            
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',  
    handlers=[
        logging.FileHandler('logs/feature_alignment.log', mode='w'),  
        logging.StreamHandler()          
    ],
    force=True
)

logger = logging.getLogger()

In [24]:
dataset_path = 'LLM4Traffic/pipeline/polishednsLen/tls'
output_path = 'LLM4Traffic/code/Traditional/datasets/polishednsLen/tls'

In [25]:
def get_key_val(line):
    layer = None
    sublayer = None
    key = None
    val = None

    if '###' in line:             
        if '|###' in line:                
            sublayer = line.strip('|#[] ')  
            print(f'sublayer ### {sublayer}')    
        else:                
            layer = line.strip('#[] ')      

    if '=' in line:            
        if '|' in line and 'sublayer' in locals():                
            key, val = line.strip('| ').split('=', 1)        
            key, val = key.strip(), val.strip('\' ')          
            print(f'sublayer ||| {sublayer}')  
        else:                 
            key, val = line.split('=', 1)   
            key, val = key.strip(), val.strip('\' ')    
            
    return layer, sublayer, key, val

def clean_packet(packet):
    if packet.haslayer(scapy.Ether):
        packet = packet[scapy.Ether].payload

    if packet.haslayer(scapy.IP) or packet.haslayer('IPv6'):
        if packet.haslayer(scapy.UDP):
            packet[scapy.UDP].remove_payload()   
        if packet.haslayer(scapy.TCP):
            packet[scapy.TCP].remove_payload()    
    
    return packet

# get how many features/ip flags/tcp flags/.. in the dataset

pass_fields = ['src', 'dst', 'sport', 'dport'] # ignore these fields
field_set = set()
ip_flags_set = set()
tcp_flags_set = set()

def get_all_features(packet):
    protocol = None
    for line in packet.show2(dump=True).split('\n'): 
        layer, sub, key, val = get_key_val(line)

        if layer is not None:
                protocol = layer
        else:
            if key not in pass_fields and key is not None:   
                if key == 'options': # options is a list, there are two new fields
                    if len(val) == 0:
                        continue
                    for option in eval(val):
                            if option[0] == 'NOP': # ignore NOP (No Operation)
                                continue
                            elif option[0] == 'Timestamp': # Timestamp and Echo are a pair
                                field_set.add(f'{protocol}_{option[0]}')
                                field_set.add(f'{protocol}_echo_{option[0]}')
                            elif option[0] == 'SAck': # SAckOK is a flag
                                field_set.add(f'{protocol}_{option[0]}')
                                field_set.add(f'{protocol}_echo_{option[0]}')
                                continue

                            field_set.add(f'{protocol}_{option[0]}')
                    continue     

                # distinguish ip flags and tcp flags
                if key == 'flags': 
                    if protocol == 'IP':
                        ip_flags_set.add(f'{val}')
                    elif protocol == 'TCP':
                        tcp_flags_set.add(f'{val}')  

                if key == 'chksum' and protocol in ['TCP', 'UDP']:   # merge tcp and udp checksum
                    field_set.add('TL_chksum')
                elif f'{protocol}_{key}'== 'IP_tos' or f'{protocol}_{key}' == 'IPv6_tc': # merge tos and traffic class
                    field_set.add('IP_tos')
                elif key == 'version': # merge ip version and ipv6 version
                    field_set.add('IP_version')
                elif f'{protocol}_{key}'== 'IP_ttl' or f'{protocol}_{key}' == 'IPv6_hlim': # merge ttl and hot limit
                    field_set.add('IP_ttl')
                elif f'{protocol}_{key}'== 'IP_len' or f'{protocol}_{key}' == 'IPv6_plen': # merge len and payload len
                    field_set.add('IP_len')
                elif f'{protocol}_{key}'== 'IP_proto' or f'{protocol}_{key}' == 'IPv6_nh': # merge proto and next header
                    field_set.add('IP_proto')
                else:
                    field_set.add(f'{protocol}_{key}')

    return field_set

In [None]:
# file/{test.pcap} or file/{train_val_split_0}/{train.pcap}
for type in os.listdir(dataset_path):
    logger.info(f'Processing {type}')

    if 'test' in type:
        if 'pcap' in type:
            packets =  scapy.PcapReader(f'{dataset_path}/{type}')
            for id, packet in enumerate(packets):
                # only require ip header and transportaion layer
                packet = clean_packet(packet)
                get_all_features(packet)
        continue

    for class_id, file_name in enumerate(os.listdir(f'{dataset_path}/{type}')):
        logger.info(f'Processing {type} {file_name}')
        if 'pcap' not in file_name:
            continue

        packets =  scapy.PcapReader(f'{dataset_path}/{type}/{file_name}')
        for id, packet in enumerate(packets):
            # only require ip header and transportaion layer
            packet = clean_packet(packet)
            get_all_features(packet)

print(field_set)
print(ip_flags_set)
print(tcp_flags_set)
print()


In [26]:
# file/test/{test.pcap} or file/train_val_split_0/{train/val}/.pcap
for type in os.listdir(dataset_path):
    logger.info(f'Processing {type}')

    if type == 'test':
        for class_id, file_name in enumerate(os.listdir(f'{dataset_path}/{type}')):
            if 'pcap' in file_name:
                logger.info(f'Processing {type} {file_name}')

                packets =  scapy.PcapReader(f'{dataset_path}/{type}/{file_name}')
                for id, packet in enumerate(packets):
                    # only require ip header and transportaion layer
                    packet = clean_packet(packet)
                    get_all_features(packet)
                
    else:
        for class_id, folder in enumerate(os.listdir(f'{dataset_path}/{type}')):
            logger.info(f'Processing {type} {folder}')
            
            for file_name in os.listdir(f'{dataset_path}/{type}/{folder}'):
                logger.info(f'Processing {type} {folder} {file_name}')
                if 'pcap' in file_name:
                    packets =  scapy.PcapReader(f'{dataset_path}/{type}/{folder}/{file_name}')
                    for id, packet in enumerate(packets):
                        
                        # only require ip header and transportaion layer
                        packet = clean_packet(packet)
                        get_all_features(packet)
                        
                
                    
print(field_set)
print(ip_flags_set)
print(tcp_flags_set)
print()


2025-01-23 14:24:44,116 - root - INFO - Processing train_val_split_2
2025-01-23 14:24:44,138 - root - INFO - Processing train_val_split_2 train
2025-01-23 14:24:44,152 - root - INFO - Processing train_val_split_2 train 51cto.pcap
2025-01-23 14:24:44,786 - root - INFO - Processing train_val_split_2 train sina.pcap
2025-01-23 14:24:45,360 - root - INFO - Processing train_val_split_2 train overleaf.pcap
2025-01-23 14:24:45,926 - root - INFO - Processing train_val_split_2 train youtube.pcap
2025-01-23 14:24:46,442 - root - INFO - Processing train_val_split_2 train alibaba.pcap
2025-01-23 14:24:46,840 - root - INFO - Processing train_val_split_2 train instagram.pcap
2025-01-23 14:24:47,363 - root - INFO - Processing train_val_split_2 train iqiyi.pcap
2025-01-23 14:24:47,865 - root - INFO - Processing train_val_split_2 train ibm.pcap
2025-01-23 14:24:48,405 - root - INFO - Processing train_val_split_2 train dailymotion.pcap
2025-01-23 14:24:49,010 - root - INFO - Processing train_val_split_2

{'IP_id', 'TCP_flags', 'TL_chksum', 'TCP_urgptr', 'IP_chksum', 'TCP_echo_Timestamp', 'IP_frag', 'TCP_seq', 'IP_ttl', 'TCP_ack', 'IP_len', 'TCP_dataofs', 'IP_version', 'TCP_SAck', 'TCP_echo_SAck', 'TCP_Timestamp', 'TCP_reserved', 'IP_proto', 'IP_ihl', 'IP_tos', 'IP_flags', 'TCP_window'}
{'', 'DF'}
{'A', 'FPA', 'PA', 'PAC', 'FPAC', 'FA', 'AC'}



In [27]:
# convert the flag to integer

ip_flags_map = {}
tcp_flags_map = {}

for id, ip_flag in enumerate(ip_flags_set):
    if ip_flag != '':
        ip_flags_map[ip_flag] = id + 1

for id, tcp_flag in enumerate(tcp_flags_set):
    if tcp_flag != '':
        tcp_flags_map[tcp_flag] = id + 1

print(ip_flags_map)
print(tcp_flags_map)
print(field_set)

{'DF': 2}
{'A': 1, 'FPA': 2, 'PA': 3, 'PAC': 4, 'FPAC': 5, 'FA': 6, 'AC': 7}
{'IP_id', 'TCP_flags', 'TL_chksum', 'TCP_urgptr', 'IP_chksum', 'TCP_echo_Timestamp', 'IP_frag', 'TCP_seq', 'IP_ttl', 'TCP_ack', 'IP_len', 'TCP_dataofs', 'IP_version', 'TCP_SAck', 'TCP_echo_SAck', 'TCP_Timestamp', 'TCP_reserved', 'IP_proto', 'IP_ihl', 'IP_tos', 'IP_flags', 'TCP_window'}


In [28]:
field_set = {'IP_id', 'TCP_flags', 'TL_chksum', 'TCP_urgptr', 'IP_chksum', 'TCP_echo_Timestamp', 'IP_frag', 'TCP_seq', 'IP_ttl', 'TCP_ack', 'IP_len', 'TCP_dataofs', 'IP_version', 'TCP_SAck', 'TCP_echo_SAck', 'TCP_Timestamp', 'TCP_reserved', 'IP_proto', 'IP_ihl', 'IP_tos', 'IP_flags', 'TCP_window'}
ip_flags_map = {'DF': 1}
tcp_flags_map = {'A': 1, 'FPA': 2, 'PA': 3, 'PAC': 4, 'FPAC': 5, 'FA': 6, 'AC': 7}
ip_proto_map = {'tcp': 1, 'udp': 2, 'TCP': 1, 'UDP': 2}


In [29]:
pass_fields = ['src', 'dst', 'sport', 'dport']
map_features = {'TCP_flags', 'IP_flags', 'IP_proto'}
hex_features = {'IP_tos', 'IP_chksum', 'TL_chksum', 'UDP_chksum', 'TCP_chksum'}
equal_features = {'TCP_chksum', 'UDP_chksum', 'IPV6_tc', 'IPv6_version', 'IPv6_plen', 'IPv6_nh', 'IPv6_hlim'}

def hex2int(value):
    return int(value, 16)

def special_rules(field, value):
    if not value: # convert '' to 0
        return 0
    elif field == 'TCP_flags':     # convert flags to int
        return tcp_flags_map[value]
    elif field == 'IP_flags':     # convert flags to int
        return ip_flags_map[value]
    elif field == 'IP_proto': # convert protocol to int
        return ip_proto_map[value]
    
def equal_rules(field): # convert ipv6 fields name to ipv4 fields name
    if field == 'TCP_chksum' or field == 'UDP_chksum':
        return 'TL_chksum'
    elif field == 'IPv6_tc':
        return 'IP_tos'
    elif field == 'IPv6_version':
        return 'IP_version'
    elif field == 'IPv6_plen':
        return 'IP_len'
    elif field == 'IPv6_nh':
        return 'IP_proto'
    elif field == 'IPv6_hlim':
        return 'IP_ttl'
     
def packet2features(packet):
    protocol = None
    packet_key_val = {}
    for line in packet.show2(dump=True).split('\n'): 
        layer, _, key, val = get_key_val(line)
        if layer is not None:
                protocol = layer
        else:
            if key not in pass_fields and key is not None:   
                if key == 'options': # options is a list, there are two new fields
                    for option in eval(val):
                            if option[0] == 'NOP': # ignore NOP (No Operation)
                                continue
                            elif option[0] == 'Timestamp': # Timestamp and Echo are a pair, Echo is the second field
                                packet_key_val[f'{protocol}_{option[0]}'] = option[1][0]
                                packet_key_val[f'{protocol}_edcho_{option[0]}'] = option[1][1]
                            elif option[0] == 'SAck': # The format of SAck is smiliar with timestamp and they are a pair, Echo is the second field
                                packet_key_val[f'{protocol}_{option[0]}'] = option[1][0]
                                packet_key_val[f'{protocol}_echo_{option[0]}'] = option[1][1]
                            elif option[0] == 'SAckOK': 
                                packet_key_val[f'{protocol}_{option[0]}'] = int(val == 1)
                            elif option[0] == 'EOL': # EOL is a flag, always 0
                                packet_key_val[f'{protocol}_{option[0]}'] = 0
                            else:
                                packet_key_val[f'{protocol}_{option[0]}'] = option[1]
                    continue   

                if f'{protocol}_{key}' in equal_features: # special rules for some fields
                    field = equal_rules(f'{protocol}_{key}')
                else:
                    field = f'{protocol}_{key}'


                if field in map_features: # special rules for some fields
                    packet_key_val[field] = special_rules(field, val)
                elif field in hex_features:
                    packet_key_val[field] = hex2int(val)
                else:
                    packet_key_val[field] = int(val) 
                
    packet_array = [packet_key_val.get(key, 0) for key in field_set]

    return packet_array

def write2csv(type, data, label):
    file_path = f'{output_path}/unclean/{type}.csv'

    if not os.path.isfile(file_path):
        with open(file_path, mode='w', newline='') as file:
            writer = csv.writer(file)
            print(list(field_set) + ['class'])
            writer.writerow(list(field_set) + ['class'])

    with open(file_path, mode='a', newline='') as file:
        writer = csv.writer(file)
        writer.writerow(data + [label])

In [None]:
# file/{test.pcap} or file/{train_val_split_0}/{train.pcap}

for type in os.listdir(dataset_path):
    logger.info(f'Processing {type}')
    if 'test' in type: 
        if 'pcap' in type:
            packets =  scapy.PcapReader(f'{dataset_path}/{type}')
            data_frame = pd.read_parquet(f'{dataset_path}/{type[:-5]}.parquet')

            for id, packet in enumerate(packets):
                packet = clean_packet(packet)
                packet_array = packet2features(packet)
                write2csv(type[:-5], packet_array, data_frame.iloc[id]['class_str'])
    else:
        for class_id, file_name in enumerate(os.listdir(f'{dataset_path}/{type}')):
            logger.info(f'Processing {type} {file_name}')
            if 'pcap' in file_name:
                packets =  scapy.PcapReader(f'{dataset_path}/{type}/{file_name}')
                data_frame = pd.read_parquet(f'{dataset_path}/{type}/{file_name[:-5]}.parquet')

                for id, packet in enumerate(packets):
                    packet = clean_packet(packet)
                    packet_array = packet2features(packet)
                    write2csv(f'{type}/{file_name[:-5]}', packet_array, data_frame.iloc[id]['class_str'])
 
logger.info('Finished')  

In [30]:
# file/test/{test.pcap} or file/train_val_split_0/{train/val}/.pcap

for type in os.listdir(dataset_path):
    logger.info(f'Processing {type}')

    if 'test' == type:
        for class_id, file_name in enumerate(os.listdir(f'{dataset_path}/{type}')):
            if 'pcap' in file_name:
                logger.info(f'Processing {type} {file_name}')

                packets =  scapy.PcapReader(f'{dataset_path}/{type}/{file_name}')
                for id, packet in enumerate(packets):
                    packet = clean_packet(packet)
                    packet_array = packet2features(packet)
                    write2csv(type, packet_array, file_name)
    else:
        for class_id, folder in enumerate(os.listdir(f'{dataset_path}/{type}')):
            logger.info(f'Processing {type} {folder}')

            for file_name in os.listdir(f'{dataset_path}/{type}/{folder}'):
                if 'pcap' in file_name:
                    packets =  scapy.PcapReader(f'{dataset_path}/{type}/{folder}/{file_name}')
                    
                    for id, packet in enumerate(packets):
                        packet = clean_packet(packet)
                        packet_array = packet2features(packet)
                        write2csv(f'{type}/{folder}', packet_array, file_name)
 
logger.info('Finished')  

2025-01-23 14:32:22,961 - root - INFO - Processing train_val_split_2
2025-01-23 14:32:22,963 - root - INFO - Processing train_val_split_2 train


['IP_id', 'TCP_SAck', 'TCP_echo_SAck', 'TCP_flags', 'TL_chksum', 'TCP_urgptr', 'IP_chksum', 'TCP_echo_Timestamp', 'TCP_Timestamp', 'TCP_reserved', 'IP_frag', 'TCP_seq', 'IP_ttl', 'IP_proto', 'TCP_ack', 'IP_ihl', 'IP_tos', 'IP_len', 'IP_flags', 'TCP_dataofs', 'IP_version', 'TCP_window', 'class']


2025-01-23 14:33:29,572 - root - INFO - Processing train_val_split_2 val


['IP_id', 'TCP_SAck', 'TCP_echo_SAck', 'TCP_flags', 'TL_chksum', 'TCP_urgptr', 'IP_chksum', 'TCP_echo_Timestamp', 'TCP_Timestamp', 'TCP_reserved', 'IP_frag', 'TCP_seq', 'IP_ttl', 'IP_proto', 'TCP_ack', 'IP_ihl', 'IP_tos', 'IP_len', 'IP_flags', 'TCP_dataofs', 'IP_version', 'TCP_window', 'class']


2025-01-23 14:33:33,176 - root - INFO - Processing train_val_split_0
2025-01-23 14:33:33,177 - root - INFO - Processing train_val_split_0 train


['IP_id', 'TCP_SAck', 'TCP_echo_SAck', 'TCP_flags', 'TL_chksum', 'TCP_urgptr', 'IP_chksum', 'TCP_echo_Timestamp', 'TCP_Timestamp', 'TCP_reserved', 'IP_frag', 'TCP_seq', 'IP_ttl', 'IP_proto', 'TCP_ack', 'IP_ihl', 'IP_tos', 'IP_len', 'IP_flags', 'TCP_dataofs', 'IP_version', 'TCP_window', 'class']


2025-01-23 14:34:13,964 - root - INFO - Processing train_val_split_0 val


['IP_id', 'TCP_SAck', 'TCP_echo_SAck', 'TCP_flags', 'TL_chksum', 'TCP_urgptr', 'IP_chksum', 'TCP_echo_Timestamp', 'TCP_Timestamp', 'TCP_reserved', 'IP_frag', 'TCP_seq', 'IP_ttl', 'IP_proto', 'TCP_ack', 'IP_ihl', 'IP_tos', 'IP_len', 'IP_flags', 'TCP_dataofs', 'IP_version', 'TCP_window', 'class']


2025-01-23 14:34:28,824 - root - INFO - Processing test
2025-01-23 14:34:28,825 - root - INFO - Processing test 51cto.pcap


['IP_id', 'TCP_SAck', 'TCP_echo_SAck', 'TCP_flags', 'TL_chksum', 'TCP_urgptr', 'IP_chksum', 'TCP_echo_Timestamp', 'TCP_Timestamp', 'TCP_reserved', 'IP_frag', 'TCP_seq', 'IP_ttl', 'IP_proto', 'TCP_ack', 'IP_ihl', 'IP_tos', 'IP_len', 'IP_flags', 'TCP_dataofs', 'IP_version', 'TCP_window', 'class']


2025-01-23 14:34:31,063 - root - INFO - Processing test sina.pcap
2025-01-23 14:34:33,469 - root - INFO - Processing test overleaf.pcap
2025-01-23 14:34:45,521 - root - INFO - Processing test youtube.pcap
2025-01-23 14:34:47,581 - root - INFO - Processing test alibaba.pcap
2025-01-23 14:34:47,830 - root - INFO - Processing test instagram.pcap
2025-01-23 14:34:48,887 - root - INFO - Processing test iqiyi.pcap
2025-01-23 14:34:50,299 - root - INFO - Processing test ibm.pcap
2025-01-23 14:34:51,674 - root - INFO - Processing test dailymotion.pcap
2025-01-23 14:35:01,519 - root - INFO - Processing test smzdm.pcap
2025-01-23 14:35:08,231 - root - INFO - Processing test gravatar.pcap
2025-01-23 14:35:09,297 - root - INFO - Processing test microsoft.pcap
2025-01-23 14:35:11,182 - root - INFO - Processing test cloudfront.pcap
2025-01-23 14:35:11,719 - root - INFO - Processing test jb51.pcap
2025-01-23 14:35:16,983 - root - INFO - Processing test pinduoduo.pcap
2025-01-23 14:35:18,952 - root - 

['IP_id', 'TCP_SAck', 'TCP_echo_SAck', 'TCP_flags', 'TL_chksum', 'TCP_urgptr', 'IP_chksum', 'TCP_echo_Timestamp', 'TCP_Timestamp', 'TCP_reserved', 'IP_frag', 'TCP_seq', 'IP_ttl', 'IP_proto', 'TCP_ack', 'IP_ihl', 'IP_tos', 'IP_len', 'IP_flags', 'TCP_dataofs', 'IP_version', 'TCP_window', 'class']


2025-01-23 14:39:01,830 - root - INFO - Processing train_val_split_1 val


['IP_id', 'TCP_SAck', 'TCP_echo_SAck', 'TCP_flags', 'TL_chksum', 'TCP_urgptr', 'IP_chksum', 'TCP_echo_Timestamp', 'TCP_Timestamp', 'TCP_reserved', 'IP_frag', 'TCP_seq', 'IP_ttl', 'IP_proto', 'TCP_ack', 'IP_ihl', 'IP_tos', 'IP_len', 'IP_flags', 'TCP_dataofs', 'IP_version', 'TCP_window', 'class']


2025-01-23 14:39:16,719 - root - INFO - Finished


# for data clean

normalization each row

**train and test should use same distribution of normalization**

### For USTCTFC2016
field_set = {'TCP_window', 'TCP_chksum', 'TCP_reserved', 'TCP_ack', 'TCP_echo_SAck', 'TCP_flags', 'TCP_SAckOK', 'IP_id', 'IP_proto', 'IP_len', 'TCP_seq', 'TCP_WScale', 'TCP_EOL', 'UDP_chksum', 'UDP_len', 'TCP_dataofs', 'IP_version', 'TCP_SAck', 'TCP_MSS', 'IP_chksum', 'IP_flags', 'TCP_urgptr', 'IP_frag', 'IP_ttl', 'TCP_echo_Timestamp', 'TCP_Timestamp', 'IP_ihl', 'IP_tos'}

ip_flags_map = {'DF': 1}
tcp_flags_map = {'PA': 1, 'S': 2, 'FA': 3, 'A': 4, 'FPA': 5, 'R': 6, 'SA': 7, 'RA': 8}
ip_proto_map = {'tcp': 1, 'udp': 2}

normalize_set = {'TCP_window', 'TCP_chksum', 'TCP_ack', 'TCP_echo_SAck', 'IP_id', 'IP_len', 'TCP_seq', 'UDP_chksum', 'UDP_len','TCP_SAck', 'TCP_MSS', 'IP_chksum', 'TCP_echo_Timestamp', 'TCP_Timestamp', 'IP_tos'}



In [9]:
import pandas as pd
import numpy as np

normalize_set = {'TCP_window', 'TCP_chksum', 'TCP_ack', 'TCP_echo_SAck', 'IP_id', 'IP_len', 'TCP_seq', 'UDP_chksum', 'UDP_len','TCP_SAck', 'TCP_MSS', 'IP_chksum', 'TCP_echo_Timestamp', 'TCP_Timestamp', 'IP_tos'}
types = ['train_val_split_0', 'train_val_split_1', 'train_val_split_2', 'test.csv']

def log_normalize(value):
    return np.log1p(value)

for type in types:
    if type == 'test.csv':
        df = pd.read_csv(f'{output_path}/unclean/{type}')
        for field in normalize_set:
            if field in df.columns:
                df[field] = df[field].apply(log_normalize)
        df.to_csv(f'{output_path}/normalized/{type}', index=False)
    else:
        for file in os.listdir(f'{output_path}/unclean/{type}'):
            df = pd.read_csv(f'{output_path}/unclean/{type}/{file}')
            print(type, file)
            for field in normalize_set:
                if field in df.columns:
                    df[field] = df[field].apply(log_normalize)
            df.to_csv(f'{output_path}/normalized/{type}/{file}', index=False)

train_val_split_0 train.csv
train_val_split_0 val.csv
train_val_split_1 train.csv
train_val_split_1 val.csv
train_val_split_2 train.csv
train_val_split_2 val.csv


### For TLS
field_set = {'TCP_window', 'TCP_chksum', 'TCP_reserved', 'TCP_ack', 'TCP_echo_SAck', 'TCP_flags', 'IP_id', 'IP_proto', 'IP_len', 'TCP_seq', 'TCP_dataofs', 'IP_version', 'TCP_SAck', 'IP_chksum', 'TCP_urgptr', 'IP_flags', 'IP_frag', 'IP_ttl', 'TCP_echo_Timestamp', 'TCP_Timestamp', 'IP_ihl', 'IP_tos'}

ip_flags_map = {'DF': 1}
tcp_flags_map = {'PA': 1, 'FA': 2, 'A': 3, 'FPA': 4, 'R': 5, 'PAC': 6, 'RA': 7}
ip_proto_map = {'tcp': 1, 'udp': 2}

normalize_set = {'TCP_window', 'TCP_chksum', 'TCP_ack', 'TCP_echo_SAck', 'IP_id', 'IP_len', 'TCP_seq', 'TCP_SAck', 'IP_chksum','TCP_echo_Timestamp', 'TCP_Timestamp', 'IP_tos'}

In [31]:
import pandas as pd
import numpy as np

normalize_set = {'TCP_window', 'TCP_chksum', 'TCP_ack', 'TCP_echo_SAck', 'IP_id', 'IP_len', 'TCP_seq', 'TCP_SAck', 'IP_chksum','TCP_echo_Timestamp', 'TCP_Timestamp', 'IP_tos'}
types = ['train_val_split_0', 'train_val_split_1', 'train_val_split_2', 'test.csv']

def log_normalize(value):
    return np.log1p(value)

for type in types:
    if type == 'test.csv':
        df = pd.read_csv(f'{output_path}/unclean/{type}')
        for field in normalize_set:
            if field in df.columns:
                df[field] = df[field].apply(log_normalize)
        df.to_csv(f'{output_path}/normalized/{type}', index=False)
    else:
        for file in os.listdir(f'{output_path}/unclean/{type}'):
            df = pd.read_csv(f'{output_path}/unclean/{type}/{file}')
            if 'TCP_30' in df.columns:
                print('OK')
                df = df.drop(columns = ['TCP_30'])
            print(type, file)
            for field in normalize_set:
                if field in df.columns:
                    df[field] = df[field].apply(log_normalize)
            df.to_csv(f'{output_path}/normalized/{type}/{file}', index=False)

train_val_split_0 train.csv
train_val_split_0 val.csv
train_val_split_1 train.csv
train_val_split_1 val.csv
train_val_split_2 train.csv
train_val_split_2 val.csv


# For VPN
field_set = {'TCP_MSS', 'TCP_seq', 'IP_id', 'IP_chksum', 'TCP_urgptr', 'UDP_len', 'TCP_echo_SAck', 'UDP_chksum', 'TCP_echo_Timestamp', 'IP_version', 'TCP_window', 'IP_len', 'IP_flags', 'TCP_SAck', 'TCP_WScale', 'TCP_EOL', 'TCP_ack', 'TCP_chksum', 'IP_frag', 'TCP_flags', 'IP_ttl', 'IP_ihl', 'TCP_dataofs', 'TCP_Timestamp', 'IP_proto', 'IP_tos', 'TCP_reserved', 'TCP_SAckOK'}

ip_flags_map = {'DF': 1}
tcp_flags_map = {'R': 1, 'SA': 2, 'RA': 3, 'S': 4, 'PA': 5, 'FPA': 6, 'A': 7, 'FA': 8}
ip_proto_map = {'tcp': 1, 'udp': 2}

 normalize_set = {'TCP_MSS', 'TCP_seq', 'IP_id', 'IP_chksum','UDP_len', 'TCP_echo_SAck', 'UDP_chksum', 'TCP_echo_Timestamp','TCP_window', 'IP_len',  'TCP_SAck','TCP_ack', 'TCP_chksum', 'TCP_Timestamp', 'IP_tos'}

In [23]:
import pandas as pd
import numpy as np

normalize_set = {'TL_chksum', 'TCP_MSS', 'TCP_seq', 'IP_id', 'IP_chksum', 'UDP_len', 'TCP_echo_SAck', 'UDP_chksum', 'TCP_echo_Timestamp','TCP_window', 'IP_len',  'TCP_SAck','TCP_ack', 'TCP_chksum', 'TCP_Timestamp', 'IP_tos'}
types = ['train_val_split_0', 'train_val_split_1', 'train_val_split_2', 'test.csv']

def log_normalize(value):
    return np.log1p(value)

for type in types:
    if type == 'test.csv':
        df = pd.read_csv(f'{output_path}/unclean/{type}')
        for field in normalize_set:
            if field in df.columns:
                df[field] = df[field].apply(log_normalize)
        df.to_csv(f'{output_path}/normalized/{type}', index=False)
    else:
        for file in os.listdir(f'{output_path}/unclean/{type}'):
            df = pd.read_csv(f'{output_path}/unclean/{type}/{file}')
            for field in normalize_set:
                if field in df.columns:
                    df[field] = df[field].apply(log_normalize)
            df.to_csv(f'{output_path}/normalized/{type}/{file}', index=False)
