# Description
Process pcap data to build the feature set for the traditional baseline.
label.pcap -> aligned feature matrix (CSV)

In [15]:
import os
import logging
import scapy.all as scapy
from multiprocessing import Pool, cpu_count
import csv
import pandas as pd

# Work from the project directory so the relative log path below resolves there.
os.chdir('/home/dauin_user/yzhao/debunk_representation/code/Traditional')

# Log INFO and above to both a file and the console.
# mode='w' truncates the log file each time this cell runs; force=True replaces
# any handlers already attached to the root logger (e.g. from a previous run of
# this cell in the same kernel).
logging.basicConfig(       
    level=logging.INFO,            
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',  
    handlers=[
        logging.FileHandler('logs/add_ip_port.log', mode='w'),  
        logging.StreamHandler()          
    ],
    force=True
)

# Root logger, used by the processing loops in this file.
logger = logging.getLogger()

In [16]:
# Root folder that is scanned for pcap files (see the directory-walking loops).
dataset_path = '/home/dauin_user/yzhao/explanation/data/per-packet/tls/ood'
# Root folder under which the generated CSV files are written.
output_path = '/home/dauin_user/yzhao/debunk_representation/code/Traditional/datasets/polishednsLen811/tls'

In [17]:
def get_key_val(line):
    """Parse one line of a scapy ``show2(dump=True)`` dump.

    Args:
        line: A single text line of the dump.

    Returns:
        A 4-tuple ``(layer, sublayer, key, val)``; every element that does not
        apply to this line is ``None``.

        * ``layer``    - name taken from a ``###[ NAME ]###`` header line.
        * ``sublayer`` - name taken from a ``|###[ NAME ]###`` header line.
        * ``key``/``val`` - the two sides of a ``key = value`` line, with
          surrounding blanks removed and leading/trailing single quotes
          stripped from the value.
    """
    layer = sublayer = key = val = None

    if '###' in line:
        if '|###' in line:
            sublayer = line.strip('|#[] ')
            # Diagnostic only; kept at DEBUG so normal runs are not flooded.
            logging.debug('sublayer header: %s', sublayer)
        else:
            layer = line.strip('#[] ')

    if '=' in line:
        # Lines that carry a '|' get the bar and blanks stripped from both
        # ends before they are split. (The former
        # "'sublayer' in locals()" guard was always true, so it is dropped.)
        text = line.strip('| ') if '|' in line else line
        key, val = text.split('=', 1)
        key, val = key.strip(), val.strip('\' ')

    return layer, sublayer, key, val

def clean_packet(packet):
    """Reduce a packet to its network and transport headers.

    An Ethernet layer, if present, is discarded by continuing with its payload.
    When the remaining packet has an IP or IPv6 layer, the payload of any UDP
    and/or TCP layer is removed in place.

    Returns:
        The (possibly re-rooted) packet.
    """
    if packet.haslayer(scapy.Ether):
        packet = packet[scapy.Ether].payload

    is_ip_packet = packet.haslayer(scapy.IP) or packet.haslayer('IPv6')
    if is_ip_packet:
        for transport in (scapy.UDP, scapy.TCP):
            if packet.haslayer(transport):
                packet[transport].remove_payload()

    return packet

# get how many features/ip flags/tcp flags/.. in the dataset
# These module-level containers are filled in place by get_all_features().

pass_fields = [] # ignore these fields
field_set = set()       # feature (column) names seen so far
ip_flags_set = set()    # distinct textual values of the IP 'flags' field
tcp_flags_set = set()   # distinct textual values of the TCP 'flags' field

def get_all_features(packet):
    """Record which feature names and flag values occur in one packet.

    The packet's ``show2(dump=True)`` text is parsed line by line with
    get_key_val(). Feature names are added to the module-level ``field_set``;
    the textual values of IP and TCP ``flags`` are added to ``ip_flags_set``
    and ``tcp_flags_set``.

    Returns:
        The module-level ``field_set`` (the same object on every call).
    """
    protocol = None  # name of the layer whose fields are currently being read
    for line in packet.show2(dump=True).split('\n'): 
        layer, sub, key, val = get_key_val(line)

        if layer is not None:
                protocol = layer
        else:
            if key not in pass_fields and key is not None:   
                if key == 'options': # options is a list, there are two new fields
                    if len(val) == 0:
                        continue
                    # NOTE(review): eval() executes text derived from capture
                    # contents; consider ast.literal_eval if the pcaps are not trusted.
                    for option in eval(val):
                            if option[0] == 'NOP': # ignore NOP (No Operation)
                                continue
                            elif option[0] == 'Timestamp': # Timestamp and Echo are a pair
                                field_set.add(f'{protocol}_{option[0]}')
                                field_set.add(f'{protocol}_echo_{option[0]}')
                            elif option[0] == 'SAck': # SAck yields a pair of columns, like Timestamp
                                field_set.add(f'{protocol}_{option[0]}')
                                field_set.add(f'{protocol}_echo_{option[0]}')
                                continue

                            # every other option (and Timestamp again) gets one column
                            field_set.add(f'{protocol}_{option[0]}')
                    continue     

                # addresses are split into one column per group
                if protocol == 'IP':
                    if key == 'src' or key == 'dst':
                        for i in range(4):
                            field_set.add(f'{protocol}_{key}_{i}')
                        continue
                elif protocol == 'IPv6':
                    if key == 'src' or key == 'dst':
                        for i in range(8):
                            field_set.add(f'{protocol}_{key}_{i}')
                        continue

                # distinguish ip flags and tcp flags
                # (no continue: 'flags' also reaches the final else below and
                # is added as '<protocol>_flags')
                if key == 'flags': 
                    if protocol == 'IP':
                        ip_flags_set.add(f'{val}')
                    elif protocol == 'TCP':
                        tcp_flags_set.add(f'{val}')  

                # This is a separate 'if', so for TCP/UDP 'chksum' the chain
                # below still runs and also adds '<protocol>_chksum'.
                if key == 'chksum' and protocol in ['TCP', 'UDP']:   # merge tcp and udp checksum
                    field_set.add('TL_chksum')
                if key == 'sport' or key == 'dport': # merge sport and dport
                    if protocol in ['TCP', 'UDP']:
                        field_set.add(f'TL_{key}')
                elif f'{protocol}_{key}'== 'IP_tos' or f'{protocol}_{key}' == 'IPv6_tc': # merge tos and traffic class
                    field_set.add('IP_tos')
                elif key == 'version': # merge ip version and ipv6 version
                    field_set.add('IP_version')
                elif f'{protocol}_{key}'== 'IP_ttl' or f'{protocol}_{key}' == 'IPv6_hlim': # merge ttl and hop limit
                    field_set.add('IP_ttl')
                elif f'{protocol}_{key}'== 'IP_len' or f'{protocol}_{key}' == 'IPv6_plen': # merge len and payload len
                    field_set.add('IP_len')
                elif f'{protocol}_{key}'== 'IP_proto' or f'{protocol}_{key}' == 'IPv6_nh': # merge proto and next header
                    field_set.add('IP_proto')
                else:
                    field_set.add(f'{protocol}_{key}')


                
    return field_set

In [7]:
# Dataset layout: <dataset_path>/test/<name>.pcap or
# <dataset_path>/train_val_split_0/{train,val}/<name>.pcap

def _scan_pcap(pcap_path):
    """Run get_all_features over every packet of one capture file."""
    # PcapReader keeps the capture file open; the context manager closes it
    # when the capture is exhausted (or parsing raises) instead of leaking it.
    with scapy.PcapReader(pcap_path) as packets:
        for packet in packets:
            # only the IP header and the transport-layer header are needed
            get_all_features(clean_packet(packet))


for split_name in os.listdir(dataset_path):
    logger.info(f'Processing {split_name}')
    split_dir = f'{dataset_path}/{split_name}'

    if split_name == 'test':
        # test captures sit directly inside the split folder
        for file_name in os.listdir(split_dir):
            if 'pcap' in file_name:
                logger.info(f'Processing {split_name} {file_name}')
                _scan_pcap(f'{split_dir}/{file_name}')
    else:
        # every other entry is treated as a split folder with one sub-folder level
        for folder in os.listdir(split_dir):
            logger.info(f'Processing {split_name} {folder}')

            for file_name in os.listdir(f'{split_dir}/{folder}'):
                logger.info(f'Processing {split_name} {folder} {file_name}')
                if 'pcap' in file_name:
                    _scan_pcap(f'{split_dir}/{folder}/{file_name}')

print(field_set)
print(ip_flags_set)
print(tcp_flags_set)
print()

2025-04-17 13:25:34,753 - root - INFO - Processing test
2025-04-17 13:25:34,754 - root - INFO - Processing test 51cto.pcap
2025-04-17 13:25:35,370 - root - INFO - Processing test sina.pcap
2025-04-17 13:25:36,055 - root - INFO - Processing test overleaf.pcap
2025-04-17 13:25:38,205 - root - INFO - Processing test youtube.pcap
2025-04-17 13:25:38,740 - root - INFO - Processing test alibaba.pcap
2025-04-17 13:25:38,973 - root - INFO - Processing test instagram.pcap
2025-04-17 13:25:39,395 - root - INFO - Processing test iqiyi.pcap
2025-04-17 13:25:39,808 - root - INFO - Processing test ibm.pcap
2025-04-17 13:25:40,246 - root - INFO - Processing test dailymotion.pcap
2025-04-17 13:25:41,943 - root - INFO - Processing test smzdm.pcap
2025-04-17 13:25:43,144 - root - INFO - Processing test gravatar.pcap
2025-04-17 13:25:43,547 - root - INFO - Processing test microsoft.pcap
2025-04-17 13:25:44,050 - root - INFO - Processing test cloudfront.pcap
2025-04-17 13:25:44,331 - root - INFO - Process

{'IP_proto', 'IP_chksum', 'TCP_echo_SAck', 'IP_len', 'IP_frag', 'IP_src_3', 'IP_src_1', 'IP_dst_2', 'IP_version', 'IP_flags', 'IP_id', 'IP_dst_0', 'TCP_echo_Timestamp', 'TL_chksum', 'IP_dst_1', 'TCP_SAck', 'IP_dst_3', 'TCP_reserved', 'IP_src_2', 'TL_dport', 'IP_ihl', 'TCP_urgptr', 'TCP_ack', 'TCP_Timestamp', 'TCP_chksum', 'IP_ttl', 'TCP_dataofs', 'TCP_flags', 'IP_src_0', 'IP_tos', 'TL_sport', 'TCP_seq', 'TCP_window'}
{'', 'DF'}
{'FPA', 'PA', 'A'}



In [4]:
# convert the flag to integer

def _build_flag_map(flags):
    """Map every non-empty flag string to a 1-based integer id.

    The flags are sorted first so that the ids do not depend on set iteration
    order (which varies between interpreter sessions for strings) and are
    contiguous. The empty flag string gets no entry.
    """
    named_flags = sorted(flag for flag in flags if flag != '')
    return {flag: idx for idx, flag in enumerate(named_flags, start=1)}


ip_flags_map = _build_flag_map(ip_flags_set)
tcp_flags_map = _build_flag_map(tcp_flags_set)

print(ip_flags_map)
print(tcp_flags_map)
print(field_set)

{}
{}
set()


In [18]:
# Hard-coded feature names and value maps used by the conversion code below.
# packet2features() emits one value per entry of field_set; write2csv() uses
# the same set for the CSV header.
field_set = {'IP_src_2', 'TCP_chksum', 'TCP_echo_SAck', 'IP_src_0', 'TL_chksum', 'TL_sport', 'IP_ihl', 'IP_dst_1', 'IP_dst_2', 'IP_frag', 'TCP_ack', 'IP_len', 'TCP_window', 'TCP_flags', 'IP_src_1', 'IP_chksum', 'IP_flags', 'TCP_SAck', 'IP_src_3', 'TCP_echo_Timestamp', 'TCP_urgptr', 'IP_dst_3', 'IP_dst_0', 'IP_proto', 'IP_id', 'IP_tos', 'IP_ttl', 'TCP_reserved', 'TCP_seq', 'TL_dport', 'TCP_dataofs', 'TCP_Timestamp', 'IP_version'}
# Flag text -> integer id. special_rules() indexes these directly, so a flag
# value that is missing here raises KeyError.
ip_flags_map = {'DF': 1}
tcp_flags_map = {'FA': 1, 'FPAC': 2, 'FPA': 3, 'A': 4, 'PA': 5, 'PAC': 6, 'AC': 7}
# Protocol name -> integer id; lower- and upper-case spellings share an id.
ip_proto_map = {'tcp': 1, 'udp': 2, 'TCP': 1, 'UDP': 2}
# Service name -> port number, used when a port value matches one of these names.
port_map = {'https': 443, 'supfiledbg': 1127, 'remctl': 4373, 'ospf6d': 2606, 'daap': 3689, 'svn': 3690, 'radius': 1812, 'dircproxy': 57000, 'gnutella_rtr': 6347, 'imaps': 993, 'fido': 60179, 'git': 9418, 'gsiftp': 2811, 'nfs': 2049, 'gnunet': 2086, 'x11_2': 6002, 'redis': 6379, 'radmin_port': 4899, 'amqp': 5672, 'cfengine': 5308, 'venus': 2430, 'iprop': 2121, 'x11_3': 6003, 'tfido': 60177, 'x11_1': 6000, 'x11_6': 6006, 'fax': 4557}


In [19]:
# pass_fields = ['src', 'dst', 'sport', 'dport']
pass_fields = []  # field keys skipped by packet2features()
port_fields = set()  # not referenced by the code in this file section
# Features converted through a lookup map (see special_rules).
map_features = {'TCP_flags', 'IP_flags', 'IP_proto'}
# Features whose text is parsed as a base-16 number (see hex2int).
hex_features = {'IP_tos', 'IP_chksum', 'TL_chksum', 'UDP_chksum', 'TCP_chksum'}
# Features whose value may be a service name that is looked up in port_map.
port_features = {'TL_sport', 'TL_dport'}
# '<layer>_<key>' names that are renamed to a shared column by equal_rules().
# NOTE(review): 'IPV6_tc' is spelled with a capital V, while equal_rules() and
# the other entries use 'IPv6_'; as written it cannot match an 'IPv6_tc' field.
# Confirm the intended spelling (and how the value should be parsed) before changing it.
equal_features = {'TCP_chksum', 'UDP_chksum', 'IPV6_tc', 'IPv6_version', 'IPv6_plen', 'IPv6_nh', 'IPv6_hlim', 'UDP_sport', 'UDP_dport', 'TCP_sport', 'TCP_dport'}

def hex2int(value):
    """Parse *value* as a base-16 number and return it as an int."""
    return int(value, base=16)

def special_rules(field, value):
    """Translate the textual value of a mapped feature into its integer id.

    Args:
        field: Feature name; 'TCP_flags', 'IP_flags' and 'IP_proto' are handled.
        value: Text of the field.

    Returns:
        0 for an empty/falsy value, the id from the matching ``*_map`` for the
        three handled fields, and ``None`` for any other field.

    Raises:
        KeyError: if *value* has no entry in the matching map.
    """
    # an empty value ('' and the like) is encoded as 0
    if not value:
        return 0

    if field == 'TCP_flags':
        return tcp_flags_map[value]
    if field == 'IP_flags':
        return ip_flags_map[value]
    if field == 'IP_proto':
        return ip_proto_map[value]
    return None
    
def equal_rules(field): # convert ipv6 fields name to ipv4 fields name
    """Return the shared column name for a layer-specific field name.

    TCP/UDP checksum and port fields map to their 'TL_*' column, and IPv6
    header fields map to the matching 'IP_*' column. Names without a rule
    yield ``None``.
    """
    shared_names = {
        'TCP_chksum': 'TL_chksum',
        'UDP_chksum': 'TL_chksum',
        'IPv6_tc': 'IP_tos',
        'IPv6_version': 'IP_version',
        'IPv6_plen': 'IP_len',
        'IPv6_nh': 'IP_proto',
        'IPv6_hlim': 'IP_ttl',
        'UDP_sport': 'TL_sport',
        'TCP_sport': 'TL_sport',
        'UDP_dport': 'TL_dport',
        'TCP_dport': 'TL_dport',
    }
    return shared_names.get(field)
     
def packet2features(packet):
    """Convert one packet into a list of feature values aligned with ``field_set``.

    The packet's ``show2(dump=True)`` text is parsed line by line with
    get_key_val(). Each field is stored under a column name
    ('<layer>_<key>', or the shared name given by equal_rules) and converted
    according to the feature group it belongs to (map / hex / port / plain int).

    Args:
        packet: Object providing ``show2(dump=True)`` (a scapy packet).

    Returns:
        A list with one value per entry of ``field_set``, in the iteration
        order of ``field_set``; features absent from the packet are 0.

    Raises:
        KeyError: from special_rules() for a flag/protocol value with no map entry.
        ValueError: for a field value that cannot be parsed as a number.
    """
    import ipaddress  # function-scope import: only needed to expand IPv6 addresses

    protocol = None  # name of the layer whose fields are currently being read
    packet_key_val = {}
    for line in packet.show2(dump=True).split('\n'):
        layer, _, key, val = get_key_val(line)
        if layer is not None:
            protocol = layer
            continue
        if key is None or key in pass_fields:
            continue

        if key == 'options':  # options is a list of (name, value) pairs
            if not val:
                # nothing to evaluate (same guard as get_all_features)
                continue
            # NOTE(review): eval() executes text derived from capture contents;
            # consider ast.literal_eval if the pcaps are not trusted.
            for option in eval(val):
                name = option[0]
                if name == 'NOP':  # ignore NOP (No Operation)
                    continue
                if name in ('Timestamp', 'SAck'):
                    # two-part options: first element, then the "echo" element
                    packet_key_val[f'{protocol}_{name}'] = option[1][0]
                    packet_key_val[f'{protocol}_echo_{name}'] = option[1][1]
                elif name == 'SAckOK':
                    # NOTE(review): `val` is the raw options text, so this
                    # comparison is always False and the column is always 0.
                    # Left unchanged to keep existing outputs comparable -
                    # confirm whether presence should be encoded as 1.
                    packet_key_val[f'{protocol}_{name}'] = int(val == 1)
                elif name == 'EOL':  # EOL is a flag, always 0
                    packet_key_val[f'{protocol}_{name}'] = 0
                else:
                    packet_key_val[f'{protocol}_{name}'] = option[1]
            continue

        if key in ('src', 'dst'):
            if protocol == 'IP':
                for i, octet in enumerate(val.split('.')):
                    packet_key_val[f'{protocol}_{key}_{i}'] = int(octet)
                continue
            if protocol == 'IPv6':
                # Expand '::' compression to all eight groups first, so each
                # group lands at its real position (previously 'fe80::1' put
                # the trailing group at index 2 instead of index 7).
                groups = ipaddress.IPv6Address(val).exploded.split(':')
                for i, group in enumerate(groups):
                    packet_key_val[f'{protocol}_{key}_{i}'] = int(group, 16)
                continue

        raw_field = f'{protocol}_{key}'
        # merge layer-specific names into their shared column where a rule exists
        field = equal_rules(raw_field) if raw_field in equal_features else raw_field

        if field in map_features:
            packet_key_val[field] = special_rules(field, val)
        elif field in hex_features:
            packet_key_val[field] = hex2int(val)
        elif field in port_features:
            # known service names become port numbers; anything else is kept as is
            packet_key_val[field] = port_map.get(val, val)
        else:
            packet_key_val[field] = int(val)

    return [packet_key_val.get(name, 0) for name in field_set]

def write2csv(type, data, label):
    """Append one feature row to ``<output_path>/unclean_ood_test/<type>.csv``.

    When the CSV does not exist yet, a header row (the ``field_set`` names
    followed by 'class') is printed and written first.

    Args:
        type: CSV name without extension; may contain sub-folders
            (e.g. 'train_val_split_0/train').
        data: List of feature values for one packet.
        label: Value written to the trailing 'class' column.
    """
    file_path = f'{output_path}/unclean_ood_test/{type}.csv'
    # `type` may include sub-folders, so create the CSV's own parent directory
    # rather than only the top-level output folder.
    os.makedirs(os.path.dirname(file_path), exist_ok=True)

    is_new_file = not os.path.isfile(file_path)
    # append mode creates the file when needed, so one open() covers both cases
    with open(file_path, mode='a', newline='') as file:
        writer = csv.writer(file)
        if is_new_file:
            header = list(field_set) + ['class']
            print(header)
            writer.writerow(header)
        writer.writerow(data + [label])

In [20]:
# Dataset layout: <dataset_path>/test/<name>.pcap or
# <dataset_path>/train_val_split_0/{train,val}/<name>.pcap

def _convert_pcap(pcap_path, csv_name, label):
    """Write one CSV row per packet of a capture file via write2csv()."""
    # PcapReader keeps the capture file open; the context manager closes it
    # when the capture is exhausted (or conversion raises) instead of leaking it.
    with scapy.PcapReader(pcap_path) as packets:
        for packet in packets:
            packet_array = packet2features(clean_packet(packet))
            write2csv(csv_name, packet_array, label)


for split_name in os.listdir(dataset_path):
    logger.info(f'Processing {split_name}')
    split_dir = f'{dataset_path}/{split_name}'

    if 'test' == split_name:
        for file_name in os.listdir(split_dir):
            if 'pcap' in file_name:
                logger.info(f'Processing {split_name} {file_name}')
                # the capture's file name is used as the class label
                _convert_pcap(f'{split_dir}/{file_name}', split_name, file_name)
    elif 'train_val_split' in split_name:
        for folder in os.listdir(split_dir):
            logger.info(f'Processing {split_name} {folder}')

            for file_name in os.listdir(f'{split_dir}/{folder}'):
                if 'pcap' in file_name:
                    _convert_pcap(f'{split_dir}/{folder}/{file_name}',
                                  f'{split_name}/{folder}', file_name)
    # any other directory entry is skipped

logger.info('Finished')  

2025-04-29 15:37:36,783 - root - INFO - Processing 3-tuple-test
2025-04-29 15:37:36,784 - root - INFO - Processing test
2025-04-29 15:37:36,809 - root - INFO - Processing test 51cto.pcap


['IP_chksum', 'IP_flags', 'IP_ttl', 'IP_src_0', 'TCP_urgptr', 'TCP_ack', 'TCP_Timestamp', 'IP_id', 'TL_chksum', 'TCP_seq', 'TCP_dataofs', 'IP_src_3', 'IP_dst_1', 'TCP_window', 'IP_dst_2', 'TCP_chksum', 'TL_dport', 'TCP_flags', 'IP_dst_3', 'IP_version', 'TL_sport', 'IP_src_2', 'IP_tos', 'IP_dst_0', 'IP_frag', 'IP_len', 'TCP_reserved', 'TCP_echo_Timestamp', 'IP_ihl', 'TCP_SAck', 'IP_proto', 'TCP_echo_SAck', 'IP_src_1', 'class']


2025-04-29 15:37:38,734 - root - INFO - Processing test sina.pcap
2025-04-29 15:37:39,160 - root - INFO - Processing test overleaf.pcap
2025-04-29 15:37:40,658 - root - INFO - Processing test youtube.pcap
2025-04-29 15:37:41,031 - root - INFO - Processing test alibaba.pcap
2025-04-29 15:37:41,184 - root - INFO - Processing test instagram.pcap
2025-04-29 15:37:41,458 - root - INFO - Processing test iqiyi.pcap
2025-04-29 15:37:41,759 - root - INFO - Processing test ibm.pcap
2025-04-29 15:37:42,089 - root - INFO - Processing test dailymotion.pcap
2025-04-29 15:37:43,359 - root - INFO - Processing test smzdm.pcap
2025-04-29 15:37:44,257 - root - INFO - Processing test gravatar.pcap
2025-04-29 15:37:44,541 - root - INFO - Processing test microsoft.pcap
2025-04-29 15:37:44,906 - root - INFO - Processing test cloudfront.pcap
2025-04-29 15:37:45,115 - root - INFO - Processing test jb51.pcap
2025-04-29 15:37:45,862 - root - INFO - Processing test pinduoduo.pcap
2025-04-29 15:37:46,231 - root - 

# for data clean

Normalize each row by applying `log1p` to the fields listed in `normalize_set`.

**Train and test data must use the same normalization.**

### For USTCTFC2016
field_set = {'TCP_window', 'TCP_chksum', 'TCP_reserved', 'TCP_ack', 'TCP_echo_SAck', 'TCP_flags', 'TCP_SAckOK', 'IP_id', 'IP_proto', 'IP_len', 'TCP_seq', 'TCP_WScale', 'TCP_EOL', 'UDP_chksum', 'UDP_len', 'TCP_dataofs', 'IP_version', 'TCP_SAck', 'TCP_MSS', 'IP_chksum', 'IP_flags', 'TCP_urgptr', 'IP_frag', 'IP_ttl', 'TCP_echo_Timestamp', 'TCP_Timestamp', 'IP_ihl', 'IP_tos'}

ip_flags_map = {'DF': 1}
tcp_flags_map = {'PA': 1, 'S': 2, 'FA': 3, 'A': 4, 'FPA': 5, 'R': 6, 'SA': 7, 'RA': 8}
ip_proto_map = {'tcp': 1, 'udp': 2}

normalize_set = {'TCP_window', 'TCP_chksum', 'TCP_ack', 'TCP_echo_SAck', 'IP_id', 'IP_len', 'TCP_seq', 'UDP_chksum', 'UDP_len','TCP_SAck', 'TCP_MSS', 'IP_chksum', 'TCP_echo_Timestamp', 'TCP_Timestamp', 'IP_tos'}



In [None]:
import pandas as pd
import numpy as np

# Columns that receive the log1p transform (only those present in a CSV are touched).
normalize_set = {'TCP_window', 'TCP_chksum', 'TCP_ack', 'TCP_echo_SAck', 'IP_id', 'IP_len', 'TCP_seq', 'UDP_chksum', 'UDP_len','TCP_SAck', 'TCP_MSS', 'IP_chksum', 'TCP_echo_Timestamp', 'TCP_Timestamp', 'IP_tos'}
# Entries below the input folder: split folders holding several CSVs, plus the test CSV.
types = ['train_val_split_0', 'train_val_split_1', 'train_val_split_2', 'test.csv']

def log_normalize(value):
    """Return log(1 + value), computed with numpy.log1p."""
    return np.log1p(value)

def _normalize_csv(src_path, dst_path, label=None):
    """Log-normalise the ``normalize_set`` columns of one CSV and save it.

    Args:
        src_path: CSV file to read.
        dst_path: CSV file to write (without the index column).
        label: Optional tuple of values printed once the file has been read.
    """
    df = pd.read_csv(src_path)
    if label is not None:
        print(*label)
    for field in normalize_set:
        if field in df.columns:
            df[field] = df[field].apply(log_normalize)
    # to_csv does not create missing folders, so make sure the target exists
    os.makedirs(os.path.dirname(dst_path), exist_ok=True)
    df.to_csv(dst_path, index=False)

src_root = f'{output_path}/unclean_add_ip_port'
dst_root = f'{output_path}/add_info'

for entry in types:
    if entry == 'test.csv':
        _normalize_csv(f'{src_root}/{entry}', f'{dst_root}/{entry}')
    else:
        for file in os.listdir(f'{src_root}/{entry}'):
            _normalize_csv(f'{src_root}/{entry}/{file}',
                           f'{dst_root}/{entry}/{file}',
                           label=(entry, file))

train_val_split_0 train.csv
train_val_split_0 val.csv
train_val_split_1 train.csv
train_val_split_1 val.csv
train_val_split_2 train.csv
train_val_split_2 val.csv


### For TLS
field_set = {'TCP_window', 'TCP_chksum', 'TCP_reserved', 'TCP_ack', 'TCP_echo_SAck', 'TCP_flags', 'IP_id', 'IP_proto', 'IP_len', 'TCP_seq', 'TCP_dataofs', 'IP_version', 'TCP_SAck', 'IP_chksum', 'TCP_urgptr', 'IP_flags', 'IP_frag', 'IP_ttl', 'TCP_echo_Timestamp', 'TCP_Timestamp', 'IP_ihl', 'IP_tos'}

ip_flags_map = {'DF': 1}
tcp_flags_map = {'PA': 1, 'FA': 2, 'A': 3, 'FPA': 4, 'R': 5, 'PAC': 6, 'RA': 7}
ip_proto_map = {'tcp': 1, 'udp': 2}

normalize_set = {'TCP_window', 'TCP_chksum', 'TCP_ack', 'TCP_echo_SAck', 'IP_id', 'IP_len', 'TCP_seq', 'TCP_SAck', 'IP_chksum','TCP_echo_Timestamp', 'TCP_Timestamp', 'IP_tos'}

In [21]:
import pandas as pd
import numpy as np

# Columns that receive the log1p transform (only those present in a CSV are touched).
normalize_set = {'TCP_window', 'TCP_chksum', 'TCP_ack', 'TCP_echo_SAck', 'IP_id', 'IP_len', 'TCP_seq', 'TCP_SAck', 'IP_chksum','TCP_echo_Timestamp', 'TCP_Timestamp', 'IP_tos'}
types = ['test.csv']

def log_normalize(value):
    """Return log(1 + value), computed with numpy.log1p."""
    return np.log1p(value)

def _normalize_csv(src_path, dst_path, label=None):
    """Drop the 'TCP_30' column, log-normalise ``normalize_set`` columns and save.

    Args:
        src_path: CSV file to read.
        dst_path: CSV file to write (without the index column).
        label: Optional tuple of values printed after the column check.
    """
    df = pd.read_csv(src_path)
    # 'TCP_30' is removed when present; 'OK' is printed as confirmation.
    # (presumably a column created from a TCP option with numeric kind 30 - TODO confirm)
    if 'TCP_30' in df.columns:
        print('OK')
        df = df.drop(columns = ['TCP_30'])
    if label is not None:
        print(*label)
    for field in normalize_set:
        if field in df.columns:
            df[field] = df[field].apply(log_normalize)
    # to_csv does not create missing folders; create the target for every
    # file, not only for the test CSV
    os.makedirs(os.path.dirname(dst_path), exist_ok=True)
    df.to_csv(dst_path, index=False)

src_root = f'{output_path}/unclean_ood_test'
dst_root = f'{output_path}/ood_test'

for entry in types:
    if entry == 'test.csv':
        _normalize_csv(f'{src_root}/{entry}', f'{dst_root}/{entry}')
    else:
        for file in os.listdir(f'{src_root}/{entry}'):
            _normalize_csv(f'{src_root}/{entry}/{file}',
                           f'{dst_root}/{entry}/{file}',
                           label=(entry, file))

# For VPN
field_set = {'TCP_MSS', 'TCP_seq', 'IP_id', 'IP_chksum', 'TCP_urgptr', 'UDP_len', 'TCP_echo_SAck', 'UDP_chksum', 'TCP_echo_Timestamp', 'IP_version', 'TCP_window', 'IP_len', 'IP_flags', 'TCP_SAck', 'TCP_WScale', 'TCP_EOL', 'TCP_ack', 'TCP_chksum', 'IP_frag', 'TCP_flags', 'IP_ttl', 'IP_ihl', 'TCP_dataofs', 'TCP_Timestamp', 'IP_proto', 'IP_tos', 'TCP_reserved', 'TCP_SAckOK'}

ip_flags_map = {'DF': 1}
tcp_flags_map = {'R': 1, 'SA': 2, 'RA': 3, 'S': 4, 'PA': 5, 'FPA': 6, 'A': 7, 'FA': 8}
ip_proto_map = {'tcp': 1, 'udp': 2}

normalize_set = {'TCP_MSS', 'TCP_seq', 'IP_id', 'IP_chksum','UDP_len', 'TCP_echo_SAck', 'UDP_chksum', 'TCP_echo_Timestamp','TCP_window', 'IP_len',  'TCP_SAck','TCP_ack', 'TCP_chksum', 'TCP_Timestamp', 'IP_tos'}

In [10]:
import pandas as pd
import numpy as np

# Columns that receive the log1p transform (only those present in a CSV are touched).
normalize_set = {'TL_chksum', 'TCP_MSS', 'TCP_seq', 'IP_id', 'IP_chksum', 'UDP_len', 'TCP_echo_SAck', 'UDP_chksum', 'TCP_echo_Timestamp','TCP_window', 'IP_len',  'TCP_SAck','TCP_ack', 'TCP_chksum', 'TCP_Timestamp', 'IP_tos'}
# Entries below the input folder: split folders holding several CSVs, plus the test CSV.
types = ['train_val_split_0', 'train_val_split_1', 'train_val_split_2', 'test.csv']

def log_normalize(value):
    """Return log(1 + value), computed with numpy.log1p."""
    return np.log1p(value)

def _normalize_csv(src_path, dst_path):
    """Log-normalise the ``normalize_set`` columns of one CSV and save it.

    Args:
        src_path: CSV file to read.
        dst_path: CSV file to write (without the index column).
    """
    df = pd.read_csv(src_path)
    for field in normalize_set:
        if field in df.columns:
            df[field] = df[field].apply(log_normalize)
    # to_csv does not create missing folders, so make sure the target exists
    os.makedirs(os.path.dirname(dst_path), exist_ok=True)
    df.to_csv(dst_path, index=False)

src_root = f'{output_path}/unclean_add_info'
dst_root = f'{output_path}/add_info'

for entry in types:
    if entry == 'test.csv':
        _normalize_csv(f'{src_root}/{entry}', f'{dst_root}/{entry}')
    else:
        for file in os.listdir(f'{src_root}/{entry}'):
            _normalize_csv(f'{src_root}/{entry}/{file}',
                           f'{dst_root}/{entry}/{file}')
