# Description
process pcap data for get traditional baseline
label/flow.pcap -> feature alignment Matrix/csv

In [105]:
import os
import logging
import scapy.all as scapy
from multiprocessing import Pool, cpu_count
import csv
import pandas as pd

os.chdir('LLM4Traffic/code/Traditional')

logging.basicConfig(       
    level=logging.INFO,            
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',  
    handlers=[
        logging.FileHandler('logs/flow.log', mode='w'),  
        logging.StreamHandler()          
    ],
    force=True
)

logger = logging.getLogger()

In [106]:
num_of_pkts = 7 # the first number of packets to be considered in a flow

# dataset_path = '/share/smartdata/external_pcaps/ISCX-VPN-2016/Filtered/App/flow'
dataset_path = 'external_pcaps/CSTNET-TLS1.3/Filtered/flow'
output_path = f'LLM4Traffic/code/Traditional/datasets/flow/tls-{num_of_pkts}'


In [107]:
def get_key_val(line):
    layer = None
    sublayer = None
    key = None
    val = None

    if '###' in line:             
        if '|###' in line:                
            sublayer = line.strip('|#[] ')  
            print(f'sublayer {sublayer}')    
        else:                
            layer = line.strip('#[] ')      

    if '=' in line:            
        if '|' in line and 'sublayer' in locals():                
            key, val = line.strip('| ').split('=', 1)        
            key, val = key.strip(), val.strip('\' ')          
            print(f'sublayer {sublayer}')  
        else:                 
            key, val = line.split('=', 1)   
            key, val = key.strip(), val.strip('\' ')    
            
    return layer, sublayer, key, val

def clean_packet(packet):
    if packet.haslayer(scapy.Ether):
        packet = packet[scapy.Ether].payload

    if packet.haslayer(scapy.IP) or packet.haslayer('IPv6'):
        if packet.haslayer(scapy.UDP):
            packet[scapy.UDP].remove_payload()   
        if packet.haslayer(scapy.TCP):
            packet[scapy.TCP].remove_payload()    
    
    return packet

# get how many features/ip flags/tcp flags/.. in the dataset

pass_fields = ['src', 'dst', 'sport', 'dport'] # ignore these fields
field_set = set()
ip_flags_set = set()
tcp_flags_set = set()

def get_all_features(packet):
    protocol = None
    for line in packet.show2(dump=True).split('\n'): 
        layer, _, key, val = get_key_val(line)

        if layer is not None:
                protocol = layer
        else:
            if key not in pass_fields and key is not None:   
                if key == 'options': # options is a list, there are two new fields
                    if len(val) == 0:
                        continue
                    for option in eval(val):
                            if option[0] == 'NOP': # ignore NOP (No Operation)
                                continue
                            elif option[0] == 'Timestamp': # Timestamp and Echo are a pair
                                field_set.add(f'{protocol}_{option[0]}')
                                field_set.add(f'{protocol}_echo_{option[0]}')
                            elif option[0] == 'SAck': # SAckOK is a flag
                                field_set.add(f'{protocol}_{option[0]}')
                                field_set.add(f'{protocol}_echo_{option[0]}')
                                continue

                            field_set.add(f'{protocol}_{option[0]}')
                    continue     

                # distinguish ip flags and tcp flags
                if key == 'flags': 
                    if protocol == 'IP':
                        ip_flags_set.add(f'{val}')
                    elif protocol == 'TCP':
                        tcp_flags_set.add(f'{val}')  

                if key == 'chksum' and protocol in ['TCP', 'UDP']:   # merge tcp and udp checksum
                    field_set.add('TL_chksum')
                elif f'{protocol}_{key}'== 'IP_tos' or f'{protocol}_{key}' == 'IPv6_tc': # merge tos and traffic class
                    field_set.add('IP_tos')
                elif key == 'version': # merge ip version and ipv6 version
                    field_set.add('IP_version')
                elif f'{protocol}_{key}'== 'IP_ttl' or f'{protocol}_{key}' == 'IPv6_hlim': # merge ttl and hot limit
                    field_set.add('IP_ttl')
                elif f'{protocol}_{key}'== 'IP_len' or f'{protocol}_{key}' == 'IPv6_plen': # merge len and payload len
                    field_set.add('IP_len')
                elif f'{protocol}_{key}'== 'IP_proto' or f'{protocol}_{key}' == 'IPv6_nh': # merge proto and next header
                    field_set.add('IP_proto')
                else:
                    field_set.add(f'{protocol}_{key}')

    return field_set

In [108]:
# file/test/class/flow.pcap or file/train_val_split_0/{train/val}/class/flow.pcap
for type in os.listdir(dataset_path):
    logger.info(f'Processing {type}')

    if type == 'test':
        for class_id, class_name in enumerate(os.listdir(f'{dataset_path}/{type}')):
            logger.info(f'Processing {type} {class_name}')
            for file_name in os.listdir(f'{dataset_path}/{type}/{class_name}'):
                with scapy.PcapReader(f'{dataset_path}/{type}/{class_name}/{file_name}') as packets:
                    for id, packet in enumerate(packets):
                        if id >= num_of_pkts:
                            break
                        # only require ip header and transportaion layer
                        packet = clean_packet(packet)
                        get_all_features(packet)
    else:
        for class_id, folder in enumerate(os.listdir(f'{dataset_path}/{type}')):
            logger.info(f'Processing {type} {folder}')
            
            for class_name in os.listdir(f'{dataset_path}/{type}/{folder}'):
                logger.info(f'Processing {type} {folder} {class_name}')
                for file_name in os.listdir(f'{dataset_path}/{type}/{folder}/{class_name}'):
                    with scapy.PcapReader(f'{dataset_path}/{type}/{folder}/{class_name}/{file_name}') as packets:
                        for id, packet in enumerate(packets):
                            if id >= num_of_pkts:
                                break
                            # only require ip header and transportaion layer
                            packet = clean_packet(packet)
                            get_all_features(packet)


print(field_set)
print(ip_flags_set)
print(tcp_flags_set)
print()


2025-01-08 10:28:22,900 - root - INFO - Processing train_val_split_2
2025-01-08 10:28:22,902 - root - INFO - Processing train_val_split_2 train
2025-01-08 10:28:22,907 - root - INFO - Processing train_val_split_2 train azureedge.net
2025-01-08 10:28:23,454 - root - INFO - Processing train_val_split_2 train hubspot.com
2025-01-08 10:28:23,929 - root - INFO - Processing train_val_split_2 train pinduoduo.com
2025-01-08 10:28:24,414 - root - INFO - Processing train_val_split_2 train jd.com
2025-01-08 10:28:25,000 - root - INFO - Processing train_val_split_2 train snapchat.com
2025-01-08 10:28:25,570 - root - INFO - Processing train_val_split_2 train qcloud.com
2025-01-08 10:28:26,012 - root - INFO - Processing train_val_split_2 train 51.la
2025-01-08 10:28:26,561 - root - INFO - Processing train_val_split_2 train arxiv.org
2025-01-08 10:28:27,052 - root - INFO - Processing train_val_split_2 train onlinedown.net
2025-01-08 10:28:27,572 - root - INFO - Processing train_val_split_2 train gram

{'IP_chksum', 'TCP_echo_SAck', 'IP_frag', 'TCP_WScale', 'TCP_reserved', 'TCP_echo_Timestamp', 'TL_chksum', 'TCP_flags', 'TCP_ack', 'IP_version', 'TCP_urgptr', 'TCP_Timestamp', 'TCP_MSS', 'TCP_SAck', 'IP_flags', 'IP_proto', 'TCP_SAckOK', 'TCP_EOL', 'TCP_seq', 'IP_tos', 'TCP_window', 'IP_len', 'IP_ttl', 'IP_id', 'IP_ihl', 'TCP_dataofs'}
{'', 'DF'}
{'FA', 'S', 'FPA', 'PA', 'SA', 'R', 'A', 'RA'}



In [109]:
# convert the flag to integer

ip_flags_map = {}
tcp_flags_map = {}
flow_fields_set = set()

for id, ip_flag in enumerate(ip_flags_set):
    if ip_flag != '':
        ip_flags_map[ip_flag] = id + 1

for id, tcp_flag in enumerate(tcp_flags_set):
    if tcp_flag != '':
        tcp_flags_map[tcp_flag] = id + 1

for id in range(num_of_pkts):
    for field in field_set:
        flow_fields_set.add(f'{field}_{id}')
    if id < num_of_pkts - 1:
        flow_fields_set.add(f'interval_{id}_{id+1}')

print(ip_flags_map)
print(tcp_flags_map)
print(flow_fields_set)

{'DF': 2}
{'FA': 1, 'S': 2, 'FPA': 3, 'PA': 4, 'SA': 5, 'R': 6, 'A': 7, 'RA': 8}
{'interval_4_5', 'TCP_Timestamp_1', 'IP_ihl_6', 'IP_id_5', 'TCP_SAckOK_2', 'TCP_Timestamp_4', 'IP_proto_6', 'TCP_flags_2', 'TCP_MSS_5', 'IP_chksum_3', 'TCP_reserved_6', 'IP_tos_2', 'IP_id_2', 'TCP_reserved_3', 'IP_proto_5', 'TCP_seq_6', 'IP_ttl_0', 'TL_chksum_1', 'TCP_EOL_2', 'TCP_echo_SAck_5', 'TCP_flags_4', 'TCP_MSS_0', 'TCP_SAck_1', 'TCP_reserved_0', 'TCP_SAck_5', 'IP_proto_0', 'IP_ttl_1', 'TCP_window_4', 'TCP_EOL_1', 'IP_version_1', 'TCP_seq_1', 'IP_id_4', 'TCP_reserved_1', 'TL_chksum_2', 'IP_frag_1', 'TCP_MSS_1', 'IP_frag_2', 'TCP_seq_4', 'IP_chksum_1', 'IP_flags_2', 'TCP_echo_SAck_6', 'IP_tos_4', 'TCP_MSS_6', 'TCP_flags_0', 'IP_version_0', 'IP_id_0', 'IP_len_0', 'TCP_seq_5', 'interval_3_4', 'IP_ihl_0', 'TCP_flags_3', 'TCP_SAck_4', 'TCP_reserved_5', 'IP_flags_6', 'TCP_echo_Timestamp_4', 'IP_len_4', 'TCP_window_6', 'IP_ttl_2', 'TCP_seq_3', 'IP_version_5', 'IP_version_6', 'TCP_SAck_6', 'TCP_SAckOK_3', '

In [110]:
field_set = {'IP_chksum', 'TCP_echo_SAck', 'IP_frag', 'TCP_WScale', 'TCP_reserved', 'TCP_echo_Timestamp', 'TL_chksum', 'TCP_flags', 'TCP_ack', 'IP_version', 'TCP_urgptr', 'TCP_Timestamp', 'TCP_MSS', 'TCP_SAck', 'IP_flags', 'IP_proto', 'TCP_SAckOK', 'TCP_EOL', 'TCP_seq', 'IP_tos', 'TCP_window', 'IP_len', 'IP_ttl', 'IP_id', 'IP_ihl', 'TCP_dataofs'}
ip_flags_map = {'DF': 1}
{'FA': 1, 'S': 2, 'FPA': 3, 'PA': 4, 'SA': 5, 'R': 6, 'A': 7, 'RA': 8}
ip_proto_map = {'tcp': 1, 'udp': 2, 'TCP': 1, 'UDP': 2}


In [111]:
pass_fields = ['src', 'dst', 'sport', 'dport']
map_features = {'TCP_flags', 'IP_flags', 'IP_proto'}
hex_features = {'IP_tos', 'IP_chksum', 'TL_chksum', 'UDP_chksum', 'TCP_chksum'}
equal_features = {'TCP_chksum', 'UDP_chksum', 'IPV6_tc', 'IPv6_version', 'IPv6_plen', 'IPv6_nh', 'IPv6_hlim'}

def hex2int(value):
    return int(value, 16)

def special_rules(field, value):
    if not value: # convert '' to 0
        return 0
    elif field == 'TCP_flags':     # convert flags to int
        return tcp_flags_map[value]
    elif field == 'IP_flags':     # convert flags to int
        return ip_flags_map[value]
    elif field == 'IP_proto': # convert protocol to int
        return ip_proto_map[value]
    
def equal_rules(field): # convert ipv6 fields name to ipv4 fields name
    if field == 'TCP_chksum' or field == 'UDP_chksum':
        return 'TL_chksum'
    elif field == 'IPv6_tc':
        return 'IP_tos'
    elif field == 'IPv6_version':
        return 'IP_version'
    elif field == 'IPv6_plen':
        return 'IP_len'
    elif field == 'IPv6_nh':
        return 'IP_proto'
    elif field == 'IPv6_hlim':
        return 'IP_ttl'
     
def packet2features(packet):
    protocol = None
    packet_key_val = {}
    for line in packet.show2(dump=True).split('\n'): 
        layer, _, key, val = get_key_val(line)
        if layer is not None:
                protocol = layer
        else:
            if key not in pass_fields and key is not None:   
                if key == 'options': # options is a list, there are two new fields
                    for option in eval(val):
                            if option[0] == 'NOP': # ignore NOP (No Operation)
                                continue
                            elif option[0] == 'Timestamp': # Timestamp and Echo are a pair, Echo is the second field
                                packet_key_val[f'{protocol}_{option[0]}'] = option[1][0]
                                packet_key_val[f'{protocol}_echo_{option[0]}'] = option[1][1]
                            elif option[0] == 'SAck': # The format of SAck is smiliar with timestamp and they are a pair, Echo is the second field
                                packet_key_val[f'{protocol}_{option[0]}'] = option[1][0]
                                packet_key_val[f'{protocol}_echo_{option[0]}'] = option[1][1]
                            elif option[0] == 'SAckOK': 
                                packet_key_val[f'{protocol}_{option[0]}'] = int(val == 1)
                            elif option[0] == 'EOL': # EOL is a flag, always 0
                                packet_key_val[f'{protocol}_{option[0]}'] = 0
                            else:
                                packet_key_val[f'{protocol}_{option[0]}'] = option[1]
                    continue   

                if f'{protocol}_{key}' in equal_features: # special rules for some fields
                    field = equal_rules(f'{protocol}_{key}')
                else:
                    field = f'{protocol}_{key}'


                if field in map_features: # special rules for some fields
                    packet_key_val[field] = special_rules(field, val)
                elif field in hex_features:
                    packet_key_val[field] = hex2int(val)
                else:
                    packet_key_val[field] = int(val) 
                
    return packet_key_val

def write2csv(type, data, label):
    os.makedirs(f'{output_path}/unclean', exist_ok=True)
    file_path = f'{output_path}/unclean/{type}.csv'

    if not os.path.isfile(file_path):
        with open(file_path, mode='w', newline='') as file:
            writer = csv.writer(file)
            print(list(flow_fields_set) + ['class'])
            writer.writerow(list(flow_fields_set) + ['class'])

    with open(file_path, mode='a', newline='') as file:
        writer = csv.writer(file)
        writer.writerow(data + [label])

def packet2flow_field(packet, id, flow_key_val):
    packet = clean_packet(packet)
    packet_key_val = packet2features(packet)

    for key, value in packet_key_val.items():
        flow_key_val[f'{key}_{id}'] = value

    return flow_key_val

def get_flow_interval(packet_timestamps, flow_key_val):
    for id in range(len(packet_timestamps) - 1):
        flow_key_val[f'interval_{id}_{id+1}'] = packet_timestamps[id+1] - packet_timestamps[id]
    return flow_key_val

In [112]:
# file/test/class/flow.pcap or file/train_val_split_0/{train/val}/class/flow.pcap

for type in os.listdir(dataset_path):
    logger.info(f'Processing {type}')

    if 'test' == type:
        for class_id, class_name in enumerate(os.listdir(f'{dataset_path}/{type}')):
            logger.info(f'Processing {type} {class_name}')
            for file_name in os.listdir(f'{dataset_path}/{type}/{class_name}'):
                flow_key_val = {}
                packet_timestamps = []
                with scapy.PcapReader(f'{dataset_path}/{type}/{class_name}/{file_name}') as packets:
                    for id, packet in enumerate(packets):
                        if id >= num_of_pkts:
                            break
                        flow_key_val = packet2flow_field(packet, id, flow_key_val)
                        packet_timestamps.append(packet.time)

                flow_key_val = get_flow_interval(packet_timestamps, flow_key_val)
                flow_array = [flow_key_val.get(key, 0) for key in flow_fields_set]
                write2csv(type, flow_array, class_name)
    else:
        for class_id, folder in enumerate(os.listdir(f'{dataset_path}/{type}')):
            logger.info(f'Processing {type} {folder}')
            for class_name in os.listdir(f'{dataset_path}/{type}/{folder}'):
                for file_name in os.listdir(f'{dataset_path}/{type}/{folder}/{class_name}'):
                    flow_key_val = {}
                    packet_timestamps = []
                    with scapy.PcapReader(f'{dataset_path}/{type}/{folder}/{class_name}/{file_name}') as packets:
                        for id, packet in enumerate(packets):
                            if id >= num_of_pkts:
                                break
                            flow_key_val = packet2flow_field(packet, id, flow_key_val)
                            packet_timestamps.append(packet.time)

                    flow_key_val = get_flow_interval(packet_timestamps, flow_key_val)
                    flow_array = [flow_key_val.get(key, 0) for key in flow_fields_set]
                    write2csv(f"{type}/{folder}", flow_array, class_name)
 
logger.info('Finished')  

2025-01-08 10:33:55,919 - root - INFO - Processing train_val_split_2
2025-01-08 10:33:55,920 - root - INFO - Processing train_val_split_2 train
2025-01-08 10:34:28,992 - root - INFO - Processing train_val_split_2 val
2025-01-08 10:34:45,523 - root - INFO - Processing train_val_split_0
2025-01-08 10:34:45,526 - root - INFO - Processing train_val_split_0 train
2025-01-08 10:35:18,692 - root - INFO - Processing train_val_split_0 val
2025-01-08 10:35:35,630 - root - INFO - Processing test
2025-01-08 10:35:35,631 - root - INFO - Processing test azureedge.net
2025-01-08 10:35:36,546 - root - INFO - Processing test hubspot.com
2025-01-08 10:35:37,022 - root - INFO - Processing test pinduoduo.com
2025-01-08 10:35:37,604 - root - INFO - Processing test jd.com
2025-01-08 10:35:38,189 - root - INFO - Processing test snapchat.com
2025-01-08 10:35:38,785 - root - INFO - Processing test qcloud.com
2025-01-08 10:35:39,381 - root - INFO - Processing test 51.la
2025-01-08 10:35:39,543 - root - INFO - P

# for data clean

normalization each row

**train and test should use same distribution of normalization**

### For USTCTFC2016
field_set = {'TCP_window', 'TCP_chksum', 'TCP_reserved', 'TCP_ack', 'TCP_echo_SAck', 'TCP_flags', 'TCP_SAckOK', 'IP_id', 'IP_proto', 'IP_len', 'TCP_seq', 'TCP_WScale', 'TCP_EOL', 'UDP_chksum', 'UDP_len', 'TCP_dataofs', 'IP_version', 'TCP_SAck', 'TCP_MSS', 'IP_chksum', 'IP_flags', 'TCP_urgptr', 'IP_frag', 'IP_ttl', 'TCP_echo_Timestamp', 'TCP_Timestamp', 'IP_ihl', 'IP_tos'}

ip_flags_map = {'DF': 1}
tcp_flags_map = {'PA': 1, 'S': 2, 'FA': 3, 'A': 4, 'FPA': 5, 'R': 6, 'SA': 7, 'RA': 8}
ip_proto_map = {'tcp': 1, 'udp': 2}

normalize_set = {'TCP_window', 'TCP_chksum', 'TCP_ack', 'TCP_echo_SAck', 'IP_id', 'IP_len', 'TCP_seq', 'UDP_chksum', 'UDP_len','TCP_SAck', 'TCP_MSS', 'IP_chksum', 'TCP_echo_Timestamp', 'TCP_Timestamp', 'IP_tos'}



In [9]:
import pandas as pd
import numpy as np

normalize_set = {'TCP_window', 'TCP_chksum', 'TCP_ack', 'TCP_echo_SAck', 'IP_id', 'IP_len', 'TCP_seq', 'UDP_chksum', 'UDP_len','TCP_SAck', 'TCP_MSS', 'IP_chksum', 'TCP_echo_Timestamp', 'TCP_Timestamp', 'IP_tos'}
types = ['train_val_split_0', 'train_val_split_1', 'train_val_split_2', 'test.csv']

def log_normalize(value):
    return np.log1p(value)

for type in types:
    if type == 'test.csv':
        df = pd.read_csv(f'{output_path}/unclean/{type}')
        for field in normalize_set:
            for column in df.columns:
                if field in column:
                    df[column] = df[column].apply(log_normalize)
        df.to_csv(f'{output_path}/normalized/{type}', index=False)
    else:
        for file in os.listdir(f'{output_path}/unclean/{type}'):
            df = pd.read_csv(f'{output_path}/unclean/{type}/{file}')
            for field in normalize_set:
                for column in df.columns:
                    if field in column:
                        df[column] = df[column].apply(log_normalize)
            df.to_csv(f'{output_path}/normalized/{type}/{file}', index=False)

train_val_split_0 train.csv
train_val_split_0 val.csv
train_val_split_1 train.csv
train_val_split_1 val.csv
train_val_split_2 train.csv
train_val_split_2 val.csv


### For TLS
field_set = {'TCP_window', 'TCP_chksum', 'TCP_reserved', 'TCP_ack', 'TCP_echo_SAck', 'TCP_flags', 'IP_id', 'IP_proto', 'IP_len', 'TCP_seq', 'TCP_dataofs', 'IP_version', 'TCP_SAck', 'IP_chksum', 'TCP_urgptr', 'IP_flags', 'IP_frag', 'IP_ttl', 'TCP_echo_Timestamp', 'TCP_Timestamp', 'IP_ihl', 'IP_tos'}

ip_flags_map = {'DF': 1}
tcp_flags_map = {'PA': 1, 'FA': 2, 'A': 3, 'FPA': 4, 'R': 5, 'PAC': 6, 'RA': 7}
ip_proto_map = {'tcp': 1, 'udp': 2}

normalize_set = {'TCP_window', 'TCP_chksum', 'TCP_ack', 'TCP_echo_SAck', 'IP_id', 'IP_len', 'TCP_seq', 'TCP_SAck', 'IP_chksum','TCP_echo_Timestamp', 'TCP_Timestamp', 'IP_tos'}

In [104]:
import pandas as pd
import numpy as np

normalize_set = {'TCP_window', 'TCP_chksum', 'TCP_ack', 'TCP_echo_SAck', 'IP_id', 'IP_len', 'TCP_seq', 'TCP_SAck', 'IP_chksum','TCP_echo_Timestamp', 'TCP_Timestamp', 'IP_tos'}
types = ['train_val_split_0', 'train_val_split_1', 'train_val_split_2', 'test.csv']

def log_normalize(value):
    return np.log1p(value)

for type in types:
    if type == 'test.csv':
        df = pd.read_csv(f'{output_path}/unclean/{type}')
        for field in normalize_set:
            for column in df.columns:
                if field in column:
                    df[column] = df[column].apply(log_normalize)
        df.to_csv(f'{output_path}/normalized/{type}', index=False)
    else:
        for file in os.listdir(f'{output_path}/unclean/{type}'):
            print(type, file)
            df = pd.read_csv(f'{output_path}/unclean/{type}/{file}')

            for field in normalize_set:
                for column in df.columns:
                    if field in column:
                        df[column] = df[column].apply(log_normalize)
            df.to_csv(f'{output_path}/normalized/{type}/{file}', index=False)

train_val_split_0 train.csv
train_val_split_0 val.csv
train_val_split_1 train.csv
train_val_split_1 val.csv
train_val_split_2 train.csv
train_val_split_2 val.csv


# For VPN
field_set = {'TCP_MSS', 'TCP_seq', 'IP_id', 'IP_chksum', 'TCP_urgptr', 'UDP_len', 'TCP_echo_SAck', 'UDP_chksum', 'TCP_echo_Timestamp', 'IP_version', 'TCP_window', 'IP_len', 'IP_flags', 'TCP_SAck', 'TCP_WScale', 'TCP_EOL', 'TCP_ack', 'TCP_chksum', 'IP_frag', 'TCP_flags', 'IP_ttl', 'IP_ihl', 'TCP_dataofs', 'TCP_Timestamp', 'IP_proto', 'IP_tos', 'TCP_reserved', 'TCP_SAckOK'}

ip_flags_map = {'DF': 1}
tcp_flags_map = {'R': 1, 'SA': 2, 'RA': 3, 'S': 4, 'PA': 5, 'FPA': 6, 'A': 7, 'FA': 8}
ip_proto_map = {'tcp': 1, 'udp': 2}

 normalize_set = {'TCP_MSS', 'TCP_seq', 'IP_id', 'IP_chksum','UDP_len', 'TCP_echo_SAck', 'UDP_chksum', 'TCP_echo_Timestamp','TCP_window', 'IP_len',  'TCP_SAck','TCP_ack', 'TCP_chksum', 'TCP_Timestamp', 'IP_tos'}

In [88]:
import pandas as pd
import numpy as np

normalize_set = {'TL_chksum', 'TCP_MSS', 'TCP_seq', 'IP_id', 'IP_chksum', 'UDP_len', 'TCP_echo_SAck', 'UDP_chksum', 'TCP_echo_Timestamp','TCP_window', 'IP_len',  'TCP_SAck','TCP_ack', 'TCP_chksum', 'TCP_Timestamp', 'IP_tos'}
types = ['train_val_split_0', 'train_val_split_1', 'train_val_split_2', 'test.csv']

def log_normalize(value):
    return np.log1p(value)

for type in types:
    if type == 'test.csv':
        df = pd.read_csv(f'{output_path}/unclean/{type}')
        for field in normalize_set:
            for column in df.columns:
                if field in column:
                    df[column] = df[column].apply(log_normalize)
        df.to_csv(f'{output_path}/normalized/{type}', index=False)
    else:
        for file in os.listdir(f'{output_path}/unclean/{type}'):
            df = pd.read_csv(f'{output_path}/unclean/{type}/{file}')
            for field in normalize_set:
                for column in df.columns:
                    if field in column:
                        df[column] = df[column].apply(log_normalize)
            df.to_csv(f'{output_path}/normalized/{type}/{file}', index=False)
