In [36]:
# process the data under packet-level classification

import os
import scapy.all as scapy
from utils import *
from scapy.layers.tls.handshake import TLSClientHello,TLSServerHello
import binascii

# Switch the kernel's working directory; every relative path used by later
# cells is resolved against this directory from here on.
# NOTE(review): the path is relative, so running this cell a second time in
# the same kernel resolves it against the already-changed directory --
# confirm the intended starting directory (or use an absolute path).
# NOTE(review): `from utils import *` above executes before this chdir, so
# `utils` must already be importable at that point -- confirm the order.
os.chdir('debunk_representation/code/TrafficFormer/data_generation')

In [31]:
def clean_packet(packet):
    """Drop the Ethernet header from a packet, if it has one.

    Returns the payload carried by the Ethernet layer when that layer is
    present; any other packet is handed back unchanged.
    """
    has_ethernet = packet.haslayer(scapy.Ether)
    return packet[scapy.Ether].payload if has_ethernet else packet

def random_ip_port(packet):
    """Overwrite a packet's addresses and ports with random values, in place.

    Network layer: an IPv4 layer gets fresh ``random_ipv4()`` source and
    destination addresses; otherwise an IPv6 layer gets ``random_ipv6()``
    ones. Transport layer: the UDP layer is checked first, then TCP, and the
    first one found gets two ``random_field(16)`` ports. Layers that are
    absent are left alone. The same packet object is returned.
    """
    # --- addresses -------------------------------------------------------
    if packet.haslayer(scapy.IP):
        ipv4_layer = packet[scapy.IP]
        ipv4_layer.src = random_ipv4()
        ipv4_layer.dst = random_ipv4()
    elif packet.haslayer('IPv6'):
        ipv6_layer = packet[scapy.IPv6]
        ipv6_layer.src = random_ipv6()
        ipv6_layer.dst = random_ipv6()

    # --- ports (UDP takes precedence over TCP) ----------------------------
    for transport in (scapy.UDP, scapy.TCP):
        if packet.haslayer(transport):
            segment = packet[transport]
            segment.sport = random_field(16)
            segment.dport = random_field(16)
            break

    return packet


def random_tcp_ts_option(packet):
    """Randomise the TCP Timestamp option of a single packet, in place.

    Every ``'Timestamp'`` entry in the TCP options is rewritten:

    * the first value is moved onto a random base, keeping its distance to
      the first timestamp value seen in this packet;
    * the second (echo) value is treated the same way against its own random
      base, except that an echo of ``0`` is kept as ``0``.

    Results are reduced modulo 2**32 so they always fit the 32-bit option
    fields. All other options are preserved. Packets without a TCP layer are
    returned untouched. The same packet object is returned.

    State is per packet: nothing is remembered between calls, so there is no
    "other direction" to track here.
    """
    # Both bases are drawn unconditionally (even for non-TCP packets) so the
    # random stream is consumed the same way for every packet.
    random_src_ts = random_field(32)
    random_dst_ts = random_field(32)

    if not packet.haslayer(scapy.TCP):
        return packet

    ts_modulus = 2 ** 32  # timestamp option values are 32-bit
    src_ts = None         # first timestamp value seen in this packet
    dst_ts = None         # first non-zero echo value seen in this packet

    tcp_options = [list(option) for option in packet['TCP'].options]
    for option in tcp_options:
        if option[0] != 'Timestamp':
            continue

        ts_val, ts_ecr = option[1][0], option[1][1]
        if src_ts is None:
            src_ts = ts_val
        new_val = (random_src_ts + ts_val - src_ts) % ts_modulus

        if ts_ecr == 0:
            # No echo present: keep it at zero.
            option[1] = (new_val, 0)
            continue

        # The echo base is fixed by the first non-zero echo, which may come
        # after an option whose echo was zero.
        if dst_ts is None:
            dst_ts = ts_ecr
        option[1] = (new_val, (random_dst_ts + ts_ecr - dst_ts) % ts_modulus)

    packet['TCP'].options = [tuple(option) for option in tcp_options]
    return packet

def random_tls_randomtime(packet):
    """Randomise ``gmt_unix_time`` in TLS hello messages, in place.

    The ClientHello layer is handled first, then the ServerHello layer; each
    one that is present gets its own ``random_field(32)`` value. The same
    packet object is returned.
    """
    for hello_layer in (TLSClientHello, TLSServerHello):
        if packet.haslayer(hello_layer):
            packet[hello_layer].gmt_unix_time = random_field(32)

    return packet


In [32]:
def _overwrite_header_bytes(tokens, first_byte, new_hex):
    """Splice the bytes in ``new_hex`` into a bigram token list, in place.

    The index arithmetic treats ``tokens[k]`` as the hex of packet bytes
    ``k`` and ``k + 1``, so replacing the bytes that start at ``first_byte``
    also touches the second half of the token before the field and the first
    half of the token after it.
    """
    # Token just before the field: keep its first byte, replace its second.
    tokens[first_byte - 1] = tokens[first_byte - 1][:2] + new_hex[:2]
    pos = first_byte
    # The last element of the split is dropped, as the calling code has always
    # done (presumably an empty string after a trailing separator).
    for bigram in bigram_generation(new_hex, token_len=len(new_hex) / 2).split(" ")[:-1]:
        tokens[pos] = bigram
        pos += 1
    # Token starting at the field's last byte: replace its first byte only.
    tokens[pos] = new_hex[-2:] + tokens[pos][2:4]


def _shift_ipid(tokens, original_base, random_base):
    """Move the 16-bit value at byte 4 from ``original_base`` onto ``random_base``."""
    new_id = hex((int(tokens[4], 16) - original_base + random_base) % (2 ** 16))[2:].zfill(4)
    tokens[4] = new_id
    tokens[3] = tokens[3][:2] + new_id[:2]
    tokens[5] = new_id[2:] + tokens[5][2:]


def _read_u32(tokens, first_byte):
    """Read the 4 bytes starting at ``first_byte`` as an integer."""
    return int(tokens[first_byte] + tokens[first_byte + 2], 16)


def _hex_u32(value):
    """Format ``value`` modulo 2**32 as 8 hex digits."""
    return hex(value % (2 ** 32))[2:].zfill(8)


def _ip_version_and_proto(tokens):
    """Classify a packet from its first tokens.

    Returns ``(ip_version, proto)`` where ``ip_version`` is 4, 6 or None and
    ``proto`` is 6 (byte ``06``), 17 (byte ``11``) or None.
    """
    if not tokens or not tokens[0]:
        return None, None
    if tokens[0][0] == "4":
        ip_version, proto_index = 4, 9
    elif tokens[0][0] == "6":
        ip_version, proto_index = 6, 6
    else:
        return None, None

    proto = None
    if len(tokens) > proto_index:
        proto_byte = tokens[proto_index][:2]
        if proto_byte == "06":
            proto = 6
        elif proto_byte == "11":
            proto = 17
    return ip_version, proto


def enhance_based_tsv(path,filename,new_file_prefix,enhance_factor=1):
    """Augment a TSV of tokenised packets by re-randomising header fields.

    path: directory of the TSV; it is concatenated directly with ``filename``
        and passed on to ``write_dataset_tsv``.
    filename: name of the input TSV. Its header row must contain the columns
        ``label`` and ``text_a``.
    new_file_prefix: forwarded unchanged to ``write_dataset_tsv``.
    enhance_factor: number of augmented copies generated per input row.

    Each ``text_a`` value is a sequence of ``[SEP]``-separated packets, each a
    whitespace-separated list of 4-hex-digit tokens. For every copy, fresh
    random addresses, ports, an IP ID base and (for protocol 6) sequence and
    acknowledgement bases are drawn. Packets whose tokens 12..15 equal those
    of the first packet are treated as "forward", all others as "backward"
    (addresses and ports swapped). IP ID, sequence and acknowledgement values
    keep their offset relative to the first packet of their direction.

    Rows that cannot be augmented are reported with ``print`` and skipped:
    rows whose first packet is not recognised as IPv4/IPv6 with protocol byte
    06 or 11, IPv6 rows (not implemented), and rows whose first packet has
    fewer than 31 tokens.

    The collected rows are shuffled and passed to ``write_dataset_tsv``.
    Returns None.
    """
    dataset  = []
    columns = {}
    with open(path+filename, mode="r", encoding="utf-8") as f:
        for line_id, line in enumerate(f):
            if line_id == 0:
                for i, column_name in enumerate(line.strip().split("\t")):
                    columns[column_name] = i
                continue
            if not line.strip():
                # Tolerate blank lines, e.g. at the end of the file.
                continue
            # Remove only the line terminator; the final row may not have one.
            line = line.rstrip("\n").split("\t")
            tgt = int(line[columns["label"]])
            text_a = line[columns["text_a"]]
            text_list = text_a.split("[SEP]")[1:]

            # Whitespace splitting keeps every token whether or not a packet
            # is followed by a trailing space.
            first_tokens = text_list[0].split() if text_list else []
            IP, proto = _ip_version_and_proto(first_tokens)
            if IP is None or proto is None:
                # Unsupported row: report it and move on to the next one.
                print(line)
                continue

            for _ in range(enhance_factor):
                # Byte offsets -- IPID:4, src:12, dst:16, sport:20, dport:22, seq:24, ack:28
                # Token lists are rebuilt for every copy because they are
                # modified in place below.
                datagramss = [pac.split() for pac in text_list]
                first_forward_datagrams = datagramss[0]
                src = first_forward_datagrams[12:16]
                first_backward_datagrams = next(
                    (datagrams for datagrams in datagramss if datagrams[12:16] != src), None)

                if IP==4:
                    rsrc = random_field(32)
                    rdst = random_field(32)
                    rsrcid = random_field(16)
                    rdstid = random_field(16)
                elif IP==6:
                    print("IPV6 is waiting to process")
                    continue
                rsrcp = random_field(16)
                rdstp = random_field(16)
                if proto==6:
                    rsrcseq = random_field(32)
                    rdstseq = random_field(32)

                if len(first_forward_datagrams) < 31:
                    print(first_forward_datagrams)
                    continue

                forward_4tstr = hex(rsrc)[2:].zfill(8) + hex(rdst)[2:].zfill(8) + hex(rsrcp)[2:].zfill(4) + hex(rdstp)[2:].zfill(4)
                backward_4tstr = hex(rdst)[2:].zfill(8) + hex(rsrc)[2:].zfill(8) + hex(rdstp)[2:].zfill(4) + hex(rsrcp)[2:].zfill(4)

                srcipid = int(first_forward_datagrams[4], 16)
                dstipid = None
                if first_backward_datagrams is not None:
                    dstipid = int(first_backward_datagrams[4], 16)

                srcseq = _read_u32(first_forward_datagrams, 24)
                srcack = _read_u32(first_forward_datagrams, 28)
                # Reset for every copy so that no value from a previous row or
                # copy can leak in when the base cannot be determined.
                dstseq = None
                if first_backward_datagrams is not None:
                    dstseq = _read_u32(first_backward_datagrams, 24)
                elif srcack!=0:
                    dstseq = srcack
                else:
                    print("cant process dstseq...")

                for tokens in datagramss:
                    if tokens[12:16]==src: # forward
                        _overwrite_header_bytes(tokens, 12, forward_4tstr)
                        # An IP ID base of 0 leaves the IP ID untouched.
                        if IP==4 and srcipid != 0:
                            _shift_ipid(tokens, srcipid, rsrcid)
                        if proto==6:
                            _overwrite_header_bytes(
                                tokens, 24, _hex_u32(rsrcseq + _read_u32(tokens, 24) - srcseq))
                            # The ack is rewritten only when it is non-zero
                            # and a backward sequence base is known.
                            ack = _read_u32(tokens, 28)
                            if ack!=0 and dstseq:
                                _overwrite_header_bytes(
                                    tokens, 28, _hex_u32(rdstseq + ack - dstseq))
                    else: # backward
                        _overwrite_header_bytes(tokens, 12, backward_4tstr)
                        if IP==4 and dstipid != 0:
                            _shift_ipid(tokens, dstipid, rdstid)
                        if proto==6:
                            if dstseq:
                                _overwrite_header_bytes(
                                    tokens, 24, _hex_u32(rdstseq + _read_u32(tokens, 24) - dstseq))
                            ack = _read_u32(tokens, 28)
                            if ack!=0:
                                _overwrite_header_bytes(
                                    tokens, 28, _hex_u32(rsrcseq + ack - srcseq))

                # "[SEP] tok tok ... [SEP] tok ..." with single spaces.
                newtext_a = " ".join(" ".join(["[SEP]"] + tokens) for tokens in datagramss)
                dataset.append([newtext_a,tgt])

    dataset = pd.DataFrame(dataset,columns=['datagram','label'])
    dataset = dataset.sample(frac = 1)
    write_dataset_tsv(dataset['datagram'], dataset['label'], path, new_file_prefix)

In [33]:
def save_to_tsv(dataset_file, output_path, type):
    """Write rows to ``<output_path>/<type>.tsv`` as tab-separated values.

    dataset_file: iterable of rows, each an iterable of cell values.
    output_path: directory that receives the file.
    type: base name of the file, without the ``.tsv`` extension.
    """
    target = f"{output_path}/{type}.tsv"
    with open(target, 'w', newline='') as handle:
        csv.writer(handle, delimiter='\t').writerows(dataset_file)

def process_folder(folder_path, split, max_bytes=64):
    """Turn every pcap in a folder into labelled, tokenised packet rows.

    folder_path: directory whose entries are scanned; only names containing
        ``'pcap'`` are read.
    split: not used by this function; kept so existing callers keep working.
    max_bytes: number of leading bytes of each packet that are tokenised.

    Files are visited in sorted name order and each pcap file gets the next
    integer label, starting at 0, so the same set of file names yields the
    same labels in every folder. NOTE(review): the mapping still shifts if a
    pcap is missing from one folder -- confirm all splits hold the same files.

    Each packet is passed through ``random_ip_port``,
    ``random_tcp_ts_option``, ``random_tls_randomtime`` and ``clean_packet``
    (in that order) before it is serialised.

    Returns a list of rows: a ``["label", "text_a"]`` header followed by one
    ``[label, "[SEP] <tokens>"]`` row per packet.
    """
    label = 0
    dataset_file_list = [["label", "text_a"]]

    # os.listdir returns names in arbitrary order; sorting makes the label
    # assignment reproducible.
    for file in sorted(os.listdir(folder_path)):
        if 'pcap' not in file:
            continue
        print(f"Processing {folder_path} {file}")
        with scapy.PcapReader(os.path.join(folder_path, file)) as pcap_reader:
            for pkt in pcap_reader:
                pkt = random_ip_port(pkt)
                pkt = random_tcp_ts_option(pkt)
                pkt = random_tls_randomtime(pkt)
                pkt = clean_packet(pkt)

                # Hex of the first max_bytes bytes of the serialised packet.
                pkt_string = bytes(pkt)[:max_bytes].hex()
                pkt_string = bigram_generation(pkt_string, token_len=len(pkt_string), flag = True)
                pkt_string = "[SEP] " + pkt_string

                dataset_file_list.append([label, pkt_string])
        # One label per pcap file, including files that contain no packets.
        label += 1

    return dataset_file_list

In [39]:
# Experiment tag and task name for the packet-level conversion below.
# NOTE(review): `exp` is not used in either path of this cell, and the output
# folder is spelled `data_polished` -- confirm that is intended.
exp, task = 'polishedns', 'vpn-app'

# Input pcaps and output TSV root, both relative to the working directory.
data_path, output_path = (
    f'debunk_representation/data/{task}',
    f'debunk_representation/code/TrafficFormer/data_polished/{task}',
)

In [40]:
# Convert the pcaps of `train_val_split_0` into one TSV per subset folder.
# The loop variable is called `subset` rather than `type` so that the builtin
# `type` is not rebound for every later cell of the notebook.
for split_folder in os.listdir(data_path):
    # The test split was produced with the same two calls:
    # if split_folder == 'test':
    #     print("Processing test data...")
    #     dataset_file_list = process_folder(os.path.join(data_path, split_folder), split_folder)
    #     os.makedirs(f"{output_path}", exist_ok=True)
    #     save_to_tsv(dataset_file_list, output_path, split_folder)
    if split_folder == 'train_val_split_0':
        split_input_dir = os.path.join(data_path, split_folder)
        split_output_dir = f"{output_path}/{split_folder}"
        # Create the output folder before the (slow) pcap processing starts.
        os.makedirs(split_output_dir, exist_ok=True)
        for subset in os.listdir(split_input_dir):
            print(f"Processing {split_folder} {subset} data...")
            dataset_file_list = process_folder(os.path.join(split_input_dir, subset), subset)
            save_to_tsv(dataset_file_list, split_output_dir, subset)

In [124]:
# Augment the training TSV of every non-test split of this experiment.
# NOTE(review): `data_path` is reassigned here but not read in this cell, and
# `output_path` points at a different folder than the conversion cell above
# wrote to -- confirm which step produces `data_{exp}`.
exp, task = 'polishednsLen811', 'vpn-app'

data_path = f'debunk_representation/pipeline/{exp}/{task}'
output_path = f'debunk_representation/code/TrafficFormer/data_{exp}/{task}'

for split_folder in os.listdir(output_path):
    # Anything with 'test' in its name is left as it is.
    if 'test' not in split_folder:
        enhance_based_tsv(f"{output_path}/{split_folder}/", 'train.tsv', "train_enhance.tsv", 5)