In [1]:
import os
import json
from tqdm import tqdm
from pprint import pprint
from scapy.utils import PcapReader, hexdump
from scapy.layers.inet import IP, TCP, UDP

In [2]:
# Build {capture file path: traffic category} for every capture in the raw-data folder.
rowdata_folder = 'rawData'
path_category_map = {}

# os.listdir() returns entries in arbitrary order; sorting keeps the
# processing order (and therefore the generated files) reproducible.
for filename in sorted(os.listdir(rowdata_folder)):
    filepath = os.path.join(rowdata_folder, filename)
    # Sub-directories (e.g. notebook checkpoint folders) cannot be opened as
    # captures, so only regular files are mapped.
    if not os.path.isfile(filepath):
        continue
    # The category is the file name up to the first '-' or '.',
    # e.g. "Weibo-3.pcap" -> "Weibo".
    category = filename.split('-')[0].split('.')[0]
    path_category_map[filepath] = category

pprint(path_category_map)

{'rawData\\BitTorrent.pcap': 'BitTorrent',
 'rawData\\FTP.pcap': 'FTP',
 'rawData\\Facetime.pcap': 'Facetime',
 'rawData\\Gmail.pcap': 'Gmail',
 'rawData\\MySQL.pcap': 'MySQL',
 'rawData\\Outlook.pcap': 'Outlook',
 'rawData\\SMB-1.pcap': 'SMB',
 'rawData\\SMB-2.pcap': 'SMB',
 'rawData\\Skype.pcap': 'Skype',
 'rawData\\Weibo-1.pcap': 'Weibo',
 'rawData\\Weibo-2.pcap': 'Weibo',
 'rawData\\Weibo-3.pcap': 'Weibo',
 'rawData\\Weibo-4.pcap': 'Weibo',
 'rawData\\WorldOfWarcraft.pcap': 'WorldOfWarcraft'}


In [5]:
# Flow aggregation: packet payloads are grouped by category, then by five-tuple.
# Resulting structure:
# {
#     "category1": {
#         "five_tuple_1": ["payload1", "payload2", ...],
#         "five_tuple_2": ["payload1", "payload2", ...]
#     }
# }

def extract_payload(pkt):
    """Return the transport-layer payload of ``pkt`` as an uppercase hex string.

    Args:
        pkt: A packet supporting ``LAYER in pkt`` and ``pkt[LAYER].payload``
            for the ``UDP`` and ``TCP`` layers.

    Returns:
        Two uppercase hex digits per payload byte, concatenated without
        separators. An empty string is returned when the payload is empty or
        when the packet carries neither a UDP nor a TCP layer.
    """
    # UDP is checked first, so it wins if a packet somehow exposes both layers.
    if UDP in pkt:
        payload = pkt[UDP].payload
    elif TCP in pkt:
        payload = pkt[TCP].payload
    else:
        # Without this branch `payload` would be unbound and the function
        # would fail with UnboundLocalError.
        return ''

    # Serialise the payload directly instead of rendering a textual hexdump
    # and parsing the hex column back out of it; the resulting string is the
    # same, but no per-line string splitting is needed.
    return bytes(payload).hex().upper()


def generate_flow_payload(path_category_map, max_payload_len=256):
    """Group per-packet payloads by traffic category and five-tuple.

    Args:
        path_category_map: Mapping of capture file path -> category name.
        max_payload_len: Maximum number of hex characters kept per packet
            payload (two characters per byte). Defaults to 256.

    Returns:
        ``{category: {five_tuple: [payload_hex, ...]}}`` where ``five_tuple``
        is ``"src_ip|dst_ip|src_port|dst_port|proto"`` and ``proto`` is
        ``'tcp'`` or ``'udp'``. The key is directional: the two directions of
        a connection are stored under different keys. Packets with an empty
        payload contribute an empty string.

    Packets lacking an IPv4 layer, or lacking both TCP and UDP, are skipped.
    """
    flow_payload = {}
    for (filepath, category) in tqdm(path_category_map.items()):
        category_flows = flow_payload.setdefault(category, {})
        # The context manager closes the capture file even if a packet raises.
        with PcapReader(filepath) as pcap_reader:
            for pkt in pcap_reader:
                # pkt[IP] raises for packets without an IPv4 layer (e.g. IPv6),
                # which would otherwise abort the whole run.
                if IP not in pkt:
                    continue

                # UDP takes precedence over TCP, matching extract_payload().
                if UDP in pkt:
                    proto, transport = 'udp', pkt[UDP]
                elif TCP in pkt:
                    proto, transport = 'tcp', pkt[TCP]
                else:
                    continue

                five_tuple_str = '|'.join([
                    pkt[IP].src,
                    pkt[IP].dst,
                    str(transport.sport),
                    str(transport.dport),
                    proto,
                ])
                processed_payload = extract_payload(pkt)
                category_flows.setdefault(five_tuple_str, []).append(
                    processed_payload[:max_payload_len]
                )
    return flow_payload

def save_flow_payload(save_path, flow_payload):
    """Write ``flow_payload`` to ``save_path`` as JSON, replacing any existing file."""
    with open(save_path, 'w') as json_file:
        json.dump(flow_payload, json_file)

In [6]:
save_path = 'flow_payload.json'
# Parsing every capture takes a long time, so an existing result file is
# reused. Set this to True after changing the captures to rebuild it.
force_regenerate = False

if os.path.exists(save_path) and not force_regenerate:
    with open(save_path, 'r') as f:
        flow_payload = json.load(f)
else:
    flow_payload = generate_flow_payload(path_category_map)
    save_flow_payload(save_path, flow_payload)

100%|██████████| 14/14 [1:49:45<00:00, 470.42s/it]   


Dataset generation
- Bigram processing of the flow payloads
- Export to TSV files
- Train/test split ratio: 8:2

In [8]:
# Print "<category>|<number of distinct five-tuple flows>" for each category.
for key, flows in flow_payload.items():
    print(f"{key}|{len(flows)}")

BitTorrent|15000
Facetime|6000
FTP|202034
Gmail|17178
MySQL|172114
Outlook|14984
Skype|12000
SMB|77781
Weibo|79810
WorldOfWarcraft|15761


In [7]:
# Show the first (five_tuple, payload list) entry of the BitTorrent category.
bittorrent_flows = list(flow_payload['BitTorrent'].items())
print(bittorrent_flows[0])

('1.1.33.158|1.2.156.163|41319|443|tcp', ['170300057807090B36E9BEA600E6DFC59BD09C52754B56164C933C15D0ECA348229B36F55214F4FF5FFE2DFB595AB079887DFB96F4A4461252530B00F8A7B05CF3748FFED8BD8187A89A3184CFF195EC5EC28C4A45C6B5F7B891BA8E0821536D7EB140DD255AA7AA132217B4009CCDBA89D5923D2C2AFE118E90E1C921D8DE17'])


In [4]:
def Bi_gram(payload, max_tokens=300):
    """Turn a payload string into space-separated, overlapping 4-character tokens.

    A window of 4 characters slides over ``payload`` in steps of 2 characters.
    For a hex-encoded payload each token is therefore a byte bigram and
    consecutive tokens overlap by one byte.

    Args:
        payload: The string to tokenise.
        max_tokens: Maximum number of tokens to emit. Defaults to 300.

    Returns:
        The tokens joined by single spaces; an empty string when ``payload``
        is shorter than 4 characters.
    """
    window, stride = 4, 2
    # Last index at which a full window still fits (negative for short input).
    last_start = len(payload) - window
    # Stop at whichever comes first: the end of the payload or the token
    # limit, so long payloads are not tokenised in full only to be truncated.
    stop = min(last_start + 1, max_tokens * stride)
    return ' '.join(payload[i:i + window] for i in range(0, stop, stride))

In [13]:
import json
import random

# Integer class label for every traffic category.
# NOTE(review): Gmail/Outlook share label 3 and FTP/SMB share label 2, so the
# ten categories collapse into eight classes - presumably intentional; confirm.
category_map_label = {
    "BitTorrent": 0,
    "Facetime": 1,
    "FTP":2,
    "Gmail": 3,
    "MySQL": 4,
    "Outlook": 3,
    "Skype": 5,
    "SMB": 2,
    "Weibo": 6,
    "WorldOfWarcraft": 7
}

flow_payload_path = "./flow_payload.json"
MIN_FLOW_CHARS = 50          # flows must be longer than this many hex characters
MAX_FLOWS_PER_LABEL = 3000   # cap on the number of flows sampled per label
RANDOM_SEED = 42             # fixed seed so the sampled subset is reproducible

with open(flow_payload_path, 'r') as f:
    flow_payload = json.load(f)

# Concatenate the packet payloads of each flow and bucket the flows by label.
raw_dataset = {}
for category in flow_payload:
    label = category_map_label[category]
    for five_tuple in flow_payload[category]:
        flow = "".join(flow_payload[category][five_tuple])
        if len(flow) > MIN_FLOW_CHARS:
            raw_dataset.setdefault(label, []).append(flow)

# A dedicated generator keeps the sampling deterministic without touching the
# global random state.
rng = random.Random(RANDOM_SEED)
for label in raw_dataset:
    rng.shuffle(raw_dataset[label])
    raw_dataset[label] = raw_dataset[label][:MAX_FLOWS_PER_LABEL]

for key in raw_dataset.keys():
    print(f"{key}|{len(raw_dataset[key])}")

bi_dataset = {}
for label in raw_dataset:
    bi_dataset[label] = [Bi_gram(payload) for payload in raw_dataset[label]]

# json.dump converts the integer labels to string keys in the output file.
save_path = './bi_dataset.json'
with open(save_path, 'w') as f:
    json.dump(bi_dataset, f)

0|3000
1|3000
2|3000
3|3000
4|3000
5|3000
6|3000
7|3000


In [1]:
import csv
import json
import random

import os

raw_data_path = './bi_dataset.json'
dataset_folder = './dataset'
TRAIN_FRACTION = 0.8   # train:test = 8:2
SPLIT_SEED = 42        # fixed seed so the split is reproducible

with open(raw_data_path, 'r') as f:
    bi_data = json.load(f)

# Flatten {label: [flow, ...]} into [label, flow] rows.
data = []
for label in bi_data:
    for flow in bi_data[label]:
        data.append([label, flow])

# One seeded shuffle before the cut is enough: both slices of a shuffled list
# are already in random order.
random.Random(SPLIT_SEED).shuffle(data)
split_index = int(len(data) * TRAIN_FRACTION)
train_data = data[:split_index]
test_data = data[split_index:]
print(len(train_data))
print(len(test_data))

# open() does not create missing directories.
os.makedirs(dataset_folder, exist_ok=True)

for split_name, rows in (('train', train_data), ('test', test_data)):
    with open(dataset_folder+'/'+split_name+'.tsv', 'w', newline='') as f:
        writer = csv.writer(f, delimiter='\t')
        writer.writerow(['label', 'text_a'])
        writer.writerows(rows)

19200
4800
