<a href="https://colab.research.google.com/github/changyui/Study/blob/main/SAM_learn.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 1. Preprocess

In [None]:
!apt-get install libpcap-dev

In [None]:
!pip install pypcap

In [None]:
!pip install dpkt

In [None]:
import numpy as np
import dpkt
import random
import pickle

In [None]:
# Application protocols handled by the preprocessing step, each paired with
# the port number used to recognise it. `protocols` and `ports` are parallel
# lists: ports[i] is the port matched for protocols[i].
_PROTOCOL_PORTS = (
    ('dns', 53),
    ('smtp', 25),
    ('ssh', 22),
    ('ftp', 21),
    ('http', 80),
    ('https', 443),
)
protocols = [name for name, _ in _PROTOCOL_PORTS]
ports = [port for _, port in _PROTOCOL_PORTS]

def gen_flows(pcap):
    """Group the TCP/UDP packets of a capture into per-protocol flows.

    A packet is assigned to the first entry of ``protocols`` whose port (the
    matching entry of ``ports``) equals the packet's source or destination
    port. Within a protocol, packets are grouped by a directional flow key of
    the form ``'<src ip>>>><dst ip>:::<ip proto>.<sport>.<dport>'``.

    Args:
        pcap: an object exposing ``datalink()`` and yielding
            ``(timestamp, buffer)`` pairs when iterated, e.g. a
            ``dpkt.pcap.Reader``.

    Returns:
        A list parallel to ``protocols``. Each element is a dict mapping a
        flow key to the list of ``dpkt.ip.IP`` objects seen for that flow
        (at most 1000 per flow). If the capture is not Ethernet
        (``DLT_EN10MB``) a message is printed and every dict is empty.
    """
    flows = [{} for _ in range(len(protocols))]

    if pcap.datalink() != dpkt.pcap.DLT_EN10MB:
        print('unknow data link!')
        # Return the (empty) per-protocol structure rather than None so that
        # callers indexing the result, such as closure(), do not crash.
        return flows

    for pkt_count, (_, buff) in enumerate(pcap, start=1):
        eth = dpkt.ethernet.Ethernet(buff)
        if pkt_count % 500000 == 0:
            print('The %dth pkt!'%pkt_count)

        # Only IP packets that carry a parsed TCP or UDP segment are used.
        ip = eth.data
        if not isinstance(ip, dpkt.ip.IP):
            continue
        l4 = ip.data
        if not isinstance(l4, (dpkt.udp.UDP, dpkt.tcp.TCP)):
            continue

        # Walk the protocols in declaration order; the first port match wins.
        for index, port in enumerate(ports):
            if l4.sport != port and l4.dport != port:
                continue

            proto_flows = flows[index]
            if len(proto_flows) >= 10000:
                # Each class keeps at most 10000 flows.
                # NOTE(review): once the cap is reached, packets of flows that
                # are already tracked are dropped as well - confirm intended.
                break

            src = '.'.join(str(int(octet)) for octet in ip.src)
            dst = '.'.join(str(int(octet)) for octet in ip.dst)
            endpoints = '.'.join(map(str, (ip.p, l4.sport, l4.dport)))
            key = src + '>>>' + dst + ':::' + endpoints

            pkts = proto_flows.get(key)
            if pkts is None:
                proto_flows[key] = [ip]
            elif len(pkts) < 1000:
                # Each flow keeps at most 1000 packets.
                pkts.append(ip)
            # A packet belongs to one protocol only; stop after the match.
            break

    return flows


# def split_train_test(flows, name, k):
# 	keys = list(flows.keys())

# 	test_keys = keys[k*int(len(keys)*0.1):(k+1)*int(len(keys)*0.1)]
# 	test_min = 0xFFFFFFFF
# 	test_flows = {}
# 	for k in test_keys:
# 		test_flows[k] = flows[k]
# 		test_min = min(test_min, len(flows[k]))

# 	train_keys = set(keys) - set(test_keys)
# 	train_min = 0xFFFFFFFF
# 	train_flows = {}
# 	for k in train_keys:
# 		train_flows[k] = flows[k]
# 		train_min = min(train_min, len(flows[k]))

# 	print('============================')
# 	print('Generate flows for %s'%name)
# 	print('Total flows: ', len(flows))
# 	print('Train flows: ', len(train_flows), ' Min pkts: ', train_min)
# 	print('Test flows: ', len(test_flows), ' Min pkts: ', test_min)

# 	return train_flows, test_flows


def closure(flows, out_path='/content/drive/MyDrive/Colab Notebooks/SAM_learn/pro_flows.pkl'):
    """Print per-protocol statistics and pickle the flows keyed by protocol.

    Args:
        flows: list parallel to ``protocols``; element ``i`` is a dict
            mapping a flow key to the list of packets of that flow (the
            structure returned by ``gen_flows``).
        out_path: file the ``{protocol name: flows}`` dict is pickled to.
            Defaults to the notebook's Google Drive location.
    """
    flow_dict = {}
    for index, name in enumerate(protocols):
        proto_flows = flows[index]
        flow_dict[name] = proto_flows
        print('============================')
        print('Generate flows for %s'%name)
        print('Total flows: ', len(proto_flows))
        print('Total pkts: ', sum(len(pkts) for pkts in proto_flows.values()))

    with open(out_path, 'wb') as f:
        pickle.dump(flow_dict, f)

# if __name__ == '__main__':
# Read the capture inside a context manager so the file handle is closed as
# soon as gen_flows() has consumed it (the reader pulls packets from the open
# file while it is iterated), then report and pickle the grouped flows.
with open('/content/drive/MyDrive/Colab Notebooks/SAM_learn/aim_chat_3a.pcap', 'rb') as pcap_file:
    pcap = dpkt.pcap.Reader(pcap_file)
    flows = gen_flows(pcap)
closure(flows)

In [None]:
# Sanity check: reload the pickled per-protocol flows and display them as the
# cell output.
flows_pkl = '/content/drive/MyDrive/Colab Notebooks/SAM_learn/pro_flows.pkl'
with open(flows_pkl, 'rb') as g:
    data = pickle.load(g)

data

# 2. Tool

In [None]:
import pickle
import dpkt
import random
import numpy as np
# from preprocess import protocols
from tqdm import tqdm, trange

In [None]:
# Header field tables: each maps a field name to a size.
# NOTE(review): sizes look like byte counts - confirm; the tables are not
# referenced by the functions visible in this notebook.
ip_features = dict(hl=1, tos=1, len=2, df=1, mf=1, ttl=1, p=1)
tcp_features = dict(off=1, flags=1, win=2)
udp_features = dict(ulen=2)
# Number of leading bytes of each packed packet kept as model input.
max_byte_len = 50

def mask(p):
    """Zero the identifying fields of an IP packet in place and return it.

    The source/destination addresses, checksum, id and offset of ``p`` are
    cleared. When ``p.data`` is a dpkt TCP or UDP segment its ports and
    checksum are cleared too, plus the sequence/ack numbers for TCP.

    Args:
        p: packet object to anonymise (a ``dpkt.ip.IP`` in this notebook).

    Returns:
        The same object ``p``, mutated.
    """
    zero_addr = b'\x00\x00\x00\x00'
    p.src = zero_addr
    p.dst = zero_addr
    p.sum = p.id = p.offset = 0

    l4 = p.data
    is_tcp = isinstance(l4, dpkt.tcp.TCP)
    if is_tcp or isinstance(l4, dpkt.udp.UDP):
        l4.sport = 0
        l4.dport = 0
        if is_tcp:
            l4.seq = 0
            l4.ack = 0
        l4.sum = 0

    return p

def pkt2feature(data, k):
    """Turn every packet into a (bytes, positions) pair and split fold ``k``.

    For each protocol all packets of all flows are pooled, shuffled with a
    fixed seed (1024), masked with ``mask`` and packed. The first
    ``max_byte_len`` bytes become a list of ints and the positions list holds
    their indices; both are right-padded with zeros to ``max_byte_len``.

    Args:
        data: dict mapping protocol name to ``{flow key: [packets]}``.
        k: fold index. The ``k``-th tenth of the shuffled packets of each
            protocol becomes the test split, the rest the training split.

    Returns:
        ``{'train': {protocol: [(byte, pos), ...]}, 'test': {...}}``.
    """
    flow_dict = {'train': {}, 'test': {}}

    # Layout: split -> protocol -> [(byte, pos), ...]
    for p in protocols:
        train_set, test_set = [], []
        flow_dict['train'][p] = train_set
        flow_dict['test'][p] = test_set

        all_pkts = []
        for pkts in data[p].values():
            all_pkts.extend(pkts)
        # Fixed seed: every fold sees the same packet order.
        random.Random(1024).shuffle(all_pkts)

        fold_size = int(len(all_pkts) * 0.1)
        test_span = range(k * fold_size, (k + 1) * fold_size)

        for idx, raw_pkt in enumerate(all_pkts):
            raw_byte = mask(raw_pkt).pack()

            byte = [int(b) for b in raw_byte[:max_byte_len]]
            pos = list(range(len(byte)))

            padding = [0] * (max_byte_len - len(byte))
            byte += padding
            pos += padding

            target = test_set if idx in test_span else train_set
            target.append((byte, pos))
    return flow_dict

def load_epoch_data(flow_dict, train='train'):
    """Flatten one split of a fold dictionary into NumPy arrays.

    Args:
        flow_dict: ``{'train': {...}, 'test': {...}}`` as built by
            ``pkt2feature``; each split maps a protocol name to a list of
            ``(byte, pos)`` pairs.
        train: key of the split to load (``'train'`` or ``'test'``).

    Returns:
        ``(x, y, label)``: the byte lists, the position lists, and a column
        vector holding, for each sample, the index of its protocol in
        ``protocols``.
    """
    split = flow_dict[train]
    x, y, label = [], [], []

    for cls_idx, proto in enumerate(protocols):
        for byte, pos in split[proto]:
            x.append(byte)
            y.append(pos)
            label.append(cls_idx)

    labels = np.array(label).reshape(-1, 1)
    return np.array(x), np.array(y), labels


# if __name__ == '__main__':
	# f = open('flows.pkl','rb')
	# data = pickle.load(f)
	# f.close()

	# print(data.keys())

	# dns = data['dns']
	# # print(list(dns.keys())[:10])

	# # wide dataset contains payload
	# print('================\n',
	# 	len(dns['203.206.160.197.202.89.157.51.17.53.51648'][0]))

	# print('================')
	# flow_dict = pkt2feature(data)
	# x, y, label = train_epoch_data(flow_dict)
	# print(x.shape)
	# print(y.shape)
	# print(label[0])
# Load the pickled per-protocol flows, then write one train/test feature
# split per fold (10 folds) next to it.
flows_pkl = '/content/drive/MyDrive/Colab Notebooks/SAM_learn/pro_flows.pkl'
fold_pkl_pattern = '/content/drive/MyDrive/Colab Notebooks/SAM_learn/pro_flows_%d_noip_fold.pkl'

with open(flows_pkl, 'rb') as f:
    data = pickle.load(f)

fold_bar = trange(10, mininterval=2,
                  desc='  - (Building fold dataset)   ', leave=False)
for i in fold_bar:
    flow_dict = pkt2feature(data, i)
    with open(fold_pkl_pattern % i, 'wb') as f:
        pickle.dump(flow_dict, f)

In [None]:
# Sanity check: reload the fold-0 feature file and display it as the cell
# output.
fold0_pkl = '/content/drive/MyDrive/Colab Notebooks/SAM_learn/pro_flows_0_noip_fold.pkl'
with open(fold0_pkl, 'rb') as g:
    data1 = pickle.load(g)

data1

# 3. SAM

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import random
import math

In [None]:
# Seed every random number generator used by the notebook (Python, NumPy,
# PyTorch CPU and all CUDA devices) with the same value, and ask cuDNN for
# deterministic kernels.
_SEED = 2020
random.seed(_SEED)
np.random.seed(_SEED)
torch.manual_seed(_SEED)
torch.cuda.manual_seed_all(_SEED)
torch.backends.cudnn.deterministic = True

class SelfAttention(nn.Module):
    """Single-head scaled dot-product attention with learned projections.

    Query, key and value each pass through their own ``d_dim -> d_dim``
    linear layer; the attended values pass through a fourth linear layer
    followed by dropout.
    """

    def __init__(self, d_dim=256, dropout=0.1):
        super().__init__()
        self.dim = d_dim
        # Four equally sized projections, in order: query, key, value, output.
        projections = [nn.Linear(d_dim, d_dim) for _ in range(4)]
        self.linears = nn.ModuleList(projections)
        self.dropout = nn.Dropout(p=dropout)

    def attention(self, query, key, value):
        """Return softmax(query @ key^T / sqrt(dim)); ``value`` is unused."""
        scaled = torch.matmul(query, key.transpose(-2, -1)) / math.sqrt(self.dim)
        return F.softmax(scaled, dim=-1)

    def forward(self, query, key, value):
        """Attend ``value`` with ``query``/``key``.

        Returns:
            A pair ``(out, score)``: the projected, dropout-regularised
            attention output, and the attention weights averaged over the
            query axis (``dim=-2``).
        """
        q_proj, k_proj, v_proj, out_proj = self.linears

        # 1) project the three inputs
        query, key, value = q_proj(query), k_proj(key), v_proj(value)

        # 2) attention weights and weighted sum of the values
        weights = self.attention(query, key, value)
        context = torch.matmul(weights, value)

        # 3) output projection
        projected = out_proj(context.contiguous())
        return self.dropout(projected), weights.mean(dim=-2)

class OneDimCNN(nn.Module):
    """Parallel 1-D convolutions, each followed by ReLU and max pooling.

    One ``Conv1d -> ReLU -> MaxPool1d`` branch is built per kernel size. The
    pooling window is ``max_byte_len - h + 1``, which equals the convolution
    output length for an input of ``max_byte_len`` positions, so each filter
    then contributes a single value.

    Args:
        max_byte_len: sequence length the pooling windows are sized for.
        d_dim: feature size of each input position (convolution channels).
        kernel_size: iterable of convolution widths; defaults to ``[3, 4]``.
        filters: number of output channels per convolution.
        dropout: dropout probability applied to the concatenated features.
    """
    # https://blog.csdn.net/sunny_xsc1994/article/details/82969867
    def __init__(self, max_byte_len, d_dim=256, \
        kernel_size=None, filters=256, dropout=0.1):
        super(OneDimCNN, self).__init__()
        # A fresh list per instance: a mutable default argument would be
        # shared by every model built with the default.
        self.kernel_size = [3, 4] if kernel_size is None else list(kernel_size)
        self.convs = nn.ModuleList([
            nn.Sequential(
                nn.Conv1d(in_channels=d_dim,
                          out_channels=filters,
                          kernel_size=h),
                #nn.BatchNorm1d(num_features=config.feature_size),
                nn.ReLU(),
                # MaxPool1d's stride defaults to its kernel_size.
                nn.MaxPool1d(kernel_size=max_byte_len-h+1),
            )
            for h in self.kernel_size
        ])
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, x):
        """Return pooled convolution features for ``x``.

        Args:
            x: tensor of shape ``(batch, length, d_dim)``; it is transposed
                to channels-first before the convolutions.

        Returns:
            Tensor of shape ``(batch, filters * len(kernel_size))`` when
            ``length == max_byte_len``.
        """
        channels_first = x.transpose(-2, -1)
        out = torch.cat([conv(channels_first) for conv in self.convs], dim=1)
        # Flatten per sample. Keeping the batch dimension fixed means an
        # input longer than max_byte_len cannot silently spill pooled
        # positions into the batch axis.
        out = out.view(out.size(0), -1)
        return self.dropout(out)


class SAM(nn.Module):
    """Byte/position embedding -> self-attention -> 1-D CNN -> classifier.

    Args:
        num_class: number of output classes.
        max_byte_len: number of positions per sample; sizes the position
            embedding table and the CNN pooling windows.
        kernel_size: iterable of CNN kernel widths; defaults to ``[3, 4]``.
        d_dim: embedding / attention feature size.
        dropout: dropout probability used by the attention and CNN blocks.
        filters: number of CNN output channels per kernel width.
    """
    # total header bytes 24
    def __init__(self, num_class, max_byte_len, kernel_size=None, \
        d_dim=256, dropout=0.1, filters=256):
        super(SAM, self).__init__()
        # Avoid a shared mutable default argument.
        kernel_size = [3, 4] if kernel_size is None else list(kernel_size)
        self.posembedding = nn.Embedding(num_embeddings=max_byte_len,
                                         embedding_dim=d_dim)
        # Table of 300 entries; byte values (0-255) index into it.
        self.byteembedding = nn.Embedding(num_embeddings=300,
                                          embedding_dim=d_dim)
        self.attention = SelfAttention(d_dim, dropout)
        self.cnn = OneDimCNN(max_byte_len, d_dim, kernel_size, filters, dropout)
        # The CNN emits `filters` features per kernel width, so the
        # classifier input must follow `filters` rather than a fixed 256.
        self.fc = nn.Linear(in_features=filters*len(kernel_size),
                            out_features=num_class)

    def forward(self, x, y):
        """Classify a batch of byte sequences.

        Args:
            x: integer tensor of byte values (indices into the byte
                embedding).
            y: integer tensor of positions, same shape as ``x`` (indices
                into the position embedding, so each must be below
                ``max_byte_len``).

        Returns:
            In training mode, the raw class logits. In eval mode, a pair
            ``(predicted class indices, attention score)``.
        """
        out = self.byteembedding(x) + self.posembedding(y)
        out, score = self.attention(out, out, out)
        out = self.cnn(out)
        out = self.fc(out)
        if not self.training:
            return F.softmax(out, dim=-1).max(1)[1], score
        return out
		
#if __name__ == '__main__':
#    x = np.random.randint(0, 255, (10, 20))
#    y = np.random.randint(0, 20, (10, 20))
#    sam = SAM(num_class=5, max_byte_len=20)
#    out = sam(torch.from_numpy(x).long(), torch.from_numpy(y).long())
#    print(out[0])

#    sam.eval()
#    out, score = sam(torch.from_numpy(x).long(), torch.from_numpy(y).long())
#    print(out[0], score[0])


# 4. Train

In [None]:
import sys
# Make modules stored in the Drive project folder importable; the training
# cell below imports `tool` from there.
sys.path.append('/content/drive/MyDrive/Colab Notebooks/SAM_learn')

In [None]:
import torch
import torch.nn.functional as F
from torch import cuda
import torch.utils.data
import torch.optim as optim
import torch.nn as nn
import argparse
import time
from tqdm import tqdm, trange
# from SAM import SAM
# from SAM import *
from tool import protocols, load_epoch_data, max_byte_len
from sklearn.metrics import precision_recall_fscore_support, confusion_matrix
import pickle
import numpy as np

In [None]:
# Notebook check: shows whether PyTorch can see a CUDA device. The training
# and test loops below call .cuda() unconditionally, so this should be True.
torch.cuda.is_available()

In [None]:
class Dataset(torch.utils.data.Dataset):
    """Map-style dataset over three parallel, index-aligned sequences.

    Item ``idx`` is the tuple ``(x[idx], y[idx], label[idx])``; the length is
    that of ``x``.
    """

    def __init__(self, x, y, label):
        super().__init__()
        self.x, self.y, self.label = x, y, label

    def __len__(self):
        return len(self.x)

    def __getitem__(self, idx):
        return tuple(seq[idx] for seq in (self.x, self.y, self.label))

def paired_collate_fn(insts):
    """Collate ``(x, y, label)`` samples into three int64 tensors.

    Args:
        insts: non-empty sequence of ``(x, y, label)`` samples whose
            components have the same shape across samples.

    Returns:
        Tuple ``(x, y, label)`` of ``torch.long`` tensors, each stacked along
        a new leading batch dimension.
    """
    x, y, label = zip(*insts)
    # Stack each column through NumPy first: building a tensor straight from
    # a tuple of ndarrays converts element by element and is much slower.
    return tuple(
        torch.as_tensor(np.asarray(column), dtype=torch.long)
        for column in (x, y, label)
    )

def cal_loss(pred, gold, cls_ratio=None):
    """Return the cross-entropy loss and the batch accuracy in percent.

    Args:
        pred: class logits, one row per sample.
        gold: target class indices; flattened to 1-D before use.
        cls_ratio: accepted but not used.

    Returns:
        ``(loss, accuracy)``: the loss tensor averaged over the batch
        (``F.cross_entropy`` default reduction) and the accuracy as a float
        in ``[0, 100]``.
    """
    target = gold.contiguous().view(-1)
    loss = F.cross_entropy(pred, target)

    # Predicted class = index of the largest softmax probability per row.
    predicted = F.softmax(pred, dim=-1).argmax(dim=1)
    hits = predicted.eq(target)
    acc = hits.sum().item() / hits.shape[0]

    return loss, acc*100

def test_epoch(model, test_data):
    ''' Epoch operation in evaluation phase.

    Puts the model in eval mode and runs it over every batch of
    ``test_data`` on the GPU, timing each forward pass.

    Args:
        model: module whose eval-mode forward returns ``(pred, score)``.
        test_data: iterable of ``(src_seq, src_seq2, gold)`` tensor batches.

    Returns:
        Tuple ``(acc, score, pred, time)``: the mean of the per-batch
        accuracies in percent (batches are not weighted by size), the
        attention score averaged within each batch and then across batches,
        the list of predicted class indices for all samples in order, and
        the mean forward time per batch in seconds.
    '''
    model.eval()

    total_acc = []
    total_pred = []
    total_score = []
    total_time = []

    # Inference only: skip autograd bookkeeping so no graph is built or kept.
    with torch.no_grad():
        for batch in tqdm(
            test_data, mininterval=2,
            desc='  - (Testing)   ', leave=False):

            # prepare data
            src_seq, src_seq2, gold = batch
            src_seq, src_seq2, gold = src_seq.cuda(), src_seq2.cuda(), gold.cuda()
            gold = gold.contiguous().view(-1)

            # forward; synchronise around it so the timing covers the
            # asynchronous GPU work
            torch.cuda.synchronize()
            start = time.time()
            pred, score = model(src_seq, src_seq2)
            torch.cuda.synchronize()
            end = time.time()

            n_correct = pred.eq(gold)
            acc = n_correct.sum().item()*100 / n_correct.shape[0]
            total_acc.append(acc)
            total_pred.extend(pred.long().tolist())
            total_score.append(torch.mean(score, dim=0).tolist())
            total_time.append(end - start)

    return sum(total_acc)/len(total_acc), np.array(total_score).mean(axis=0), \
    total_pred, sum(total_time)/len(total_time)

def train_epoch(model, training_data, optimizer):
    ''' Run one training pass over ``training_data``.

    Puts the model in train mode and, for each batch, moves the tensors to
    the GPU, computes the loss with ``cal_loss``, back-propagates and steps
    the optimizer.

    Returns:
        ``(mean loss, mean accuracy)`` over the batches; the accuracy is in
        percent and batches are not weighted by size.
    '''
    model.train()

    batch_losses, batch_accs = [], []
    progress = tqdm(
        training_data, mininterval=2,
        desc='  - (Training)   ', leave=False)

    for src_seq, src_seq2, gold in progress:
        # move the batch to the GPU
        src_seq, src_seq2, gold = src_seq.cuda(), src_seq2.cuda(), gold.cuda()

        optimizer.zero_grad()
        logits = model(src_seq, src_seq2)
        loss, acc = cal_loss(logits, gold)
        loss.backward()
        optimizer.step()

        batch_losses.append(loss.item())
        batch_accs.append(acc)

    n_batches = len(batch_losses)
    return sum(batch_losses)/n_batches, sum(batch_accs)/n_batches

def main(i, flow_dict, num_epochs=5):
    """Train and evaluate a fresh SAM model on one cross-validation fold.

    Files written under the notebook's Drive folder:
      * ``results_<i>.txt`` - one line per epoch with training accuracy,
        training loss, mean test forward time and test accuracy;
      * ``atten_<i>.txt``, ``metric_<i>.txt``, ``cm_<i>.pkl`` - attention
        scores, per-class F1/precision/recall and the row-normalised
        confusion matrix. These are rewritten every epoch, so only the last
        epoch's values remain.

    Args:
        i: fold index, used in the output file names.
        flow_dict: fold dictionary with ``'train'`` and ``'test'`` splits, as
            consumed by ``load_epoch_data``.
        num_epochs: number of training epochs (default 5).
    """
    out_dir = '/content/drive/MyDrive/Colab Notebooks/SAM_learn'

    # The context manager closes the results file even if training raises.
    with open('%s/results_%d.txt' % (out_dir, i), 'w') as f:
        f.write('Train Loss Time Test\n')
        f.flush()

        model = SAM(num_class=len(protocols), max_byte_len=max_byte_len).cuda()
        optimizer = optim.Adam(filter(lambda x: x.requires_grad, model.parameters()))
        # Only referenced by the disabled early-stop logic below.
        loss_list = []

        # The arrays do not change between epochs, so build the datasets and
        # loaders once. The training loader still reshuffles on every pass.
        train_x, train_y, train_label = load_epoch_data(flow_dict, 'train')
        training_data = torch.utils.data.DataLoader(
            Dataset(x=train_x, y=train_y, label=train_label),
            num_workers=0,
            collate_fn=paired_collate_fn,
            batch_size=128,
            shuffle=True
        )
        test_x, test_y, test_label = load_epoch_data(flow_dict, 'test')
        test_data = torch.utils.data.DataLoader(
            Dataset(x=test_x, y=test_y, label=test_label),
            num_workers=0,
            collate_fn=paired_collate_fn,
            batch_size=128,
            shuffle=False
        )

        for epoch_i in trange(num_epochs, mininterval=2,
                              desc='  - (Training Epochs)   ', leave=False):

            train_loss, train_acc = train_epoch(model, training_data, optimizer)
            test_acc, score, pred, test_time = test_epoch(model, test_data)

            with open('%s/atten_%d.txt' % (out_dir, i), 'w') as f2:
                f2.write(' '.join(map('{:.4f}'.format, score)))

            # write F1, PRECISION, RECALL
            with open('%s/metric_%d.txt' % (out_dir, i), 'w') as f3:
                f3.write('F1 PRE REC\n')
                p, r, fscore, _ = precision_recall_fscore_support(test_label, pred)
                for a, b, c in zip(fscore, p, r):
                    # one line per class
                    f3.write('%.2f %.2f %.2f\n'%(a, b, c))
                    f3.flush()
                if len(fscore) != len(protocols):
                    # Some class is missing from the predictions or labels:
                    # record which classes were seen on each side.
                    a = set(pred)
                    b = set(test_label[:,0])
                    f3.write('%s\n%s'%(str(a), str(b)))

            # write Confusion Matrix
            with open('%s/cm_%d.pkl' % (out_dir, i), 'wb') as f4:
                pickle.dump(confusion_matrix(test_label, pred, normalize='true'), f4)

            # write ACC
            f.write('%.2f %.4f %.6f %.2f\n'%(train_acc, train_loss, test_time, test_acc))
            f.flush()

            # # early stop
            # if len(loss_list) == 5:
            # 	if abs(sum(loss_list)/len(loss_list) - train_loss) < 0.005:
            # 		break
            # 	loss_list[epoch_i%len(loss_list)] = train_loss
            # else:
            # 	loss_list.append(train_loss)


# if __name__ == '__main__':
# Run the 10-fold cross validation: load each fold's feature file and train
# and evaluate a model on it.
fold_input_pattern = '/content/drive/MyDrive/Colab Notebooks/SAM_learn/pro_flows_%d_noip_fold.pkl'
for i in range(10):
    with open(fold_input_pattern % i, 'rb') as f:
        flow_dict = pickle.load(f)
    print('====', i, ' fold validation ====')
    main(i, flow_dict)

In [None]:
# Sanity check: reload the confusion matrix saved for fold 0 and display it
# as the cell output.
cm0_pkl = '/content/drive/MyDrive/Colab Notebooks/SAM_learn/cm_0.pkl'
with open(cm0_pkl, 'rb') as g:
    data2 = pickle.load(g)

data2