# Import libraries

In [1]:
# Common
from common import data_paths as dp
from common import bigram_processing as bp
from common import globals as gl
from common import data_processing as dproc
from common import data_loading as dl

# General
import numpy as np
import pandas as pd
import scapy.all as scapy
import pickle
import torch
from torch import nn
from sklearn.base import BaseEstimator
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score, roc_curve, auc
from sklearn.svm import SVC
from sklearn.pipeline import make_pipeline
from sklearn.base import TransformerMixin
from sklearn.preprocessing import StandardScaler
import re

# Load the attack file used for training

In [2]:
data_attack = dl.load_ngram(dp.PCAP_PATH + dp.ATTACK_LIGHT_PATH + dp.ATTACK_PATH + 'light_compressed.pcap')
print(data_attack[:10])
print(len(data_attack))

['2.20.168.192.in-addr.arpa.', '2.20.168.192.in-addr..', '3.0.0.0.1.0.0.0.0.0.0.', '3.0.0.0.1.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.', '3.0.0.0.1.0.0.0.0.0.0.', '3.0.0.0.1.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.', '252.0.0.224.in-addr.a.', '252.0.0.224.in-addr.arpa.', '252.0.0.224.in-addr.a.', '252.0.0.224.in-addr.arpa.']
38420


In [3]:
data_attack_finetune = dl.load_ngram(dp.PCAP_PATH + dp.ATTACK_LIGHT_PATH + dp.ATTACK_PATH + 'light_audio.pcap')
print(data_attack_finetune[:10])
print(len(data_attack_finetune))

['200.20.168.192.in-add.', '200.20.168.192.in-addr.arpa.', '200.20.168.192.in-add.', '200.20.168.192.in-addr.arpa.', '252.0.0.224.in-addr.a.', '252.0.0.224.in-addr.arpa.', '252.0.0.224.in-addr.arpa.', '252.0.0.224.in-addr.a.', '3.0.0.0.1.0.0.0.0.0.0.', '3.0.0.0.1.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.']
53328


In [4]:
data_benign = dl.load_ngram(dp.PCAP_PATH + dp.ATTACK_LIGHT_PATH + dp.BENIGN_PATH + 'benign.pcap')
print(data_benign[:10])
print(len(data_benign))

['100.20.168.192.in-add.', '100.20.168.192.in-addr.arpa.', '2.20.168.192.in-addr..', '2.20.168.192.in-addr.arpa.', '2.20.168.192.in-addr..', '2.20.168.192.in-addr.arpa.', '150.20.168.192.in-add.', '150.20.168.192.in-addr.arpa.', '150.20.168.192.in-add.', '150.20.168.192.in-addr.arpa.']
101588


# Load benign domains

In [5]:
benign_domains_file_3 = dp.PCAP_PATH + dp.ATTACK_LIGHT_PATH + dp.BENIGN_PATH + 'domains.txt'

benign_domains = []

with open(benign_domains_file_3) as f:
    for line in f:
        benign_domains.append(line.strip())
print(benign_domains[:10])
benign_domains = set(benign_domains)

['dvg-gestalt.de', 'dxgalaxy.com', 'e2n.de', 'easycredit-bbl.de', 'eatbu.com', 'ecomedes.com', 'edels-stube.eu', 'edmboost.org', 'edmfull.com', 'einfachflirts.com']


In [6]:
sldstlds = pd.read_csv(dp.MISC_PATH + 'SLDs.csv', header=None, names=['TLD', 'SLD'], sep=',')['SLD']
slds = []
for s in sldstlds:
    spl = s.split('.')[-2]
    if spl != '':
        slds.append(spl)
print(slds[:10])
slds = set(slds)

benign_dataset_domains = []
for bdom in benign_domains:
    benign_dataset_domains.append(dproc.get_domain_name(bdom, slds))
    
print(benign_dataset_domains[:10])
benign_dataset_domains = set(benign_dataset_domains)

['com', 'net', 'gov', 'org', 'mil', 'co', 'net', 'gov', 'ac', 'sch']
['plusportals', 'controlc', 'onconduit', 'beadored', 'camarasetvoila', 'torontopubliclibrary', 'doxo', 'avinfolie', 'manilatimes', 'uptrennd']


# Label data

In [7]:
data_labeled = []

for d in data_attack:
    if dproc.not_exfil(d):
        data_labeled.append((d, 1))
    else:
        data_labeled.append((d, 0))

print(len([d for d in data_labeled if d[1] == 1]))
print(len([d for d in data_labeled if d[1] == 0]))

X_test_2 = [d[0] for d in data_labeled]
y_test_2 = [d[1] for d in data_labeled]


28355
10065


In [8]:
data_labeled_finetune = []

for d in data_attack_finetune:
    if dproc.not_exfil(d):
        data_labeled_finetune.append((d, 1))
    else:
        data_labeled_finetune.append((d, 0))

print(len([d for d in data_labeled_finetune if d[1] == 1]))
print(len([d for d in data_labeled_finetune if d[1] == 0]))

X_train_finetune = [d[0] for d in data_labeled_finetune]
y_train_finetune = [d[1] for d in data_labeled_finetune]


49036
4292


In [9]:
data_labeled_train = []

for d in data_benign:
    data_labeled_train.append((d, 1))

print(len([d for d in data_labeled_train if d[1] == 1]))
print(len([d for d in data_labeled_train if d[1] == 0]))

X = [d[0] for d in data_labeled_train]
y = [d[1] for d in data_labeled_train]

101588
0


# Import domains

In [10]:
domains_file = dp.DOMAINS_PATH + 'crawlson.com-top-1m.txt'

domains = pd.read_csv(domains_file, header=None, names=['id', 'domain'], sep=',')

domains = [dproc.get_domain_name(d, slds) for d in domains['domain'].tolist()]

# benign_domains = list(benign_domains)
print(len(domains))
domains += benign_dataset_domains
print(domains[:10])
print(len(domains))
    

1000000
['wordpress', 'shopify', 'gravatar', 'wikipedia', 'bluehost', 'hover', 'yahoo', 'youtube', 'github', 'google']
1009766


# Transform to ngrams

In [11]:
# Split domains into bigrams
domains_bigrams = bp.bigram_list(domains)

# Get bigrams frequency
domains_bigrams = bp.bigram_freq(domains_bigrams)

# Rank bigrams
domains_bigrams = bp.rank_bigrams_freq(domains_bigrams)
i = 0
for dbkey, dbval in domains_bigrams.items():
    print(dbkey, dbval)
    i += 1
    if i == 10:
        break

er 188988
in 172467
s$ 147596
an 144060
en 131791
ar 121336
on 120375
es 120318
re 120309
e$ 114807


# Transform X to X_grams

In [12]:
X_gram = []
ae_unfound_value = 2**31 - 1

for x in X:
    
    resx = []
    
    for level in x.split('.'):
        if level == '':
            continue
        bx = bp.bigram_split(level)
        for b in bx:
            if b in domains_bigrams:
                resx.append(domains_bigrams[b])
            else:
                resx.append(ae_unfound_value)
    
    resx = np.array(resx)
    
    if resx.shape[0] < gl.input_dim:
        resx = np.pad(resx, (0, gl.input_dim - resx.shape[0]), 'constant', constant_values=(0))
    if resx.shape[0] > gl.input_dim:
        resx = resx[:gl.input_dim]
        
    
    X_gram.append(resx)

In [13]:
t = 0
for i in range(4):
    print(X_gram[i])
    print(X[i])
        

[  4014   2580   3253   4163   2292   2680   4163   4014   1237   1137
   5231   4014   1140    484   3672  32580 172467  15770  10320  41138
   6060  32490      0      0      0      0      0      0      0      0
      0      0      0      0      0      0      0      0      0      0
      0      0      0      0      0      0      0      0      0      0
      0      0      0      0      0      0      0      0      0      0
      0      0      0      0]
100.20.168.192.in-add.
[  4014   2580   3253   4163   2292   2680   4163   4014   1237   1137
   5231   4014   1140    484   3672  32580 172467  15770  10320  41138
   6060  15393  53896  70861 121336   9223  42183  68938      0      0
      0      0      0      0      0      0      0      0      0      0
      0      0      0      0      0      0      0      0      0      0
      0      0      0      0      0      0      0      0      0      0
      0      0      0      0]
100.20.168.192.in-addr.arpa.
[  2292   3672   2292   2680   4163 

In [14]:
X_gram_test = []

for x in X_test_2:
    
    resx = []
    
    for level in x.split('.'):
        if level == '':
            continue
        bx = bp.bigram_split(level)
        for b in bx:
            if b in domains_bigrams:
                resx.append(domains_bigrams[b])
            else:
                resx.append(ae_unfound_value)
    
    resx = np.array(resx)
    
    if resx.shape[0] < gl.input_dim:
        resx = np.pad(resx, (0, gl.input_dim - resx.shape[0]), 'constant', constant_values=(0))
    if resx.shape[0] > gl.input_dim:
        resx = resx[:gl.input_dim]
        
    
    X_gram_test.append(resx)

In [15]:
t = 0
for i in range(len(X_gram_test)):
    if t >=4:
        break
    
    if y_test_2[i] == 0 and t % 2 == 0:
        print(X_gram_test[i])
        print(X_test_2[i])
        t += 1
    if y_test_2[i] == 1 and t % 2 == 1:
        print(X_gram_test[i])
        print(X_test_2[i])
        
        t += 1

[     32580     172467      46555      68163      86841 2147483647
 2147483647 2147483647 2147483647 2147483647 2147483647 2147483647
 2147483647 2147483647 2147483647 2147483647 2147483647 2147483647
 2147483647 2147483647 2147483647 2147483647 2147483647 2147483647
 2147483647 2147483647 2147483647 2147483647 2147483647 2147483647
 2147483647 2147483647 2147483647 2147483647      79609      37498
      63820      62150        204        320       4452          0
          0          0          0          0          0          0
          0          0          0          0          0          0
          0          0          0          0          0          0
          0          0          0          0]
init.INXWIZKWGIXHIYLSFZTXU7BTGQZQ.base64..
[  4014    975    783   4163   2292   2680   4163   4014   1237   1137
   5231   4014   1140    484   3672  32580 172467  15770  10320  41138
   6060  32490      0      0      0      0      0      0      0      0
      0      0      0      0

In [16]:
X_gram_finetune = []

for x in X_train_finetune:
    
    resx = []
    
    for level in x.split('.'):
        if level == '':
            continue
        bx = bp.bigram_split(level)
        for b in bx:
            if b in domains_bigrams:
                resx.append(domains_bigrams[b])
            else:
                resx.append(ae_unfound_value)
    
    resx = np.array(resx)
    
    if resx.shape[0] < gl.input_dim:
        resx = np.pad(resx, (0, gl.input_dim - resx.shape[0]), 'constant', constant_values=(0))
    if resx.shape[0] > gl.input_dim:
        resx = resx[:gl.input_dim]
        
    
    X_gram_finetune.append(resx)

In [17]:
t = 0
for i in range(len(X_gram_finetune)):
    if t >=4:
        break
    
    if y_train_finetune[i] == 0 and t % 2 == 0:
        print(X_gram_finetune[i])
        print(X_train_finetune[i])
        t += 1
    if y_train_finetune[i] == 1 and t % 2 == 1:
        print(X_gram_finetune[i])
        print(X_train_finetune[i])
        
        t += 1

[     32580     172467      46555      68163      86841 2147483647
 2147483647 2147483647 2147483647 2147483647 2147483647 2147483647
 2147483647 2147483647 2147483647 2147483647 2147483647 2147483647
 2147483647 2147483647 2147483647 2147483647 2147483647 2147483647
 2147483647 2147483647 2147483647 2147483647 2147483647 2147483647
 2147483647 2147483647 2147483647 2147483647 2147483647 2147483647
 2147483647 2147483647 2147483647 2147483647 2147483647 2147483647
          0          0          0          0          0          0
          0          0          0          0          0          0
          0          0          0          0          0          0
          0          0          0          0]
init.MF2WI2LPL4ZDAMJYFUYDCLJRGBPTCMJNGU2C.
[  2476   3441   2292   2680   4163   4014   1237   1137   5231   4014
   1140    484   3672  32580 172467  15770  10320  41138   6060  15393
  53896      0      0      0      0      0      0      0      0      0
      0      0      0      0

# Model

## Setup

In [18]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# device = 'cpu'

class Autoencoder(nn.Module):
    def __init__(self, input_dim = 64):
        super(Autoencoder, self).__init__()
        
        self.input_dim = input_dim        
        
        self.encoder = nn.Sequential(
            nn.Linear(self.input_dim, 32),
            nn.LeakyReLU(inplace=True)
            ).to(device)
        
        self.decoder = nn.Sequential(
            nn.Linear(32, self.input_dim),
            nn.LeakyReLU(inplace=True),
            ).to(device)
        
    def forward(self, x):
        if x.shape[1] < self.input_dim:
            x = nn.functional.pad(x, (0, self.input_dim - x.shape[1])).to(device)
        elif x.shape[1] > self.input_dim:
            x = x[:, :self.input_dim]

        return self.process(x)
    
    def predict(self, x):
        return self.process(x)
    
    def process(self, x):
        x = self.encoder(x).to(device)
        x = self.decoder(x).to(device)
        
        return x
    
    def get_input_dim(self):
        return self.input_dim
    
 
class AEModelWrapper(BaseEstimator):
    def __init__(self, model, max_epochs = 1, threshold = 0.5, threshold_cmp_mode = 'max'):
        self.model = model
        self.max_epochs = max_epochs
        self.prev_loss = None
        self.loss_counter = 0
        self.threshold = threshold
        self.threshold_cmp_mode = threshold_cmp_mode
            
    def threshold_cmp(self, a, b):
        if self.threshold_cmp_mode == 'min':
            return a < b
        else:
            return a > b        
        
    def fit(self, X, y):
        criterion = nn.MSELoss()
        optimizer = torch.optim.Adam(self.model.parameters(), lr=0.001, weight_decay=0)
        
        self.model.train()
        for epoch in range(self.max_epochs):
            batch_size = 64
            for i in range(0, len(X), batch_size):
                mbatch = batch_size if i + batch_size < len(X) else len(X) - i
                inputs = torch.tensor(X[i:i+mbatch], dtype=torch.float32).to(device)
                
                optimizer.zero_grad()
                outputs = self.model(inputs)
                
                loss = criterion(outputs, inputs)
                loss.backward()
                optimizer.step()
                   
                
            print(f"Epoch {epoch+1}/{self.max_epochs}")
            
        return self
            
    def normalize(self, arr):
        return (arr - arr.min())/(arr.max() - arr.min())
    
    def predict(self, X):
        crt = nn.MSELoss()
        
        self.model.eval()
        
        input_dim = self.model.get_input_dim()
        
        X_tensor = torch.tensor(X, dtype=torch.float32).to(device)
            
        if X_tensor.shape[1] < input_dim:
            X_tensor = nn.functional.pad(X_tensor, (0, input_dim - X_tensor.shape[1])).to(device)
            
        if X_tensor.shape[1] > input_dim:
            X_tensor = X_tensor[:, :input_dim]
        
        decoder_data = self.model.predict(X_tensor.to(device)).to('cpu')
        
        X_tensor = X_tensor.to('cpu').detach().numpy()
        
        decoder_data = decoder_data.to('cpu').detach().numpy()
        rd = []
        
        for i in range(decoder_data.shape[0]):
            dd = torch.tensor(decoder_data[i], dtype=torch.float32).to('cpu')
            xt = torch.tensor(X_tensor[i], dtype=torch.float32).to('cpu')
            local_loss = crt(dd, xt)
            
            local_loss = local_loss.item()
            rd.append(local_loss)
        
        
        rd = np.array(rd)
        rd = self.normalize(rd)
        
        rd = self.threshold_cmp(rd, self.threshold)
        rd = rd.astype(int)
        
        return rd
    
    def predict2(self, X):
        crt = nn.MSELoss()
        
        self.model.eval()
        
        input_dim = self.model.get_input_dim()
        
        X_tensor = torch.tensor(X, dtype=torch.float32).to(device)
            
        if X_tensor.shape[1] < input_dim:
            X_tensor = nn.functional.pad(X_tensor, (0, input_dim - X_tensor.shape[1])).to(device)
            
        if X_tensor.shape[1] > input_dim:
            X_tensor = X_tensor[:, :input_dim]
        
        decoder_data = self.model.predict(X_tensor.to(device)).to('cpu')
        
        X_tensor = X_tensor.to('cpu').detach().numpy()
        
        decoder_data = decoder_data.to('cpu').detach().numpy()
        rd = []
        
        for i in range(decoder_data.shape[0]):
            dd = torch.tensor(decoder_data[i], dtype=torch.float32).to('cpu')
            xt = torch.tensor(X_tensor[i], dtype=torch.float32).to('cpu')
            local_loss = crt(dd, xt)
            
            local_loss = local_loss.item()
            rd.append(local_loss)
        
        
        rd = np.array([[yp, yp**2] for yp in rd])
        # rd = self.normalize(rd)
        
        # rd = self.threshold_cmp(rd, self.threshold)
        # rd = rd.astype(int)
        
        return rd

## Train

In [19]:
ae_model = AEModelWrapper(Autoencoder(input_dim=gl.input_dim), max_epochs=8,threshold=0.005, threshold_cmp_mode='min')

## Process for autoencoder

In [20]:
torch.cuda.empty_cache()

ae_model.fit(X_gram, y)

  inputs = torch.tensor(X[i:i+mbatch], dtype=torch.float32).to(device)


Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8


## Finetune

In [21]:
class SVCModelWrapper(TransformerMixin):
    def __init__(self, model):
        self.model = model

    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        X = self.model.predict2(X)
        return X
    
ae_svc_pipeline = make_pipeline(SVCModelWrapper(ae_model), StandardScaler(), SVC(kernel='rbf', gamma='auto'))

ae_svc_pipeline.fit(X_gram_finetune, y_train_finetune)

## Test

In [22]:
y_pred = ae_svc_pipeline.predict(X_gram_test)

print('Classification report:\n', classification_report(y_test_2, y_pred))

# P# Print all misclassified
falses = 0
falert = 0
nonalert = 0
falerts = []
nonalerts=[]
ppp= []
qqq = []
for i in range(len(y_test_2)):
    if y_test_2[i] != y_pred[i]:
        falses += 1
        if y_pred[i] == 0:
            falert += 1
            falerts.append(X_test_2[i])
        else:
            nonalert += 1
            nonalerts.append(X_test_2[i])
        # print(f"'{X[i]}'", y[i], y_pred[i])
    if y_test_2[i] == 1 and y_pred[i] == 1:
        ppp.append(X[i])
    if y_test_2[i] == 0 and y_pred[i] == 0:
        qqq.append(X[i])
# print(falses)
alerts = len([d for d in y_test_2 if d == 0])
good = len([d for d in y_test_2 if d == 1])

print(falert)
print(good)
print(falert/good * 100)
print(len(ppp) / good * 100)
print(falerts[:10])
print('=======')
print(nonalert)
print(alerts)
print(nonalert/alerts * 100)
print(len(qqq) / alerts * 100)
print(nonalerts[:10])

Classification report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00     10065
           1       1.00      1.00      1.00     28355

    accuracy                           1.00     38420
   macro avg       1.00      1.00      1.00     38420
weighted avg       1.00      1.00      1.00     38420

0
28355
0.0
100.0
[]
0
10065
0.0
100.0
[]


## Save the model

In [23]:
with open('autoencoder_all.pkl', 'wb') as f:
    pickle.dump(ae_svc_pipeline, f)

## Export data about the model

In [25]:
roc_auc = roc_auc_score(y_test_2, y_pred)
print(f'ROC AUC: {roc_auc}')

fpr, tpr, _ = roc_curve(y_test_2, y_pred)

roc_auc = auc(fpr, tpr)

TP = 0
TN = 0
FP = 0
FN = 0

for i in range(len(y_test_2)):
    if y_test_2[i] == y_pred[i]:
        # True
        if y_test_2[i] == 1:
            TP += 1
        else:
            TN += 1
    else:
        # False
        if y_pred[i] == 1:
            FP += 1
        else:
            FN += 1
            
accuracy = (TP + TN)/(TP + TN + FN + FP)

precision1 = TP / (TP + FP)
precision0 = TN / (TN + FN)

recall1 = TP / (TP + FN)
recall0 = TN / (TN + FP)

f11 = 2 * (precision1 * recall1)/(precision1 + recall1)
f10 = 2 * (precision0 * recall0)/(precision0 + recall0)

all_attack = len([d for d in y_test_2 if d == 0])

lower_ae = {
    'roc_auc': roc_auc,
    'accuracy': accuracy,
    'benign':{
        'precision': precision1,
        'recall': recall1,
        'f1': f11
    },
    'attack':{
        'precision': precision0,
        'recall': recall0,
        'f1': f10
    },
    'tp': TP,
    'tn': TN,
    'fp': FP,
    'fn': FN,
    'false_alerts': FN,
    'attack_passed': (FP/all_attack) * 100
}

import json
print(json.dumps(lower_ae, indent=4))

with open('autoencoder_all.json', 'w') as f:
    json.dump(lower_ae, f, indent=4)

ROC AUC: 1.0
{
    "roc_auc": 1.0,
    "accuracy": 1.0,
    "benign": {
        "precision": 1.0,
        "recall": 1.0,
        "f1": 1.0
    },
    "attack": {
        "precision": 1.0,
        "recall": 1.0,
        "f1": 1.0
    },
    "tp": 28355,
    "tn": 10065,
    "fp": 0,
    "fn": 0,
    "false_alerts": 0,
    "attack_passed": 0.0
}
