<h1>Binary classification for domain names in dns tunnel queries</h1>
<h3>Based on https://www.kaggle.com/code/danielgraham1997/dns-tunneling-detection</h3>

In [1]:
import numpy as np 
import pandas as pd
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/dns-tunneling-queries-classification/validating.csv
/kaggle/input/dns-tunneling-queries-classification/training.csv


In [2]:
import math
from sklearn.model_selection import train_test_split,cross_val_score 
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, precision_score, f1_score, recall_score, roc_auc_score
from sklearn.utils import shuffle

<h3>Calculating entropy for domain name</h3>

In [3]:
def calculate_entropy(text):
    if not text: 
        return 0 
    entropy = 0
    for x in range(256): 
        p_x = float(text.count(chr(x)))/len(text) 
        if p_x > 0: 
            entropy += - p_x*math.log(p_x, 2) 
    return entropy

In [4]:
def calculate_length(text):
    if not text:
        return 0
    return len(text)    

In [5]:
training_data = pd.read_csv('/kaggle/input/dns-tunneling-queries-classification/training.csv', names=['label','query'])
validating_data = pd.read_csv('/kaggle/input/dns-tunneling-queries-classification/validating.csv', names=['label','query'])

In [6]:
training_data.head(5)

Unnamed: 0,label,query
0,1,q+Z8AnwaBA.hidemyself.org.
1,1,q+Z8A3wbBA.hidemyself.org.
2,1,q+Z8BHwcBA.hidemyself.org.
3,1,q+Z8BXwdBA.hidemyself.org.
4,1,q+Z8BnweCORdAGL4+W7DB5xH1cUwcwM1gejAQoJF8hbs2J...


In [7]:
entropy_train_vals = []
length_train_vals = []

for query in training_data['query']:
    entropy = calculate_entropy(query)
    entropy_train_vals.append(entropy)
    length = calculate_length(query)
    length_train_vals.append(length)
    
training_data['entropy'] = entropy_train_vals
training_data['length'] = length_train_vals

In [8]:
entropy_test_vals = []
length_test_vals = []

for query in validating_data['query']:
    entropy = calculate_entropy(query)
    entropy_test_vals.append(entropy)
    length = calculate_length(query)
    length_test_vals.append(length)

validating_data['entropy'] = entropy_test_vals
validating_data['length'] = length_test_vals

In [9]:
training_data[training_data['label']==1].head(10)

Unnamed: 0,label,query,entropy,length
0,1,q+Z8AnwaBA.hidemyself.org.,4.363713,26
1,1,q+Z8A3wbBA.hidemyself.org.,4.363713,26
2,1,q+Z8BHwcBA.hidemyself.org.,4.363713,26
3,1,q+Z8BXwdBA.hidemyself.org.,4.28679,26
4,1,q+Z8BnweCORdAGL4+W7DB5xH1cUwcwM1gejAQoJF8hbs2J...,5.752803,194
5,1,q+Z8B3wfBA.hidemyself.org.,4.28679,26
6,1,q+Z8CHwgCMyPHABMWPwbfizHv1WcH9RclfrtcCTQNMck6E...,5.726079,194
7,1,q+Z8CXwhCDQOp1pQiGYfISl7BqfLNQCo7/oMOMCNQ+P6ea...,5.739535,194
8,1,q+Z8CnwiCIaRuw9fzE/xnkiXw4i0YO/U0raXZAH/IHNFqz...,5.871244,194
9,1,q+Z8C3wjCOnPuA7KYEVrik+hWBbo1AYv1VT8DUfpXISQhj...,5.806959,194


In [10]:
training_data[training_data['label']==0].head(10)

Unnamed: 0,label,query,entropy,length
6000,0,google.com.,2.663533,11
6001,0,facebook.com.,3.026987,13
6002,0,doubleclick.net.,3.5,16
6003,0,google-analytics.com.,3.689704,21
6004,0,akamaihd.net.,3.180833,13
6005,0,googlesyndication.com.,3.64125,22
6006,0,googleapis.com.,3.323231,15
6007,0,googleadservices.com.,3.558519,21
6008,0,facebook.net.,3.238901,13
6009,0,youtube.com.,3.084963,12


In [11]:
X_train = training_data[['entropy', 'length']] # the training input entropy
Y_train = training_data['label']   # the corresponding classifying label for training

X_train = X_train.values.reshape(-1, 2)
Y_train = Y_train.values.reshape(-1, 1).ravel()
X_train.shape

(15000, 2)

In [12]:
lgs_model = LogisticRegression(max_iter=500)

<h3>Testing metrics only on train data</h3>

In [13]:
accuracy = cross_val_score(lgs_model, X_train, Y_train, cv=5,  scoring='accuracy')
precision = cross_val_score(lgs_model, X_train, Y_train, cv=5, scoring='precision')
recall = cross_val_score(lgs_model, X_train, Y_train, cv=5, scoring='recall')
f1 = cross_val_score(lgs_model, X_train, Y_train, cv=5, scoring='f1')
roc_auc = cross_val_score(lgs_model, X_train, Y_train, cv=5, scoring='roc_auc')

In [14]:
print("Accuracy", accuracy)
print("Precision", precision)
print("Recall", recall)
print("F1", f1)
print("ROC AUC", roc_auc)

Accuracy [0.99666667 0.99766667 0.993      0.99333333 0.99433333]
Precision [0.99585062 0.99709182 0.99213902 0.99173554 0.99296649]
Recall [1.         1.         0.99916667 1.         1.        ]
F1 [0.997921   0.99854379 0.99564044 0.99585062 0.99647083]
ROC AUC [0.99939444 1.         0.99979549 0.99750382 0.9912375 ]


In [15]:
X_train, Y_train = shuffle(X_train, Y_train)
lgs_model.fit(X_train, Y_train)

LogisticRegression(max_iter=500)

In [16]:
X_test = validating_data[['entropy', 'length']]      # the test entropy for testing
Y_test = validating_data['label']       # the expected corresponding Label after training
Y_test.values.reshape(-1,1).ravel()

X_test  = X_test.values.reshape(-1, 2)

Y_pred = lgs_model.predict(X_test)

<h3>Metrics on validating data</h3>

In [17]:
print("Accuracy", accuracy_score(Y_test, Y_pred))
print("Precision", precision_score(Y_test, Y_pred))
print("Recall", recall_score(Y_test, Y_pred))
print("F1", f1_score(Y_test, Y_pred))
print("ROC AUC", roc_auc_score(Y_test, Y_pred))

Accuracy 0.9926
Precision 0.9908347783007183
Recall 1.0
F1 0.9953962921488118
ROC AUC 0.9814999999999999


In [18]:
print(classification_report(Y_test, Y_pred))

              precision    recall  f1-score   support

           0       1.00      0.96      0.98      1000
           1       0.99      1.00      1.00      4000

    accuracy                           0.99      5000
   macro avg       1.00      0.98      0.99      5000
weighted avg       0.99      0.99      0.99      5000



<h3>Export</h3>

In [19]:
import pickle

pickle.dump(lgs_model, open("/kaggle/working/DNS_tunnel_domains_classificator.pkl", "wb"))