<h1>Binary classification for domain names in dns tunnel queries</h1>
<h3>Based on https://www.kaggle.com/code/danielgraham1997/dns-tunneling-detection</h3>

In [1]:
import numpy as np 
import pandas as pd
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/dns-tunneling-queries-classification/validating.csv
/kaggle/input/dns-tunneling-queries-classification/training.csv


In [2]:
import math
from sklearn.model_selection import train_test_split,cross_val_score 
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, precision_score, f1_score, recall_score, roc_auc_score
from sklearn.utils import shuffle

<h3>Calculating entropy for domain name</h3>

In [3]:
def calculate_entropy(text):
    if not text: 
        return 0 
    entropy = 0
    for x in range(256): 
        p_x = float(text.count(chr(x)))/len(text) 
        if p_x > 0: 
            entropy += - p_x*math.log(p_x, 2) 
    return entropy

In [4]:
training_data = pd.read_csv('/kaggle/input/dns-tunneling-queries-classification/training.csv', names=['label','query'])
validating_data = pd.read_csv('/kaggle/input/dns-tunneling-queries-classification/validating.csv', names=['label','query'])

In [5]:
training_data.head(5)

Unnamed: 0,label,query
0,1,q+Z8AnwaBA.hidemyself.org.
1,1,q+Z8A3wbBA.hidemyself.org.
2,1,q+Z8BHwcBA.hidemyself.org.
3,1,q+Z8BXwdBA.hidemyself.org.
4,1,q+Z8BnweCORdAGL4+W7DB5xH1cUwcwM1gejAQoJF8hbs2J...


In [6]:
entropy_train_vals = []

for query in training_data['query']:
    entropy = calculate_entropy(query)
    entropy_train_vals.append(entropy)
    
training_data['entropy'] = entropy_train_vals

In [7]:
entropy_test_vals = []

for query in validating_data['query']:
    entropy = calculate_entropy(query)
    entropy_test_vals.append(entropy)

validating_data['entropy'] = entropy_test_vals

In [8]:
training_data[training_data['label']==1].head(10)

Unnamed: 0,label,query,entropy
0,1,q+Z8AnwaBA.hidemyself.org.,4.363713
1,1,q+Z8A3wbBA.hidemyself.org.,4.363713
2,1,q+Z8BHwcBA.hidemyself.org.,4.363713
3,1,q+Z8BXwdBA.hidemyself.org.,4.28679
4,1,q+Z8BnweCORdAGL4+W7DB5xH1cUwcwM1gejAQoJF8hbs2J...,5.752803
5,1,q+Z8B3wfBA.hidemyself.org.,4.28679
6,1,q+Z8CHwgCMyPHABMWPwbfizHv1WcH9RclfrtcCTQNMck6E...,5.726079
7,1,q+Z8CXwhCDQOp1pQiGYfISl7BqfLNQCo7/oMOMCNQ+P6ea...,5.739535
8,1,q+Z8CnwiCIaRuw9fzE/xnkiXw4i0YO/U0raXZAH/IHNFqz...,5.871244
9,1,q+Z8C3wjCOnPuA7KYEVrik+hWBbo1AYv1VT8DUfpXISQhj...,5.806959


In [9]:
X_train = training_data['entropy'] # the training input entropy
Y_train = training_data['label']   # the corresponding classifying label for training

X_train.head()

0    4.363713
1    4.363713
2    4.363713
3    4.286790
4    5.752803
Name: entropy, dtype: float64

In [10]:
X_train = X_train.values.reshape(-1, 1)
Y_train = Y_train.values.reshape(-1, 1).ravel()


In [11]:
lgs_model = LogisticRegression(max_iter=500)

<h3>Testing metrics only on train data</h3>

In [12]:
accuracy = cross_val_score(lgs_model, X_train, Y_train, cv=5,  scoring='accuracy')
precision = cross_val_score(lgs_model, X_train, Y_train, cv=5, scoring='precision')
recall = cross_val_score(lgs_model, X_train, Y_train, cv=5, scoring='recall')
f1 = cross_val_score(lgs_model, X_train, Y_train, cv=5, scoring='f1')
roc_auc = cross_val_score(lgs_model, X_train, Y_train, cv=5, scoring='roc_auc')

In [13]:
print("Accuracy", accuracy)
print("Precision", precision)
print("Recall", recall)
print("F1", f1)
print("ROC AUC", roc_auc)

Accuracy [0.97466667 0.97466667 0.97333333 0.95833333 0.66833333]
Precision [0.96930533 0.96930533 0.97346939 0.96108634 0.97952218]
Recall [1.         1.         0.99375    0.98791667 0.59791667]
F1 [0.98441345 0.98441345 0.98350515 0.97431683 0.74256145]
ROC AUC [1.         0.99976528 0.99607014 0.98785556 0.96457847]


In [14]:
X_train, Y_train = shuffle(X_train, Y_train)
lgs_model.fit(X_train, Y_train)

LogisticRegression(max_iter=500)

In [15]:
X_test = validating_data['entropy']      # the test entropy for testing
Y_test = validating_data['label']       # the expected corresponding Label after training
Y_test.values.reshape(-1,1).ravel()

X_test  = X_test.values.reshape(-1, 1)

Y_pred = lgs_model.predict(X_test)

<h3>Metrics on validating data</h3>

In [16]:
print("Accuracy", accuracy_score(Y_test, Y_pred))
print("Precision", precision_score(Y_test, Y_pred))
print("Recall", recall_score(Y_test, Y_pred))
print("F1", f1_score(Y_test, Y_pred))
print("ROC AUC", roc_auc_score(Y_test, Y_pred))

Accuracy 0.9628
Precision 0.9653489507076622
Recall 0.989
F1 0.9770313657693257
ROC AUC 0.9235


In [17]:
print(classification_report(Y_test, Y_pred))

              precision    recall  f1-score   support

           0       0.95      0.86      0.90      1000
           1       0.97      0.99      0.98      4000

    accuracy                           0.96      5000
   macro avg       0.96      0.92      0.94      5000
weighted avg       0.96      0.96      0.96      5000



<h3>Export</h3>

In [19]:
import pickle

pickle.dump(lgs_model, open("/kaggle/working/DNS_tunnel_domains_classificator.pkl", "wb"))