In [1]:
import torch
import torch.nn as nn
import numpy as np
from sklearn import datasets
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import pandas as pd


In [2]:
# Here we are going to import the first benign dataset fron 'data' folder
benign = pd.read_csv('C:\\Users\\gaibo\\OneDrive\\Escritorio\\Python Projects\\DNS Traffic Control\\Data\\stateless_features-light_benign.pcap.csv')

# Now we are going to create another column in the dataset named 'class' which will have value 0 for all the rows
benign['class'] = 0 # This is because we assume that all the data dns traffic in that dataset is benign

# Here we are going to import the first benign dataset fron 'data' folder
benign_2 = pd.read_csv('C:\\Users\\gaibo\\OneDrive\\Escritorio\\Python Projects\\DNS Traffic Control\\Data\\stateless_features-benign_1.pcap.csv')

# Now we are going to create another column in the dataset named 'class' which will have value 0 for all the rows
benign_2['class'] = 0 # This is because we assume that all the data dns traffic in that dataset is benign

# Now we are going to import the first malicious dataset from 'data' folder
malicious = pd.read_csv('C:\\Users\\gaibo\\OneDrive\\Escritorio\\Python Projects\\DNS Traffic Control\\Data\\stateless_features-light_compressed.pcap.csv')

# Now we are going to create another column in the dataset named 'class' which will have value 1 for all the rows
malicious['class'] = 1 # This is because we assume that all the data dns traffic in that dataset is malicious

# Now we are going to merge both the datasets
data_merged = pd.concat([benign, benign_2, malicious])

In [3]:
# To avoid bias we are going to shuffle the dataset
data = data_merged.sample(frac=1).reset_index(drop=True)

In [4]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()

# Encode the longest_word column
data['longest_word'] = le.fit_transform(data['longest_word'])

# Encode the sld column
data['sld'] = le.fit_transform(data['sld'])

In [5]:
# Before splitting the dataset, we have to clean the dataset. 
# Firs we are going to remove the columns "longest_word" and "sld"
data = data.drop(['timestamp'], axis=1)

# Then lets remove NaN values from the dataset
data = data.dropna()

# Print head of the dataset
data.head()

Unnamed: 0,FQDN_count,subdomain_length,upper,lower,numeric,entropy,special,labels,labels_max,labels_average,longest_word,sld,len,subdomain,class
0,26,9,0,10,10,2.742338,6,6,7,3.5,2,168,13,1,0
1,32,0,32,0,0,2.276268,0,1,32,32.0,10,798,33,0,1
2,14,0,0,13,0,2.952882,1,2,11,6.5,2694,4020,12,0,0
3,26,3,0,22,0,3.358857,4,3,19,8.0,10089,26700,23,1,0
4,15,3,0,13,0,2.417665,2,3,7,4.333333,11582,10115,11,1,0


In [6]:
# Now lets split the dataset into training and testing data
X = data.drop(['class'], axis=1)
y = data['class']
n_samples, n_features = X.shape

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

print(data.describe())

          FQDN_count  subdomain_length          upper          lower  \
count  202831.000000     202831.000000  202831.000000  202831.000000   
mean       18.843722          4.112108       0.510987      10.852739   
std         6.928431          4.076035       3.851880       3.551985   
min         2.000000          0.000000       0.000000       0.000000   
25%        13.000000          0.000000       0.000000      10.000000   
50%        19.000000          3.000000       0.000000      10.000000   
75%        25.000000          8.000000       0.000000      12.000000   
max        36.000000         29.000000      32.000000      34.000000   

             numeric        entropy        special         labels  \
count  202831.000000  202831.000000  202831.000000  202831.000000   
mean        4.106631       2.468462       3.373365       3.885170   
std         4.708304       0.461948       2.304147       1.862196   
min         0.000000       0.219195       0.000000       1.000000   
25%   

In [9]:
# scale
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

X_train = torch.from_numpy(X_train.astype(np.float32))
X_test = torch.from_numpy(X_test.astype(np.float32))
y_train = torch.from_numpy(y_train.to_numpy().astype(np.float32)) # convert y_train to ndarray
y_test = torch.from_numpy(y_test.to_numpy().astype(np.float32)) # convert y_test to ndarray

y_train = y_train.view(y_train.shape[0], 1)
y_test = y_test.view(y_test.shape[0], 1)

# 1) Model
# Linear model f = wx + b , sigmoid at the end
class Model(nn.Module):
    def __init__(self, n_input_features):
        super(Model, self).__init__()
        self.linear = nn.Linear(n_input_features, 1)

    def forward(self, x):
        y_pred = torch.sigmoid(self.linear(x))
        return y_pred

model = Model(n_features)

# 2) Loss and optimizer
num_epochs = 100
learning_rate = 0.01
criterion = nn.BCELoss()
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)

# 3) Training loop
for epoch in range(num_epochs):
    # Forward pass and loss
    y_pred = model(X_train)
    loss = criterion(y_pred, y_train)

    # Backward pass and update
    loss.backward()
    optimizer.step()

    # zero grad before new step
    optimizer.zero_grad()

    if (epoch+1) % 10 == 0:
        print(f'epoch: {epoch+1}, loss = {loss.item():.4f}')


with torch.no_grad():
    y_predicted = model(X_test)
    y_predicted_cls = y_predicted.round()
    acc = y_predicted_cls.eq(y_test).sum() / float(y_test.shape[0])
    print(f'accuracy: {acc.item():.4f}')

epoch: 10, loss = 0.7008
epoch: 20, loss = 0.6789
epoch: 30, loss = 0.6583
epoch: 40, loss = 0.6389
epoch: 50, loss = 0.6205
epoch: 60, loss = 0.6032
epoch: 70, loss = 0.5868
epoch: 80, loss = 0.5714
epoch: 90, loss = 0.5567
epoch: 100, loss = 0.5428
accuracy: 0.9395


In [30]:
# Now lets predict
y_pred = model(X_test)
y_pred = y_pred.detach().numpy()

# Now lets use threshold to classify the data in benign, suspicious and malicious
# List of results
results = []

# Loop through the predictions
for i in y_pred:
    if i < 0.4:
        results.append('Benign')
    elif i >= 0.4 and i < 0.7:
        results.append('Suspicious')
    else:
        results.append('Malicious')

print(results)


# List of results
results = []

# Loop through the predictions
for i in y_test:
    results.append(i)

print(results[0:10])

['Benign', 'Suspicious', 'Suspicious', 'Suspicious', 'Benign', 'Suspicious', 'Suspicious', 'Benign', 'Suspicious', 'Benign', 'Suspicious', 'Suspicious', 'Suspicious', 'Suspicious', 'Suspicious', 'Suspicious', 'Suspicious', 'Suspicious', 'Suspicious', 'Benign', 'Suspicious', 'Suspicious', 'Benign', 'Suspicious', 'Suspicious', 'Benign', 'Benign', 'Suspicious', 'Suspicious', 'Suspicious', 'Suspicious', 'Suspicious', 'Suspicious', 'Suspicious', 'Benign', 'Benign', 'Benign', 'Suspicious', 'Benign', 'Suspicious', 'Suspicious', 'Benign', 'Benign', 'Suspicious', 'Benign', 'Suspicious', 'Suspicious', 'Suspicious', 'Benign', 'Suspicious', 'Suspicious', 'Suspicious', 'Suspicious', 'Suspicious', 'Suspicious', 'Benign', 'Suspicious', 'Suspicious', 'Benign', 'Benign', 'Suspicious', 'Suspicious', 'Suspicious', 'Benign', 'Benign', 'Benign', 'Benign', 'Suspicious', 'Benign', 'Benign', 'Suspicious', 'Suspicious', 'Suspicious', 'Suspicious', 'Benign', 'Suspicious', 'Suspicious', 'Suspicious', 'Benign', '