<a href="https://colab.research.google.com/github/fpsomad/fpsomad/blob/main/Darknet.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [115]:
# εισαγωγή βιβλιοθηκών
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier


In [131]:
# Load the DarkNet dataset from the course repository and print its schema.
DATASET_URL = 'https://raw.githubusercontent.com/kdemertzis/EKPA/main/Data/DarkNet.csv'
# low_memory=False makes pandas read the file in one pass so column dtypes
# are inferred from the whole column rather than per chunk.
data = pd.read_csv(DATASET_URL, low_memory=False)
data.info()





<class 'pandas.core.frame.DataFrame'>
RangeIndex: 68580 entries, 0 to 68579
Data columns (total 83 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   Src_IP                      68580 non-null  object 
 1   Src_Port                    68580 non-null  int64  
 2   Dst_IP                      68580 non-null  object 
 3   Dst_Port                    68580 non-null  int64  
 4   Protocol                    68580 non-null  int64  
 5   Flow_Duration               68580 non-null  int64  
 6   Total_Fwd_Packet            68580 non-null  int64  
 7   Total_Bwd_packets           68580 non-null  int64  
 8   Total_Length_of_Fwd_Packet  68580 non-null  int64  
 9   Total_Length_of_Bwd_Packet  68580 non-null  int64  
 10  Fwd_Packet_Length_Max       68580 non-null  int64  
 11  Fwd_Packet_Length_Min       68580 non-null  int64  
 12  Fwd_Packet_Length_Mean      68580 non-null  float64
 13  Fwd_Packet_Length_Std       685

In [117]:
# Unify the inconsistent spellings of the DarkNet service names in Label-2,
# then show how many flows fall under each service.
service_name_fixes = {
    'AUDIO-STREAMING': 'Audio-Streaming',
    'Video-streaming': 'Video-Streaming',
    'File-transfer': 'File-Transfer',
}
data['Label-2'] = data['Label-2'].replace(service_name_fixes)
data.groupby('Label-2').size()

Label-2
Audio-Streaming     7539
Browsing            5192
Chat               11478
Email               6145
File-Transfer      11182
P2P                13711
VOIP                3566
Video-Streaming     9767
dtype: int64

In [119]:
# Derive the binary target "Class": 0 (normal) for Non-Tor / NonVPN traffic,
# 1 (DarkNet) for Tor / VPN traffic.
traffic_label = data['Label-1']
is_normal = (traffic_label == 'Non-Tor') | (traffic_label == 'NonVPN')
is_darknet = (traffic_label == 'Tor') | (traffic_label == 'VPN')

# np.select falls back to 0 when no condition matches, which would silently
# label any unexpected Label-1 value as normal traffic. Use an explicit
# sentinel instead and fail loudly if it ever shows up.
class_codes = np.select([is_normal, is_darknet], [0, 1], default=-1)
unexpected_labels = traffic_label[class_codes == -1].unique()
if len(unexpected_labels) > 0:
    raise ValueError(f"Unexpected Label-1 values: {list(unexpected_labels)}")

data['Class'] = class_codes
data.groupby('Class').size()



Class
0    53671
1    14909
dtype: int64

In [120]:
# Count missing (NaN / None) entries in every column of the dataset.
missing_per_column = data.isna().sum()
missing_per_column

Src_IP      0
Src_Port    0
Dst_IP      0
Dst_Port    0
Protocol    0
           ..
Idle_Max    0
Idle_Min    0
Label-1     0
Label-2     0
Class       0
Length: 84, dtype: int64

In [121]:
# Drop the IP address columns (Src_IP, Dst_IP), Flow_Bytes/s, and the two raw
# label columns (Label-1, Label-2); the 'Class' column stays in the frame.
columns_to_drop = ['Src_IP', 'Dst_IP', 'Flow_Bytes/s', 'Label-1', 'Label-2']
data = data.drop(columns=columns_to_drop)

In [130]:
# Log-scale the flow features into data0 (ports and protocol are left out).
# - 'Class' is the target and must not be transformed: log(1) == 0 would
#   collapse every DarkNet label into class 0.
# - Strictly positive values become log(x) and zeros stay 0, as before.
#   Negative values now also map to 0 instead of NaN, which
#   DecisionTreeClassifier refuses to fit on.
# - The transform is vectorised instead of applying a lambda per element.
features = data.drop(['Src_Port', 'Dst_Port', 'Protocol', 'Class'], axis=1).astype(float)

# Replace non-positive entries with 1 so that np.log maps them to exactly 0;
# existing NaNs are left alone (NaN <= 0 is False) and stay NaN.
log_features = np.log(features.mask(features <= 0, 1.0))

# Re-attach the untouched target and shuffle the rows reproducibly.
data0 = log_features.assign(Class=data['Class']).sample(frac=1, random_state=42)
data0.head()







Unnamed: 0,Flow_Duration,Total_Fwd_Packet,Total_Bwd_packets,Total_Length_of_Fwd_Packet,Total_Length_of_Bwd_Packet,Fwd_Packet_Length_Max,Fwd_Packet_Length_Min,Fwd_Packet_Length_Mean,Fwd_Packet_Length_Std,Bwd_Packet_Length_Max,...,Fwd_Seg_Size_Min,Active_Mean,Active_Std,Active_Max,Active_Min,Idle_Mean,Idle_Std,Idle_Max,Idle_Min,Class
40131,2.258184,-0.366513,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.097189,0,0,0,0,0.0,0.0,0.0,0.0,0
22495,2.561028,-0.366513,0.0,1.330832,0.0,1.128508,1.128508,1.128508,0.0,0.0,...,0.732099,0,0,0,0,0.0,0.0,0.0,0.0,0
30235,2.571609,1.704233,1.276345,2.547193,0.0,1.985981,0.0,1.984311,1.615755,0.0,...,1.097189,0,0,0,0,3.552528,2.357712,3.552528,3.552528,0
57662,2.908524,0.910235,0.834032,1.915134,1.732715,1.832268,0.0,1.459298,1.610635,1.611563,...,1.242925,0,0,0,0,3.552435,2.832063,3.552435,3.552435,0
40488,2.408755,0.094048,-0.366513,1.439718,0.0,1.439718,0.0,1.13812,1.300247,0.0,...,1.097189,0,0,0,0,3.552908,2.356918,3.552908,3.552908,0


In [123]:
# Shuffle the rows of the dataset. A fixed random_state makes the row order
# (and everything derived from it) reproducible across kernel restarts.
data = data.sample(frac=1, random_state=42)
data.head()

Unnamed: 0,Src_Port,Dst_Port,Protocol,Flow_Duration,Total_Fwd_Packet,Total_Bwd_packets,Total_Length_of_Fwd_Packet,Total_Length_of_Bwd_Packet,Fwd_Packet_Length_Max,Fwd_Packet_Length_Min,...,Fwd_Seg_Size_Min,Active_Mean,Active_Std,Active_Max,Active_Min,Idle_Mean,Idle_Std,Idle_Max,Idle_Min,Class
47693,10.711302,10.315994,1.791759,16.891835,2.079442,2.079442,6.572283,7.189168,6.464588,0.0,...,2.995732,0,0,0,0,34.840546,33.595122,34.914655,16.804061,0.0
36675,10.727773,10.847646,1.791759,15.314198,1.386294,1.386294,5.153292,0.0,4.672829,0.0,...,2.995732,0,0,0,0,34.914689,7.38736,34.914689,34.914689,0.0
52406,10.832912,3.970292,2.833213,12.695232,0.0,0.0,4.418841,4.59512,4.418841,4.418841,...,2.079442,0,0,0,0,0.0,0.0,0.0,0.0,0.0
27345,4.382027,10.408406,1.791759,1.609438,0.0,0.0,0.0,0.0,0.0,0.0,...,2.995732,0,0,0,0,0.0,0.0,0.0,0.0,0.0
55889,10.405747,6.09357,1.791759,18.283907,1.94591,1.386294,7.987185,6.976348,6.888572,0.0,...,3.465736,0,0,0,0,34.894988,17.275744,34.894988,34.894988,0.0


In [None]:
# Split the frame into the feature matrix X (every column except 'Class')
# and the target vector y (the 'Class' column), then show their shapes.
target_column = 'Class'
X = data.drop(columns=[target_column])
y = data[target_column]
(X.shape, y.shape)


In [125]:
# Hold out 20% of the samples for testing and train on the remaining 80%.
# random_state pins the split so it is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

((54864, 78), (13716, 78), (54864,), (13716,))

In [126]:
# Decision Tree classifier.
# random_state is fixed so that repeated fits give the same tree; without it
# scikit-learn's random feature permutation at each split can change the
# result from run to run.
tree = DecisionTreeClassifier(max_depth=5, random_state=42)
tree.fit(X_train, y_train)

# Predict on both partitions to compare training and held-out accuracy.
y_test_tree = tree.predict(X_test)
y_train_tree = tree.predict(X_train)

acc_train_tree = accuracy_score(y_train, y_train_tree)
acc_test_tree = accuracy_score(y_test, y_test_tree)

print(f"Decision Tree: Accuracy on training Data: {acc_train_tree:.3f}")
print(f"Decision Tree: Accuracy on test Data: {acc_test_tree:.3f}")



ValueError: Input X contains NaN.
DecisionTreeClassifier does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values