In [32]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import RobustScaler
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [33]:
# Load the dataset
url = "https://raw.githubusercontent.com/kdemertzis/EKPA/main/Data/DarkNet.csv"
df = pd.read_csv(url)

  df = pd.read_csv(url)


In [34]:
# Preprocess the data
X = df.drop(["Label-1","Label-2"], axis=1)  # features
y = df["Label-1"]  # target variable

In [36]:
# Scale the data using StandardScaler
numerical_cols = df.select_dtypes(include=['int64']).columns
scaler = RobustScaler()
X_scaled = scaler.fit_transform(df[numerical_cols])

In [37]:
# Split the data into training and testing sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

In [38]:
# Train a Random Forest Classifier model
rfc = RandomForestClassifier(n_estimators=100, random_state=42)
rfc.fit(X_train, y_train)

In [39]:
# Evaluate the model
y_pred = rfc.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:")
print(classification_report(y_test, y_pred))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

Accuracy: 0.9720034995625547
Classification Report:
              precision    recall  f1-score   support

     Non-Tor       1.00      1.00      1.00      6159
      NonVPN       0.95      0.97      0.96      4590
         Tor       0.98      0.88      0.92       279
         VPN       0.95      0.93      0.94      2688

    accuracy                           0.97     13716
   macro avg       0.97      0.94      0.95     13716
weighted avg       0.97      0.97      0.97     13716

Confusion Matrix:
[[6154    1    0    4]
 [   7 4440    6  137]
 [   0   34  245    0]
 [   7  188    0 2493]]


In [40]:
# Use the trained model to detect anomalies
def detect_anomalies(X_new):
    predictions = rfc.predict(X_new)
    anomalies = [i for i, x in enumerate(predictions) if x == 1]
    # assume label 1 is anomaly
    return anomalies

In [72]:
print(df.columns)

Index(['Src_IP', 'Src_Port', 'Dst_IP', 'Dst_Port', 'Protocol', 'Flow_Duration',
       'Total_Fwd_Packet', 'Total_Bwd_packets', 'Total_Length_of_Fwd_Packet',
       'Total_Length_of_Bwd_Packet', 'Fwd_Packet_Length_Max',
       'Fwd_Packet_Length_Min', 'Fwd_Packet_Length_Mean',
       'Fwd_Packet_Length_Std', 'Bwd_Packet_Length_Max',
       'Bwd_Packet_Length_Min', 'Bwd_Packet_Length_Mean',
       'Bwd_Packet_Length_Std', 'Flow_Bytes/s', 'Flow_Packets/s',
       'Flow_IAT_Mean', 'Flow_IAT_Std', 'Flow_IAT_Max', 'Flow_IAT_Min',
       'Fwd_IAT_Total', 'Fwd_IAT_Mean', 'Fwd_IAT_Std', 'Fwd_IAT_Max',
       'Fwd_IAT_Min', 'Bwd_IAT_Total', 'Bwd_IAT_Mean', 'Bwd_IAT_Std',
       'Bwd_IAT_Max', 'Bwd_IAT_Min', 'Fwd_PSH_Flags', 'Bwd_PSH_Flags',
       'Fwd_URG_Flags', 'Bwd_URG_Flags', 'Fwd_Header_Length',
       'Bwd_Header_Length', 'Fwd_Packets/s', 'Bwd_Packets/s',
       'Packet_Length_Min', 'Packet_Length_Max', 'Packet_Length_Mean',
       'Packet_Length_Std', 'Packet_Length_Variance', 'FIN_Flag

In [73]:
# Create a new dataset with the same feature names as the original dataset
new_data = pd.DataFrame({

    'Src_IP': ['192.168.1.1', '192.168.1.2', '192.168.1.3', '192.168.1.4', '192.168.1.5'],
    'Src_Port': [80, 443, 22, 8080, 21],
    'Dst_IP': ['8.8.8.8', '8.8.4.4', '1.1.1.1', '4.4.4.4', '2.2.2.2'],
    'Dst_Port': [53, 80, 443, 22, 21],
    'Protocol': ['TCP', 'UDP', 'TCP', 'UDP', 'TCP'],
    'Flow_Duration': [10.2, 5.1, 3.4, 2.1, 1.5],
    'Total_Fwd_Packet': [100, 50, 20, 10, 5],
    'Total_Bwd_packets': [50, 20, 10, 5, 2],
    'Total_Length_of_Fwd_Packet': [1000, 500, 200, 100, 50],
    'Total_Length_of_Bwd_Packet': [500, 200, 100, 50, 20],
    'Fwd_Packet_Length_Max': [1500, 1000, 500, 200, 100],
    'Fwd_Packet_Length_Min': [50, 20, 10, 5, 2],
    'Fwd_Packet_Length_Mean': [500, 200, 100, 50, 20],
    'Fwd_Packet_Length_Std': [100, 50, 20, 10, 5],
    'Bwd_Packet_Length_Max': [1000, 500, 200, 100, 50],
    'Bwd_Packet_Length_Min': [20, 10, 5, 2, 1],
    'Bwd_Packet_Length_Mean': [200, 100, 50, 20, 10],
    'Flow_Bytes/s': [1000, 500, 200, 100, 50],
    'Flow_Packets/s': [50, 20, 10, 5, 2],
    'Flow_IAT_Mean': [10, 5, 2, 1, 0.5],
    'Flow_IAT_Std': [5, 2, 1, 0.5, 0.2],
    'Flow_IAT_Max': [20, 10, 5, 2, 1],
    'Flow_IAT_Min': [0.1, 0.05, 0.02, 0.01, 0.005],
    'Fwd_IAT_Total': [100, 50, 20, 10, 5],
    'Fwd_IAT_Mean': [10, 5, 2, 1, 0.5],
    'Fwd_IAT_Std': [5, 2, 1, 0.5, 0.2],
    'Fwd_IAT_Max': [20, 10, 5, 2, 1],
    'Fwd_IAT_Min': [0.1, 0.05, 0.02, 0.01, 0.005],
    'Bwd_IAT_Total': [50, 20, 10, 5, 2],
    'Bwd_IAT_Mean': [5, 2, 1, 0.5, 0.2],
    'Bwd_IAT_Std': [2, 1, 0.5, 0.2, 0.1],
    'Bwd_IAT_Max': [10, 5, 2, 1, 0.5],
    'Bwd_IAT_Min': [0.05, 0.02, 0.01, 0.005, 0.002],
    'Fwd_PSH_Flags': [10, 5, 2, 1, 0],
    'Bwd_PSH_Flags': [5, 2, 1, 0, 0],
    'Fwd_URG_Flags': [2, 1, 0, 0, 0],
    'Bwd_URG_Flags': [1, 0, 0, 0, 0],
    'Fwd_Header_Length': [50, 20, 10, 5, 2],
    'Bwd_Header_Length': [20, 10, 5, 2, 1],
    'Fwd_Packets/s': [100, 50, 20, 10, 5],
    'Bwd_Packets/s': [50, 20, 10, 5, 2],
    'Packet_Length_Min': [50, 20, 10, 5, 2],
    'Packet_Length_Max': [1500, 1000, 500, 200, 100],
    'Packet_Length_Mean': [500, 200, 100, 50, 20],
    'Packet_Length_Std': [100, 50, 20, 10, 5],
    'Packet_Length_Variance': [200, 100, 50, 20, 10],
    'FIN_Flag_Count': [10, 5, 2, 1, 0],
    'SYN_Flag_Count': [5, 2, 1, 0, 0],
    'RST_Flag_Count': [2, 1, 0, 0, 0],
    'PSH_Flag_Count': [10, 5, 2, 1, 0],
    'ACK_Flag_Count': [50, 20, 10, 5, 2],
    'URG_Flag_Count': [2, 1, 0, 0, 0],
    'CWE_Flag_Count': [1, 0, 0, 0, 0],
    'ECE_Flag_Count': [1, 0, 0, 0, 0],
    'Down/Up_Ratio': [2, 1, 0.5, 0.2, 0.1],
    'Average_Packet_Size': [500, 200, 100, 50, 20],
    'Fwd_Segment_Size_Avg': [500, 200, 100, 50, 20],
    'Bwd_Segment_Size_Avg': [200, 100, 50, 20, 10],
    'Fwd_Bytes/Bulk_Avg': [1000, 500, 200, 100, 50],
    'Fwd_Packet/Bulk_Avg': [50, 20, 10, 5, 2],
    'Fwd_Bulk_Rate_Avg': [100, 50, 20, 10, 5],
    'Bwd_Bytes/Bulk_Avg': [500, 200, 100, 50, 20],
    'Bwd_Packet/Bulk_Avg': [20, 10, 5, 2, 1],
    'Bwd_Bulk_Rate_Avg': [50, 20, 10, 5, 2],
    'Subflow_Fwd_Packets': [100, 50, 20, 10, 5],
    'Subflow_Fwd_Bytes': [1000, 500, 200, 100, 50],
    'Subflow_Bwd_Packets': [50, 20, 10, 5, 2],
    'Subflow_Bwd_Bytes': [500, 200, 100, 50, 20],
    'FWD_Init_Win_Bytes': [1000, 500, 200, 100, 50],
    'Bwd_Init_Win_Bytes': [500, 200, 100, 50, 20],
    'Fwd_Act_Data_Pkts': [50, 20, 10, 5, 2],
    'Fwd_Seg_Size_Min': [50, 20, 10, 5, 2],
    'Active_Mean': [10, 5, 2, 1, 0.5],
    'Active_Std': [5, 2, 1, 0.5, 0.2],
    'Active_Max': [20, 10, 5, 2, 1],
    'Active_Min': [0.1, 0.05, 0.02, 0.01, 0.005],
    'Idle_Mean': [5, 2, 1, 0.5, 0.2],
    'Idle_Std': [2, 1, 0.5, 0.2, 0.1],
    'Idle_Max': [10, 5, 2, 1, 0.5],
    'Idle_Min': [0.05, 0.02, 0.01, 0.005, 0]
})

new_data_scaled = scaler.transform(new_data)

# Detect anomalies
anomalies = detect_anomalies(new_data_scaled)
print("Anomalies:", anomalies)

ValueError: The feature names should match those that were passed during fit.
Feature names unseen at fit time:
- Average_Packet_Size
- Bwd_IAT_Mean
- Bwd_IAT_Std
- Bwd_Packet_Length_Mean
- Bwd_Packets/s
- ...
