In [1]:
import pandas as pd
import numpy as np
import joblib
import warnings

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from sklearn.utils import class_weight

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, MaxPooling1D, GlobalAveragePooling1D, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.utils import to_categorical

# Suppress warnings
warnings.filterwarnings('ignore')
print("Libraries imported successfully.")

Libraries imported successfully.


In [None]:
DATA_PATH = 'CIC-Darknet2020.csv'
TARGET_LABELS = ['Tor', 'Non-Tor', 'VPN', 'NonVPN']

try:
    df = pd.read_csv(DATA_PATH)
    print(f"Dataset '{DATA_PATH}' loaded successfully.")
    print(f"Initial shape: {df.shape}")
except FileNotFoundError:
    print(f"Error: '{DATA_PATH}' not found.")
    raise

df.columns = [*df.columns[:-2], 'Label', 'Label_Type']

df.replace([np.inf, -np.inf], np.nan, inplace=True)
df.dropna(inplace=True)
print(f"Shape after dropping NaN/Inf: {df.shape}")

df_multi = df[df['Label'].isin(TARGET_LABELS)].copy()
print(f"Shape after filtering for {TARGET_LABELS}: {df_multi.shape}")

non_feature_cols = ['Flow ID', 'Src IP', 'Src Port', 'Dst IP', 'Dst Port', 'Protocol', 'Timestamp', 'Label', 'Label_Type']
X = df_multi.drop(columns=non_feature_cols)
y = df_multi['Label']

X = X.apply(pd.to_numeric)

print(f"\n--- Label Value Counts (Multi-Class) ---")
print(y.value_counts())
print(f"Features shape (X): {X.shape}")
print(f"Labels shape (y): {y.shape}")

Dataset 'CIC-Darknet2020.csv' loaded successfully.
Initial shape: (158616, 85)
Shape after dropping NaN/Inf: (158566, 85)
Shape after filtering for ['Tor', 'Non-Tor', 'VPN', 'NonVPN']: (158566, 85)

--- Label Value Counts (Multi-Class) ---
Label
Non-Tor    110394
NonVPN      23861
VPN         22919
Tor          1392
Name: count, dtype: int64
Features shape (X): (158566, 76)
Labels shape (y): (158566,)


In [None]:
le = LabelEncoder()
y_encoded = le.fit_transform(y)

y_ohe = to_categorical(y_encoded, num_classes=len(TARGET_LABELS))

print("\n--- Class Encoding Mapping ---")
label_mapping = {index: label for index, label in enumerate(le.classes_)}
for index, label in label_mapping.items():
    print(f"Class Index {index} -> {label} (Vector: {y_ohe[y_encoded == index][0]})")

n_classes = len(TARGET_LABELS)
print(f"\nTotal classes: {n_classes}")


--- Class Encoding Mapping ---
Class Index 0 -> Non-Tor (Vector: [1. 0. 0. 0.])
Class Index 1 -> NonVPN (Vector: [0. 1. 0. 0.])
Class Index 2 -> Tor (Vector: [0. 0. 1. 0.])
Class Index 3 -> VPN (Vector: [0. 0. 0. 1.])

Total classes: 4


In [None]:
X_train, X_test, y_train_ohe, y_test_ohe, y_train_encoded, y_test_encoded = train_test_split(
    X, y_ohe, y_encoded,  # Pass both OHE and encoded labels
    test_size=0.2,
    random_state=42,
    stratify=y_encoded
)

print(f"\nTraining set shape (X_train): {X_train.shape}")
print(f"Test set shape (X_test): {X_test.shape}")
print(f"Training labels shape (y_train_ohe): {y_train_ohe.shape}")
print(f"Test labels shape (y_test_ohe): {y_test_ohe.shape}")

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

joblib.dump(scaler, 'scaler-multi.pkl')
print("\nScaler object saved to 'scaler-multi.pkl'")


Training set shape (X_train): (126852, 76)
Test set shape (X_test): (31714, 76)
Training labels shape (y_train_ohe): (126852, 4)
Test labels shape (y_test_ohe): (31714, 4)

Scaler object saved to 'scaler-multi.pkl'


In [None]:
n_features = X_train_scaled.shape[1]
X_train_cnn = X_train_scaled.reshape((X_train_scaled.shape[0], n_features, 1))
X_test_cnn = X_test_scaled.reshape((X_test_scaled.shape[0], n_features, 1))

print(f"\nData reshaped for CNN.")
print(f"X_train_cnn shape: {X_train_cnn.shape}")
print(f"X_test_cnn shape: {X_test_cnn.shape}")


Data reshaped for CNN.
X_train_cnn shape: (126852, 76, 1)
X_test_cnn shape: (31714, 76, 1)


In [None]:
input_shape = (n_features, 1)

model = Sequential()
model.add(Conv1D(64, kernel_size=3, activation='relu', input_shape=input_shape, padding='same'))
model.add(MaxPooling1D(pool_size=2))
model.add(Conv1D(128, kernel_size=3, activation='relu', padding='same'))
model.add(MaxPooling1D(pool_size=2))
model.add(Conv1D(256, kernel_size=3, activation='relu', padding='same'))
model.add(GlobalAveragePooling1D())
model.add(Dropout(0.3))
model.add(Dense(64, activation='relu'))

model.add(Dense(n_classes, activation='softmax'))

model.compile(optimizer=Adam(learning_rate=0.0005),
              loss='categorical_crossentropy',
              metrics=['accuracy'])

model.summary()

In [None]:
class_weights = class_weight.compute_class_weight(
    'balanced',
    classes=np.unique(y_train_encoded),
    y=y_train_encoded
)
class_weights_dict = dict(enumerate(class_weights))

print(f"Class weights calculated to handle imbalance:")
for i in range(n_classes):
      print(f"Weight for class {i} ({label_mapping[i]}): {class_weights_dict[i]:.2f}")

early_stopping = EarlyStopping(monitor='val_accuracy',
                             patience=10,
                             mode='max',
                             restore_best_weights=True)

print("\nStarting model training...")
history = model.fit(X_train_cnn, y_train_ohe,
                    epochs=50,
                    batch_size=64,
                    validation_data=(X_test_cnn, y_test_ohe),
                    callbacks=[early_stopping],
                    class_weight=class_weights_dict,  # Apply weights
                    verbose=1)

print("Model training complete.")

Class weights calculated to handle imbalance:
Weight for class 0 (Non-Tor): 0.36
Weight for class 1 (NonVPN): 1.66
Weight for class 2 (Tor): 28.49
Weight for class 3 (VPN): 1.73

Starting model training...
Epoch 1/50
[1m1983/1983[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m34s[0m 17ms/step - accuracy: 0.7017 - loss: 0.7715 - val_accuracy: 0.8630 - val_loss: 0.4755
Epoch 2/50
[1m1983/1983[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m35s[0m 18ms/step - accuracy: 0.8897 - loss: 0.5046 - val_accuracy: 0.9206 - val_loss: 0.2816
Epoch 3/50
[1m1983/1983[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m32s[0m 16ms/step - accuracy: 0.9192 - loss: 0.4147 - val_accuracy: 0.9345 - val_loss: 0.2056
Epoch 4/50
[1m1983/1983[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m34s[0m 17ms/step - accuracy: 0.9278 - loss: 0.3673 - val_accuracy: 0.9342 - val_loss: 0.1981
Epoch 5/50
[1m1983/1983[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m31s[0m 16ms/step - accuracy: 0.9308 - loss: 0.3439 - val_

In [None]:
print("\n Baseline Multi-Class Model Evaluation ")
loss, accuracy = model.evaluate(X_test_cnn, y_test_ohe, verbose=0)

print(f"Test Accuracy: {accuracy * 100:.4f}%")
print(f"Test Loss: {loss:.4f}")

y_pred_probs = model.predict(X_test_cnn)

y_pred_encoded = np.argmax(y_pred_probs, axis=1)

target_names = le.classes_

print("\nClassification Report:")
print(classification_report(y_test_encoded, y_pred_encoded, target_names=target_names))

print("\nConfusion Matrix:")
print(confusion_matrix(y_test_encoded, y_pred_encoded))


 Baseline Multi-Class Model Evaluation 
Test Accuracy: 95.3585%
Test Loss: 0.1313
[1m992/992[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step

Classification Report:
              precision    recall  f1-score   support

     Non-Tor       1.00      0.99      0.99     22079
      NonVPN       0.86      0.85      0.85      4772
         Tor       0.82      0.87      0.85       279
         VPN       0.86      0.88      0.87      4584

    accuracy                           0.95     31714
   macro avg       0.88      0.90      0.89     31714
weighted avg       0.95      0.95      0.95     31714


Confusion Matrix:
[[21885   137     1    56]
 [   43  4073    48   608]
 [    0    27   244     8]
 [   19   520     5  4040]]


In [None]:
model.save('model-multi.h5')
print("\nFinal multi-class baseline model saved to 'model-multi.h5'")




Final multi-class baseline model saved to 'model-multi.h5'
