In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder, RobustScaler
from imblearn.over_sampling import SMOTE
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, log_loss
from sklearn.utils.class_weight import compute_class_weight
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Flatten
from tensorflow.keras.callbacks import EarlyStopping, TensorBoard
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.regularizers import l2
import os
import datetime

In [2]:
# Step 1: Data Loading with the correct delimiter
file_path = r"C:\Users\Canela\Desktop\Project\CTU-IoT-Malware-Capture-34-1\bro\conn.log.labeled.csv"

df = pd.read_csv(file_path, delimiter='\t', comment='#', na_values='-', header=None, dtype={'service': str})

# Manually assign column names based on the expected structure
df.columns = [
    'ts', 'uid', 'id.orig_h', 'id.orig_p', 'id.resp_h', 'id.resp_p', 'proto', 'service',
    'duration', 'orig_bytes', 'resp_bytes', 'conn_state', 'local_orig', 'local_resp',
    'missed_bytes', 'history', 'orig_pkts', 'orig_ip_bytes', 'resp_pkts', 'resp_ip_bytes',
    'label'
]

# Check the unique values in the 'label' column
print("Final unique labels:")
print(df['label'].unique())
print("Number of unique labels:", len(df['label'].unique()))


Final unique labels:
['-   Benign   -' '-   Malicious   C&C'
 '-   Malicious   PartOfAHorizontalPortScan' '-   Malicious   DDoS']
Number of unique labels: 4


In [3]:
# Step 2: Handle Categorical Variables and Feature Engineering
# Encode label
label_encoder = LabelEncoder()
df['label_encoded'] = label_encoder.fit_transform(df['label'])

# Drop columns that might cause data leakage
df.drop(columns=['uid', 'ts', 'id.orig_h', 'id.resp_h'], inplace=True)

# One-Hot Encode categorical variables
categorical_features = ['proto', 'service', 'conn_state', 'local_orig', 'local_resp', 'history']
df = pd.get_dummies(df, columns=categorical_features, drop_first=True)

# Fill remaining NaN values if any
df.fillna(0, inplace=True)

# Split features and target
X = df.drop(['label', 'label_encoded'], axis=1)
y = df['label_encoded']

In [4]:
# Step 3: Handle Imbalance using SMOTE
smote = SMOTE()
X_res, y_res = smote.fit_resample(X, y)

In [5]:
# Step 4: Scale Features
scaler = RobustScaler()
X_res_scaled = scaler.fit_transform(X_res)

In [6]:
# Step 5: Stratified K-Fold Cross-Validation Setup
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cross_val_scores = []
roc_auc_scores = []
log_losses = []

# Model training and evaluation within each fold
for train_index, val_index in skf.split(X_res_scaled, y_res):
    X_train, X_val = X_res_scaled[train_index], X_res_scaled[val_index]
    y_train, y_val = y_res[train_index], y_res[val_index]

    # Compute class weights
    class_weights = compute_class_weight(class_weight='balanced', classes=np.unique(y_train), y=y_train)
    class_weights_dict = dict(enumerate(class_weights))

    # Step 6: Build and Compile Model
    model = Sequential([
        Flatten(input_shape=(X_train.shape[1],)),
        Dense(128, activation='relu', kernel_regularizer=l2(0.001)),
        Dropout(0.4),
        Dense(64, activation='relu', kernel_regularizer=l2(0.001)),
        Dropout(0.3),
        Dense(len(np.unique(y_res)), activation='softmax')
    ])

    model.compile(optimizer=Adam(learning_rate=0.001), loss='sparse_categorical_crossentropy', metrics=['accuracy'])

    # Setup callbacks for early stopping and tensorboard
    log_dir = os.path.join("train_logs", datetime.datetime.now().strftime("%Y%m%d-%H%M%S"))
    tensorboard_callback = TensorBoard(log_dir=log_dir, histogram_freq=1)
    early_stop_callback = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

    # Step 7: Model Training
    model.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=50, batch_size=256,
              class_weight=class_weights_dict, callbacks=[tensorboard_callback, early_stop_callback])

    # Step 8: Model Evaluation
    val_loss, val_accuracy = model.evaluate(X_val, y_val, verbose=2)
    val_predictions = model.predict(X_val)
    y_pred_classes = np.argmax(val_predictions, axis=1)

    cross_val_scores.append(val_accuracy)
    roc_auc_scores.append(roc_auc_score(y_val, val_predictions, multi_class='ovo'))
    log_losses.append(log_loss(y_val, val_predictions))

    print("Confusion Matrix:")
    print(confusion_matrix(y_val, y_pred_classes))
    print("Classification Report:")
    print(classification_report(y_val, y_pred_classes, target_names=label_encoder.classes_))



Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
360/360 - 0s - loss: 0.2147 - accuracy: 0.9933 - 230ms/epoch - 640us/step
Confusion Matrix:
[[2834   28    0   17]
 [   0 2879    0    0]
 [  29    0 2847    3]
 [   0    0    0 2879]]
Classification Report:
                                           precision    recall  f1-score   support

                           -   Benign   -       0.99      0.98      0.99      2879
                      -   Maliciou

In [7]:
# Step 9: Summary of Cross-Validation Results
print(f"Mean CV Accuracy: {np.mean(cross_val_scores)}")
print(f"Mean ROC-AUC Score: {np.mean(roc_auc_scores)}")
print(f"Mean Log Loss: {np.mean(log_losses)}")

Mean CV Accuracy: 0.9871473193168641
Mean ROC-AUC Score: 0.9932508252323821
Mean Log Loss: 0.1373002624623973
