# Network Anomaly Detection
Random forests, which are ensembles of decision trees, effectively handle complex, high-dimensional data and can be used to detect these anomalous patterns.
When used for anomaly detection, a random forest is trained exclusively on data representing normal conditions.
# NSL-KDD Dataset

The NSL-KDD dataset refines the original KDD Cup 1999 dataset by eliminating redundant entries and correcting imbalanced class distributions. Researchers commonly adopt it as a standard reference for measuring the performance of various intrusion detection models.

In [1]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report
#from sklearn.pipeline import Pipeline
from imblearn.pipeline import Pipeline # this one support SMOT
from imblearn.over_sampling import SMOTE
from xgboost import XGBClassifier

In [2]:
# Set the file path to the dataset
file_path = r'KDD+.txt'

# What is inside KDD+.txt:
# 0,tcp,ftp_data,SF,491,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,2,0.0,0.0,0.0,0.0,1.0,0.0,0.0,150,25,0.17,0.03,0.17,0.0,0.0,0.0,0.05,0.0,normal,20
# 0,udp,other,SF,146,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,13,1,0.0,0.0,0.0,0.0,0.08,0.15,0.0,255,1,0.0,0.6,0.88,0.0,0.0,0.0,0.0,0.0,normal,15
# 0,tcp,private,S0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,123,6,1.0,1.0,0.0,0.0,0.05,0.07,0.0,255,26,0.1,0.05,0.0,0.0,1.0,1.0,0.0,0.0,neptune,19

# Define the column names corresponding to the NSL-KDD dataset
columns = [
    'duration', 'protocol_type', 'service', 'flag', 'src_bytes', 'dst_bytes', 
    'land', 'wrong_fragment', 'urgent', 'hot', 'num_failed_logins', 'logged_in', 
    'num_compromised', 'root_shell', 'su_attempted', 'num_root', 'num_file_creations', 
    'num_shells', 'num_access_files', 'num_outbound_cmds', 'is_host_login', 'is_guest_login', 
    'count', 'srv_count', 'serror_rate', 'srv_serror_rate', 'rerror_rate', 'srv_rerror_rate', 
    'same_srv_rate', 'diff_srv_rate', 'srv_diff_host_rate', 'dst_host_count', 'dst_host_srv_count', 
    'dst_host_same_srv_rate', 'dst_host_diff_srv_rate', 'dst_host_same_src_port_rate', 
    'dst_host_srv_diff_host_rate', 'dst_host_serror_rate', 'dst_host_srv_serror_rate', 
    'dst_host_rerror_rate', 'dst_host_srv_rerror_rate', 'attack', 'level'
]
# Read the combined NSL-KDD dataset into a DataFrame
df = pd.read_csv(file_path, names=columns)
df.head(3)

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,attack,level
0,0,tcp,ftp_data,SF,491,0,0,0,0,0,...,0.17,0.03,0.17,0.0,0.0,0.0,0.05,0.0,normal,20
1,0,udp,other,SF,146,0,0,0,0,0,...,0.0,0.6,0.88,0.0,0.0,0.0,0.0,0.0,normal,15
2,0,tcp,private,S0,0,0,0,0,0,0,...,0.1,0.05,0.0,0.0,1.0,1.0,0.0,0.0,neptune,19


In [3]:
# we can see the attack column having different values: normal and others.
# create a new column and add 0 for normal traffic and 1 for the rest
df['attack_flag'] = df['attack'].apply(lambda a: 0 if a == 'normal' else 1)

Define lists categorizing specific attacks into four major groups:

    DoS (Denial of Service) attacks such as neptune and smurf
    Probe attacks that scan networks for vulnerabilities, like satan or ipsweep
    Privilege Escalation attacks that attempt to gain unauthorized admin-level control, such as buffer_overflow
    Access attacks that seek to breach system access controls, like guess_passwd


    0 for normal traffic
    1 for DoS attacks
    2 for Probe attacks
    3 for Privilege Escalation attacks
    4 for Access attacks


In [4]:
# Multi-class classification target categories
dos_attacks = ['apache2', 'back', 'land', 'neptune', 'mailbomb', 'pod', 
               'processtable', 'smurf', 'teardrop', 'udpstorm', 'worm']
probe_attacks = ['ipsweep', 'mscan', 'nmap', 'portsweep', 'saint', 'satan']
privilege_attacks = ['buffer_overflow', 'loadmdoule', 'perl', 'ps', 
                     'rootkit', 'sqlattack', 'xterm']
access_attacks = ['ftp_write', 'guess_passwd', 'http_tunnel', 'imap', 
                  'multihop', 'named', 'phf', 'sendmail', 'snmpgetattack', 
                  'snmpguess', 'spy', 'warezclient', 'warezmaster', 
                  'xclock', 'xsnoop']
def map_attack(attack):
    if attack in dos_attacks:
        return 1
    elif attack in probe_attacks:
        return 2
    elif attack in privilege_attacks:
        return 3
    elif attack in access_attacks:
        return 4
    else:
        return 0

# Assign multi-class category to each row
df['attack_map'] = df['attack'].apply(map_attack)

In [5]:
# Encoding Categorical Variables
# protocol_type (e.g., tcp, udp) and service (e.g., http, ftp)
# Encoding categorical variables
features_to_encode = ['protocol_type', 'service']
encoded = pd.get_dummies(df[features_to_encode])

In [6]:
# Selecting Numeric Features
# Beyond categorical variables, the dataset contains a range of numeric features that describe various aspects of network traffic. 
#These include basic metrics like duration, src_bytes, and dst_bytes, as well as more specialized features such as serror_rate and dst_host_srv_diff_host_rate, 
# which capture statistical properties of the network sessions. 
# Numeric features that capture various statistical properties of the traffic
numeric_features = [
    'duration', 'src_bytes', 'dst_bytes', 'wrong_fragment', 'urgent', 'hot', 
    'num_failed_logins', 'num_compromised', 'root_shell', 'su_attempted', 
    'num_root', 'num_file_creations', 'num_shells', 'num_access_files', 
    'num_outbound_cmds', 'count', 'srv_count', 'serror_rate', 
    'srv_serror_rate', 'rerror_rate', 'srv_rerror_rate', 'same_srv_rate', 
    'diff_srv_rate', 'srv_diff_host_rate', 'dst_host_count', 'dst_host_srv_count', 
    'dst_host_same_srv_rate', 'dst_host_diff_srv_rate', 
    'dst_host_same_src_port_rate', 'dst_host_srv_diff_host_rate', 
    'dst_host_serror_rate', 'dst_host_srv_serror_rate', 'dst_host_rerror_rate', 
    'dst_host_srv_rerror_rate'
]

In [7]:
# combine the one-hot encoded categorical features with the selected numeric features. We join them into a single DataFrame train_set
# Combine encoded categorical variables and numeric features
train_set = encoded.join(df[numeric_features])
# target variable
multi_y = df['attack_map']

In [8]:
# Split data into training and test sets (80% pentru antrenare (train_X și train_y) 20% pentru testare (test_X și test_y))
train_X, test_X, train_y, test_y = train_test_split(train_set, multi_y, test_size=0.2, random_state=42)

In [9]:
# Further split the training set into separate training and validation sets
# This supports model tuning and hyperparameter optimization without contaminating the final test data.

multi_train_X, multi_val_X, multi_train_y, multi_val_y = train_test_split(train_X, train_y, test_size=0.3, random_state=42)

After splitting, we have:

    train_X, train_y: Core training set
    test_X, test_y: Reserved for the final performance evaluation
    multi_train_X, multi_train_y: Training subset for fitting the model
    multi_val_X, multi_val_y: Validation subset for hyperparameter tuning


TOATE DATELE

 └── 80% Train/Validation
 
     ├── 70% Train (multi_train_X, multi_train_y)
     └── 30% Validation (multi_val_X, multi_val_y)
 
 └── 20% Test (test_X, test_y)

In [10]:
###################### if using SMOTE only on privilege class prepare before ###################
# Verificăm distribuția claselor înainte de a aplica SMOTE
print("Distribuția claselor înainte de SMOTE:")
print(np.bincount(multi_train_y))
# clasa 3 (Privilege) conține doar 59 de exemple, ceea ce este prea puțin pentru a aplica SMOTE eficient.
# OPTIONS TO FIX THIS:
# 1) add more privesc data
# 2) add smote only on the other classes except privesc

Distribuția claselor înainte de SMOTE:
[43032 30057  7933    59  2088]


In [11]:
# -------------------------------------------------------------------------
# 🔥 PIPELINE + SMOTE + RANDOM FOREST + GRID SEARCH 🔥
# ------------------------------------------------------------------------


pipeline = Pipeline([
    #('smote', SMOTE(random_state=42)),
    #('rf', RandomForestClassifier(n_estimators=200,random_state=42)), # with SMOTE
    #('rf', RandomForestClassifier(n_estimators=200, class_weight='balanced',random_state=42)) # without SMOTE but with class_weight='balanced'
    ('xgb', XGBClassifier(random_state=42))
])

# Define hyperparameter grid for RandomForest
param_grid = {
    #'rf__n_estimators': [100, 200],
    #'rf__max_depth': [10, 20, None],
    #'rf__min_samples_split': [2, 5]
    #'xgb__n_estimators': [100, 200, 300],  # Numărul de estimatori
    #'xgb__learning_rate': [0.01, 0.1, 0.2],  # Rata de învățare
    #'xgb__max_depth': [3, 6, 9],  # Adâncimea maximă a arborilor
    #'xgb__subsample': [0.8, 1.0],  # Proporția de date folosite la antrenament
    #'xgb__colsample_bytree': [0.8, 1.0]  # Proporția de trăsături folosite pe fiecare arbore
}



# Define GridSearchCV
grid_search = GridSearchCV(
    pipeline, 
    param_grid, 
    cv=3, 
    scoring='f1_weighted',
    n_jobs=2 # use 2 cores
    #verbose=2,
)

# Train model
grid_search.fit(multi_train_X, multi_train_y)

# Best model results
print("\nBest Parameters:", grid_search.best_params_)
print("Best Score (on train+val split):", grid_search.best_score_)


Best Parameters: {}
Best Score (on train+val split): 0.9954592645095399


In [12]:
# -------------------------------------------------------------------------
# 🔥 EVALUARE PE VALIDATION 🔥
# -------------------------------------------------------------------------

multi_predictions = grid_search.best_estimator_.predict(multi_val_X)

accuracy = accuracy_score(multi_val_y, multi_predictions)
precision = precision_score(multi_val_y, multi_predictions, average='weighted')
recall = recall_score(multi_val_y, multi_predictions, average='weighted')
f1 = f1_score(multi_val_y, multi_predictions, average='weighted')

print("\nValidation Set Evaluation:")
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-Score: {f1:.4f}")


Validation Set Evaluation:
Accuracy: 0.9964
Precision: 0.9964
Recall: 0.9964
F1-Score: 0.9964


In [13]:
# Classification Report for Validation Set
print("\nClassification Report for Validation Set:")
class_labels = ['Normal', 'DoS', 'Probe', 'Privilege', 'Access']
print(classification_report(multi_val_y, multi_predictions, target_names=class_labels))


Classification Report for Validation Set:
              precision    recall  f1-score   support

      Normal       1.00      1.00      1.00     18689
         DoS       1.00      1.00      1.00     12642
       Probe       1.00      1.00      1.00      3395
   Privilege       0.76      0.50      0.60        26
      Access       0.99      0.93      0.96       892

    accuracy                           1.00     35644
   macro avg       0.95      0.89      0.91     35644
weighted avg       1.00      1.00      1.00     35644



In [14]:
# -------------------------------------------------------------------------
# 🔥 EVALUARE PE TEST 🔥
# -------------------------------------------------------------------------

test_multi_predictions = grid_search.best_estimator_.predict(test_X)

test_accuracy = accuracy_score(test_y, test_multi_predictions)
test_precision = precision_score(test_y, test_multi_predictions, average='weighted')
test_recall = recall_score(test_y, test_multi_predictions, average='weighted')
test_f1 = f1_score(test_y, test_multi_predictions, average='weighted')

print("\nTest Set Evaluation:")
print(f"Accuracy: {test_accuracy:.4f}")
print(f"Precision: {test_precision:.4f}")
print(f"Recall: {test_recall:.4f}")
print(f"F1-Score: {test_f1:.4f}")


Test Set Evaluation:
Accuracy: 0.9957
Precision: 0.9956
Recall: 0.9957
F1-Score: 0.9956


In [15]:
# Classification Report for Test Set
print("\nClassification Report for Test Set:")
print(classification_report(test_y, test_multi_predictions, target_names=class_labels))


Classification Report for Test Set:
              precision    recall  f1-score   support

      Normal       1.00      1.00      1.00     15486
         DoS       1.00      1.00      1.00     10688
       Probe       0.99      1.00      0.99      2749
   Privilege       0.82      0.61      0.70        23
      Access       0.98      0.93      0.95       758

    accuracy                           1.00     29704
   macro avg       0.96      0.91      0.93     29704
weighted avg       1.00      1.00      1.00     29704



In [1]:
########################################## RANDOM FOREST SIMPLE ######################################################

In [11]:
# -------------------------------------------------------------------------
# RANDOM FOREST simple 🔥
# ------------------------------------------------------------------------
#######class_weight='balanced' => take more serious rare classes like privilege escalation

#rf_model_multi = RandomForestClassifier(class_weight='balanced', random_state=42)
#rf_model_multi.fit(multi_train_X, multi_train_y)

###### Predict and evaluate the model on the validation set#######
#multi_predictions = rf_model_multi.predict(multi_val_X)
#accuracy = accuracy_score(multi_val_y, multi_predictions)
#precision = precision_score(multi_val_y, multi_predictions, average='weighted')
#recall = recall_score(multi_val_y, multi_predictions, average='weighted')
#f1 = f1_score(multi_val_y, multi_predictions, average='weighted')
#print(f"Validation Set Evaluation:")
#print(f"Accuracy: {accuracy:.4f}")
#print(f"Precision: {precision:.4f}")
#print(f"Recall: {recall:.4f}")
#print(f"F1-Score: {f1:.4f}")
##### Classification Report for Validation Set#####
#print("Classification Report for Validation Set:")
#class_labels = ['Normal', 'DoS', 'Probe', 'Privilege', 'Access']
#print(classification_report(multi_val_y, multi_predictions, target_names=class_labels))

###### Final evaluation on the test set #####
#test_multi_predictions = rf_model_multi.predict(test_X)
#test_accuracy = accuracy_score(test_y, test_multi_predictions)
#test_precision = precision_score(test_y, test_multi_predictions, average='weighted')
#test_recall = recall_score(test_y, test_multi_predictions, average='weighted')
#test_f1 = f1_score(test_y, test_multi_predictions, average='weighted')
#print("\nTest Set Evaluation:")
#print(f"Accuracy: {test_accuracy:.4f}")
#print(f"Precision: {test_precision:.4f}")
#print(f"Recall: {test_recall:.4f}")
#print(f"F1-Score: {test_f1:.4f}")

######## Classification Report for Test Set #############
#print("Classification Report for Test Set:")
#print(classification_report(test_y, test_multi_predictions, target_names=class_labels))

################ Very bad on privilege escalation because I have only few examples.###################
################ Let's use SMOTE to "invent" more and balance the categories###########################
#from imblearn.over_sampling import SMOTE
#smote = SMOTE(random_state=42)
#multi_train_X_smote, multi_train_y_smote = smote.fit_resample(multi_train_X, multi_train_y)
#print("Shape original:", multi_train_X.shape)
#print("Shape dupa SMOTE:", multi_train_X_smote.shape)

################ Train on the generated data ##################
#rf_model_multi.fit(multi_train_X_smote, multi_train_y_smote)

########### Prezici pe validation (care nu a fost modificat!!)#################
#multi_predictions = rf_model_multi.predict(multi_val_X)

######### Evaluezi #################
#print(classification_report(multi_val_y, multi_predictions, target_names=class_labels))

In [59]:
# Save the trained model to a file
#model_filename = 'network_anomaly_detection_model.joblib'
#joblib.dump(rf_model_multi, model_filename)