### Installations

In [1]:
!pip install pandas scikit-learn tensorflow matplotlib



### Libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

### File Paths

In [2]:
files = [
    "Friday-WorkingHours-Afternoon-DDos.pcap_ISCX.csv",
    "Friday-WorkingHours-Afternoon-PortScan.pcap_ISCX.csv",
    "Friday-WorkingHours-Morning.pcap_ISCX.csv",
    "Monday-WorkingHours.pcap_ISCX.csv",
    "Thursday-WorkingHours-Afternoon-Infilteration.pcap_ISCX.csv",
    "Thursday-WorkingHours-Morning-WebAttacks.pcap_ISCX.csv",
    "Tuesday-WorkingHours.pcap_ISCX.csv",
]

### Preprocess Data

In [3]:
def preprocess_data(file):
    print(f"Process {file}...")
    df = pd.read_csv(file)
    
    # remove leading and trailing white space from attribute names
    df.columns = df.columns.str.strip()
    
    # replace infinite values with column medians
    df.replace([np.inf, -np.inf], np.nan, inplace=True)
    df.fillna(df.median(numeric_only=True), inplace=True)
    
    # encode labels (BENING = 0; DDoS = 1)
    df['Label'] = df['Label'].apply(lambda x: 0 if x == 'BENIGN' else 1)
    
    #debug
    print("Class distribution:")
    print(df['Label'].value_counts())
                                    
    X = df.drop(columns=['Label'])
    y = df['Label']
                                    
    return X, y

### Train Models and Evaluate Effectiveness

In [4]:
def train_and_evaluate(X, y):
    
    # standardize features for NN and SVM models
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    
    # split data into 80% train, 20% test
    X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

    results = {}
    
    # random forest classifier
    rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
    rf_model.fit(X_train, y_train)
    rf_predictions = rf_model.predict(X_test)
    rf_accuracy = accuracy_score(y_test, rf_predictions)
    results["Random Forest"] = {
        "accuracy": rf_accuracy,
        "classification_report": classification_report(y_test, rf_predictions)
    }
    
    # support vector machine
    svm_model = SVC(kernel='rbf', random_state=42)
    svm_model.fit(X_train, y_train)
    svm_predictions = svm_model.predict(X_test)
    svm_accuracy = accuracy_score(y_test, svm_predictions)
    results["Support Vector Machine"] = {
        "accuracy": svm_accuracy,
        "classification_report": classification_report(y_test, svm_predictions)
    }
    
    # neural network
    nn_model = Sequential()
    nn_model.add(Dense(64, input_dim=X_train.shape[1], activation='relu'))
    nn_model.add(Dense(32, activation='relu'))
    nn_model.add(Dense(1, activation='sigmoid'))  # Binary classification output layer

    nn_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    nn_model.fit(X_train, y_train, epochs=10, batch_size=32, verbose=0)

    nn_predictions = nn_model.predict(X_test).flatten()
    nn_predictions_binary = [1 if pred > 0.5 else 0 for pred in nn_predictions]
    
    nn_accuracy = accuracy_score(y_test, nn_predictions_binary)
    results["Neural Network"] = {
        "accuracy": nn_accuracy,
        "classification_report": classification_report(y_test, nn_predictions_binary)
    }

    return results
    

### File Processing and Evaluations

In [None]:
all_results = {}

# go through each file and process individually while also evaluating models
for file in files:
    try:
        X, y = preprocess_data(file)
        results = train_and_evaluate(X, y)
        all_results[file] = results
        
        print(f"\nResults for {file}:")
        for model_name, metrics in results.items():
            print(f"{model_name} Accuracy: {metrics['accuracy']:.4f}")
            print(metrics['classification_report'])
            
        print("-" * 50)

    except Exception as e:
        print(f"Error processing {file}: {e}")

# summarize results
print("\nSummary of Results Across All Files:")
for file_name, result in all_results.items():
    print(f"\nFile: {file_name}")
    for model_name, metrics in result.items():
        print(f"{model_name}: Accuracy: {metrics['accuracy']:.4f}")

Process Friday-WorkingHours-Afternoon-DDos.pcap_ISCX.csv...
Class distribution:
1    128027
0     97718
Name: Label, dtype: int64

Results for Friday-WorkingHours-Afternoon-DDos.pcap_ISCX.csv:
Random Forest Accuracy: 1.0000
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     19405
           1       1.00      1.00      1.00     25744

    accuracy                           1.00     45149
   macro avg       1.00      1.00      1.00     45149
weighted avg       1.00      1.00      1.00     45149

Support Vector Machine Accuracy: 0.9987
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     19405
           1       1.00      1.00      1.00     25744

    accuracy                           1.00     45149
   macro avg       1.00      1.00      1.00     45149
weighted avg       1.00      1.00      1.00     45149

Neural Network Accuracy: 0.9997
              precision    recall  f1-score   supp

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))



Results for Thursday-WorkingHours-Afternoon-Infilteration.pcap_ISCX.csv:
Random Forest Accuracy: 0.9999
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     57714
           1       0.75      0.43      0.55         7

    accuracy                           1.00     57721
   macro avg       0.87      0.71      0.77     57721
weighted avg       1.00      1.00      1.00     57721

Support Vector Machine Accuracy: 0.9999
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     57714
           1       0.00      0.00      0.00         7

    accuracy                           1.00     57721
   macro avg       0.50      0.50      0.50     57721
weighted avg       1.00      1.00      1.00     57721

Neural Network Accuracy: 0.9998
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     57714
           1       0.31      0.57      0.40         7

    a

### Removing Redundant Features

In [6]:
def remove_highly_correlated_features(X, correlation_threshold=0.95):
    print("Removing highly correlated features...")
    
    
    corr_matrix = X.corr().abs()
    
    # find features with correlation above the threshold
    upper_tri = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))  
    to_drop = [column for column in upper_tri.columns if any(upper_tri[column] > correlation_threshold)]
    
    print(f"Features to drop due to high correlation (>{correlation_threshold}): {to_drop}")
    
    # drop the highly correlated features
    X_reduced = X.drop(columns=to_drop)
    
    return X_reduced

all_results_reduced = {}

print("\nEvaluating models on reduced datasets (after removing highly correlated features)...")

for file in files:
    try:
        # preprocess data (original dataset)
        X_original, y_original = preprocess_data(file)
        
        # remove highly correlated features
        X_reduced = remove_highly_correlated_features(X_original)
        
        # train and evaluate models on reduced dataset
        results_reduced = train_and_evaluate(X_reduced, y_original)
        all_results_reduced[file] = results_reduced

        print(f"\nResults for {file} (Reduced Dataset - Correlated Features Removed):")
        for model_name, metrics in results_reduced.items():
            print(f"{model_name} Accuracy: {metrics['accuracy']:.4f}")
            print(metrics['classification_report'])
        
        print("-" * 50)

    except Exception as e:
        print(f"Error processing {file}: {e}")

# summarize results
print("\nSummary of Results Across All Files (Reduced Dataset):")
for file_name, result in all_results_reduced.items():
    print(f"\nFile: {file_name}")
    for model_name, metrics in result.items():
        print(f"{model_name}: Accuracy: {metrics['accuracy']:.4f}")


Evaluating models on reduced datasets (after removing highly correlated features)...
Process Friday-WorkingHours-Afternoon-DDos.pcap_ISCX.csv...
Class distribution:
1    128027
0     97718
Name: Label, dtype: int64
Removing highly correlated features...
Features to drop due to high correlation (>0.95): ['Total Backward Packets', 'Total Length of Bwd Packets', 'Fwd Packet Length Std', 'Bwd Packet Length Mean', 'Bwd Packet Length Std', 'Flow IAT Max', 'Fwd IAT Total', 'Fwd IAT Std', 'Fwd IAT Max', 'Bwd IAT Max', 'Fwd Header Length', 'Bwd Header Length', 'Fwd Packets/s', 'Packet Length Std', 'SYN Flag Count', 'ECE Flag Count', 'Average Packet Size', 'Avg Fwd Segment Size', 'Avg Bwd Segment Size', 'Fwd Header Length.1', 'Subflow Fwd Packets', 'Subflow Fwd Bytes', 'Subflow Bwd Packets', 'Subflow Bwd Bytes', 'Active Min', 'Idle Max']

Results for Friday-WorkingHours-Afternoon-DDos.pcap_ISCX.csv (Reduced Dataset - Correlated Features Removed):
Random Forest Accuracy: 1.0000
              pre

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))



Results for Thursday-WorkingHours-Afternoon-Infilteration.pcap_ISCX.csv (Reduced Dataset - Correlated Features Removed):
Random Forest Accuracy: 0.9999
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     57714
           1       1.00      0.57      0.73         7

    accuracy                           1.00     57721
   macro avg       1.00      0.79      0.86     57721
weighted avg       1.00      1.00      1.00     57721

Support Vector Machine Accuracy: 0.9999
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     57714
           1       0.00      0.00      0.00         7

    accuracy                           1.00     57721
   macro avg       0.50      0.50      0.50     57721
weighted avg       1.00      1.00      1.00     57721

Neural Network Accuracy: 0.9999
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     57714
           1