In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor
from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.decomposition import PCA
from sklearn.model_selection import GridSearchCV

In [2]:
file_paths = {
    "Friday_Afternoon_DDos": r"Friday-WorkingHours-Afternoon-DDos.pcap_ISCX.csv",
    "Friday_Afternoon_PortScan": r"Friday-WorkingHours-Afternoon-PortScan.pcap_ISCX.csv",
    "Friday_Morning": r"Friday-WorkingHours-Morning.pcap_ISCX.csv",
    "Monday": r"Monday-WorkingHours.pcap_ISCX.csv",
    "Thursday_Afternoon_Infiltration": r"Thursday-WorkingHours-Afternoon-Infilteration.pcap_ISCX.csv",
    "Thursday_Morning_WebAttacks": r"Thursday-WorkingHours-Morning-WebAttacks.pcap_ISCX.csv",
    "Tuesday": r"Tuesday-WorkingHours.pcap_ISCX.csv",
    "Wednesday": r"Wednesday-workingHours.pcap_ISCX.csv"
}

In [3]:
combined_df = pd.DataFrame()

In [4]:
for name, path in file_paths.items():
    try:
        temp_df = pd.read_csv(path)
        temp_df['Source'] = name  
        combined_df = pd.concat([combined_df, temp_df], ignore_index=True)
    except Exception as e:
        print(f"Error reading {name}: {e}")


combined_df = combined_df.drop_duplicates()

In [5]:
combined_df.replace([np.inf, -np.inf], np.nan, inplace=True)
combined_df.fillna(0, inplace=True)

In [6]:
features = combined_df.drop(columns=[' Label', 'Source'])
labels = combined_df[' Label']

In [7]:
labels_encoded = labels.apply(lambda x: 0 if x.strip().upper() == "BENIGN" else 1)

In [8]:
k = 20  
selector = SelectKBest(score_func=f_classif, k=k)
features_selected = selector.fit_transform(features, labels_encoded)
selected_feature_names = features.columns[selector.get_support()]




In [9]:
features = features[selected_feature_names]

In [10]:
optimal_pca_components = 15  
pca = PCA(n_components=optimal_pca_components)
features_reduced = pca.fit_transform(features)

In [11]:
anomaly_ratio = labels_encoded.mean()  
contamination = max(anomaly_ratio, 0.01)  

In [12]:
iso_forest = IsolationForest(n_estimators=300, max_samples=0.8, contamination=contamination, random_state=42)
iso_forest.fit(features_reduced)

In [13]:
iso_predictions = iso_forest.predict(features_reduced)
iso_predictions = pd.Series(iso_predictions).map({1: 0, -1: 1})  # Map 0 = Normal, 1 = Anomaly

In [14]:

print("Updated Classification Report:")
print(classification_report(labels_encoded, iso_predictions))

print("Updated Confusion Matrix:")
print(confusion_matrix(labels_encoded, iso_predictions))

roc_auc = roc_auc_score(labels_encoded, iso_predictions)
print(f"Updated ROC-AUC Score: {roc_auc}")

print("Selected Features:")
print(selected_feature_names)

Updated Classification Report:
              precision    recall  f1-score   support

           0       0.89      0.89      0.89   2148386
           1       0.45      0.45      0.45    425878

    accuracy                           0.82   2574264
   macro avg       0.67      0.67      0.67   2574264
weighted avg       0.82      0.82      0.82   2574264

Updated Confusion Matrix:
[[1912759  235627]
 [ 235627  190251]]
Updated ROC-AUC Score: 0.6685251200466029
Selected Features:
Index([' Flow Duration', 'Bwd Packet Length Max', ' Bwd Packet Length Mean',
       ' Bwd Packet Length Std', ' Flow IAT Std', ' Flow IAT Max',
       'Fwd IAT Total', ' Fwd IAT Std', ' Fwd IAT Max', ' Min Packet Length',
       ' Max Packet Length', ' Packet Length Mean', ' Packet Length Std',
       ' Packet Length Variance', 'FIN Flag Count', ' Average Packet Size',
       ' Avg Bwd Segment Size', 'Idle Mean', ' Idle Max', ' Idle Min'],
      dtype='object')
