# Load the desired dataset files and train the models

In [None]:
import pandas as pd

import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import classification_report, accuracy_score, f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

# Alternative data sources, kept commented out for reference:
# data = pd.read_csv('https://ddosciu.s3.us-east-2.amazonaws.com/CSVs/UNB/Portmap.csv')
# data = pd.read_csv('https://unsw-my.sharepoint.com/:x:/r/personal/z5025758_ad_unsw_edu_au/_layouts/15/Doc.aspx?sourcedoc=%7B2A810F6A-CC3D-4D98-909E-37489D8DAF98%7D&file=UNSW_NB15_testing-set.csv&action=default&mobileredirect=true')
# Read the training CSV over HTTPS into a DataFrame and preview its first five rows.
data = pd.read_csv("https://ddosciu.s3.us-east-2.amazonaws.com/CSV's/Train.csv")
data.head(5)

In [None]:
# Show the distinct values found in the ' Label' column.
set(data[' Label'])

In [None]:
# Print the DataFrame summary for the loaded training data.
data.info()

Features of interest: Total Fwd Packets (feature 9), Total Backward Packets (feature 10), Flow Packets/s (feature 43), Flow Bytes/s (feature 21), Protocol (feature 6), Flow Duration (feature 8), Packet Length features (features 45, 46, 47, 48), and Flag Counts (features 50-57).

# Pre-process the data

In [None]:
# Turn the non-numeric columns into integers.
# One LabelEncoder instance is refitted for every column; because ' Label' is
# fitted last, encoder.classes_ ends up describing the label classes.
encoder = LabelEncoder()
for ip_column in (' Source IP', ' Destination IP'):
    data[ip_column] = encoder.fit_transform(data[ip_column])

# Timestamps are parsed to datetimes and then stored as int64 values.
parsed_timestamps = pd.to_datetime(data[' Timestamp'])
data[' Timestamp'] = parsed_timestamps.astype(np.int64)

data[' Label'] = encoder.fit_transform(data[' Label'])
data.head()

In [None]:
# Print every original label next to the integer the encoder assigns to it.
print("Mapping between original labels and encoded numbers:")
encoded_values = encoder.transform(encoder.classes_)
for original_label, code in zip(encoder.classes_, encoded_values):
    print(f"{original_label}: {code}")

In [None]:
# Drop unnecessary columns from the CSV data.
# Only the columns that are actually present are removed, so the cell can be
# re-run safely and one missing column no longer prevents the others from
# being dropped (the previous bare `except` hid every error, not just that one).
columns_to_remove = ['Unnamed: 0', 'Flow ID', 'SimillarHTTP', ' Fwd Header Length.1']
columns_present = [column for column in columns_to_remove if column in data.columns]

if columns_present:
    data = data.drop(columns=columns_present)
else:
    print('Columns are dropped already')

In [None]:
# Down-sample the non-minority rows so the dataset is less imbalanced.

# Upper bound on the number of non-minority rows to keep (100000 is the
# alternative setting).
num_majority_samples = 5000

# The minority class is the label with the fewest rows.
minority_class = data[' Label'].value_counts().idxmin()

# Every row that is not in the minority class counts as "majority" here.
is_minority = data[' Label'] == minority_class
minority_samples = data[is_minority]
majority_samples = data[~is_minority]

# DataFrame.sample raises ValueError when asked for more rows than exist, so
# cap the request at the number of available rows.
sample_size = min(num_majority_samples, len(majority_samples))
majority_samples_sampled = majority_samples.sample(sample_size, random_state=42)

# Recombine, then shuffle with a fixed seed and renumber the rows.
balanced_data = pd.concat([majority_samples_sampled, minority_samples], axis=0)
balanced_data = balanced_data.sample(frac=1, random_state=42).reset_index(drop=True)

balanced_data.head()

In [None]:
# Compare the label distribution after balancing with the original one.
balanced_labels = balanced_data[' Label']

num_unique_labels = balanced_labels.nunique()
print("Number of unique labels in the balanced data:", num_unique_labels)

label_frequency = balanced_labels.value_counts()
print("Frequency of each unique label in the balanced data:")
print(label_frequency)

label_frequency_data = data[' Label'].value_counts()
print("Frequency of each unique label in the original data:")
print(label_frequency_data)

In [None]:
import pandas as pd

# Correlation between the ' Inbound' and ' Label' columns of the balanced data.
inbound_values = balanced_data[' Inbound']
label_values = balanced_data[' Label']
correlation = inbound_values.corr(label_values)
print("Correlation between 'Inbound' and 'Label':", correlation)

In [None]:
# Show how many rows carry each ' Inbound' value in the balanced data.
balanced_data[' Inbound'].value_counts()

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Row counts per ' Inbound' value, split into one bar per ' Label' value.
fig, ax = plt.subplots(figsize=(10, 6))
sns.countplot(data=balanced_data, x=' Inbound', hue=' Label', ax=ax)
ax.set_title('Frequency of Inbound with Label as Hue')
ax.set_xlabel('Inbound')
ax.set_ylabel('Count')
plt.show()

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Row counts per ' Label' value, split into one bar per ' Inbound' value.
fig, ax = plt.subplots(figsize=(10, 6))
sns.countplot(data=balanced_data, x=' Label', hue=' Inbound', ax=ax)
ax.set_title('Frequency of Label with Inbound as Hue')
ax.set_xlabel('Label')
ax.set_ylabel('Count')
plt.show()

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np


# Share of each ' Label' value within every ' Inbound' group, as a percentage.
pair_groups = balanced_data.groupby([' Inbound', ' Label'])
inbound_label_counts = pair_groups.size().reset_index(name='Count')

inbound_groups = balanced_data.groupby([' Inbound'])
total_counts = inbound_groups.size().reset_index(name='Total_Count')

# Attach the per-' Inbound' total to every (Inbound, Label) row before dividing.
inbound_label_counts = inbound_label_counts.merge(total_counts, on=' Inbound')
share = inbound_label_counts['Count'] / inbound_label_counts['Total_Count']
inbound_label_counts['Percentage'] = share * 100

fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(data=inbound_label_counts, x=' Inbound', y='Percentage', hue=' Label', ax=ax)
ax.set_title('Frequency of Inbound with Label as Hue (Percentage)')
ax.set_xlabel('Inbound')
ax.set_ylabel('Percentage')
plt.show()

In [None]:
# Display the per-(Inbound, Label) counts, totals and percentages computed above.
inbound_label_counts

In [None]:
# Build the feature matrix / target vector and split them into train and test sets.
y = balanced_data[' Label']
X = balanced_data.drop(columns=[' Inbound', ' Label'])

# Feature subset used for modelling. Column names are copied verbatim from the
# CSV header, including the leading space that most of them carry.
selected_columns = [
    # packet and byte rates
    ' Total Fwd Packets',
    ' Total Backward Packets',
    ' Flow Packets/s',
    'Flow Bytes/s',
    ' Protocol',
    # ' Flow Duration' is currently excluded
    # TCP flag counters
    ' SYN Flag Count',
    ' RST Flag Count',
    ' PSH Flag Count',
    ' ACK Flag Count',
    ' URG Flag Count',
    ' CWE Flag Count',
    ' ECE Flag Count',
    # per-direction packet rates
    'Fwd Packets/s',
    ' Bwd Packets/s',
    # packet length statistics
    ' Min Packet Length',
    ' Max Packet Length',
    ' Packet Length Mean',
    ' Packet Length Std',
    ' Packet Length Variance',
]

# Restrict X to the columns of interest.
X_specific = X[selected_columns]

# 80/20 split, stratified on the label, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X_specific,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [None]:
# Show the feature columns present in the training split.
X_train.columns

In [None]:
# Total number of null cells across all columns of the training split.
X_train.isnull().sum().sum()

In [None]:
import numpy as np

def check_data_issues(data):
    """Print a message for each kind of problematic value found in *data*.

    Checks, in order: NaN values (all columns), infinite values, values above
    the float64 maximum and values below the float64 minimum (numeric columns
    only). Nothing is printed for a clean frame.

    Args:
        data: pandas DataFrame to inspect. It is not modified.

    Returns:
        None. The findings are reported with print().
    """
    if data.isnull().values.any():
        print("There are NaN values in the dataset.")

    # np.isinf and the range comparisons are only defined for numbers; applying
    # them to object/string columns raises TypeError, so restrict to numerics.
    numeric = data.select_dtypes(include=[np.number])

    # Check for infinite values
    if np.isinf(numeric).values.any():
        print("There are infinite values in the dataset.")

    float_limits = np.finfo(np.float64)

    # Check for extremely large values (an infinite maximum also satisfies
    # this comparison).
    max_value = numeric.max().max()
    if max_value > float_limits.max:
        print(f"There are values too large for dtype('float64') in the dataset. Max value: {max_value}")

    # Check for extremely small values
    min_value = numeric.min().min()
    if min_value < float_limits.min:
        print(f"There are values too small for dtype('float64') in the dataset. Min value: {min_value}")

In [None]:
# Run the data-quality checks on the training split first, then the test split.
for split_frame in (X_train, X_test):
    check_data_issues(split_frame)

In [None]:
# Replace +/- infinity with NaN so the imputer can fill those cells later.
# The result is rebound instead of using inplace=True: both frames were derived
# from X_specific, and mutating derived frames in place is what pandas'
# SettingWithCopyWarning guards against.
X_train = X_train.replace([np.inf, -np.inf], np.nan)
X_test = X_test.replace([np.inf, -np.inf], np.nan)

In [None]:
# Keep the training column index under its own name and display it.
# (The bare expression on the next line has no effect: it is not the cell's last line.)
X_train.columns
column_names =X_train.columns
column_names

In [None]:
# Impute NaN values with the mean of the corresponding column
from sklearn.impute import SimpleImputer

# Fill NaN cells with the column mean. The imputer learns the means from the
# training split only and applies them to both splits.
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
imputer.fit(X_train)
X_train = imputer.transform(X_train)
X_test = imputer.transform(X_test)

# Standardize the features; again the statistics come from the training split.
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
# Show the Python type of X_train (it is reassigned by the imputation/scaling cell above).
type(X_train)

In [None]:
import pandas as pd
from sklearn.metrics import accuracy_score, f1_score, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from joblib import dump

# Train each classifier, score it on the held-out split and save it to disk.

saved_models = {}  # model name -> file name of the saved joblib dump

# Classifiers to compare.
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Random Forest": RandomForestClassifier(),
    "Support Vector Machine": SVC()
}
print("X shape",X_train.shape)

# Metric rows are collected in a plain list and turned into a DataFrame once at
# the end; this avoids the private DataFrame._append API and repeated copies.
comparison_rows = []
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # Calculate metrics
    accuracy = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average='weighted')
    comparison_rows.append({"Model": name, "Accuracy": accuracy, "F1 Score": f1})

    # Save the trained model; the file name is derived from the model name.
    filename = f"{name}_model.joblib"
    dump(model, filename)
    saved_models[name] = filename  # Store the filename for later use in the tested DataSets

    print(f"{name}:")
    print(f"Accuracy: {accuracy}")
    print(f"F1 score: {f1}")
    print(classification_report(y_test, y_pred))
    print(f"Model saved as: {filename}")
    print("\n")

model_comparison = pd.DataFrame(comparison_rows, columns=["Model", "Accuracy", "F1 Score"])

display(model_comparison)

In [None]:
# Display the comparison DataFrame computed on the test split of the training dataset.
model_comparison

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix

# Re-score the trained models on the test split, then plot accuracy, F1 and
# one confusion matrix per model.

# Confusion matrix per model name.
confusion_matrices = {}

# Per-model scores, in the iteration order of `models`.
accuracy_scores = []
f1_scores = []

# Metric rows are collected in a list and converted to a DataFrame once, which
# avoids the private DataFrame._append API.
comparison_rows = []

# Iterate over each trained model
for name, model in models.items():
    # Make predictions on test data
    y_pred = model.predict(X_test)

    # Calculate evaluation metrics
    accuracy = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average='weighted')

    # Calculate confusion matrix
    confusion_matrices[name] = confusion_matrix(y_test, y_pred)

    comparison_rows.append({"Model": name, "Accuracy": accuracy, "F1 Score": f1})
    accuracy_scores.append(accuracy)
    f1_scores.append(f1)

    # Print model evaluation metrics
    print(f"{name}:")
    print(f"Accuracy: {accuracy}")
    print(f"F1 score: {f1}")
    print("Classification Report:")
    print(classification_report(y_test, y_pred))
    print("\n")

model_comparison = pd.DataFrame(comparison_rows, columns=["Model", "Accuracy", "F1 Score"])

# Plotting accuracy scores
plt.figure(figsize=(8, 4))
plt.bar(model_comparison['Model'], accuracy_scores, color='skyblue')
plt.title('Accuracy Comparison')
plt.xlabel('Model')
plt.ylabel('Accuracy')
plt.xticks(rotation=45)
plt.show()

# Plotting F1 scores
plt.figure(figsize=(8, 4))
plt.bar(model_comparison['Model'], f1_scores, color='orange')
plt.title('F1 Score Comparison')
plt.xlabel('Model')
plt.ylabel('F1 Score')
plt.xticks(rotation=45)
plt.show()

# Displaying the model comparison DataFrame
print("Model Comparison:")
print(model_comparison)

# Plot confusion matrices for each model using Seaborn
for name, cm in confusion_matrices.items():
    plt.figure(figsize=(8, 4))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=True, yticklabels=True)
    plt.title(f"Confusion Matrix for {name}")
    plt.xlabel("Predicted Labels")
    plt.ylabel("True Labels")
    plt.show()

# Loading Saved model weights

In [None]:
#Load the saved model
from joblib import load

# Paths of the saved models, keyed by model name. Adjust the paths here if the
# .joblib files live somewhere else.
model_paths = {
    "Logistic Regression": "Logistic Regression_model.joblib",
    "Random Forest": "Random Forest_model.joblib",
    "Support Vector Machine": "Support Vector Machine_model.joblib",
}

# Load every model once and keep them in a dictionary keyed by name.
loaded_models = {
    model_name: load(model_path) for model_name, model_path in model_paths.items()
}

# Individual handles to the same loaded objects.
logistic_regression_model = loaded_models["Logistic Regression"]
random_forest_model = loaded_models["Random Forest"]
svm_model = loaded_models["Support Vector Machine"]

In [None]:
# Show the (name, model) pairs held in loaded_models.
loaded_models.items()

## Preprocessing for the testing data (TODO: change title)

In [None]:
# Column index of the training feature subset (X_specific), kept and displayed for reference.
feature_columns = X_specific.columns
feature_columns

In [None]:

# Manual column mapping, for use when book.xlsx is not available: keys are the
# column names of the testing CSVs, values are the corresponding column names
# of the training CSV (most of which carry a leading space).
# NOTE(review): 'Label' maps to 'Label' without a leading space, while the
# training frame is indexed with ' Label' — confirm this is intended.

column_mapping = {'Flow ID': 'Flow ID',
 'Src IP': ' Source IP',
 'Src Port': ' Source Port',
 'Dst IP': ' Destination IP',
 'Dst Port': ' Destination Port',
 'Protocol': ' Protocol',
 'Timestamp': ' Timestamp',
 'Flow Duration': ' Flow Duration',
 'Tot Fwd Pkts': ' Total Fwd Packets',
 'Tot Bwd Pkts': ' Total Backward Packets',
 'TotLen Fwd Pkts': 'Total Length of Fwd Packets',
 'TotLen Bwd Pkts': ' Total Length of Bwd Packets',
 'Fwd Pkt Len Max': ' Fwd Packet Length Max',
 'Fwd Pkt Len Min': ' Fwd Packet Length Min',
 'Fwd Pkt Len Mean': ' Fwd Packet Length Mean',
 'Fwd Pkt Len Std': ' Fwd Packet Length Std',
 'Bwd Pkt Len Max': 'Bwd Packet Length Max',
 'Bwd Pkt Len Min': ' Bwd Packet Length Min',
 'Bwd Pkt Len Mean': ' Bwd Packet Length Mean',
 'Bwd Pkt Len Std': ' Bwd Packet Length Std',
 'Flow Byts/s': 'Flow Bytes/s',
 'Flow Pkts/s': ' Flow Packets/s',
 'Flow IAT Mean': ' Flow IAT Mean',
 'Flow IAT Std': ' Flow IAT Std',
 'Flow IAT Max': ' Flow IAT Max',
 'Flow IAT Min': ' Flow IAT Min',
 'Fwd IAT Tot': 'Fwd IAT Total',
 'Fwd IAT Mean': ' Fwd IAT Mean',
 'Fwd IAT Std': ' Fwd IAT Std',
 'Fwd IAT Max': ' Fwd IAT Max',
 'Fwd IAT Min': ' Fwd IAT Min',
 'Bwd IAT Tot': 'Bwd IAT Total',
 'Bwd IAT Mean': ' Bwd IAT Mean',
 'Bwd IAT Std': ' Bwd IAT Std',
 'Bwd IAT Max': ' Bwd IAT Max',
 'Bwd IAT Min': ' Bwd IAT Min',
 'Fwd PSH Flags': 'Fwd PSH Flags',
 'Bwd PSH Flags': ' Bwd PSH Flags',
 'Fwd URG Flags': ' Fwd URG Flags',
 'Bwd URG Flags': ' Bwd URG Flags',
 'Fwd Header Len': ' Fwd Header Length',
 'Bwd Header Len': ' Bwd Header Length',
 'Fwd Pkts/s': 'Fwd Packets/s',
 'Bwd Pkts/s': ' Bwd Packets/s',
 'Pkt Len Min': ' Min Packet Length',
 'Pkt Len Max': ' Max Packet Length',
 'Pkt Len Mean': ' Packet Length Mean',
 'Pkt Len Std': ' Packet Length Std',
 'Pkt Len Var': ' Packet Length Variance',
 'FIN Flag Cnt': 'FIN Flag Count',
 'SYN Flag Cnt': ' SYN Flag Count',
 'RST Flag Cnt': ' RST Flag Count',
 'PSH Flag Cnt': ' PSH Flag Count',
 'ACK Flag Cnt': ' ACK Flag Count',
 'URG Flag Cnt': ' URG Flag Count',
 'CWE Flag Count': ' CWE Flag Count',
 'ECE Flag Cnt': ' ECE Flag Count',
 'Down/Up Ratio': ' Down/Up Ratio',
 'Pkt Size Avg': ' Average Packet Size',
 'Fwd Seg Size Avg': ' Avg Fwd Segment Size',
 'Bwd Seg Size Avg': ' Avg Bwd Segment Size',
 'Fwd Byts/b Avg': 'Fwd Avg Bytes/Bulk',
 'Fwd Pkts/b Avg': ' Fwd Avg Packets/Bulk',
 'Fwd Blk Rate Avg': ' Fwd Avg Bulk Rate',
 'Bwd Byts/b Avg': ' Bwd Avg Bytes/Bulk',
 'Bwd Pkts/b Avg': ' Bwd Avg Packets/Bulk',
 'Bwd Blk Rate Avg': 'Bwd Avg Bulk Rate',
 'Subflow Fwd Pkts': 'Subflow Fwd Packets',
 'Subflow Fwd Byts': ' Subflow Fwd Bytes',
 'Subflow Bwd Pkts': ' Subflow Bwd Packets',
 'Subflow Bwd Byts': ' Subflow Bwd Bytes',
 'Init Fwd Win Byts': 'Init_Win_bytes_forward',
 'Init Bwd Win Byts': ' Init_Win_bytes_backward',
 'Fwd Act Data Pkts': ' act_data_pkt_fwd',
 'Fwd Seg Size Min': ' min_seg_size_forward',
 'Active Mean': 'Active Mean',
 'Active Std': ' Active Std',
 'Active Max': ' Active Max',
 'Active Min': ' Active Min',
 'Idle Mean': 'Idle Mean',
 'Idle Std': ' Idle Std',
 'Idle Max': ' Idle Max',
 'Idle Min': ' Idle Min',
 'Label': 'Label'}

# Loading target dataset to test

In [None]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

# Candidate datasets to test the trained models on. Only one of them is loaded
# below; the others are kept so the cell can be pointed at a different capture.
url = "https://ddosciu.s3.us-east-2.amazonaws.com/PCAPs/Test.csv"
#Test1
url1 = "https://ddosciu.s3.us-east-2.amazonaws.com/CSV's/HTTP_Ddos.pcap_Flow.csv"
url2 = "https://ddosciu.s3.us-east-2.amazonaws.com/CSV's/amp.TCP.syn.optionallyACK.optionallysamePort.pcapng_Flow.csv"
url3 = "https://ddosciu.s3.us-east-2.amazonaws.com/CSV's/amp.TCP.reflection.SYNACK.pcap_Flow.csv"
url4 = "https://ddosciu.s3.us-east-2.amazonaws.com/CSV's/amp.UDP.bacnet.IOT.37810.pcapng_Flow.csv"
url5 = "https://ddosciu.s3.us-east-2.amazonaws.com/CSV's/1.pcap_Flow.csv"

url6 ="https://ddosciu.s3.us-east-2.amazonaws.com/CSV's/amp.dns.RRSIG.fragmented.pcap_Flow.csv"
url7 = "https://ddosciu.s3.us-east-2.amazonaws.com/CSV's/amp.UDP.isakmp.pcap_Flow.csv"
url8 = "https://ddosciu.s3.us-east-2.amazonaws.com/CSV's/pkt.ICMP.largeempty.pcap_Flow.csv"
url9 = "https://ddosciu.s3.us-east-2.amazonaws.com/CSV's/pkt.TCP.DOMINATE.syn.ecn.cwr.pcapng_Flow.csv"
# NOTE(review): url10 and url14 are local paths rather than URLs — they only
# resolve where those files exist.
url10 = "/content/NormalTraffic2.pcap_Flow.csv"
url11= "https://ddosciu.s3.us-east-2.amazonaws.com/IC_Taraf.root.1.pcap_Flow.csv"
url12= "https://ddosciu.s3.us-east-2.amazonaws.com/1.csv"
url13= "https://ddosciu.s3.us-east-2.amazonaws.com/21.csv"
url14 = "/content/1.csv"


# Read the dataset selected for this run.
new_data = pd.read_csv(url13)

# Rename the testing columns to the training column names via column_mapping.
new_data = new_data.rename(columns=column_mapping)

In [None]:
# Print the DataFrame summary of the renamed testing data.
new_data.info()

In [None]:
# Compare the column sets of the training features (X) and the new dataset.
# The counts are printed explicitly: a bare expression that is not the last
# line of a cell is evaluated and silently discarded.
print("Column counts (train X, new_data):", len(X.columns), len(new_data.columns))

# Columns that the training data has but new_data lacks.
missing_columns = set(X.columns) - set(new_data.columns)

# Columns that new_data has but the training data lacks.
extra_columns = set(new_data.columns) - set(X.columns)

print("Columns missing in new_data:", missing_columns)
print("Columns extra in new_data:", extra_columns)

In [None]:
# Apply the same column clean-up to the new dataset as was used for training.
columns_to_drop = ['Unnamed: 0', 'Flow ID', 'SimillarHTTP']

# Keep only the candidates that actually exist in new_data.
existing_names = set(new_data.columns)
columns_to_drop_existing = [name for name in columns_to_drop if name in existing_names]

if not columns_to_drop_existing:
    print("All columns to drop are not present in the DataFrame.")
else:
    new_data = new_data.drop(columns=columns_to_drop_existing)


In [None]:
# Preview the first five rows of the cleaned testing data.
new_data.head(5)

In [None]:
def standardize_timestamp_format(data, column_name=' Timestamp'):
    """Convert the timestamp column of *data* to datetimes, in place.

    The column is first parsed with pandas' automatic format detection. If
    that raises ValueError, each value is parsed with the two explicit formats
    '%d/%m/%Y %I:%M:%S %p' and '%Y-%m-%d %H:%M:%S.%f'; values matching neither
    become NaT.

    Args:
        data: DataFrame to update. It is modified and also returned.
        column_name: Name of the timestamp column. When the column is absent
            the frame is returned unchanged.

    Returns:
        The same DataFrame object that was passed in.
    """
    if column_name not in data.columns:
        return data

    # Keep a handle on the raw values: both fallback formats must be tried
    # against the original strings, not against an already-coerced column.
    raw = data[column_name]
    try:
        data[column_name] = pd.to_datetime(raw)
    except ValueError:
        primary = pd.to_datetime(raw, errors='coerce', format='%d/%m/%Y %I:%M:%S %p')
        secondary = pd.to_datetime(raw, errors='coerce', format='%Y-%m-%d %H:%M:%S.%f')
        # Use the second format only where the first one produced NaT.
        data[column_name] = primary.fillna(secondary)
    return data

# Encode the IP address columns of the new data as integers. The encoder is
# refitted for each column. The label column is not encoded here.
encoder = LabelEncoder()
for ip_column in (' Source IP', ' Destination IP'):
    new_data[ip_column] = encoder.fit_transform(new_data[ip_column])

# Bring the ' Timestamp' column into datetime form.
new_data = standardize_timestamp_format(new_data)

# Store the timestamps as int64 values, as was done for the training data.
parsed_timestamps = pd.to_datetime(new_data[' Timestamp'])
new_data[' Timestamp'] = parsed_timestamps.astype(np.int64)

In [None]:
# Build the feature frame for prediction by removing the non-feature columns.
columns_to_drop = ['Inbound','Label']

# Only columns that exist in new_data can be dropped.
columns_to_drop_existing = [column for column in columns_to_drop if column in new_data.columns]

if not columns_to_drop_existing:
    print("All columns to drop are not present in the DataFrame.")

# X_new is always defined: with an empty list, drop() simply returns a copy of
# new_data, so later cells no longer fail with NameError when neither column
# is present.
X_new = new_data.drop(columns=columns_to_drop_existing)

In [None]:
# Keep only the feature columns used for training, in the same order as X_specific.
X_new = X_new[X_specific.columns]

In [None]:
# Bring X_new into the numeric form the trained models expect.

# Work on an independent copy so the column assignments below do not write
# into a frame derived from new_data.
X_new = X_new.copy()

# Convert non-numeric columns to numbers; unparseable values become NaN.
non_numeric_columns = X_new.select_dtypes(exclude=np.number).columns
for column in non_numeric_columns:
    X_new[column] = pd.to_numeric(X_new[column], errors='coerce')

# Infinite values are treated as missing.
X_new = X_new.replace([np.inf, -np.inf], np.nan)

# Reuse the imputer and scaler that were fitted on the training split. The
# models were trained on features filled and scaled with the training
# statistics; fitting fresh transformers on the new data would put it on a
# different scale and make the predictions meaningless.
# NOTE: this needs the `imputer` and `scaler` objects from the training cells
# to be in scope. They are not saved with joblib alongside the models, so in a
# fresh session they have to be recreated (or persisted) first.
X_new = imputer.transform(X_new)
X_new = scaler.transform(X_new)

#Labels for comparison (They are not always available for some datasets)
# y_new = new_data[' Label']

### Prediction

In [None]:
# Predict the encoded labels of the preprocessed new dataset with every loaded model.
predictions = {
    model_name: fitted_model.predict(X_new)
    for model_name, fitted_model in loaded_models.items()
}

# Print predictions for each model
for model_name, model_preds in predictions.items():
    print(f"Predictions using {model_name}: {model_preds}")


In [None]:
# Encoded label -> original label name.
label_mapping = {0: 'BENIGN', 1: 'LDAP', 2: 'NetBIOS'}

# For every loaded model, predict on the new dataset and translate each encoded
# prediction to its label name.
predictions = {
    model_name: [label_mapping[code] for code in fitted_model.predict(X_new)]
    for model_name, fitted_model in loaded_models.items()
}

# Print mapped predictions for each model
for model_name, model_preds in predictions.items():
    print(f"Predictions using {model_name}: {model_preds}")


In [None]:
# Build one table holding, for every loaded model, the encoded prediction and
# its label name for each row of X_new.

model_names = []          # model names, in iteration order
per_model_frames = []     # one two-column frame per model, same order

for name, model in loaded_models.items():
    # Make predictions on the new data
    y_pred = model.predict(X_new)

    # Translate the encoded predictions through label_mapping
    mapped_predictions = [label_mapping[label] for label in y_pred]

    model_names.append(name)
    per_model_frames.append(
        pd.DataFrame({'Predicted_Label': y_pred, 'Mapped_Prediction': mapped_predictions})
    )

# Concatenate once, side by side. `keys` makes the model name the outer column
# level, giving (model, 'Predicted_Label' / 'Mapped_Prediction') columns
# without growing the frame inside the loop.
all_predictions_df = pd.concat(per_model_frames, axis=1, keys=model_names)

# Display the DataFrame
all_predictions_df


In [None]:
# Encoded label 0 stands for 'No Attack'; every other code is reported as an attack.
label_mapping = {0: 'No Attack'}

# For every loaded model, predict on the new dataset and reduce each prediction
# to a two-way attack / no-attack label.
predictions = {
    model_name: [
        'No Attack' if code == 0 else 'DDos Attack'
        for code in fitted_model.predict(X_new)
    ]
    for model_name, fitted_model in loaded_models.items()
}

# Print mapped predictions for each model
for model_name, model_preds in predictions.items():
    print(f"Predictions using {model_name}: {model_preds}")

In [None]:
import pandas as pd

# Build one table holding, for every loaded model, the encoded prediction and
# a two-way 'No Attack' / 'Attack' label for each row of X_new.

model_names = []          # model names, in iteration order
per_model_frames = []     # one two-column frame per model, same order

for name, model in loaded_models.items():
    # Make predictions on the new data
    y_pred = model.predict(X_new)

    # Code 0 means no attack; any other code is reported as an attack.
    mapped_predictions = ['No Attack' if label == 0 else 'Attack' for label in y_pred]

    model_names.append(name)
    per_model_frames.append(
        pd.DataFrame({'Predicted_Label': y_pred, 'Mapped_Prediction': mapped_predictions})
    )

# Concatenate once, side by side. `keys` makes the model name the outer column
# level, giving (model, 'Predicted_Label' / 'Mapped_Prediction') columns
# without growing the frame inside the loop.
all_predictions_df = pd.concat(per_model_frames, axis=1, keys=model_names)

# Display the DataFrame
all_predictions_df


In [None]:
# Save the model predictions to a CSV file (the row index is not written).
all_predictions_df.to_csv('test_predictions.csv', index=False)