<a href="https://colab.research.google.com/github/gothiyag/firefly-intrusion-detection/blob/main/Main.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import os
import subprocess

import gdown

# Google Drive folder that hosts the datasets (kept for reference; the two
# CSV files below are fetched individually by file ID).
folder_url = "https://drive.google.com/drive/u/0/folders/1Dom1KFgteCQvBDoIKvQ9b1B619MUIrJk"
repo_url = "https://github.com/gothiyag/firefly-intrusion-detection.git"
repo_dir = "/content/firefly-intrusion-detection"

# Google Drive file IDs of the two CSV datasets
file_id_1 = "14RDFD50lHdug4ds-WpwtHFcGYsWQnvmm"  # saved as IoTID20.csv
file_id_2 = "1f9dZTXsxlC6a5JywkAip9ySvIO_PMhHc"  # saved as NF-BoT-IoT-v2-5%.csv

# Directory the datasets are downloaded into
sample_data_dir = "/content/sample_data"


def _run_git(*args, cwd=None):
    """Run `git <args>`, echo its output into the cell, and raise on failure."""
    completed = subprocess.run(
        ["git", *args], cwd=cwd, capture_output=True, text=True
    )
    print(completed.stdout + completed.stderr, end="")
    completed.check_returncode()


# Clone or pull the repository
if os.path.exists(repo_dir):
    # If the repo exists, pull the latest changes
    _run_git("pull", cwd=repo_dir)
else:
    # Clone straight into repo_dir so the result does not depend on the
    # directory the kernel happens to be started in.
    _run_git("clone", repo_url, repo_dir)
os.chdir(repo_dir)

# The download target is not guaranteed to exist outside a stock Colab runtime
os.makedirs(sample_data_dir, exist_ok=True)

# Download the CSV files from Google Drive
for file_id, filename in (
    (file_id_1, "IoTID20.csv"),
    (file_id_2, "NF-BoT-IoT-v2-5%.csv"),
):
    destination = os.path.join(sample_data_dir, filename)
    saved_path = gdown.download(
        f"https://drive.google.com/uc?id={file_id}", destination, quiet=False
    )
    # gdown signals some failures by returning None instead of raising;
    # stop here rather than print a misleading success message.
    if saved_path is None:
        raise RuntimeError(
            f"Download of {filename} (Drive file id {file_id}) failed"
        )

print("Files have been downloaded and saved to 'sample_data' directory.")



In [None]:
import subprocess

# Prompt user for Git username and email
username = input("Enter your GitHub username: ").strip()
email = input("Enter your GitHub email: ").strip()

if not username or not email:
    raise ValueError("Both a Git username and an email are required.")

# Write the identity to the current repository's config (no --global).
# The values are passed as an argument list rather than interpolated into a
# shell line, so quotes or shell metacharacters in the input cannot break or
# hijack the command. check=True surfaces failures such as running this cell
# outside a Git working tree.
subprocess.run(["git", "config", "user.name", username], check=True)
subprocess.run(["git", "config", "user.email", email], check=True)

print("Git configured for this session.")

Enter your GitHub username: gothiyag
Enter your GitHub email: gothiyag@cisco.com
Git configured for this session.


Data Preprocessing Stage
(Data Cleansing, Normalization and Encoding)

In [None]:
import os

import pandas as pd

# Load the dataset.
# A copy in the current working directory is used when present; otherwise
# fall back to the directory the setup cell downloads the CSV files into
# (the setup cell changes the working directory to the repository, not to
# the download directory).
dataset_name = 'NF-BoT-IoT-v2-5%.csv'
dataset_path = dataset_name
if not os.path.exists(dataset_path):
    dataset_path = os.path.join(sample_data_dir, dataset_name)

df = pd.read_csv(dataset_path)


# Display the first few rows of the dataset to verify loading
print(df.head())

Data Cleansing - Handling Missing Values and Removing Duplicate Rows

In [None]:
# Check for missing values
print("Missing values per column:\n", df.isnull().sum())

# Fill missing numeric values with the column mean.
# numeric_only=True is required: on pandas >= 2.0 a plain df.mean() raises
# TypeError as soon as the frame contains a non-numeric (e.g. string) column.
# Missing values in non-numeric columns are left untouched.
df = df.fillna(df.mean(numeric_only=True))

# Remove duplicate rows, if any
df = df.drop_duplicates()

# Check dataset shape after cleansing
print("Dataset shape after cleansing:", df.shape)


Data Normalization

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Column used as the prediction target later in the notebook. It must not be
# rescaled: min-max scaling numeric class labels turns them into fractions,
# which classification utilities no longer recognise as discrete classes.
target_column = 'Attack'

# Selecting numerical feature columns for normalization (target excluded)
numeric_features = (
    df.select_dtypes(include=['float64', 'int64'])
    .columns
    .drop(target_column, errors='ignore')
)
scaler = MinMaxScaler()

# Apply MinMax scaling
# NOTE(review): the scaler is fitted on the full dataset before the
# train/test split, so test-set statistics influence the scaling. Consider
# fitting on the training split only.
df[numeric_features] = scaler.fit_transform(df[numeric_features])

# Print a summary to verify normalization
print("Normalized data sample:\n", df.head())


Data Encoding and Split

In [None]:
# Encoding categorical features.
# The target column is set aside first: if it holds string labels,
# get_dummies would replace it with indicator columns and the later
# df['Attack'] lookup would raise KeyError.
target_column = 'Attack'

if target_column in df.columns:
    target_values = df[target_column]
    df = pd.get_dummies(df.drop(columns=[target_column]), drop_first=True)
    df[target_column] = target_values  # re-attached unchanged, as the last column
else:
    df = pd.get_dummies(df, drop_first=True)

# Display a sample to verify encoding
print("Data after encoding:\n", df.head())


In [None]:
from sklearn.model_selection import train_test_split

# 'Attack' is the target variable
# NOTE(review): if the dataset also carries a second label column (e.g. a
# binary attack flag), it is still part of X and would leak the target.
# Confirm against the CSV header.
X = df.drop('Attack', axis=1)
y = df['Attack']

# Stratify on the target so both splits keep the same class proportions.
# Stratification needs at least two samples per class and a test split with
# room for every class; fall back to a plain random split otherwise.
test_fraction = 0.2
class_counts = y.value_counts()
can_stratify = (
    len(class_counts) > 1
    and class_counts.min() >= 2
    and len(y) * test_fraction >= len(class_counts)
)

# Split the data (80% training, 20% testing)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=test_fraction,
    random_state=42,
    stratify=y if can_stratify else None,
)


# Check the shapes of the resulting splits
print("Training set shape:", X_train.shape)
print("Testing set shape:", X_test.shape)


Feature Selection


Spearman Rank Correlation

In [None]:
from scipy.stats import spearmanr

# Rank correlation needs a numeric target. String class labels are mapped to
# integer codes (sorted label order).
# NOTE(review): for a nominal target with more than two classes the code
# order is arbitrary, so the correlation is only a rough relevance signal.
if pd.api.types.is_numeric_dtype(y_train):
    y_train_numeric = y_train
else:
    label_codes, _ = pd.factorize(y_train, sort=True)
    y_train_numeric = pd.Series(label_codes, index=y_train.index)

# Calculate Spearman rank correlation for features with the target.
# Absolute values are used for ranking; constant columns have an undefined
# (NaN) correlation and are scored 0 so they cannot propagate NaNs downstream.
spearman_corr = (
    X_train.corrwith(y_train_numeric, method='spearman').abs().fillna(0.0)
)
spearman_selected_features = spearman_corr[spearman_corr > 0.2].index  # Select features above a threshold

# Display selected features based on Spearman correlation
print("Selected features from Spearman correlation:\n", spearman_selected_features)


Mutual Information Feature Selection


In [None]:
from sklearn.feature_selection import mutual_info_classif

# Calculate mutual information for each feature in relation to the target.
# The estimator adds random noise to continuous features, so random_state is
# fixed to make the scores (and the features selected from them)
# reproducible across runs.
# NOTE(review): with the default discrete_features='auto' every column of a
# dense input is treated as continuous, including one-hot indicator columns;
# consider passing an explicit mask.
mutual_info = mutual_info_classif(X_train, y_train, random_state=42)
mutual_info_series = pd.Series(mutual_info, index=X_train.columns)

# Set a threshold to select features with significant mutual information
mutual_info_selected_features = mutual_info_series[mutual_info_series > 0.1].index

# Display selected features based on Mutual Information
print("Selected features from Mutual Information:\n", mutual_info_selected_features)


Firefly Algorithm

In [None]:
import numpy as np

# Parameters for Firefly Algorithm
num_fireflies = 20
max_iterations = 50
gamma = 1.0  # Absorption coefficient
alpha = 0.2  # Upper bound on the per-feature probability of copying a brighter firefly
# Fitness cost charged per selected feature. With 0.0 the objective is the
# plain sum of scores.
# NOTE(review): all scores are non-negative, so with 0.0 the fitness can only
# grow as features are added and the search is biased towards selecting
# everything. Set a positive value to favour smaller subsets.
size_penalty = 0.0

num_features = len(X_train.columns)


def _scale_to_unit(scores):
    """Align scores to X_train's columns, treat NaN as 0, and divide by the maximum."""
    aligned = scores.reindex(X_train.columns).fillna(0.0)
    peak = aligned.max()
    # An all-zero (or empty) score vector is returned as-is to avoid 0/0.
    return aligned / peak if peak > 0 else aligned


# Scoring function that combines Spearman and Mutual Information scores
# Normalize scores between 0 and 1, then sum
spearman_scores = _scale_to_unit(spearman_corr)
mutual_info_scores = _scale_to_unit(mutual_info_series)
combined_scores = spearman_scores + mutual_info_scores

# Plain array view of the scores, in X_train column order, for fast lookups
score_vector = combined_scores.to_numpy(dtype=float)

# Initialize fireflies with random subsets of features
np.random.seed(42)
fireflies = [np.random.choice([0, 1], num_features) for _ in range(num_fireflies)]


def evaluate_fitness(firefly):
    """Return the fitness of a firefly.

    Parameters
    ----------
    firefly : array-like of 0/1
        Feature mask in X_train column order; 1 marks a selected feature.

    Returns
    -------
    float
        Sum of the combined scores of the selected features, minus
        ``size_penalty`` for every selected feature.
    """
    selected = np.asarray(firefly) == 1
    return float(score_vector[selected].sum() - size_penalty * selected.sum())


# Fitness is cached per firefly and refreshed only when that firefly moves,
# instead of being recomputed for every pairwise comparison.
fitness = np.array([evaluate_fitness(firefly) for firefly in fireflies], dtype=float)

# Firefly Optimization Loop
for iteration in range(max_iterations):
    for i in range(num_fireflies):
        for j in range(num_fireflies):
            if fitness[j] > fitness[i]:
                # Squared distance between two 0/1 masks is their Hamming
                # distance. It is normalised by the number of features:
                # with the raw count, exp(-gamma * distance) underflows to
                # ~0 for any realistic feature count and no firefly would
                # ever move.
                hamming = np.sum(fireflies[i] != fireflies[j])
                copy_probability = alpha * np.exp(
                    -gamma * hamming / max(num_features, 1)
                )
                # Update firefly i towards firefly j: each bit is copied
                # from j with probability copy_probability.
                fireflies[i] = np.where(
                    np.random.rand(num_features) < copy_probability,
                    fireflies[j],
                    fireflies[i]
                )
                fitness[i] = evaluate_fitness(fireflies[i])

    # Optional: Decrease alpha over iterations for reduced randomness
    # alpha *= 0.95

# Select the best firefly as the optimized feature subset
# (argmax returns the first maximum, matching max(..., key=...)).
best_firefly = fireflies[int(np.argmax(fitness))]
optimized_features = X_train.columns[best_firefly == 1]

print("Optimized feature subset:\n", optimized_features)


# Output the final selected features
print("Final selected features after Firefly Optimization:\n", optimized_features)
