In [None]:
# 1. Import Required Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, VotingClassifier, StackingClassifier
from sklearn.neighbors import KNeighborsClassifier

In [None]:
# 2. Load the Intrusion Detection Dataset
# The data is read from "intrusion_detection.csv", resolved relative to the
# notebook's working directory.
# NOTE(review): the notebook describes this as a KDD Cup 99 example -- confirm
# that the CSV on disk actually matches that dataset.
df = pd.read_csv(filepath_or_buffer="intrusion_detection.csv")

In [None]:
# 3. Data Preprocessing
# Fill missing values in numeric columns with that column's mean.
# numeric_only=True matters: on pandas >= 2.0, DataFrame.mean() raises a
# TypeError if any non-numeric column is present (presumably the case here,
# since 'categorical_feature' is label-encoded only afterwards).
# Rebinding `df` instead of using inplace=True keeps the cell safe to re-run.
df = df.fillna(df.mean(numeric_only=True))

# Convert the categorical feature to integer codes using Label Encoding
# (switch to One-Hot Encoding if the integer ordering should not be implied).
label_encoder = LabelEncoder()
df['categorical_feature'] = label_encoder.fit_transform(df['categorical_feature'])

# Split dataset into features and target
X = df.drop('target', axis=1)  # Features
y = df['target']  # Target variable (binary: 1 = intrusion, 0 = no intrusion)

# Split into training and testing sets (80% train, 20% test) BEFORE scaling so
# that no statistic of the held-out rows influences the fitted scaler.
# The row assignment depends only on the number of rows and random_state, so
# the same rows land in train/test as when the split was done after scaling.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Feature Scaling using StandardScaler: learn mean/std on the training rows
# only, then apply that same transform to the test rows.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Fully scaled feature matrix, kept for any cell that still refers to it.
# It uses the train-fitted scaler, so it is consistent with X_train / X_test.
X_scaled = scaler.transform(X)

In [None]:
# 4. Individual Models: Logistic Regression, Decision Tree, Random Forest, KNN
models = {
    'Logistic Regression': LogisticRegression(),
    'Decision Tree': DecisionTreeClassifier(random_state=42),
    'Random Forest': RandomForestClassifier(random_state=42),
    'KNN': KNeighborsClassifier()
}

# Train and evaluate each individual model.
# model_metrics maps model name -> {metric name -> score on the test split}.
model_metrics = {}
for model_name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    # ROC-AUC is a ranking metric: it must be fed continuous scores, not the
    # thresholded 0/1 labels. Column 1 of predict_proba is the probability of
    # the class with the greater label (1 = intrusion for a 0/1 target).
    y_score = model.predict_proba(X_test)[:, 1]

    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    roc_auc = roc_auc_score(y_test, y_score)

    model_metrics[model_name] = {'Accuracy': accuracy, 'Precision': precision, 'Recall': recall, 'F1-Score': f1, 'ROC-AUC': roc_auc}

# Display performance metrics of individual models
model_metrics

# 5. Bagging Ensemble Method
# A random forest is used as the bagging representative here (bootstrap
# aggregation over decision trees).
bagging_model = RandomForestClassifier(random_state=42)
bagging_model.fit(X_train, y_train)
y_pred_bagging = bagging_model.predict(X_test)
# Positive-class probabilities for ROC-AUC; hard labels would collapse the
# ROC curve to a single operating point and understate the score.
y_score_bagging = bagging_model.predict_proba(X_test)[:, 1]

# Evaluate Bagging model on the held-out test split
bagging_metrics = {
    'Accuracy': accuracy_score(y_test, y_pred_bagging),
    'Precision': precision_score(y_test, y_pred_bagging),
    'Recall': recall_score(y_test, y_pred_bagging),
    'F1-Score': f1_score(y_test, y_pred_bagging),
    'ROC-AUC': roc_auc_score(y_test, y_score_bagging)
}

bagging_metrics

# 6. Boosting Ensemble Method: AdaBoost, GradientBoosting
boosting_models = {
    'AdaBoost': AdaBoostClassifier(random_state=42),
    'Gradient Boosting': GradientBoostingClassifier(random_state=42)
}

# boosting_metrics maps model name -> {metric name -> score on the test split}.
boosting_metrics = {}
for model_name, model in boosting_models.items():
    model.fit(X_train, y_train)
    y_pred_boosting = model.predict(X_test)
    # ROC-AUC needs continuous scores; use the probability of the positive
    # class (column 1) rather than the thresholded predictions.
    y_score_boosting = model.predict_proba(X_test)[:, 1]

    accuracy = accuracy_score(y_test, y_pred_boosting)
    precision = precision_score(y_test, y_pred_boosting)
    recall = recall_score(y_test, y_pred_boosting)
    f1 = f1_score(y_test, y_pred_boosting)
    roc_auc = roc_auc_score(y_test, y_score_boosting)

    boosting_metrics[model_name] = {'Accuracy': accuracy, 'Precision': precision, 'Recall': recall, 'F1-Score': f1, 'ROC-AUC': roc_auc}

boosting_metrics

# 7. Voting Ensemble Method: Hard and Soft Voting
# Hard voting takes the majority class label across the base estimators.
hard_voting_model = VotingClassifier(estimators=[
    ('lr', LogisticRegression()),
    ('dt', DecisionTreeClassifier(random_state=42)),
    ('rf', RandomForestClassifier(random_state=42))
])

# Soft voting averages the base estimators' predicted class probabilities.
soft_voting_model = VotingClassifier(estimators=[
    ('lr', LogisticRegression()),
    ('dt', DecisionTreeClassifier(random_state=42)),
    ('rf', RandomForestClassifier(random_state=42))
], voting='soft')

# Train and evaluate the models
hard_voting_model.fit(X_train, y_train)
soft_voting_model.fit(X_train, y_train)

y_pred_hard_voting = hard_voting_model.predict(X_test)
y_pred_soft_voting = soft_voting_model.predict(X_test)
# Only the soft-voting ensemble exposes predict_proba; use the positive-class
# probability (column 1) as the ROC-AUC score for it.
y_score_soft_voting = soft_voting_model.predict_proba(X_test)[:, 1]

# Evaluate both models
hard_voting_metrics = {
    'Accuracy': accuracy_score(y_test, y_pred_hard_voting),
    'Precision': precision_score(y_test, y_pred_hard_voting),
    'Recall': recall_score(y_test, y_pred_hard_voting),
    'F1-Score': f1_score(y_test, y_pred_hard_voting),
    # A hard-voting classifier produces labels only (no predict_proba), so
    # this ROC-AUC is computed from 0/1 predictions and reflects a single
    # operating point; it is not directly comparable to probability-based AUC.
    'ROC-AUC': roc_auc_score(y_test, y_pred_hard_voting)
}

soft_voting_metrics = {
    'Accuracy': accuracy_score(y_test, y_pred_soft_voting),
    'Precision': precision_score(y_test, y_pred_soft_voting),
    'Recall': recall_score(y_test, y_pred_soft_voting),
    'F1-Score': f1_score(y_test, y_pred_soft_voting),
    'ROC-AUC': roc_auc_score(y_test, y_score_soft_voting)
}

hard_voting_metrics, soft_voting_metrics

# 8. Stacking Ensemble Method
# Three base estimators feed a logistic-regression meta-learner.
stacking_model = StackingClassifier(estimators=[
    ('lr', LogisticRegression()),
    ('dt', DecisionTreeClassifier(random_state=42)),
    ('rf', RandomForestClassifier(random_state=42))
], final_estimator=LogisticRegression())

stacking_model.fit(X_train, y_train)
y_pred_stacking = stacking_model.predict(X_test)
# Positive-class probabilities for ROC-AUC; scoring hard labels would reduce
# the ROC curve to one threshold and understate the ranking quality.
y_score_stacking = stacking_model.predict_proba(X_test)[:, 1]

# Evaluate Stacking model on the held-out test split
stacking_metrics = {
    'Accuracy': accuracy_score(y_test, y_pred_stacking),
    'Precision': precision_score(y_test, y_pred_stacking),
    'Recall': recall_score(y_test, y_pred_stacking),
    'F1-Score': f1_score(y_test, y_pred_stacking),
    'ROC-AUC': roc_auc_score(y_test, y_score_stacking)
}

stacking_metrics

# 9. Performance Comparison Table
# Gather every model's metric dict under its display name, in the order the
# rows should appear in the table.
scores_by_model = {
    'Logistic Regression': model_metrics['Logistic Regression'],
    'Decision Tree': model_metrics['Decision Tree'],
    'Random Forest': model_metrics['Random Forest'],
    'KNN': model_metrics['KNN'],
    'Bagging': bagging_metrics,
    'AdaBoost': boosting_metrics['AdaBoost'],
    'Gradient Boosting': boosting_metrics['Gradient Boosting'],
    'Hard Voting': hard_voting_metrics,
    'Soft Voting': soft_voting_metrics,
    'Stacking': stacking_metrics,
}

# Pivot into column-oriented form: one list per metric, aligned with 'Model'.
metrics_summary = {'Model': list(scores_by_model)}
for metric_name in ('Accuracy', 'Precision', 'Recall', 'F1-Score', 'ROC-AUC'):
    metrics_summary[metric_name] = [
        scores[metric_name] for scores in scores_by_model.values()
    ]

# Create a DataFrame to display the comparison
metrics_df = pd.DataFrame(metrics_summary)
metrics_df

# 10. Visualization: Bar plot comparing model performance
# Create the figure/axes once and hand the axes to pandas. Calling
# plt.figure() and then DataFrame.plot() without ax= makes pandas open a
# second figure, leaving the first one to render as an empty canvas.
fig, ax = plt.subplots(figsize=(12, 6))
metrics_df.set_index('Model').plot(kind='bar', ax=ax)
ax.set_title('Model Performance Comparison')
ax.set_ylabel('Score')
# Rotate the model names so the longer labels do not overlap.
ax.tick_params(axis='x', labelrotation=45)
plt.show()
