In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.metrics import accuracy_score, classification_report

#importing libraries for machine learning
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from sklearn.decomposition import PCA
from keras.models import Sequential
from keras.layers import Dense, Dropout, LSTM
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from keras.models import Sequential, Model
from keras.layers import Dense, LSTM, Dropout
from sklearn.feature_selection import SelectKBest, f_classif

import time

In [2]:
def stratify(df, n_points):
    # Using train_test_split with stratify to select n_points while maintaining class balance
    stratified_sample, _ = train_test_split(df, test_size=n_points, stratify=df['Is_Malicious'], random_state=42)
    return stratified_sample


In [3]:
def resample_dataframe(df, class_column):
    # Count occurrences of each class
    class_counts = df[class_column].value_counts()
    
    # Majority and minority classes
    majority_class = max(class_counts)
    minority_class = min(class_counts)
    
    # Target count for balancing
    target_count = int((majority_class + minority_class)/2)
    
    resampled_dfs = []  # List to store resampled DataFrames
    
    for class_value, count in class_counts.items():
        class_df = df[df[class_column] == class_value]
        
        if count < target_count:
            # Resample with replacement
            resampled_df = class_df.sample(n=target_count, replace=True, random_state=42)
        elif count > target_count:
            # Resample without replacement
            resampled_df = class_df.sample(n=target_count, replace=False, random_state=42)
        else:
            resampled_df = class_df
        
        resampled_dfs.append(resampled_df)
    
    # Combine all resampled DataFrames
    resampled_df = pd.concat(resampled_dfs)
    
    return resampled_df

In [4]:
# Function to preprocess the data
def preprocess_data(df, target_column):
    X = df.drop(target_column, axis=1)
    y = df[target_column]
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    return X_scaled, y, X.columns.tolist()

In [5]:
df = pd.read_csv('sensornetguard.csv')

In [6]:
df = df.drop( ['IP_Address','Node_ID','Timestamp'], axis=1) #[,'Pinged_IP']

In [7]:
target_column = 'Is_Malicious'

In [8]:
df = stratify(df, 5000)
len(df)

5000

In [9]:
df = resample_dataframe(df, target_column)
len(df)

5000

In [10]:
X_scaled, y, feature_names = preprocess_data(df, target_column)
X_train, X_valid, y_train, y_valid = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

In [11]:

# Function to create the LSTM model
def create_lstm_model(input_shape):
    model = Sequential()
    model.add(LSTM(units=64, return_sequences=True, input_shape=input_shape))
    model.add(LSTM(units=32))
    model.add(Dense(units=1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

In [12]:
# Function to create the DNN model
def create_dnn_model(input_dim):
    model = Sequential()
    model.add(Dense(128, input_dim=input_dim, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(64, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(32, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model

In [13]:
# DNN Feature Selection
def dnn_feature_selection(X_train, y_train, feature_names, num_features=5):
    # Train a RandomForestClassifier to get feature importances
    rf_model = RandomForestClassifier(random_state=42)
    rf_model.fit(X_train, y_train)
    
    # Get feature importances and sort them
    importances = rf_model.feature_importances_
    
    sorted_idx = np.argsort(importances)[::-1]
    
    # Get top 'num_features' features
    selected_features = [feature_names[i] for i in sorted_idx[:num_features]]
    return selected_features

In [14]:
# LSTM Feature Selection
def lstm_feature_selection(X_train, y_train, X_valid, feature_names, num_features=5):
    time_steps = 1
    features = X_train.shape[1]
    input_shape = (time_steps, features)
    
    # Reshape data for LSTM
    X_train_lstm = np.reshape(X_train, (X_train.shape[0], time_steps, features))
    X_valid_lstm = np.reshape(X_valid, (X_valid.shape[0], time_steps, features))
    
    # Train LSTM model
    lstm_model = create_lstm_model(input_shape)
    lstm_model.fit(X_train_lstm, y_train, epochs=3, batch_size=32, verbose=0)
    
    # Use the LSTM layer outputs to rank features (e.g., use layer's weights)
    lstm_weights = lstm_model.layers[0].get_weights()[0]
    
    # Get the absolute sum of the weights for each feature
    feature_importance = np.abs(lstm_weights).sum(axis=1)
    
    # Sort features by importance
    sorted_idx = np.argsort(feature_importance)[::-1]
    
    # Get top 'num_features' features
    selected_features = [feature_names[i] for i in sorted_idx[:num_features]]
    return selected_features

In [15]:
# Preprocess data
X_scaled, y, feature_names = preprocess_data(df, target_column)
X_train, X_valid, y_train, y_valid = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Get the time taken for DNN model
start_time = time.time()
dnn_model = create_dnn_model(X_train.shape[1])
dnn_model.fit(X_train, y_train, epochs=3, batch_size=32, verbose=0)
dnn_time = time.time() - start_time

# Get the time taken for LSTM model
time_steps = 1
features = X_train.shape[1]
input_shape = (time_steps, features)
X_train_lstm = np.reshape(X_train, (X_train.shape[0], time_steps, features))
X_valid_lstm = np.reshape(X_valid, (X_valid.shape[0], time_steps, features))

start_time = time.time()
lstm_model = create_lstm_model(input_shape)
lstm_model.fit(X_train_lstm, y_train, epochs=3, batch_size=32, verbose=0)
lstm_time = time.time() - start_time




  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  super().__init__(**kwargs)


In [16]:
# Select features based on the model that finishes first
if dnn_time < lstm_time:
    selected_features = dnn_feature_selection(pd.DataFrame(X_train), y_train, feature_names, num_features=4)
    print("DNN finished first, using DNN selected features:", selected_features)
else:
    selected_features = lstm_feature_selection(pd.DataFrame(X_train), y_train, X_valid, feature_names, num_features=4)
    print("LSTM finished first, using LSTM selected features:", selected_features)

DNN finished first, using DNN selected features: ['Data_Throughput', 'Error_Rate', 'Energy_Consumption_Rate', 'Packet_Drop_Rate']


In [17]:
feature_names=selected_features
print(feature_names)

['Data_Throughput', 'Error_Rate', 'Energy_Consumption_Rate', 'Packet_Drop_Rate']


In [18]:
import time
import pandas as pd
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.preprocessing import MinMaxScaler

X = df[feature_names]
y = df[target_column]

# Splitting the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scaling features using Min-Max scaler
scaler = MinMaxScaler()
X_train_selected_scaled = scaler.fit_transform(X_train)
X_test_selected_scaled = scaler.transform(X_test)

# Defining classifiers
classifiers = {
    'Random Forest': RandomForestClassifier(random_state=42, n_estimators=100),
    'XGBoost': XGBClassifier(random_state=42),
    'LightGBM': LGBMClassifier(random_state=42),
    'Extra Trees': ExtraTreesClassifier(random_state=42),
    'Gradient Boosting': GradientBoostingClassifier(random_state=42)
}

# Creating an empty DataFrame to store the metrics
metrics_df = pd.DataFrame(columns=['Model', 'Accuracy', 'Precision', 'Recall', 'F1-score', 'ROC AUC', 'Training Time', 'Prediction Time'])

# Loop through classifiers
for model_name, classifier in classifiers.items():
    start_time = time.time()
    
    # Training the classifier
    classifier.fit(X_train_selected_scaled, y_train)
    training_time = time.time() - start_time

    start_time = time.time()
    
    # Making predictions on the test set
    predictions = classifier.predict(X_test_selected_scaled)
    prediction_time = time.time() - start_time

    # Calculating metrics
    accuracy = accuracy_score(y_test, predictions)
    precision = precision_score(y_test, predictions)
    recall = recall_score(y_test, predictions)
    f1 = f1_score(y_test, predictions)
    roc_auc = roc_auc_score(y_test, predictions)

    # Constructing a DataFrame row with the current metrics
    metrics_row = pd.DataFrame({
        'Model': [model_name],
        'Accuracy': [accuracy],
        'Precision': [precision],
        'Recall': [recall],
        'F1-score': [f1],
        'ROC AUC': [roc_auc],
        'Training Time': [training_time],
        'Prediction Time': [prediction_time]
    })

    # Concatenating the current metrics with the main metrics DataFrame
    metrics_df = pd.concat([metrics_df, metrics_row], ignore_index=True)

print(metrics_df)


Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.

  metrics_df = pd.concat([metrics_df, metrics_row], ignore_index=True)


[LightGBM] [Info] Number of positive: 2000, number of negative: 2000
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000754 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1006
[LightGBM] [Info] Number of data points in the train set: 4000, number of used features: 4
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
               Model  Accuracy  Precision  Recall  F1-score  ROC AUC  \
0      Random Forest     1.000   1.000000     1.0  1.000000    1.000   
1            XGBoost     1.000   1.000000     1.0  1.000000    1.000   
2           LightGBM     1.000   1.000000     1.0  1.000000    1.000   
3        Extra Trees     1.000   1.000000     1.0  1.000000    1.000   
4  Gradient Boosting     0.999   0.998004     1.0  0.999001    0.999   

   Training Time  Prediction Time  
0       0.440681         0.011351  
1       0.083795         0.000999  
2       0.498563        