In [1]:
import time
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import RFE
from lightgbm import LGBMClassifier

data = pd.read_csv("/kaggle/input/iot-dataset-for-intrusion-detection-systems-ids/BoTNeTIoT-L01-v2.csv")

data = data.drop(['Device_Name', 'Attack', 'Attack_subType'], axis=1)

X = data.drop('label', axis=1)
y = data['label']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

lgbm = LGBMClassifier()

lgbm.fit(X_train, y_train)
print('Initial LightGBM accuracy:', lgbm.score(X_test, y_test))

feature_importances = lgbm.feature_importances_

feature_ranking = pd.Series(feature_importances, index=X.columns).sort_values()

threshold = 0.01

selected_features = feature_ranking[feature_ranking > threshold].index

X_train_selected = X_train[selected_features]
X_test_selected = X_test[selected_features]

print("Selected features:", selected_features)


rfe = RFE(estimator=LGBMClassifier())
rfe.fit(X_train_selected, y_train)

rfe_ranking = pd.Series(rfe.ranking_, index=selected_features).sort_values()
selected_rfe_features = rfe_ranking[rfe_ranking == 1].index

print("RFE selected features:", selected_rfe_features)

X_train_rfe = X_train_selected[selected_rfe_features]
X_test_rfe = X_test_selected[selected_rfe_features]

start_time = time.time()
lgbm.fit(X_train_rfe, y_train)
end_time = time.time()
print('LightGBM accuracy with RFE selected features:', lgbm.score(X_test_rfe, y_test))
elapsed_time = end_time - start_time
print(f"Time taken RFE: {elapsed_time} seconds")


[LightGBM] [Info] Number of positive: 444352, number of negative: 5205732
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.735941 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 5862
[LightGBM] [Info] Number of data points in the train set: 5650084, number of used features: 23
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.078645 -> initscore=-2.460899
[LightGBM] [Info] Start training from score -2.460899
Initial LightGBM accuracy: 0.9999978761392743
Selected features: Index(['H_L0.1_variance', 'HH_L0.1_std', 'HpHp_L0.1_covariance',
       'HpHp_L0.1_pcc', 'HpHp_L0.1_std', 'HH_L0.1_pcc', 'HH_L0.1_magnitude',
       'HpHp_L0.1_mean', 'HpHp_L0.1_magnitude', 'HH_L0.1_mean',
       'HH_L0.1_weight', 'HpHp_L0.1_radius', 'HH_L0.1_radius',
       'HH_L0.1_covariance', 'H_L0.1_weight', 'HpHp_L0.1_weight',
       'MI_dir_L0.1_variance', 'HH_ji