In [3]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectKBest, mutual_info_classif, f_classif, RFE, VarianceThreshold
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score
from pyswarms.discrete import BinaryPSO
import time

print('done - imports')

# Load and preprocess
df = pd.read_csv('SWaT.csv')
df.columns = df.columns.str.strip()
df['Normal/Attack'] = (df['Normal/Attack'] == 'Attack').astype(int)
X_raw = df.drop(['Timestamp', 'Normal/Attack'], axis=1)
y = df['Normal/Attack']

scaler = MinMaxScaler()
X_raw = pd.DataFrame(scaler.fit_transform(X_raw), columns=X_raw.columns)
X_train, X_test, y_train, y_test = train_test_split(X_raw, y, test_size=0.2, stratify=y, random_state=42)
print('done - preprocessing and split')

# PSO options
options = {'c1': 1.5, 'c2': 1.5, 'w': 0.8, 'k': 2, 'p': 2}
results = {}

# Helper function for PSO
def run_pso_on_features(X_tr, X_te, y_tr, y_te, method_name):
    start = time.time()
    X_tr_np = X_tr.values if isinstance(X_tr, pd.DataFrame) else X_tr
    X_te_np = X_te.values if isinstance(X_te, pd.DataFrame) else X_te
    
    def fitness(mask):
        scores = []
        for particle in mask:
            cols = [i for i, bit in enumerate(particle) if bit]
            if not cols:
                scores.append(0)
                continue
            clf = RandomForestClassifier(n_estimators=30, random_state=42)
            clf.fit(X_tr_np[:, cols], y_tr)
            preds = clf.predict(X_te_np[:, cols])
            scores.append(f1_score(y_te, preds, average='weighted'))
        return -1 * np.array(scores)
    
    n_feat = X_tr_np.shape[1]
    pso = BinaryPSO(n_particles=10, dimensions=n_feat, options=options)
    cost, pos = pso.optimize(fitness, iters=10)
    
    selected_indices = [i for i, bit in enumerate(pos) if bit]
    clf = RandomForestClassifier(n_estimators=30, random_state=42)
    clf.fit(X_tr_np[:, selected_indices], y_tr)
    preds = clf.predict(X_te_np[:, selected_indices])
    
    return {
        'num_features': len(selected_indices),
        'selected_indices': selected_indices,
        'f1': f1_score(y_te, preds, average='weighted'),
        'accuracy': accuracy_score(y_te, preds),
        'precision': precision_score(y_te, preds, average='weighted'),
        'recall': recall_score(y_te, preds, average='weighted'),
        'time': time.time() - start
    }

done - imports
done - preprocessing and split


In [4]:
# 5. RFE + PSO
print('Running: RFE + PSO...')
from tqdm import tqdm

# Add progress tracking to RFE
print('  - Running RFE feature elimination (this may take a few minutes)...')
rfe = RFE(estimator=RandomForestClassifier(n_estimators=30, random_state=42), n_features_to_select=30, verbose=1)
X_train_rfe = rfe.fit_transform(X_train, y_train)
X_test_rfe = rfe.transform(X_test)
print('  - RFE completed, starting PSO...')

results['RFE_PSO'] = run_pso_on_features(X_train_rfe, X_test_rfe, y_train, y_test, 'RFE+PSO')
print(f"done - RFE+PSO: {results['RFE_PSO']['num_features']} features, F1={results['RFE_PSO']['f1']:.4f}")

Running: RFE + PSO...
  - Running RFE feature elimination (this may take a few minutes)...
Fitting estimator with 51 features.
Fitting estimator with 50 features.
Fitting estimator with 49 features.
Fitting estimator with 48 features.
Fitting estimator with 47 features.
Fitting estimator with 46 features.
Fitting estimator with 45 features.
Fitting estimator with 44 features.
Fitting estimator with 43 features.
Fitting estimator with 42 features.
Fitting estimator with 41 features.
Fitting estimator with 40 features.
Fitting estimator with 39 features.
Fitting estimator with 38 features.
Fitting estimator with 37 features.
Fitting estimator with 36 features.
Fitting estimator with 35 features.
Fitting estimator with 34 features.
Fitting estimator with 33 features.
Fitting estimator with 32 features.
Fitting estimator with 31 features.


2025-11-19 13:27:25,353 - pyswarms.discrete.binary - INFO - Optimize for 10 iters with {'c1': 1.5, 'c2': 1.5, 'w': 0.8, 'k': 2, 'p': 2}


  - RFE completed, starting PSO...


pyswarms.discrete.binary: 100%|██████████|10/10, best_cost=-1
2025-11-19 14:07:17,816 - pyswarms.discrete.binary - INFO - Optimization finished | best cost: -0.9998444413420144, best pos: [1 1 1 1 0 0 1 0 0 0 1 0 0 1 0 1 0 1 0 0 1 1 1 0 0 1 0 1 0 1]


done - RFE+PSO: 15 features, F1=0.9998


In [7]:
rfe_selected_indices = rfe.get_support(indices=True)
rfe_feature_names = X_train.columns[rfe_selected_indices]

In [8]:
# Assign the PSO mask from your output
best_pos = [1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1]
  # Use actual mask from your output

selected_features = np.array(rfe_feature_names)[np.array(best_pos, dtype=bool)]
print("Selected features:", selected_features.tolist())

Selected features: ['FIT101', 'LIT101', 'MV101', 'P102', 'AIT203', 'FIT301', 'MV304', 'AIT402', 'LIT401', 'AIT503', 'AIT504', 'FIT501', 'P501', 'PIT502', 'FIT601']
