In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectKBest, mutual_info_classif, f_classif, RFE, VarianceThreshold
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score
from pyswarms.discrete import BinaryPSO
import time

print('done - imports')

# Load and preprocess
df = pd.read_csv('SWaT.csv')
df.columns = df.columns.str.strip()
df['Normal/Attack'] = (df['Normal/Attack'] == 'Attack').astype(int)
X_raw = df.drop(['Timestamp', 'Normal/Attack'], axis=1)
y = df['Normal/Attack']

scaler = MinMaxScaler()
X_raw = pd.DataFrame(scaler.fit_transform(X_raw), columns=X_raw.columns)
X_train, X_test, y_train, y_test = train_test_split(X_raw, y, test_size=0.2, stratify=y, random_state=42)
print('done - preprocessing and split')

# PSO options
options = {'c1': 1.5, 'c2': 1.5, 'w': 0.8, 'k': 2, 'p': 2}
results = {}

# Helper function for PSO
def run_pso_on_features(X_tr, X_te, y_tr, y_te, method_name):
    start = time.time()
    X_tr_np = X_tr.values if isinstance(X_tr, pd.DataFrame) else X_tr
    X_te_np = X_te.values if isinstance(X_te, pd.DataFrame) else X_te
    
    def fitness(mask):
        scores = []
        for particle in mask:
            cols = [i for i, bit in enumerate(particle) if bit]
            if not cols:
                scores.append(0)
                continue
            clf = RandomForestClassifier(n_estimators=30, random_state=42)
            clf.fit(X_tr_np[:, cols], y_tr)
            preds = clf.predict(X_te_np[:, cols])
            scores.append(f1_score(y_te, preds, average='weighted'))
        return -1 * np.array(scores)
    
    n_feat = X_tr_np.shape[1]
    pso = BinaryPSO(n_particles=10, dimensions=n_feat, options=options)
    cost, pos = pso.optimize(fitness, iters=10)
    
    selected_indices = [i for i, bit in enumerate(pos) if bit]
    clf = RandomForestClassifier(n_estimators=30, random_state=42)
    clf.fit(X_tr_np[:, selected_indices], y_tr)
    preds = clf.predict(X_te_np[:, selected_indices])
    
    return {
        'num_features': len(selected_indices),
        'selected_indices': selected_indices,
        'f1': f1_score(y_te, preds, average='weighted'),
        'accuracy': accuracy_score(y_te, preds),
        'precision': precision_score(y_te, preds, average='weighted'),
        'recall': recall_score(y_te, preds, average='weighted'),
        'time': time.time() - start
    }

done - imports
done - preprocessing and split


In [4]:
# 6. Random Forest Feature Importance + PSO
print('Running: RF Importance + PSO...')
print('  - Training Random Forest to calculate feature importances...')
rf_imp = RandomForestClassifier(n_estimators=100, random_state=42, verbose=1)
rf_imp.fit(X_train, y_train)
print('  - Selecting top 10 features by importance...')
importances = rf_imp.feature_importances_
top_10_idx = np.argsort(importances)[-10:]
X_train_rf = X_train.iloc[:, top_10_idx]
X_test_rf = X_test.iloc[:, top_10_idx]
print('  - Starting PSO optimization...')
results['RFImportance_PSO'] = run_pso_on_features(X_train_rf, X_test_rf, y_train, y_test, 'RFImp+PSO')
print(f"done - RFImp+PSO: {results['RFImportance_PSO']['num_features']} features, F1={results['RFImportance_PSO']['f1']:.4f}")

Running: RF Importance + PSO...
  - Training Random Forest to calculate feature importances...


[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:   46.4s
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:  1.6min finished
2025-11-19 13:25:06,013 - pyswarms.discrete.binary - INFO - Optimize for 10 iters with {'c1': 1.5, 'c2': 1.5, 'w': 0.8, 'k': 2, 'p': 2}


  - Selecting top 10 features by importance...
  - Starting PSO optimization...


pyswarms.discrete.binary: 100%|██████████|10/10, best_cost=-0.999
2025-11-19 13:55:20,713 - pyswarms.discrete.binary - INFO - Optimization finished | best cost: -0.9988763115177857, best pos: [1 1 0 1 0 1 0 1 1 1]


done - RFImp+PSO: 7 features, F1=0.9989


In [9]:
print(results)
print(results['RFImportance_PSO'])

{'RFImportance_PSO': {'num_features': 7, 'selected_indices': [0, 1, 3, 5, 7, 8, 9], 'f1': 0.9988763115177857, 'accuracy': 0.9988775782361309, 'precision': 0.9988769274655339, 'recall': 0.9988775782361309, 'time': 1831.141232252121}}
{'num_features': 7, 'selected_indices': [0, 1, 3, 5, 7, 8, 9], 'f1': 0.9988763115177857, 'accuracy': 0.9988775782361309, 'precision': 0.9988769274655339, 'recall': 0.9988775782361309, 'time': 1831.141232252121}


In [10]:
# Extract indices of selected features
selected_indices = results['RFImportance_PSO']['selected_indices']

# Print feature names
selected_features = X_train_rf.columns[selected_indices]
print("Selected features:", selected_features.tolist())


Selected features: ['FIT503', 'LIT401', 'LIT301', 'FIT501', 'PIT501', 'PIT502', 'PIT503']
