In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectKBest, mutual_info_classif, f_classif, RFE, VarianceThreshold
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score
from pyswarms.discrete import BinaryPSO
import time

print('done - imports')

done - imports


In [2]:
# Load and preprocess
df = pd.read_csv('SWaT.csv')
df.columns = df.columns.str.strip()
df['Normal/Attack'] = (df['Normal/Attack'] == 'Attack').astype(int)
X_raw = df.drop(['Timestamp', 'Normal/Attack'], axis=1)
y = df['Normal/Attack']

In [3]:
scaler = MinMaxScaler()
X_raw = pd.DataFrame(scaler.fit_transform(X_raw), columns=X_raw.columns)
X_train, X_test, y_train, y_test = train_test_split(X_raw, y, test_size=0.2, stratify=y, random_state=42)
print('done - preprocessing and split')

done - preprocessing and split


In [4]:
# PSO options
options = {'c1': 1.5, 'c2': 1.5, 'w': 0.8, 'k': 2, 'p': 2}
results = {}

In [5]:
# Helper function for PSO
def run_pso_on_features(X_tr, X_te, y_tr, y_te, method_name):
    start = time.time()
    X_tr_np = X_tr.values if isinstance(X_tr, pd.DataFrame) else X_tr
    X_te_np = X_te.values if isinstance(X_te, pd.DataFrame) else X_te
    
    def fitness(mask):
        scores = []
        for particle in mask:
            cols = [i for i, bit in enumerate(particle) if bit]
            if not cols:
                scores.append(0)
                continue
            clf = RandomForestClassifier(n_estimators=30, random_state=42)
            clf.fit(X_tr_np[:, cols], y_tr)
            preds = clf.predict(X_te_np[:, cols])
            scores.append(f1_score(y_te, preds, average='weighted'))
        return -1 * np.array(scores)
    
    n_feat = X_tr_np.shape[1]
    pso = BinaryPSO(n_particles=10, dimensions=n_feat, options=options)
    cost, pos = pso.optimize(fitness, iters=10)
    
    selected_indices = [i for i, bit in enumerate(pos) if bit]
    clf = RandomForestClassifier(n_estimators=30, random_state=42)
    clf.fit(X_tr_np[:, selected_indices], y_tr)
    preds = clf.predict(X_te_np[:, selected_indices])
    
    return {
        'num_features': len(selected_indices),
        'selected_indices': selected_indices,
        'f1': f1_score(y_te, preds, average='weighted'),
        'accuracy': accuracy_score(y_te, preds),
        'precision': precision_score(y_te, preds, average='weighted'),
        'recall': recall_score(y_te, preds, average='weighted'),
        'time': time.time() - start
    }

In [9]:
# 1. Variance Threshold + PSO
print('Running: Variance Threshold + PSO...')
vt = VarianceThreshold(threshold=0.01)
X_train_vt = vt.fit_transform(X_train)
X_test_vt = vt.transform(X_test)
results['VarianceThreshold_PSO'] = run_pso_on_features(X_train_vt, X_test_vt, y_train, y_test, 'VT+PSO')
print(f"done - VT+PSO: {results['VarianceThreshold_PSO']['num_features']} features, F1={results['VarianceThreshold_PSO']['f1']:.4f}")

Running: Variance Threshold + PSO...


2025-11-19 11:35:07,159 - pyswarms.discrete.binary - INFO - Optimize for 10 iters with {'c1': 1.5, 'c2': 1.5, 'w': 0.8, 'k': 2, 'p': 2}
pyswarms.discrete.binary: 100%|██████████|10/10, best_cost=-1
2025-11-19 12:04:18,256 - pyswarms.discrete.binary - INFO - Optimization finished | best cost: -0.9998444290662696, best pos: [0 1 0 1 1 0 0 1 1 1 1 0 0 1 1 1 1 1 1 1 1 1 1 0 0 0 0 1 1 0 1]


done - VT+PSO: 20 features, F1=0.9998


In [10]:
 print(selected_feature_names_vt)
print(f"Total: {len(selected_feature_names_vt)} features")

['MV101', 'P101', 'AIT201', 'AIT203', 'FIT201', 'MV201', 'P201', 'FIT301', 'MV302', 'MV304', 'P302', 'AIT402', 'FIT401', 'LIT401', 'UV401', 'AIT502', 'AIT503', 'FIT502', 'FIT504', 'P501', 'PIT501', 'PIT503']
Total: 22 features
