In [6]:
# 1. Import Libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score
from pyswarms.discrete import BinaryPSO
print('done - imports')

done - imports


In [7]:
# 2. Load and Clean Data
df = pd.read_csv('SWaT.csv')
df.columns = df.columns.str.strip()
df = df.drop(['Timestamp', 'Normal/Attack'], axis=1)
print('done - load/clean')

done - load/clean


In [8]:
# 3. Data Preprocessing
scaler = MinMaxScaler()
X = pd.DataFrame(scaler.fit_transform(df), columns=df.columns)
constant_cols = [col for col in X.columns if X[col].nunique() <= 1]
X = X.drop(constant_cols, axis=1)
print('done - scaling/constants')

done - scaling/constants


In [9]:
# 4. Advanced Feature Engineering: Rolling Window Stats
stat_features = []
for col in X.columns:
    stat_df = pd.DataFrame({
        f'{col}_mean': X[col].rolling(window=10, min_periods=1).mean(),
        f'{col}_std':  X[col].rolling(window=10, min_periods=1).std(),
        f'{col}_min':  X[col].rolling(window=10, min_periods=1).min(),
        f'{col}_max':  X[col].rolling(window=10, min_periods=1).max(),
    })
    stat_features.append(stat_df)
stat_features_df = pd.concat(stat_features, axis=1)
X = pd.concat([X, stat_features_df], axis=1)
print('done - rolling/statistics')

done - rolling/statistics


In [17]:
# 5. Advanced Feature Engineering: DIP Features
# Only run DIP features on original input columns
orig_cols = [col for col in df.columns if col in X.columns and not any(s in col for s in ['_mean', '_std', '_min', '_max', '_diff', '_cumsum'])]

dip_dict = {}
for col in orig_cols:
    dip_dict[f'{col}_diff'] = X[col].diff().fillna(0)
    dip_dict[f'{col}_cumsum'] = X[col].cumsum()

dip_features_df = pd.DataFrame(dip_dict, index=X.index)
X = pd.concat([X, dip_features_df], axis=1)
print('done - DIP features')

done - DIP features


In [18]:
# 6. Load Labels
labels = pd.read_csv('SWaT.csv')['Normal/Attack']
print('done - labels')

done - labels


In [19]:
# 7. PSO Feature Selection (No Split, Full Dataset)
X_np = X.values
y_np = labels.values

def pso_fitness(mask):
    fitness = []
    for particle in mask:
        cols = [i for i, bit in enumerate(particle) if bit]
        if not cols:
            fitness.append(0)
            continue
        clf = RandomForestClassifier(n_estimators=30, random_state=42)
        clf.fit(X_np[:, cols], y_np)
        preds = clf.predict(X_np[:, cols])
        fitness.append(f1_score(y_np, preds, average='weighted'))
    return -1 * np.array(fitness)

n_features = X.shape[1]
options = {'c1': 1.5, 'c2': 1.5, 'w': 0.8, 'k': 2, 'p': 2}
pso = BinaryPSO(n_particles=10, dimensions=n_features, options=options)
cost, pos = pso.optimize(pso_fitness, iters=10)
selected_features = [col for col, use in zip(X.columns, pos) if use]
print("Selected features:", selected_features)
print("Number of features selected:", len(selected_features))
print('done - PSO feature selection')

2025-11-17 10:49:23,679 - pyswarms.discrete.binary - INFO - Optimize for 10 iters with {'c1': 1.5, 'c2': 1.5, 'w': 0.8, 'k': 2, 'p': 2}
pyswarms.discrete.binary: 100%|██████████|10/10, best_cost=-1
2025-11-17 17:15:29,942 - pyswarms.discrete.binary - INFO - Optimization finished | best cost: -1.0, best pos: [1 0 0 ... 1 1 1]


Selected features: ['FIT101', 'P101', 'P102', 'AIT201', 'AIT202', 'FIT201', 'MV201', 'P201', 'P203', 'P204', 'P205', 'DPIT301', 'MV301', 'AIT401', 'AIT402', 'FIT401', 'LIT401', 'P402', 'AIT503', 'FIT501', 'PIT503', 'P602', 'FIT101_std', 'FIT101_min', 'FIT101_max', 'LIT101_std', 'MV101_max', 'P101_min', 'P101_max', 'P102_min', 'P102_max', 'AIT201_mean', 'AIT201_max', 'AIT202_mean', 'AIT202_std', 'AIT202_min', 'AIT203_max', 'FIT201_mean', 'FIT201_std', 'MV201_std', 'P201_max', 'P203_mean', 'P203_min', 'P203_max', 'P204_std', 'P204_min', 'P205_std', 'P206_mean', 'DPIT301_mean', 'DPIT301_max', 'LIT301_std', 'MV301_mean', 'MV301_min', 'MV301_max', 'MV302_max', 'MV303_max', 'MV304_std', 'MV304_max', 'P302_std', 'P302_min', 'AIT402_std', 'AIT402_min', 'FIT401_mean', 'FIT401_std', 'FIT401_min', 'FIT401_max', 'LIT401_min', 'P403_mean', 'P403_std', 'P403_max', 'UV401_mean', 'AIT501_std', 'AIT501_min', 'AIT501_max', 'AIT502_std', 'AIT502_max', 'AIT504_mean', 'FIT501_min', 'FIT501_max', 'FIT502_mi