In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score
from pyswarms.discrete import BinaryPSO
print('done - imports')

done - imports


In [2]:
df = pd.read_csv('SWaT.csv')
df.columns = df.columns.str.strip()
print('done - load data')

# Replace 'attack'/'normal' with 1/0 for labels if necessary
df['Normal/Attack'] = (df['Normal/Attack'] == 'Attack').astype(int)
print('done - label recode')

done - load data
done - label recode


In [3]:
# All available features from SWaT dataset (after removing Timestamp and label)
all_features = [
    'FIT101', 'LIT101', 'MV101', 'P101', 'P102',
    'AIT201', 'AIT202', 'AIT203', 'FIT201', 'MV201', 'P201', 'P202', 'P203', 'P204', 'P205', 'P206',
    'DPIT301', 'FIT301', 'LIT301', 'MV301', 'MV302', 'MV303', 'MV304', 'P301', 'P302',
    'AIT401', 'AIT402', 'FIT401', 'LIT401', 'P401', 'P402', 'P403', 'P404', 'UV401',
    'AIT501', 'AIT502', 'AIT503', 'AIT504', 'FIT501', 'FIT502', 'FIT503', 'FIT504', 'P501', 'P502', 'PIT501', 'PIT502', 'PIT503',
    'FIT601', 'P601', 'P602', 'P603'
]

# Select key sensors for advanced feature engineering (edit this list based on your choice)
key_sensors = [
    'FIT101', 'LIT101', 'MV101', 'P101', 'P102',
    'AIT201', 'AIT202', 'FIT201', 'MV201', 'P201', 'P203',
    'DPIT301', 'FIT301', 'LIT301', 'MV301', 'P301',
    'AIT401', 'AIT402', 'FIT401', 'LIT401', 'P401',
    'AIT501', 'AIT502', 'FIT501', 'FIT502', 'P501', 'PIT501',
    'FIT601', 'P601', 'P602'
]

print('done - sensor list')

done - sensor list


In [4]:
cols_to_remove = ['Timestamp', 'Normal/Attack']
X_raw = df.drop(cols_to_remove, axis=1)
y = df['Normal/Attack']

scaler = MinMaxScaler()
X_raw = pd.DataFrame(scaler.fit_transform(X_raw), columns=X_raw.columns)
print('done - raw feature scaling')

done - raw feature scaling


In [5]:
engineered = []

for col in key_sensors:
    temp = pd.DataFrame({
        f'{col}_mean': X_raw[col].rolling(window=10, min_periods=1).mean(),
        f'{col}_diff': X_raw[col].diff().fillna(0),
    })
    engineered.append(temp)

X_eng = pd.concat([X_raw] + engineered, axis=1)
print('done - engineering')

done - engineering


In [6]:
X_train, X_test, y_train, y_test = train_test_split(X_eng, y, test_size=0.2, stratify=y, random_state=42)
print('done - train/test split')

done - train/test split


In [7]:
X_train_np = X_train.values
X_test_np = X_test.values
y_train_np = y_train.values
y_test_np = y_test.values

def pso_fitness(mask):
    fitness = []
    for particle in mask:
        cols = [i for i, bit in enumerate(particle) if bit]
        if not cols:
            fitness.append(0)
            continue
        clf = RandomForestClassifier(n_estimators=30, random_state=42)
        clf.fit(X_train_np[:, cols], y_train_np)
        preds = clf.predict(X_test_np[:, cols])
        fitness.append(f1_score(y_test_np, preds, average='weighted'))
    return -1 * np.array(fitness)

n_features = X_train_np.shape[1]
options = {'c1': 1.5, 'c2': 1.5, 'w': 0.8, 'k': 2, 'p': 2}
pso = BinaryPSO(n_particles=10, dimensions=n_features, options=options)
cost, pos = pso.optimize(pso_fitness, iters=10)
selected_features = [col for col, use in zip(X_train.columns, pos) if use]

print("Selected features:", selected_features)
print("Number of features selected:", len(selected_features))
print('done - PSO feature selection')

2025-11-18 12:26:05,049 - pyswarms.discrete.binary - INFO - Optimize for 10 iters with {'c1': 1.5, 'c2': 1.5, 'w': 0.8, 'k': 2, 'p': 2}
pyswarms.discrete.binary: 100%|██████████|10/10, best_cost=-1
2025-11-18 13:17:52,156 - pyswarms.discrete.binary - INFO - Optimization finished | best cost: -0.9998888735188592, best pos: [0 0 1 0 0 0 1 0 1 0 1 1 0 1 0 1 1 0 1 0 1 0 1 1 0 0 0 1 1 0 1 1 0 0 0 0 1
 0 1 1 0 1 1 1 1 1 0 1 0 0 1 1 1 1 0 0 0 1 1 0 1 1 1 1 0 1 1 1 0 1 0 0 0 0
 1 0 0 1 0 1 0 0 0 0 0 1 1 0 1 1 1 0 0 1 1 0 0 0 0 1 1 1 1 0 1 1 1 1 0 0 0]


Selected features: ['MV101', 'AIT202', 'FIT201', 'P201', 'P202', 'P204', 'P206', 'DPIT301', 'LIT301', 'MV302', 'MV304', 'P301', 'FIT401', 'LIT401', 'P402', 'P403', 'AIT503', 'FIT501', 'FIT502', 'FIT504', 'P501', 'P502', 'PIT501', 'PIT502', 'FIT601', 'P603', 'FIT101_mean', 'FIT101_diff', 'LIT101_mean', 'P101_mean', 'P101_diff', 'P102_diff', 'AIT201_mean', 'AIT201_diff', 'AIT202_mean', 'FIT201_mean', 'FIT201_diff', 'MV201_mean', 'P201_mean', 'DPIT301_diff', 'LIT301_mean', 'MV301_mean', 'AIT402_mean', 'AIT402_diff', 'FIT401_diff', 'LIT401_mean', 'LIT401_diff', 'AIT501_mean', 'AIT501_diff', 'FIT502_mean', 'FIT502_diff', 'P501_mean', 'P501_diff', 'PIT501_diff', 'FIT601_mean', 'FIT601_diff', 'P601_mean']
Number of features selected: 57
done - PSO feature selection
