In [4]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectKBest, mutual_info_classif, f_classif, RFE, VarianceThreshold
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score
from pyswarms.discrete import BinaryPSO
import time
from sklearn.feature_selection import chi2

print('done - imports')

# Load and preprocess
df = pd.read_csv('SWaT.csv')
df.columns = df.columns.str.strip()
df['Normal/Attack'] = (df['Normal/Attack'] == 'Attack').astype(int)
X_raw = df.drop(['Timestamp', 'Normal/Attack'], axis=1)
y = df['Normal/Attack']

scaler = MinMaxScaler()
X_raw = pd.DataFrame(scaler.fit_transform(X_raw), columns=X_raw.columns)
X_train, X_test, y_train, y_test = train_test_split(X_raw, y, test_size=0.2, stratify=y, random_state=42)
print('done - preprocessing and split')

# PSO options
options = {'c1': 1.5, 'c2': 1.5, 'w': 0.8, 'k': 2, 'p': 2}
results = {}

# Helper function for PSO
def run_pso_on_features(X_tr, X_te, y_tr, y_te, method_name):
    start = time.time()
    X_tr_np = X_tr.values if isinstance(X_tr, pd.DataFrame) else X_tr
    X_te_np = X_te.values if isinstance(X_te, pd.DataFrame) else X_te
    
    def fitness(mask):
        scores = []
        for particle in mask:
            cols = [i for i, bit in enumerate(particle) if bit]
            if not cols:
                scores.append(0)
                continue
            clf = RandomForestClassifier(n_estimators=30, random_state=42)
            clf.fit(X_tr_np[:, cols], y_tr)
            preds = clf.predict(X_te_np[:, cols])
            scores.append(f1_score(y_te, preds, average='weighted'))
        return -1 * np.array(scores)
    
    n_feat = X_tr_np.shape[1]
    pso = BinaryPSO(n_particles=10, dimensions=n_feat, options=options)
    cost, pos = pso.optimize(fitness, iters=10)
    
    selected_indices = [i for i, bit in enumerate(pos) if bit]
    clf = RandomForestClassifier(n_estimators=30, random_state=42)
    clf.fit(X_tr_np[:, selected_indices], y_tr)
    preds = clf.predict(X_te_np[:, selected_indices])
    
    return {
        'num_features': len(selected_indices),
        'selected_indices': selected_indices,
        'f1': f1_score(y_te, preds, average='weighted'),
        'accuracy': accuracy_score(y_te, preds),
        'precision': precision_score(y_te, preds, average='weighted'),
        'recall': recall_score(y_te, preds, average='weighted'),
        'time': time.time() - start
    }

done - imports
done - preprocessing and split


In [5]:
# 4. Chi-Square + PSO
print('Running: Chi-Square + PSO...')
X_train_pos = X_train - X_train.min() + 1e-10
X_test_pos = X_test - X_test.min() + 1e-10
chi = SelectKBest(chi2, k=30)
X_train_chi = chi.fit_transform(X_train_pos, y_train)
X_test_chi = chi.transform(X_test_pos)
results['ChiSquare_PSO'] = run_pso_on_features(X_train_chi, X_test_chi, y_train, y_test, 'Chi2+PSO')
print(f"done - Chi2+PSO: {results['ChiSquare_PSO']['num_features']} features, F1={results['ChiSquare_PSO']['f1']:.4f}")

Running: Chi-Square + PSO...


2025-11-19 12:28:00,205 - pyswarms.discrete.binary - INFO - Optimize for 10 iters with {'c1': 1.5, 'c2': 1.5, 'w': 0.8, 'k': 2, 'p': 2}
pyswarms.discrete.binary: 100%|██████████|10/10, best_cost=-0.992
2025-11-19 12:58:49,947 - pyswarms.discrete.binary - INFO - Optimization finished | best cost: -0.9919066988235551, best pos: [1 1 1 1 1 1 1 0 0 0 1 1 0 1 1 1 0 0 1 1 1 1 0 1 1 1 0 0 0 1]


done - Chi2+PSO: 20 features, F1=0.9919


In [6]:
selected_indices_chi = results['ChiSquare_PSO']['selected_indices']
selected_feature_names_chi = X_train.columns[chi.get_support()][selected_indices_chi].tolist()
print("Selected features (Chi2+PSO):")
print(selected_feature_names_chi)
print(f"Total: {len(selected_feature_names_chi)} features")

Selected features (Chi2+PSO):
['FIT101', 'LIT101', 'P101', 'P102', 'AIT203', 'FIT201', 'MV201', 'DPIT301', 'FIT301', 'MV304', 'P302', 'AIT402', 'P402', 'UV401', 'AIT501', 'AIT502', 'FIT502', 'FIT503', 'FIT504', 'PIT503']
Total: 20 features
