In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectKBest, mutual_info_classif, f_classif, RFE, VarianceThreshold
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score
from pyswarms.discrete import BinaryPSO
import time

print('done - imports')

# Load and preprocess
df = pd.read_csv('SWaT.csv')
df.columns = df.columns.str.strip()
df['Normal/Attack'] = (df['Normal/Attack'] == 'Attack').astype(int)
X_raw = df.drop(['Timestamp', 'Normal/Attack'], axis=1)
y = df['Normal/Attack']

scaler = MinMaxScaler()
X_raw = pd.DataFrame(scaler.fit_transform(X_raw), columns=X_raw.columns)
X_train, X_test, y_train, y_test = train_test_split(X_raw, y, test_size=0.2, stratify=y, random_state=42)
print('done - preprocessing and split')

# PSO options
options = {'c1': 1.5, 'c2': 1.5, 'w': 0.8, 'k': 2, 'p': 2}
results = {}

# Helper function for PSO
def run_pso_on_features(X_tr, X_te, y_tr, y_te, method_name):
    start = time.time()
    X_tr_np = X_tr.values if isinstance(X_tr, pd.DataFrame) else X_tr
    X_te_np = X_te.values if isinstance(X_te, pd.DataFrame) else X_te
    
    def fitness(mask):
        scores = []
        for particle in mask:
            cols = [i for i, bit in enumerate(particle) if bit]
            if not cols:
                scores.append(0)
                continue
            clf = RandomForestClassifier(n_estimators=30, random_state=42)
            clf.fit(X_tr_np[:, cols], y_tr)
            preds = clf.predict(X_te_np[:, cols])
            scores.append(f1_score(y_te, preds, average='weighted'))
        return -1 * np.array(scores)
    
    n_feat = X_tr_np.shape[1]
    pso = BinaryPSO(n_particles=10, dimensions=n_feat, options=options)
    cost, pos = pso.optimize(fitness, iters=10)
    
    selected_indices = [i for i, bit in enumerate(pos) if bit]
    clf = RandomForestClassifier(n_estimators=30, random_state=42)
    clf.fit(X_tr_np[:, selected_indices], y_tr)
    preds = clf.predict(X_te_np[:, selected_indices])
    
    return {
        'num_features': len(selected_indices),
        'selected_indices': selected_indices,
        'f1': f1_score(y_te, preds, average='weighted'),
        'accuracy': accuracy_score(y_te, preds),
        'precision': precision_score(y_te, preds, average='weighted'),
        'recall': recall_score(y_te, preds, average='weighted'),
        'time': time.time() - start
    }

done - imports
done - preprocessing and split


In [2]:
print('Running: Correlation-based feature selection ...')

# Calculate absolute correlations with target
corr_matrix = X_train.corrwith(y_train).abs()
print(' - Calculating feature correlations...')

# Select top 10 correlated features
top_10_corr = corr_matrix.nlargest(10).index
print(' - Selecting top 10 features by correlation:', list(top_10_corr))

# Subset data for selected features
X_train_corr = X_train[top_10_corr]
X_test_corr = X_test[top_10_corr]

print(' - Training Random Forest on selected features ...')
clf = RandomForestClassifier(n_estimators=100, random_state=42)
start = time.time()
clf.fit(X_train_corr, y_train)
preds = clf.predict(X_test_corr)
print(' - Done')

f1 = f1_score(y_test, preds, average='weighted')
accuracy = accuracy_score(y_test, preds)
precision = precision_score(y_test, preds, average='weighted')
recall = recall_score(y_test, preds, average='weighted')
elapsed = time.time() - start

print(f"done - CorrelationOnly: 10 features, F1={f1:.4f}, accuracy={accuracy:.4f}, precision={precision:.4f}, recall={recall:.4f}, time={elapsed:.2f}s")

# If you want this as a dict for comparison:
correlation_results = {
    'num_features': len(top_10_corr),
    'selected_features': list(top_10_corr),
    'f1': f1,
    'accuracy': accuracy,
    'precision': precision,
    'recall': recall,
    'time': elapsed
}

Running: Correlation-based feature selection ...


  c /= stddev[:, None]
  c /= stddev[None, :]


 - Calculating feature correlations...
 - Selecting top 10 features by correlation: ['FIT401', 'FIT504', 'FIT503', 'UV401', 'P501', 'PIT501', 'FIT501', 'PIT503', 'FIT502', 'P402']
 - Training Random Forest on selected features ...
 - Done
done - CorrelationOnly: 10 features, F1=0.9806, accuracy=0.9813, precision=0.9815, recall=0.9813, time=48.67s
