In [1]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

from reservoirpy import ESN

In [2]:
# Load and basic preprocessing (same as your correlation notebook)
df = pd.read_csv("SWaT.csv")
df.columns = df.columns.str.strip()

# Encode label: 1 = Attack, 0 = Normal
df["Normal/Attack"] = (df["Normal/Attack"] == "Attack").astype(int)

X_raw = df.drop(["Timestamp", "Normal/Attack"], axis=1)
y = df["Normal/Attack"]

# Scale features
scaler = MinMaxScaler()
X_scaled = pd.DataFrame(scaler.fit_transform(X_raw), columns=X_raw.columns)

# Stratified split into train/test (80/20)
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y, test_size=0.2, stratify=y, random_state=42
)

print("done - preprocessing and split")
print(X_train.shape, X_test.shape)

done - preprocessing and split
(359935, 51) (89984, 51)


In [3]:
print("Running: Correlation-based feature selection ...")

corr_scores = X_train.corrwith(y_train).abs()
top_10_corr = corr_scores.nlargest(10).index
print("Top 10 correlated features:", list(top_10_corr))

X_train_corr = X_train[top_10_corr]
X_test_corr = X_test[top_10_corr]

# Clean: remove constant and duplicate columns
X_train_clean = X_train_corr.loc[:, X_train_corr.std() > 0]
X_test_clean = X_test_corr[X_train_clean.columns]

X_train_clean = X_train_clean.T.drop_duplicates().T
X_test_clean = X_test_clean[X_train_clean.columns]

print("Final correlation feature count:", X_train_clean.shape[1])

Running: Correlation-based feature selection ...


  c /= stddev[:, None]
  c /= stddev[None, :]


Top 10 correlated features: ['FIT401', 'FIT504', 'FIT503', 'UV401', 'P501', 'PIT501', 'FIT501', 'PIT503', 'FIT502', 'P402']
Final correlation feature count: 10


In [4]:
# === Final ESN hyperparameters (change if your best grid result differs) ===
final_units = 40
final_sr = 0.29
final_input_scaling = 1.0

Xtr = X_train_clean.values
Xte = X_test_clean.values
ytr = y_train.values.reshape(-1, 1)
yte = y_test.values

esn_final = ESN(
    units=final_units,
    sr=final_sr,
    input_scaling=final_input_scaling,
)

esn_final = esn_final.fit(Xtr, ytr)

ytr_pred_raw = esn_final.run(Xtr)
yte_pred_raw = esn_final.run(Xte)

ytr_pred = (ytr_pred_raw > 0.5).astype(int).ravel()
yte_pred = (yte_pred_raw > 0.5).astype(int).ravel()

In [5]:
def compute_metrics(y_true, y_pred, split_name):
    acc = accuracy_score(y_true, y_pred)
    prec = precision_score(y_true, y_pred, average='weighted', zero_division=0)
    rec = recall_score(y_true, y_pred, average='weighted', zero_division=0)
    f1 = f1_score(y_true, y_pred, average='weighted', zero_division=0)
    cm = confusion_matrix(y_true, y_pred)
    print(f"{split_name} - Acc: {acc:.4f}, Prec: {prec:.4f}, Rec: {rec:.4f}, F1: {f1:.4f}")
    print(f"{split_name} confusion matrix:\n{cm}\n")
    return {
        "split": split_name,
        "accuracy": acc,
        "precision": prec,
        "recall": rec,
        "f1": f1,
        "tn": cm[0,0],
        "fp": cm[0,1],
        "fn": cm[1,0],
        "tp": cm[1,1],
    }

metrics_corr = []
metrics_corr.append(compute_metrics(y_train.values, ytr_pred, "train_corr"))
metrics_corr.append(compute_metrics(yte, yte_pred, "test_corr"))

metrics_corr_df = pd.DataFrame(metrics_corr)
metrics_corr_df.to_csv("esn_corr_final_metrics.csv", index=False)
metrics_corr_df

train_corr - Acc: 0.9541, Prec: 0.9560, Rec: 0.9541, F1: 0.9495
train_corr confusion matrix:
[[316090    178]
 [ 16341  27326]]

test_corr - Acc: 0.9542, Prec: 0.9561, Rec: 0.9542, F1: 0.9496
test_corr confusion matrix:
[[79020    47]
 [ 4073  6844]]



Unnamed: 0,split,accuracy,precision,recall,f1,tn,fp,fn,tp
0,train_corr,0.954106,0.956022,0.954106,0.949466,316090,178,16341,27326
1,test_corr,0.954214,0.956102,0.954214,0.949607,79020,47,4073,6844
