In [2]:
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

X = pd.read_csv("secom.data", sep=" ", header=None)
y = pd.read_csv("secom_labels.data", sep=" ", header=None)[0]
variances = X.var(axis=0, skipna=True)
non_constant_cols = variances[variances > 0].index
X_nc = X[non_constant_cols].copy()
imputer = SimpleImputer(strategy="median")
X_imputed = imputer.fit_transform(X_nc)

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_imputed)

print("Scaled shape:", X_scaled.shape)
print("Mean (approx 0) of first 5 features:", X_scaled[:, :5].mean(axis=0))
print("Std  (approx 1) of first 5 features:", X_scaled[:, :5].std(axis=0))



Scaled shape: (1567, 474)
Mean (approx 0) of first 5 features: [ 1.62332035e-15 -5.07372631e-15 -5.71336214e-15 -1.26963603e-16
  0.00000000e+00]
Std  (approx 1) of first 5 features: [1. 1. 1. 1. 1.]


step 2

In [3]:
import time
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFE
from sklearn.feature_selection import SelectKBest, mutual_info_classif


k = 20
start = time.time()
mi_selector = SelectKBest(mutual_info_classif, k=k)
X_mi = mi_selector.fit_transform(X_scaled, y)
end = time.time()
execution_time = end - start
selected_mi_indices = mi_selector.get_support(indices=True)
print("Top 20 MI Feature Indices:", selected_mi_indices)
print(f"MI execution time: {execution_time:.2f} seconds")


rf = RandomForestClassifier(
    n_estimators=200,
    random_state=42,
    n_jobs=-1
)
k = 20
start = time.time()
rfe = RFE(estimator=rf, n_features_to_select=k, step=1)
rfe.fit(X_scaled, y)
end = time.time()
execution_time = end - start
selected_rfe_indices = np.where(rfe.support_ == True)[0]
print("RFE selected feature indices:", selected_rfe_indices)
print(f"RFE execution time: {execution_time:.2f} seconds")


Top 20 MI Feature Indices: [ 38  39  60 115 121 123 127 233 243 276 329 337 348 387 425 454 455 457
 459 461]
MI execution time: 4.50 seconds


KeyboardInterrupt: 

step 3

In [4]:
k= 20
U, S, VT = np.linalg.svd(X_scaled, full_matrices=False)
V = VT[:k, :]
scores = np.zeros(V.shape[1])
for j in range(V.shape[1]):
    scores[j] = np.sum((S[:k]**2) * (V[:, j]**2))
ranked_indices = np.argsort(scores)[::-1]
top_20_indices = ranked_indices[:20]
print("Top 20 SVD-based feature indices:", top_20_indices)

Top 20 SVD-based feature indices: [317 413 214 315 212 411 140 239 342 412 316 213 281 242 143 284 283 184
 178 389]


step 4