In [1]:
import pandas as pd

from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.metrics import roc_auc_score, r2_score
from sklearn.model_selection import train_test_split

from sklearn.feature_selection import SequentialFeatureSelector as SFS

In [None]:
# Load the pre-cleaned dataset; later cells expect it to contain a 'target' column.
data = pd.read_csv('../precleaned-datasets/dataset_2.csv')

# Show the dimensions and a short preview instead of rendering the whole frame.
print(data.shape)
data.head()

In [None]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    data.drop(columns=['target']),  # every column except the label
    data['target'],                 # the label column
    test_size=0.2,
    random_state=42,
)

# Report the dimensions of each of the four pieces.
print(*(part.shape for part in (X_train, X_test, y_train, y_test)))

In [None]:
# Number of feature columns currently in the training frame.
X_train.shape[1]

In [None]:
# Remove correlated features to reduce the feature space.
#
# The correlation matrix is computed from the training features only, so that
#   * rows of the held-out test split do not influence which columns are kept, and
#   * the 'target' column never takes part in the comparison. With the full frame,
#     a feature strongly correlated with the target could be discarded for that
#     reason, or 'target' itself could be flagged and make the drop() calls below
#     fail because X_train / X_test do not contain that column.
CORRELATION_THRESHOLD = 0.8

correlation_matrix = X_train.corr()
abs_correlation = correlation_matrix.abs()

# Row i is compared only with the columns that precede it, so for every pair above
# the threshold the earlier column is kept and the later one is flagged.
correlated_features = {
    column
    for position, column in enumerate(abs_correlation.columns)
    if (abs_correlation.iloc[position, :position] > CORRELATION_THRESHOLD).any()
}

print(f"Correlated features: {correlated_features}")

# drop() is given a list rather than the set itself.
columns_to_drop = list(correlated_features)
X_train = X_train.drop(columns=columns_to_drop)
X_test = X_test.drop(columns=columns_to_drop)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

In [None]:
# Number of feature columns currently in the training frame.
X_train.shape[1]

In [None]:
# Sequential feature selector wrapped around a random-forest classifier.
sfs = SFS(
    RandomForestClassifier(n_estimators=100, random_state=42),
    n_features_to_select=10,  # size of the final feature subset
    direction='backward',     # backward selection
    scoring='roc_auc',        # candidate subsets are compared by ROC AUC
    cv=3,                     # 3-fold cross-validation
)


In [None]:
# Run the selection on the training split only; the fitted selector is the cell's output.
sfs.fit(X=X_train, y=y_train)

In [None]:
# Ask the fitted selector for the names of the features it kept, then show them.
selected_feat = sfs.get_feature_names_out()
print(selected_feat)

In [None]:
# Reduce both splits to the columns chosen by the fitted selector.
X_train_selected, X_test_selected = (
    sfs.transform(split) for split in (X_train, X_test)
)

# Report the dimensions of the reduced feature matrices next to the label vectors.
print(*(part.shape for part in (X_train_selected, X_test_selected, y_train, y_test)))