# Feature Selection
After analysing feature importance, I suspect that feature engineering has introduced substantial collinearity into the feature space. Many of the top features used in modelling are related to the area of housing components, so I want to perform feature selection to address this. Additionally, many features provide little relevant information to the model. For example, some binary features have only a very small percentage of samples in the minority class.

In [7]:
from sklearn.model_selection import cross_val_score
from lightgbm import LGBMRegressor
import pandas as pd
import numpy as np
import lightgbm as lgb
import warnings

# Originally intended to suppress LightGBM warnings.
# NOTE: called without a category/module filter, this ignores *every* Python
# warning raised after this point, not only those coming from LightGBM.
warnings.filterwarnings("ignore")

In [None]:
# Greedy forward feature selection.
#
# Starting from an empty feature set, each pass tries adding every remaining
# column, scores the candidate set with 5-fold cross-validation, and keeps the
# single column that yields the best score. The search stops once the best
# candidate no longer improves the score by at least `improvement_threshold`.
# Each pass fits 5 * len(remaining_features) models, so the loop is expensive.
train = pd.read_csv('train_processed.csv')
test = pd.read_csv('test_processed.csv')

# The last column of the processed training file is used as the target;
# every other column is treated as a candidate feature.
features = train.iloc[:, :-1]
target = train.iloc[:, -1]

# The model is fit on log1p(target), so the CV scores below are on that scale.
target_transformed = np.log1p(target)

X = features.copy()
y = target_transformed
remaining_features = list(X.columns)
selected_features = []
scores = []  # best CV score after each accepted feature, in selection order

prev_best_score = -np.inf
improvement_threshold = 1e-4  # Minimum improvement required to continue

while remaining_features:
    best_score = -np.inf
    best_feature = None

    for feature in remaining_features:
        trial_features = selected_features + [feature]
        X_subset = X[trial_features]

        model = LGBMRegressor(verbose=-1)  # suppress LightGBM output here
        # 'neg_mean_squared_error' is negated MSE, so a larger (closer to
        # zero) value is better and can be maximised directly.
        score = cross_val_score(model, X_subset, y, cv=5, scoring='neg_mean_squared_error').mean()
        print(f'Evaluating {feature}: {score}')

        # A NaN score never compares greater, so such candidates are skipped.
        if score > best_score:
            best_score = score
            best_feature = feature

    # Check for improvement
    if best_score - prev_best_score < improvement_threshold:
        print(f"Stopping early: no significant improvement (Δ={best_score - prev_best_score:.6f})")
        break

    # If no candidate produced a comparable score (e.g. every CV score was
    # NaN), best_feature is still None. On the first pass the improvement
    # check above evaluates NaN < threshold, which is False, so without this
    # guard None would be appended and remaining_features.remove(None) would
    # fail with an unhelpful "x not in list" error.
    if best_feature is None:
        raise ValueError(
            "Feature selection failed: no candidate feature produced a valid "
            "cross-validation score (all scores were NaN or -inf)."
        )

    selected_features.append(best_feature)
    remaining_features.remove(best_feature)
    scores.append(best_score)
    prev_best_score = best_score

    print(f"Added: {best_feature}, CV Score: {best_score:.5f}")

Evaluating LotFrontage: -0.1251629351387483
Evaluating LotArea: -0.12684128847851067
Evaluating OverallQual: -0.054123058230548436
Evaluating OverallCond: -0.13480486703969669
Evaluating YearBuilt: -0.09540177764344715
Evaluating YearRemodAdd: -0.11025886579345379
Evaluating MasVnrArea: -0.13481212008955784
Evaluating ExterQual: -0.08905705957976691
Evaluating ExterCond: -0.15298702197904226
Evaluating BsmtQual: -0.0878558049450816
Evaluating BsmtCond: -0.14658324555638833
Evaluating BsmtFinSF1: -0.13108241582920635
Evaluating BsmtFinSF2: -0.1597989693515377
Evaluating BsmtUnfSF: -0.15354107718389856
Evaluating TotalBsmtSF: -0.0949560185159581
Evaluating HeatingQC: -0.12285129723982173
Evaluating 1stFlrSF: -0.10510686261337635
Evaluating 2ndFlrSF: -0.12498882809692761
Evaluating LowQualFinSF: -0.15952017628492912
Evaluating GrLivArea: -0.08185156015739996
Evaluating BsmtFullBath: -0.15063765845312568
Evaluating BsmtHalfBath: -0.15984805290767523
Evaluating FullBath: -0.1021195550444623