In [1]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score
from sklearn.datasets import make_regression


In [2]:
# Synthetic regression problem: 100 rows, 10 numeric features, light Gaussian noise.
# The seed is fixed so that a fresh-kernel re-run reproduces the same data.
X, y = make_regression(
    n_samples=100,
    n_features=10,
    noise=0.1,
    random_state=42,
)
# Wrap the raw array in a DataFrame so every column carries a readable name
# (feature_0 ... feature_9); the selection routine works with column labels.
X = pd.DataFrame(X, columns=["feature_" + str(idx) for idx in range(X.shape[1])])


In [3]:
def backward_selection(X, y, model, scoring='r2', min_features=1, verbose=True, cv=5):
    """Greedy backward feature elimination driven by cross-validated score.

    Starting from all columns of ``X``, each round tries dropping every
    remaining feature in turn, scores the reduced feature set with
    ``cross_val_score`` and permanently drops the feature whose removal yields
    the highest mean score, provided that score is at least as good as the
    current best (ties favour the smaller feature set). The search stops when
    no removal meets that bar or when only ``min_features`` features remain.

    Parameters
    ----------
    X : DataFrame-like
        Feature table. Must expose ``.columns`` and support selecting a list
        of column labels with ``X[labels]``.
    y : array-like
        Target values, passed through to ``cross_val_score`` unchanged.
    model : estimator
        Estimator passed through to ``cross_val_score`` unchanged.
    scoring : str or callable, default 'r2'
        Scoring passed to ``cross_val_score``. The loop treats larger values
        as better.
    min_features : int, default 1
        Smallest number of features to keep. Values below 1 are treated as 1,
        because a feature set with no columns is never evaluated.
    verbose : bool, default True
        If true, print one line per removed feature and a final line when the
        search stops for lack of improvement.
    cv : int or cross-validation splitter, default 5
        Cross-validation strategy passed to ``cross_val_score``.

    Returns
    -------
    selected : list
        Labels of the retained columns, in their original order.
    best_score : float
        Mean cross-validated score of the retained feature set.
    """
    # Clamp the floor: with min_features < 1 the loop would otherwise reach a
    # single remaining feature and then try to score a frame with no columns.
    floor = max(1, min_features)

    selected = list(X.columns)
    best_score = np.mean(cross_val_score(model, X[selected], y, cv=cv, scoring=scoring))

    while len(selected) > floor:
        # Score every "drop exactly one feature" candidate for this round.
        candidates = []
        for feature in selected:
            trial_features = [f for f in selected if f != feature]
            score = np.mean(cross_val_score(model, X[trial_features], y, cv=cv, scoring=scoring))
            candidates.append((score, feature))

        # Only the best candidate is needed, so take the maximum instead of
        # sorting the whole list. Tuples compare by score first, then label,
        # which is the same element a descending sort would put first.
        top_score, worst_feature = max(candidates)

        if top_score >= best_score:
            selected.remove(worst_feature)
            best_score = top_score
            if verbose:
                print(f"Removed {worst_feature}, Score: {top_score:.4f}")
        else:
            if verbose:
                print("No further improvement. Stopping.")
            break

    return selected, best_score

In [4]:
# Run backward elimination with an ordinary least-squares model as the estimator.
model = LinearRegression()
selected_features, final_score = backward_selection(X=X, y=y, model=model)

# Report which columns survived and the mean cross-validated score they achieve.
print("\nSelected features:", selected_features)
print("Final R² score:", final_score)

No further improvement. Stopping.

Selected features: ['feature_0', 'feature_1', 'feature_2', 'feature_3', 'feature_4', 'feature_5', 'feature_6', 'feature_7', 'feature_8', 'feature_9']
Final R² score: 0.9999996208615279
