In [1]:
import numpy as np
import pandas as pd
from sklearn.datasets import make_regression
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score


In [2]:
# Generate synthetic dataset
# 100 samples x 10 features, of which 4 are informative; the fixed
# random_state makes the draw reproducible across runs.
X, y = make_regression(
    n_samples=100,
    n_features=10,
    n_informative=4,
    noise=0.1,
    random_state=42,
)
# Wrap the raw array in a DataFrame so every column has a readable label
# (feature_0 ... feature_9) that the selection routine can report.
X = pd.DataFrame(
    data=X,
    columns=["feature_" + str(idx) for idx in range(X.shape[1])],
)


In [3]:
# Forward Selection function
def forward_selection(X, y, model, scoring='r2', max_features=None, verbose=True,
                      cv=5, tol=0.0):
    """Greedy forward feature selection driven by a cross-validated score.

    Starting from an empty set, each round tries every remaining column
    together with the columns already chosen, scores each candidate set with
    ``cross_val_score`` and keeps the single best addition. The search stops
    when the best addition does not beat the current score by more than
    ``tol``, when ``max_features`` columns have been chosen, or when no
    columns remain.

    Parameters
    ----------
    X : pandas.DataFrame
        Candidate features; columns are selected by label.
    y : array-like
        Target passed unchanged to ``cross_val_score``.
    model : estimator
        Estimator passed unchanged to ``cross_val_score``.
    scoring : str or callable, default 'r2'
        Forwarded to ``cross_val_score``. Larger values must mean "better",
        because candidates are compared with ``>``.
    max_features : int or None, default None
        Upper bound on the number of selected columns; ``None`` means all
        columns of ``X``.
    verbose : bool, default True
        Print one line per accepted feature and a line when the search stops
        for lack of improvement.
    cv : default 5
        Forwarded to ``cross_val_score`` as its ``cv`` argument.
    tol : float, default 0.0
        Minimum gain in mean score required to accept another feature. The
        default accepts any strict improvement; a small positive value avoids
        adding features whose gain is only numerical noise.

    Returns
    -------
    (list, float)
        The selected column labels in the order they were added, and the mean
        cross-validated score of that set (``-inf`` if nothing was selected).
    """
    if max_features is None:
        max_features = X.shape[1]

    selected_features = []
    remaining_features = list(X.columns)
    best_score = -np.inf

    while remaining_features and len(selected_features) < max_features:
        round_best_score = -np.inf
        round_best_feature = None
        for feature in remaining_features:
            # Try current selected features + one new feature
            candidate_features = selected_features + [feature]
            # Evaluate model with cross-validation
            score = np.mean(
                cross_val_score(model, X[candidate_features], y, cv=cv, scoring=scoring)
            )
            # Strict '>' keeps the first column on ties and never lets a NaN
            # score win (any comparison with NaN is False). Column labels are
            # never compared, so they do not need to be orderable.
            if score > round_best_score:
                round_best_score = score
                round_best_feature = feature

        # Accept the best candidate only if it improves on the current set by
        # more than tol; with best_score == -inf the first valid score passes.
        if round_best_feature is not None and round_best_score > best_score + tol:
            selected_features.append(round_best_feature)
            remaining_features.remove(round_best_feature)
            best_score = round_best_score
            if verbose:
                print(f"Added {round_best_feature}, Score: {best_score:.4f}")
        else:
            # Stop if no improvement
            if verbose:
                print("No further improvement. Stopping.")
            break

    return selected_features, best_score


In [4]:
# Initialize model
# Linear regression estimator with default settings. The same object is passed
# to forward_selection() for scoring and is fitted on the chosen columns in the
# final cell.
model = LinearRegression()

In [5]:
# Run forward selection
# Greedy search over the columns of X, scored by R²; progress is printed
# because verbose is left at its default.
selected_features, final_score = forward_selection(
    X,
    y,
    model,
    scoring="r2",
)

Added feature_7, Score: 0.4970
Added feature_3, Score: 0.9654
Added feature_1, Score: 0.9871
Added feature_5, Score: 1.0000
Added feature_9, Score: 1.0000
No further improvement. Stopping.


In [6]:
# Report the outcome of the search: the chosen column labels in the order they
# were added, and the mean cross-validated R² returned by forward_selection()
# for that set.
# NOTE(review): this score comes from the same data used to pick the features,
# so it is presumably optimistic — confirm on held-out data if it matters.
print("\nFinal selected features:", selected_features)
print("Final R² score:", final_score)



Final selected features: ['feature_7', 'feature_3', 'feature_1', 'feature_5', 'feature_9']
Final R² score: 0.9999989530922127


In [7]:
# Train final model with selected features
# Keep only the chosen columns (all rows), in the order they were selected, so
# the printed coefficients line up with selected_features.
X_final = X.loc[:, selected_features]
model.fit(X_final, y)
print("Model coefficients:", model.coef_)

Model coefficients: [7.06440999e+01 6.36498603e+01 1.67620845e+01 1.04501231e+01
 1.27385973e-02]
