# Practice Activity 5.2: Implementing forward selection

[Coursera practice activity: Implementing forward selection](https://www.coursera.org/learn/ai-and-machine-learning-algorithms-and-techniques/supplement/PuJ5z/practice-activity-implementing-forward-selection)

In [6]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score

In [7]:
# Student records: hours studied, previous exam score, and a pass/fail label.
# The rows are read from student_data.csv (path is relative to the working directory).
data = pd.read_csv('student_data.csv')

df = pd.DataFrame(data)

# Predictor columns (X) and the column to be predicted (y)
X = df.loc[:, ['StudyHours', 'PrevExamScore']]
y = df.loc[:, 'Pass']
print(df.head())

   StudyHours  PrevExamScore  Pass
0           5             83     0
1           5             74     0
2           9             72     1
3           5             76     0
4           6             69     0


## Forward selection

In [8]:
def forward_selection(X, y, test_size=0.2, random_state=42):
    """Greedily select the columns of ``X`` that raise hold-out R-squared.

    Starting from an empty model, each round fits one ``LinearRegression`` per
    remaining column (added to the columns already selected), scores it with
    ``r2_score`` on a held-out split, and keeps the best candidate if it beats
    the current score. The search stops at the first round that brings no
    improvement, or when no columns remain.

    Parameters
    ----------
    X : pandas.DataFrame
        Candidate features, one per column.
    y : array-like
        Target values aligned with the rows of ``X``.
    test_size : float or int, default 0.2
        Forwarded to ``train_test_split``.
    random_state : int or None, default 42
        Forwarded to ``train_test_split``.

    Returns
    -------
    list
        Selected column labels in the order they were added. Empty when ``X``
        has no columns or when no single feature scores above 0.0.

    Notes
    -----
    The held-out rows steer the selection itself, so an R-squared computed
    afterwards on an identical split is an optimistic estimate.
    """
    # Keep the column order (dropping repeated labels) so that candidates are
    # always evaluated in the same sequence.
    remaining_features = list(dict.fromkeys(X.columns))
    selected_features = []
    # A feature must beat an R-squared of 0.0 to be admitted in the first round.
    current_score = 0.0

    if not remaining_features:
        return selected_features

    # The row partition does not depend on which columns are tested, so split
    # once and reuse it; every candidate is then scored on the same rows.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    while remaining_features:
        scores_with_candidates = []

        # Score each remaining feature when added to the current selection
        for feature in remaining_features:
            features_to_test = selected_features + [feature]

            model = LinearRegression()
            model.fit(X_train[features_to_test], y_train)

            y_pred = model.predict(X_test[features_to_test])
            score = r2_score(y_test, y_pred)

            scores_with_candidates.append((score, feature))

        # Rank on the score alone: labels never need to be comparable, and a
        # tie goes to the column that appears first in X.
        best_score, best_feature = max(
            scores_with_candidates, key=lambda candidate: candidate[0]
        )

        # Only keep the candidate if it strictly improves on the current score
        if current_score < best_score:
            remaining_features.remove(best_feature)
            selected_features.append(best_feature)
            current_score = best_score
        else:
            break

    return selected_features

# Apply forward selection to the feature table and report the chosen columns
best_features = forward_selection(X=X, y=y)
print("Selected features using Forward Selection:", best_features)

Selected features using Forward Selection: ['StudyHours', 'PrevExamScore']


In [9]:
# Refit a linear model on the selected columns only, holding out 20% of the
# rows (fixed seed) to measure R-squared.
X_train, X_test, y_train, y_test = train_test_split(
    X[best_features],
    y,
    test_size=0.2,
    random_state=42,
)
model = LinearRegression().fit(X_train, y_train)
y_pred = model.predict(X_test)
final_r2_score = r2_score(y_test, y_pred)

print(f'Final R-squared score with selected features: {final_r2_score}')

Final R-squared score with selected features: 0.6070993583296804
