# Practice Activity 5.4: Implementing feature selection techniques on a given dataset

[link](https://www.coursera.org/learn/ai-and-machine-learning-algorithms-and-techniques/supplement/C4fw0/practice-activity-implementing-feature-selection-techniques-on-a-given-dataset)

In [40]:
import pandas as pd
import statsmodels.api as sm
from sklearn.linear_model import LinearRegression, Lasso
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score

In [41]:
# Sample dataset: Study hours, previous exam scores, and pass/fail labels
# Load the data from the student_data.csv
data = pd.read_csv('student_data.csv')

df = pd.DataFrame(data)

# Features and target variable
X = df[['StudyHours', 'PrevExamScore']]
y = df['Pass']
print(df.head(5))

   StudyHours  PrevExamScore  Pass
0           5             83     0
1           5             74     0
2           9             72     1
3           5             76     0
4           6             69     0


In [42]:
import random
randonmaxboundery = len(df) - 1
print(f"randonmaxboundery: {randonmaxboundery}")

random_state = random.randint(0, randonmaxboundery)
print(f"random_state: {random_state}")

randonmaxboundery: 999
random_state: 500


## Backward elimination

In [43]:
# Add constant to the model
X = sm.add_constant(X)

# Fit the model using Ordinary Least Squares (OLS)
model = sm.OLS(y, X).fit()

# Display the model summary
print(model.summary())

# Remove feature with highest p-value if greater than 0.05
if model.pvalues['StudyHours'] > 0.05:
    X = X.drop(columns='StudyHours')
    model = sm.OLS(y, X).fit()

# Final model after backward elimination
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:                   Pass   R-squared:                       0.633
Model:                            OLS   Adj. R-squared:                  0.632
Method:                 Least Squares   F-statistic:                     859.9
Date:                Mon, 03 Mar 2025   Prob (F-statistic):          9.50e-218
Time:                        23:02:57   Log-Likelihood:                -184.36
No. Observations:                1000   AIC:                             374.7
Df Residuals:                     997   BIC:                             389.4
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                    coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------
const            -1.4083      0.103    -13.623

## Forward selection

In [44]:
def forward_selection(X, y):
    remaining_features = set(X.columns)
    selected_features = []
    current_score = 0.0
    
    while remaining_features:
        scores_with_candidates = []
        
        for feature in remaining_features:
            features_to_test = selected_features + [feature]
            X_train, X_test, y_train, y_test = train_test_split(X[features_to_test], y, test_size=0.2, random_state=random_state,shuffle=True)
            
            # Train the model
            model = LinearRegression()
            model.fit(X_train, y_train)
            y_pred = model.predict(X_test)
            score = r2_score(y_test, y_pred)
            
            scores_with_candidates.append((score, feature))
        
        # Select the feature with the highest score
        scores_with_candidates.sort(reverse=True)
        best_score, best_feature = scores_with_candidates[0]
        
        if current_score < best_score:
            remaining_features.remove(best_feature)
            selected_features.append(best_feature)
            current_score = best_score
        else:
            break
    
    return selected_features

best_features = forward_selection(X, y)
print(f"Selected features using Forward Selection: {best_features}")

Selected features using Forward Selection: ['StudyHours', 'PrevExamScore']


## Implementing LASSO

In [45]:
# Split data into 80% training and 20% testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=random_state,shuffle=True)

# Initialize the LASSO model with alpha (regularization parameter)
lasso_model = Lasso(alpha=0.1)

# Train the LASSO model
lasso_model.fit(X_train, y_train)

# Make predictions and evaluate the model
y_pred = lasso_model.predict(X_test)
r2 = r2_score(y_test, y_pred)
print(f'R-squared score: {r2}')

# Display the coefficients of the features
print(f'LASSO Coefficients: {lasso_model.coef_}')

R-squared score: 0.6370325001246487
LASSO Coefficients: [0.         0.18780164 0.00446642]
