# Practice Activity 5.1: Implementing backward elimination

[Coursera activity page: Implementing backward elimination](https://www.coursera.org/learn/ai-and-machine-learning-algorithms-and-techniques/supplement/TdDc3/practice-activity-implementing-backward-elimination)

In [3]:
import pandas as pd
import statsmodels.api as sm
from sklearn.model_selection import train_test_split

In [4]:
# Read the student records from student_data.csv in the working directory.
# Each row holds study hours, a previous exam score, and a pass/fail label.
data = pd.read_csv('student_data.csv')

df = pd.DataFrame(data)

# Split the frame into the predictor columns and the response column.
feature_columns = ['StudyHours', 'PrevExamScore']
target_column = 'Pass'
X = df[feature_columns]
y = df[target_column]

# Preview the first five rows as a sanity check on the load.
print(df.head())

   StudyHours  PrevExamScore  Pass
0           5             83     0
1           5             74     0
2           9             72     1
3           5             76     0
4           6             69     0


In [5]:
# Put an intercept column in front of the predictors so the regression
# can estimate a constant term alongside the feature coefficients.
X = sm.add_constant(X, prepend=True)

In [6]:
# Regress the pass/fail label on the design matrix with ordinary least squares.
ols_spec = sm.OLS(y, X)
model = ols_spec.fit()

# Print the fit report; its P>|t| column lists the p-value of every term.
fit_report = model.summary()
print(fit_report)

                            OLS Regression Results                            
Dep. Variable:                   Pass   R-squared:                       0.633
Model:                            OLS   Adj. R-squared:                  0.632
Method:                 Least Squares   F-statistic:                     859.9
Date:                Mon, 03 Mar 2025   Prob (F-statistic):          9.50e-218
Time:                        22:25:09   Log-Likelihood:                -184.36
No. Observations:                1000   AIC:                             374.7
Df Residuals:                     997   BIC:                             389.4
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                    coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------
const            -1.4083      0.103    -13.623

## Backward elimination

In [7]:
# Define a significance level: a feature stays in the model only while its
# p-value is at or below this threshold.
significance_level = 0.05

# Column name that sm.add_constant gives the intercept. The intercept is kept
# in every refit and is never considered for removal, so that elimination
# cannot silently force the regression through the origin.
intercept_column = 'const'

# Perform backward elimination: refit, drop the least significant feature,
# and repeat until every remaining feature is significant.
while True:
    # Fit the model on the columns that are still in X
    model = sm.OLS(y, X).fit()

    # Only real features are removal candidates (the intercept is excluded;
    # errors='ignore' keeps this working if X has no 'const' column).
    candidate_p_values = model.pvalues.drop(labels=intercept_column, errors='ignore')

    # Stop when no features are left to test; without this guard the loop
    # could drop every column and the next OLS fit would fail on an empty X.
    if candidate_p_values.empty:
        break

    # Get the highest p-value among the candidate features
    max_p_value = candidate_p_values.max()

    # Check if the highest p-value is greater than the significance level
    if max_p_value > significance_level:
        # Identify the feature with the highest p-value
        feature_to_remove = candidate_p_values.idxmax()
        print(f"Removing feature: {feature_to_remove} with p-value: {max_p_value}")

        # Drop the feature; the next iteration refits without it
        X = X.drop(columns=[feature_to_remove])
    else:
        # Every remaining feature is significant, so `model` is final
        break

# Display the final model summary (the last fit performed in the loop)
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:                   Pass   R-squared:                       0.633
Model:                            OLS   Adj. R-squared:                  0.632
Method:                 Least Squares   F-statistic:                     859.9
Date:                Mon, 03 Mar 2025   Prob (F-statistic):          9.50e-218
Time:                        22:25:09   Log-Likelihood:                -184.36
No. Observations:                1000   AIC:                             374.7
Df Residuals:                     997   BIC:                             389.4
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                    coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------
const            -1.4083      0.103    -13.623