In [1]:
import pandas as pd
import statsmodels.api as sm
from sklearn.model_selection import train_test_split

In [2]:
# Sample dataset
data = {
    'StudyHours': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
    'PrevExamScore': [30, 40, 45, 50, 60, 65, 70, 75, 80, 85],
    'Pass': [0, 0, 0, 0, 0, 1, 1, 1, 1, 1]  # 0 = Fail, 1 = Pass
}

df = pd.DataFrame(data)

# Features and target variable
X = df[['StudyHours', 'PrevExamScore']]
y = df['Pass']

In [3]:
X

Unnamed: 0,StudyHours,PrevExamScore
0,1,30
1,2,40
2,3,45
3,4,50
4,5,60
5,6,65
6,7,70
7,8,75
8,9,80
9,10,85


In [4]:
y

0    0
1    0
2    0
3    0
4    0
5    1
6    1
7    1
8    1
9    1
Name: Pass, dtype: int64

In [5]:
# Add a constant to the model (for the intercept)
X = sm.add_constant(X)

In [6]:
X

Unnamed: 0,const,StudyHours,PrevExamScore
0,1.0,1,30
1,1.0,2,40
2,1.0,3,45
3,1.0,4,50
4,1.0,5,60
5,1.0,6,65
6,1.0,7,70
7,1.0,8,75
8,1.0,9,80
9,1.0,10,85


In [7]:
# Fit the model using Ordinary Least Squares (OLS) regression
model = sm.OLS(y, X).fit()

# Display the summary, including p-values for each feature
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:                   Pass   R-squared:                       0.758
Model:                            OLS   Adj. R-squared:                  0.688
Method:                 Least Squares   F-statistic:                     10.94
Date:                Thu, 17 Jul 2025   Prob (F-statistic):            0.00701
Time:                        19:44:57   Log-Likelihood:               -0.17258
No. Observations:                  10   AIC:                             6.345
Df Residuals:                       7   BIC:                             7.253
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                    coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------
const            -0.3333      1.464     -0.228

In [9]:
# Define a significance level
significance_level = 0.05

# Perform backward elimination
while True:
    # Fit the model
    model = sm.OLS(y, X).fit()
    # Get the highest p-value in the model
    max_p_value = model.pvalues.max()
    
    # Check if the highest p-value is greater than the significance level
    if max_p_value > significance_level:
        # Identify the feature with the highest p-value
        feature_to_remove = model.pvalues.idxmax()
        print(f"Removing feature: {feature_to_remove} with p-value: {max_p_value}")
        
        # Drop the feature
        X = X.drop(columns=[feature_to_remove])
    else:
        break

# Display the final model summary
print(model.summary())

Removing feature: PrevExamScore with p-value: 0.9999999999999956
Removing feature: const with p-value: 0.11419580126842216
                                 OLS Regression Results                                
Dep. Variable:                   Pass   R-squared (uncentered):                   0.831
Model:                            OLS   Adj. R-squared (uncentered):              0.812
Method:                 Least Squares   F-statistic:                              44.31
Date:                Thu, 17 Jul 2025   Prob (F-statistic):                    9.31e-05
Time:                        19:45:58   Log-Likelihood:                         -1.8294
No. Observations:                  10   AIC:                                      5.659
Df Residuals:                       9   BIC:                                      5.961
Df Model:                           1                                                  
Covariance Type:            nonrobust                                                

In [10]:
X

Unnamed: 0,StudyHours
0,1
1,2
2,3
3,4
4,5
5,6
6,7
7,8
8,9
9,10
