In [2]:
import statsmodels.api as sm
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, KFold
import matplotlib.pyplot as plt
from sklearn.linear_model import Lasso
from sklearn import preprocessing
bodyfat = pd.read_csv("/Users/gabri/Classes/MachineLearning2026/data/bodyfat.csv")
X = bodyfat.drop(columns=["BodyFat","Density"])
y = bodyfat["BodyFat"]
X_train, X_test, y_train, y_test = train_test_split(X,y,
test_size=0.2,
random_state = 10
)
kf = KFold(n_splits = 5, shuffle = True, random_state = 10)
cv_fold = np.zeros(len(y_train)).astype(int)
for i, (_, fold_indexes) in enumerate(kf.split(X_train)):
    cv_fold[fold_indexes] = int(i)

In [25]:
# Best subset selection
from itertools import combinations
import statsmodels.api as sm
from tqdm import tqdm

cols = X_train.columns.tolist()
bss_r2 = {}
max_r2_by_k = {}
best_subset_by_k = {}

for k in tqdm(range(1,14)):

    bss_r2[k] = {}
    max_r2_by_k[k] = 0
    best_subset_by_k[k] = None
    
    for subset in combinations(cols, k):

        X_sub = sm.add_constant(X_train[list(subset)])
        model = sm.OLS(y_train, X_sub).fit()
        r2 = model.rsquared
        bss_r2[k][subset] = r2

        if r2 > max_r2_by_k[k]:
            max_r2_by_k[k] = r2
            best_subset_by_k[k] = subset
            

for k in range(1,14):
    print(f'Melhor subset para k = {k} : {best_subset_by_k[k]}')
    print(f'R^2  = {max_r2_by_k[k]}')
    print('----------------------------- \n')

100%|██████████| 13/13 [00:08<00:00,  1.57it/s]

Melhor subset para k = 1 : ('Abdomen',)
R^2  = 0.6404157973526721
----------------------------- 

Melhor subset para k = 2 : ('Weight', 'Abdomen')
R^2  = 0.693716876511187
----------------------------- 

Melhor subset para k = 3 : ('Weight', 'Abdomen', 'Wrist')
R^2  = 0.7100124087139327
----------------------------- 

Melhor subset para k = 4 : ('Weight', 'Abdomen', 'Biceps', 'Wrist')
R^2  = 0.7192801308495235
----------------------------- 

Melhor subset para k = 5 : ('Weight', 'Abdomen', 'Biceps', 'Forearm', 'Wrist')
R^2  = 0.7231554582194162
----------------------------- 

Melhor subset para k = 6 : ('Weight', 'Neck', 'Abdomen', 'Biceps', 'Forearm', 'Wrist')
R^2  = 0.7263015578363177
----------------------------- 

Melhor subset para k = 7 : ('Age', 'Weight', 'Neck', 'Abdomen', 'Thigh', 'Forearm', 'Wrist')
R^2  = 0.7287047610965436
----------------------------- 

Melhor subset para k = 8 : ('Age', 'Weight', 'Neck', 'Abdomen', 'Thigh', 'Biceps', 'Forearm', 'Wrist')
R^2  = 0.731617043




In [63]:
#foward selection

columns = list(X_train.columns)

best_by_k = {}  
selected = []  

for k in range(1, 14):
    best_r2 = -float("inf")
    best_feature_to_add = None

    for feature in columns:
        if feature in selected:
            continue

        trial_features = selected + [feature]
        X_sub = sm.add_constant(X_train[trial_features])
        model = sm.OLS(y_train, X_sub).fit()
        r2 = model.rsquared

        if r2 > best_r2:
            best_r2 = r2
            best_feature_to_add = feature

    selected.append(best_feature_to_add)

    best_by_k[k] = {
        "features": selected.copy(),
        "r2": best_r2
    }

    print(f"k={k:2d} | R²={best_r2:.6f} | features={best_by_k[k]['features']}")



k= 1 | R²=0.640416 | features=['Abdomen']
k= 2 | R²=0.693717 | features=['Abdomen', 'Weight']
k= 3 | R²=0.710012 | features=['Abdomen', 'Weight', 'Wrist']
k= 4 | R²=0.719280 | features=['Abdomen', 'Weight', 'Wrist', 'Biceps']
k= 5 | R²=0.723155 | features=['Abdomen', 'Weight', 'Wrist', 'Biceps', 'Forearm']
k= 6 | R²=0.726302 | features=['Abdomen', 'Weight', 'Wrist', 'Biceps', 'Forearm', 'Neck']
k= 7 | R²=0.728615 | features=['Abdomen', 'Weight', 'Wrist', 'Biceps', 'Forearm', 'Neck', 'Thigh']
k= 8 | R²=0.731617 | features=['Abdomen', 'Weight', 'Wrist', 'Biceps', 'Forearm', 'Neck', 'Thigh', 'Age']
k= 9 | R²=0.733688 | features=['Abdomen', 'Weight', 'Wrist', 'Biceps', 'Forearm', 'Neck', 'Thigh', 'Age', 'Hip']
k=10 | R²=0.734807 | features=['Abdomen', 'Weight', 'Wrist', 'Biceps', 'Forearm', 'Neck', 'Thigh', 'Age', 'Hip', 'Ankle']
k=11 | R²=0.735076 | features=['Abdomen', 'Weight', 'Wrist', 'Biceps', 'Forearm', 'Neck', 'Thigh', 'Age', 'Hip', 'Ankle', 'Chest']
k=12 | R²=0.735308 | features=[

In [104]:
# backward selection

best_back_by_k = {}

columns = list(X_train.columns)

deleted = []

for k in range(13,0,-1):

    best_r2 = float("inf")
    best_feature_to_drop = None

    for feature in columns:
        
        if feature not in deleted:

            nova = columns.copy()
            nova.remove(feature)

            X_sub = sm.add_constant(X_train[nova])
            model = sm.OLS(y_train,X_sub).fit()
            r2 = model.rsquared

            if r2 < best_r2:
                best_r2 = r2
                best_feature_to_drop = feature

    deleted.append(best_feature_to_drop)
    columns.remove(best_feature_to_drop)
    print(columns)
    best_back_by_k[k] = {
        'features': columns,
        'r2': best_r2
    }

['Age', 'Weight', 'Height', 'Neck', 'Chest', 'Hip', 'Thigh', 'Knee', 'Ankle', 'Biceps', 'Forearm', 'Wrist']
['Weight', 'Height', 'Neck', 'Chest', 'Hip', 'Thigh', 'Knee', 'Ankle', 'Biceps', 'Forearm', 'Wrist']
['Weight', 'Height', 'Neck', 'Hip', 'Thigh', 'Knee', 'Ankle', 'Biceps', 'Forearm', 'Wrist']
['Weight', 'Neck', 'Hip', 'Thigh', 'Knee', 'Ankle', 'Biceps', 'Forearm', 'Wrist']
['Weight', 'Neck', 'Hip', 'Thigh', 'Knee', 'Ankle', 'Biceps', 'Forearm']
['Weight', 'Neck', 'Thigh', 'Knee', 'Ankle', 'Biceps', 'Forearm']
['Neck', 'Thigh', 'Knee', 'Ankle', 'Biceps', 'Forearm']
['Neck', 'Knee', 'Ankle', 'Biceps', 'Forearm']
['Neck', 'Ankle', 'Biceps', 'Forearm']
['Ankle', 'Biceps', 'Forearm']
['Ankle', 'Forearm']
['Ankle']
[]


In [105]:
best_back_by_k

{13: {'features': [], 'r2': np.float64(0.5947670493617078)},
 12: {'features': [], 'r2': np.float64(0.530501426540588)},
 11: {'features': [], 'r2': np.float64(0.47427419784294844)},
 10: {'features': [], 'r2': np.float64(0.4290737497047791)},
 9: {'features': [], 'r2': np.float64(0.4080657738060006)},
 8: {'features': [], 'r2': np.float64(0.3977841462031726)},
 7: {'features': [], 'r2': np.float64(0.34691337347187634)},
 6: {'features': [], 'r2': np.float64(0.3240882183124175)},
 5: {'features': [], 'r2': np.float64(0.2909651786494021)},
 4: {'features': [], 'r2': np.float64(0.2484349062896496)},
 3: {'features': [], 'r2': np.float64(0.13953148120287373)},
 2: {'features': [], 'r2': np.float64(0.08090122441323933)},
 1: {'features': [], 'r2': np.float64(0.0)}}

In [None]:
aa = columns.remove('Age')
columns

TypeError: 'str' object cannot be interpreted as an integer

In [73]:
selected

['Age',
 'Weight',
 'Height',
 'Neck',
 'Chest',
 'Abdomen',
 'Hip',
 'Thigh',
 'Knee',
 'Ankle',
 'Biceps',
 'Forearm',
 'Wrist']