In [1]:
import pandas as pd
import statsmodels.api as sm
from sklearn.preprocessing import StandardScaler

In [2]:
# Load the raw table; the CSV is expected in the notebook's working directory.
df = pd.read_csv("adult.csv")

# Binary target: 1 when the income label is exactly ">50K", 0 otherwise.
df["income"] = (df["income"] == ">50K").astype(int)

# Split predictors from the target.
X = df.drop(columns=["income"])
y = df["income"]

# One-hot encode non-numeric predictors as float columns. drop_first removes
# one level per categorical so the dummies are not perfectly collinear with
# the intercept that build_model adds later.
X = pd.get_dummies(X, drop_first=True, dtype=float)

# Standardise every predictor column. fit_transform returns a bare ndarray,
# so both the column labels and the row index are re-attached explicitly;
# keeping the original index guarantees X stays row-aligned with y even if
# df is ever filtered or re-indexed before this point.
scaler = StandardScaler()
X = pd.DataFrame(
    scaler.fit_transform(X),
    columns=X.columns,
    index=X.index,
)


In [3]:
def build_model(selected_vars, X, y):
    """Fit a logistic regression on a subset of columns and report its AIC.

    Parameters
    ----------
    selected_vars : list
        Column labels of ``X`` to use as predictors.
    X : pandas.DataFrame
        Predictor table; only ``selected_vars`` columns are read.
    y : array-like
        Response passed to ``sm.Logit`` as the endogenous variable.

    Returns
    -------
    tuple
        ``(result, aic)`` where ``result`` is the fitted statsmodels result
        object and ``aic`` is its ``aic`` attribute.
    """
    # Intercept column is added by statsmodels in front of the chosen predictors.
    design = sm.add_constant(X[selected_vars])

    # L-BFGS with a capped iteration count; disp=False silences optimiser output.
    fit_options = {"disp": False, "method": "lbfgs", "maxiter": 200}
    result = sm.Logit(y, design).fit(**fit_options)

    return result, result.aic

def forward_selection(X, y, max_features=None, verbose=False):
    """Greedy forward stepwise selection of predictors, scored by AIC.

    Starting from no predictors, each step fits one model per remaining
    column (via ``build_model``) and adds the column whose model has the
    lowest AIC. The search stops as soon as the best candidate does not
    strictly lower the current AIC, when no columns remain, or when
    ``max_features`` columns have been selected.

    Parameters
    ----------
    X : pandas.DataFrame
        Candidate predictors; every column is a candidate.
    y : array-like
        Response passed through to ``build_model``.
    max_features : int or None, optional
        Upper bound on the number of selected columns. ``None`` (default)
        means no bound, which reproduces the exhaustive search. Each step
        costs one model fit per remaining column, so bounding this is the
        simplest way to cap runtime on wide tables.
    verbose : bool, optional
        When True, print the column added and its AIC after each step.

    Returns
    -------
    tuple
        ``(best_model, selected)``: the fitted result for the final selected
        set and the list of selected column labels in the order they were
        added. ``best_model`` is ``None`` and ``selected`` is empty when no
        candidate was accepted (e.g. ``X`` has no columns).
    """
    remaining = list(X.columns)
    selected = []
    current_score = float('inf')
    best_model = None

    while remaining and (max_features is None or len(selected) < max_features):
        # Track only the best candidate of this step instead of keeping every
        # fitted model alive in a list; fitted results hold references to
        # their design matrices, so retaining all of them is memory-hungry.
        step_score = float('inf')
        step_candidate = None
        step_model = None

        for candidate in remaining:
            model, score = build_model(selected + [candidate], X, y)
            # Strict '<' keeps the earliest candidate on ties (same outcome as
            # a stable sort) and never accepts a NaN AIC, because every
            # comparison with NaN is False.
            if score < step_score:
                step_score, step_candidate, step_model = score, candidate, model

        # Stop when nothing usable was found or AIC did not strictly improve.
        if step_candidate is None or not step_score < current_score:
            break

        remaining.remove(step_candidate)
        selected.append(step_candidate)
        current_score = step_score
        best_model = step_model

        if verbose:
            print(f"step {len(selected)}: added {step_candidate!r} (AIC={step_score:.3f})")

    return best_model, selected

In [4]:
# Run the greedy AIC search over every encoded column. Each step refits one
# logistic regression per remaining candidate, so this cell can take a long
# time on wide tables.
final_model, selected_vars = forward_selection(X, y)
print(selected_vars)

# forward_selection returns None for the model when no candidate was accepted
# (for example when X has no columns), so guard before requesting a summary.
if final_model is None:
    print("No predictors were selected; no model to summarise.")
else:
    print(final_model.summary())

KeyboardInterrupt: 