In [None]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
from sklearn.preprocessing import StandardScaler

In [None]:
# Load the data set and turn the target into a 0/1 indicator (1 when income is ">50K").
df = pd.read_csv("adult.csv")
df["income"] = (df["income"] == ">50K").astype(int)

# "?" is treated as a missing value; rows containing any are dropped.
# Note that dropna() keeps the original row labels, so the index now has gaps.
df = df.replace('?', np.nan).dropna()

X = df.drop(columns=["income"])
y = df["income"]

# One-hot encode the non-numeric columns, dropping the first level of each.
X = pd.get_dummies(X, drop_first=True, dtype=float)

# Standardise every feature column.
# fit_transform returns a bare array, so the row labels must be re-attached
# explicitly: without index=X.index the frame would get a fresh 0..n-1 index,
# and aligning it with y (which still carries the gapped labels) would pair
# features with the wrong targets and discard rows.
scaler = StandardScaler()
X = pd.DataFrame(
    scaler.fit_transform(X),
    columns=X.columns,
    index=X.index,
)

# Safety net: keep only the row labels present in both X and y.
X, y = X.align(y, join="inner", axis=0)

In [None]:
def build_model(selected_vars, X, y):
    """Fit a logistic regression on a subset of columns and report its AIC.

    Parameters
    ----------
    selected_vars : list
        Column labels of ``X`` to use as predictors.
    X : DataFrame-like
        Feature table; only ``X[selected_vars]`` is used.
    y : array-like
        Response passed to ``sm.Logit`` as the endogenous variable.

    Returns
    -------
    tuple
        ``(results, aic)``: the object returned by ``Logit.fit`` and the
        value of its ``aic`` attribute.
    """
    # The design matrix is the chosen columns passed through sm.add_constant.
    design = sm.add_constant(X[selected_vars])
    fitted = sm.Logit(y, design).fit(method='newton', maxiter=200, disp=False)
    return fitted, fitted.aic

def forward_selection(X, y, fit_fn=None):
    """Greedy forward variable selection driven by a lower-is-better score.

    Starting from an empty selection, every round fits one model per
    remaining column (the current selection plus that column) and adds the
    column whose model has the lowest score. The search ends when no
    remaining column lowers the score, or when every column has been added.

    Parameters
    ----------
    X : object exposing ``.columns``
        Candidate predictors; passed unchanged to ``fit_fn``.
    y : any
        Response; passed unchanged to ``fit_fn``.
    fit_fn : callable, optional
        ``fit_fn(selected_vars, X, y) -> (model, score)`` where a lower
        score is better. Defaults to ``build_model`` (AIC of a logit fit).

    Returns
    -------
    tuple
        ``(best_model, selected)``: the model of the last accepted step and
        the column labels in the order they were added. ``best_model`` is
        ``None`` when nothing was selected (for example, ``X`` has no columns).

    Notes
    -----
    Exceptions raised by ``fit_fn`` propagate to the caller. A score that is
    NaN never compares as smaller, so such candidates are ignored. The
    current selection is printed after every accepted step.
    """
    if fit_fn is None:
        fit_fn = build_model

    remaining = list(X.columns)
    selected = []
    current_score = float('inf')
    best_model = None

    while remaining:
        # Track only the best candidate of this round instead of keeping every
        # fitted model alive; seeding with current_score means a candidate is
        # recorded only if it actually improves on the present selection.
        round_score, round_candidate, round_model = current_score, None, None
        for candidate in remaining:
            model, score = fit_fn(selected + [candidate], X, y)
            # Strict "<" keeps the earliest column on ties and skips NaN scores.
            if score < round_score:
                round_score, round_candidate, round_model = score, candidate, model

        if round_candidate is None:
            # No remaining column lowers the score: stop.
            break

        remaining.remove(round_candidate)
        selected.append(round_candidate)
        current_score = round_score
        best_model = round_model
        print(selected)

    return best_model, selected

In [None]:
# # 1) load and clean on a single DataFrame
# df = pd.read_csv("adult.csv")
# df = df.replace('?', np.nan).dropna().copy()
#
# # 2) target
# df["income"] = (df["income"] == ">50K").astype(int)
# y = df["income"]
#
# # 3) features
# X = df.drop(columns=["income"])
# X = pd.get_dummies(X, drop_first=True, dtype=float)
#
# # 4) (optional) scaling – KEEP the index!
# scaler = StandardScaler()
# X = pd.DataFrame(scaler.fit_transform(X), columns=X.columns, index=X.index)
#
# # 5) just in case, align the indexes (in case something got out of sync)
# X, y = X.align(y, join="inner", axis=0)
#
# def build_model(selected_vars, X, y):
#     X_sel = sm.add_constant(X[selected_vars], has_constant='add')
#     with warnings.catch_warnings():
#         warnings.filterwarnings("ignore", category=ConvergenceWarning)
#         model = sm.Logit(y, X_sel).fit(method='newton', maxiter=200, disp=False)
#     return model, float(getattr(model, 'aic', np.inf))
#
# def forward_selection(X, y):
#     remaining = list(X.columns)
#     selected = [ ]
#     best_model = None
#     current_score = np.inf
#
#     while remaining:
#         cand_best = (np.inf, None, None)
#         for c in remaining:
#             try:
#                 m, s = build_model(selected + [c], X, y)
#                 if s < cand_best[0]:
#                     cand_best = (s, c, m)
#             except Exception:
#                 pass
#
#         best_new_score, best_candidate, best_candidate_model = cand_best
#         if best_candidate is not None and best_new_score < current_score - 1e-8:
#             selected.append(best_candidate)
#             remaining.remove(best_candidate)
#             current_score = best_new_score
#             best_model = best_candidate_model
#         else:
#             break
#         print(selected)
#     return best_model, selected

# Run the forward search over all prepared feature columns.
final_model, selected_vars = forward_selection(X=X, y=y)

# Report the chosen columns, then the summary of the final fitted model.
print(selected_vars)
print(final_model.summary())

# Presumably the selection printed by an earlier run (may differ on re-run):
# ['capital.loss', 'capital.gain', 'education.num', 'age']

