In [356]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, VotingClassifier, AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
import xgboost as xgb
import optuna
import warnings
from tqdm import tqdm

In [357]:
IN_DIR = 'kaggle/input/apl-2025-spring-smoker-status'
OUT_DIR = 'kaggle/output'
SEED = 42

In [358]:
optuna.logging.set_verbosity(optuna.logging.ERROR)
warnings.filterwarnings("ignore")

## Read the Data

In [359]:
traindata = np.genfromtxt(f"{IN_DIR}/train.csv", delimiter=',', skip_header=1)
testdata = np.genfromtxt(f"{IN_DIR}/test.csv", delimiter=',', skip_header=1)

In [360]:
trainset, valset = train_test_split(traindata, test_size=0.3, random_state=SEED)

In [361]:
X_train, y_train, X_val, y_val = trainset[:, :-1], trainset[:, -1], valset[:, :-1], valset[:, -1]
X_test = testdata

## Build a Model/Pipline

### Optuna Objective Functions

In [362]:
def xgb_objective(trial):
    params = {
        'objective': 'binary:logistic',
        'eval_metric': 'logloss',
        'tree_method': 'auto',  # change to 'gpu_hist' if using GPU
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
        'n_estimators': trial.suggest_int('n_estimators', 50, 500),
        'lambda': trial.suggest_float('lambda', 1e-3, 10.0, log=True),
        'alpha': trial.suggest_float('alpha', 1e-3, 10.0, log=True),
        'verbosity': 0,
    }

    model = make_pipeline(StandardScaler(), xgb.XGBClassifier(**params))
    model.fit(X_train, y_train)
    preds = model.predict(X_val)
    accuracy = accuracy_score(y_val, preds)

    return accuracy

In [363]:
def rf_objective(trial):
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 50, 500),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 10),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 10),
        'max_features': trial.suggest_categorical('max_features', ['sqrt', 'log2']),
    }

    model = make_pipeline(StandardScaler(), RandomForestClassifier(**params))
    model.fit(X_train, y_train)
    preds = model.predict(X_val)
    accuracy = accuracy_score(y_val, preds)

    return accuracy

In [364]:
def lr_objective(trial):
    params = {
        'C': trial.suggest_float('C', 1e-4, 1e4, log=True),
        'max_iter': trial.suggest_int('max_iter', 100, 1000),
        'solver': trial.suggest_categorical('solver', ['liblinear', 'saga']),
        'penalty': trial.suggest_categorical('penalty', ['l1', 'l2']),
    }

    model = make_pipeline(StandardScaler(), LogisticRegression(**params))
    model.fit(X_train, y_train)
    preds = model.predict(X_val)
    accuracy = accuracy_score(y_val, preds)

    return accuracy

In [None]:

def ada_objective(trial):
    base_estimator = DecisionTreeClassifier(max_depth=trial.suggest_int('base_max_depth', 2, 5))

    params = {
        'n_estimators': trial.suggest_int('n_estimators', 50, 500),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
        'base_estimator': base_estimator,
    }

    model = make_pipeline(StandardScaler(), AdaBoostClassifier(**params))
    model.fit(X_train, y_train)
    preds = model.predict(X_val)
    accuracy = accuracy_score(y_val, preds)

    return accuracy

### Studies

In [366]:
models = [
    ('xgboost', xgb_objective, 10),
    ('random_forest', rf_objective, 10),
    ('logistic_regression', lr_objective, 10),
    ('ada_boost', ada_objective, 10),
]

In [None]:
best_models = {}
best_scores = []

for name, objective, n_trials in models:
    print(f"Optimizing {name}...")

    study = optuna.create_study(direction='maximize')

    with tqdm(total=n_trials, desc=f"{name}") as pbar:
        def make_callback(pbar):
            def callback(study, trial):
                score = trial.value
                best_score = study.best_value
                pbar.set_postfix({"score": f"{score:.4f}", "best": f"{best_score:.4f}"})
                pbar.update(1)

            return callback

        study.optimize(objective, n_trials=n_trials, callbacks=[make_callback(pbar)])

    print(f"\nBest trial for {name}:")
    trial = study.best_trial
    print(f"  Value: {trial.value:.4f}")
    print("  Params:")
    for key, value in trial.params.items():
        print(f"    {key}: {value}")

    if name == 'logistic_regression':
        model = LogisticRegression(**trial.params)
    elif name == 'random_forest':
        model = RandomForestClassifier(**trial.params)
    elif name == 'xgboost':
        model = xgb.XGBClassifier(**trial.params, verbosity=0)
    elif name == 'ada_boost':
        # Extract AdaBoost params
        ada_params = {
            'n_estimators': trial.params['n_estimators'],
            'learning_rate': trial.params['learning_rate'],
        }

        # Extract base estimator param
        base_max_depth = trial.params['base_max_depth']
        base_estimator = DecisionTreeClassifier(max_depth=base_max_depth)

        model = AdaBoostClassifier(base_estimator=base_estimator, **ada_params)
    else:
        raise ValueError(f"Unknown model name {name}")

    best_models[name] = model
    best_scores.append(trial.value)


Optimizing xgboost...


xgboost: 100%|██████████| 10/10 [00:12<00:00,  1.22s/it, score=0.8049, best=0.8122]



Best trial for xgboost:
  Value: 0.8122
  Params:
    max_depth: 9
    learning_rate: 0.01689528131437169
    n_estimators: 288
    lambda: 2.9706895135096407
    alpha: 0.10045769879010309
Optimizing random_forest...


random_forest: 100%|██████████| 10/10 [00:18<00:00,  1.84s/it, score=0.7993, best=0.8018]



Best trial for random_forest:
  Value: 0.8018
  Params:
    n_estimators: 492
    max_depth: 9
    min_samples_split: 7
    min_samples_leaf: 6
    max_features: log2
Optimizing logistic_regression...


logistic_regression: 100%|██████████| 10/10 [00:01<00:00,  5.51it/s, score=0.7869, best=0.7869]



Best trial for logistic_regression:
  Value: 0.7869
  Params:
    C: 1.3059668990720392
    max_iter: 191
    solver: liblinear
    penalty: l2
Optimizing ada_boost...


ada_boost: 100%|██████████| 10/10 [00:49<00:00,  4.95s/it, score=0.8002, best=0.8064]


Best trial for ada_boost:
  Value: 0.8064
  Params:
    base_max_depth: 3
    n_estimators: 156
    learning_rate: 0.0502008338421205





TypeError: __init__() got an unexpected keyword argument 'base_max_depth'

In [None]:
# Fit all tuned models on the training data
for model in best_models.values():
    model.fit(X_train, y_train)

In [None]:
# Create weighted voting classifier
voting_clf = VotingClassifier(
    estimators=[(name, model) for name, model in best_models.items()],
    voting='soft',
    weights=best_scores  # weights = list of Optuna best scores
)

# Fit voting ensemble
voting_clf.fit(X_train, y_train)

VotingClassifier(estimators=[('xgboost',
                              XGBClassifier(alpha=9.844348377647071,
                                            base_score=0.5, booster='gbtree',
                                            colsample_bylevel=1,
                                            colsample_bynode=1,
                                            colsample_bytree=1,
                                            enable_categorical=False, gamma=0,
                                            gpu_id=-1, importance_type=None,
                                            interaction_constraints='',
                                            lambda=0.9872504954973608,
                                            learning_rate=0.03481065336422548,
                                            max_delta_step=0, max_depth=5,
                                            mi...
                                            validate_parameters=1,
                                            verbo

### Evaluate on Validation Set

In [None]:
# Predict and evaluate
val_pred = voting_clf.predict(X_val)
print("Weighted voting ensemble accuracy:", accuracy_score(y_val, val_pred))

Weighted voting ensemble accuracy: 0.8102222222222222


## Make Predictions on Test Set

In [None]:
test_pred = voting_clf.predict(X_test).astype(int)

### Build the Submission File

In [None]:
test_ids = X_test[:, 0].astype(int)

In [None]:
sub_data = np.column_stack((test_ids, test_pred))

np.savetxt(f"{OUT_DIR}/smoking.csv", sub_data, delimiter=',', header='id,smoking', comments='', fmt=['%d', '%d'])