In [1]:
import numpy as np
import pandas as pd
import sklearn
from sklearn.metrics import roc_auc_score

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.datasets import fetch_openml

In [3]:
from hypster import HyPSTERClassifier

In [4]:
# Single seed shared by the train/test split and the HyPSTER classifier below.
SEED = 42

# Load the Adult Income Dataset

In [5]:
# Fetch version 2 of the "adult" dataset from OpenML, then split the returned
# bunch into a feature frame (columns labelled with the dataset's feature
# names) and the target.
adult = fetch_openml(name="adult", version=2)
X = pd.DataFrame(adult.data, columns=adult.feature_names)
y = adult.target

In [6]:
# Names of the columns handed to HyPSTER as categorical features.
cat_cols = [
    "workclass",
    "education",
    "marital-status",
    "occupation",
    "relationship",
    "race",
    "sex",
    "native-country",
]

In [7]:
# Hold out 33% of the rows for the final evaluation; the split is seeded with
# SEED so it is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=SEED,
)

# Fit HyPSTER on the Data

In [8]:
# Libraries and model families passed to HyPSTERClassifier below.
frameworks = [
    "xgboost",
    "lightgbm",
    "sklearn",
]
model_types = [
    "tree_based",
    "linear",
]

In [9]:
# Configure the HyPSTER search over the chosen frameworks and model types,
# scored with ROC AUC and seeded with SEED.
# NOTE(review): cv, max_iter and n_jobs presumably follow scikit-learn
# conventions (fold count, iteration cap, use all cores) - confirm in the
# HyPSTER documentation.
clf = HyPSTERClassifier(
    frameworks=frameworks,
    model_types=model_types,
    scoring="roc_auc",
    cv=5,
    max_iter=100,
    n_jobs=-1,
    random_state=SEED,
)

In [10]:
%%time
# Run the search on the training split, passing cat_cols as the categorical
# column names; %%time reports how long the whole cell took.
# NOTE(review): n_trials presumably caps the number of configurations tried - confirm.
clf.fit(X_train, y_train, cat_cols=cat_cols, n_trials=50)

XGBoost Linear Classifier Score: 0.86262
XGBoost Linear Classifier Score: 0.85293
XGBoost Linear Classifier Score: 0.84853
LightGBM Classifier Score: 0.85773
XGBoost Tree-Based Classifier Score: 0.86388
XGBoost Tree-Based Classifier Score: 0.87446
SGD Classifier Score: 0.87097
XGBoost Linear Classifier Score: 0.84882
XGBoost Tree-Based Classifier Score: 0.857
LightGBM Classifier Score: 0.86426
XGBoost Linear Classifier Score: 0.86936
XGBoost Tree-Based Classifier Score: 0.87312
XGBoost Tree-Based Classifier Score: 0.8779
XGBoost Tree-Based Classifier Score: 0.87032
XGBoost Tree-Based Classifier Score: 0.87628
LightGBM Classifier Score: 0.87038
SGD Classifier Score: 0.86785
XGBoost Tree-Based Classifier Score: 0.86606
XGBoost Tree-Based Classifier Score: 0.86896
LightGBM Classifier Score: 0.86015
XGBoost Tree-Based Classifier Score: 0.8716
LightGBM Classifier Score: 0.85945
XGBoost Tree-Based Classifier Score: 0.86427
Wall time: 2min 31s


## Visualize Learning Curves (Optional)

In [15]:
# Optional: plot the intermediate values recorded for the trials in clf.study.
# Optuna's plotting helpers need plotly and raise ImportError without it, so
# the plot is skipped with a message instead of requiring the reader to
# uncomment code by hand. The import stays local because it is only needed
# for this optional step.
try:
    from optuna.visualization import plot_intermediate_values

    learning_curves = plot_intermediate_values(clf.study)
except ImportError as exc:
    print(f"Skipping learning-curve plot: {exc}")
else:
    # Inside a try/else block the figure is not auto-displayed, so show it explicitly.
    learning_curves.show()

# Review Results on the Test Set

In [16]:
# Best score recorded by the fitted search.
# NOTE(review): presumably the cross-validated ROC AUC, given scoring="roc_auc" - confirm.
clf.best_score_

0.8778967704000706

In [18]:
# Evaluate the fitted search on the held-out split.
# roc_auc_score is imported explicitly at the top of the notebook: reaching it
# through a bare `import sklearn` only works when another import happens to
# have loaded the sklearn.metrics submodule.
# NOTE(review): column 1 of predict_proba is assumed to be the positive class
# (the usual scikit-learn ordering) - confirm against clf's class order.
preds = clf.predict_proba(X_test)
roc_score = roc_auc_score(y_test, preds[:, 1])
print(roc_score)

0.8839704458038956
