In [1]:
import pandas as pd
from sklearn import model_selection
from sklearn.compose import(
    make_column_selector as selector,
    ColumnTransformer,
)

from sklearn.preprocessing import OrdinalEncoder
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.pipeline import Pipeline

In [2]:
df = pd.read_csv('adult_cencus.csv')
df.sample(n=5)

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,class
40383,33,Private,234537,HS-grad,9,Married-civ-spouse,Craft-repair,Husband,White,Male,0,0,40,United-States,<=50K
26248,65,Private,180807,HS-grad,9,Separated,Protective-serv,Not-in-family,White,Male,991,0,20,United-States,<=50K
5698,51,Private,240988,11th,7,Married-civ-spouse,Machine-op-inspct,Husband,Asian-Pac-Islander,Male,0,0,40,Philippines,<=50K
39304,35,Local-gov,184117,Masters,14,Never-married,Prof-specialty,Own-child,White,Female,0,0,25,United-States,<=50K
28943,23,,86337,Some-college,10,Never-married,,Not-in-family,White,Female,0,0,15,United-States,<=50K


In [3]:
df.isna().sum()

age                  0
workclass         2799
fnlwgt               0
education            0
education_num        0
marital_status       0
occupation        2809
relationship         0
race                 0
sex                  0
capital_gain         0
capital_loss         0
hours_per_week       0
native_country     857
class                0
dtype: int64

In [4]:
data, target = df.drop(columns=['education_num', 'class']), df['class'].map({' >50K':1, ' <=50K':0})

In [5]:
target.isna().sum()

0

In [6]:
X_full_train, X_test, y_full_train, y_test = model_selection.train_test_split(
    data,
    target,
    test_size=.2,
    random_state=42
)

X_full_train.shape, y_full_train.shape, X_test.shape, y_test.shape

((39073, 13), (39073,), (9769, 13), (9769,))

In [7]:
categorical = selector(dtype_include=object)(data)
cat_preprocessor = OrdinalEncoder(
    handle_unknown='use_encoded_value', 
    unknown_value=-1
)

preprocessor = ColumnTransformer([
    ('cat_preprocessor', cat_preprocessor, categorical)
],
remainder='passthrough', sparse_threshold=0
)

In [8]:
model = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', HistGradientBoostingClassifier(random_state=42))
])

In [9]:
for param in model.get_params():
  print(param)

memory
steps
verbose
preprocessor
classifier
preprocessor__n_jobs
preprocessor__remainder
preprocessor__sparse_threshold
preprocessor__transformer_weights
preprocessor__transformers
preprocessor__verbose
preprocessor__verbose_feature_names_out
preprocessor__cat_preprocessor
preprocessor__cat_preprocessor__categories
preprocessor__cat_preprocessor__dtype
preprocessor__cat_preprocessor__handle_unknown
preprocessor__cat_preprocessor__unknown_value
classifier__categorical_features
classifier__early_stopping
classifier__l2_regularization
classifier__learning_rate
classifier__loss
classifier__max_bins
classifier__max_depth
classifier__max_iter
classifier__max_leaf_nodes
classifier__min_samples_leaf
classifier__monotonic_cst
classifier__n_iter_no_change
classifier__random_state
classifier__scoring
classifier__tol
classifier__validation_fraction
classifier__verbose
classifier__warm_start


# Tuning hyperparameters manually

Make a search of the best combinations of the `learning_rate` and `max_leaf_nodes`

In [10]:
best_params = {}
score = 0

for lr in [0.01, 0.1, 1, 10.]:
  for leaf in [3, 10, 30]:
    model.set_params(
        classifier__learning_rate=lr, 
        classifier__max_leaf_nodes=leaf
    )

    cv_score = model_selection.cross_val_score(
        model,
        X_full_train,
        y_full_train,
        cv=10,
        n_jobs=-1
    )
    if cv_score.mean() > score:
      score = cv_score.mean()
      best_params['lr'] = lr
      best_params['max_leaf_nodes'] = leaf
      best_params['score'] = score
      best_params['std'] = score.std()
    
    print(f"lr:{lr} - max-leaf-nodes:{leaf} - score:{cv_score.mean():.3f} "
          f"std: {cv_score.std():.3f}"
    )
print()
print(f"best lr: {best_params['lr']}, best max leaf nodes:{best_params['max_leaf_nodes']} "
      f"best score: {best_params['score']:.3f} +/- {best_params['std']:.3f}"
)

lr:0.01 - max-leaf-nodes:3 - score:0.800 std: 0.003
lr:0.01 - max-leaf-nodes:10 - score:0.820 std: 0.004
lr:0.01 - max-leaf-nodes:30 - score:0.849 std: 0.004
lr:0.1 - max-leaf-nodes:3 - score:0.855 std: 0.004
lr:0.1 - max-leaf-nodes:10 - score:0.871 std: 0.005
lr:0.1 - max-leaf-nodes:30 - score:0.874 std: 0.004
lr:1 - max-leaf-nodes:3 - score:0.866 std: 0.005
lr:1 - max-leaf-nodes:10 - score:0.866 std: 0.006
lr:1 - max-leaf-nodes:30 - score:0.860 std: 0.010
lr:10.0 - max-leaf-nodes:3 - score:0.279 std: 0.008
lr:10.0 - max-leaf-nodes:10 - score:0.743 std: 0.082
lr:10.0 - max-leaf-nodes:30 - score:0.656 std: 0.161

best lr: 0.1, best max leaf nodes:30 best score: 0.874 +/- 0.000


In [11]:
model.set_params(
    classifier__learning_rate=best_params['lr'], 
    classifier__max_leaf_nodes=best_params['max_leaf_nodes']
)

model.fit(X_full_train, y_full_train)

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(remainder='passthrough', sparse_threshold=0,
                                   transformers=[('cat_preprocessor',
                                                  OrdinalEncoder(handle_unknown='use_encoded_value',
                                                                 unknown_value=-1),
                                                  ['workclass', 'education',
                                                   'marital_status',
                                                   'occupation', 'relationship',
                                                   'race', 'sex',
                                                   'native_country'])])),
                ('classifier',
                 HistGradientBoostingClassifier(max_leaf_nodes=30,
                                                random_state=42))])

In [12]:
y_pred = model.predict(X_test)

In [13]:
y_pred[:5]

array([0, 0, 0, 0, 0])

In [14]:
score = model.score(X_test, y_test)
score.round(3)

0.874

In [16]:
from sklearn import metrics

recall_score = metrics.recall_score(y_test, y_pred)
pecision_score = metrics.precision_score(y_test, y_pred)
auc = metrics.roc_auc_score(y_test, y_pred)

print(f'recall: {recall_score:.3f}')
print(f'precision: {pecision_score:.3f}')
print(f'auc: {auc:.3f}')

recall: 0.642
precision: 0.795
auc: 0.795
