Find the set of hyperparameters that maximizes the model's generalization performance: the search is cross-validated on the training set, and the best model is then scored on a held-out test set.

In [1]:
import numpy as np
import pandas as pd
from sklearn import model_selection
from sklearn.compose import(
    make_column_selector as selector,
    ColumnTransformer,
)

from sklearn.preprocessing import (
    OneHotEncoder,
    StandardScaler,
)
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
import plotly.express as px

In [2]:
# Read the raw table from a CSV file located next to the notebook.
df = pd.read_csv(filepath_or_buffer='adult_cencus.csv')

In [3]:
# Features: every column except the target (`class`) and `education_num`.
data = df.drop(columns=['education_num', 'class'])

# Binary target: 1 for '>50K', 0 for '<=50K'. The labels are stripped first so
# the mapping works whether or not the CSV values carry padding whitespace
# (the raw labels appear as ' >50K' / ' <=50K').
target = df['class'].str.strip().map({'>50K': 1, '<=50K': 0})

# `Series.map` silently produces NaN for any label missing from the dict;
# fail here rather than later, deep inside the model fitting.
assert target.notna().all(), "unexpected label found in the 'class' column"

In [4]:
# Hold out 20% of the rows as a test set; the fixed seed keeps the split
# identical across runs.
X_full_train, X_test, y_full_train, y_test = model_selection.train_test_split(
    data, target, test_size=.2, random_state=42,
)

# Cell output: shapes of the four splits, in (X train, y train, X test, y test) order.
tuple(part.shape for part in (X_full_train, y_full_train, X_test, y_test))

((39073, 13), (39073,), (9769, 13), (9769,))

In [5]:
# Split the feature columns by dtype: numeric columns are standardized,
# object (string) columns are one-hot encoded.
numerical = selector(dtype_include=np.number)(data)
categorical = selector(dtype_include=object)(data)

# handle_unknown='ignore': a category not seen during fit does not raise at
# transform time.
cat_preprocessor = OneHotEncoder(handle_unknown='ignore')

# The transformer names below are referenced by the hyperparameter search keys,
# so they must stay in sync with `param_distributions`.
preprocessor = ColumnTransformer(
    transformers=[
        ('num_preprocessor', StandardScaler(), numerical),
        ('cat_preprocessor', cat_preprocessor, categorical),
    ],
    remainder='passthrough',  # columns matched by neither selector are kept as-is
    sparse_threshold=0,       # always return a dense array
)

In [6]:
# Full pipeline: preprocessing followed by a logistic-regression classifier.
# `make_pipeline` names the steps after their lowercased class names
# ('columntransformer', 'logisticregression').
model = make_pipeline(
    preprocessor,
    LogisticRegression(max_iter=1000),
)

In [7]:
from scipy.stats import loguniform

class loguniform_int:
    """Log-uniform distribution whose samples are cast to integers.

    Wraps ``scipy.stats.loguniform(a, b)`` and exposes the ``rvs`` method
    expected from a sampling distribution.
    """

    def __init__(self, a, b):
        # Frozen continuous distribution that produces the raw float draws.
        self._distribution = loguniform(a, b)

    def rvs(self, *args, **kwargs):
        """Draw samples from the wrapped distribution and cast them to ``int``.

        All arguments are forwarded unchanged to the underlying ``rvs``.
        """
        float_draws = self._distribution.rvs(*args, **kwargs)
        return float_draws.astype(int)

In [8]:
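# Uncomment to list every tunable parameter name of the pipeline
# (useful for building the keys of the search space):
# for param in model.get_params():
#   print(param)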

In [9]:
%%time

# Search space. Keys follow the `<pipeline step>__<transformer name>__<parameter>`
# naming of the pipeline; list values are sampled uniformly, while `C` is drawn
# from a log-uniform distribution over [0.001, 10].
param_distributions = {
    'columntransformer__num_preprocessor__with_mean': [True, False],
    'columntransformer__num_preprocessor__with_std': [True, False],
    'logisticregression__C': loguniform(0.001, 10),
}

# 20 sampled candidates x 5-fold cross-validation = 100 fits.
model_random_search = model_selection.RandomizedSearchCV(
    model,
    param_distributions=param_distributions,
    n_iter=20,
    cv=5,
    verbose=1,
    error_score='raise',  # surface any fit failure instead of scoring it as NaN
    random_state=42,      # seed the candidate sampling so the search is reproducible
).fit(X_full_train, y_full_train)

Fitting 5 folds for each of 20 candidates, totalling 100 fits
CPU times: user 4min 34s, sys: 1min 15s, total: 5min 50s
Wall time: 3min 6s


In [10]:
# Score the refitted best estimator on the held-out test set.
accuracy = model_random_search.score(X_test, y_test)

print(f"The test accuracy score of the best model is {accuracy:.2f}")

The test accuracy score of the best model is 0.85


In [11]:
from pprint import pprint

# Show the hyperparameter combination selected by the randomized search.
print("The best parameters are:")
pprint(model_random_search.best_params_)

The best parameters are:
{'columntransformer__num_preprocessor__with_mean': True,
 'columntransformer__num_preprocessor__with_std': True,
 'logisticregression__C': 5.089436203781769}
