In [1]:
from lightgbm import LGBMClassifier

import numpy as np

from sklearn import base, pipeline, preprocessing, compose, metrics, model_selection

import pandas as pd

In [2]:
# Column groups used to select model inputs from the loaded table, plus the
# name of the target column.
categorical_cols = [
    'workclass',
    'education',
    'marital-status',
    'occupation',
    'relationship',
    'race',
    'sex',
    'native-country',
]
numerical_cols = [
    'age',
    'fnlwgt',
    'education-num',
    'capital-gain',
    'capital-loss',
    'hours-per-week',
]
label_col = "class"

In [3]:
# Read the dataset, then recode the label column: values exactly equal to one
# of the keys below become the paired integer; any other value is left as-is.
label_encoding = {"<=50K": 0, ">50K": 1}
data = pd.read_csv("adult/data/dataset.csv")
data[label_col] = data[label_col].replace(label_encoding)

In [4]:
# Per-column count of missing entries.
# NOTE(review): only NaN/None are counted; placeholder strings, if the file
# uses any, would not show up here.
missing_per_column = data.isna().sum()
missing_per_column

age               0
workclass         0
fnlwgt            0
education         0
education-num     0
marital-status    0
occupation        0
relationship      0
race              0
sex               0
capital-gain      0
capital-loss      0
hours-per-week    0
native-country    0
class             0
dtype: int64

In [5]:
# Target share of the FULL dataset for each split, and the shuffling seed.
train_size = 0.65
val_size = 0.15
test_size = 0.20
seed = 11

# train_size is implied by the other two fractions; check the three constants
# agree so that editing one of them cannot silently produce a different split
# than the one documented here.
assert abs(train_size + val_size + test_size - 1.0) < 1e-9, "split fractions must sum to 1"

# Step 1: hold out the test set. What remains is the train+validation pool.
train_val_features, test_features, train_val_labels, test_labels = model_selection.train_test_split(
    data[categorical_cols + numerical_cols],
    data[label_col],
    test_size=test_size,
    random_state=seed,
)

# Step 2: val_size is a share of the full dataset, so rescale it to a share of
# the pool. It gets its own name so that val_size keeps its configured meaning
# after this cell has run.
val_fraction_of_pool = data.shape[0] * val_size / train_val_features.shape[0]

train_features, val_features, train_labels, val_labels = model_selection.train_test_split(
    train_val_features,
    train_val_labels,
    test_size=val_fraction_of_pool,
    random_state=seed,
)

In [6]:
# Sizes for the binning / quantile transformers. n_quantiles is only needed by
# the QuantileTransformer alternative mentioned below, which is not enabled.
n_bins = 10
n_quantiles = 10

# Categorical columns -> ordinal integer codes; categories not seen during
# fit are encoded as -1 at transform time.
ordinal_encoder = preprocessing.OrdinalEncoder(
    handle_unknown="use_encoded_value",
    unknown_value=-1,
)
categorical_transformer = pipeline.Pipeline(steps=[('label', ordinal_encoder)])

# Numerical columns -> standard-scaled values. Written as a FeatureUnion so
# further branches can be concatenated alongside; alternatives that were
# tried here and are currently disabled:
#   ('qtscaler', preprocessing.QuantileTransformer(n_quantiles=n_quantiles))
#   ('logscaler', preprocessing.FunctionTransformer(np.log1p))
numerical_transformer = pipeline.FeatureUnion([
    ('sscaler', preprocessing.StandardScaler()),
])

# Alternative treatment of the numerical columns (uniform-width ordinal bins).
# Defined for experimentation but not wired into the preprocessor below.
uniform_binner = preprocessing.KBinsDiscretizer(
    n_bins=n_bins,
    encode="ordinal",
    strategy="uniform",
)
numerical_categorical_transformer = pipeline.Pipeline(steps=[('dscaler', uniform_binner)])

# Route each column group to its transformer; the output columns follow the
# order of the transformers list (categorical first, then numerical).
column_router = compose.ColumnTransformer(
    transformers=[
        ('categorical_transformer', categorical_transformer, categorical_cols),
        ('numerical_transformer', numerical_transformer, numerical_cols),
    ],
    remainder='passthrough',  # pass through any column not listed above
)

preprocessor = pipeline.Pipeline([('columns_transformer', column_router)])

In [7]:
# Report the absolute size of each split and its share of all examples.
n_train = train_features.shape[0]
n_val = val_features.shape[0]
n_test = test_features.shape[0]
total_examples = n_train + n_val + n_test

print("Training examples {} ({})".format(n_train, n_train / total_examples))
print("Validation examples {} ({})".format(n_val, n_val / total_examples))
print("Test examples {} ({})".format(n_test, n_test / total_examples))

Training examples 21163 (0.6499493258806548)
Validation examples 4885 (0.15002610484935966)
Test examples 6513 (0.20002456926998557)


In [8]:
# Fit the preprocessing statistics on the training split only, then apply the
# fitted transform to every split.
# NOTE(review): the split variables are overwritten with the transformed
# output, so this cell presumably cannot be re-run on its own (the column
# transformer selects columns by name) — re-run the split cell first.
preprocessor = preprocessor.fit(train_features, train_labels)

train_features, val_features, test_features = (
    preprocessor.transform(split)
    for split in (train_features, val_features, test_features)
)

# Train and validation are stacked into one set for the cross-validated
# search; the test split is kept separate.
all_features = np.concatenate([train_features, val_features])
all_labels = np.concatenate([train_labels, val_labels])

n_labels = 1

In [9]:
# Peek at the first three rows (up to 15 columns) of the stacked feature matrix.
all_features[0:3, 0:15]

array([[ 2.00000000e+00,  1.50000000e+01,  3.00000000e+00,
         1.00000000e+00,  3.00000000e+00,  4.00000000e+00,
         0.00000000e+00,  1.30000000e+01, -1.15046758e+00,
         2.89216869e+00, -3.80074141e-02, -1.46859301e-01,
        -2.15354774e-01, -3.60679944e-02],
       [ 4.00000000e+00,  1.50000000e+01,  0.00000000e+00,
         1.00000000e+00,  4.00000000e+00,  4.00000000e+00,
         0.00000000e+00,  3.90000000e+01,  4.67736094e-01,
         1.41924226e+00, -3.80074141e-02, -1.46859301e-01,
        -2.15354774e-01, -3.60679944e-02],
       [ 5.00000000e+00,  1.10000000e+01,  6.00000000e+00,
         8.00000000e+00,  4.00000000e+00,  4.00000000e+00,
         0.00000000e+00,  3.90000000e+01,  2.15949448e+00,
        -5.07547252e-01, -4.26395104e-01, -1.46859301e-01,
        -2.15354774e-01, -2.07068513e+00]])

In [10]:
# Row counts of features and labels should match.
(all_features.shape, all_labels.shape)

((26048, 14), (26048,))

In [11]:
# Search space sampled by the randomized search below.
params = {
    "num_leaves": np.arange(5, 51),
    # A wider range, np.arange(1, 101), was also considered for this one.
    "min_child_samples": np.arange(1, 10),
    "learning_rate": [5 * (10 ** i) for i in range(-3, 0, 1)],  # 0.005, 0.05, 0.5
    "n_estimators": np.arange(10, 1001),
}

# Seed the estimator explicitly so the run does not depend on library defaults.
model = LGBMClassifier(objective="binary", random_state=seed)

# random_state fixes which candidates are drawn from `params`; without it
# every run of this cell samples a different set of configurations and the
# reported scores cannot be reproduced. n_iter is left at its default.
gs = model_selection.RandomizedSearchCV(
    model,
    params,
    cv=3,
    verbose=4,
    scoring="roc_auc",
    random_state=seed,
)

gs.fit(all_features, all_labels)

Fitting 3 folds for each of 10 candidates, totalling 30 fits
[CV 1/3] END learning_rate=0.05, min_child_samples=6, n_estimators=939, num_leaves=33;, score=0.920 total time=   2.6s
[CV 2/3] END learning_rate=0.05, min_child_samples=6, n_estimators=939, num_leaves=33;, score=0.914 total time=   3.0s
[CV 3/3] END learning_rate=0.05, min_child_samples=6, n_estimators=939, num_leaves=33;, score=0.925 total time=   1.4s
[CV 1/3] END learning_rate=0.05, min_child_samples=4, n_estimators=325, num_leaves=33;, score=0.926 total time=   0.7s
[CV 2/3] END learning_rate=0.05, min_child_samples=4, n_estimators=325, num_leaves=33;, score=0.920 total time=   1.8s
[CV 3/3] END learning_rate=0.05, min_child_samples=4, n_estimators=325, num_leaves=33;, score=0.930 total time=   1.5s
[CV 1/3] END learning_rate=0.05, min_child_samples=8, n_estimators=208, num_leaves=19;, score=0.927 total time=   0.3s
[CV 2/3] END learning_rate=0.05, min_child_samples=8, n_estimators=208, num_leaves=19;, score=0.922 total 

In [12]:
# Tabulate per-candidate timings, sampled parameters and fold scores.
cv_results = pd.DataFrame(gs.cv_results_)
cv_results

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_num_leaves,param_n_estimators,param_min_child_samples,param_learning_rate,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
0,2.259012,0.696914,0.065408,0.017082,33,939,6,0.05,"{'num_leaves': 33, 'n_estimators': 939, 'min_c...",0.920022,0.913864,0.924561,0.919482,0.004384,6
1,1.292517,0.464803,0.031507,0.003463,33,325,4,0.05,"{'num_leaves': 33, 'n_estimators': 325, 'min_c...",0.925845,0.919999,0.92983,0.925225,0.004037,4
2,0.333296,0.02409,0.017705,0.002138,19,208,8,0.05,"{'num_leaves': 19, 'n_estimators': 208, 'min_c...",0.92743,0.921844,0.930697,0.926657,0.003655,1
3,0.510853,0.186888,0.018124,0.004386,40,173,9,0.05,"{'num_leaves': 40, 'n_estimators': 173, 'min_c...",0.92622,0.920822,0.929594,0.925545,0.003613,3
4,3.087572,1.580537,0.050549,0.016064,48,911,7,0.5,"{'num_leaves': 48, 'n_estimators': 911, 'min_c...",0.898398,0.652524,0.903272,0.818065,0.117072,10
5,1.532188,0.700276,0.053231,0.008505,28,801,9,0.5,"{'num_leaves': 28, 'n_estimators': 801, 'min_c...",0.90145,0.894851,0.905246,0.900516,0.004295,9
6,0.822541,0.190606,0.016482,0.002813,28,186,5,0.005,"{'num_leaves': 28, 'n_estimators': 186, 'min_c...",0.910388,0.904136,0.913508,0.909344,0.003897,8
7,0.238209,0.078648,0.018341,0.004156,5,330,9,0.05,"{'num_leaves': 5, 'n_estimators': 330, 'min_ch...",0.923668,0.918228,0.925697,0.922531,0.003154,5
8,1.104446,0.27803,0.02894,0.006171,30,259,5,0.05,"{'num_leaves': 30, 'n_estimators': 259, 'min_c...",0.926236,0.921299,0.930201,0.925912,0.003642,2
9,0.032134,0.009225,0.003777,0.00012,5,13,3,0.5,"{'num_leaves': 5, 'n_estimators': 13, 'min_chi...",0.914578,0.907904,0.913461,0.911981,0.002919,7
