In [1]:
from lightgbm import LGBMClassifier

import numpy as np

from sklearn import base, pipeline, preprocessing, compose, metrics, model_selection, linear_model

import pandas as pd

In [2]:
# Columns holding string categories; these are ordinal-encoded later on.
categorical_cols = [
    "workclass",
    "education",
    "marital-status",
    "occupation",
    "relationship",
    "race",
    "sex",
    "native-country",
]

# Columns holding numeric measurements; these are scaled later on.
numerical_cols = [
    "age",
    "fnlwgt",
    "education-num",
    "capital-gain",
    "capital-loss",
    "hours-per-week",
]

# Name of the target column.
label_col = "class"

In [3]:
# Load the dataset and encode the string label column as a 0/1 integer target.
data = pd.read_csv("adult/data/dataset.csv")

label_mapping = {"<=50K": 0, ">50K": 1}

# `map` is used instead of `replace`: `replace` on an object column relies on
# silent downcasting to end up with an integer dtype, which recent pandas
# versions deprecate. `map` also turns any unexpected label into NaN, which is
# checked below, instead of letting the raw string pass through unnoticed.
encoded_labels = data[label_col].map(label_mapping)

unmapped = encoded_labels.isna()
if unmapped.any():
    unexpected_values = sorted(data.loc[unmapped, label_col].astype(str).unique())
    raise ValueError(
        "Unexpected values in label column '{}': {}".format(label_col, unexpected_values)
    )

# Explicit integer dtype so downstream estimators see a clean binary target.
data[label_col] = encoded_labels.astype("int64")

In [4]:
# Number of missing values in each column of the loaded frame.
missing_per_column = data.isna().sum()
missing_per_column

age               0
workclass         0
fnlwgt            0
education         0
education-num     0
marital-status    0
occupation        0
relationship      0
race              0
sex               0
capital-gain      0
capital-loss      0
hours-per-week    0
native-country    0
class             0
dtype: int64

In [5]:
# Intended share of the full dataset for each split, plus the shared RNG seed.
train_size = 0.65
val_size = 0.15
test_size = 0.20
seed = 11

# First cut: carve the test split out of the complete dataset.
feature_cols = categorical_cols + numerical_cols
train_features, test_features, train_labels, test_labels = model_selection.train_test_split(
    data[feature_cols],
    data[label_col],
    test_size=test_size,
    random_state=seed,
)

# `val_size` was a fraction of the whole dataset; re-express it as a fraction
# of the rows remaining after the test split so the second cut yields the
# same number of validation rows.
val_size = len(data) * val_size / len(train_features)

# Second cut: carve the validation split out of the remaining rows.
train_features, val_features, train_labels, val_labels = model_selection.train_test_split(
    train_features,
    train_labels,
    test_size=val_size,
    random_state=seed,
)

In [6]:
n_bins = 10
n_quantiles = 10

# Categorical columns -> ordinal integer codes; unknown categories are encoded as -1.
categorical_transformer = pipeline.Pipeline(
    steps=[
        (
            "label",
            preprocessing.OrdinalEncoder(
                handle_unknown="use_encoded_value",
                unknown_value=-1,
            ),
        ),
    ]
)

# Numerical columns -> standard-scaled values. The union currently holds a
# single active scaler; the commented entries are alternative scalers.
numerical_transformer = pipeline.FeatureUnion(
    transformer_list=[
        # ("qtscaler", preprocessing.QuantileTransformer(n_quantiles=n_quantiles)),
        ("sscaler", preprocessing.StandardScaler()),
        # ("logscaler", preprocessing.FunctionTransformer(np.log1p)),
    ]
)

# Alternative treatment of numerical columns: uniform-width binning into
# ordinal bin indices. Defined here but not wired into `preprocessor` below.
numerical_categorical_transformer = pipeline.Pipeline(
    steps=[
        (
            "dscaler",
            preprocessing.KBinsDiscretizer(
                n_bins=n_bins,
                encode="ordinal",
                strategy="uniform",
            ),
        ),
    ]
)

# Route each column group to its transformer.
columns_transformer = compose.ColumnTransformer(
    transformers=[
        ("categorical_transformer", categorical_transformer, categorical_cols),
        # ("numerical_categorical_transformer", numerical_categorical_transformer, numerical_cols),
        ("numerical_transformer", numerical_transformer, numerical_cols),
    ],
    remainder="passthrough",  # pass through features not listed above
)

preprocessor = pipeline.Pipeline(
    steps=[
        ("columns_transformer", columns_transformer),
    ]
)

In [7]:
# Report the row count of each split and its share of all rows.
split_sizes = [len(train_features), len(val_features), len(test_features)]
total_examples = sum(split_sizes)

report_templates = [
    "Training examples {} ({})",
    "Validation examples {} ({})",
    "Test examples {} ({})",
]
for template, n_rows in zip(report_templates, split_sizes):
    print(template.format(n_rows, n_rows / total_examples))

Training examples 21163 (0.6499493258806548)
Validation examples 4885 (0.15002610484935966)
Test examples 6513 (0.20002456926998557)


In [8]:
# Fit the preprocessing pipeline on the training split only, then apply the
# fitted transform to every split.
# NOTE: this cell overwrites the *_features variables with their transformed
# versions, so it is meant to run once after the split cell.
preprocessor = preprocessor.fit(train_features, train_labels)

train_features, val_features, test_features = (
    preprocessor.transform(split)
    for split in (train_features, val_features, test_features)
)

# Train and validation rows are pooled (train first, then validation) so the
# cross-validated search can use both.
all_features = np.concatenate((train_features, val_features), axis=0)
all_labels = np.concatenate((train_labels, val_labels), axis=0)

n_labels = 1

In [9]:
# Peek at the first three rows (at most the first 15 columns) of the pooled features.
all_features[0:3, 0:15]

array([[ 2.00000000e+00,  1.50000000e+01,  3.00000000e+00,
         1.00000000e+00,  3.00000000e+00,  4.00000000e+00,
         0.00000000e+00,  1.30000000e+01, -1.15046758e+00,
         2.89216869e+00, -3.80074141e-02, -1.46859301e-01,
        -2.15354774e-01, -3.60679944e-02],
       [ 4.00000000e+00,  1.50000000e+01,  0.00000000e+00,
         1.00000000e+00,  4.00000000e+00,  4.00000000e+00,
         0.00000000e+00,  3.90000000e+01,  4.67736094e-01,
         1.41924226e+00, -3.80074141e-02, -1.46859301e-01,
        -2.15354774e-01, -3.60679944e-02],
       [ 5.00000000e+00,  1.10000000e+01,  6.00000000e+00,
         8.00000000e+00,  4.00000000e+00,  4.00000000e+00,
         0.00000000e+00,  3.90000000e+01,  2.15949448e+00,
        -5.07547252e-01, -4.26395104e-01, -1.46859301e-01,
        -2.15354774e-01, -2.07068513e+00]])

In [10]:
# Shapes of the pooled feature matrix and label vector, in that order.
tuple(array.shape for array in (all_features, all_labels))

((26048, 14), (26048,))

In [11]:
# Candidate values for LogisticRegression's `C` parameter.
params = {
    "C": [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1]
}

model = linear_model.LogisticRegression(max_iter=5000)

gs = model_selection.RandomizedSearchCV(
    model,
    params,
    # Draw exactly as many candidates as the list holds, so every value of C
    # is evaluated even if the list is later extended (the library default of
    # 10 only happened to match the current list length).
    n_iter=len(params["C"]),
    cv=3,
    verbose=4,
    scoring="roc_auc",
    # Seed the candidate sampler so the search stays reproducible should the
    # number of candidates ever exceed `n_iter`.
    random_state=seed,
)

# Search over the pooled train + validation rows using 3-fold cross-validation.
gs = gs.fit(all_features, all_labels)

Fitting 3 folds for each of 10 candidates, totalling 30 fits
[CV 1/3] END .............................C=0.1;, score=0.855 total time=   0.2s
[CV 2/3] END .............................C=0.1;, score=0.849 total time=   0.2s
[CV 3/3] END .............................C=0.1;, score=0.858 total time=   0.2s
[CV 1/3] END .............................C=0.2;, score=0.855 total time=   0.2s
[CV 2/3] END .............................C=0.2;, score=0.849 total time=   0.2s
[CV 3/3] END .............................C=0.2;, score=0.858 total time=   0.2s
[CV 1/3] END .............................C=0.3;, score=0.855 total time=   0.1s
[CV 2/3] END .............................C=0.3;, score=0.849 total time=   0.1s
[CV 3/3] END .............................C=0.3;, score=0.858 total time=   0.1s
[CV 1/3] END .............................C=0.4;, score=0.855 total time=   0.1s
[CV 2/3] END .............................C=0.4;, score=0.849 total time=   0.2s
[CV 3/3] END .............................C=0.4;

In [16]:
# Show the scorer object the fitted search used for its `scoring="roc_auc"` setting.
gs.scorer_

make_scorer(roc_auc_score, needs_threshold=True)

In [24]:
# Score the test split with the best estimator found by the search.
preds = gs.best_estimator_.predict_proba(test_features)

# ROC AUC is computed from the second probability column (index 1).
positive_class_scores = preds[:, 1]
metrics.roc_auc_score(y_true=test_labels, y_score=positive_class_scores)

0.8550446124909651

In [13]:
# Tabulate the per-candidate cross-validation results of the search.
cv_results = pd.DataFrame(data=gs.cv_results_)
cv_results

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
0,0.207954,0.033224,0.003308,0.000447,0.1,{'C': 0.1},0.854674,0.849407,0.85772,0.853934,0.003434,9
1,0.187315,0.034141,0.002991,0.000581,0.2,{'C': 0.2},0.85466,0.8494,0.857749,0.853936,0.003447,4
2,0.122335,0.014736,0.002767,7.6e-05,0.3,{'C': 0.3},0.854662,0.849412,0.857759,0.853944,0.003445,1
3,0.162981,0.012041,0.002568,4.7e-05,0.4,{'C': 0.4},0.854664,0.849401,0.857757,0.85394,0.00345,2
4,0.146123,0.03171,0.002571,3.4e-05,0.5,{'C': 0.5},0.854654,0.8494,0.857754,0.853936,0.003448,5
5,0.178153,0.030628,0.003093,0.000199,0.6,{'C': 0.6},0.854648,0.849399,0.85776,0.853935,0.00345,6
6,0.172916,0.033248,0.002652,5.3e-05,0.7,{'C': 0.7},0.854653,0.849396,0.857754,0.853935,0.00345,8
7,0.148444,0.008858,0.002624,3e-05,0.8,{'C': 0.8},0.854647,0.849396,0.857756,0.853933,0.00345,10
8,0.14359,0.021043,0.00286,0.000113,0.9,{'C': 0.9},0.854658,0.849399,0.857757,0.853938,0.00345,3
9,0.161602,0.014792,0.002684,0.000183,1.0,{'C': 1},0.854656,0.849392,0.857757,0.853935,0.003453,7
