In [1]:
!pip install optuna --quiet

[K     |████████████████████████████████| 348 kB 7.5 MB/s 
[K     |████████████████████████████████| 209 kB 44.4 MB/s 
[K     |████████████████████████████████| 81 kB 9.4 MB/s 
[K     |████████████████████████████████| 78 kB 7.4 MB/s 
[K     |████████████████████████████████| 112 kB 48.5 MB/s 
[K     |████████████████████████████████| 50 kB 5.3 MB/s 
[K     |████████████████████████████████| 147 kB 44.0 MB/s 
[?25h  Building wheel for pyperclip (setup.py) ... [?25l[?25hdone


In [2]:
import pandas as pd
import numpy as np
from sklearn import model_selection
from sklearn import metrics
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, StandardScaler
from sklearn.compose import make_column_selector as selector
from sklearn.impute import SimpleImputer
from sklearn import tree
from sklearn import ensemble
from sklearn.linear_model import LogisticRegression
from sklearn.dummy import DummyClassifier
import xgboost as xgb
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.feature_extraction import DictVectorizer
import warnings
warnings.filterwarnings('ignore')
import optuna
pd.options.display.max_columns = 30


In [4]:
df = pd.read_csv('creditScoreCleaned.csv')
df.head()

Unnamed: 0,status,seniority,home,time,age,marital,records,job,expenses,income,assets,debt,amount,price
0,ok,9,rent,60,30,married,no,freelance,73,129.0,0.0,0.0,800,846
1,ok,17,rent,60,58,widow,no,fixed,48,131.0,0.0,0.0,1000,1658
2,default,10,owner,36,46,married,yes,freelance,90,200.0,3000.0,0.0,2000,2985
3,ok,0,rent,60,24,single,no,fixed,63,182.0,2500.0,0.0,900,1325
4,ok,0,rent,36,26,single,no,fixed,46,107.0,0.0,0.0,310,910


In [5]:
df.shape

(4454, 14)

In [6]:
data, target = df.drop(columns=['status']), df['status'].map({'ok':0, 'default':1})

In [7]:
def tweaking(data=data, target=target, linreg=False):
    X = data.copy()
    y = target.copy()

    numerical = selector(dtype_include=np.number)(X)
    categorical = selector(dtype_include=object)(X)

    sca = StandardScaler()

    num_imputer = SimpleImputer(
        missing_values=np.NaN, 
        strategy='constant', 
        fill_value=0
    )
    cat_imputer = SimpleImputer(
        strategy='most_frequent', 
        fill_value='unk'
    )
    cat_ord_encoder = OrdinalEncoder(
        handle_unknown='use_encoded_value', 
        unknown_value=-1
    )
    cat_ohe = OneHotEncoder(
        sparse=False,
        handle_unknown='ignore'
    )

    X_full_train, X_test, y_full_train, y_test = model_selection.train_test_split(
      X,
      y,
      test_size=.2,
      random_state=42,
    )
    X_train, X_dev, y_train, y_dev = model_selection.train_test_split(
          X_full_train,
          y_full_train,
          test_size=.25,
          random_state=42,
    )

    X_train.loc[:, numerical] = num_imputer.fit_transform(X_train[numerical])
    X_dev.loc[:, numerical] = num_imputer.transform(X_dev[numerical])
    X_test.loc[:, numerical] = num_imputer.transform(X_test[numerical])

    X_train.loc[:, categorical] = cat_imputer.fit_transform(X_train[categorical])
    X_dev.loc[:, categorical] = cat_imputer.transform(X_dev[categorical])
    X_test.loc[:, categorical] = cat_imputer.transform(X_test[categorical])
    
    if linreg:
        X_train.loc[:, numerical] = sca.fit_transform(X_train[numerical])
        X_dev.loc[:, numerical] = sca.transform(X_dev[numerical])
        X_test.loc[:, numerical] = sca.transform(X_test[numerical])


        cat_encoded = cat_ohe.fit_transform(X_train[categorical])
        features = cat_ohe.get_feature_names_out(categorical)
        cat_encoded = pd.DataFrame(cat_encoded, columns=features)
        X_train.reset_index(inplace=True, drop=True)
        X_train = X_train[numerical].join(cat_encoded)

        cat_encoded = cat_ohe.transform(X_dev[categorical])
        cat_encoded = pd.DataFrame(cat_encoded, columns=features)
        X_dev.reset_index(inplace=True, drop=True)
        X_dev = X_dev[numerical].join(cat_encoded)

        cat_encoded = cat_ohe.transform(X_test[categorical])
        cat_encoded = pd.DataFrame(cat_encoded, columns=features)
        X_test.reset_index(inplace=True, drop=True)
        X_test = X_test[numerical].join(cat_encoded)

    else:
        
        X_train.loc[:, categorical] = cat_ord_encoder.fit_transform(X_train[categorical])
        X_dev.loc[:, categorical] = cat_ord_encoder.transform(X_dev[categorical])
        X_test.loc[:, categorical] = cat_ord_encoder.transform(X_test[categorical])
       
    
    return X_train, y_train, X_dev, y_dev, X_test, y_test

In [8]:
models = [
    DummyClassifier,
    LogisticRegression,
    GaussianNB,
    SVC,
    KNeighborsClassifier,
]

X_train, y_train, X_dev, y_dev, X_test, y_test = tweaking(linreg=True)

X = pd.concat([X_train, X_dev])
y = pd.concat([y_train, y_dev])

for model in models:
    cls = model()
    kf = model_selection.KFold(n_splits=5, shuffle=True, random_state=42)
    cv_results = model_selection.cross_val_score(
        cls,
        X,
        y,
        cv=kf,
        scoring="roc_auc",
        n_jobs=-1,
    )

    print(
        f"{model.__name__:22} AUC: "
        f"{cv_results.mean():.3f} STD: {cv_results.std():.2f}"
    )

DummyClassifier        AUC: 0.500 STD: 0.00
LogisticRegression     AUC: 0.840 STD: 0.02
GaussianNB             AUC: 0.780 STD: 0.02
SVC                    AUC: 0.842 STD: 0.02
KNeighborsClassifier   AUC: 0.749 STD: 0.01


In [9]:
def objective(trial):
    classifier_name = trial.suggest_categorical("classifier", ["logit", "svc"])

    if classifier_name == "logit":
        logit_penalty = trial.suggest_categorical("logit_penalty", ["l1", "l2"])
        logit_c = trial.suggest_float("logit_c", 0.001, 10)
        logit_solver = trial.suggest_categorical("logit_solver", ["saga"])

        model = LogisticRegression(
            penalty=logit_penalty,
            C=logit_c,
            solver=logit_solver,
            max_iter=1000
        )
    
    elif classifier_name == "svc":
        svc_c = trial.suggest_float("svc_c", 0.001, 10)
        svc_gamma = trial.suggest_categorical("svc_gamma", ["scale", "auto"])
        svc_class_weight = trial.suggest_categorical("svc_class_weight", ["balanced", None])

        model = SVC(
            C=svc_c,
            gamma=svc_gamma,
            class_weight=svc_class_weight
        )

    kf = model_selection.KFold(n_splits=5, shuffle=True, random_state=42)
    score = model_selection.cross_val_score(
        model,
        X,
        y,
        cv=kf,
        scoring="roc_auc",
        n_jobs=-1
    )

    auc = score.mean()

    return auc

# TPE sampler the default
study = optuna.create_study(
    direction="maximize",
    # sampler=optuna.samplers.TPESampler()
    # sampler=optuna.samplers.RandomSampler()
    sampler=optuna.samplers.CmaEsSampler()
)

study.optimize(objective, n_trials=20)
print("CMA-ES")
print(f"CMA-ES best params: {study.best_params}")
print(f"CMA-ES best score: {study.best_value}")
print()
result = study.trials_dataframe()
print(result["params_classifier"].value_counts())
res = result.groupby(["params_classifier"])["value"].agg(["mean", "std"])
print(res)

[32m[I 2022-10-19 07:55:08,529][0m A new study created in memory with name: no-name-c787734d-6efd-4911-822f-7ebd38a861fc[0m
[32m[I 2022-10-19 07:55:15,461][0m Trial 0 finished with value: 0.8399754303991326 and parameters: {'classifier': 'logit', 'logit_penalty': 'l2', 'logit_c': 8.374371697729224, 'logit_solver': 'saga'}. Best is trial 0 with value: 0.8399754303991326.[0m
[32m[I 2022-10-19 07:55:15,465][0m `CmaEsSampler` only supports two or more dimensional continuous search space. `RandomSampler` is used instead of `CmaEsSampler`.[0m
[32m[I 2022-10-19 07:55:17,619][0m Trial 1 finished with value: 0.840760815908507 and parameters: {'classifier': 'svc', 'svc_c': 5.158246495377517, 'svc_gamma': 'auto', 'svc_class_weight': None}. Best is trial 1 with value: 0.840760815908507.[0m
[32m[I 2022-10-19 07:55:19,428][0m Trial 2 finished with value: 0.8403276158980887 and parameters: {'classifier': 'svc', 'svc_c': 1.2845548670344171, 'svc_gamma': 'scale', 'svc_class_weight': None}

CMA-ES
CMA-ES best params: {'classifier': 'svc', 'svc_c': 0.6237647236965644, 'svc_gamma': 'scale', 'svc_class_weight': None}
CMA-ES best score: 0.8436323657090009

logit    11
svc       9
Name: params_classifier, dtype: int64
                       mean       std
params_classifier                    
logit              0.840021  0.000092
svc                0.838364  0.004759


In [10]:
dv = DictVectorizer(sparse=False, sort=False)
X_dict = X.to_dict(orient="records")
X_test_dict = X_test.to_dict(orient="records")
X = dv.fit_transform(X_dict)
X_test = dv.transform(X_test_dict)


AUC-test:  0.684


In [12]:
model = LogisticRegression(
    C=2.2257902566989283,
    penalty='l2',
    solver='saga'
)
model.fit(X, y)

y_pred = model.predict(X_test)

print(f"AUC-test: {metrics.roc_auc_score(y_test, y_pred): .3f}")

AUC-test:  0.703


In [13]:
def twk(data=data, target=target):
  X = data.copy()
  y = target.copy()

  numerical = selector(dtype_include=np.number)(X)
  categorical = selector(dtype_include=object)(X)
  
  num_imputer = SimpleImputer(
        missing_values=np.NaN, 
        strategy='constant', 
        fill_value=0
    )
  cat_ord_encoder = OrdinalEncoder(
        handle_unknown='use_encoded_value', 
        unknown_value=-1
    )
    
  X_full_train, X_test, y_full_train, y_test = model_selection.train_test_split(
      X,
      y,
      test_size=.2,
      random_state=42,
    )
  X_train, X_dev, y_train, y_dev = model_selection.train_test_split(
          X_full_train,
          y_full_train,
          test_size=.25,
          random_state=42,
    )

  X_train.loc[:, numerical] = num_imputer.fit_transform(X_train[numerical])
  X_dev.loc[:, numerical] = num_imputer.transform(X_dev[numerical])
  X_test.loc[:, numerical] = num_imputer.transform(X_test[numerical])
  X_train.loc[:, categorical] = cat_ord_encoder.fit_transform(X_train[categorical])
  X_dev.loc[:, categorical] = cat_ord_encoder.transform(X_dev[categorical])
  X_test.loc[:, categorical] = cat_ord_encoder.transform(X_test[categorical])
  
  return X_train, y_train, X_dev, y_dev, X_test, y_test

In [14]:
X_train, y_train, X_dev, y_dev, X_test, y_test = twk()

X = pd.concat([X_train, X_dev])
y = pd.concat([y_train, y_test])

X.shape, y.shape, X_test.shape, y_test.shape

((3563, 13), (3563,), (891, 13), (891,))

In [18]:
models = [
    tree.DecisionTreeClassifier,
    ensemble.HistGradientBoostingClassifier,
    ensemble.RandomForestClassifier,
    xgb.XGBClassifier,
]

for model in models:
    cls = model()
    kf = model_selection.KFold(n_splits=10, shuffle=True, random_state=11)
    cv_results = model_selection.cross_val_score(
        cls,
        X,
        y,
        cv=kf,
        scoring="roc_auc",
        n_jobs=-1,
    )

    print(
        f"{model.__name__:22} AUC: "
        f"{cv_results.mean():.3f} STD: {cv_results.std():.2f}"
    )

DecisionTreeClassifier AUC: 0.613 STD: 0.03
HistGradientBoostingClassifier AUC: 0.732 STD: 0.03
RandomForestClassifier AUC: 0.745 STD: 0.02
XGBClassifier          AUC: 0.754 STD: 0.02


In [19]:
def objective(trial):
    classifier_name = trial.suggest_categorical("classifier", ["hist", "rf", "xgb"])

    if classifier_name == "hist":
        hist_learning_rate = trial.suggest_float("hist_learning_rate", 0.001, 0.5)
        hist_max_iter = trial.suggest_int("hist_max_iter", 10, 250)
        hist_max_leaf_nodes = trial.suggest_int("hist_max_leaf_nodes", 10, 150)

        model = ensemble.HistGradientBoostingClassifier(
            learning_rate=hist_learning_rate,
            max_iter=hist_max_iter,
            max_leaf_nodes=hist_max_leaf_nodes
        )
    
    elif classifier_name == "rf":
        rf_n_estimators = trial.suggest_int("rf_n_estimators", 100, 1000)
        rf_criterion = trial.suggest_categorical("rf_criterion", ['gini', 'entropy'])
        rf_max_depth = trial.suggest_int("rf_max_depth", 1, 4)
        rf_min_samples_split = trial.suggest_float("rf_min_samples_split", 0.01, 1)

        model = ensemble.RandomForestClassifier(
            n_estimators=rf_n_estimators,
            criterion=rf_criterion,
            max_depth=rf_max_depth,
            min_samples_split=rf_min_samples_split,
        )
    
    elif classifier_name == "xgb":
        xgb_eta = trial.suggest_float("xgb_eta", 0.001, 0.5)
        xgb_max_depth = trial.suggest_int("xgb_max_depth", 1, 15)
        xgb_min_child_weight = trial.suggest_int("xgb_min_child_weight", 1, 15)
        model = xgb.XGBClassifier(
            eta=xgb_eta,
            max_depth=xgb_max_depth,
            min_child_weight=xgb_min_child_weight,
            objective="binary:logistic",
            nthread=-1,
        )

    kf = model_selection.KFold(n_splits=10, shuffle=True, random_state=0)
    score = model_selection.cross_val_score(
        model,
        X,
        y,
        cv=kf,
        scoring="roc_auc",
        n_jobs=-1
    )

    auc = score.mean()

    return auc

In [20]:
study = optuna.create_study(
    direction="maximize",
    # sampler=optuna.samplers.TPESampler()
    sampler=optuna.samplers.RandomSampler()
    # sampler=optuna.samplers.CmaEsSampler()
)

study.optimize(objective, n_trials=100)
print("Randomized Sample")
print(f"RS best params: {study.best_params}")
print(f"RS best score: {study.best_value}")
print()
result = study.trials_dataframe()
print(result["params_classifier"].value_counts())
res = result.groupby(["params_classifier"])["value"].agg(["mean", "std"])
print(res)


[32m[I 2022-10-19 08:13:23,489][0m A new study created in memory with name: no-name-df884ae9-0bb3-4721-9a2f-204d1874fae8[0m
[32m[I 2022-10-19 08:13:27,835][0m Trial 0 finished with value: 0.7444540109174858 and parameters: {'classifier': 'rf', 'rf_n_estimators': 201, 'rf_criterion': 'entropy', 'rf_max_depth': 3, 'rf_min_samples_split': 0.1633299799462796}. Best is trial 0 with value: 0.7444540109174858.[0m
[32m[I 2022-10-19 08:13:28,675][0m Trial 1 finished with value: 0.7535292616066737 and parameters: {'classifier': 'xgb', 'xgb_eta': 0.15187700453016306, 'xgb_max_depth': 3, 'xgb_min_child_weight': 10}. Best is trial 1 with value: 0.7535292616066737.[0m
[32m[I 2022-10-19 08:13:30,575][0m Trial 2 finished with value: 0.5 and parameters: {'classifier': 'rf', 'rf_n_estimators': 264, 'rf_criterion': 'entropy', 'rf_max_depth': 4, 'rf_min_samples_split': 0.6780127039386767}. Best is trial 1 with value: 0.7535292616066737.[0m
[32m[I 2022-10-19 08:13:31,220][0m Trial 3 finished 

Randomized Sample
RS best params: {'classifier': 'xgb', 'xgb_eta': 0.2760659810861274, 'xgb_max_depth': 2, 'xgb_min_child_weight': 2}
RS best score: 0.7541784188737118

hist    44
xgb     31
rf      25
Name: params_classifier, dtype: int64
                       mean       std
params_classifier                    
hist               0.703915  0.012941
rf                 0.627549  0.118302
xgb                0.735063  0.012640


In [None]:
dv = DictVectorizer(sparse=False, sort=False)
X_dict = X.to_dict(orient="records")
X_test_dict = X_test.to_dict(orient="records")
X = dv.fit_transform(X_dict)
X_test = dv.transform(X_test_dict)

In [21]:
model = xgb.XGBClassifier(
   eta=0.2760659810861274,
   max_depth=2,
   min_child_weight=2
)
model.fit(X, y)

y_pred = model.predict(X_test)

print(f"AUC-test: {metrics.roc_auc_score(y_test, y_pred): .3f}")

AUC-test:  0.638
