In [1]:
!pip install rdkit

Collecting rdkit
  Downloading rdkit-2024.9.6-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.0 kB)
Downloading rdkit-2024.9.6-cp311-cp311-manylinux_2_28_x86_64.whl (34.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m34.3/34.3 MB[0m [31m48.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: rdkit
Successfully installed rdkit-2024.9.6


In [73]:
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem import Descriptors
from rdkit.ML.Descriptors.MoleculeDescriptors import MolecularDescriptorCalculator

import numpy as np
import pandas as pd
import sklearn as sk
from sklearn.model_selection import train_test_split
from sklearn import metrics

In [5]:
melanin_df = pd.read_csv('/content/melanin.csv')

In [6]:
melanin_df.head()

Unnamed: 0,SMILES,Class
0,CCN(CC)CCNC(=O)c1ccc(cc1)N.Cl,1
1,COCCNC(=O)CN1C2CCC1CC(C2)(c3cccnc3)O,1
2,CC1=NN=C(c2cc3c(cc2C1)OCO3)c4ccc(cc4)N,1
3,CC1C2Cc3ccc(cc3C1(CCN2CC=C)C)O,1
4,COc1ccc(cc1)c2coc3cc(ccc3c2=O)OC,1


In [95]:
melanin_df.Class.value_counts()

Unnamed: 0_level_0,count
Class,Unnamed: 1_level_1
1,607
0,173


In [7]:
descriptor_list = Descriptors.descList
descriptors = []

for descriptor in descriptor_list:
      descriptors.append(descriptor[0])
def get_descriptor_values(mol, descriptors):
    calc = MolecularDescriptorCalculator(descriptors)
    ds = calc.CalcDescriptors(mol)
    return ds[0]
for i in descriptors:
    melanin_df[i] = pd.Series(np.array([get_descriptor_values(Chem.MolFromSmiles(j), [i]) for j in melanin_df["SMILES"]]), index=melanin_df.index)

In [12]:
melanin_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 780 entries, 0 to 779
Columns: 219 entries, SMILES to fr_urea
dtypes: float64(107), int64(111), object(1)
memory usage: 1.3+ MB


In [15]:
single_value_columns = melanin_df.columns[melanin_df.nunique() == 1]
melanin_df[single_value_columns]

Unnamed: 0,NumRadicalElectrons,SMR_VSA8,SlogP_VSA9,fr_aldehyde,fr_azide,fr_azo,fr_barbitur,fr_diazo,fr_isocyan,fr_isothiocyan,fr_lactam,fr_nitroso,fr_phos_acid,fr_phos_ester,fr_prisulfonamd,fr_thiocyan
0,0,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
775,0,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0
776,0,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0
777,0,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0
778,0,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [16]:
melanin_df.drop(columns=single_value_columns, inplace=True)

In [17]:
melanin_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 780 entries, 0 to 779
Columns: 203 entries, SMILES to fr_urea
dtypes: float64(105), int64(97), object(1)
memory usage: 1.2+ MB


In [18]:
melanin_df.head()

Unnamed: 0,SMILES,Class,MaxAbsEStateIndex,MaxEStateIndex,MinAbsEStateIndex,MinEStateIndex,qed,SPS,MolWt,HeavyAtomMolWt,...,fr_quatN,fr_sulfide,fr_sulfonamd,fr_sulfone,fr_term_acetylene,fr_tetrazole,fr_thiazole,fr_thiophene,fr_unbrch_alkane,fr_urea
0,CCN(CC)CCNC(=O)c1ccc(cc1)N.Cl,1,11.743677,11.743677,0.0,-0.0443,0.775469,9.944444,271.792,249.616,...,0,0,0,0,0,0,0,0,0,0
1,COCCNC(=O)CN1C2CCC1CC(C2)(c3cccnc3)O,1,12.055486,12.055486,0.033118,-0.816349,0.753573,30.347826,319.405,294.205,...,0,0,0,0,0,0,0,0,0,0
2,CC1=NN=C(c2cc3c(cc2C1)OCO3)c4ccc(cc4)N,1,5.773582,5.773582,0.263537,0.263537,0.821909,15.681818,293.326,278.206,...,0,0,0,0,0,0,0,0,0,0
3,CC1C2Cc3ccc(cc3C1(CCN2CC=C)C)O,1,9.798686,9.798686,0.203036,0.203036,0.822957,33.789474,257.377,234.193,...,0,0,0,0,0,0,0,0,0,0
4,COc1ccc(cc1)c2coc3cc(ccc3c2=O)OC,1,12.550924,12.550924,0.065676,-0.065676,0.737634,10.571429,282.295,268.183,...,0,0,0,0,0,0,0,0,0,0


In [21]:
melanin_df.Class.value_counts()

Unnamed: 0_level_0,count
Class,Unnamed: 1_level_1
1,607
0,173


In [30]:
corr_matrix = melanin_df.drop(columns=['SMILES', 'Class']).corr().abs()

In [31]:
corr_matrix

Unnamed: 0,MaxAbsEStateIndex,MaxEStateIndex,MinAbsEStateIndex,MinEStateIndex,qed,SPS,MolWt,HeavyAtomMolWt,ExactMolWt,NumValenceElectrons,...,fr_quatN,fr_sulfide,fr_sulfonamd,fr_sulfone,fr_term_acetylene,fr_tetrazole,fr_thiazole,fr_thiophene,fr_unbrch_alkane,fr_urea
MaxAbsEStateIndex,1.000000,1.000000,0.549487,0.517814,0.148163,0.145336,0.368054,0.374870,0.368826,0.385975,...,0.068537,0.057079,0.132178,0.073312,0.036169,0.012941,0.022827,0.033263,0.010493,0.094043
MaxEStateIndex,1.000000,1.000000,0.549487,0.517814,0.148163,0.145336,0.368054,0.374870,0.368826,0.385975,...,0.068537,0.057079,0.132178,0.073312,0.036169,0.012941,0.022827,0.033263,0.010493,0.094043
MinAbsEStateIndex,0.549487,0.549487,1.000000,0.364339,0.034567,0.103524,0.278091,0.278769,0.277457,0.265366,...,0.067192,0.063719,0.070668,0.033836,0.033334,0.008988,0.079464,0.116182,0.042911,0.024510
MinEStateIndex,0.517814,0.517814,0.364339,1.000000,0.167352,0.111044,0.405698,0.419271,0.406025,0.381325,...,0.017349,0.066786,0.493778,0.256334,0.013940,0.005433,0.035509,0.025377,0.020185,0.031224
qed,0.148163,0.148163,0.034567,0.167352,1.000000,0.176124,0.490615,0.495097,0.491267,0.487543,...,0.146499,0.055386,0.008351,0.019270,0.024371,0.024964,0.036488,0.010505,0.203508,0.024415
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
fr_tetrazole,0.012941,0.012941,0.008988,0.005433,0.024964,0.011923,0.022276,0.025766,0.022334,0.018236,...,0.005913,0.013489,0.010123,0.005913,0.003150,1.000000,0.011071,0.075074,0.007978,0.009275
fr_thiazole,0.022827,0.022827,0.079464,0.035509,0.036488,0.073606,0.023909,0.010955,0.023820,0.064786,...,0.016955,0.038678,0.099615,0.016955,0.009034,0.011071,1.000000,0.054805,0.022875,0.026595
fr_thiophene,0.033263,0.033263,0.116182,0.025377,0.010505,0.062258,0.035183,0.022801,0.035300,0.092893,...,0.022151,0.000195,0.090126,0.036617,0.011802,0.075074,0.054805,1.000000,0.029885,0.041169
fr_unbrch_alkane,0.010493,0.010493,0.042911,0.020185,0.203508,0.062677,0.021859,0.041967,0.021805,0.016007,...,0.328123,0.001506,0.020916,0.012217,0.006509,0.007978,0.022875,0.029885,1.000000,0.008173


In [33]:
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(np.bool_))
to_drop = [column for column in upper.columns if any(upper[column] > 0.90)]

melanin_df.drop(to_drop, axis=1, inplace=True)


In [43]:
X = melanin_df.drop(columns=['SMILES', 'Class'])
y = melanin_df['Class']

In [48]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=1488)

In [51]:
from xgboost import XGBClassifier

In [184]:
ratio_of_classes = y_train.value_counts()[0] / y_train.value_counts()[1]
model = XGBClassifier(
    scale_pos_weight=ratio_of_classes,
    max_depth=5,
    subsample=0.8,
    colsample_bytree=0.9
)


In [158]:
model.fit(X_train, y_train)

In [159]:
y_pred = model.predict(X_test)

# Оценка модели с помощью метрик
accuracy = metrics.accuracy_score(y_test, y_pred)
precision = metrics.precision_score(y_test, y_pred)
recall = metrics.recall_score(y_test, y_pred)
f1 = metrics.f1_score(y_test, y_pred)
roc_auc = metrics.roc_auc_score(y_test, y_pred)

# Вывод результатов
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")
print(f"ROC AUC: {roc_auc:.4f}")

Accuracy: 0.7821
Precision: 0.8425
Recall: 0.8843
F1 Score: 0.8629
ROC AUC: 0.6564


In [85]:
print(metrics.classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.52      0.43      0.47        35
           1       0.84      0.88      0.86       121

    accuracy                           0.78       156
   macro avg       0.68      0.66      0.67       156
weighted avg       0.77      0.78      0.77       156



In [82]:
model

In [90]:
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.ensemble import RandomForestClassifier


params = {'max_depth':[5,10,20],'min_samples_split':[2,8,32],'min_samples_leaf':[1,2,5,10],'n_estimators':[50,100,200]}
cv = StratifiedKFold(n_splits=10, shuffle=False)
gs = GridSearchCV(RandomForestClassifier(class_weight="balanced"), params, cv=cv,verbose=3,refit=True)
gs.fit(X_train, y_train)

print('Best score: %0.2f',gs.best_score_)
print('Training set performance using best parameters (%s)', gs.best_params_)
best_morgan_treemodel = gs.best_estimator_
#training set evaluation
best_morgan_tree_prediction = best_morgan_treemodel.predict(X_test)

Fitting 10 folds for each of 108 candidates, totalling 1080 fits
[CV 1/10] END max_depth=5, min_samples_leaf=1, min_samples_split=2, n_estimators=50;, score=0.810 total time=   0.2s
[CV 2/10] END max_depth=5, min_samples_leaf=1, min_samples_split=2, n_estimators=50;, score=0.810 total time=   0.2s
[CV 3/10] END max_depth=5, min_samples_leaf=1, min_samples_split=2, n_estimators=50;, score=0.698 total time=   0.2s
[CV 4/10] END max_depth=5, min_samples_leaf=1, min_samples_split=2, n_estimators=50;, score=0.778 total time=   0.2s
[CV 5/10] END max_depth=5, min_samples_leaf=1, min_samples_split=2, n_estimators=50;, score=0.823 total time=   0.2s
[CV 6/10] END max_depth=5, min_samples_leaf=1, min_samples_split=2, n_estimators=50;, score=0.855 total time=   0.2s
[CV 7/10] END max_depth=5, min_samples_leaf=1, min_samples_split=2, n_estimators=50;, score=0.806 total time=   0.2s
[CV 8/10] END max_depth=5, min_samples_leaf=1, min_samples_split=2, n_estimators=50;, score=0.758 total time=   0.2s

In [153]:
print(metrics.classification_report(y_test,best_morgan_tree_prediction))
roc_auc = metrics.roc_auc_score(y_test, best_morgan_tree_prediction)
roc_auc

              precision    recall  f1-score   support

           0       0.63      0.49      0.55        35
           1       0.86      0.92      0.89       121

    accuracy                           0.82       156
   macro avg       0.75      0.70      0.72       156
weighted avg       0.81      0.82      0.81       156



np.float64(0.7015348288075561)

In [112]:
from sklearn.tree import DecisionTreeClassifier

In [113]:
dtc = DecisionTreeClassifier(class_weight="balanced", random_state=1488)

In [114]:
dtc.fit(X_train, y_train)
y_pred = dtc.predict(X_test)
print(metrics.classification_report(y_test, y_pred))

In [121]:
unique_counts = X.nunique()
binary = unique_counts[unique_counts == 2]

In [129]:
from imblearn.over_sampling import SMOTENC

In [143]:
cat_features = list(X.select_dtypes(int).columns)

In [144]:
sm = SMOTENC(categorical_features=cat_features, random_state=1488)

In [145]:
X_sm, y_sm = sm.fit_resample(X_train, y_train)

In [152]:
X_sm

Unnamed: 0,MaxAbsEStateIndex,MinAbsEStateIndex,MinEStateIndex,qed,SPS,MolWt,MaxPartialCharge,MinPartialCharge,FpDensityMorgan1,BCUT2D_MWHI,...,fr_quatN,fr_sulfide,fr_sulfonamd,fr_sulfone,fr_term_acetylene,fr_tetrazole,fr_thiazole,fr_thiophene,fr_unbrch_alkane,fr_urea
0,10.003023,0.000000,-0.662199,0.749451,13.000000,313.656000,0.091513,-0.396312,1.277778,35.496885,...,0,0,0,0,0,0,0,0,0,0
1,12.675291,0.147182,-2.738426,0.173133,11.405405,527.611000,0.336419,-0.481139,0.810811,16.558077,...,0,0,0,0,0,0,0,0,0,0
2,11.792002,0.004322,-0.464520,0.573089,10.714286,282.259000,0.268969,-0.345791,1.142857,16.628232,...,0,0,0,0,0,0,0,0,0,0
3,11.913430,0.075607,-0.122546,0.884685,15.000000,246.310000,0.243512,-0.333366,1.333333,16.160400,...,0,0,0,0,0,0,0,0,0,0
4,6.268670,0.675833,0.675833,0.755798,10.578947,249.317000,0.134893,-0.383059,0.894737,15.315194,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
967,8.056683,0.258464,-0.104681,0.703776,10.758987,245.838165,0.238022,-0.416131,1.008579,22.135686,...,0,0,0,0,0,0,0,0,0,0
968,5.264383,0.135847,0.135847,0.434401,10.624535,245.195610,0.183363,-0.375722,1.410967,31.986077,...,0,0,0,0,0,0,0,0,0,0
969,6.209065,0.458705,0.458705,0.687847,14.076473,332.659323,0.047834,-0.322050,1.046698,35.495703,...,0,0,0,0,0,0,0,0,0,0
970,13.082905,0.172586,-0.715333,0.841986,18.665255,352.719792,0.278770,-0.451216,1.242659,18.218912,...,0,0,0,0,0,0,0,0,0,0


In [160]:
!pip install optuna
!pip install optuna-integration[xgboost]

Collecting optuna
  Downloading optuna-4.2.1-py3-none-any.whl.metadata (17 kB)
Collecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.15.2-py3-none-any.whl.metadata (7.3 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.9.0-py3-none-any.whl.metadata (10 kB)
Downloading optuna-4.2.1-py3-none-any.whl (383 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m383.6/383.6 kB[0m [31m8.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading alembic-1.15.2-py3-none-any.whl (231 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m231.9/231.9 kB[0m [31m16.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorlog-6.9.0-py3-none-any.whl (11 kB)
Installing collected packages: colorlog, alembic, optuna
Successfully installed alembic-1.15.2 colorlog-6.9.0 optuna-4.2.1
Collecting optuna-integration[xgboost]
  Downloading optuna_integration-4.2.1-py3-none-any.whl.metadata (12 kB)
Downloading optuna_integration-4.2.1-py3-none-any.whl (97 kB)
[2K   [90m━━

In [162]:
ratio_of_classes = y_train.value_counts()[0] / y_train.value_counts()[1]
model = XGBClassifier(
    scale_pos_weight=ratio_of_classes,
    max_depth=5,
    subsample=0.8,
    colsample_bytree=0.9,
    enable_categorical=True
)

In [163]:
metric = 'auc'
base_params = {
    'objective': 'binary:logistic',
    'eval_metric': metric,
    'enable_categorical': True,
}

In [166]:
import time

In [257]:
scale_pos_weight

np.float64(0.2839506172839506)

In [258]:
def objective(trial):
    params = {
        'tree_method': trial.suggest_categorical('tree_method', ['approx', 'hist']),
        'max_depth': trial.suggest_int('max_depth', 3, 12),
        'min_child_weight': trial.suggest_int('min_child_weight', 5, 12),
        'subsample': trial.suggest_float('subsample', 0.1, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.1, 1.0),
        'reg_lambda': trial.suggest_float('reg_lambda', 0.001, 0.5, log=True),
        'n_estimators': trial.suggest_int('n_estimators', 10000, 10000),
        'early_stopping_rounds': trial.suggest_int('early_stopping_rounds', 50, 50),
        'scale_pos_weight': trial.suggest_float('scale_pos_weight', scale_pos_weight, scale_pos_weight)
    }
    params.update(base_params)

    # Add pruning callback
    pruning_callback = optuna.integration.XGBoostPruningCallback(trial, f'validation_1-{metric}')

    # Initialize XGBClassifier
    model = XGBClassifier(callbacks=[pruning_callback], **params)

    # Train the model
    model.fit(
        X_train, y_train,
        eval_set=[(X_train, y_train), (X_test, y_test)],
        verbose=0
    )

    # Save the best iteration for reference
    trial.set_user_attr('best_iteration', model.best_iteration)

    # Return the validation score
    return model.best_score

In [171]:
import optuna

In [259]:
sampler = optuna.samplers.TPESampler(seed=1488)
study = optuna.create_study(direction='maximize', sampler=sampler)
tic = time.time()
while time.time() - tic < 100:
    study.optimize(objective, n_trials=1)

[I 2025-03-29 15:17:11,993] A new study created in memory with name: no-name-fdae1fff-f08e-4290-8ef6-66ca1cd18fb5
[I 2025-03-29 15:17:13,069] Trial 0 finished with value: 0.8661157024793389 and parameters: {'tree_method': 'approx', 'max_depth': 11, 'min_child_weight': 8, 'subsample': 0.8889393258968677, 'colsample_bytree': 0.8258044087891845, 'reg_lambda': 0.05023256449955142, 'n_estimators': 10000, 'early_stopping_rounds': 50, 'scale_pos_weight': 0.2839506172839506}. Best is trial 0 with value: 0.8661157024793389.
[I 2025-03-29 15:17:13,797] Trial 1 finished with value: 0.8422668240850059 and parameters: {'tree_method': 'approx', 'max_depth': 7, 'min_child_weight': 12, 'subsample': 0.47368572541632725, 'colsample_bytree': 0.5275802773931773, 'reg_lambda': 0.14804210086916686, 'n_estimators': 10000, 'early_stopping_rounds': 50, 'scale_pos_weight': 0.2839506172839506}. Best is trial 0 with value: 0.8661157024793389.
[I 2025-03-29 15:17:14,530] Trial 2 finished with value: 0.841322314049

In [260]:
print(f'best score = {study.best_trial.value}')
print('boosting params ---------------------------')
print(f'fixed learning rate: {learning_rate}')
print(f'best boosting round: {study.best_trial.user_attrs["best_iteration"]}')
print('best tree params --------------------------')
for k, v in study.best_trial.params.items():
    print(k, ':', v)

best score = 0.8878394332939787
boosting params ---------------------------
fixed learning rate: 0.3
best boosting round: 10
best tree params --------------------------
tree_method : approx
max_depth : 7
min_child_weight : 7
subsample : 0.8673593511918096
colsample_bytree : 0.9802268887260982
reg_lambda : 0.013301789485592347
n_estimators : 10000
early_stopping_rounds : 50
scale_pos_weight : 0.2839506172839506


In [261]:
best_trial = XGBClassifier(**base_params, **study.best_trial.params)

best_trial.fit(
    X_train, y_train,
    eval_set=[(X_train, y_train), (X_test, y_test)],
    verbose=0
)

y_true = y_test
y_pred = best_trial.predict(X_test)
y_score = best_trial.predict_proba(X_test)[:,1]


print(metrics.classification_report(y_true, y_pred))
metrics.roc_auc_score(y_true, y_score)

              precision    recall  f1-score   support

           0       0.55      0.77      0.64        35
           1       0.93      0.82      0.87       121

    accuracy                           0.81       156
   macro avg       0.74      0.79      0.76       156
weighted avg       0.84      0.81      0.82       156



np.float64(0.8878394332939787)

In [209]:
def objective(trial):
    params = {
        'tree_method': trial.suggest_categorical('tree_method', ['approx', 'hist']),
        'max_depth': trial.suggest_int('max_depth', 7, 12),
        'min_child_weight': trial.suggest_int('min_child_weight', 7, 12),
        'subsample': trial.suggest_float('subsample', 0.1, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.1, 1.0),
        'reg_lambda': trial.suggest_float('reg_lambda', 0.001, 0.5, log=True),
        'n_estimators': trial.suggest_int('n_estimators', 10000, 10000),
        'early_stopping_rounds': trial.suggest_int('early_stopping_rounds', 50, 50),
        'scale_pos_weight': trial.suggest_float('scale_pos_weight', scale_pos_weight, scale_pos_weight),

    }
    params.update(base_params)

    # Add pruning callback
    pruning_callback = optuna.integration.XGBoostPruningCallback(trial, f'validation_1-{metric}')

    # Initialize XGBClassifier
    model = XGBClassifier(callbacks=[pruning_callback], **params)

    # Train the model
    model.fit(
        X_train, y_train,
        eval_set=[(X_train, y_train), (X_test, y_test)],
        verbose=0
    )

    # Save the best iteration for reference
    trial.set_user_attr('best_iteration', model.best_iteration)

    # Return the validation score
    return model.best_score

In [210]:
sampler = optuna.samplers.TPESampler(seed=1488)
study = optuna.create_study(direction='maximize', sampler=sampler)
tic = time.time()
while time.time() - tic < 100:
    study.optimize(objective, n_trials=1)

[I 2025-03-29 13:25:24,146] A new study created in memory with name: no-name-c06e0701-b790-4193-9db5-9bf423e270c0
[I 2025-03-29 13:25:24,702] Trial 0 finished with value: 0.8683589138134593 and parameters: {'tree_method': 'approx', 'max_depth': 12, 'min_child_weight': 9, 'subsample': 0.8889393258968677, 'colsample_bytree': 0.8258044087891845, 'reg_lambda': 0.05023256449955142, 'n_estimators': 10000, 'early_stopping_rounds': 50, 'scale_pos_weight': 0.2839506172839506}. Best is trial 0 with value: 0.8683589138134593.
[I 2025-03-29 13:25:27,550] Trial 1 finished with value: 0.8422668240850059 and parameters: {'tree_method': 'approx', 'max_depth': 9, 'min_child_weight': 12, 'subsample': 0.47368572541632725, 'colsample_bytree': 0.5275802773931773, 'reg_lambda': 0.14804210086916686, 'n_estimators': 10000, 'early_stopping_rounds': 50, 'scale_pos_weight': 0.2839506172839506}. Best is trial 0 with value: 0.8683589138134593.
[I 2025-03-29 13:25:28,484] Trial 2 finished with value: 0.835655253837

In [211]:
print(f'best score = {study.best_trial.value}')
print('boosting params ---------------------------')
print(f'fixed learning rate: {learning_rate}')
print(f'best boosting round: {study.best_trial.user_attrs["best_iteration"]}')
print('best tree params --------------------------')
for k, v in study.best_trial.params.items():
    print(k, ':', v)

best score = 0.8819362455726092
boosting params ---------------------------
fixed learning rate: 0.3
best boosting round: 1
best tree params --------------------------
tree_method : hist
max_depth : 7
min_child_weight : 8
subsample : 0.8469379886698255
colsample_bytree : 0.9186831836591831
reg_lambda : 0.0012973104604383952
n_estimators : 10000
early_stopping_rounds : 50
scale_pos_weight : 0.2839506172839506


In [256]:
best_trial = XGBClassifier(**base_params, **study.best_trial.params)

best_trial.fit(
    X_train, y_train,
    eval_set=[(X_train, y_train), (X_test, y_test)],
    verbose=0
)

y_true = y_test
y_pred = best_trial.predict(X_test)
y_score = best_trial.predict_proba(X_test)[:,1]


print(metrics.classification_report(y_true, y_pred))
metrics.roc_auc_score(y_true, y_score)

              precision    recall  f1-score   support

           0       0.56      0.83      0.67        35
           1       0.94      0.81      0.87       121

    accuracy                           0.81       156
   macro avg       0.75      0.82      0.77       156
weighted avg       0.86      0.81      0.83       156



np.float64(0.8819362455726092)