In [43]:
# Data
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder

# Models
from catboost import CatBoostClassifier
from xgboost import XGBClassifier, DMatrix
import xgboost as xgb
from sklearn.model_selection import StratifiedKFold
import matplotlib.pyplot as plt
import seaborn as sns
import optuna
from sklearn.metrics import f1_score

In [2]:
root_folder = "../data/"
# root_folder = "/kaggle/input/playground-series-s3e22/"

In [3]:
train = pd.read_csv(root_folder + 'train.csv')
test = pd.read_csv(root_folder + 'test.csv')
sample_submission = pd.read_csv(root_folder + 'sample_submission.csv')

In [4]:
train.isnull().sum()

Unnamed: 0               0
surgery                  0
age                      0
hospital_number          0
rectal_temp              0
pulse                    0
respiratory_rate         0
temp_of_extremities      0
peripheral_pulse         0
mucous_membrane          0
capillary_refill_time    0
pain                     0
peristalsis              0
abdominal_distention     0
nasogastric_tube         0
nasogastric_reflux       0
nasogastric_reflux_ph    0
rectal_exam_feces        0
abdomen                  0
packed_cell_volume       0
total_protein            0
abdomo_appearance        0
abdomo_protein           0
surgical_lesion          0
lesion_1                 0
lesion_2                 0
lesion_3                 0
outcome                  0
dtype: int64

In [5]:
cat_features = train.select_dtypes(include=['object']).columns.tolist()[:-1]
num_features = [i for i in train.columns if i not in cat_features]

In [6]:
cat_features

['peristalsis',
 'abdominal_distention',
 'nasogastric_tube',
 'nasogastric_reflux',
 'nasogastric_reflux_ph',
 'rectal_exam_feces',
 'abdomen',
 'packed_cell_volume',
 'total_protein',
 'abdomo_appearance',
 'abdomo_protein',
 'surgical_lesion',
 'lesion_1',
 'lesion_2',
 'lesion_3']

In [7]:
X_train = train.drop(columns=['outcome'])
y_train = train['outcome']

In [8]:
X_train

Unnamed: 0.1,Unnamed: 0,surgery,age,hospital_number,rectal_temp,pulse,respiratory_rate,temp_of_extremities,peripheral_pulse,mucous_membrane,...,rectal_exam_feces,abdomen,packed_cell_volume,total_protein,abdomo_appearance,abdomo_protein,surgical_lesion,lesion_1,lesion_2,lesion_3
0,0,-0.313086,-0.129621,1.801773,-0.368142,1.093380,0.702421,-0.483319,0.068659,-0.298737,...,more_3_sec,depressed,absent,slight,slight,less_1_liter,decreased,distend_small,serosanguious,yes
1,1,-0.310258,-0.890705,0.289582,-1.097830,-1.230314,-1.576449,1.598010,-0.812647,-0.298921,...,more_3_sec,mild_pain,absent,moderate,none,more_1_liter,absent,distend_small,serosanguious,yes
2,2,-0.313226,0.124074,1.389357,-0.124913,-0.455749,-1.196638,-0.562072,0.068659,0.237648,...,less_3_sec,extreme_pain,hypomotile,moderate,slight,none,absent,distend_large,serosanguious,yes
3,3,3.177363,-1.398095,-0.260306,-0.003299,-1.230314,0.322609,-0.539571,0.383411,-0.298921,...,more_3_sec,mild_pain,hypomotile,moderate,slight,more_1_liter,decreased,distend_small,cloudy,yes
4,4,3.204717,-0.256468,-0.947666,1.091232,1.351568,-0.247108,-0.528321,-0.434944,-0.705212,...,less_3_sec,alert,hypomotile,none,slight,less_1_liter,normal,normal,cloudy,no
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1230,1230,-0.309218,0.377769,1.698669,1.091232,-1.230314,0.702421,1.673013,-0.812647,-0.299289,...,more_3_sec,depressed,absent,moderate,none,more_1_liter,absent,distend_large,serosanguious,yes
1231,1231,-0.314142,-0.890705,-0.672722,1.212846,-0.713937,-1.386544,-0.562072,0.194560,-0.298737,...,less_3_sec,mild_pain,hypomotile,slight,slight,none,decreased,distend_small,serosanguious,yes
1232,1232,-0.313319,-0.890705,0.152110,0.604774,-0.713937,-0.911779,-0.580823,2.334874,-0.631608,...,less_3_sec,mild_pain,hypomotile,slight,slight,none,increased,firm,cloudy,yes
1233,1233,-0.309559,-0.129621,-0.329042,-0.854601,-1.230314,0.797374,1.973025,-0.812647,-0.298737,...,less_3_sec,mild_pain,hypomotile,slight,none,more_1_liter,absent,distend_small,cloudy,yes


# Catboost

In [35]:
def fit_catboost(trial, train, val):
    X_train, y_train = train
    X_val, y_val = val

    param = {
        "iterations": 500, # Можно не перебирать, есть Early-Stopping
        "learning_rate": trial.suggest_float("learning_rate", 0.001, 0.1),
        "l2_leaf_reg": trial.suggest_int("l2_leaf_reg", 2, 50),
        "colsample_bylevel": trial.suggest_float("colsample_bylevel", 0.01, 0.8),
        
        "auto_class_weights": trial.suggest_categorical("auto_class_weights", ["SqrtBalanced", "Balanced", "None"]),
        "depth": trial.suggest_int("depth", 3, 9),
        
        "boosting_type": trial.suggest_categorical("boosting_type", ["Ordered", "Plain"]),
        "bootstrap_type": trial.suggest_categorical("bootstrap_type", ["Bayesian", "Bernoulli", "MVS"]),
        "eval_metric": "TotalF1", # Тоже стоит заранее определиться
    }

    
    if param["bootstrap_type"] == "Bayesian":
        param["bagging_temperature"] = trial.suggest_float("bagging_temperature", 0, 20)
        
    elif param["bootstrap_type"] == "Bernoulli":
        param["subsample"] = trial.suggest_float("subsample", 0.1, 1)
        

    clf = CatBoostClassifier(
        **param,
        thread_count=-1,
        random_seed=42,
        cat_features=cat_features,
    )

    clf.fit(
        X_train,
        y_train,
        eval_set=(X_val, y_val),
        verbose=0,
        plot=False,
        early_stopping_rounds=30,
    )

    y_pred = clf.predict(X_val)
    return clf, y_pred

In [36]:
def objective(trial, return_models=False):
    n_splits = 5
    kf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
    # X_train = train.drop(columns=['outcome'])
    # y_train = train["outcome"]

    scores, models = [], []
    
    for train_idx, valid_idx in kf.split(X_train, y_train):
        train_data = X_train.iloc[train_idx, :], y_train.iloc[train_idx]
        valid_data = X_train.iloc[valid_idx, :], y_train.iloc[valid_idx]

        # Подаем trials для перебора
        model, y_pred = fit_catboost(trial, train_data, valid_data) # Определили выше
        scores.append(f1_score(y_pred, valid_data[1], average='micro'))
        models.append(model)

    result = np.mean(scores) - np.std(scores)

    if trial.should_prune():
            raise optuna.TrialPruned()
    
    if return_models:
        return result, models
    return result

In [38]:
study = optuna.create_study(direction="maximize",
    pruner=optuna.pruners.HyperbandPruner(
        min_resource=1, max_resource=100, reduction_factor=3
    ),)
study.optimize(objective,
    n_trials=1000,
    n_jobs = -1,
    show_progress_bar=True,
)

[I 2023-09-25 23:25:39,660] A new study created in memory with name: no-name-f143e279-333f-49a2-b19d-ea7a8b88ffba


  0%|          | 0/1000 [00:00<?, ?it/s]

[I 2023-09-25 23:25:46,363] Trial 14 finished with value: 0.6199249330842874 and parameters: {'learning_rate': 0.028318665698051332, 'l2_leaf_reg': 20, 'colsample_bylevel': 0.5066725287081372, 'auto_class_weights': 'Balanced', 'depth': 3, 'boosting_type': 'Plain', 'bootstrap_type': 'Bayesian', 'bagging_temperature': 5.339105925205214}. Best is trial 14 with value: 0.6199249330842874.
[I 2023-09-25 23:25:46,592] Trial 31 finished with value: 0.6483975538963785 and parameters: {'learning_rate': 0.006882014920086926, 'l2_leaf_reg': 45, 'colsample_bylevel': 0.34494368204015263, 'auto_class_weights': 'Balanced', 'depth': 4, 'boosting_type': 'Plain', 'bootstrap_type': 'Bernoulli', 'subsample': 0.4892295021272044}. Best is trial 31 with value: 0.6483975538963785.
[I 2023-09-25 23:25:49,056] Trial 29 finished with value: 0.6587462955151178 and parameters: {'learning_rate': 0.09594621308224878, 'l2_leaf_reg': 28, 'colsample_bylevel': 0.515569170362249, 'auto_class_weights': 'SqrtBalanced', 'dep

# XGBoost

In [22]:
y_train_encoded = LabelEncoder().fit_transform(y_train)

In [39]:
def fit_xgboost(trial, train, val):
    X_train, y_train = train
    X_val, y_val = val

    dtrain = DMatrix(data=X_train, label=y_train, nthread=-1, enable_categorical=True)
    dtest = DMatrix(data=X_val, label=y_val, nthread=-1, enable_categorical=True)
    evallist = [(dtest, 'eval'), (dtrain, 'train')]

    # param = {
    #     'lambda': trial.suggest_float('lambda', 1e-3, 10.0),
    #     'alpha': trial.suggest_float('alpha', 1e-3, 10.0),
    #     'colsample_bytree': trial.suggest_categorical('colsample_bytree', [0.3,0.4,0.5,0.6,0.7,0.8,0.9, 1.0]),
    #     'subsample': trial.suggest_categorical('subsample', [0.4,0.5,0.6,0.7,0.8,1.0]),
    #     'learning_rate': trial.suggest_categorical('learning_rate', [0.008,0.01,0.012,0.014,0.016,0.018, 0.02]),
    #     'n_estimators': 1000,
    #     'max_depth': trial.suggest_categorical('max_depth', [5,7,9,11,13,15,17]),
    #     'random_state': trial.suggest_categorical('random_state', [2020]),
    #     'min_child_weight': trial.suggest_int('min_child_weight', 1, 300),
    #     'booster': 'dart',
    #     'sample_type': 'uniform',
    #     'rate_drop': '0.1',
    #     'enable_categorical': True,
    # }

    params = {
        'max_depth': trial.suggest_int('max_depth', 1, 9),
        'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 1.0),
        'n_estimators': trial.suggest_int('n_estimators', 50, 500),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'gamma': trial.suggest_loguniform('gamma', 1e-8, 1.0),
        'subsample': trial.suggest_loguniform('subsample', 0.01, 1.0),
        'colsample_bytree': trial.suggest_loguniform('colsample_bytree', 0.01, 1.0),
        'reg_alpha': trial.suggest_loguniform('reg_alpha', 1e-8, 1.0),
        'reg_lambda': trial.suggest_loguniform('reg_lambda', 1e-8, 1.0),
        'eval_metric': 'mlogloss',
        'use_label_encoder': False
    }

    xgb.train(params, dtrain, 100, evallist)

    clf.fit(X_train, y_train, eval_set=[(X_val, y_val)], early_stopping_rounds=100, verbose=False)

    
    y_pred = clf.predict(X_val)
    return clf, y_pred

In [40]:
def objective(trial, return_models=False):
    n_splits = 5
    kf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
    # X_train = train.drop(columns=['outcome'])
    # y_train = train["outcome"]

    scores, models = [], []
    
    for train_idx, valid_idx in kf.split(X_train, y_train_encoded):
        train_data = X_train.iloc[train_idx, :], y_train_encoded[train_idx]
        valid_data = X_train.iloc[valid_idx, :], y_train_encoded[valid_idx]

        # Подаем trials для перебора
        model, y_pred = fit_xgboost(trial, train_data, valid_data) # Определили выше
        scores.append(f1_score(y_pred, valid_data[1], average='micro'))
        models.append(model)

    result = np.mean(scores) - np.std(scores)

    if trial.should_prune():
            raise optuna.TrialPruned()
    
    if return_models:
        return result, models
    return result

In [41]:
study = optuna.create_study(direction="maximize",
    pruner=optuna.pruners.HyperbandPruner(
        min_resource=1, max_resource=100, reduction_factor=3
    ),)
study.optimize(objective,
    n_trials=1000,
    n_jobs = -1,
    show_progress_bar=True,
)

[I 2023-09-25 23:34:22,189] A new study created in memory with name: no-name-cff7da93-d72d-4493-826e-715fd2a95ad8


  0%|          | 0/1000 [00:00<?, ?it/s]

  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 1.0),
  'gamma': trial.suggest_loguniform('gamma', 1e-8, 1.0),
  'subsample': trial.suggest_loguniform('subsample', 0.01, 1.0),
  'colsample_bytree': trial.suggest_loguniform('colsample_bytree', 0.01, 1.0),
  'reg_alpha': trial.suggest_loguniform('reg_alpha', 1e-8, 1.0),
  'reg_lambda': trial.suggest_loguniform('reg_lambda', 1e-8, 1.0),
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 1.0),
  'gamma': trial.suggest_loguniform('gamma', 1e-8, 1.0),
  'subsample': trial.suggest_loguniform('subsample', 0.01, 1.0),
  'colsample_bytree': trial.suggest_loguniform('colsample_bytree', 0.01, 1.0),
  'reg_alpha': trial.suggest_loguniform('reg_alpha', 1e-8, 1.0),
  'reg_lambda': trial.suggest_loguniform('reg_lambda', 1e-8, 1.0),
  or is_sparse(dtype)
  or (is_categorical_dtype(dtype) and enable_categorical)
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 1.0),
  'gamma': trial.suggest_loguniform

[W 2023-09-25 23:34:22,260] Trial 2 failed with parameters: {'max_depth': 1, 'learning_rate': 0.1007527774027846, 'n_estimators': 500, 'min_child_weight': 4, 'gamma': 7.998242849153636e-06, 'subsample': 0.07439302515004169, 'colsample_bytree': 0.20517148855938602, 'reg_alpha': 0.02441514283631517, 'reg_lambda': 0.011618618150486568} because of the following error: ValueError('DataFrame.dtypes for data must be int, float, bool or category. When categorical type is supplied, The experimental DMatrix parameter`enable_categorical` must be set to `True`.  Invalid columns:peristalsis: object, abdominal_distention: object, nasogastric_tube: object, nasogastric_reflux: object, nasogastric_reflux_ph: object, rectal_exam_feces: object, abdomen: object, packed_cell_volume: object, total_protein: object, abdomo_appearance: object, abdomo_protein: object, surgical_lesion: object, lesion_1: object, lesion_2: object, lesion_3: object').
Traceback (most recent call last):
  File "/home/artem/projects/

  or is_sparse(dtype)
  or (is_categorical_dtype(dtype) and enable_categorical)
  or is_sparse(dtype)
  or (is_categorical_dtype(dtype) and enable_categorical)
  or is_sparse(dtype)
  or (is_categorical_dtype(dtype) and enable_categorical)
  or is_sparse(dtype)
  or (is_categorical_dtype(dtype) and enable_categorical)


[W 2023-09-25 23:34:22,336] Trial 7 failed with parameters: {'max_depth': 1, 'learning_rate': 0.010149224637359289, 'n_estimators': 242, 'min_child_weight': 9, 'gamma': 2.726181053989389e-05, 'subsample': 0.014126018946652762, 'colsample_bytree': 0.023807178441620606, 'reg_alpha': 0.01739278484301115, 'reg_lambda': 0.006072133236483862} because of the following error: ValueError('DataFrame.dtypes for data must be int, float, bool or category. When categorical type is supplied, The experimental DMatrix parameter`enable_categorical` must be set to `True`.  Invalid columns:peristalsis: object, abdominal_distention: object, nasogastric_tube: object, nasogastric_reflux: object, nasogastric_reflux_ph: object, rectal_exam_feces: object, abdomen: object, packed_cell_volume: object, total_protein: object, abdomo_appearance: object, abdomo_protein: object, surgical_lesion: object, lesion_1: object, lesion_2: object, lesion_3: object').
Traceback (most recent call last):
  File "/home/artem/proje

ValueError: DataFrame.dtypes for data must be int, float, bool or category. When categorical type is supplied, The experimental DMatrix parameter`enable_categorical` must be set to `True`.  Invalid columns:peristalsis: object, abdominal_distention: object, nasogastric_tube: object, nasogastric_reflux: object, nasogastric_reflux_ph: object, rectal_exam_feces: object, abdomen: object, packed_cell_volume: object, total_protein: object, abdomo_appearance: object, abdomo_protein: object, surgical_lesion: object, lesion_1: object, lesion_2: object, lesion_3: object