In [25]:
# Data
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder 
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import ColumnTransformer

# Models
from catboost import CatBoostClassifier, Pool
from optuna.integration import CatBoostPruningCallback
from xgboost import XGBClassifier, DMatrix
import xgboost as xgb
from lightgbm import Dataset
import lightgbm as lgbm
from sklearn.model_selection import StratifiedKFold

# Vizualize
import matplotlib.pyplot as plt
# import seaborn as sns
import optuna
from sklearn.metrics import f1_score

In [2]:
root_folder = "../data/"
# root_folder = "/kaggle/input/playground-series-s3e22/"

In [3]:
train = pd.read_csv(root_folder + 'train.csv')
test = pd.read_csv(root_folder + 'test.csv')
sample_submission = pd.read_csv(root_folder + 'sample_submission.csv')

In [4]:
train.isnull().sum()

Unnamed: 0               0
surgery                  0
age                      0
hospital_number          0
rectal_temp              0
pulse                    0
respiratory_rate         0
temp_of_extremities      0
peripheral_pulse         0
mucous_membrane          0
capillary_refill_time    0
pain                     0
peristalsis              0
abdominal_distention     0
nasogastric_tube         0
nasogastric_reflux       0
nasogastric_reflux_ph    0
rectal_exam_feces        0
abdomen                  0
packed_cell_volume       0
total_protein            0
abdomo_appearance        0
abdomo_protein           0
surgical_lesion          0
lesion_1                 0
lesion_2                 0
lesion_3                 0
outcome                  0
dtype: int64

In [30]:
cat_features = train.select_dtypes(include=['object']).columns.tolist()[:-1]
num_features = [i for i in train.columns if i not in cat_features][:-1]

In [31]:
num_features

['Unnamed: 0',
 'surgery',
 'age',
 'hospital_number',
 'rectal_temp',
 'pulse',
 'respiratory_rate',
 'temp_of_extremities',
 'peripheral_pulse',
 'mucous_membrane',
 'capillary_refill_time',
 'pain']

In [7]:
X_train = train.drop(columns=['outcome'])
y_train = train['outcome']

In [8]:
X_train

Unnamed: 0.1,Unnamed: 0,surgery,age,hospital_number,rectal_temp,pulse,respiratory_rate,temp_of_extremities,peripheral_pulse,mucous_membrane,...,rectal_exam_feces,abdomen,packed_cell_volume,total_protein,abdomo_appearance,abdomo_protein,surgical_lesion,lesion_1,lesion_2,lesion_3
0,0,-0.313086,-0.129621,1.801773,-0.368142,1.093380,0.702421,-0.483319,0.068659,-0.298737,...,more_3_sec,depressed,absent,slight,slight,less_1_liter,decreased,distend_small,serosanguious,yes
1,1,-0.310258,-0.890705,0.289582,-1.097830,-1.230314,-1.576449,1.598010,-0.812647,-0.298921,...,more_3_sec,mild_pain,absent,moderate,none,more_1_liter,absent,distend_small,serosanguious,yes
2,2,-0.313226,0.124074,1.389357,-0.124913,-0.455749,-1.196638,-0.562072,0.068659,0.237648,...,less_3_sec,extreme_pain,hypomotile,moderate,slight,none,absent,distend_large,serosanguious,yes
3,3,3.177363,-1.398095,-0.260306,-0.003299,-1.230314,0.322609,-0.539571,0.383411,-0.298921,...,more_3_sec,mild_pain,hypomotile,moderate,slight,more_1_liter,decreased,distend_small,cloudy,yes
4,4,3.204717,-0.256468,-0.947666,1.091232,1.351568,-0.247108,-0.528321,-0.434944,-0.705212,...,less_3_sec,alert,hypomotile,none,slight,less_1_liter,normal,normal,cloudy,no
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1230,1230,-0.309218,0.377769,1.698669,1.091232,-1.230314,0.702421,1.673013,-0.812647,-0.299289,...,more_3_sec,depressed,absent,moderate,none,more_1_liter,absent,distend_large,serosanguious,yes
1231,1231,-0.314142,-0.890705,-0.672722,1.212846,-0.713937,-1.386544,-0.562072,0.194560,-0.298737,...,less_3_sec,mild_pain,hypomotile,slight,slight,none,decreased,distend_small,serosanguious,yes
1232,1232,-0.313319,-0.890705,0.152110,0.604774,-0.713937,-0.911779,-0.580823,2.334874,-0.631608,...,less_3_sec,mild_pain,hypomotile,slight,slight,none,increased,firm,cloudy,yes
1233,1233,-0.309559,-0.129621,-0.329042,-0.854601,-1.230314,0.797374,1.973025,-0.812647,-0.298737,...,less_3_sec,mild_pain,hypomotile,slight,none,more_1_liter,absent,distend_small,cloudy,yes


# Catboost

In [12]:
def fit_catboost(trial, train, val):
    X_train, y_train = train
    X_val, y_val = val

    param = {
        "iterations": 500, # Можно не перебирать, есть Early-Stopping
        "learning_rate": trial.suggest_float("learning_rate", 0.001, 0.1),
        "l2_leaf_reg": trial.suggest_int("l2_leaf_reg", 2, 50),
        "colsample_bylevel": trial.suggest_float("colsample_bylevel", 0.01, 0.8),
        
        "auto_class_weights": trial.suggest_categorical("auto_class_weights", ["SqrtBalanced", "Balanced", "None"]),
        "depth": trial.suggest_int("depth", 3, 9),
        
        "boosting_type": trial.suggest_categorical("boosting_type", ["Ordered", "Plain"]),
        "bootstrap_type": trial.suggest_categorical("bootstrap_type", ["Bayesian", "Bernoulli", "MVS"]),
        "eval_metric": "TotalF1", # Тоже стоит заранее определиться
    }

    
    if param["bootstrap_type"] == "Bayesian":
        param["bagging_temperature"] = trial.suggest_float("bagging_temperature", 0, 20)
        
    elif param["bootstrap_type"] == "Bernoulli":
        param["subsample"] = trial.suggest_float("subsample", 0.1, 1)
        

    clf = CatBoostClassifier(
        **param,
        thread_count=-1,
        random_seed=42,
        cat_features=cat_features,
    )

    pruning_callback = CatBoostPruningCallback(trial, "TotalF1")

    clf.fit(
        X_train,
        y_train,
        eval_set=(X_val, y_val),
        verbose=0,
        plot=False,
        early_stopping_rounds=30,
        callbacks=[pruning_callback],
    )

    y_pred = clf.predict(X_val)
    return clf, y_pred

In [13]:
def objective(trial, return_models=False):
    n_splits = 5
    kf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
    # X_train = train.drop(columns=['outcome'])
    # y_train = train["outcome"]

    scores, models = [], []
    
    for train_idx, valid_idx in kf.split(X_train, y_train):
        train_data = X_train.iloc[train_idx, :], y_train.iloc[train_idx]
        valid_data = X_train.iloc[valid_idx, :], y_train.iloc[valid_idx]

        # Подаем trials для перебора
        model, y_pred = fit_catboost(trial, train_data, valid_data) # Определили выше
        scores.append(f1_score(y_pred, valid_data[1], average='micro'))
        models.append(model)

    result = np.mean(scores) - np.std(scores)

    if trial.should_prune():
            raise optuna.TrialPruned()
    
    if return_models:
        return result, models
    return result

In [None]:
study = optuna.create_study(direction="maximize",
    pruner=optuna.pruners.HyperbandPruner(
        min_resource=5, max_resource="auto", reduction_factor=3
    ),)
study.optimize(objective,
    n_trials=1000,
    n_jobs = -1,
    show_progress_bar=True,
)

[I 2023-09-27 20:07:24,722] A new study created in memory with name: no-name-a80293d2-b81f-4dc9-adef-55585d0dd523


  0%|          | 0/1000 [00:00<?, ?it/s]

  pruning_callback = CatBoostPruningCallback(trial, "TotalF1")
  pruning_callback = CatBoostPruningCallback(trial, "TotalF1")
  pruning_callback = CatBoostPruningCallback(trial, "TotalF1")
  pruning_callback = CatBoostPruningCallback(trial, "TotalF1")
  pruning_callback = CatBoostPruningCallback(trial, "TotalF1")
  pruning_callback = CatBoostPruningCallback(trial, "TotalF1")
  pruning_callback = CatBoostPruningCallback(trial, "TotalF1")
  pruning_callback = CatBoostPruningCallback(trial, "TotalF1")
  pruning_callback = CatBoostPruningCallback(trial, "TotalF1")
  pruning_callback = CatBoostPruningCallback(trial, "TotalF1")
  pruning_callback = CatBoostPruningCallback(trial, "TotalF1")
  pruning_callback = CatBoostPruningCallback(trial, "TotalF1")


[I 2023-09-27 20:07:33,051] Trial 2 finished with value: 0.6537439895485839 and parameters: {'learning_rate': 0.04679252390998169, 'l2_leaf_reg': 47, 'colsample_bylevel': 0.3825509328129426, 'auto_class_weights': 'SqrtBalanced', 'depth': 3, 'boosting_type': 'Plain', 'bootstrap_type': 'Bernoulli', 'subsample': 0.7382173100346546}. Best is trial 2 with value: 0.6537439895485839.


  pruning_callback = CatBoostPruningCallback(trial, "TotalF1")
  pruning_callback = CatBoostPruningCallback(trial, "TotalF1")
  pruning_callback = CatBoostPruningCallback(trial, "TotalF1")
  pruning_callback = CatBoostPruningCallback(trial, "TotalF1")
  pruning_callback = CatBoostPruningCallback(trial, "TotalF1")
  pruning_callback = CatBoostPruningCallback(trial, "TotalF1")


[I 2023-09-27 20:07:36,558] Trial 4 finished with value: 0.6260463030526336 and parameters: {'learning_rate': 0.05089238898444698, 'l2_leaf_reg': 37, 'colsample_bylevel': 0.07570206121001047, 'auto_class_weights': 'None', 'depth': 5, 'boosting_type': 'Plain', 'bootstrap_type': 'Bernoulli', 'subsample': 0.5048154246249362}. Best is trial 2 with value: 0.6537439895485839.


  pruning_callback = CatBoostPruningCallback(trial, "TotalF1")
  pruning_callback = CatBoostPruningCallback(trial, "TotalF1")
  pruning_callback = CatBoostPruningCallback(trial, "TotalF1")
  pruning_callback = CatBoostPruningCallback(trial, "TotalF1")
  pruning_callback = CatBoostPruningCallback(trial, "TotalF1")
  pruning_callback = CatBoostPruningCallback(trial, "TotalF1")
  pruning_callback = CatBoostPruningCallback(trial, "TotalF1")


[I 2023-09-27 20:07:43,800] Trial 1 finished with value: 0.6616577096839154 and parameters: {'learning_rate': 0.07204708110387296, 'l2_leaf_reg': 22, 'colsample_bylevel': 0.12075151621523442, 'auto_class_weights': 'SqrtBalanced', 'depth': 8, 'boosting_type': 'Plain', 'bootstrap_type': 'MVS'}. Best is trial 1 with value: 0.6616577096839154.


  pruning_callback = CatBoostPruningCallback(trial, "TotalF1")


[I 2023-09-27 20:07:47,396] Trial 0 finished with value: 0.6311399879330848 and parameters: {'learning_rate': 0.016199943503262332, 'l2_leaf_reg': 39, 'colsample_bylevel': 0.250456845413745, 'auto_class_weights': 'Balanced', 'depth': 5, 'boosting_type': 'Ordered', 'bootstrap_type': 'MVS'}. Best is trial 1 with value: 0.6616577096839154.




[I 2023-09-27 20:07:48,963] Trial 5 finished with value: 0.6557128790706114 and parameters: {'learning_rate': 0.08751835898410014, 'l2_leaf_reg': 5, 'colsample_bylevel': 0.6982636202949215, 'auto_class_weights': 'SqrtBalanced', 'depth': 4, 'boosting_type': 'Plain', 'bootstrap_type': 'MVS'}. Best is trial 1 with value: 0.6616577096839154.


  pruning_callback = CatBoostPruningCallback(trial, "TotalF1")
  pruning_callback = CatBoostPruningCallback(trial, "TotalF1")


KeyboardInterrupt: 





# XGBoost

In [32]:
categorical_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy='most_frequent')),
    ("onehot", OneHotEncoder(drop="first", handle_unknown="ignore"))
])
numerical_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer()),
    ("scaler", StandardScaler())
])
preprocessor_train = ColumnTransformer(transformers=[
    ("numerical", numerical_transformer, num_features),
    ("categorical", categorical_transformer, cat_features)]
)

In [33]:
X_train = preprocessor_train.fit_transform(X_train)

In [34]:
def fit_xgboost(trial, train, val):
    X_train, y_train = train
    X_val, y_val = val

    dtrain = xgb.DMatrix(X_train, label=y_train)
    dvalid = xgb.DMatrix(X_val, label=y_val)

    param = {
        "verbosity": 0,
        "objective": "binary:logistic",
        "eval_metric": "auc",
        "booster": trial.suggest_categorical("booster", ["gbtree", "gblinear", "dart"]),
        "lambda": trial.suggest_float("lambda", 1e-8, 1.0, log=True),
        "alpha": trial.suggest_float("alpha", 1e-8, 1.0, log=True),
    }

    if param["booster"] == "gbtree" or param["booster"] == "dart":
        param["max_depth"] = trial.suggest_int("max_depth", 1, 9)
        param["eta"] = trial.suggest_float("eta", 1e-8, 1.0, log=True)
        param["gamma"] = trial.suggest_float("gamma", 1e-8, 1.0, log=True)
        param["grow_policy"] = trial.suggest_categorical("grow_policy", ["depthwise", "lossguide"])
    if param["booster"] == "dart":
        param["sample_type"] = trial.suggest_categorical("sample_type", ["uniform", "weighted"])
        param["normalize_type"] = trial.suggest_categorical("normalize_type", ["tree", "forest"])
        param["rate_drop"] = trial.suggest_float("rate_drop", 1e-8, 1.0, log=True)
        param["skip_drop"] = trial.suggest_float("skip_drop", 1e-8, 1.0, log=True)

    # Add a callback for pruning.
    pruning_callback = optuna.integration.XGBoostPruningCallback(trial, "validation-auc")
    clf = xgb.train(param, dtrain, evals=[(dvalid, "validation")], callbacks=[pruning_callback])

    y_pred = clf.predict(X_val)
    return clf, y_pred

In [37]:
def objective(trial, return_models=False):
    n_splits = 5
    kf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
    # X_train = train.drop(columns=['outcome'])
    # y_train = train["outcome"]

    scores, models = [], []
    
    for train_idx, valid_idx in kf.split(X_train, y_train):
        train_data = X_train[train_idx, :], y_train[train_idx]
        valid_data = X_train[valid_idx, :], y_train[valid_idx]

        # Подаем trials для перебора
        model, y_pred = fit_xgboost(trial, train_data, valid_data) # Определили выше
        scores.append(f1_score(y_pred, valid_data[1], average='micro'))
        models.append(model)

    result = np.mean(scores) - np.std(scores)

    if trial.should_prune():
            raise optuna.TrialPruned()
    
    if return_models:
        return result, models
    return result

In [38]:
study = optuna.create_study(direction="maximize",
    pruner=optuna.pruners.HyperbandPruner(
        min_resource=1, max_resource=100, reduction_factor=3
    ),)
study.optimize(objective,
    n_trials=1000,
    n_jobs = -1,
    show_progress_bar=True,
)

[I 2023-09-27 20:14:35,600] A new study created in memory with name: no-name-b188c321-24cc-415f-aed1-9acf27e9f784


  0%|          | 0/1000 [00:00<?, ?it/s]

[W 2023-09-27 20:14:35,671] Trial 3 failed with parameters: {} because of the following error: ValueError("could not convert string to float: 'died'").
Traceback (most recent call last):
  File "/home/artem/projects/ml_learning/competitions/horse_health/.venv/lib/python3.10/site-packages/optuna/study/_optimize.py", line 200, in _run_trial
    value_or_values = func(trial)
  File "/tmp/ipykernel_16883/714867016.py", line 14, in objective
    model, y_pred = fit_xgboost(trial, train_data, valid_data) # Определили выше
  File "/tmp/ipykernel_16883/3096940325.py", line 5, in fit_xgboost
    dtrain = xgb.DMatrix(X_train, label=y_train)
  File "/home/artem/projects/ml_learning/competitions/horse_health/.venv/lib/python3.10/site-packages/xgboost/core.py", line 729, in inner_f
    return func(**kwargs)
  File "/home/artem/projects/ml_learning/competitions/horse_health/.venv/lib/python3.10/site-packages/xgboost/core.py", line 868, in __init__
    self.set_info(
  File "/home/artem/projects/ml_l

ValueError: could not convert string to float: 'died'

# LightGBM

In [None]:
def git_lgbm(trial, train, val):
    X_train, y_train = train
    X_val, y_val = val

    dtrain = lgb.Dataset(X_train, label=y_train)
    dvalid = lgb.Dataset(X_val, label=y_val)

    param = {
        "objective": "binary",
        "metric": "auc",
        "verbosity": -1,
        "boosting_type": "gbdt",
        "lambda_l1": trial.suggest_float("lambda_l1", 1e-8, 10.0, log=True),
        "lambda_l2": trial.suggest_float("lambda_l2", 1e-8, 10.0, log=True),
        "num_leaves": trial.suggest_int("num_leaves", 2, 256),
        "feature_fraction": trial.suggest_float("feature_fraction", 0.4, 1.0),
        "bagging_fraction": trial.suggest_float("bagging_fraction", 0.4, 1.0),
        "bagging_freq": trial.suggest_int("bagging_freq", 1, 7),
        "min_child_samples": trial.suggest_int("min_child_samples", 5, 100),
    }

    # Add a callback for pruning.
    pruning_callback = optuna.integration.LightGBMPruningCallback(trial, "auc")
    clf = lgb.train(param, dtrain, valid_sets=[dvalid], callbacks=[pruning_callback])

    preds = gbm.predict(X_val)
    pred_labels = np.rint(preds)
    return clf, pred_labels

In [None]:
def objective(trial, return_models=False):
    n_splits = 5
    kf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
    # X_train = train.drop(columns=['outcome'])
    # y_train = train["outcome"]

    scores, models = [], []
    
    for train_idx, valid_idx in kf.split(X_train, y_train):
        train_data = X_train.iloc[train_idx, :], y_train.iloc[train_idx]
        valid_data = X_train.iloc[valid_idx, :], y_train.iloc[valid_idx]

        # Подаем trials для перебора
        model, y_pred = fit_catboost(trial, train_data, valid_data) # Определили выше
        scores.append(f1_score(y_pred, valid_data[1], average='micro'))
        models.append(model)

    result = np.mean(scores) - np.std(scores)

    if trial.should_prune():
            raise optuna.TrialPruned()
    
    if return_models:
        return result, models
    return result