# Libraries

In [1]:
# Data
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder 
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import ColumnTransformer

# Models
from catboost import CatBoostClassifier, Pool
from optuna.integration import CatBoostPruningCallback
from xgboost import XGBClassifier, DMatrix
import xgboost as xgb
from lightgbm import Dataset
import lightgbm as lgbm
from sklearn.model_selection import StratifiedKFold

# Vizualize
import matplotlib.pyplot as plt
# import seaborn as sns
import optuna
from sklearn.metrics import f1_score
import warnings
warnings.filterwarnings('ignore')

In [2]:
root_folder = "../data/"
# root_folder = "/kaggle/input/playground-series-s3e22/"

In [3]:
train = pd.read_csv(root_folder + 'train.csv').drop(columns=['Unnamed: 0'])
test = pd.read_csv(root_folder + 'test.csv')
sample_submission = pd.read_csv(root_folder + 'sample_submission.csv')

In [4]:
train.isnull().sum()

surgery                  0
age                      0
hospital_number          0
rectal_temp              0
pulse                    0
respiratory_rate         0
temp_of_extremities      0
peripheral_pulse         0
mucous_membrane          0
capillary_refill_time    0
pain                     0
peristalsis              0
abdominal_distention     0
nasogastric_tube         0
nasogastric_reflux       0
nasogastric_reflux_ph    0
rectal_exam_feces        0
abdomen                  0
packed_cell_volume       0
total_protein            0
abdomo_appearance        0
abdomo_protein           0
surgical_lesion          0
lesion_1                 0
lesion_2                 0
lesion_3                 0
outcome                  0
dtype: int64

In [5]:
cat_features = train.select_dtypes(include=['object']).columns.tolist()[:-1]
num_features = [i for i in train.columns if i not in cat_features][:-1]

In [6]:
num_features

['surgery',
 'age',
 'hospital_number',
 'rectal_temp',
 'pulse',
 'respiratory_rate',
 'temp_of_extremities',
 'peripheral_pulse',
 'mucous_membrane',
 'capillary_refill_time',
 'pain']

In [7]:
X_train = train.drop(columns=['outcome'])
y_train = train['outcome']

In [8]:
X_train

Unnamed: 0,surgery,age,hospital_number,rectal_temp,pulse,respiratory_rate,temp_of_extremities,peripheral_pulse,mucous_membrane,capillary_refill_time,...,rectal_exam_feces,abdomen,packed_cell_volume,total_protein,abdomo_appearance,abdomo_protein,surgical_lesion,lesion_1,lesion_2,lesion_3
0,-0.313086,-0.129621,1.801773,-0.368142,1.093380,0.702421,-0.483319,0.068659,-0.298737,-0.075465,...,more_3_sec,depressed,absent,slight,slight,less_1_liter,decreased,distend_small,serosanguious,yes
1,-0.310258,-0.890705,0.289582,-1.097830,-1.230314,-1.576449,1.598010,-0.812647,-0.298921,-0.075465,...,more_3_sec,mild_pain,absent,moderate,none,more_1_liter,absent,distend_small,serosanguious,yes
2,-0.313226,0.124074,1.389357,-0.124913,-0.455749,-1.196638,-0.562072,0.068659,0.237648,-0.075465,...,less_3_sec,extreme_pain,hypomotile,moderate,slight,none,absent,distend_large,serosanguious,yes
3,3.177363,-1.398095,-0.260306,-0.003299,-1.230314,0.322609,-0.539571,0.383411,-0.298921,-0.075465,...,more_3_sec,mild_pain,hypomotile,moderate,slight,more_1_liter,decreased,distend_small,cloudy,yes
4,3.204717,-0.256468,-0.947666,1.091232,1.351568,-0.247108,-0.528321,-0.434944,-0.705212,-0.075465,...,less_3_sec,alert,hypomotile,none,slight,less_1_liter,normal,normal,cloudy,no
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1230,-0.309218,0.377769,1.698669,1.091232,-1.230314,0.702421,1.673013,-0.812647,-0.299289,-0.075465,...,more_3_sec,depressed,absent,moderate,none,more_1_liter,absent,distend_large,serosanguious,yes
1231,-0.314142,-0.890705,-0.672722,1.212846,-0.713937,-1.386544,-0.562072,0.194560,-0.298737,-0.075465,...,less_3_sec,mild_pain,hypomotile,slight,slight,none,decreased,distend_small,serosanguious,yes
1232,-0.313319,-0.890705,0.152110,0.604774,-0.713937,-0.911779,-0.580823,2.334874,-0.631608,-0.075465,...,less_3_sec,mild_pain,hypomotile,slight,slight,none,increased,firm,cloudy,yes
1233,-0.309559,-0.129621,-0.329042,-0.854601,-1.230314,0.797374,1.973025,-0.812647,-0.298737,-0.075465,...,less_3_sec,mild_pain,hypomotile,slight,none,more_1_liter,absent,distend_small,cloudy,yes


# Catboost

In [9]:
def fit_catboost(trial, train, val):
    X_train, y_train = train
    X_val, y_val = val

    param = {
        "iterations": 1000, # Можно не перебирать, есть Early-Stopping
        "learning_rate": trial.suggest_float("learning_rate", 0.001, 0.3, log=True),
        "l2_leaf_reg": trial.suggest_int("l2_leaf_reg", 2, 50),
        "colsample_bylevel": trial.suggest_float("colsample_bylevel", 0.01, 0.8),
        "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 1, 100), 
        "auto_class_weights": trial.suggest_categorical("auto_class_weights", ["SqrtBalanced", "Balanced"]),
        "depth": trial.suggest_int("depth", 3, 9),
        
        "boosting_type": trial.suggest_categorical("boosting_type", ["Ordered", "Plain"]),
        "bootstrap_type": trial.suggest_categorical("bootstrap_type", ["Bayesian", "Bernoulli", "MVS"]),
        "eval_metric": "TotalF1", # Тоже стоит заранее определиться

        "objective": trial.suggest_categorical("objective", ["MultiClass", "MultiClassOneVsAll"]),
    }

    
    if param["bootstrap_type"] == "Bayesian":
        param["bagging_temperature"] = trial.suggest_float("bagging_temperature", 0, 20)
        
    elif param["bootstrap_type"] == "Bernoulli":
        param["subsample"] = trial.suggest_float("subsample", 0.1, 1)
        

    clf = CatBoostClassifier(
        **param,
        thread_count=-1,
        random_seed=42,
        cat_features=cat_features,
    )

    pruning_callback = CatBoostPruningCallback(trial, "TotalF1")

    clf.fit(
        X_train,
        y_train,
        eval_set=(X_val, y_val),
        verbose=0,
        plot=False,
        early_stopping_rounds=30,
        callbacks=[pruning_callback],
    )

    # evoke pruning manually.
    pruning_callback.check_pruned()

    y_pred = clf.predict(X_val)
    return clf, y_pred

In [10]:
def objective(trial, return_models=False):
    n_splits = 5
    kf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
    # X_train = train.drop(columns=['outcome'])
    # y_train = train["outcome"]

    scores, models = [], []
    
    for train_idx, valid_idx in kf.split(X_train, y_train):
        train_data = X_train.iloc[train_idx, :], y_train.iloc[train_idx]
        valid_data = X_train.iloc[valid_idx, :], y_train.iloc[valid_idx]

        # Подаем trials для перебора
        model, y_pred = fit_catboost(trial, train_data, valid_data) # Определили выше
        scores.append(f1_score(y_pred, valid_data[1], average='micro'))
        models.append(model)

    result = np.mean(scores) - np.std(scores)

    if trial.should_prune():
            raise optuna.TrialPruned()
    
    if return_models:
        return result, models
    return result

In [11]:
study = optuna.create_study(direction="maximize",
    pruner=optuna.pruners.HyperbandPruner(
        min_resource=10, max_resource=5000, reduction_factor=10
    ),
)
study.optimize(objective,
    n_trials=1000,
    n_jobs = -1,
    show_progress_bar=True,
)

[I 2023-09-28 10:05:27,226] A new study created in memory with name: no-name-3e9e6684-ee41-4714-90bf-f4b11549dc1b


  0%|          | 0/1000 [00:00<?, ?it/s]

[I 2023-09-28 10:05:29,502] Trial 0 pruned. Trial was pruned at iteration 10.
[I 2023-09-28 10:05:30,657] Trial 4 pruned. Trial was pruned at iteration 10.
[I 2023-09-28 10:05:31,080] Trial 5 pruned. Trial was pruned at iteration 10.
[I 2023-09-28 10:05:31,534] Trial 6 pruned. Trial was pruned at iteration 10.
[I 2023-09-28 10:05:32,189] Trial 7 pruned. Trial was pruned at iteration 10.
[I 2023-09-28 10:05:35,317] Trial 8 pruned. Trial was pruned at iteration 10.
[I 2023-09-28 10:05:35,439] Trial 2 finished with value: 0.6482470657913839 and parameters: {'learning_rate': 0.05322323174432989, 'l2_leaf_reg': 40, 'colsample_bylevel': 0.05905047986382113, 'min_data_in_leaf': 92, 'auto_class_weights': 'Balanced', 'depth': 7, 'boosting_type': 'Plain', 'bootstrap_type': 'Bernoulli', 'objective': 'MultiClass', 'subsample': 0.5040834941731825}. Best is trial 2 with value: 0.6482470657913839.
[I 2023-09-28 10:05:36,049] Trial 10 pruned. Trial was pruned at iteration 10.
[I 2023-09-28 10:05:36,77

In [12]:
catboost_best_params = study.best_params

In [13]:
catboost_best_params

{'learning_rate': 0.18172577047233215,
 'l2_leaf_reg': 5,
 'colsample_bylevel': 0.5465094095145017,
 'min_data_in_leaf': 95,
 'auto_class_weights': 'SqrtBalanced',
 'depth': 8,
 'boosting_type': 'Plain',
 'bootstrap_type': 'MVS',
 'objective': 'MultiClass'}

# XGBoost

In [9]:
le = LabelEncoder().fit(y_train)
classes = le.classes_
y_train = le.transform(y_train)

In [15]:
def fit_xgboost(trial, train, val):
    X_train, y_train = train
    X_val, y_val = val
    X_train = X_train[cat_features].astype("category")
    X_val = X_val[cat_features].astype("category")

    # dtrain = xgb.DMatrix(X_train, label=y_train)
    # dvalid = xgb.DMatrix(X_val, label=y_val)

    param = {
        "verbosity": 0,
        "verbose_eval": False,
        "eval_metric": ["merror", "mlogloss"],
        "booster": trial.suggest_categorical("booster", ["gbtree", "gblinear", "dart"]),
        "lambda": trial.suggest_float("lambda", 1e-8, 1.0, log=True),
        "alpha": trial.suggest_float("alpha", 1e-8, 1.0, log=True),
        "n_estimators": 300,
        "enable_categorical": True,
        "early_stopping_rounds": 30,
    }

    if param["booster"] == "gbtree" or param["booster"] == "dart":
        param["max_depth"] = trial.suggest_int("max_depth", 1, 9)
        param["eta"] = trial.suggest_float("eta", 1e-8, 1.0, log=True)
        param["gamma"] = trial.suggest_float("gamma", 1e-8, 1.0, log=True)
        param["grow_policy"] = trial.suggest_categorical("grow_policy", ["depthwise", "lossguide"])
    if param["booster"] == "dart":
        param["sample_type"] = trial.suggest_categorical("sample_type", ["uniform", "weighted"])
        param["normalize_type"] = trial.suggest_categorical("normalize_type", ["tree", "forest"])
        param["rate_drop"] = trial.suggest_float("rate_drop", 1e-8, 1.0, log=True)
        param["skip_drop"] = trial.suggest_float("skip_drop", 1e-8, 1.0, log=True)
    
    param["objective"] = "multi:softprob"
    

    # Add a callback for pruning.
    clf = xgb.XGBClassifier(**param)
    clf.fit(X_train, y_train, eval_set=[(X_val, y_val)])
    pruning_callback = optuna.integration.XGBoostPruningCallback(trial, "validation-auc")
    # clf = xgb.train(param, dtrain, evals=[(dvalid, "validation")], callbacks=[pruning_callback])

    y_pred = clf.predict(X_val)
    return clf, y_pred

In [16]:
def objective(trial, return_models=False):
    n_splits = 5
    kf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
    # X_train = train.drop(columns=['outcome'])
    # y_train = train["outcome"]

    scores, models = [], []
    
    for train_idx, valid_idx in kf.split(X_train, y_train):
        train_data = X_train.iloc[train_idx, :], y_train[train_idx]
        valid_data = X_train.iloc[valid_idx, :], y_train[valid_idx]

        # Подаем trials для перебора
        model, y_pred = fit_xgboost(trial, train_data, valid_data) # Определили выше
        scores.append(f1_score(y_pred, valid_data[1], average='micro'))
        models.append(model)

    result = np.mean(scores) - np.std(scores)

    if trial.should_prune():
            raise optuna.TrialPruned()
    
    if return_models:
        return result, models
    return result

In [17]:
study = optuna.create_study(direction="maximize",
    pruner=optuna.pruners.HyperbandPruner(
        min_resource=10, max_resource=5000, reduction_factor=10
    ),
)
study.optimize(objective,
    n_trials=200,
    n_jobs = -1,
    show_progress_bar=True,
)

[I 2023-09-28 11:03:03,946] A new study created in memory with name: no-name-bf54219b-5b35-472f-95c1-54fda5e1cefe


  0%|          | 0/200 [00:00<?, ?it/s]

[0]	validation_0-merror:0.52632	validation_0-mlogloss:1.02680
[0]	validation_0-merror:0.48988	validation_0-mlogloss:0.99954
[1]	validation_0-merror:0.52632	validation_0-mlogloss:1.01460
[1]	validation_0-merror:0.47773	validation_0-mlogloss:0.97678
[2]	validation_0-merror:0.52227	validation_0-mlogloss:1.00656
[2]	validation_0-merror:0.48583	validation_0-mlogloss:0.96625[3]	validation_0-merror:0.50202	validation_0-mlogloss:1.00177

[3]	validation_0-merror:0.47773	validation_0-mlogloss:0.96042[4]	validation_0-merror:0.48988	validation_0-mlogloss:0.99884

[4]	validation_0-merror:0.47773	validation_0-mlogloss:0.95564[5]	validation_0-merror:0.48583	validation_0-mlogloss:0.99682

[5]	validation_0-merror:0.47773	validation_0-mlogloss:0.95246[6]	validation_0-merror:0.49393	validation_0-mlogloss:0.99533

[0]	validation_0-merror:0.48178	validation_0-mlogloss:1.09861
[7]	validation_0-merror:0.49798	validation_0-mlogloss:0.99419[0]	validation_0-merror:0.44534	validation_0-mlogloss:1.09634

[6]	vali

In [None]:
xgboost_best_params = study.best_params

# LightGBM

In [10]:
le = LabelEncoder()
for col in cat_features:
    X_train[col] = le.fit_transform(X_train[col])

for col in cat_features:
    X_train[col] = X_train[col].astype('int')

In [11]:
X_train

Unnamed: 0,surgery,age,hospital_number,rectal_temp,pulse,respiratory_rate,temp_of_extremities,peripheral_pulse,mucous_membrane,capillary_refill_time,...,rectal_exam_feces,abdomen,packed_cell_volume,total_protein,abdomo_appearance,abdomo_protein,surgical_lesion,lesion_1,lesion_2,lesion_3
0,-0.313086,-0.129621,1.801773,-0.368142,1.093380,0.702421,-0.483319,0.068659,-0.298737,-0.075465,...,2,1,0,3,2,0,1,1,2,1
1,-0.310258,-0.890705,0.289582,-1.097830,-1.230314,-1.576449,1.598010,-0.812647,-0.298921,-0.075465,...,2,3,0,0,0,1,0,1,2,1
2,-0.313226,0.124074,1.389357,-0.124913,-0.455749,-1.196638,-0.562072,0.068659,0.237648,-0.075465,...,1,2,3,0,2,2,0,0,2,1
3,3.177363,-1.398095,-0.260306,-0.003299,-1.230314,0.322609,-0.539571,0.383411,-0.298921,-0.075465,...,2,3,3,0,2,1,1,1,1,1
4,3.204717,-0.256468,-0.947666,1.091232,1.351568,-0.247108,-0.528321,-0.434944,-0.705212,-0.075465,...,1,0,3,1,2,0,3,3,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1230,-0.309218,0.377769,1.698669,1.091232,-1.230314,0.702421,1.673013,-0.812647,-0.299289,-0.075465,...,2,1,0,0,0,1,0,0,2,1
1231,-0.314142,-0.890705,-0.672722,1.212846,-0.713937,-1.386544,-0.562072,0.194560,-0.298737,-0.075465,...,1,3,3,3,2,2,1,1,2,1
1232,-0.313319,-0.890705,0.152110,0.604774,-0.713937,-0.911779,-0.580823,2.334874,-0.631608,-0.075465,...,1,3,3,3,2,2,2,2,1,1
1233,-0.309559,-0.129621,-0.329042,-0.854601,-1.230314,0.797374,1.973025,-0.812647,-0.298737,-0.075465,...,1,3,3,3,0,1,0,1,1,1


In [12]:
def fit_lgbm(trial, train, val):
    X_train, y_train = train
    X_val, y_val = val

    param = {
        "objective": "multiclass",
        "metric": "multi_logloss",
        "verbosity": -1,
        "boosting_type": "gbdt",
        "num_class": 3,
        "lambda_l1": trial.suggest_float("lambda_l1", 1e-8, 10.0, log=True),
        "lambda_l2": trial.suggest_float("lambda_l2", 1e-8, 10.0, log=True),
        "num_leaves": trial.suggest_int("num_leaves", 2, 256),
        "feature_fraction": trial.suggest_float("feature_fraction", 0.4, 1.0),
        "bagging_fraction": trial.suggest_float("bagging_fraction", 0.4, 1.0),
        "bagging_freq": trial.suggest_int("bagging_freq", 1, 7),
        "min_child_samples": trial.suggest_int("min_child_samples", 5, 100),
    }

    fit_params={
        "eval_metric" : 'logloss', 
        "eval_set" : [(X_val, y_val)],
        'eval_names': ['valid'],
        'feature_name': 'auto', # that's actually the default
        'categorical_feature': cat_features # that's actually the default
    }

    # Add a callback for pruning.
    pruning_callback = optuna.integration.LightGBMPruningCallback(trial, "auc")
    clf = lgbm.LGBMClassifier(**param, n_jobs=-1, n_estimators=1000, verbose=0, callbacks=[pruning_callback])
    clf.fit(X_train, y_train, **fit_params)

    preds = clf.predict(X_val)
    return clf, preds

In [13]:
def objective(trial, return_models=False):
    n_splits = 5
    kf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
    # X_train = train.drop(columns=['outcome'])
    # y_train = train["outcome"]

    scores, models = [], []
    
    for train_idx, valid_idx in kf.split(X_train, y_train):
        train_data = X_train.iloc[train_idx, :], y_train[train_idx]
        valid_data = X_train.iloc[valid_idx, :], y_train[valid_idx]

        # Подаем trials для перебора
        model, y_pred = fit_lgbm(trial, train_data, valid_data) # Определили выше
        scores.append(f1_score(y_pred, valid_data[1], average='micro'))
        models.append(model)

    result = np.mean(scores) - np.std(scores)

    if trial.should_prune():
            raise optuna.TrialPruned()
    
    if return_models:
        return result, models
    return result

In [14]:
study = optuna.create_study(direction="maximize",
    pruner=optuna.pruners.HyperbandPruner(
        min_resource=10, max_resource=5000, reduction_factor=10
    ),
)
study.optimize(objective,
    n_trials=200,
    n_jobs = -1,
    show_progress_bar=True,
)

[I 2023-09-28 11:42:23,204] A new study created in memory with name: no-name-db7a13e9-7672-45c3-bc40-42f79fb3d5f9


  0%|          | 0/200 [00:00<?, ?it/s]

[I 2023-09-28 11:43:01,154] Trial 3 finished with value: 0.6388374875267306 and parameters: {'lambda_l1': 0.00018288172185846284, 'lambda_l2': 9.114725011532972, 'num_leaves': 59, 'feature_fraction': 0.43711457849448565, 'bagging_fraction': 0.5680610336917785, 'bagging_freq': 5, 'min_child_samples': 68}. Best is trial 3 with value: 0.6388374875267306.
[I 2023-09-28 11:43:02,492] Trial 1 finished with value: 0.6552168292924174 and parameters: {'lambda_l1': 0.0004648071889338926, 'lambda_l2': 0.0017633620386056255, 'num_leaves': 168, 'feature_fraction': 0.6897516194878264, 'bagging_fraction': 0.6246361564228899, 'bagging_freq': 7, 'min_child_samples': 72}. Best is trial 1 with value: 0.6552168292924174.
[I 2023-09-28 11:43:21,743] Trial 2 finished with value: 0.6648244363504715 and parameters: {'lambda_l1': 0.15640211349069547, 'lambda_l2': 1.7961686658740635e-06, 'num_leaves': 118, 'feature_fraction': 0.7278170984596337, 'bagging_fraction': 0.9049424072409231, 'bagging_freq': 4, 'min_ch

In [15]:
lgbm_best_params = study.best_params

In [16]:
lgbm_best_params

{'lambda_l1': 6.194924967173888e-08,
 'lambda_l2': 0.024206509546991783,
 'num_leaves': 2,
 'feature_fraction': 0.5588173039526346,
 'bagging_fraction': 0.977289560233083,
 'bagging_freq': 7,
 'min_child_samples': 33}