# **LightGBM, XGBoost, CatBoost: оценка важности признаков**

In [2]:
!pip install catboost

Collecting catboost
  Downloading catboost-1.2.3-cp310-cp310-manylinux2014_x86_64.whl (98.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.5/98.5 MB[0m [31m6.8 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: catboost
Successfully installed catboost-1.2.3


In [3]:
import numpy as np
import pandas as pd
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split

In [4]:
from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import HalvingGridSearchCV

In [5]:
!pip freeze | grep "numpy\|pandas\|lightgbm\|scikit-learn"

geopandas==0.13.2
lightgbm==4.1.0
numpy==1.25.2
pandas==1.5.3
pandas-datareader==0.10.0
pandas-gbq==0.19.2
pandas-stubs==1.5.3.230304
scikit-learn==1.2.2
sklearn-pandas==2.2.0


In [6]:
def weighted_roc_auc(y_true, y_pred, labels, weights_dict):
    """Return the weighted average of per-class one-vs-rest ROC AUC scores.

    Args:
        y_true: True class labels, forwarded to ``roc_auc_score`` unchanged.
        y_pred: Predicted class scores, forwarded to ``roc_auc_score`` unchanged.
        labels: Class labels; they also fix the order of the per-class weights.
        weights_dict: Mapping from class label to its unnormalized weight.

    Returns:
        The sum over classes of (normalized weight * class ROC AUC), where the
        weights are rescaled so that they sum to 1.

    Raises:
        KeyError: If a label has no entry in ``weights_dict``.
    """
    raw_weights = np.array([weights_dict[cls] for cls in labels])
    normalized = raw_weights / raw_weights.sum()
    # average=None asks for one one-vs-rest score per class, not a single mean.
    per_class_auc = roc_auc_score(
        y_true,
        y_pred,
        labels=labels,
        multi_class="ovr",
        average=None,
    )
    return sum(normalized * per_class_auc)

# Загрузка данных

In [7]:
# Read the train and test tables from parquet (the file names suggest that
# missing values were already filled with the mode upstream).
train_df, test_df = (
    pd.read_parquet("train_data_modefilled.pqt"),
    pd.read_parquet("test_data_modefilled.pqt"),
)

In [8]:
# Preview the first three rows of the training table.
train_df.head(3)

Unnamed: 0,id,date,balance_amt_avg,balance_amt_max,balance_amt_min,balance_amt_day_avg,channel_code,city,city_type,index_city_code,...,cnt_cred_g_oper_3m,cnt_days_cred_g_oper_3m,sum_deb_h_oper_3m,cnt_deb_h_oper_3m,cnt_days_deb_h_oper_3m,sum_cred_h_oper_3m,cnt_cred_h_oper_3m,cnt_days_cred_h_oper_3m,start_cluster,end_cluster
0,0,month_1,0.744845,0.705492,1.287207,0.748101,channel_code_5,city_23,city_type_0,index_city_code_39,...,0.951166,0.568681,0.897565,0.553624,0.774354,0.936506,0.295984,0.967947,"{α, γ}",{other}
1,0,month_2,1.049605,0.831916,2.458609,1.053805,channel_code_5,city_23,city_type_0,index_city_code_39,...,0.948812,0.499716,0.785029,0.551904,0.696576,0.990157,0.298873,0.945969,"{α, γ}",{other}
2,0,month_3,0.692653,0.740253,0.430042,0.695747,channel_code_5,city_23,city_type_0,index_city_code_39,...,0.946458,0.442244,0.87705,0.551044,0.663243,0.810065,0.294829,0.956958,"{α, γ}",{other}


In [9]:
# Preview the first three rows of the test table.
test_df.head(3)

Unnamed: 0,id,date,balance_amt_avg,balance_amt_max,balance_amt_min,balance_amt_day_avg,channel_code,city,city_type,index_city_code,...,sum_cred_g_oper_3m,cnt_cred_g_oper_3m,cnt_days_cred_g_oper_3m,sum_deb_h_oper_3m,cnt_deb_h_oper_3m,cnt_days_deb_h_oper_3m,sum_cred_h_oper_3m,cnt_cred_h_oper_3m,cnt_days_cred_h_oper_3m,start_cluster
0,200000,month_4,-0.096224,0.335496,-0.125995,-0.095578,channel_code_12,city_14,city_type_0,index_city_code_46,...,0.010952,0.946066,0.407762,-0.15395,0.548895,0.54102,0.031742,0.257278,0.561353,{α}
1,200000,month_5,-0.024255,-0.059806,-0.124295,-0.023381,channel_code_12,city_14,city_type_0,index_city_code_46,...,0.006812,0.945281,0.396267,-0.150505,0.549468,0.552131,0.237817,0.264211,0.715199,{α}
2,200000,month_6,0.045988,0.049418,-0.125995,0.047079,channel_code_12,city_14,city_type_0,index_city_code_46,...,0.006812,0.945281,0.396267,-0.1528,0.549468,0.54102,0.387566,0.268543,0.836079,{α}


In [10]:
# Names of the columns that are treated as categorical features.
cat_cols = [
    "channel_code",
    "city",
    "city_type",
    "okved",
    "segment",
    "start_cluster",
    "index_city_code",
    "ogrn_month",
    "ogrn_year",
]

Обозначение категориальных признаков

In [11]:
# Cast the categorical columns of both tables to the pandas "category" dtype.
for frame in (train_df, test_df):
    frame[cat_cols] = frame[cat_cols].astype("category")

In [12]:
# Target: the end_cluster column.
y = train_df["end_cluster"]
# Features: every column except id, date and the target itself.
X = train_df.drop(columns=["id", "date", "end_cluster"])

Извлекаем веса кластеров

In [13]:
# Weight table indexed by cluster label.
cluster_weights = (
    pd.read_excel("/content/cluster_weights.xlsx")
    .set_index("cluster")
)
# {cluster label -> unnormalized weight}; weighted_roc_auc normalizes these itself.
weights_dict = cluster_weights.loc[:, "unnorm_weight"].to_dict()

## Обучение моделей

### Мечты-мечты... (подбор гиперпараметров через HalvingGridSearchCV — ячейки не запускались)

In [None]:
# Base LightGBM estimator with default hyperparameters; it is the estimator
# handed to the hyperparameter search.
lgbm_model = LGBMClassifier()

In [None]:
# Hyperparameter grid for the LightGBM search. random_state and verbosity
# have a single value each, so they are fixed rather than searched.
param_grid = dict(
    boosting_type=["gbdt", "dart", "goss"],
    num_leaves=[31, 51],
    learning_rate=[0.05, 0.1, 0.2],
    n_estimators=[100, 200, 300],
    reg_alpha=[0.1, 0.5, 1.0],
    reg_lambda=[0.1, 0.5, 1.0],
    random_state=[42],
    verbosity=[-1],
)

In [None]:
# Successive-halving grid search over param_grid.
# It is fitted on the full training data (X, y): no x_train / y_train exist at
# notebook level, so fitting on those names raised NameError on a fresh kernel.
# error_score="raise" makes a failing parameter combination stop the search
# instead of being scored silently.
search = HalvingGridSearchCV(
    lgbm_model,
    param_grid,
    random_state=42,
    error_score="raise",
).fit(X, y)

In [None]:
# Show the best hyperparameter combination found by the search.
search.best_params_

NameError: name 'search' is not defined

### CV

In [14]:
def skf_fit(X, y, model):
    """Evaluate ``model`` with 5-fold stratified CV and return it.

    On every fold the model is fitted on the training part and scored on the
    validation part with ``weighted_roc_auc`` (using the notebook-level
    ``weights_dict``). The fold scores and their mean are printed.

    Args:
        X: Feature table; rows are selected positionally via ``.iloc``.
        y: Target labels aligned row-by-row with ``X`` (a pandas object with
            any index, or an array that supports integer-array indexing).
        model: Estimator exposing ``fit``, ``predict_proba`` and ``classes_``.

    Returns:
        The same ``model`` object, in the state left by the fit on the last
        fold; it is not refitted on the full data.
    """
    import inspect  # local import keeps this cell self-contained

    # Pass cat_features only to estimators whose fit() can take it (CatBoost
    # style). Estimators without that parameter, e.g. LGBMClassifier or
    # XGBClassifier, would raise TypeError if it were passed unconditionally.
    try:
        fit_parameters = inspect.signature(model.fit).parameters.values()
    except (TypeError, ValueError):
        # Signature cannot be introspected: fall back to always passing it.
        accepts_cat_features = True
    else:
        accepts_cat_features = any(
            p.name == "cat_features" or p.kind is inspect.Parameter.VAR_KEYWORD
            for p in fit_parameters
        )
    fit_kwargs = {"cat_features": cat_cols} if accepts_cat_features else {}

    def take(data, positions):
        # The split yields positional indices, so pandas objects must go through
        # .iloc; plain y[positions] is a label lookup and breaks on any
        # non-default index.
        return data.iloc[positions] if hasattr(data, "iloc") else data[positions]

    skf = StratifiedKFold(n_splits=5)
    results = []
    for i, (train_index, val_index) in enumerate(skf.split(X, y)):
        print(f"Fold {i}:")
        x_train, x_val = X.iloc[train_index], X.iloc[val_index]
        y_train, y_val = take(y, train_index), take(y, val_index)

        model.fit(x_train, y_train, **fit_kwargs)

        y_pred_proba = model.predict_proba(x_val)
        results.append(
            weighted_roc_auc(y_val, y_pred_proba, model.classes_, weights_dict)
        )
    print(results)
    print(f"cv result: {np.mean(results)}")
    return model

In [15]:
# Cross-validate LightGBM with default hyperparameters; only the seed, the
# logging level and the parallelism are set explicitly. skf_fit returns the
# estimator it was given.
model_lgbm = skf_fit(
    X,
    y,
    LGBMClassifier(n_jobs=-1, random_state=42, verbosity=-1),
)

Fold 0:
Fold 1:
Fold 2:
Fold 3:
Fold 4:
[0.7133922790104996, 0.7213146240979298, 0.7100870066733664, 0.6881407655391474, 0.7103279199632531]
cv result: 0.7086525190568392


In [16]:
# NOTE(review): as written this call fails. The recorded output shows
# XGBClassifier rejecting the string cluster labels in y with a ValueError
# (it expects integer classes 0..16). Presumably the labels must be
# integer-encoded before fitting - TODO confirm and fix.
model_xgb = skf_fit(X, y, XGBClassifier(random_state=42,  n_jobs=-1))

Fold 0:


ValueError: Invalid classes inferred from unique values of `y`.  Expected: [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16], got ['{other}' '{}' '{α, β}' '{α, γ}' '{α, δ}' '{α, ε, η}' '{α, ε, θ}'
 '{α, ε, ψ}' '{α, ε}' '{α, η}' '{α, θ}' '{α, λ}' '{α, μ}' '{α, π}'
 '{α, ψ}' '{α}' '{λ}']

In [None]:
# Cross-validate CatBoost through the shared skf_fit helper instead of a copy
# of its loop, so that the folds, the metric and the printed report are the
# same as for the other models. skf_fit passes cat_cols to CatBoost as
# cat_features and returns the estimator it was given.
model_ctb = skf_fit(X, y, CatBoostClassifier(verbose=0, random_state=42))

Fold 0:
