# II. Обучение моделей

### Загрузка данных

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV

from src.config import PROCESSED_DATA_PATH, MODELS_PATH
from src.model_training import logreg_pipeline, rf_pipeline, lgb_model, xgb_model, get_param_grids
from src.utils import evaluate_metrics, save_model_results

# не используем GridSearchCV для catboost
from catboost import CatBoostClassifier

In [3]:
# Notebook-wide pandas display settings: show up to 100 columns and 100 rows,
# and use a display precision of 3.
display_options = (
    ('display.max_columns', 100),
    ('display.max_rows', 100),
    ('display.precision', 3),
)
for option_name, option_value in display_options:
    pd.set_option(option_name, option_value)

Загрузим очищенные данные и посмотрим на них.

In [4]:
# Read the cleaned training set from the processed-data directory
# and preview its first five rows.
train_path = PROCESSED_DATA_PATH / 'train_cleaned.parquet'
df = pd.read_parquet(train_path)
df.head(5)

Unnamed: 0,SeriousDlqin2yrs,RevolvingUtilizationOfUnsecuredLines,age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60-89DaysPastDueNotWorse,NumberOfDependents,DebtRatio_Flag,Total_Late_Payments
0,1,0.766,45,2,0.803,9120.0,13,0,6,0,2.0,0,2
1,0,0.957,40,0,0.122,2600.0,4,0,0,0,1.0,0,0
2,0,0.658,38,1,0.085,3042.0,2,1,0,0,0.0,0,2
3,0,0.234,30,0,0.036,3300.0,5,0,0,0,0.0,0,0
4,0,0.907,49,1,0.025,63588.0,7,0,1,0,0.0,0,1


In [5]:
# Target vector: the SeriousDlqin2yrs column (0/1 values in the preview).
target_column = 'SeriousDlqin2yrs'
y = df[target_column]
y.head()

0    1
1    0
2    0
3    0
4    0
Name: SeriousDlqin2yrs, dtype: int64

In [6]:
# Feature matrix: every column of df except the target.
X = df.drop(columns='SeriousDlqin2yrs')
X.head()

Unnamed: 0,RevolvingUtilizationOfUnsecuredLines,age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60-89DaysPastDueNotWorse,NumberOfDependents,DebtRatio_Flag,Total_Late_Payments
0,0.766,45,2,0.803,9120.0,13,0,6,0,2.0,0,2
1,0.957,40,0,0.122,2600.0,4,0,0,0,1.0,0,0
2,0.658,38,1,0.085,3042.0,2,1,0,0,0.0,0,2
3,0.234,30,0,0.036,3300.0,5,0,0,0,0.0,0,0
4,0.907,49,1,0.025,63588.0,7,0,1,0,0.0,0,1


### Подготовка к валидации

Разделим данные на обучающую выборку (`X_train`, `y_train`) и отложенную выборку (`X_val`, `y_val`, 20% данных) для финальной проверки. Используем `stratify=y` для сохранения пропорции классов в обеих частях.

In [7]:
# Hold out 20% of the rows for the final check. The split is stratified on y
# so both parts keep the class proportions; the seed is fixed for reproducibility.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

Создадим объект для кросс-валидации: он разбивает обучающие данные на стратифицированные фолды, в каждом из которых сохраняется исходная пропорция классов (это важно при дисбалансе классов).

In [8]:
# Cross-validation splitter shared by every GridSearchCV below:
# 5 stratified folds, shuffled with a fixed seed.
skf = StratifiedKFold(
    n_splits=5,
    random_state=42,
    shuffle=True,
)

Получим словарь всех сеток параметров для GridSearchCV.

In [9]:
# Parameter grids from the project helper; indexed below by model key
# ('logreg', 'rf', 'lgb', 'xgb') and passed to GridSearchCV as param_grid.
all_params = get_param_grids()

### Обучение моделей

**Будем использовать следующие модели:**

1. Логистическая регрессия (Logistic Regression) — baseline.
2. Случайный лес (Random Forest).
3. Градиентный бустинг (LightGBM, XGBoost, CatBoost).

#### Logistic Regression (логистическая регрессия)

In [10]:
# Tune the logistic-regression baseline: grid search over all_params['logreg'],
# scored by ROC-AUC on the folds from `skf`, using all available cores.
logreg = GridSearchCV(
    logreg_pipeline(),
    all_params['logreg'],
    scoring='roc_auc',
    cv=skf,
    n_jobs=-1,
).fit(X_train, y_train)

# Keep the winning estimator and its parameters for the save cell.
best_model, best_params = logreg.best_estimator_, logreg.best_params_

# Score the hold-out set with the second predict_proba column (class 1).
y_pred = best_model.predict_proba(X_val)[:, 1]
results = evaluate_metrics(y_val, y_pred)

print(f"ROC-AUC: {results['roc_auc']}", f"Gini: {results['gini']}", sep='\n')

ROC-AUC: 0.8337
Gini: 0.6675


In [11]:
# Persist the tuned logistic regression under the name 'logreg_v1'.
#
# Everything saved here is re-derived from the `logreg` search object instead of
# trusting the shared `best_model` / `best_params` / `results` names: every model
# cell overwrites them, so running cells out of order would otherwise store
# another model's estimator or metrics under this name.
model_name = 'logreg_v1'
best_model = logreg.best_estimator_
best_params = logreg.best_params_
results = evaluate_metrics(y_val, best_model.predict_proba(X_val)[:, 1])

save_model_results(model_name, best_model, best_params, results)

#### Random Forest

In [12]:
# Tune the random forest: grid search over all_params['rf'],
# scored by ROC-AUC on the folds from `skf`, using all available cores.
rf = GridSearchCV(
    rf_pipeline(),
    all_params['rf'],
    scoring='roc_auc',
    cv=skf,
    n_jobs=-1,
).fit(X_train, y_train)

# Keep the winning estimator and its parameters for the save cell.
best_model, best_params = rf.best_estimator_, rf.best_params_

# Score the hold-out set with the second predict_proba column (class 1).
y_pred = best_model.predict_proba(X_val)[:, 1]
results = evaluate_metrics(y_val, y_pred)

print(f"ROC-AUC: {results['roc_auc']}", f"Gini: {results['gini']}", sep='\n')

ROC-AUC: 0.863
Gini: 0.726


In [13]:
# Persist the tuned random forest under the name 'rf_v1'.
#
# Everything saved here is re-derived from the `rf` search object instead of
# trusting the shared `best_model` / `best_params` / `results` names: every model
# cell overwrites them, so running cells out of order would otherwise store
# another model's estimator or metrics under this name.
model_name = 'rf_v1'
best_model = rf.best_estimator_
best_params = rf.best_params_
results = evaluate_metrics(y_val, best_model.predict_proba(X_val)[:, 1])

save_model_results(model_name, best_model, best_params, results)

#### LightGBM

In [14]:
# Tune LightGBM: grid search over all_params['lgb'],
# scored by ROC-AUC on the folds from `skf`, using all available cores.
lgb = GridSearchCV(
    lgb_model(),
    all_params['lgb'],
    scoring='roc_auc',
    cv=skf,
    n_jobs=-1,
).fit(X_train, y_train)

# Keep the winning estimator and its parameters for the save cell.
best_model, best_params = lgb.best_estimator_, lgb.best_params_

# Score the hold-out set with the second predict_proba column (class 1).
y_pred = best_model.predict_proba(X_val)[:, 1]
results = evaluate_metrics(y_val, y_pred)

print(f"ROC-AUC: {results['roc_auc']}", f"Gini: {results['gini']}", sep='\n')

[LightGBM] [Info] Number of positive: 8021, number of negative: 111979
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.006871 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 987
[LightGBM] [Info] Number of data points in the train set: 120000, number of used features: 12
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=-0.000000
[LightGBM] [Info] Start training from score -0.000000
ROC-AUC: 0.869
Gini: 0.7379


In [15]:
# Persist the tuned LightGBM model under the name 'lgb_v1'.
#
# Everything saved here is re-derived from the `lgb` search object instead of
# trusting the shared `best_model` / `best_params` / `results` names: every model
# cell overwrites them, so running cells out of order would otherwise store
# another model's estimator or metrics under this name.
model_name = 'lgb_v1'
best_model = lgb.best_estimator_
best_params = lgb.best_params_
results = evaluate_metrics(y_val, best_model.predict_proba(X_val)[:, 1])

save_model_results(model_name, best_model, best_params, results)

#### XGBoost

In [16]:
# Tune XGBoost: grid search over all_params['xgb'],
# scored by ROC-AUC on the folds from `skf`, using all available cores.
xgb = GridSearchCV(
    xgb_model(),
    all_params['xgb'],
    scoring='roc_auc',
    cv=skf,
    n_jobs=-1,
).fit(X_train, y_train)

# Keep the winning estimator and its parameters for the save cell.
best_model, best_params = xgb.best_estimator_, xgb.best_params_

# Score the hold-out set with the second predict_proba column (class 1).
y_pred = best_model.predict_proba(X_val)[:, 1]
results = evaluate_metrics(y_val, y_pred)

print(f"ROC-AUC: {results['roc_auc']}", f"Gini: {results['gini']}", sep='\n')

ROC-AUC: 0.8688
Gini: 0.7376


In [17]:
# Persist the tuned XGBoost model under the name 'xgb_v1'.
#
# Everything saved here is re-derived from the `xgb` search object instead of
# trusting the shared `best_model` / `best_params` / `results` names: every model
# cell overwrites them, so running cells out of order would otherwise store
# another model's estimator or metrics under this name.
model_name = 'xgb_v1'
best_model = xgb.best_estimator_
best_params = xgb.best_params_
results = evaluate_metrics(y_val, best_model.predict_proba(X_val)[:, 1])

save_model_results(model_name, best_model, best_params, results)

#### CatBoost

Для модели CatBoost не используется GridSearchCV из-за текущей несовместимости библиотеки с внутренними проверками scikit-learn. Параметры для модели подобраны вручную.

In [18]:
# CatBoost with hand-picked hyperparameters (no grid search for this model).
catboost = CatBoostClassifier(
    # model capacity / schedule
    iterations=2000,
    learning_rate=0.01,
    depth=4,
    # class-imbalance handling and the metric CatBoost tracks
    auto_class_weights='Balanced',
    eval_metric='AUC',
    # reproducibility and quiet operation (no logs, no files written to disk)
    random_seed=42,
    silent=True,
    allow_writing_files=False,
)

catboost.fit(X_train, y_train)

# Score the hold-out set with the second predict_proba column (class 1).
y_pred = catboost.predict_proba(X_val)[:, 1]
results = evaluate_metrics(y_val, y_pred)

print(f"ROC-AUC: {results['roc_auc']}", f"Gini: {results['gini']}", sep='\n')

ROC-AUC: 0.8703
Gini: 0.7407


In [19]:
# Persist the CatBoost model under the name 'catboost_v1'.
model_name = 'catboost_v1'
model = catboost

# Take the hyperparameters from the estimator itself rather than re-typing them.
# The previous hand-written dict listed only depth / learning_rate / iterations
# and silently dropped auto_class_weights, eval_metric and random_seed, so the
# saved record was not enough to reproduce the model. get_params() returns the
# parameters that were explicitly passed to the constructor.
params = catboost.get_params()

# Recompute the metrics from this model so the cell does not depend on the
# shared `results` name, which every model cell overwrites.
results = evaluate_metrics(y_val, model.predict_proba(X_val)[:, 1])

save_model_results(model_name, model, params, results)