# МОДЕЛЬ ПРОГНОЗИРОВАНИЯ ОТТОКА КЛИЕНТОВ БАНКА
## Улучшенная версия с полным анализом

===============================================================================

**Дата:** 2025-01-13  
**Версия:** 2.0 (Enhanced)  
**Алгоритмы:** CatBoost, XGBoost, LightGBM

## УЛУЧШЕНИЯ:
1. ✅ Удаление segment_group после разделения по сегментам
2. ✅ Сравнение 3 алгоритмов: CatBoost, XGBoost, LightGBM
3. ✅ Анализ корреляции фичей с таргетом
4. ✅ PSI (Population Stability Index) анализ
5. ✅ Метрики по перцентилям (Decile Analysis, Lift, Cumulative Precision/Recall)
6. ✅ Техники балансировки классов: Undersampling, SMOTE, Class Weights
7. ✅ Полная документация для банка

## ОСОБЕННОСТИ:
- Две модели по сегментам:
  * Модель 1: Малый бизнес (SMALL_BUSINESS)
  * Модель 2: Средний + Крупный бизнес (MIDDLE + LARGE_BUSINESS)
- Temporal Split (Train/Val/Test-OOT)
- Полная воспроизводимость (random_seed=42)

===============================================================================

# 1. ИМПОРТ БИБЛИОТЕК И КОНФИГУРАЦИЯ

In [None]:
# ====================================================================================
# ИМПОРТ БИБЛИОТЕК
# ====================================================================================

import os
import sys
import warnings
from datetime import datetime
from pathlib import Path
import json
import pickle
import time
import gc

# Данные
import numpy as np
import pandas as pd
from scipy import stats

# Визуализация
import matplotlib.pyplot as plt
import seaborn as sns

# ML
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer
from catboost import CatBoostClassifier, Pool
import xgboost as xgb
import lightgbm as lgb
from sklearn.metrics import (
    roc_auc_score, average_precision_score,
    precision_recall_curve, roc_curve,
    classification_report, confusion_matrix,
    accuracy_score, f1_score, precision_score, recall_score
)

# Балансировка классов
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SMOTE
from collections import Counter

# Настройки
# Notebook-wide settings: silence library warnings, pick the plot style,
# show all columns and render floats with four decimals in pandas output.
warnings.filterwarnings('ignore')
plt.style.use('seaborn-v0_8')
pd.set_option('display.max_columns', None)
pd.set_option('display.float_format', lambda value: '%.4f' % value)

# Start-up banner with the versions of the key libraries and the launch time.
print("\n".join([
    "=" * 80,
    "CHURN PREDICTION MODEL - УЛУЧШЕННЫЙ ПАЙПЛАЙН v2.0",
    "=" * 80,
    "✓ Библиотеки импортированы",
    f"  Pandas: {pd.__version__}",
    f"  NumPy: {np.__version__}",
    f"  XGBoost: {xgb.__version__}",
    f"  LightGBM: {lgb.__version__}",
    f"  Дата запуска: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}",
    "=" * 80,
]))

In [None]:
# ====================================================================================
# ГЛОБАЛЬНАЯ КОНФИГУРАЦИЯ
# ====================================================================================

class Config:
    """Centralised notebook configuration (paths, columns, split sizes, model params).

    All values are class attributes, so they can be read either from the class
    or from an instance. The three model parameter dictionaries take their seed
    from ``RANDOM_SEED`` so that changing the seed in one place affects every
    model.
    """

    # REPRODUCIBILITY
    RANDOM_SEED = 42

    # PATHS
    DATA_DIR = Path("data")
    OUTPUT_DIR = Path("output")
    MODEL_DIR = Path("models")
    FIGURES_DIR = Path("figures")

    # FILES
    TRAIN_FILE = "churn_train_ul.parquet"
    PROD_FILE = "churn_prod_ul.parquet"

    # COLUMNS
    ID_COLUMNS = ['cli_code', 'client_id', 'observation_point']
    TARGET_COLUMN = 'target_churn_3m'
    SEGMENT_COLUMN = 'segment_group'
    DATE_COLUMN = 'observation_point'
    # segment_group is intentionally NOT listed here: it is only used to split
    # the data into per-segment models.
    CATEGORICAL_FEATURES = ['obs_month', 'obs_quarter']

    # SEGMENTS (TWO MODELS)
    SEGMENT_1_NAME = "Small Business"
    SEGMENT_1_VALUES = ['SMALL_BUSINESS']

    SEGMENT_2_NAME = "Middle + Large Business"
    SEGMENT_2_VALUES = ['MIDDLE_BUSINESS', 'LARGE_BUSINESS']

    # TEMPORAL SPLIT (fractions of the unique observation dates)
    TRAIN_SIZE = 0.70
    VAL_SIZE = 0.15
    TEST_SIZE = 0.15

    # PREPROCESSING
    CORRELATION_THRESHOLD = 0.85
    OUTLIER_IQR_MULTIPLIER = 1.5
    REMOVE_GAPS = True
    HANDLE_OUTLIERS = True
    REMOVE_HIGH_CORRELATIONS = True

    # PSI
    PSI_BUCKETS = 10

    # CATBOOST
    CATBOOST_PARAMS = {
        'iterations': 500,
        'learning_rate': 0.05,
        'depth': 4,
        'l2_leaf_reg': 3,
        'min_data_in_leaf': 100,
        'random_strength': 1,
        'bagging_temperature': 1,
        'border_count': 128,
        'loss_function': 'Logloss',
        'eval_metric': 'AUC',
        'early_stopping_rounds': 100,
        'use_best_model': True,
        'random_seed': RANDOM_SEED,  # single source of truth for the seed
        'task_type': 'CPU',
        'verbose': 100,
        'allow_writing_files': False,
    }

    # XGBOOST
    XGBOOST_PARAMS = {
        'max_depth': 4,
        'learning_rate': 0.05,
        'n_estimators': 500,
        'objective': 'binary:logistic',
        'eval_metric': 'auc',
        'subsample': 0.8,
        'colsample_bytree': 0.8,
        'min_child_weight': 100,
        'reg_alpha': 0.1,
        'reg_lambda': 1,
        'random_state': RANDOM_SEED,  # single source of truth for the seed
        'n_jobs': -1,
        'verbosity': 1,
    }

    # LIGHTGBM
    LIGHTGBM_PARAMS = {
        'max_depth': 4,
        'learning_rate': 0.05,
        'n_estimators': 500,
        'objective': 'binary',
        'metric': 'auc',
        'subsample': 0.8,
        'colsample_bytree': 0.8,
        'min_child_samples': 100,
        'reg_alpha': 0.1,
        'reg_lambda': 1,
        'random_state': RANDOM_SEED,  # single source of truth for the seed
        'n_jobs': -1,
        'verbosity': -1,
    }

    # IMBALANCE
    THRESHOLD_METRIC = 'f1'

    @classmethod
    def create_directories(cls) -> None:
        """Create the output, model and figures directories if they are missing."""
        for dir_path in (cls.OUTPUT_DIR, cls.MODEL_DIR, cls.FIGURES_DIR):
            dir_path.mkdir(parents=True, exist_ok=True)

    @classmethod
    def get_train_path(cls) -> Path:
        """Return the path of the training parquet file inside ``DATA_DIR``."""
        return cls.DATA_DIR / cls.TRAIN_FILE

# Instantiate the configuration, make sure the output folders exist and
# seed NumPy's global random generator.
config = Config()
config.create_directories()
np.random.seed(config.RANDOM_SEED)

# Summary of the active configuration.
print("\n".join([
    "\n✓ Конфигурация инициализирована",
    f"  Random seed: {config.RANDOM_SEED}",
    f"  Модель 1: {config.SEGMENT_1_NAME} {config.SEGMENT_1_VALUES}",
    f"  Модель 2: {config.SEGMENT_2_NAME} {config.SEGMENT_2_VALUES}",
    f"  Split: {config.TRAIN_SIZE}/{config.VAL_SIZE}/{config.TEST_SIZE}",
    "\n⚠️  ВАЖНО: segment_group удален из CATEGORICAL_FEATURES после разделения!",
]))

---
# 2. ЗАГРУЗКА ДАННЫХ

In [None]:
# ====================================================================================
# ЗАГРУЗКА ДАННЫХ
# ====================================================================================

train_path = config.get_train_path()

# Section banner followed by the path that is about to be read.
print("\n" + "=" * 80 + "\nЗАГРУЗКА ДАННЫХ\n" + "=" * 80)
print(f"Файл: {train_path}")

# Fail early, with a hint, when the parquet file has not been produced yet.
if not train_path.exists():
    raise FileNotFoundError(
        f"Файл не найден: {train_path}\nЗапустите: python csv_to_parquet_converter.py"
    )

file_size = train_path.stat().st_size / (1024 * 1024)
print(f"Размер: {file_size:.2f} MB")

# Time the read and measure the in-memory footprint of the loaded frame.
start = time.time()
df_full = pd.read_parquet(train_path)
load_time = time.time() - start

memory = df_full.memory_usage(deep=True).sum() / (1024 * 1024)

print(f"\n✓ Загружено за {load_time:.2f} сек")
print(f"  Размер: {df_full.shape}")
print(f"  Память: {memory:.2f} MB")

# Target variable: overall churn rate, number of churners and class ratio.
target_values = df_full[config.TARGET_COLUMN]
churn_rate = target_values.mean()
print(f"\n  Target '{config.TARGET_COLUMN}':")
print(f"    Churn rate: {churn_rate:.4f} ({churn_rate*100:.2f}%)")
print(f"    Churned: {target_values.sum():,}")
print(f"    Ratio: 1:{(1-churn_rate)/churn_rate:.1f}")

# Observation dates: parse to datetime in place, then report the covered range.
df_full[config.DATE_COLUMN] = pd.to_datetime(df_full[config.DATE_COLUMN])
observation_dates = df_full[config.DATE_COLUMN]
print(f"\n  Период: {observation_dates.min().date()} - {observation_dates.max().date()}")
print(f"  Уникальных дат: {observation_dates.nunique()}")

# Segments: row count, share of all rows and churn rate for each segment value.
print("\n  Распределение по сегментам:")
total_rows = len(df_full)
for segment, count in df_full[config.SEGMENT_COLUMN].value_counts().items():
    pct = count / total_rows * 100
    in_segment = df_full[config.SEGMENT_COLUMN] == segment
    churn_seg = df_full.loc[in_segment, config.TARGET_COLUMN].mean()
    print(f"    {segment}: {count:,} ({pct:.1f}%) | Churn: {churn_seg*100:.2f}%")

print("=" * 80)

---
# 3. EXPLORATORY DATA ANALYSIS (EDA)

In [None]:
# ====================================================================================
# АНАЛИЗ КАЧЕСТВА ДАННЫХ
# ====================================================================================

print("\n" + "=" * 80 + "\nАНАЛИЗ КАЧЕСТВА ДАННЫХ\n" + "=" * 80)

# Missing values: keep only columns that have at least one null and
# report the absolute count together with the share of all rows.
missing = df_full.isnull().sum()
missing_only = missing[missing > 0]
missing_df = (
    missing_only.to_frame('Missing')
    .assign(Percent=(missing_only / len(df_full) * 100).round(2))
    .sort_values('Missing', ascending=False)
)

print("\n1. Пропущенные значения:")
if missing_df.empty:
    print("   ✓ Нет пропусков")
else:
    print(f"   {len(missing_df)} колонок с пропусками:")
    print(missing_df.head(10).to_string())

# Constant columns: exactly one distinct non-null value.
distinct_counts = df_full.nunique()
constant_cols = distinct_counts[distinct_counts == 1].index.tolist()
print(f"\n2. Константные колонки: {len(constant_cols)}")
if constant_cols:
    print(f"   {constant_cols[:5]}...")

# Fully duplicated rows.
n_dups = df_full.duplicated().sum()
print(f"\n3. Дубликаты: {n_dups:,}")

# Column count per dtype.
print("\n4. Типы данных:")
dtype_counts = df_full.dtypes.value_counts()
for dtype, count in dtype_counts.items():
    print(f"   {dtype}: {count}")

print("=" * 80)

In [None]:
# ====================================================================================
# ВИЗУАЛИЗАЦИЯ: РАСПРЕДЕЛЕНИЕ TARGET
# ====================================================================================

fig, axes = plt.subplots(1, 3, figsize=(18, 5))
ax_total, ax_stacked, ax_rate = axes

# Styling shared by the three panels.
bar_style = dict(alpha=0.7, edgecolor='black')
title_style = dict(fontsize=14, fontweight='bold')

# 1. Overall class counts on a log scale, annotated with count and share.
target_dist = df_full[config.TARGET_COLUMN].value_counts()
class_counts = [target_dist[0], target_dist[1]]
ax_total.bar(['No Churn', 'Churn'], class_counts, color=['green', 'red'], **bar_style)
ax_total.set_title('Распределение Target', **title_style)
ax_total.set_ylabel('Количество')
ax_total.set_yscale('log')
for position, n_rows in enumerate(class_counts):
    ax_total.text(position, n_rows, f'{n_rows:,}\n({n_rows/len(df_full)*100:.2f}%)',
                  ha='center', va='bottom')

# 2. Stacked class counts per segment.
segment_churn = (
    df_full.groupby([config.SEGMENT_COLUMN, config.TARGET_COLUMN])
    .size()
    .unstack(fill_value=0)
)
segment_churn.plot(kind='bar', stacked=True, ax=ax_stacked,
                   color=['green', 'red'], **bar_style)
ax_stacked.set_title('По сегментам', **title_style)
ax_stacked.set_xlabel('Сегмент')
ax_stacked.legend(['No Churn', 'Churn'])
ax_stacked.tick_params(axis='x', rotation=45)

# 3. Churn rate (in percent) per segment, annotated above each bar.
churn_rates = df_full.groupby(config.SEGMENT_COLUMN)[config.TARGET_COLUMN].mean() * 100
bar_positions = range(len(churn_rates))
ax_rate.bar(bar_positions, churn_rates.values, color='coral', **bar_style)
ax_rate.set_xticks(bar_positions)
ax_rate.set_xticklabels(churn_rates.index, rotation=45, ha='right')
ax_rate.set_title('Churn Rate по сегментам', **title_style)
ax_rate.set_ylabel('Churn Rate (%)')
for position, rate in enumerate(churn_rates.values):
    ax_rate.text(position, rate, f'{rate:.2f}%', ha='center', va='bottom')

plt.tight_layout()
plt.savefig(config.FIGURES_DIR / '01_eda_target.png', dpi=150, bbox_inches='tight')
plt.show()

print("✓ Сохранено: figures/01_eda_target.png")

---
# 4. ВРЕМЕННОЕ РАЗБИЕНИЕ (TRAIN / VAL / TEST-OOT)

In [None]:
# ====================================================================================
# TEMPORAL SPLIT
# ====================================================================================

print("\n" + "="*80)
print("ВРЕМЕННОЕ РАЗБИЕНИЕ (TEMPORAL SPLIT)")
print("="*80)

# Sort by time. Series.unique() returns numpy.datetime64 scalars for tz-naive
# data, and those have no .date() method; going through a DatetimeIndex yields
# pandas Timestamps instead. Missing dates are left out of the cutoff grid
# (rows with a missing date can never satisfy the comparisons below anyway).
df_sorted = df_full.sort_values(config.DATE_COLUMN).reset_index(drop=True)
unique_dates = list(
    pd.DatetimeIndex(df_sorted[config.DATE_COLUMN].dropna().unique()).sort_values()
)
n_dates = len(unique_dates)

# Cutoff indices into the list of unique dates.
train_cutoff = int(n_dates * config.TRAIN_SIZE)
val_cutoff = int(n_dates * (config.TRAIN_SIZE + config.VAL_SIZE))

# Every split needs at least one date; otherwise the indexing below would
# either fail or silently wrap around to the last date.
if not 0 < train_cutoff < val_cutoff < n_dates:
    raise ValueError(
        f"Недостаточно уникальных дат ({n_dates}) для разбиения "
        f"{config.TRAIN_SIZE}/{config.VAL_SIZE}/{config.TEST_SIZE}: "
        f"каждая выборка должна содержать хотя бы одну дату"
    )

print(f"\nУникальных дат: {n_dates}")
print(f"Период: {unique_dates[0].date()} - {unique_dates[-1].date()}")

train_end = unique_dates[train_cutoff - 1]
val_end = unique_dates[val_cutoff - 1]

print(f"\nCutoff даты:")
print(f"  Train: до {train_end.date()} ({train_cutoff} дат)")
print(f"  Val: {unique_dates[train_cutoff].date()} - {val_end.date()} ({val_cutoff - train_cutoff} дат)")
print(f"  Test (OOT): {unique_dates[val_cutoff].date()}+ ({n_dates - val_cutoff} дат)")

# Build the splits: train <= train_end < val <= val_end < test.
train_df = df_sorted[df_sorted[config.DATE_COLUMN] <= train_end].copy()
val_df = df_sorted[(df_sorted[config.DATE_COLUMN] > train_end) &
                   (df_sorted[config.DATE_COLUMN] <= val_end)].copy()
test_df = df_sorted[df_sorted[config.DATE_COLUMN] > val_end].copy()

# Per-split statistics.
for name, df in [('TRAIN', train_df), ('VAL', val_df), ('TEST (OOT)', test_df)]:
    churn_r = df[config.TARGET_COLUMN].mean()
    print(f"\n{name}:")
    print(f"  Записей: {len(df):,}")
    print(f"  Клиентов: {df['cli_code'].nunique():,}")
    print(f"  Период: {df[config.DATE_COLUMN].min().date()} - {df[config.DATE_COLUMN].max().date()}")
    print(f"  Churn rate: {churn_r:.4f} ({churn_r*100:.2f}%)")

# Sanity check: the splits must be strictly ordered in time.
assert train_df[config.DATE_COLUMN].max() < val_df[config.DATE_COLUMN].min(), \
    "train and validation periods overlap"
assert val_df[config.DATE_COLUMN].max() < test_df[config.DATE_COLUMN].min(), \
    "validation and test periods overlap"
print("\n✓ Temporal ordering verified - no data leakage")

print("="*80)

---
# 5. УДАЛЕНИЕ КЛИЕНТОВ С ПРОБЕЛАМИ (ПРОПУЩЕННЫМИ МЕСЯЦАМИ) В ИСТОРИИ НАБЛЮДЕНИЙ

In [None]:
# ====================================================================================
# GAP REMOVAL
# ====================================================================================

if config.REMOVE_GAPS:
    print("\n" + "="*80)
    print("УДАЛЕНИЕ КЛИЕНТОВ С ПРОБЕЛАМИ")
    print("="*80)

    print("\nАнализ пробелов в train...")

    unique_clients = train_df['cli_code'].unique()

    # Only the client key and the observation date are needed to detect gaps,
    # so work on a two-column frame instead of copying every feature column.
    gap_frame = train_df[['cli_code', config.DATE_COLUMN]].sort_values(
        ['cli_code', config.DATE_COLUMN]
    )

    # Consecutive calendar months differ by exactly 1 in this month index;
    # the per-client difference is therefore > 1 whenever a month is skipped.
    gap_frame['month_num'] = (
        gap_frame[config.DATE_COLUMN].dt.year * 12
        + gap_frame[config.DATE_COLUMN].dt.month
    )
    gap_frame['month_diff'] = gap_frame.groupby('cli_code')['month_num'].diff()
    gap_frame['is_gap'] = gap_frame['month_diff'] > 1

    gaps = gap_frame.groupby('cli_code').agg(
        max_gap=('month_diff', 'max'),
        total_gaps=('is_gap', 'sum'),
    ).reset_index()

    clients_with_gaps = gaps[gaps['max_gap'] > 1].reset_index(drop=True)

    del gap_frame, gaps
    gc.collect()

    # Guard against an empty train split when computing the share.
    gap_pct = len(clients_with_gaps) / len(unique_clients) * 100 if len(unique_clients) else 0.0
    print(f"\nКлиентов с пробелами: {len(clients_with_gaps):,} ({gap_pct:.2f}%)")

    if len(clients_with_gaps) > 0:
        bad_clients = set(clients_with_gaps['cli_code'])

        # Clients flagged on train are removed from all three splits.
        train_before = len(train_df)
        train_df = train_df[~train_df['cli_code'].isin(bad_clients)].copy()
        val_df = val_df[~val_df['cli_code'].isin(bad_clients)].copy()
        test_df = test_df[~test_df['cli_code'].isin(bad_clients)].copy()

        print(f"\nУдалено:")
        print(f"  Train: {train_before:,} → {len(train_df):,}")
        print(f"  Val: {len(val_df):,}")
        print(f"  Test: {len(test_df):,}")

        del clients_with_gaps, bad_clients
        gc.collect()

    print("="*80)

---
# 6. НОВОЕ: АНАЛИЗ КОРРЕЛЯЦИИ С ТАРГЕТОМ

Важно для документации банка (раздел 3.5.4)

In [None]:
# ====================================================================================
# КОРРЕЛЯЦИЯ С ТАРГЕТОМ
# ====================================================================================

print("\n" + "="*80)
print("АНАЛИЗ КОРРЕЛЯЦИИ ПРИЗНАКОВ С ТАРГЕТОМ")
print("="*80)

# Numeric features only; ID columns and the target itself are excluded.
non_feature_cols = set(config.ID_COLUMNS) | {config.TARGET_COLUMN}
numeric_cols = [
    c for c in train_df.select_dtypes(include=[np.number]).columns
    if c not in non_feature_cols
]

print(f"\nЧисловых признаков: {len(numeric_cols)}")

# Correlation of every feature with the target. Columns that cannot be
# correlated are reported instead of being silently dropped.
correlations = []
skipped_cols = []
for col in numeric_cols:
    try:
        corr = train_df[[col, config.TARGET_COLUMN]].corr().iloc[0, 1]
    except (TypeError, ValueError, IndexError) as exc:
        skipped_cols.append((col, type(exc).__name__))
        continue
    correlations.append({'feature': col, 'correlation': corr})

if skipped_cols:
    print(f"\n⚠️  Пропущено признаков (ошибка расчета корреляции): {len(skipped_cols)}")
    print(f"   {skipped_cols[:5]}...")

# Explicit columns keep the frame usable even when no feature was processed.
corr_df = pd.DataFrame(correlations, columns=['feature', 'correlation'])
corr_df['abs_correlation'] = corr_df['correlation'].abs()
corr_df = corr_df.sort_values('abs_correlation', ascending=False)

print(f"\nТОП-20 признаков по корреляции с таргетом:")
print(corr_df.head(20)[['feature', 'correlation']].to_string(index=False))

# Visualisation: the 20 largest and the 20 smallest signed correlations.
fig, axes = plt.subplots(1, 2, figsize=(16, 6))

# Top 20 positive
top_positive = corr_df.nlargest(20, 'correlation')
axes[0].barh(range(len(top_positive)), top_positive['correlation'].values, color='green', alpha=0.7)
axes[0].set_yticks(range(len(top_positive)))
axes[0].set_yticklabels(top_positive['feature'].values, fontsize=8)
axes[0].set_xlabel('Correlation')
axes[0].set_title('ТОП-20 Положительных Корреляций с Target', fontweight='bold')
axes[0].invert_yaxis()

# Top 20 negative
top_negative = corr_df.nsmallest(20, 'correlation')
axes[1].barh(range(len(top_negative)), top_negative['correlation'].values, color='red', alpha=0.7)
axes[1].set_yticks(range(len(top_negative)))
axes[1].set_yticklabels(top_negative['feature'].values, fontsize=8)
axes[1].set_xlabel('Correlation')
axes[1].set_title('ТОП-20 Отрицательных Корреляций с Target', fontweight='bold')
axes[1].invert_yaxis()

plt.tight_layout()
plt.savefig(config.FIGURES_DIR / '02_correlation_with_target.png', dpi=150, bbox_inches='tight')
plt.show()

print("\n✓ Сохранено: figures/02_correlation_with_target.png")

# Persist the table for the report.
corr_df.to_csv(config.OUTPUT_DIR / 'feature_target_correlations.csv', index=False)
print("✓ Сохранено: output/feature_target_correlations.csv")

print("="*80)

---
# 7. PREPROCESSING PIPELINE

In [None]:
# ====================================================================================
# PREPROCESSING PIPELINE
# ====================================================================================

class PreprocessingPipeline:
    """Preprocessing shared by all models; statistics are learned on train only.

    ``fit_transform`` learns, in this order, which columns are constant, the
    IQR clipping bounds, the imputation values and which highly correlated
    features to drop. ``transform`` re-applies exactly those decisions to
    another split and aligns its columns with the training result.

    All settings are read from the ``config`` object passed to the
    constructor (``ID_COLUMNS``, ``TARGET_COLUMN``, ``CATEGORICAL_FEATURES``,
    ``HANDLE_OUTLIERS``, ``OUTLIER_IQR_MULTIPLIER``,
    ``REMOVE_HIGH_CORRELATIONS``, ``CORRELATION_THRESHOLD``).
    """

    def __init__(self, config):
        self.config = config
        self.fitted_columns = None
        self.final_features = None
        self.constant_cols = []
        self.outlier_bounds = {}
        self.numeric_imputer = None
        self.categorical_imputer = None
        self.numeric_cols_for_imputation = []
        self.categorical_cols_for_imputation = []
        self.features_to_drop_corr = []

    def _protected_columns(self):
        """Return the ID and target columns, which are never treated as features."""
        return list(self.config.ID_COLUMNS) + [self.config.TARGET_COLUMN]

    def fit_transform(self, train_df):
        """Fit every preprocessing step on ``train_df`` and return the transformed copy.

        Calling this method again re-fits from scratch: state learned by a
        previous call is discarded.
        """
        print("\n" + "="*80)
        print("PREPROCESSING: FIT_TRANSFORM ON TRAIN")
        print("="*80)

        df = train_df.copy()

        # Start from a clean state so that repeated fits do not accumulate.
        self.final_features = None
        self.constant_cols = []
        self.outlier_bounds = {}
        self.features_to_drop_corr = []

        protected = self._protected_columns()
        self.fitted_columns = [c for c in df.columns if c not in protected]

        # 1. Remove constants
        df = self._remove_constants(df, fit=True)

        # 2. Handle outliers
        df = self._handle_outliers(df, fit=True)

        # 3. Handle missing
        df = self._handle_missing(df, fit=True)

        # 4. Remove correlations
        df = self._remove_correlations(df, fit=True)

        # Everything that is left and is not an ID/target column is a feature.
        self.final_features = [c for c in df.columns if c not in protected]

        print(f"\n✓ Preprocessing complete")
        print(f"  Features: {len(self.final_features)}")

        return df

    def transform(self, df, dataset_name='test'):
        """Apply the fitted steps to ``df`` and return the transformed copy.

        Raises:
            RuntimeError: if ``fit_transform`` has not been called yet.
        """
        if self.final_features is None:
            raise RuntimeError(
                "PreprocessingPipeline.transform() called before fit_transform()"
            )

        print(f"\nPreprocessing: {dataset_name}")

        df = df.copy()

        df = self._remove_constants(df, fit=False)
        df = self._handle_outliers(df, fit=False)
        df = self._handle_missing(df, fit=False)
        df = self._remove_correlations(df, fit=False)
        df = self._align_columns(df, dataset_name)

        print(f"  ✓ {dataset_name}: {df.shape}")

        return df

    def _remove_constants(self, df, fit):
        """Drop columns holding a single distinct value (NaN counts as a value)."""
        if fit:
            print("\n1. Removing constant columns...")
            protected = set(self._protected_columns())
            self.constant_cols = [
                col for col in df.columns
                if col not in protected and df[col].nunique(dropna=False) == 1
            ]

            if self.constant_cols:
                df = df.drop(columns=self.constant_cols)
                print(f"   Removed: {len(self.constant_cols)}")
        else:
            # Same columns as on train; absent ones are simply ignored.
            df = df.drop(columns=self.constant_cols, errors='ignore')
        return df

    def _handle_outliers(self, df, fit):
        """Clip monetary-looking numeric columns to IQR-based bounds learned on train."""
        cfg = self.config
        if not cfg.HANDLE_OUTLIERS:
            return df

        if fit:
            print("\n2. Handling outliers...")
            self.outlier_bounds = {}
            keywords = ('profit', 'income', 'expense', 'margin', 'provision',
                        'balance', 'assets', 'liabilities')
            numeric_dtypes = ('float64', 'float32', 'int64', 'int32')
            skip = set(self._protected_columns()) | set(cfg.CATEGORICAL_FEATURES)

            for col in df.columns:
                if col in skip or not any(kw in col.lower() for kw in keywords):
                    continue
                if df[col].dtype not in numeric_dtypes:
                    continue

                q1, q3 = df[col].quantile([0.25, 0.75])
                iqr = q3 - q1
                # A zero (or NaN) IQR would make both bounds equal and clip the
                # whole column to one constant value, so such columns are left
                # untouched.
                if not iqr > 0:
                    continue

                self.outlier_bounds[col] = {
                    'lower': q1 - cfg.OUTLIER_IQR_MULTIPLIER * iqr,
                    'upper': q3 + cfg.OUTLIER_IQR_MULTIPLIER * iqr
                }

        for col, bounds in self.outlier_bounds.items():
            if col in df.columns:
                df[col] = df[col].clip(lower=bounds['lower'], upper=bounds['upper'])

        if fit:
            print(f"   Clipped: {len(self.outlier_bounds)} columns")

        return df

    def _handle_missing(self, df, fit):
        """Impute numeric columns with the train median and categoricals with the train mode."""
        cfg = self.config
        if fit:
            print("\n3. Handling missing values...")
            protected = set(self._protected_columns())
            self.numeric_cols_for_imputation = [
                c for c in df.select_dtypes(include=[np.number]).columns
                if c not in protected
            ]
            self.categorical_cols_for_imputation = [
                c for c in cfg.CATEGORICAL_FEATURES if c in df.columns
            ]

            self.numeric_imputer = SimpleImputer(strategy='median')
            self.categorical_imputer = SimpleImputer(strategy='most_frequent')

            if self.numeric_cols_for_imputation:
                df[self.numeric_cols_for_imputation] = self.numeric_imputer.fit_transform(
                    df[self.numeric_cols_for_imputation]
                )

            if self.categorical_cols_for_imputation:
                df[self.categorical_cols_for_imputation] = self.categorical_imputer.fit_transform(
                    df[self.categorical_cols_for_imputation]
                )

            print(f"   Imputed: {len(self.numeric_cols_for_imputation)} numeric, "
                  f"{len(self.categorical_cols_for_imputation)} categorical")
        else:
            fitted_groups = (
                (self.numeric_cols_for_imputation, self.numeric_imputer),
                (self.categorical_cols_for_imputation, self.categorical_imputer),
            )
            for cols, imputer in fitted_groups:
                if not cols:
                    continue
                # The imputer expects exactly the columns it was fitted on.
                # A column absent from this split is added as all-missing so
                # that it receives the train statistic instead of failing.
                for col in cols:
                    if col not in df.columns:
                        df[col] = np.nan
                df[cols] = imputer.transform(df[cols])

        return df

    def _remove_correlations(self, df, fit):
        """Drop one feature from every pair whose absolute correlation exceeds the threshold."""
        cfg = self.config
        if not cfg.REMOVE_HIGH_CORRELATIONS:
            return df

        if fit:
            print("\n4. Removing high correlations...")
            self.features_to_drop_corr = []
            skip = set(self._protected_columns()) | set(cfg.CATEGORICAL_FEATURES)
            numeric = [c for c in df.select_dtypes(include=[np.number]).columns
                       if c not in skip]

            if len(numeric) > 1:
                corr = df[numeric].corr().abs()
                # Upper triangle only: of each correlated pair, the column that
                # comes later in the frame is the one that gets dropped.
                upper = corr.where(np.triu(np.ones(corr.shape), k=1).astype(bool))
                self.features_to_drop_corr = [
                    c for c in upper.columns
                    if (upper[c] > cfg.CORRELATION_THRESHOLD).any()
                ]

                if self.features_to_drop_corr:
                    df = df.drop(columns=self.features_to_drop_corr)
                    print(f"   Removed: {len(self.features_to_drop_corr)}")
        else:
            df = df.drop(columns=self.features_to_drop_corr, errors='ignore')

        return df

    def _align_columns(self, df, name):
        """Give ``df`` the training feature set: ID/target columns first, then features.

        Features missing from ``df`` are added and filled with 0; columns that
        are not training features are dropped. ``name`` is accepted for
        symmetry with ``transform`` and is not used.
        """
        cfg = self.config
        preserve = [c for c in cfg.ID_COLUMNS if c in df.columns]
        if cfg.TARGET_COLUMN in df.columns:
            preserve.append(cfg.TARGET_COLUMN)

        known = set(self.final_features)
        current = [c for c in df.columns if c not in preserve]
        missing = [c for c in self.final_features if c not in current]
        extra = [c for c in current if c not in known]

        for c in missing:
            df[c] = 0

        if extra:
            df = df.drop(columns=extra)

        order = preserve + self.final_features
        df = df[[c for c in order if c in df.columns]]

        return df

# Fit the pipeline on train, then reuse the fitted steps for the later splits
# (validation first, then the out-of-time test set).
pipeline = PreprocessingPipeline(config)
train_processed = pipeline.fit_transform(train_df)
val_processed, test_processed = (
    pipeline.transform(split_frame, split_label)
    for split_frame, split_label in ((val_df, 'validation'), (test_df, 'test (OOT)'))
)

print("\n" + "=" * 80)

---
# 8. РАЗДЕЛЕНИЕ ПО СЕГМЕНТАМ

**ВАЖНО:** После разделения удаляем segment_group из признаков!

In [None]:
# ====================================================================================
# SEGMENT SPLIT + УДАЛЕНИЕ SEGMENT_GROUP
# ====================================================================================

print("\n" + "=" * 80 + "\nРАЗДЕЛЕНИЕ ПО СЕГМЕНТАМ\n" + "=" * 80)

# Model 1: Small Business - filter each processed split by segment value.
seg1_train, seg1_val, seg1_test = (
    part.loc[part[config.SEGMENT_COLUMN].isin(config.SEGMENT_1_VALUES)].copy()
    for part in (train_processed, val_processed, test_processed)
)

# The segment column has done its job once the rows are filtered, so drop it.
if config.SEGMENT_COLUMN in seg1_train.columns:
    seg1_train, seg1_val, seg1_test = (
        part.drop(columns=[config.SEGMENT_COLUMN])
        for part in (seg1_train, seg1_val, seg1_test)
    )
    print(f"\n✓ УДАЛЕН {config.SEGMENT_COLUMN} из модели 1 (константа внутри сегмента)")

print(f"\n{config.SEGMENT_1_NAME}:")
for label, part in (('Train', seg1_train), ('Val', seg1_val), ('Test', seg1_test)):
    print(f"  {label}: {len(part):,} | Churn: {part[config.TARGET_COLUMN].mean()*100:.2f}%")

# Model 2: Middle + Large Business - same procedure with the second value list.
seg2_train, seg2_val, seg2_test = (
    part.loc[part[config.SEGMENT_COLUMN].isin(config.SEGMENT_2_VALUES)].copy()
    for part in (train_processed, val_processed, test_processed)
)

if config.SEGMENT_COLUMN in seg2_train.columns:
    seg2_train, seg2_val, seg2_test = (
        part.drop(columns=[config.SEGMENT_COLUMN])
        for part in (seg2_train, seg2_val, seg2_test)
    )
    print(f"\n✓ УДАЛЕН {config.SEGMENT_COLUMN} из модели 2")

print(f"\n{config.SEGMENT_2_NAME}:")
for label, part in (('Train', seg2_train), ('Val', seg2_val), ('Test', seg2_test)):
    print(f"  {label}: {len(part):,} | Churn: {part[config.TARGET_COLUMN].mean()*100:.2f}%")

# Closing note explaining why the segment column was removed.
print("\n".join([
    "\n" + "=" * 80,
    "ОБЪЯСНЕНИЕ: segment_group удален, так как после разделения данных",
    "он является константой внутри каждой модели и не несет информации.",
    "Он был полезен только для разделения на два сегмента.",
    "=" * 80,
]))

---
# 9. НОВОЕ: ВСПОМОГАТЕЛЬНЫЕ ФУНКЦИИ (PSI, МЕТРИКИ ПО ПЕРЦЕНТИЛЯМ, ПОДГОТОВКА ДАННЫХ, ВЕСА КЛАССОВ, ПОРОГ)

In [None]:
# ====================================================================================
# HELPER FUNCTIONS
# ====================================================================================

def calculate_psi(expected, actual, buckets=10):
    """
    Calculate the Population Stability Index (PSI) between two samples.

    Bin edges are the ``buckets`` quantiles of ``expected`` (duplicate edges
    are merged); the outermost bins are open-ended, and bins are right-closed.
    Missing values are ignored in both samples. A bin that is empty in one of
    the samples gets a share of 0.001 so that the logarithm stays finite.

    Interpretation:
        PSI < 0.1: no significant change
        0.1 <= PSI < 0.2: moderate change
        PSI >= 0.2: significant change (the model should be reviewed)

    Args:
        expected: reference sample (array-like or pandas Series of numbers).
        actual: sample compared against the reference.
        buckets: number of quantile bins built from ``expected``.

    Returns:
        The PSI value as a float.

    Raises:
        ValueError: if ``buckets`` is smaller than 1 or if either sample has
            no non-missing values.
    """
    if buckets < 1:
        raise ValueError("buckets must be at least 1")

    # Plain float arrays make the function work for lists, numpy arrays and
    # pandas Series alike.
    expected = np.asarray(expected, dtype=float).ravel()
    actual = np.asarray(actual, dtype=float).ravel()
    expected = expected[~np.isnan(expected)]
    actual = actual[~np.isnan(actual)]
    if expected.size == 0 or actual.size == 0:
        raise ValueError("expected and actual must contain non-missing values")

    # Interior quantile edges only: the first and last bins extend to -inf/+inf.
    quantile_levels = np.linspace(0, 100, buckets + 1)
    inner_edges = np.unique(np.percentile(expected, quantile_levels)[1:-1])
    n_bins = inner_edges.size + 1

    # side='left' puts a value equal to an edge into the bin ending at that edge.
    expected_bins = np.searchsorted(inner_edges, expected, side='left')
    actual_bins = np.searchsorted(inner_edges, actual, side='left')

    expected_percents = np.bincount(expected_bins, minlength=n_bins) / expected.size
    actual_percents = np.bincount(actual_bins, minlength=n_bins) / actual.size

    # Replace empty bins with a small share to avoid log(0) and division by zero.
    empty_bin_share = 0.001
    expected_percents = np.where(expected_percents == 0, empty_bin_share, expected_percents)
    actual_percents = np.where(actual_percents == 0, empty_bin_share, actual_percents)

    psi_value = np.sum(
        (actual_percents - expected_percents) * np.log(actual_percents / expected_percents)
    )

    return float(psi_value)


def calculate_decile_table(y_true, y_pred_proba, n_deciles=10):
    """
    Build a table of metrics per score bucket (decile by default).

    Rows are ordered from the highest-scored bucket (``percentile == 1``)
    downwards. When tied scores force quantile edges to be merged, fewer than
    ``n_deciles`` buckets are produced; they are still numbered 1..k without
    gaps.

    Args:
        y_true: binary labels (1 = event), array-like.
        y_pred_proba: predicted scores, array-like of the same length. The two
            inputs are matched by position.
        n_deciles: requested number of quantile buckets.

    Returns:
        DataFrame with columns: percentile, count, target_count, min_prob,
        max_prob, target_rate, count_cum, target_count_cum, precision_cum,
        recall_cum, lift.
    """
    # Positional pairing (as the sklearn metrics do) instead of index alignment.
    df = pd.DataFrame({
        'y_true': np.asarray(y_true).ravel(),
        'y_pred_proba': np.asarray(y_pred_proba).ravel()
    })

    # Highest scores first.
    df = df.sort_values('y_pred_proba', ascending=False).reset_index(drop=True)

    # Quantile bucket codes: 0 = lowest scores. Duplicate edges are merged, so
    # the number of buckets actually produced may be below n_deciles.
    score_bin = pd.qcut(df['y_pred_proba'], q=n_deciles, labels=False, duplicates='drop')
    n_bins = int(score_bin.max()) + 1
    # Reverse using the real bucket count so that 1 is always the top bucket.
    df['decile'] = n_bins - score_bin

    # Aggregate per bucket (groupby sorts by bucket number, i.e. best first).
    decile_table = df.groupby('decile').agg(
        count=('y_true', 'size'),
        target_count=('y_true', 'sum'),
        min_prob=('y_pred_proba', 'min'),
        max_prob=('y_pred_proba', 'max')
    ).reset_index()

    decile_table['target_rate'] = decile_table['target_count'] / decile_table['count']

    # Cumulative counts from the top bucket downwards.
    decile_table['count_cum'] = decile_table['count'].cumsum()
    decile_table['target_count_cum'] = decile_table['target_count'].cumsum()

    # Cumulative precision: share of events among all rows selected so far.
    decile_table['precision_cum'] = decile_table['target_count_cum'] / decile_table['count_cum']

    # Cumulative recall: share of all events captured so far.
    total_targets = df['y_true'].sum()
    decile_table['recall_cum'] = decile_table['target_count_cum'] / total_targets

    # Lift: bucket event rate relative to the overall event rate.
    baseline_rate = total_targets / len(df)
    decile_table['lift'] = decile_table['target_rate'] / baseline_rate

    decile_table = decile_table.rename(columns={'decile': 'percentile'})

    return decile_table


def prepare_data_for_model(df, categorical_features, exclude_cols, for_catboost=False,
                           encoders=None):
    """
    Split a processed frame into features and target and encode categoricals.

    Args:
        df: processed frame; the target is read from ``config.TARGET_COLUMN``
            (module-level ``config``) when that column is present.
        categorical_features: names of the categorical feature columns.
        exclude_cols: columns that must not be used as features.
        for_catboost: if True, categoricals are converted to strings and their
            positional indices are returned (CatBoost handles them natively);
            if False, categoricals are label-encoded to integers for
            XGBoost/LightGBM.
        encoders: optional dict ``{column: fitted LabelEncoder}`` as returned
            by an earlier call on the training split. When given, these
            encoders are reused so that the same category gets the same code in
            every split; categories unseen during fitting are encoded as -1.
            Columns without an entry get a newly fitted encoder. Ignored when
            ``for_catboost`` is True.

    Returns:
        ``(X, y, cat_indices)`` when ``for_catboost`` is True, otherwise
        ``(X, y, encoders)``. ``y`` is None when the target column is absent.
    """
    feature_cols = [c for c in df.columns if c not in exclude_cols]

    X = df[feature_cols].copy()
    y = df[config.TARGET_COLUMN].copy() if config.TARGET_COLUMN in df.columns else None

    if for_catboost:
        # CatBoost expects string categories; the text 'nan' marks a missing value.
        for cat in categorical_features:
            if cat in X.columns:
                X[cat] = X[cat].astype(str).replace('nan', 'missing')

        # Positional indices of the categorical columns.
        cat_indices = [i for i, c in enumerate(feature_cols) if c in categorical_features]
        return X, y, cat_indices

    # Label encoding for XGBoost/LightGBM. Copy the mapping so that the
    # caller's dict is not modified.
    fitted_encoders = {} if encoders is None else dict(encoders)
    for cat in categorical_features:
        if cat not in X.columns:
            continue

        values = X[cat].astype(str)
        le = fitted_encoders.get(cat)
        if le is None:
            le = LabelEncoder()
            X[cat] = le.fit_transform(values)
            fitted_encoders[cat] = le
        else:
            # Reuse the codes learned earlier; LabelEncoder.transform would
            # raise on unseen labels, so map them to -1 instead.
            code_by_label = {label: code for code, label in enumerate(le.classes_)}
            X[cat] = values.map(code_by_label).fillna(-1).astype(int)

    return X, y, fitted_encoders


def calculate_class_weights(y):
    """Return per-sample balancing weights plus the weight of class 0 and class 1.

    Each class weight is ``n_samples / (2 * class_count)``. Samples whose label
    is neither 0 nor 1 keep a weight of 1.
    """
    total = len(y)
    is_negative = (y == 0)
    is_positive = (y == 1)

    weight_0 = total / (2 * is_negative.sum())
    weight_1 = total / (2 * is_positive.sum())

    # Positive rows get weight_1, negative rows weight_0, anything else 1.0.
    weights = np.where(is_positive, weight_1, np.where(is_negative, weight_0, 1.0))

    return weights, weight_0, weight_1


def find_optimal_threshold(y_true, y_pred_proba, metric='f1'):
    """Find the probability cutoff that maximises the chosen metric.

    Thresholds from 0.10 up to (but not including) 0.90 are scanned in steps
    of 0.01; on ties the lowest threshold wins.

    Args:
        y_true: binary labels.
        y_pred_proba: predicted probabilities of the positive class (array-like).
        metric: one of ``'f1'``, ``'recall'`` or ``'precision'``.

    Returns:
        Tuple ``(best_threshold, best_score)``.

    Raises:
        ValueError: if ``metric`` is not one of the supported names.
    """
    supported_metrics = ('f1', 'recall', 'precision')
    if metric not in supported_metrics:
        raise ValueError(
            f"Unsupported metric '{metric}'; expected one of {supported_metrics}"
        )

    scorer = {
        'f1': f1_score,
        'recall': recall_score,
        'precision': precision_score,
    }[metric]

    # An array makes the comparison below work for lists and Series alike.
    y_pred_proba = np.asarray(y_pred_proba)
    thresholds = np.arange(0.1, 0.9, 0.01)

    scores = [
        scorer(y_true, (y_pred_proba >= threshold).astype(int), zero_division=0)
        for threshold in thresholds
    ]

    optimal_idx = int(np.argmax(scores))
    return thresholds[optimal_idx], scores[optimal_idx]


def calculate_all_metrics(y_true, y_pred_proba, y_pred, threshold, dataset_name=''):
    """Compute ranking and classification metrics for one dataset.

    Args:
        y_true: binary labels (0/1).
        y_pred_proba: predicted probabilities of the positive class.
        y_pred: hard 0/1 predictions obtained with ``threshold``.
        threshold: cutoff that produced ``y_pred``; stored in the result as is.
        dataset_name: label stored under the ``'dataset'`` key.

    Returns:
        Dict with keys dataset, threshold, roc_auc, pr_auc, accuracy,
        precision, recall, f1, gini, tn, fp, fn, tp.
    """
    metrics = {
        'dataset': dataset_name,
        'threshold': threshold,
        'roc_auc': roc_auc_score(y_true, y_pred_proba),
        'pr_auc': average_precision_score(y_true, y_pred_proba),
        'accuracy': accuracy_score(y_true, y_pred),
        'precision': precision_score(y_true, y_pred, zero_division=0),
        'recall': recall_score(y_true, y_pred, zero_division=0),
        'f1': f1_score(y_true, y_pred, zero_division=0),
    }
    metrics['gini'] = 2 * metrics['roc_auc'] - 1

    # Fixing the labels keeps the matrix 2x2 even if only one class occurs in
    # both y_true and y_pred; without it the matrix would shrink to 1x1.
    cm = confusion_matrix(y_true, y_pred, labels=[0, 1])
    metrics['tn'], metrics['fp'], metrics['fn'], metrics['tp'] = cm.ravel()

    return metrics

# Summary of the helpers defined in this cell, in definition order.
print("\n✓ Helper functions определены")
print("  - calculate_psi: PSI расчет")
print("  - calculate_decile_table: Метрики по перцентилям")
print("  - prepare_data_for_model: Подготовка данных для моделей")
print("  - calculate_class_weights: Веса классов")
print("  - find_optimal_threshold: Оптимальный порог")
print("  - calculate_all_metrics: Все метрики")