# ОТЧЕТ ДЛЯ ДОКУМЕНТАЦИИ МОДЕЛИ CHURN PREDICTION

**Назначение**: Генерация всех таблиц, графиков и метрик для заполнения документации.

**Структура**:
1. Общая подготовка данных (preprocessing для всех сегментов)
2. **SEGMENT 1 (Small Business)** - все разделы документации
3. **SEGMENT 2 (Middle + Large Business)** - все разделы документации

**Результаты сохраняются в**:
- `output/` - CSV таблицы
- `figures/` - графики
- `models/` - обученные модели

---
# ЧАСТЬ 0: ИМПОРТЫ И КОНФИГУРАЦИЯ

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import warnings
warnings.filterwarnings('ignore')

# ML libraries
from sklearn.impute import SimpleImputer
from sklearn.metrics import roc_auc_score, roc_curve, confusion_matrix, classification_report
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
import shap
import pickle

# Config
from config import Config
# Project-wide settings object; later cells read column names and flags from it.
config = Config()

# Make sure every output folder exists before any cell writes into it.
for _dir_name in ('output', 'figures', 'models'):
    Path(_dir_name).mkdir(exist_ok=True)

# Plot styling: the matplotlib style sheet goes first, the seaborn palette second,
# so the palette is not overwritten by the style sheet.
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette('husl')

print("✓ Импорты и конфигурация загружены")

---
# ЧАСТЬ 1: ЗАГРУЗКА И ОБЩИЙ PREPROCESSING

Этот preprocessing применяется ко ВСЕМ данным до разделения по сегментам.

## 1.1. Загрузка данных

In [None]:
_banner = "=" * 80
print(_banner)
print("ЗАГРУЗКА ИСХОДНЫХ ДАННЫХ")
print(_banner)

# Raw modelling table; every later step derives from this frame.
df_full = pd.read_parquet('data/churn_train_ul.parquet')

_n_rows, _n_cols = df_full.shape
_obs_points = df_full['observation_point']

print(f"\nИсходные данные:")
print(f"  Строк: {_n_rows:,}")
print(f"  Колонок: {_n_cols}")
print(f"  Период: {_obs_points.min()} - {_obs_points.max()}")
print(f"  Churn rate: {df_full[config.TARGET_COLUMN].mean()*100:.2f}%")

# Last expression of the cell: the notebook renders the first rows.
df_full.head()

## 1.2. Temporal Split (70/15/15)

In [None]:
print("\n" + "="*80)
print("TEMPORAL SPLIT")
print("="*80)

# Work on real timestamps and in chronological order.
df_full['observation_point'] = pd.to_datetime(df_full['observation_point'])
df_sorted = df_full.sort_values('observation_point').reset_index(drop=True)

# The split is made on distinct observation dates, not on rows:
# the first 70% of dates go to train, the next 15% to validation, the rest to test.
unique_dates = sorted(df_sorted['observation_point'].unique())
n_dates = len(unique_dates)

train_cutoff_idx = int(n_dates * 0.70)
val_cutoff_idx = int(n_dates * 0.85)

# Cut-offs are the last date that still belongs to train / validation.
train_cutoff = unique_dates[train_cutoff_idx - 1]
val_cutoff = unique_dates[val_cutoff_idx - 1]

_obs_dates = df_sorted['observation_point']
_train_mask = _obs_dates <= train_cutoff
_val_mask = (_obs_dates > train_cutoff) & (_obs_dates <= val_cutoff)
_test_mask = _obs_dates > val_cutoff

train_df = df_sorted[_train_mask].copy()
val_df = df_sorted[_val_mask].copy()
test_df = df_sorted[_test_mask].copy()

# Same three-line summary for every split.
for _title, _part in (
    ("TRAIN", train_df),
    ("VALIDATION", val_df),
    ("TEST (Out-of-Time)", test_df),
):
    _part_dates = _part['observation_point']
    print(f"\n{_title}:")
    print(f"  Период: {_part_dates.min()} - {_part_dates.max()}")
    print(f"  Записей: {len(_part):,}")
    print(f"  Churn rate: {_part[config.TARGET_COLUMN].mean()*100:.2f}%")

## 1.3. Gap Detection

In [None]:
# Section banner for the gap-detection step.
print("\n" + "="*80)
print("GAP DETECTION")
print("="*80)

def remove_gaps(df, expected_months=6, client_col='cli_code', date_col='observation_point'):
    """Keep only clients that have exactly ``expected_months`` rows in ``df``.

    Rows are counted per client as the number of non-null ``date_col`` values;
    a client is kept only when that count equals ``expected_months`` (clients
    with fewer *or more* rows are dropped). The input frame is not modified.

    Args:
        df: Frame with one row per client observation.
        expected_months: Required number of observations per client
            (default 6, the previously hard-coded value).
        client_col: Name of the client identifier column.
        date_col: Name of the observation-date column being counted.

    Returns:
        A copy of ``df`` restricted to the clients that passed the check.

    NOTE(review): the count is taken inside whatever frame is passed in. When
    this is called on each temporal split separately, a client whose history
    straddles a split boundary, or a split that spans a number of months other
    than ``expected_months``, loses those clients entirely — confirm that this
    is the intended definition of a "gap".
    """
    client_counts = df.groupby(client_col)[date_col].count()
    valid_clients = client_counts[client_counts == expected_months].index
    df_clean = df[df[client_col].isin(valid_clients)].copy()
    removed = len(df) - len(df_clean)
    print(f"  Удалено записей с пропусками: {removed:,}")
    return df_clean

# Apply the per-client completeness filter to each split independently,
# in train -> validation -> test order.
train_df, val_df, test_df = [
    remove_gaps(_part) for _part in (train_df, val_df, test_df)
]

print(f"\nПосле gap detection:")
for _label, _part in (("Train", train_df), ("Val", val_df), ("Test", test_df)):
    print(f"  {_label}: {len(_part):,}")

## 1.4. Preprocessing Pipeline

Единый preprocessing для обеспечения:
- Статистической стабильности
- Консистентности в продакшене
- Сравнимости моделей

In [None]:
class PreprocessingPipeline:
    """Preprocessing that is fitted on the training split and replayed on later splits.

    Steps, in order: drop constant columns, clip numeric outliers to IQR fences,
    impute missing values, drop highly correlated numeric features. Everything
    learned in ``fit_transform`` is stored on the instance and reused by
    ``transform``.

    All settings are read from the ``config`` object passed to the constructor:
    ``ID_COLUMNS``, ``TARGET_COLUMN``, ``CATEGORICAL_FEATURES``,
    ``REMOVE_HIGH_CORRELATIONS`` and ``CORRELATION_THRESHOLD``.
    """

    def __init__(self, config):
        self.config = config
        # Feature columns seen at fit time / kept after all steps.
        self.fitted_columns = None
        self.final_features = None
        # State learned by the individual steps.
        self.constant_cols = []
        self.outlier_bounds = {}
        self.numeric_imputer = None
        self.categorical_imputer = None
        self.numeric_cols_for_imputation = []
        self.categorical_cols_for_imputation = []
        self.features_to_drop_corr = []

    def _protected_columns(self):
        """Return ID columns plus the target: columns no step may treat as features."""
        return list(self.config.ID_COLUMNS) + [self.config.TARGET_COLUMN]

    def _clippable_numeric(self, df):
        """Return numeric columns that are neither protected nor declared categorical."""
        skipped = self._protected_columns() + list(self.config.CATEGORICAL_FEATURES)
        return [c for c in df.select_dtypes(include=[np.number]).columns
                if c not in skipped]

    def fit_transform(self, train_df):
        """Learn every step on ``train_df`` and return the transformed copy.

        Calling this again re-learns all state from scratch.
        """
        print("\n" + "="*80)
        print("PREPROCESSING: FIT_TRANSFORM ON TRAIN")
        print("="*80)

        df = train_df.copy()
        protected = self._protected_columns()

        self.fitted_columns = [c for c in df.columns if c not in protected]

        df = self._remove_constants(df, fit=True)
        df = self._handle_outliers(df, fit=True)
        df = self._handle_missing(df, fit=True)
        df = self._remove_correlations(df, fit=True)

        self.final_features = [c for c in df.columns if c not in protected]

        print(f"\n✓ Preprocessing complete")
        print(f"  Final features: {len(self.final_features)}")

        return df

    def transform(self, df, name=''):
        """Apply the fitted steps to a validation/test frame and align its columns.

        Raises:
            RuntimeError: if ``fit_transform`` has not been called yet.
        """
        if self.final_features is None:
            raise RuntimeError(
                "PreprocessingPipeline.transform() called before fit_transform()"
            )
        df = df.copy()
        df = self._remove_constants(df, fit=False)
        df = self._handle_outliers(df, fit=False)
        df = self._handle_missing(df, fit=False)
        df = self._remove_correlations(df, fit=False)
        df = self._align_columns(df, name)
        return df

    def _remove_constants(self, df, fit):
        """Drop columns with a single distinct value (NaN counts as a value)."""
        if fit:
            print("\n1. Removing constant columns...")
            protected = self._protected_columns()
            # Rebuilt on every fit so that re-fitting does not accumulate duplicates.
            self.constant_cols = [
                col for col in df.columns
                if col not in protected and df[col].nunique(dropna=False) == 1
            ]

            if self.constant_cols:
                df = df.drop(columns=self.constant_cols)
                print(f"   Removed: {len(self.constant_cols)}")
            else:
                print(f"   ✓ No constant columns found")
        else:
            df = df.drop(columns=[c for c in self.constant_cols if c in df.columns])
        return df

    def _handle_outliers(self, df, fit):
        """Clip numeric features to [Q1 - 1.5*IQR, Q3 + 1.5*IQR] learned at fit time.

        The passed frame is modified column by column and returned.
        """
        if fit:
            print(f"\n2. Handling outliers (IQR clipping)...")
            # Start from a clean slate so bounds from an earlier fit cannot linger.
            self.outlier_bounds = {}

            for col in self._clippable_numeric(df):
                Q1 = df[col].quantile(0.25)
                Q3 = df[col].quantile(0.75)
                IQR = Q3 - Q1
                # NOTE(review): when Q1 == Q3 both fences coincide and the clip
                # below collapses the column to a single value — confirm this is
                # intended for sparse / flag-like numeric features.
                self.outlier_bounds[col] = (Q1 - 1.5 * IQR, Q3 + 1.5 * IQR)

            print(f"   Clipped: {len(self.outlier_bounds)} columns (IQR × 1.5)")

        for col, (lower, upper) in self.outlier_bounds.items():
            if col in df.columns:
                df[col] = df[col].clip(lower, upper)

        return df

    def _handle_missing(self, df, fit):
        """Impute numeric columns with the median and categorical ones with the mode."""
        if fit:
            print(f"\n3. Handling missing values...")

            protected = self._protected_columns()
            self.numeric_cols_for_imputation = [
                c for c in df.select_dtypes(include=[np.number]).columns
                if c not in protected
            ]

            self.categorical_cols_for_imputation = [
                c for c in self.config.CATEGORICAL_FEATURES if c in df.columns
            ]

            if self.numeric_cols_for_imputation:
                self.numeric_imputer = SimpleImputer(strategy='median')
                df[self.numeric_cols_for_imputation] = self.numeric_imputer.fit_transform(
                    df[self.numeric_cols_for_imputation]
                )

            if self.categorical_cols_for_imputation:
                self.categorical_imputer = SimpleImputer(strategy='most_frequent')
                df[self.categorical_cols_for_imputation] = self.categorical_imputer.fit_transform(
                    df[self.categorical_cols_for_imputation]
                )

            print(f"   Imputed: {len(self.numeric_cols_for_imputation)} numeric, {len(self.categorical_cols_for_imputation)} categorical")
        else:
            # NOTE(review): the imputers are applied to whichever fitted columns are
            # present; presumably they expect the same column set as at fit time —
            # confirm behaviour if an incoming frame can lack some of them.
            if self.numeric_cols_for_imputation and self.numeric_imputer:
                cols = [c for c in self.numeric_cols_for_imputation if c in df.columns]
                if cols:
                    df[cols] = self.numeric_imputer.transform(df[cols])

            if self.categorical_cols_for_imputation and self.categorical_imputer:
                cols = [c for c in self.categorical_cols_for_imputation if c in df.columns]
                if cols:
                    df[cols] = self.categorical_imputer.transform(df[cols])

        return df

    def _remove_correlations(self, df, fit):
        """Drop the later feature of every pair whose |correlation| exceeds the threshold."""
        if not self.config.REMOVE_HIGH_CORRELATIONS:
            return df

        threshold = self.config.CORRELATION_THRESHOLD

        if fit:
            print(f"\n4. Removing high correlations (threshold={threshold})...")
            # Reset so a re-fit with fewer than two numeric columns keeps nothing stale.
            self.features_to_drop_corr = []
            numeric = self._clippable_numeric(df)

            if len(numeric) > 1:
                corr = df[numeric].corr().abs()
                # Strict upper triangle: each pair is inspected once and the column
                # that appears later in the frame is the one dropped.
                upper = corr.where(np.triu(np.ones(corr.shape), k=1).astype(bool))
                self.features_to_drop_corr = [c for c in upper.columns
                                              if any(upper[c] > threshold)]

                if self.features_to_drop_corr:
                    df = df.drop(columns=self.features_to_drop_corr)
                    print(f"   Removed: {len(self.features_to_drop_corr)} features")
                else:
                    print(f"   ✓ No highly correlated features found")
        else:
            df = df.drop(columns=[c for c in self.features_to_drop_corr if c in df.columns])

        return df

    def _align_columns(self, df, name):
        """Force the fitted feature set and order onto ``df``.

        ID and target columns present in ``df`` are kept in front; features that
        are missing are created and filled with 0; unknown columns are dropped.
        """
        if self.final_features is None:
            raise RuntimeError(
                "PreprocessingPipeline is not fitted: final_features is not set"
            )

        preserve = [c for c in self.config.ID_COLUMNS if c in df.columns]
        if self.config.TARGET_COLUMN in df.columns:
            preserve.append(self.config.TARGET_COLUMN)

        current = [c for c in df.columns if c not in preserve]
        missing = [c for c in self.final_features if c not in current]
        extra = [c for c in current if c not in self.final_features]

        for col in missing:
            df[col] = 0

        if extra:
            df = df.drop(columns=extra)

        order = preserve + self.final_features
        df = df[[c for c in order if c in df.columns]]

        print(f"  ✓ {name}: {df.shape}")

        return df

print("✓ PreprocessingPipeline класс определен")

## 1.5. Применение Preprocessing

In [None]:
pipeline = PreprocessingPipeline(config)

# Learn every step on train only, then replay it on validation and test.
train_processed = pipeline.fit_transform(train_df)

print("\nPreprocessing: validation")
val_processed = pipeline.transform(val_df, name='validation')

print("\nPreprocessing: test (OOT)")
test_processed = pipeline.transform(test_df, name='test (OOT)')

# Recap of what the fitted pipeline learned, one output line per entry.
_summary_lines = [
    "\n" + "="*80,
    "PREPROCESSING SUMMARY",
    "="*80,
    f"\nШаги preprocessing:",
    f"  1. Константные колонки удалено: {len(pipeline.constant_cols)}",
    f"  2. Выбросы обработано (IQR clipping): {len(pipeline.outlier_bounds)} колонок",
    f"  3. Пропуски заполнено:",
    f"     - Числовых: {len(pipeline.numeric_cols_for_imputation)}",
    f"     - Категориальных: {len(pipeline.categorical_cols_for_imputation)}",
    f"  4. Коррелирующих признаков удалено: {len(pipeline.features_to_drop_corr)}",
    f"\nИтоговое количество признаков: {len(pipeline.final_features)}",
]
for _line in _summary_lines:
    print(_line)

## 1.6. Разделение по сегментам

In [None]:
print("\n" + "="*80)
print("РАЗДЕЛЕНИЕ ПО СЕГМЕНТАМ")
print("="*80)


def _rows_of_segment(segment_values):
    """Return [train, val, test] copies restricted to the given segment values."""
    return [
        frame[frame[config.SEGMENT_COLUMN].isin(segment_values)].copy()
        for frame in (train_processed, val_processed, test_processed)
    ]


def _drop_if_present(frame, columns):
    """Drop those of ``columns`` that actually exist in ``frame``."""
    return frame.drop(columns=[c for c in columns if c in frame.columns])


def _report_segment(header, train, val, test):
    """Print shape and churn rate of the three splits of one segment."""
    print(header)
    for label, frame in (("Train", train), ("Val", val), ("Test", test)):
        print(f"  {label}: {frame.shape} | Churn: {frame[config.TARGET_COLUMN].mean()*100:.2f}%")


# SEGMENT 1: Small Business
seg1_train, seg1_val, seg1_test = _rows_of_segment(config.SEGMENT_1_VALUES)

# Drop the segment column (dropped as single-valued within this segment),
# the ID columns and the calendar features.
temporal_features = ['obs_year', 'obs_month', 'obs_quarter']
cols_to_drop_seg1 = (
    [config.SEGMENT_COLUMN]
    + [c for c in config.ID_COLUMNS if c in seg1_train.columns]
    + temporal_features
)
seg1_train = _drop_if_present(seg1_train, cols_to_drop_seg1)
seg1_val = _drop_if_present(seg1_val, cols_to_drop_seg1)
seg1_test = _drop_if_present(seg1_test, cols_to_drop_seg1)

_report_segment(f"\nSEGMENT 1: {config.SEGMENT_1_NAME}", seg1_train, seg1_val, seg1_test)

# SEGMENT 2: Middle + Large Business
seg2_train, seg2_val, seg2_test = _rows_of_segment(config.SEGMENT_2_VALUES)

# The segment column is kept here as a feature; only IDs and calendar features go.
cols_to_drop_seg2 = [c for c in config.ID_COLUMNS if c in seg2_train.columns] + temporal_features
seg2_train = _drop_if_present(seg2_train, cols_to_drop_seg2)
seg2_val = _drop_if_present(seg2_val, cols_to_drop_seg2)
seg2_test = _drop_if_present(seg2_test, cols_to_drop_seg2)

# Label-encode the segment column. Values absent from the mapping become NaN.
# NOTE(review): the mapping keys are hard-coded — verify they match config.SEGMENT_2_VALUES.
segment_mapping = {'MIDDLE_BUSINESS': 0, 'LARGE_BUSINESS': 1}
for _frame in (seg2_train, seg2_val, seg2_test):
    _frame[config.SEGMENT_COLUMN] = _frame[config.SEGMENT_COLUMN].map(segment_mapping)

_report_segment(f"\nSEGMENT 2: {config.SEGMENT_2_NAME}", seg2_train, seg2_val, seg2_test)

print("\n✓ Данные готовы для моделирования")

---
---
# ЧАСТЬ 2: SEGMENT 1 - SMALL BUSINESS

**Все разделы документации для Segment 1**

## 2.1. Раздел 2.3 - Анализ потока (Статистика по выборке)

In [None]:
print("\n" + "="*80)
print("SEGMENT 1: АНАЛИЗ ПОТОКА (Section 2.3)")
print("="*80)


def _date_span(frame):
    """Format the first and last observation date of ``frame`` as 'min - max'."""
    dates = frame['observation_point']
    return f"{dates.min().date()} - {dates.max().date()}"


# All three splits of segment 1 stacked together.
seg1_full = pd.concat([seg1_train, seg1_val, seg1_test], axis=0)

# Periods are taken from the unsegmented split frames / the full raw frame.
train_period = _date_span(train_df)
test_period = _date_span(test_df)

_seg1_target = seg1_full[config.TARGET_COLUMN]
stats_seg1 = {
    'Период выборки': _date_span(df_full),
    'Количество наблюдений в контрактах': f"{len(seg1_full):,}",
    'Количество дефолтов в контрактах': f"{int(_seg1_target.sum()):,}",
    'Уровень фактической целевой переменной': f"{_seg1_target.mean()*100:.2f}%"
}

# Markdown table, ready to paste into the documentation.
print("\nТаблица для документации:")
print("\n| Наименование показателя | Значение |")
print("| :---- | :---- |")
print("\n".join(f"| {key} | {value} |" for key, value in stats_seg1.items()))

# Persist as a two-column CSV (metric name in the index, value in one column).
pd.DataFrame([stats_seg1]).T.to_csv('output/seg1_flow_analysis.csv', header=['Значение'])
print("\n✓ Сохранено: output/seg1_flow_analysis.csv")

## 2.2. Раздел 3.3 - Результаты сбора ABT

In [None]:
print("\n" + "="*80)
print("SEGMENT 1: СТАТИСТИКА ABT (Section 3.3)")
print("="*80)

# Numeric predictors exclude the target; everything non-numeric is counted separately.
numeric_cols = [
    c for c in seg1_full.select_dtypes(include=[np.number]).columns.tolist()
    if c != config.TARGET_COLUMN
]
non_numeric_cols = seg1_full.select_dtypes(exclude=[np.number]).columns.tolist()

_n_events = int(seg1_full[config.TARGET_COLUMN].sum())
# NOTE(review): the target name in the third entry is a literal, not config.TARGET_COLUMN —
# confirm the two agree.
abt_stats_seg1 = {
    'Количество наблюдений': f"{len(seg1_full):,}",
    'Количество событий': f"{_n_events:,}",
    'Количество целевых переменных': '1 (target_churn_3m)',
    'Количество числовых предикторов': len(numeric_cols),
    'Количество не числовых предикторов': len(non_numeric_cols)
}

# Markdown table, ready to paste into the documentation.
print("\nТаблица для документации:")
print("\n| Наименование показателя | Значение |")
print("| :---- | :---- |")
print("\n".join(f"| {key} | {value} |" for key, value in abt_stats_seg1.items()))

# Persist as a two-column CSV.
pd.DataFrame([abt_stats_seg1]).T.to_csv('output/seg1_abt_statistics.csv', header=['Значение'])
print("\n✓ Сохранено: output/seg1_abt_statistics.csv")

## 2.3. Раздел 3.5.1-3.5.2 - Обработка данных

Информация уже представлена в общем preprocessing выше.

In [None]:
# Text for documentation sections 3.5.1-3.5.2; the counts come from the fitted pipeline.
_processing_report = [
    "\n" + "="*80,
    "SEGMENT 1: ОБРАБОТКА ДАННЫХ (Section 3.5.1-3.5.2)",
    "="*80,
    "\n3.5.1. Удаление и замена пропущенных значений:",
    "  - Метод: Median Imputation для числовых признаков",
    "  - Метод: Most Frequent для категориальных",
    f"  - Обработано числовых: {len(pipeline.numeric_cols_for_imputation)}",
    f"  - Обработано категориальных: {len(pipeline.categorical_cols_for_imputation)}",
    "\n3.5.2. Обработка категориальных значений:",
    "  - segment_group: УДАЛЕНА (только одно значение SMALL_BUSINESS)",
    "  - Временные признаки (obs_year, obs_month, obs_quarter): УДАЛЕНЫ (высокий PSI)",
]
for _line in _processing_report:
    print(_line)

## 2.4. Раздел 3.5.3 - Индекс PSI

In [None]:
def calculate_psi(expected, actual, bins=10):
    """Calculate the Population Stability Index between two numeric samples.

    Both samples are bucketed into ``bins`` equal-width intervals spanning the
    combined min..max range (the outer two intervals are open-ended), and
    PSI = sum((a - e) * ln(a / e)) over the per-bucket shares ``e`` (expected)
    and ``a`` (actual). Empty buckets are given a share of 0.0001 so the
    logarithm stays defined.

    Args:
        expected: 1-D array-like reference sample (e.g. train).
        actual: 1-D array-like comparison sample (e.g. test).
        bins: Number of buckets.

    Returns:
        The PSI as a float, or NaN when either sample has no non-missing
        values. Missing values (NaN) are ignored in both samples.
    """
    expected = np.asarray(expected, dtype=float)
    actual = np.asarray(actual, dtype=float)

    # A NaN would turn min/max, and with them every bin edge, into NaN.
    expected = expected[~np.isnan(expected)]
    actual = actual[~np.isnan(actual)]

    # Nothing to compare: report "undefined" instead of dividing by zero below.
    if expected.size == 0 or actual.size == 0:
        return np.nan

    combined = np.concatenate([expected, actual])
    breakpoints = np.linspace(combined.min(), combined.max(), bins + 1)
    # Open-ended outer buckets.
    breakpoints[0] = -np.inf
    breakpoints[-1] = np.inf

    expected_counts = np.histogram(expected, bins=breakpoints)[0]
    actual_counts = np.histogram(actual, bins=breakpoints)[0]

    expected_percents = expected_counts / len(expected)
    actual_percents = actual_counts / len(actual)

    # Floor empty buckets so that log() and the ratio are finite.
    expected_percents = np.where(expected_percents == 0, 0.0001, expected_percents)
    actual_percents = np.where(actual_percents == 0, 0.0001, actual_percents)

    psi_values = (actual_percents - expected_percents) * np.log(actual_percents / expected_percents)
    psi = np.sum(psi_values)

    return psi

print("\n" + "="*80)
print("SEGMENT 1: PSI ANALYSIS (Section 3.5.3)")
print("="*80)

# Numeric features only. Columns are selected with select_dtypes: comparing a
# concrete dtype against the abstract np.number type does not match integer
# columns (and may match nothing at all), which silently shrinks the PSI table.
numeric_features = [c for c in seg1_train.select_dtypes(include=[np.number]).columns
                   if c != config.TARGET_COLUMN]

# PSI of every feature between the train and the out-of-time test split.
psi_results_seg1 = []
for col in numeric_features:
    psi = calculate_psi(seg1_train[col].values, seg1_test[col].values)
    psi_results_seg1.append({'feature': col, 'PSI': psi})

# Explicit columns keep the frame well-formed even if no feature qualified.
psi_df_seg1 = pd.DataFrame(psi_results_seg1, columns=['feature', 'PSI']).sort_values('PSI', ascending=False)

# PSI bands: < 0.1 stable, 0.1-0.2 moderate drift, >= 0.2 high drift.
stable = (psi_df_seg1['PSI'] < 0.1).sum()
moderate = ((psi_df_seg1['PSI'] >= 0.1) & (psi_df_seg1['PSI'] < 0.2)).sum()
high = (psi_df_seg1['PSI'] >= 0.2).sum()

print(f"\nОбщая статистика PSI:")
print(f"  Всего признаков: {len(psi_df_seg1)}")
print(f"  Стабильных (PSI < 0.1): {stable} ({stable/len(psi_df_seg1)*100:.1f}%)")
print(f"  Умеренный drift (0.1-0.2): {moderate} ({moderate/len(psi_df_seg1)*100:.1f}%)")
print(f"  Высокий drift (PSI > 0.2): {high} ({high/len(psi_df_seg1)*100:.1f}%)")

print(f"\nТОП-10 признаков с наибольшим PSI:")
print(psi_df_seg1.head(10).to_string(index=False))

# Persist the full table.
psi_df_seg1.to_csv('output/seg1_psi_analysis.csv', index=False)
print("\n✓ Сохранено: output/seg1_psi_analysis.csv")

## 2.5. Раздел 3.5.4 - Корреляционный анализ

In [None]:
from scipy.stats import pointbiserialr

print("\n" + "="*80)
print("SEGMENT 1: КОРРЕЛЯЦИОННЫЙ АНАЛИЗ (Section 3.5.4)")
print("="*80)


def _target_correlation_row(feature):
    """Point-biserial correlation of one train feature with the target, as a record."""
    corr, pval = pointbiserialr(seg1_train[config.TARGET_COLUMN], seg1_train[feature])
    return {
        'feature': feature,
        'correlation': corr,
        'p_value': pval,
        'significant': pval < 0.05
    }


# Every numeric train column except the target itself.
numeric_features = seg1_train.select_dtypes(include=[np.number]).columns.tolist()
numeric_features = [c for c in numeric_features if c != config.TARGET_COLUMN]

correlations_seg1 = [_target_correlation_row(c) for c in numeric_features]

# Rank features by the absolute strength of the relationship.
corr_df_seg1 = pd.DataFrame(correlations_seg1)
corr_df_seg1['abs_correlation'] = corr_df_seg1['correlation'].abs()
corr_df_seg1 = corr_df_seg1.sort_values('abs_correlation', ascending=False)

_abs_corr = corr_df_seg1['abs_correlation']
significant = corr_df_seg1['significant'].sum()
avg_corr = _abs_corr.mean()
max_corr = _abs_corr.max()

print(f"\nОбщая статистика:")
print(f"  Всего признаков: {len(corr_df_seg1)}")
print(f"  Значимых (p<0.05): {significant}")
print(f"  Средняя |корреляция|: {avg_corr:.4f}")
print(f"  Максимальная |корреляция|: {max_corr:.4f}")

print(f"\nТОП-20 признаков по корреляции с target:")
_report_columns = ['feature', 'correlation', 'p_value', 'significant']
print(corr_df_seg1[_report_columns].head(20).to_string(index=False))

# Persist the full table.
corr_df_seg1.to_csv('output/seg1_target_correlation.csv', index=False)
print("\n✓ Сохранено: output/seg1_target_correlation.csv")

## 2.6. Раздел 4.1 - Разбиение выборки

In [None]:
print("\n" + "="*80)
print("SEGMENT 1: РАЗБИЕНИЕ ВЫБОРКИ (Section 4.1)")
print("="*80)

# One table row per split, built from the same three (role, frame) pairs.
_seg1_splits = (('Train', seg1_train), ('Val', seg1_val), ('Test', seg1_test))
_seg1_targets = [frame[config.TARGET_COLUMN] for _, frame in _seg1_splits]

split_table_seg1 = {
    'Роль данных': [role for role, _ in _seg1_splits],
    'Количество наблюдений': [f"{len(frame):,}" for _, frame in _seg1_splits],
    'Количество событий': [f"{int(target.sum()):,}" for target in _seg1_targets],
    'Churn Rate': [f"{target.mean()*100:.2f}%" for target in _seg1_targets]
}

split_df_seg1 = pd.DataFrame(split_table_seg1)
print("\nТаблица для документации:")
print(split_df_seg1.to_markdown(index=False))

# Persist the table.
split_df_seg1.to_csv('output/seg1_split_table.csv', index=False)
print("\n✓ Сохранено: output/seg1_split_table.csv")

## 2.7. Раздел 4.2-4.3 - Обучение модели и результаты

In [None]:
# Trains the Segment 1 XGBoost classifier, scores all three splits,
# and persists the model plus a ROC-AUC / Gini table.
print("\n" + "="*80)
print("SEGMENT 1: ОБУЧЕНИЕ МОДЕЛИ (Section 4.2-4.3)")
print("="*80)

# Split each frame into feature matrix and target vector
X_train_seg1 = seg1_train.drop(columns=[config.TARGET_COLUMN])
y_train_seg1 = seg1_train[config.TARGET_COLUMN]

X_val_seg1 = seg1_val.drop(columns=[config.TARGET_COLUMN])
y_val_seg1 = seg1_val[config.TARGET_COLUMN]

X_test_seg1 = seg1_test.drop(columns=[config.TARGET_COLUMN])
y_test_seg1 = seg1_test[config.TARGET_COLUMN]

# XGBoost for Segment 1
print("\nОбучение XGBoost...")
model_seg1 = XGBClassifier(
    n_estimators=300,
    max_depth=6,
    learning_rate=0.05,
    objective='binary:logistic',
    eval_metric='auc',
    early_stopping_rounds=50,
    random_state=42,
    n_jobs=-1,
    verbosity=0
)

# The validation split is the early-stopping monitor, so the validation
# metrics reported below are not fully independent of the fitting process.
model_seg1.fit(
    X_train_seg1, y_train_seg1,
    eval_set=[(X_val_seg1, y_val_seg1)],
    verbose=False
)

# Predicted churn probabilities (positive-class column)
y_pred_proba_train_seg1 = model_seg1.predict_proba(X_train_seg1)[:, 1]
y_pred_proba_val_seg1 = model_seg1.predict_proba(X_val_seg1)[:, 1]
y_pred_proba_test_seg1 = model_seg1.predict_proba(X_test_seg1)[:, 1]

# Metrics
# The threshold is only echoed in the printout below; ROC-AUC and Gini
# are threshold-free, so it does not affect any number computed in this cell.
threshold_seg1 = 0.12

roc_auc_train_seg1 = roc_auc_score(y_train_seg1, y_pred_proba_train_seg1)
roc_auc_val_seg1 = roc_auc_score(y_val_seg1, y_pred_proba_val_seg1)
roc_auc_test_seg1 = roc_auc_score(y_test_seg1, y_pred_proba_test_seg1)

# Gini = 2 * AUC - 1
gini_train_seg1 = 2 * roc_auc_train_seg1 - 1
gini_val_seg1 = 2 * roc_auc_val_seg1 - 1
gini_test_seg1 = 2 * roc_auc_test_seg1 - 1

print(f"\n✓ Модель обучена")
print(f"\nМетрики (XGBoost, threshold={threshold_seg1}):")
print(f"  Train - ROC-AUC: {roc_auc_train_seg1:.4f}, Gini: {gini_train_seg1:.4f}")
print(f"  Val   - ROC-AUC: {roc_auc_val_seg1:.4f}, Gini: {gini_val_seg1:.4f}")
print(f"  Test  - ROC-AUC: {roc_auc_test_seg1:.4f}, Gini: {gini_test_seg1:.4f}")

# Persist the fitted model (pickle: load only from trusted locations)
with open('models/seg1_xgboost_final.pkl', 'wb') as f:
    pickle.dump(model_seg1, f)
print("\n✓ Модель сохранена: models/seg1_xgboost_final.pkl")

# Persist the metrics table
metrics_seg1 = pd.DataFrame({
    'Dataset': ['Train', 'Validation', 'Test'],
    'ROC-AUC': [roc_auc_train_seg1, roc_auc_val_seg1, roc_auc_test_seg1],
    'Gini': [gini_train_seg1, gini_val_seg1, gini_test_seg1]
})
metrics_seg1.to_csv('output/seg1_model_metrics.csv', index=False)
print("✓ Метрики сохранены: output/seg1_model_metrics.csv")

## 2.8. Раздел 5.2 - Важность признаков

In [None]:
# Tabulates and plots the feature importances of the fitted Segment 1 model.
print("\n" + "="*80)
print("SEGMENT 1: FEATURE IMPORTANCE (Section 5.2)")
print("="*80)

# Importance per training column, highest first
feature_importance_seg1 = pd.DataFrame({
    'feature': X_train_seg1.columns,
    'importance': model_seg1.feature_importances_
}).sort_values('importance', ascending=False)

print(f"\nТОП-20 признаков:")
print(feature_importance_seg1.head(20).to_string(index=False))

# Horizontal bar chart of the top 20
plt.figure(figsize=(10, 8))
top20 = feature_importance_seg1.head(20)
plt.barh(range(len(top20)), top20['importance'])
plt.yticks(range(len(top20)), top20['feature'])
plt.xlabel('Importance')
plt.title('SEGMENT 1: Top 20 Feature Importance (XGBoost)')
# Flip the axis so the most important feature is at the top
plt.gca().invert_yaxis()
plt.tight_layout()
plt.savefig('figures/seg1_feature_importance.png', dpi=150, bbox_inches='tight')
# Close the figure so it is not rendered inline and does not leak into later plots
plt.close()

# Persist the full importance table
feature_importance_seg1.to_csv('output/seg1_feature_importance.csv', index=False)
print("\n✓ Сохранено: output/seg1_feature_importance.csv")
print("✓ График сохранен: figures/seg1_feature_importance.png")

## 2.9. Раздел 5.3 - SHAP Analysis

In [None]:
# Computes SHAP values for the Segment 1 model and saves two summary plots.
print("\n" + "="*80)
print("SEGMENT 1: SHAP ANALYSIS (Section 5.3)")
print("="*80)

# SHAP values (computed on the validation set for speed)
print("\nРасчет SHAP values...")
explainer_seg1 = shap.TreeExplainer(model_seg1)
shap_values_seg1 = explainer_seg1.shap_values(X_val_seg1)

# SHAP summary plot (bar chart of the top 20 features)
# show=False keeps the figure open so it can be titled and saved below.
plt.figure(figsize=(10, 8))
shap.summary_plot(shap_values_seg1, X_val_seg1, plot_type='bar', show=False, max_display=20)
plt.title('SEGMENT 1: SHAP Feature Importance')
plt.tight_layout()
plt.savefig('figures/seg1_shap_importance.png', dpi=150, bbox_inches='tight')
plt.close()

# SHAP beeswarm plot (default summary_plot type)
plt.figure(figsize=(10, 8))
shap.summary_plot(shap_values_seg1, X_val_seg1, show=False, max_display=20)
plt.title('SEGMENT 1: SHAP Summary (Beeswarm)')
plt.tight_layout()
plt.savefig('figures/seg1_shap_beeswarm.png', dpi=150, bbox_inches='tight')
plt.close()

print("\n✓ SHAP графики сохранены:")
print("  - figures/seg1_shap_importance.png")
print("  - figures/seg1_shap_beeswarm.png")

## 2.10. Раздел 5.4 - Decile Analysis и Lift

In [None]:
def calculate_deciles(y_true, y_pred_proba, n_bins=10):
    """Build a score-bin ("decile") table with lift and cumulative gain.

    Observations are split into ``n_bins`` quantile bins of the predicted
    score. Bin numbering starts at 1 for the LOWEST scores, so the highest
    ``Decile`` value holds the highest-scored observations. Cumulative
    columns accumulate from the highest-score bin downwards.

    Args:
        y_true: Binary target values (0/1), array-like.
        y_pred_proba: Predicted scores, array-like of the same length.
            Values are paired with ``y_true`` by position, not by index.
        n_bins: Requested number of quantile bins. Fewer bins are returned
            when tied scores produce duplicate bin edges
            (``duplicates='drop'``).

    Returns:
        DataFrame sorted by ``Decile`` ascending with columns ``Decile``,
        ``Count``, ``Events``, ``Event_Rate``, ``Min_Score``, ``Max_Score``,
        ``Avg_Score``, ``Cum_Events``, ``Cum_Count``, ``Cum_Event_Rate``,
        ``Lift``, ``Cum_Lift`` and ``Cum_Gain_%``.

    Raises:
        ValueError: If the two inputs differ in length (raised by pandas).
    """
    # Strip any pandas index first: building a DataFrame from Series aligns
    # them by index label, which silently produces NaN-padded rows when the
    # target keeps its original row labels and the scores do not.
    df = pd.DataFrame({
        'y_true': np.asarray(y_true),
        'y_pred_proba': np.asarray(y_pred_proba),
    })

    # qcut labels start at 0 for the lowest scores; shift to 1-based numbering.
    df['decile'] = pd.qcut(df['y_pred_proba'], q=n_bins, labels=False, duplicates='drop') + 1

    decile_stats = df.groupby('decile').agg({
        'y_true': ['count', 'sum', 'mean'],
        'y_pred_proba': ['min', 'max', 'mean']
    }).reset_index()

    decile_stats.columns = ['Decile', 'Count', 'Events', 'Event_Rate',
                            'Min_Score', 'Max_Score', 'Avg_Score']

    # Cumulative metrics, accumulated from the highest-score bin downwards.
    decile_stats = decile_stats.sort_values('Decile', ascending=False)
    decile_stats['Cum_Events'] = decile_stats['Events'].cumsum()
    decile_stats['Cum_Count'] = decile_stats['Count'].cumsum()
    decile_stats['Cum_Event_Rate'] = decile_stats['Cum_Events'] / decile_stats['Cum_Count']

    # Lift: bin (or cumulative) event rate relative to the overall event rate.
    overall_event_rate = df['y_true'].mean()
    decile_stats['Lift'] = decile_stats['Event_Rate'] / overall_event_rate
    decile_stats['Cum_Lift'] = decile_stats['Cum_Event_Rate'] / overall_event_rate

    # Cumulative gain: share of all events captured so far, in percent.
    total_events = df['y_true'].sum()
    decile_stats['Cum_Gain_%'] = (decile_stats['Cum_Events'] / total_events) * 100

    return decile_stats.sort_values('Decile')

print("\n" + "="*80)
print("SEGMENT 1: DECILE ANALYSIS + LIFT (Section 5.4)")
print("="*80)

# Decile table on the training sample.
decile_train_seg1 = calculate_deciles(y_train_seg1, y_pred_proba_train_seg1)
print("\nDECILE ANALYSIS - TRAIN:")
print(decile_train_seg1.to_string(index=False))

# Decile table on the test sample.
decile_test_seg1 = calculate_deciles(y_test_seg1, y_pred_proba_test_seg1)
print("\n\nDECILE ANALYSIS - TEST:")
print(decile_test_seg1.to_string(index=False))

# Persist both tables.
decile_train_seg1.to_csv('output/seg1_decile_analysis_train.csv', index=False)
decile_test_seg1.to_csv('output/seg1_decile_analysis_test.csv', index=False)

# Figure with two panels: lift by decile and cumulative gain.
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Lift chart
axes[0].plot(decile_train_seg1['Decile'], decile_train_seg1['Lift'], marker='o', label='Train')
axes[0].plot(decile_test_seg1['Decile'], decile_test_seg1['Lift'], marker='s', label='Test')
axes[0].axhline(y=1, color='r', linestyle='--', label='Baseline')
axes[0].set_xlabel('Decile')
axes[0].set_ylabel('Lift')
axes[0].set_title('SEGMENT 1: Lift by Decile')
axes[0].legend()
axes[0].grid(True, alpha=0.3)

# Cumulative Gain
axes[1].plot(decile_train_seg1['Decile'], decile_train_seg1['Cum_Gain_%'], marker='o', label='Train')
axes[1].plot(decile_test_seg1['Decile'], decile_test_seg1['Cum_Gain_%'], marker='s', label='Test')
# Random-model baseline. calculate_deciles numbers bins from the lowest
# scores (Decile 1) and accumulates gain starting from the highest-score bin,
# so the plotted gain is 100% at Decile 1. With ten equal-sized bins a random
# ranking captures 10% of events at Decile 10, hence the descending line.
axes[1].plot([1, 10], [100, 10], 'r--', label='Random')
axes[1].set_xlabel('Decile')
axes[1].set_ylabel('Cumulative Gain (%)')
axes[1].set_title('SEGMENT 1: Cumulative Gain')
axes[1].legend()
axes[1].grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig('figures/seg1_decile_lift.png', dpi=150, bbox_inches='tight')
plt.close()

print("\n✓ Сохранено:")
print("  - output/seg1_decile_analysis_train.csv")
print("  - output/seg1_decile_analysis_test.csv")
print("  - figures/seg1_decile_lift.png")

## 2.11. ROC Curve для Segment 1

In [None]:
banner = "=" * 80
print("\n" + banner)
print("SEGMENT 1: ROC CURVE")
print(banner)

# ROC coordinates per split; the thresholds returned by roc_curve are not used.
fpr_train_seg1, tpr_train_seg1, _ = roc_curve(y_train_seg1, y_pred_proba_train_seg1)
fpr_val_seg1, tpr_val_seg1, _ = roc_curve(y_val_seg1, y_pred_proba_val_seg1)
fpr_test_seg1, tpr_test_seg1, _ = roc_curve(y_test_seg1, y_pred_proba_test_seg1)

# (legend name, FPR, TPR, AUC) for every curve, in drawing order.
roc_series_seg1 = [
    ('Train', fpr_train_seg1, tpr_train_seg1, roc_auc_train_seg1),
    ('Val', fpr_val_seg1, tpr_val_seg1, roc_auc_val_seg1),
    ('Test', fpr_test_seg1, tpr_test_seg1, roc_auc_test_seg1),
]

plt.figure(figsize=(8, 6))
for split_name, split_fpr, split_tpr, split_auc in roc_series_seg1:
    plt.plot(split_fpr, split_tpr, label=f'{split_name} (AUC={split_auc:.4f})', linewidth=2)
# Diagonal = performance of a random classifier.
plt.plot([0, 1], [0, 1], 'k--', label='Random', linewidth=1)
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('SEGMENT 1 (Small Business): ROC Curve - XGBoost')
plt.legend(loc='lower right')
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.savefig('figures/seg1_roc_curve.png', dpi=150, bbox_inches='tight')
plt.close()

print("✓ ROC кривая сохранена: figures/seg1_roc_curve.png")

---
---
# ЧАСТЬ 3: SEGMENT 2 - MIDDLE + LARGE BUSINESS

**Все разделы документации для Segment 2**

## 3.1. Раздел 2.3 - Анализ потока (Статистика по выборке)

In [None]:
banner = "=" * 80
print("\n" + banner)
print("SEGMENT 2: АНАЛИЗ ПОТОКА (Section 2.3)")
print(banner)

# Stack the three splits back together to describe the whole segment.
seg2_full = pd.concat([seg2_train, seg2_val, seg2_test], axis=0)

# NOTE: the period is taken from df_full (all segments), not from seg2_full.
period_start_seg2 = df_full['observation_point'].min().date()
period_end_seg2 = df_full['observation_point'].max().date()
target_seg2_full = seg2_full[config.TARGET_COLUMN]

stats_seg2 = {
    'Период выборки': f"{period_start_seg2} - {period_end_seg2}",
    'Количество наблюдений в контрактах': f"{len(seg2_full):,}",
    'Количество дефолтов в контрактах': f"{int(target_seg2_full.sum()):,}",
    'Уровень фактической целевой переменной': f"{target_seg2_full.mean()*100:.2f}%",
}

# Markdown table ready to paste into the documentation.
print("\nТаблица для документации:")
print("\n| Наименование показателя | Значение |")
print("| :---- | :---- |")
for indicator, indicator_value in stats_seg2.items():
    print(f"| {indicator} | {indicator_value} |")

# One row per indicator, single value column.
flow_table_seg2 = pd.DataFrame([stats_seg2]).T
flow_table_seg2.to_csv('output/seg2_flow_analysis.csv', header=['Значение'])
print("\n✓ Сохранено: output/seg2_flow_analysis.csv")

## 3.2. Раздел 3.3 - Результаты сбора ABT

In [None]:
banner = "=" * 80
print("\n" + banner)
print("SEGMENT 2: СТАТИСТИКА ABT (Section 3.3)")
print(banner)

# Split the columns of the full segment by dtype family.
numeric_cols = list(seg2_full.select_dtypes(include=[np.number]).columns)
non_numeric_cols = list(seg2_full.select_dtypes(exclude=[np.number]).columns)

# The target is not a predictor; drop it from the numeric list when present.
try:
    numeric_cols.remove(config.TARGET_COLUMN)
except ValueError:
    pass

target_seg2_full = seg2_full[config.TARGET_COLUMN]
abt_stats_seg2 = {
    'Количество наблюдений': f"{len(seg2_full):,}",
    'Количество событий': f"{int(target_seg2_full.sum()):,}",
    'Количество целевых переменных': '1 (target_churn_3m)',
    'Количество числовых предикторов': len(numeric_cols),
    'Количество не числовых предикторов': len(non_numeric_cols),
}

# Markdown table ready to paste into the documentation.
print("\nТаблица для документации:")
print("\n| Наименование показателя | Значение |")
print("| :---- | :---- |")
for indicator, indicator_value in abt_stats_seg2.items():
    print(f"| {indicator} | {indicator_value} |")

abt_table_seg2 = pd.DataFrame([abt_stats_seg2]).T
abt_table_seg2.to_csv('output/seg2_abt_statistics.csv', header=['Значение'])
print("\n✓ Сохранено: output/seg2_abt_statistics.csv")

## 3.3. Раздел 3.5.1-3.5.2 - Обработка данных

In [None]:
banner = "=" * 80
print("\n" + banner)
print("SEGMENT 2: ОБРАБОТКА ДАННЫХ (Section 3.5.1-3.5.2)")
print(banner)

# 3.5.1: imputation strategy plus the number of columns listed on `pipeline`.
print("\n3.5.1. Удаление и замена пропущенных значений:")
print("  - Метод: Median Imputation для числовых признаков")
print("  - Метод: Most Frequent для категориальных")
imputed_numeric_count = len(pipeline.numeric_cols_for_imputation)
print(f"  - Обработано числовых: {imputed_numeric_count}")
imputed_categorical_count = len(pipeline.categorical_cols_for_imputation)
print(f"  - Обработано категориальных: {imputed_categorical_count}")

# 3.5.2: static description of the categorical handling.
print("\n3.5.2. Обработка категориальных значений:")
for categorical_note in (
    "  - segment_group: Label Encoding (MIDDLE_BUSINESS → 0, LARGE_BUSINESS → 1)",
    "  - Временные признаки (obs_year, obs_month, obs_quarter): УДАЛЕНЫ (высокий PSI)",
):
    print(categorical_note)

## 3.4. Раздел 3.5.3 - Индекс PSI

In [None]:
print("\n" + "="*80)
print("SEGMENT 2: PSI ANALYSIS (Section 3.5.3)")
print("="*80)

# Pick numeric predictors with select_dtypes. Comparing a concrete dtype to
# the abstract np.number (`dtype in [np.number]`) does not test "is numeric":
# at best it matches float64 only, so integer features were silently skipped.
numeric_features = [c for c in seg2_train.select_dtypes(include=[np.number]).columns
                    if c != config.TARGET_COLUMN]

# PSI of every numeric feature between the train and test samples.
psi_results_seg2 = []
for col in numeric_features:
    psi = calculate_psi(seg2_train[col].values, seg2_test[col].values)
    psi_results_seg2.append({'feature': col, 'PSI': psi})

# Explicit columns keep the frame sortable even if no numeric feature was found.
psi_df_seg2 = pd.DataFrame(psi_results_seg2, columns=['feature', 'PSI']).sort_values('PSI', ascending=False)

# Drift buckets: <0.1, 0.1-0.2, >=0.2.
stable = (psi_df_seg2['PSI'] < 0.1).sum()
moderate = ((psi_df_seg2['PSI'] >= 0.1) & (psi_df_seg2['PSI'] < 0.2)).sum()
high = (psi_df_seg2['PSI'] >= 0.2).sum()

# Denominator for the percentages; never zero so an empty table prints 0.0%.
psi_share_base_seg2 = max(len(psi_df_seg2), 1)

print(f"\nОбщая статистика PSI:")
print(f"  Всего признаков: {len(psi_df_seg2)}")
print(f"  Стабильных (PSI < 0.1): {stable} ({stable/psi_share_base_seg2*100:.1f}%)")
print(f"  Умеренный drift (0.1-0.2): {moderate} ({moderate/psi_share_base_seg2*100:.1f}%)")
print(f"  Высокий drift (PSI > 0.2): {high} ({high/psi_share_base_seg2*100:.1f}%)")

print(f"\nТОП-10 признаков с наибольшим PSI:")
print(psi_df_seg2.head(10).to_string(index=False))

psi_df_seg2.to_csv('output/seg2_psi_analysis.csv', index=False)
print("\n✓ Сохранено: output/seg2_psi_analysis.csv")

## 3.5. Раздел 3.5.4 - Корреляционный анализ

In [None]:
banner = "=" * 80
print("\n" + banner)
print("SEGMENT 2: КОРРЕЛЯЦИОННЫЙ АНАЛИЗ (Section 3.5.4)")
print(banner)

# All numeric columns of the training split except the target itself.
numeric_features = [
    c for c in seg2_train.select_dtypes(include=[np.number]).columns
    if c != config.TARGET_COLUMN
]

# Point-biserial correlation of each feature with the binary target.
pointbiserial_results_seg2 = [
    pointbiserialr(seg2_train[config.TARGET_COLUMN], seg2_train[feature_name])
    for feature_name in numeric_features
]
correlations_seg2 = [
    {
        'feature': feature_name,
        'correlation': coefficient,
        'p_value': p_value,
        'significant': p_value < 0.05,
    }
    for feature_name, (coefficient, p_value) in zip(numeric_features, pointbiserial_results_seg2)
]

# Order features by the strength (absolute value) of the correlation.
corr_df_seg2 = (
    pd.DataFrame(correlations_seg2)
    .assign(abs_correlation=lambda frame: frame['correlation'].abs())
    .sort_values('abs_correlation', ascending=False)
)

abs_corr_seg2 = corr_df_seg2['abs_correlation']
significant = corr_df_seg2['significant'].sum()
avg_corr = abs_corr_seg2.mean()
max_corr = abs_corr_seg2.max()

print("\nОбщая статистика:")
print(f"  Всего признаков: {len(corr_df_seg2)}")
print(f"  Значимых (p<0.05): {significant}")
print(f"  Средняя |корреляция|: {avg_corr:.4f}")
print(f"  Максимальная |корреляция|: {max_corr:.4f}")

print("\nТОП-20 признаков по корреляции с target:")
report_columns_seg2 = ['feature', 'correlation', 'p_value', 'significant']
print(corr_df_seg2[report_columns_seg2].head(20).to_string(index=False))

corr_df_seg2.to_csv('output/seg2_target_correlation.csv', index=False)
print("\n✓ Сохранено: output/seg2_target_correlation.csv")

## 3.6. Раздел 4.1 - Разбиение выборки

In [None]:
banner = "=" * 80
print("\n" + banner)
print("SEGMENT 2: РАЗБИЕНИЕ ВЫБОРКИ (Section 4.1)")
print(banner)

# (role label, frame) for every split, in reporting order.
splits_seg2 = [('Train', seg2_train), ('Val', seg2_val), ('Test', seg2_test)]
split_targets_seg2 = [part[config.TARGET_COLUMN] for _, part in splits_seg2]

# Size, event count and churn rate per split, pre-formatted as strings.
split_table_seg2 = {
    'Роль данных': [role for role, _ in splits_seg2],
    'Количество наблюдений': [f"{len(part):,}" for _, part in splits_seg2],
    'Количество событий': [f"{int(target.sum()):,}" for target in split_targets_seg2],
    'Churn Rate': [f"{target.mean()*100:.2f}%" for target in split_targets_seg2],
}

split_df_seg2 = pd.DataFrame(split_table_seg2)
print("\nТаблица для документации:")
print(split_df_seg2.to_markdown(index=False))

split_df_seg2.to_csv('output/seg2_split_table.csv', index=False)
print("\n✓ Сохранено: output/seg2_split_table.csv")

## 3.7. Раздел 4.2-4.3 - Обучение модели и результаты

In [None]:
banner = "=" * 80
print("\n" + banner)
print("SEGMENT 2: ОБУЧЕНИЕ МОДЕЛИ (Section 4.2-4.3)")
print(banner)

# Separate predictors from the target for each split.
target_column_seg2 = config.TARGET_COLUMN
X_train_seg2, y_train_seg2 = seg2_train.drop(columns=[target_column_seg2]), seg2_train[target_column_seg2]
X_val_seg2, y_val_seg2 = seg2_val.drop(columns=[target_column_seg2]), seg2_val[target_column_seg2]
X_test_seg2, y_test_seg2 = seg2_test.drop(columns=[target_column_seg2]), seg2_test[target_column_seg2]

# CatBoost configuration for Segment 2.
catboost_params_seg2 = dict(
    iterations=300,
    depth=6,
    learning_rate=0.05,
    loss_function='Logloss',
    eval_metric='AUC',
    early_stopping_rounds=50,
    use_best_model=True,
    random_seed=42,
    task_type='CPU',
    verbose=False,
    allow_writing_files=False,
)

print("\nОбучение CatBoost...")
model_seg2 = CatBoostClassifier(**catboost_params_seg2)
# The validation split is passed as eval_set for the early-stopping / best-model settings above.
model_seg2.fit(
    X_train_seg2, y_train_seg2,
    eval_set=(X_val_seg2, y_val_seg2),
    verbose=False,
)

# Probability of the positive class for every split.
y_pred_proba_train_seg2 = model_seg2.predict_proba(X_train_seg2)[:, 1]
y_pred_proba_val_seg2 = model_seg2.predict_proba(X_val_seg2)[:, 1]
y_pred_proba_test_seg2 = model_seg2.predict_proba(X_test_seg2)[:, 1]

# Threshold is only reported here; ROC-AUC and Gini do not depend on it.
threshold_seg2 = 0.10

roc_auc_train_seg2 = roc_auc_score(y_train_seg2, y_pred_proba_train_seg2)
roc_auc_val_seg2 = roc_auc_score(y_val_seg2, y_pred_proba_val_seg2)
roc_auc_test_seg2 = roc_auc_score(y_test_seg2, y_pred_proba_test_seg2)

# Gini = 2 * AUC - 1.
gini_train_seg2, gini_val_seg2, gini_test_seg2 = (
    2 * auc_value - 1
    for auc_value in (roc_auc_train_seg2, roc_auc_val_seg2, roc_auc_test_seg2)
)

print("\n✓ Модель обучена")
print(f"\nМетрики (CatBoost, threshold={threshold_seg2}):")
for split_label, split_auc, split_gini in (
    ('Train', roc_auc_train_seg2, gini_train_seg2),
    ('Val', roc_auc_val_seg2, gini_val_seg2),
    ('Test', roc_auc_test_seg2, gini_test_seg2),
):
    print(f"  {split_label:<5} - ROC-AUC: {split_auc:.4f}, Gini: {split_gini:.4f}")

# Persist the fitted model.
with open('models/seg2_catboost_final.pkl', 'wb') as model_file:
    pickle.dump(model_seg2, model_file)
print("\n✓ Модель сохранена: models/seg2_catboost_final.pkl")

# Persist the metrics table.
metrics_seg2 = pd.DataFrame({
    'Dataset': ['Train', 'Validation', 'Test'],
    'ROC-AUC': [roc_auc_train_seg2, roc_auc_val_seg2, roc_auc_test_seg2],
    'Gini': [gini_train_seg2, gini_val_seg2, gini_test_seg2],
})
metrics_seg2.to_csv('output/seg2_model_metrics.csv', index=False)
print("✓ Метрики сохранены: output/seg2_model_metrics.csv")

## 3.8. Раздел 5.2 - Важность признаков

In [None]:
banner = "=" * 80
print("\n" + banner)
print("SEGMENT 2: FEATURE IMPORTANCE (Section 5.2)")
print(banner)

# Rank every training feature by the importance score exposed by the fitted model.
feature_importance_seg2 = pd.DataFrame({
    'feature': X_train_seg2.columns,
    'importance': model_seg2.feature_importances_,
})
feature_importance_seg2 = feature_importance_seg2.sort_values('importance', ascending=False)

top20 = feature_importance_seg2.head(20)
print("\nТОП-20 признаков:")
print(top20.to_string(index=False))

# Horizontal bar chart of the 20 strongest features, strongest at the top.
fi_positions_seg2 = range(len(top20))
fi_fig_seg2, fi_ax_seg2 = plt.subplots(figsize=(10, 8))
fi_ax_seg2.barh(fi_positions_seg2, top20['importance'])
fi_ax_seg2.set_yticks(fi_positions_seg2)
fi_ax_seg2.set_yticklabels(top20['feature'])
fi_ax_seg2.set_xlabel('Importance')
fi_ax_seg2.set_title('SEGMENT 2: Top 20 Feature Importance (CatBoost)')
fi_ax_seg2.invert_yaxis()
fi_fig_seg2.tight_layout()
fi_fig_seg2.savefig('figures/seg2_feature_importance.png', dpi=150, bbox_inches='tight')
plt.close(fi_fig_seg2)

# Persist the full ranking (not only the top 20).
feature_importance_seg2.to_csv('output/seg2_feature_importance.csv', index=False)
for saved_message in ("\n✓ Сохранено: output/seg2_feature_importance.csv",
                      "✓ График сохранен: figures/seg2_feature_importance.png"):
    print(saved_message)

## 3.9. Раздел 5.3 - SHAP Analysis

In [None]:
banner = "=" * 80
print("\n" + banner)
print("SEGMENT 2: SHAP ANALYSIS (Section 5.3)")
print(banner)

# SHAP values are computed on the validation sample.
print("\nРасчет SHAP values...")
explainer_seg2 = shap.TreeExplainer(model_seg2)
shap_values_seg2 = explainer_seg2.shap_values(X_val_seg2)

# Each entry: (extra summary_plot keyword arguments, figure title, output path).
# The empty kwargs entry uses summary_plot's default plot type (the beeswarm view).
shap_plots_seg2 = [
    ({'plot_type': 'bar'}, 'SEGMENT 2: SHAP Feature Importance',
     'figures/seg2_shap_importance.png'),
    ({}, 'SEGMENT 2: SHAP Summary (Beeswarm)',
     'figures/seg2_shap_beeswarm.png'),
]
for shap_kwargs, shap_title, shap_path in shap_plots_seg2:
    plt.figure(figsize=(10, 8))
    shap.summary_plot(shap_values_seg2, X_val_seg2, show=False, max_display=20, **shap_kwargs)
    plt.title(shap_title)
    plt.tight_layout()
    plt.savefig(shap_path, dpi=150, bbox_inches='tight')
    plt.close()

print("\n✓ SHAP графики сохранены:")
for _, _, shap_path in shap_plots_seg2:
    print(f"  - {shap_path}")

## 3.10. Раздел 5.4 - Decile Analysis и Lift

In [None]:
print("\n" + "="*80)
print("SEGMENT 2: DECILE ANALYSIS + LIFT (Section 5.4)")
print("="*80)

# Decile table on the training sample.
decile_train_seg2 = calculate_deciles(y_train_seg2, y_pred_proba_train_seg2)
print("\nDECILE ANALYSIS - TRAIN:")
print(decile_train_seg2.to_string(index=False))

# Decile table on the test sample.
decile_test_seg2 = calculate_deciles(y_test_seg2, y_pred_proba_test_seg2)
print("\n\nDECILE ANALYSIS - TEST:")
print(decile_test_seg2.to_string(index=False))

# Persist both tables.
decile_train_seg2.to_csv('output/seg2_decile_analysis_train.csv', index=False)
decile_test_seg2.to_csv('output/seg2_decile_analysis_test.csv', index=False)

# Figure with two panels: lift by decile and cumulative gain.
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Lift chart
axes[0].plot(decile_train_seg2['Decile'], decile_train_seg2['Lift'], marker='o', label='Train')
axes[0].plot(decile_test_seg2['Decile'], decile_test_seg2['Lift'], marker='s', label='Test')
axes[0].axhline(y=1, color='r', linestyle='--', label='Baseline')
axes[0].set_xlabel('Decile')
axes[0].set_ylabel('Lift')
axes[0].set_title('SEGMENT 2: Lift by Decile')
axes[0].legend()
axes[0].grid(True, alpha=0.3)

# Cumulative Gain
axes[1].plot(decile_train_seg2['Decile'], decile_train_seg2['Cum_Gain_%'], marker='o', label='Train')
axes[1].plot(decile_test_seg2['Decile'], decile_test_seg2['Cum_Gain_%'], marker='s', label='Test')
# Random-model baseline. calculate_deciles numbers bins from the lowest
# scores (Decile 1) and accumulates gain starting from the highest-score bin,
# so the plotted gain is 100% at Decile 1. With ten equal-sized bins a random
# ranking captures 10% of events at Decile 10, hence the descending line.
axes[1].plot([1, 10], [100, 10], 'r--', label='Random')
axes[1].set_xlabel('Decile')
axes[1].set_ylabel('Cumulative Gain (%)')
axes[1].set_title('SEGMENT 2: Cumulative Gain')
axes[1].legend()
axes[1].grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig('figures/seg2_decile_lift.png', dpi=150, bbox_inches='tight')
plt.close()

print("\n✓ Сохранено:")
print("  - output/seg2_decile_analysis_train.csv")
print("  - output/seg2_decile_analysis_test.csv")
print("  - figures/seg2_decile_lift.png")

## 3.11. ROC Curve для Segment 2

In [None]:
banner = "=" * 80
print("\n" + banner)
print("SEGMENT 2: ROC CURVE")
print(banner)

# ROC coordinates per split; the thresholds returned by roc_curve are not used.
fpr_train_seg2, tpr_train_seg2, _ = roc_curve(y_train_seg2, y_pred_proba_train_seg2)
fpr_val_seg2, tpr_val_seg2, _ = roc_curve(y_val_seg2, y_pred_proba_val_seg2)
fpr_test_seg2, tpr_test_seg2, _ = roc_curve(y_test_seg2, y_pred_proba_test_seg2)

# (legend name, FPR, TPR, AUC) for every curve, in drawing order.
roc_series_seg2 = [
    ('Train', fpr_train_seg2, tpr_train_seg2, roc_auc_train_seg2),
    ('Val', fpr_val_seg2, tpr_val_seg2, roc_auc_val_seg2),
    ('Test', fpr_test_seg2, tpr_test_seg2, roc_auc_test_seg2),
]

plt.figure(figsize=(8, 6))
for split_name, split_fpr, split_tpr, split_auc in roc_series_seg2:
    plt.plot(split_fpr, split_tpr, label=f'{split_name} (AUC={split_auc:.4f})', linewidth=2)
# Diagonal = performance of a random classifier.
plt.plot([0, 1], [0, 1], 'k--', label='Random', linewidth=1)
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('SEGMENT 2 (Middle + Large Business): ROC Curve - CatBoost')
plt.legend(loc='lower right')
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.savefig('figures/seg2_roc_curve.png', dpi=150, bbox_inches='tight')
plt.close()

print("✓ ROC кривая сохранена: figures/seg2_roc_curve.png")

---
---
# ИТОГОВАЯ СВОДКА

In [None]:
banner = "=" * 80
print("\n" + banner)
print("ИТОГОВАЯ СВОДКА - ВСЕ РЕЗУЛЬТАТЫ")
print(banner)

# Per-segment recap: (header, test AUC, test Gini, threshold, test target, importance table).
segment_summaries = [
    ("SEGMENT 1: SMALL BUSINESS (XGBoost)",
     roc_auc_test_seg1, gini_test_seg1, threshold_seg1, y_test_seg1, feature_importance_seg1),
    ("SEGMENT 2: MIDDLE + LARGE BUSINESS (CatBoost)",
     roc_auc_test_seg2, gini_test_seg2, threshold_seg2, y_test_seg2, feature_importance_seg2),
]
for header, test_auc, test_gini, threshold, test_target, importance_table in segment_summaries:
    print("\n" + banner)
    print(header)
    print(banner)
    print("\nМетрики на Test:")
    print(f"  ROC-AUC: {test_auc:.4f}")
    print(f"  Gini: {test_gini:.4f}")
    print(f"  Threshold: {threshold}")
    print(f"  Churn Rate: {test_target.mean()*100:.2f}%")
    print("\nТОП-5 важных признаков:")
    print(importance_table.head(5).to_string(index=False))

print("\n" + banner)
print("СОХРАНЕННЫЕ ФАЙЛЫ")
print(banner)

# The artifact names below are a static list of what the notebook is expected
# to write; existence on disk is not checked here.
segment_prefixes = ('seg1', 'seg2')

print("\nCSV таблицы (output/):")
csv_stems = (
    'flow_analysis',
    'abt_statistics',
    'psi_analysis',
    'target_correlation',
    'split_table',
    'model_metrics',
    'feature_importance',
    'decile_analysis_train',
    'decile_analysis_test',
)
csv_files = [f'{prefix}_{stem}.csv' for prefix in segment_prefixes for stem in csv_stems]
for csv_name in csv_files:
    print(f"  ✓ {csv_name}")

print("\nГрафики (figures/):")
fig_stems = (
    'feature_importance',
    'shap_importance',
    'shap_beeswarm',
    'decile_lift',
    'roc_curve',
)
fig_files = [f'{prefix}_{stem}.png' for prefix in segment_prefixes for stem in fig_stems]
for fig_name in fig_files:
    print(f"  ✓ {fig_name}")

print("\nМодели (models/):")
for model_name in ('seg1_xgboost_final.pkl', 'seg2_catboost_final.pkl'):
    print(f"  ✓ {model_name}")

print("\n" + banner)
print("✅ ВСЕ РЕЗУЛЬТАТЫ ГОТОВЫ ДЛЯ ДОКУМЕНТАЦИИ!")
print(banner)