# ИМПОРТ БИБЛИОТЕК И КОНФИГУРАЦИЯ

In [None]:
import os
import warnings
from datetime import datetime
from pathlib import Path
import time
import gc
import pickle

import numpy as np
import pandas as pd
from scipy import stats
from scipy.stats import pointbiserialr

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, precision_score, recall_score, f1_score, confusion_matrix, roc_curve

from catboost import CatBoostClassifier, Pool
from xgboost import XGBClassifier

# Notebook-wide setup: silence library warnings, select the plot theme and
# make pandas show every column with four-decimal floats.
warnings.filterwarnings('ignore')
plt.style.use('seaborn-v0_8')

for option_name, option_value in (
    ('display.max_columns', None),
    ('display.max_rows', 100),
    ('display.float_format', lambda x: '%.4f' % x),
):
    pd.set_option(option_name, option_value)

In [None]:
class Config:
    """Central store of paths, column names and preprocessing switches.

    Everything is a class attribute, so values can be read either from the
    class itself or from an instance.
    """

    # Seed for reproducible random number generation.
    RANDOM_SEED = 42

    # Folder layout (relative to the working directory).
    DATA_DIR = Path("data")
    OUTPUT_DIR = Path("output")
    FIGURES_DIR = Path("figures")
    MODELS_DIR = Path("models")

    # Name of the training parquet file inside DATA_DIR.
    TRAIN_FILE = "churn_train_ul.parquet"

    # Column roles.
    ID_COLUMNS = ['cli_code', 'client_id', 'observation_point']
    TARGET_COLUMN = 'target_churn_3m'
    SEGMENT_COLUMN = 'segment_group'
    DATE_COLUMN = 'observation_point'
    CATEGORICAL_FEATURES = ['segment_group', 'obs_month', 'obs_quarter']

    # Segment definitions: display name plus the raw segment values it covers.
    SEGMENT_1_NAME = "Small Business"
    SEGMENT_1_VALUES = ['SMALL_BUSINESS']

    SEGMENT_2_NAME = "Middle + Large Business"
    SEGMENT_2_VALUES = ['MIDDLE_BUSINESS', 'LARGE_BUSINESS']

    # Train / validation / test proportions.
    TRAIN_SIZE = 0.70
    VAL_SIZE = 0.15
    TEST_SIZE = 0.15

    # Preprocessing switches and their parameters.
    REMOVE_GAPS = True
    HANDLE_OUTLIERS = True
    REMOVE_HIGH_CORRELATIONS = True
    CORRELATION_THRESHOLD = 0.85
    OUTLIER_IQR_MULTIPLIER = 1.5

    # Correlation-analysis and reporting parameters.
    CORRELATION_P_VALUE_THRESHOLD = 0.05
    DATA_LEAKAGE_THRESHOLD = 0.9
    TOP_N_CORRELATIONS = 20
    TOP_N_VISUALIZATION = 30

    @classmethod
    def create_directories(cls):
        """Create the output, figures and models folders when they are absent."""
        for folder in (cls.OUTPUT_DIR, cls.FIGURES_DIR, cls.MODELS_DIR):
            folder.mkdir(parents=True, exist_ok=True)

    @classmethod
    def get_train_path(cls):
        """Return the path of the training file inside ``DATA_DIR``."""
        return cls.DATA_DIR.joinpath(cls.TRAIN_FILE)

# Instantiate the settings, make sure the output folders exist, seed numpy.
config = Config()
config.create_directories()
np.random.seed(config.RANDOM_SEED)

# Echo the active configuration, one printed line per entry.
config_summary = [
    "\nКонфигурация инициализирована",
    f"  Random seed: {config.RANDOM_SEED}",
    f"  Сегмент 1: {config.SEGMENT_1_NAME} {config.SEGMENT_1_VALUES}",
    f"  Сегмент 2: {config.SEGMENT_2_NAME} {config.SEGMENT_2_VALUES}",
    f"  Split: {config.TRAIN_SIZE}/{config.VAL_SIZE}/{config.TEST_SIZE}",
    f"\nPREPROCESSING FLAGS:",
    f"  Gap detection: {config.REMOVE_GAPS}",
    f"  Outliers handling: {config.HANDLE_OUTLIERS} (IQR x {config.OUTLIER_IQR_MULTIPLIER})",
    f"  High correlations removal: {config.REMOVE_HIGH_CORRELATIONS} (threshold={config.CORRELATION_THRESHOLD})",
]
for summary_line in config_summary:
    print(summary_line)

# ЗАГРУЗКА ДАННЫХ

In [None]:
# Resolve the training file location from the shared configuration.
train_path = config.get_train_path()
rule = "=" * 80
bytes_per_mb = 1024 ** 2

print("\n" + rule)
print("ЗАГРУЗКА ДАННЫХ")
print(rule)
print(f"Файл: {train_path}")

# Stop right away with a readable hint when the parquet file is absent.
file_found = train_path.exists()
if not file_found:
    not_found_message = (
        f"Файл не найден: {train_path}\n"
        f"Убедитесь, что файл {config.TRAIN_FILE} находится в папке data/"
    )
    raise FileNotFoundError(not_found_message)

file_size = train_path.stat().st_size / bytes_per_mb
print(f"Размер файла: {file_size:.2f} MB")

# Read the parquet file and measure how long the read takes.
start = time.time()
df_full = pd.read_parquet(train_path)
load_time = time.time() - start

memory = df_full.memory_usage(deep=True).sum() / bytes_per_mb
loaded_rows, loaded_cols = df_full.shape

print(f"\nЗагружено за {load_time:.2f} сек")
print(f"  Размер: {df_full.shape}")
print(f"  Память: {memory:.2f} MB")
print(f"  Строк: {loaded_rows:,}")
print(f"  Колонок: {loaded_cols}")
print(rule)

# EXPLORATORY DATA ANALYSIS

In [None]:
# Headline numbers for the loaded frame: shape, memory footprint, dtype mix.
rule = "=" * 80
print("\n" + rule)
print("БАЗОВАЯ ИНФОРМАЦИЯ О ДАННЫХ")
print(rule)

total_rows, total_cols = df_full.shape
memory_mb = df_full.memory_usage(deep=True).sum() / (1024**2)

print(f"\n1. РАЗМЕР ДАННЫХ:")
print(f"   Строк: {total_rows:,}")
print(f"   Колонок: {total_cols}")
print(f"   Память: {memory_mb:.2f} MB")

# Number of columns per dtype, most frequent dtype first.
print(f"\n2. ТИПЫ ДАННЫХ:")
dtype_counts = df_full.dtypes.value_counts()
for dtype_name, n_columns in zip(dtype_counts.index, dtype_counts.values):
    print(f"   {dtype_name}: {n_columns}")

In [None]:
# Overall churn rate and class balance, then the same rate per segment.
rule = "=" * 80
print("\n" + rule)
print("АНАЛИЗ ЦЕЛЕВОЙ ПЕРЕМЕННОЙ")
print(rule)

target_series = df_full[config.TARGET_COLUMN]
n_total = len(df_full)
n_churned = target_series.sum()
churn_rate = target_series.mean()
# Number of non-churned rows per churned row.
ratio = (1-churn_rate)/churn_rate

print(f"\n1. ОБЩИЙ CHURN RATE:")
print(f"   Churn rate: {churn_rate:.4f} ({churn_rate*100:.2f}%)")
print(f"   Churned: {n_churned:,}")
print(f"   Not churned: {n_total - n_churned:,}")
print(f"   Class ratio: 1:{ratio:.1f}")

print(f"\n2. CHURN RATE ПО СЕГМЕНТАМ:")
segment_series = df_full[config.SEGMENT_COLUMN]
for segment_value in segment_series.unique():
    segment_rows = df_full[segment_series == segment_value]
    seg_count = len(segment_rows)
    seg_churn = segment_rows[config.TARGET_COLUMN].mean()
    seg_pct = seg_count / len(df_full) * 100
    print(f"   {segment_value}:")
    print(f"     Размер: {seg_count:,} ({seg_pct:.1f}%)")
    print(f"     Churn rate: {seg_churn:.4f} ({seg_churn*100:.2f}%)")

print(rule)

In [None]:
# Per-column count and share of missing values, largest first.
rule = "=" * 80
print("\n" + rule)
print("АНАЛИЗ ПРОПУЩЕННЫХ ЗНАЧЕНИЙ")
print(rule)

missing = df_full.isnull().sum()
columns_with_gaps = missing[missing > 0]
missing_df = pd.DataFrame({
    'Missing': columns_with_gaps,
    'Percent': (columns_with_gaps / len(df_full) * 100).round(2),
})
missing_df = missing_df.sort_values('Missing', ascending=False)

print(f"\nКолонок с пропусками: {len(missing_df)}")

if missing_df.empty:
    print("\nПропущенных значений не обнаружено")
else:
    print(f"\nТоп-20 колонок с наибольшим количеством пропусков:")
    print(missing_df.head(20))

print(rule)

In [None]:
# Columns holding a single distinct value (NaN counts as a value) carry no signal.
rule = "=" * 80
print("\n" + rule)
print("АНАЛИЗ КОНСТАНТНЫХ КОЛОНОК")
print(rule)

constant_cols = [
    column for column in df_full.columns
    if df_full[column].nunique(dropna=False) == 1
]

print(f"\nКонстантных колонок: {len(constant_cols)}")

if not constant_cols:
    print("\nКонстантных колонок не обнаружено")
else:
    print(f"\nСписок константных колонок:")
    for column in constant_cols:
        # The first row is representative because every row holds the same value.
        print(f"  - {column} (значение: {df_full[column].iloc[0]})")

print(rule)

In [None]:
# Time coverage of the data set: period, distinct clients, rows per date.
rule = "=" * 80
print("\n" + rule)
print("АНАЛИЗ ВРЕМЕННОГО РАСПРЕДЕЛЕНИЯ")
print(rule)

# Convert the observation date column to datetime in place.
date_column = config.DATE_COLUMN
df_full[date_column] = pd.to_datetime(df_full[date_column])
observation_dates = df_full[date_column]

print(f"\n1. ВРЕМЕННОЙ ПЕРИОД:")
print(f"   Начало: {observation_dates.min().date()}")
print(f"   Конец: {observation_dates.max().date()}")
print(f"   Уникальных дат: {observation_dates.nunique()}")

print(f"\n2. КЛИЕНТЫ:")
for id_column in ('cli_code', 'client_id'):
    print(f"   Уникальных {id_column}: {df_full[id_column].nunique():,}")

print(f"\n3. РАСПРЕДЕЛЕНИЕ ЗАПИСЕЙ ПО ДАТАМ:")
date_dist = df_full.groupby(date_column).size()
print(f"   Среднее записей на дату: {date_dist.mean():.0f}")
print(f"   Минимум: {date_dist.min():,}")
print(f"   Максимум: {date_dist.max():,}")

print(rule)

In [None]:
# Three-panel EDA figure: overall target counts, stacked target counts per
# segment, and churn rate per segment. Saved as a PNG and then displayed.
fig, axes = plt.subplots(1, 3, figsize=(18, 5))

# Panel 1: class counts on a log scale, annotated with count and share.
# NOTE(review): indexing target_dist[0] / target_dist[1] presumes both classes
# are present and labelled 0 and 1 — confirm for the data being plotted.
target_dist = df_full[config.TARGET_COLUMN].value_counts()
axes[0].bar(['No Churn', 'Churn'], [target_dist[0], target_dist[1]],
           color=['green', 'red'], alpha=0.7, edgecolor='black')
axes[0].set_title('Общее распределение Target', fontsize=14, fontweight='bold')
axes[0].set_ylabel('Количество')
axes[0].set_yscale('log')
for i, v in enumerate([target_dist[0], target_dist[1]]):
    axes[0].text(i, v, f'{v:,}\n({v/len(df_full)*100:.2f}%)',
                ha='center', va='bottom', fontsize=10)

# Panel 2: stacked bar of row counts per (segment, target) pair.
segment_churn = df_full.groupby([config.SEGMENT_COLUMN, 
                                  config.TARGET_COLUMN]).size().unstack(fill_value=0)
segment_churn.plot(kind='bar', stacked=True, ax=axes[1],
                  color=['green', 'red'], alpha=0.7, edgecolor='black')
axes[1].set_title('Распределение по сегментам', fontsize=14, fontweight='bold')
axes[1].set_xlabel('Сегмент')
axes[1].set_ylabel('Количество')
axes[1].legend(['No Churn', 'Churn'], loc='upper right')
axes[1].tick_params(axis='x', rotation=45)

# Panel 3: mean of the target per segment, expressed in percent.
churn_rates = df_full.groupby(config.SEGMENT_COLUMN)[config.TARGET_COLUMN].mean() * 100
axes[2].bar(range(len(churn_rates)), churn_rates.values,
           color='coral', alpha=0.7, edgecolor='black')
axes[2].set_xticks(range(len(churn_rates)))
axes[2].set_xticklabels(churn_rates.index, rotation=45, ha='right')
axes[2].set_title('Churn Rate по сегментам', fontsize=14, fontweight='bold')
axes[2].set_ylabel('Churn Rate (%)')
for i, v in enumerate(churn_rates.values):
    axes[2].text(i, v, f'{v:.2f}%', ha='center', va='bottom', fontsize=10)

# Save before show() so the file is written from the fully laid-out figure.
plt.tight_layout()
plt.savefig(config.FIGURES_DIR / 'eda_target_distribution.png', dpi=150, bbox_inches='tight')
plt.show()

print("Сохранено: figures/eda_target_distribution.png")

# ВРЕМЕННОЕ РАЗБИЕНИЕ

In [None]:
# Out-of-time split: the ordered unique observation dates are cut into
# train / validation / test by the configured proportions, so every
# validation date is later than every train date, and every test date is
# later than every validation date.
print("\n" + "="*80)
print("ВРЕМЕННОЕ РАЗБИЕНИЕ (TEMPORAL SPLIT)")
print("="*80)

df_sorted = df_full.sort_values(config.DATE_COLUMN).reset_index(drop=True)

# A DatetimeIndex always yields pandas Timestamps when indexed, so `.date()`
# works whether Series.unique() returns a numpy datetime64 array or a
# DatetimeArray (this differs between pandas versions). Missing dates are
# left out because they cannot be ordered.
unique_dates = pd.DatetimeIndex(
    df_sorted[config.DATE_COLUMN].dropna().unique()
).sort_values()
n_dates = len(unique_dates)

train_cutoff = int(n_dates * config.TRAIN_SIZE)
val_cutoff = int(n_dates * (config.TRAIN_SIZE + config.VAL_SIZE))

# Each of the three parts needs at least one date; otherwise the indexing
# below would wrap around (index -1) or run past the end.
if not (0 < train_cutoff < val_cutoff < n_dates):
    raise ValueError(
        f"Недостаточно уникальных дат ({n_dates}) для разбиения "
        f"{config.TRAIN_SIZE}/{config.VAL_SIZE}/{config.TEST_SIZE}"
    )

print(f"\nУникальных дат: {n_dates}")
print(f"Период: {unique_dates[0].date()} - {unique_dates[-1].date()}")

# Last date that still belongs to train / to validation (inclusive bounds).
train_end = unique_dates[train_cutoff - 1]
val_end = unique_dates[val_cutoff - 1]

print(f"\nCutoff даты:")
print(f"  Train: до {train_end.date()} ({train_cutoff} дат)")
print(f"  Val: {unique_dates[train_cutoff].date()} - {val_end.date()} ({val_cutoff - train_cutoff} дат)")
print(f"  Test (OOT): {unique_dates[val_cutoff].date()}+ ({n_dates - val_cutoff} дат)")

split_dates = df_sorted[config.DATE_COLUMN]
train_df = df_sorted[split_dates <= train_end].copy()
val_df = df_sorted[(split_dates > train_end) & (split_dates <= val_end)].copy()
test_df = df_sorted[split_dates > val_end].copy()

print(f"\n{'='*80}")
print("СТАТИСТИКА ПО SPLIT")
print(f"{'='*80}")

for name, df in [('TRAIN', train_df), ('VALIDATION', val_df), ('TEST (OOT)', test_df)]:
    churn_r = df[config.TARGET_COLUMN].mean()
    print(f"\n{name}:")
    print(f"  Записей: {len(df):,}")
    print(f"  Клиентов (cli_code): {df['cli_code'].nunique():,}")
    print(f"  Период: {df[config.DATE_COLUMN].min().date()} - {df[config.DATE_COLUMN].max().date()}")
    print(f"  Churn rate: {churn_r:.4f} ({churn_r*100:.2f}%)")
    print(f"  Процент от общего: {len(df)/len(df_full)*100:.2f}%")

# Sanity checks on the construction above: the date ranges must not overlap.
assert train_df[config.DATE_COLUMN].max() < val_df[config.DATE_COLUMN].min(), "Data leakage detected!"
assert val_df[config.DATE_COLUMN].max() < test_df[config.DATE_COLUMN].min(), "Data leakage detected!"
print(f"\nTemporal ordering verified - NO DATA LEAKAGE")

print("="*80)

In [None]:
# Gap detection: a client has a gap when two consecutive train observations
# are more than one calendar month apart. Such clients are removed from all
# three splits. Gaps are detected on the train split only.
if config.REMOVE_GAPS:
    print("\n" + "="*80)
    print("GAP DETECTION - УДАЛЕНИЕ КЛИЕНТОВ С ПРОБЕЛАМИ")
    print("="*80)

    print("\nАнализ пробелов в train данных...")

    unique_clients = train_df['cli_code'].unique()
    chunk_size = 10000
    clients_with_gaps_list = []

    # Clients are processed in chunks to bound peak memory.
    for i in range(0, len(unique_clients), chunk_size):
        chunk_clients = unique_clients[i:i+chunk_size]
        # Only the two columns needed for the check are copied.
        chunk = train_df.loc[
            train_df['cli_code'].isin(chunk_clients),
            ['cli_code', config.DATE_COLUMN],
        ]
        chunk = chunk.sort_values(['cli_code', config.DATE_COLUMN])

        # A monotonically increasing month counter: differences between rows
        # equal differences of monthly period ordinals, without a per-row apply.
        chunk_dates = chunk[config.DATE_COLUMN]
        chunk['month_num'] = chunk_dates.dt.year * 12 + chunk_dates.dt.month
        chunk['month_diff'] = chunk.groupby('cli_code')['month_num'].diff()

        max_gap = chunk.groupby('cli_code')['month_diff'].max()
        total_gaps = chunk['month_diff'].gt(1).groupby(chunk['cli_code']).sum()
        gaps = pd.DataFrame({'max_gap': max_gap, 'total_gaps': total_gaps}).reset_index()

        chunk_gaps = gaps[gaps['max_gap'] > 1]
        clients_with_gaps_list.append(chunk_gaps)

        if (i // chunk_size + 1) % 10 == 0:
            gc.collect()
            # Cap the counter so the last chunk never reports more than the total.
            processed = min(i + chunk_size, len(unique_clients))
            print(f"  Обработано {processed:,}/{len(unique_clients):,} клиентов")

    if clients_with_gaps_list:
        clients_with_gaps = pd.concat(clients_with_gaps_list, ignore_index=True)
    else:
        # Empty train split: nothing to concatenate.
        clients_with_gaps = pd.DataFrame(columns=['cli_code', 'max_gap', 'total_gaps'])

    gap_pct = len(clients_with_gaps) / len(unique_clients) * 100 if len(unique_clients) else 0.0
    print(f"\nКлиентов с пробелами: {len(clients_with_gaps):,} ({gap_pct:.2f}%)")

    if len(clients_with_gaps) > 0:
        bad_clients = set(clients_with_gaps['cli_code'])

        train_before = len(train_df)
        val_before = len(val_df)
        test_before = len(test_df)

        train_df = train_df[~train_df['cli_code'].isin(bad_clients)].copy()
        val_df = val_df[~val_df['cli_code'].isin(bad_clients)].copy()
        test_df = test_df[~test_df['cli_code'].isin(bad_clients)].copy()

        print(f"\nУдалено:")
        print(f"  Train: {train_before:,} -> {len(train_df):,} (-{train_before - len(train_df):,})")
        print(f"  Val: {val_before:,} -> {len(val_df):,} (-{val_before - len(val_df):,})")
        print(f"  Test: {test_before:,} -> {len(test_df):,} (-{test_before - len(test_df):,})")

        del clients_with_gaps, bad_clients
        gc.collect()
    else:
        print("\nКлиентов с пробелами не обнаружено")

    print("="*80)
else:
    print("\nGap detection отключен")

# PREPROCESSING PIPELINE

In [None]:
class PreprocessingPipeline:
    """Learn preprocessing parameters on train data and replay them on other frames.

    Steps, in order: drop constant columns, clip outliers to IQR bounds,
    impute missing values, drop highly correlated numeric features. All
    settings are read from the config object given to the constructor, not
    from a module-level global.
    """

    # Dtype names eligible for IQR clipping.
    _CLIPPABLE_DTYPES = ('float64', 'float32', 'int64', 'int32', 'int16', 'int8')

    def __init__(self, config):
        """Store the config and initialise the (still empty) fitted state."""
        self.config = config
        self.fitted_columns = None
        self.final_features = None
        self.constant_cols = []
        self.outlier_bounds = {}
        self.numeric_imputer = None
        self.categorical_imputer = None
        self.numeric_cols_for_imputation = []
        self.categorical_cols_for_imputation = []
        self.features_to_drop_corr = []

    def _protected_columns(self):
        """Return the ID columns plus the target; these are never treated as features."""
        return list(self.config.ID_COLUMNS) + [self.config.TARGET_COLUMN]

    def fit_transform(self, train_df):
        """Fit every step on ``train_df`` and return the transformed copy.

        The input frame is not modified. Previously fitted state is discarded
        first, so calling this twice does not accumulate columns or bounds.
        """
        print("\n" + "="*80)
        print("PREPROCESSING: FIT_TRANSFORM ON TRAIN")
        print("="*80)

        df = train_df.copy()
        protected = self._protected_columns()

        # Start from a clean slate so a refit does not inherit old results.
        self.constant_cols = []
        self.outlier_bounds = {}
        self.features_to_drop_corr = []

        self.fitted_columns = [c for c in df.columns if c not in protected]

        df = self._remove_constants(df, fit=True)
        df = self._handle_outliers(df, fit=True)
        df = self._handle_missing(df, fit=True)
        df = self._remove_correlations(df, fit=True)

        self.final_features = [c for c in df.columns if c not in protected]

        print(f"\nPreprocessing complete")
        print(f"  Final features: {len(self.final_features)}")

        return df

    def transform(self, df, dataset_name='test'):
        """Apply the fitted steps to ``df`` and return the transformed copy.

        Raises:
            RuntimeError: if ``fit_transform`` has not been called yet.
        """
        if self.final_features is None:
            raise RuntimeError("PreprocessingPipeline.transform called before fit_transform")

        print(f"\nPreprocessing: {dataset_name}")

        df = df.copy()

        df = self._remove_constants(df, fit=False)
        df = self._handle_outliers(df, fit=False)
        df = self._handle_missing(df, fit=False)
        df = self._remove_correlations(df, fit=False)
        df = self._align_columns(df, dataset_name)

        print(f"  {dataset_name}: {df.shape}")

        return df

    def _remove_constants(self, df, fit):
        """Drop feature columns that hold one distinct value (NaN counts as a value).

        With ``fit=True`` the constant columns are detected and remembered;
        with ``fit=False`` the remembered columns are dropped when present.
        """
        if fit:
            print("\n1. Removing constant columns...")
            protected = self._protected_columns()
            self.constant_cols = [
                col for col in df.columns
                if col not in protected and df[col].nunique(dropna=False) == 1
            ]

            if self.constant_cols:
                df = df.drop(columns=self.constant_cols)
                print(f"   Removed: {len(self.constant_cols)}")
            else:
                print(f"   No constant columns found")
        elif self.constant_cols:
            df = df.drop(columns=self.constant_cols, errors='ignore')
        return df

    def _handle_outliers(self, df, fit):
        """Clip selected numeric columns to ``Q1 - k*IQR .. Q3 + k*IQR``.

        Columns are selected by keyword match on the column name. Bounds are
        computed with ``fit=True`` and reused with ``fit=False``. Does nothing
        when ``HANDLE_OUTLIERS`` is off.
        """
        if not self.config.HANDLE_OUTLIERS:
            return df

        if fit:
            print("\n2. Handling outliers (IQR clipping)...")
            keywords = ['profit', 'income', 'expense', 'margin', 'provision',
                        'balance', 'assets', 'liabilities', 'revenue', 'cost']
            excluded = self._protected_columns() + list(self.config.CATEGORICAL_FEATURES)
            multiplier = self.config.OUTLIER_IQR_MULTIPLIER

            self.outlier_bounds = {}
            for col in df.columns:
                if col in excluded or not any(kw in col.lower() for kw in keywords):
                    continue
                if df[col].dtype not in self._CLIPPABLE_DTYPES:
                    continue

                q1, q3 = df[col].quantile([0.25, 0.75])
                iqr = q3 - q1
                # With a zero (or NaN) IQR both bounds collapse to one value and
                # clipping would turn the whole column into a constant.
                if not iqr > 0:
                    continue

                self.outlier_bounds[col] = {
                    'lower': q1 - multiplier * iqr,
                    'upper': q3 + multiplier * iqr
                }

            for col, bounds in self.outlier_bounds.items():
                df[col] = df[col].clip(lower=bounds['lower'], upper=bounds['upper'])

            print(f"   Clipped: {len(self.outlier_bounds)} columns")
        else:
            for col, bounds in self.outlier_bounds.items():
                if col in df.columns:
                    df[col] = df[col].clip(lower=bounds['lower'], upper=bounds['upper'])

        return df

    def _handle_missing(self, df, fit):
        """Impute numeric columns with the median and categorical ones with the mode.

        Imputers are fitted with ``fit=True`` and reused with ``fit=False``.
        """
        if fit:
            print("\n3. Handling missing values...")
            protected = self._protected_columns()
            self.numeric_cols_for_imputation = [
                c for c in df.select_dtypes(include=[np.number]).columns
                if c not in protected
            ]
            self.categorical_cols_for_imputation = [
                c for c in self.config.CATEGORICAL_FEATURES if c in df.columns
            ]

            self.numeric_imputer = SimpleImputer(strategy='median')
            self.categorical_imputer = SimpleImputer(strategy='most_frequent')

            if self.numeric_cols_for_imputation:
                df[self.numeric_cols_for_imputation] = self.numeric_imputer.fit_transform(
                    df[self.numeric_cols_for_imputation]
                )

            if self.categorical_cols_for_imputation:
                df[self.categorical_cols_for_imputation] = self.categorical_imputer.fit_transform(
                    df[self.categorical_cols_for_imputation]
                )

            print(f"   Imputed: {len(self.numeric_cols_for_imputation)} numeric, "
                  f"{len(self.categorical_cols_for_imputation)} categorical")
        else:
            present = [c for c in self.numeric_cols_for_imputation if c in df.columns]
            if present:
                df[present] = self.numeric_imputer.transform(df[present])

            present = [c for c in self.categorical_cols_for_imputation if c in df.columns]
            if present:
                df[present] = self.categorical_imputer.transform(df[present])

        return df

    def _remove_correlations(self, df, fit):
        """Drop numeric features whose absolute correlation with an earlier one exceeds the threshold.

        Only the upper triangle of the correlation matrix is inspected, so of
        every correlated pair the column that appears later in the frame is
        dropped. Does nothing when ``REMOVE_HIGH_CORRELATIONS`` is off.
        """
        if not self.config.REMOVE_HIGH_CORRELATIONS:
            return df

        if fit:
            threshold = self.config.CORRELATION_THRESHOLD
            print(f"\n4. Removing high correlations (threshold={threshold})...")
            excluded = self._protected_columns() + list(self.config.CATEGORICAL_FEATURES)
            numeric = [c for c in df.select_dtypes(include=[np.number]).columns
                       if c not in excluded]

            self.features_to_drop_corr = []
            if len(numeric) > 1:
                corr = df[numeric].corr().abs()
                # Keep only entries strictly above the diagonal.
                upper = corr.where(np.triu(np.ones(corr.shape), k=1).astype(bool))
                self.features_to_drop_corr = [c for c in upper.columns
                                              if (upper[c] > threshold).any()]

                if self.features_to_drop_corr:
                    df = df.drop(columns=self.features_to_drop_corr)
                    print(f"   Removed: {len(self.features_to_drop_corr)} features")
                else:
                    print(f"   No highly correlated features found")
        elif self.features_to_drop_corr:
            df = df.drop(columns=self.features_to_drop_corr, errors='ignore')

        return df

    def _align_columns(self, df, name):
        """Reorder ``df`` to ID columns, target, then the fitted feature list.

        Fitted features missing from ``df`` are added and filled with 0;
        columns unknown to the fitted pipeline are dropped. ``name`` is
        accepted for symmetry with ``transform`` and is not used.
        """
        preserve = [c for c in self.config.ID_COLUMNS if c in df.columns]
        if self.config.TARGET_COLUMN in df.columns:
            preserve.append(self.config.TARGET_COLUMN)

        current = [c for c in df.columns if c not in preserve]
        missing = [c for c in self.final_features if c not in current]
        extra = [c for c in current if c not in self.final_features]

        for c in missing:
            df[c] = 0

        if extra:
            df = df.drop(columns=extra)

        order = preserve + self.final_features
        df = df[[c for c in order if c in df.columns]]

        return df

print("PreprocessingPipeline класс определен")

In [None]:
# Fit the pipeline on train only, then replay it on validation and test.
pipeline = PreprocessingPipeline(config)
train_processed = pipeline.fit_transform(train_df)
val_processed = pipeline.transform(val_df, 'validation')
test_processed = pipeline.transform(test_df, 'test (OOT)')

# Summary of what each step did, one printed line per entry.
rule = "=" * 80
preprocessing_report = [
    "\n" + rule,
    "PREPROCESSING SUMMARY",
    rule,
    f"\nШаги preprocessing:",
    f"  1. Константные колонки удалено: {len(pipeline.constant_cols)}",
    f"  2. Выбросы обработано (IQR clipping): {len(pipeline.outlier_bounds)} колонок",
    f"  3. Пропуски заполнено:",
    f"     - Числовых: {len(pipeline.numeric_cols_for_imputation)}",
    f"     - Категориальных: {len(pipeline.categorical_cols_for_imputation)}",
    f"  4. Коррелирующих признаков удалено: {len(pipeline.features_to_drop_corr)}",
    f"\nИтоговое количество признаков: {len(pipeline.final_features)}",
    rule,
]
for report_line in preprocessing_report:
    print(report_line)

# РАЗДЕЛЕНИЕ ПО СЕГМЕНТАМ

In [None]:
# Split each processed frame into the two modelling segments and drop the
# identifier and calendar columns from them.
rule = "=" * 80
dash_rule = "-" * 80
segment_col = config.SEGMENT_COLUMN
target_col = config.TARGET_COLUMN
processed_splits = (train_processed, val_processed, test_processed)

print("\n" + rule)
print("РАЗДЕЛЕНИЕ ПО СЕГМЕНТАМ")
print(rule)

# ---- Segment 1 ----
print(f"\n1. SEGMENT 1: {config.SEGMENT_1_NAME.upper()}")
print(dash_rule)

seg1_train, seg1_val, seg1_test = [
    split[split[segment_col].isin(config.SEGMENT_1_VALUES)].copy()
    for split in processed_splits
]

print(f"Исходные размеры:")
print(f"  Train: {seg1_train.shape}")
print(f"  Val: {seg1_val.shape}")
print(f"  Test: {seg1_test.shape}")

print(f"\nУникальных значений segment_group: {seg1_train[segment_col].nunique()}")
print(f"Значения: {seg1_train[segment_col].unique()}")

# The segment column is dropped for segment 1 together with the ID and
# calendar columns.
temporal_features = ['obs_year', 'obs_month', 'obs_quarter']
id_cols_seg1 = [c for c in config.ID_COLUMNS if c in seg1_train.columns]
cols_to_drop_seg1 = [segment_col] + id_cols_seg1 + temporal_features
seg1_train, seg1_val, seg1_test = [
    split.drop(columns=[c for c in cols_to_drop_seg1 if c in split.columns])
    for split in (seg1_train, seg1_val, seg1_test)
]

print(f"\nИтоговые размеры seg1:")
print(f"  Train: {seg1_train.shape} | Churn: {seg1_train[target_col].mean()*100:.2f}%")
print(f"  Val: {seg1_val.shape} | Churn: {seg1_val[target_col].mean()*100:.2f}%")
print(f"  Test: {seg1_test.shape} | Churn: {seg1_test[target_col].mean()*100:.2f}%")

# ---- Segment 2 ----
print(f"\n\n2. SEGMENT 2: {config.SEGMENT_2_NAME.upper()}")
print(dash_rule)

seg2_train, seg2_val, seg2_test = [
    split[split[segment_col].isin(config.SEGMENT_2_VALUES)].copy()
    for split in processed_splits
]

print(f"Исходные размеры:")
print(f"  Train: {seg2_train.shape}")
print(f"  Val: {seg2_val.shape}")
print(f"  Test: {seg2_test.shape}")

print(f"\nУникальных значений segment_group: {seg2_train[segment_col].nunique()}")
print(f"Значения: {seg2_train[segment_col].unique()}")

# Segment 2 keeps the segment column; it is encoded numerically below.
id_cols_seg2 = [c for c in config.ID_COLUMNS if c in seg2_train.columns]
cols_to_drop_seg2 = id_cols_seg2 + temporal_features
seg2_train, seg2_val, seg2_test = [
    split.drop(columns=[c for c in cols_to_drop_seg2 if c in split.columns])
    for split in (seg2_train, seg2_val, seg2_test)
]

segment_mapping = {'MIDDLE_BUSINESS': 0, 'LARGE_BUSINESS': 1}
for split in (seg2_train, seg2_val, seg2_test):
    split[segment_col] = split[segment_col].map(segment_mapping)
print(f"   MIDDLE_BUSINESS -> 0, LARGE_BUSINESS -> 1")

print(f"\nИтоговые размеры seg2:")
print(f"  Train: {seg2_train.shape} | Churn: {seg2_train[target_col].mean()*100:.2f}%")
print(f"  Val: {seg2_val.shape} | Churn: {seg2_val[target_col].mean()*100:.2f}%")
print(f"  Test: {seg2_test.shape} | Churn: {seg2_test[target_col].mean()*100:.2f}%")

# CORRELATION ANALYSIS

In [None]:
def calculate_pointbiserial_correlations(df, target_col, p_threshold=0.05):
    """Compute the point-biserial correlation of every numeric column with the target.

    Args:
        df: frame holding the features and the target column.
        target_col: name of the target column.
        p_threshold: p-values below this are flagged as significant.

    Returns:
        DataFrame with columns ``feature``, ``correlation``,
        ``abs_correlation``, ``p_value`` and ``significant``, sorted by
        ``abs_correlation`` descending. Features with fewer than two distinct
        finite values are left out. The frame is empty, but still has these
        columns, when no feature qualifies.
    """
    result_columns = ['feature', 'correlation', 'abs_correlation', 'p_value', 'significant']

    numeric_cols = [
        c for c in df.select_dtypes(include=[np.number]).columns
        if c != target_col
    ]

    target_values = df[target_col].astype(float).to_numpy()
    target_is_finite = np.isfinite(target_values)

    print(f"Анализируем {len(numeric_cols)} числовых признаков...")

    results = []
    for col in numeric_cols:
        feature_values = df[col].astype(float).to_numpy()

        # Use only rows where both values are finite; a single NaN would
        # otherwise turn the whole correlation into NaN.
        valid = target_is_finite & np.isfinite(feature_values)
        x = feature_values[valid]

        # Correlation is undefined for a constant (or near-empty) feature.
        if x.size < 2 or np.all(x == x[0]):
            continue

        try:
            corr, pval = pointbiserialr(target_values[valid], x)
        except Exception as e:
            # Best effort: report the failing feature and carry on.
            print(f"  Ошибка для {col}: {e}")
            continue

        results.append({
            'feature': col,
            'correlation': corr,
            'abs_correlation': abs(corr),
            'p_value': pval,
            'significant': pval < p_threshold
        })

    # Explicit columns keep sort_values working when `results` is empty.
    corr_df = pd.DataFrame(results, columns=result_columns)
    corr_df = corr_df.sort_values('abs_correlation', ascending=False).reset_index(drop=True)

    return corr_df


def plot_top_correlations(corr_df, segment_name, top_n=30, save_path=None):
    """Draw a horizontal bar chart of the first ``top_n`` rows of ``corr_df``.

    Bars are green for non-negative correlations and red otherwise. The
    figure is written to ``save_path`` when one is given and is always shown.
    Expects ``corr_df`` to have ``feature`` and ``correlation`` columns.
    """
    top_corr = corr_df.head(top_n).copy()
    bar_positions = range(len(top_corr))
    bar_colors = [
        'green' if value >= 0 else 'red'
        for value in top_corr['correlation']
    ]

    figure_height = max(8, top_n * 0.3)
    fig, ax = plt.subplots(figsize=(12, figure_height))

    ax.barh(
        bar_positions,
        top_corr['correlation'].values,
        color=bar_colors,
        alpha=0.7,
        edgecolor='black',
    )

    ax.set_yticks(bar_positions)
    ax.set_yticklabels(top_corr['feature'].values, fontsize=9)
    ax.set_xlabel('Point-Biserial Correlation', fontsize=12, fontweight='bold')
    ax.set_title(
        f'Top-{top_n} Correlations with Target: {segment_name}',
        fontsize=14, fontweight='bold', pad=20,
    )
    # Zero line separates positive from negative correlations.
    ax.axvline(x=0, color='black', linestyle='-', linewidth=0.8)
    ax.grid(axis='x', alpha=0.3)

    # First row of the frame ends up at the top of the chart.
    ax.invert_yaxis()

    plt.tight_layout()

    if save_path:
        plt.savefig(save_path, dpi=150, bbox_inches='tight')
        print(f"Сохранено: {save_path}")

    plt.show()

In [None]:
# Point-biserial correlation report for segment 1, computed on train only.
print("\n" + "="*80)
print(f"CORRELATION ANALYSIS: SEGMENT 1 - {config.SEGMENT_1_NAME.upper()}")
print("="*80)

print(f"\nРасчет Point-Biserial корреляций на train данных...")
start_time = time.time()

corr_seg1 = calculate_pointbiserial_correlations(
    seg1_train,
    config.TARGET_COLUMN,
    config.CORRELATION_P_VALUE_THRESHOLD
)

elapsed = time.time() - start_time
print(f"\nРасчет завершен за {elapsed:.2f} сек")

# The thresholds shown in the messages are taken from the config so that
# the text always matches the values actually used for filtering.
print(f"\nОБЩАЯ СТАТИСТИКА:")
print(f"  Всего признаков: {len(corr_seg1)}")
print(f"  Значимых (p<{config.CORRELATION_P_VALUE_THRESHOLD}): {corr_seg1['significant'].sum()}")
print(f"  Средняя |корреляция|: {corr_seg1['abs_correlation'].mean():.4f}")
print(f"  Максимальная |корреляция|: {corr_seg1['abs_correlation'].max():.4f}")

# A near-perfect correlation with the target is treated as a leakage warning.
leakage_features = corr_seg1[corr_seg1['abs_correlation'] > config.DATA_LEAKAGE_THRESHOLD]
if len(leakage_features) > 0:
    print(f"\n  ВНИМАНИЕ: Обнаружены признаки с очень высокой корреляцией (>{config.DATA_LEAKAGE_THRESHOLD}):")
    print(leakage_features[['feature', 'correlation', 'p_value']].head(10))
    print(f"\n Это может указывать на data leakage")
else:
    print(f"\n Признаков с подозрением на data leakage не обнаружено")

print(f"\nТОП-{config.TOP_N_CORRELATIONS} КОРРЕЛЯЦИЙ (по модулю):")
print(corr_seg1.head(config.TOP_N_CORRELATIONS)[['feature', 'correlation', 'p_value', 'significant']].to_string(index=False))

In [None]:
# Chart of the strongest target correlations for segment 1, saved under figures/.
plot_top_correlations(
    corr_df=corr_seg1,
    segment_name=config.SEGMENT_1_NAME,
    top_n=config.TOP_N_VISUALIZATION,
    save_path=config.FIGURES_DIR / 'correlation_segment1.png',
)

In [None]:
# Point-biserial correlation report for segment 2, computed on train only.
print("\n" + "="*80)
print(f"CORRELATION ANALYSIS: SEGMENT 2 - {config.SEGMENT_2_NAME.upper()}")
print("="*80)

print(f"\nРасчет Point-Biserial корреляций на train данных...")
start_time = time.time()

corr_seg2 = calculate_pointbiserial_correlations(
    seg2_train,
    config.TARGET_COLUMN,
    config.CORRELATION_P_VALUE_THRESHOLD
)

elapsed = time.time() - start_time
print(f"\nРасчет завершен за {elapsed:.2f} сек")

# The thresholds shown in the messages are taken from the config so that
# the text always matches the values actually used for filtering.
print(f"\nОБЩАЯ СТАТИСТИКА:")
print(f"  Всего признаков: {len(corr_seg2)}")
print(f"  Значимых (p<{config.CORRELATION_P_VALUE_THRESHOLD}): {corr_seg2['significant'].sum()}")
print(f"  Средняя |корреляция|: {corr_seg2['abs_correlation'].mean():.4f}")
print(f"  Максимальная |корреляция|: {corr_seg2['abs_correlation'].max():.4f}")

# A near-perfect correlation with the target is treated as a leakage warning.
leakage_features = corr_seg2[corr_seg2['abs_correlation'] > config.DATA_LEAKAGE_THRESHOLD]
if len(leakage_features) > 0:
    print(f"\n ВНИМАНИЕ: Обнаружены признаки с очень высокой корреляцией (>{config.DATA_LEAKAGE_THRESHOLD}):")
    print(leakage_features[['feature', 'correlation', 'p_value']].head(10))
    print(f"\n Это может указывать на data leakage")
else:
    print(f"\n Признаков с подозрением на data leakage не обнаружено")

print(f"\nТОП-{config.TOP_N_CORRELATIONS} КОРРЕЛЯЦИЙ (по модулю):")
print(corr_seg2.head(config.TOP_N_CORRELATIONS)[['feature', 'correlation', 'p_value', 'significant']].to_string(index=False))

In [None]:
# Chart of the strongest target correlations for segment 2, saved under figures/.
plot_top_correlations(
    corr_df=corr_seg2,
    segment_name=config.SEGMENT_2_NAME,
    top_n=config.TOP_N_VISUALIZATION,
    save_path=config.FIGURES_DIR / 'correlation_segment2.png',
)

# СТАТИСТИКА ABT И PSI

In [None]:
# Descriptive statistics of the final analytical base table, per segment.
print("\n" + "="*80)
print("СТАТИСТИКА ИТОГОВОЙ ВИТРИНЫ ABT")
print("="*80)

data = {
    'Segment 1': {'train': seg1_train, 'val': seg1_val, 'test': seg1_test},
    'Segment 2': {'train': seg2_train, 'val': seg2_val, 'test': seg2_test}
}

segments_info = {
    'Segment 1': config.SEGMENT_1_NAME,
    'Segment 2': config.SEGMENT_2_NAME
}

abt_statistics = {}

for seg_id, seg_data in data.items():
    print(f"\n{seg_id}: {segments_info[seg_id]}")
    print("="*80)

    # Loop-local names on purpose: reusing train_df / val_df / test_df here
    # would overwrite the temporal splits that earlier cells depend on.
    seg_train = seg_data['train']
    seg_val = seg_data['val']
    seg_test = seg_data['test']

    seg_full = pd.concat([seg_train, seg_val, seg_test], axis=0)

    numeric_cols = seg_full.select_dtypes(include=[np.number]).columns.tolist()
    non_numeric_cols = seg_full.select_dtypes(exclude=[np.number]).columns.tolist()

    # The target is not a predictor.
    if config.TARGET_COLUMN in numeric_cols:
        numeric_cols.remove(config.TARGET_COLUMN)

    # Named seg_stats rather than `stats` so the `scipy.stats` import stays usable.
    seg_stats = {
        'Количество наблюдений': len(seg_full),
        'Количество событий (churn=1)': int(seg_full[config.TARGET_COLUMN].sum()),
        'Уровень целевой переменной (%)': f"{seg_full[config.TARGET_COLUMN].mean()*100:.2f}%",
        'Количество числовых предикторов': len(numeric_cols),
        'Количество не числовых предикторов': len(non_numeric_cols),
        'Всего признаков': len(numeric_cols) + len(non_numeric_cols),
        'Train размер': len(seg_train),
        'Val размер': len(seg_val),
        'Test размер': len(seg_test)
    }

    abt_statistics[seg_id] = seg_stats

    print("\nСтатистика ABT:")
    print("-" * 80)
    for key, value in seg_stats.items():
        print(f"  {key:45s}: {value}")

In [None]:
def calculate_psi(expected, actual, bins=10):
    """Return the Population Stability Index between two numeric samples.

    Bins are ``bins`` equal-width intervals over the combined range of both
    samples, with the outermost edges opened to -inf / +inf. Empty bins are
    given a share of 0.0001 so the logarithm stays defined.

    Args:
        expected: reference sample (array-like of numbers).
        actual: sample compared against the reference.
        bins: number of bins.

    Returns:
        The PSI as a float, or NaN when either sample has no finite values.

    Raises:
        ValueError, TypeError: when the input cannot be converted to float.
    """
    expected = np.asarray(expected, dtype=float)
    actual = np.asarray(actual, dtype=float)

    # NaN or inf would poison min/max and with them every bin edge.
    expected = expected[np.isfinite(expected)]
    actual = actual[np.isfinite(actual)]

    if expected.size == 0 or actual.size == 0:
        return np.nan

    combined = np.concatenate([expected, actual])
    breakpoints = np.linspace(combined.min(), combined.max(), bins + 1)
    breakpoints[0] = -np.inf
    breakpoints[-1] = np.inf

    expected_counts = np.histogram(expected, bins=breakpoints)[0]
    actual_counts = np.histogram(actual, bins=breakpoints)[0]

    expected_percents = expected_counts / len(expected)
    actual_percents = actual_counts / len(actual)

    # Floor for empty bins to avoid log(0) and division by zero.
    expected_percents = np.where(expected_percents == 0, 0.0001, expected_percents)
    actual_percents = np.where(actual_percents == 0, 0.0001, actual_percents)

    psi_values = (actual_percents - expected_percents) * np.log(actual_percents / expected_percents)
    return np.sum(psi_values)

def interpret_psi(psi_value):
    """Map a PSI value to a human-readable stability label.

    Args:
        psi_value: scalar PSI value; may be NaN/None when the index could not
            be computed for a feature.

    Returns:
        A label string: "excellent" below 0.1, "acceptable" below 0.2,
        "needs attention" otherwise, and a dedicated "not computed" label for
        missing values (which would otherwise fail both comparisons and be
        reported as significant drift).
    """
    if pd.isna(psi_value):
        return "Нет данных - PSI не рассчитан"
    if psi_value < 0.1:
        return "Отлично - модель стабильна"
    if psi_value < 0.2:
        return "Приемлемо - небольшие изменения"
    return "Требует внимания - значительный drift"

print("\n" + "="*80)
print("PSI (POPULATION STABILITY INDEX) - TRAIN vs TEST")
print("="*80)

# Per-segment PSI summary, keyed by segment id.
psi_results = {}

for seg_id, seg_data in data.items():
    print(f"\n{seg_id}: {segments_info[seg_id]}")
    print("="*80)
    
    train_df = seg_data['train']
    test_df = seg_data['test']
    
    X_train = train_df.drop(columns=[config.TARGET_COLUMN])
    X_test = test_df.drop(columns=[config.TARGET_COLUMN])
    
    feature_psi = {}
    # Features whose PSI could not be computed, kept so the failure is visible
    # in the report instead of being silently turned into NaN.
    failed_features = []
    
    for col in X_train.columns:
        try:
            psi_val = calculate_psi(X_train[col].values, X_test[col].values)
        except Exception:
            # Best-effort: one problematic column (e.g. non-numeric) must not
            # abort the whole report; it is recorded as NaN and listed below.
            psi_val = np.nan
            failed_features.append(col)
        feature_psi[col] = psi_val
    
    psi_df = pd.DataFrame({
        'Feature': list(feature_psi.keys()),
        'PSI': list(feature_psi.values())
    }).sort_values('PSI', ascending=False).reset_index(drop=True)
    
    psi_df['Interpretation'] = psi_df['PSI'].apply(interpret_psi)
    
    # Series.mean skips NaN, so failed features do not affect the average.
    overall_psi = psi_df['PSI'].mean()
    
    print(f"\nОбщий PSI (среднее по всем признакам): {overall_psi:.6f}")
    print(f"Интерпретация: {interpret_psi(overall_psi)}")
    
    if failed_features:
        print(f"Не удалось рассчитать PSI: {len(failed_features)} признаков ({', '.join(map(str, failed_features))})")
    
    print("\n" + "-" * 80)
    print("ТОП-15 признаков с наибольшим PSI:")
    print("-" * 80)
    print(psi_df.head(15).to_string(index=False))
    
    # NaN rows match none of the three conditions below.
    excellent = (psi_df['PSI'] < 0.1).sum()
    acceptable = ((psi_df['PSI'] >= 0.1) & (psi_df['PSI'] < 0.2)).sum()
    concerning = (psi_df['PSI'] >= 0.2).sum()
    
    print("\n" + "-" * 80)
    print("Распределение признаков по PSI:")
    print(f"Отличная стабильность (PSI < 0.1):     {excellent} признаков ({excellent/len(psi_df)*100:.1f}%)")
    print(f"Приемлемая стабильность (0.1-0.2):    {acceptable} признаков ({acceptable/len(psi_df)*100:.1f}%)")
    print(f"Требует внимания (PSI >= 0.2):         {concerning} признаков ({concerning/len(psi_df)*100:.1f}%)")
    
    psi_results[seg_id] = {
        'overall_psi': overall_psi,
        'psi_df': psi_df,
        'excellent': excellent,
        'acceptable': acceptable,
        'concerning': concerning
    }
    
    # Segment ids look like "Segment 1"; the second token is used in file names.
    seg_num = seg_id.split()[1]
    psi_file = config.OUTPUT_DIR / f'psi_analysis_seg{seg_num}.csv'
    psi_df.to_csv(psi_file, index=False)
    print(f"\nСохранено: {psi_file}")

In [None]:
# Static description of how the sample was split and why.
methodology_lines = (
    "\nМЕТОД РАЗБИЕНИЯ:",
    "-" * 80,
    "\nTEMPORAL SPLIT (по времени)",
    "  - Train: 70% первых наблюдений по времени",
    "  - Validation: 15% средних",
    "  - Test (OOT): 15% последних",
    "\nОБОСНОВАНИЕ:",
    "  - Предотвращение data leakage",
    "  - Test = Out-of-Time validation (реальное будущее)",
    "  - Gap detection: удалены клиенты с пропусками в наблюдениях",
)
print("\n".join(methodology_lines))

banner = "=" * 80
print("\n" + banner)
print("ТАБЛИЦА РАЗБИЕНИЯ ВЫБОРКИ")
print(banner)

# One row per (segment, split) with size, event count and churn rate.
split_table_data = []

for seg_id, seg_data in data.items():
    for split_name in ('train', 'val', 'test'):
        df = seg_data[split_name]
        n_rows = len(df)
        target = df[config.TARGET_COLUMN]
        split_table_data.append({
            'Сегмент': seg_id,
            'Роль данных': split_name.upper(),
            'Количество наблюдений': n_rows,
            'Количество событий (churn=1)': int(target.sum()),
            'Churn Rate (%)': f"{target.mean()*100:.2f}%",
        })

split_table = pd.DataFrame(split_table_data)
print("\n" + split_table.to_string(index=False))

# ОБУЧЕНИЕ МОДЕЛЕЙ

In [None]:
def calculate_metrics(y_true, y_pred_proba, threshold=0.5):
    """Compute binary classification metrics for predicted probabilities.

    Args:
        y_true: true binary labels (0/1).
        y_pred_proba: predicted probabilities of the positive class; any
            array-like (list, ndarray, Series) is accepted.
        threshold: probabilities greater than or equal to this value are
            classified as positive.

    Returns:
        dict with keys ``threshold``, ``roc_auc``, ``precision``, ``recall``,
        ``f1``, ``gini`` (``2 * roc_auc - 1``) and the confusion-matrix cells
        ``tn``, ``fp``, ``fn``, ``tp``.
    """
    # Plain lists do not support element-wise comparison with a scalar.
    y_pred_proba = np.asarray(y_pred_proba)
    y_pred = (y_pred_proba >= threshold).astype(int)
    
    metrics = {
        'threshold': threshold,
        'roc_auc': roc_auc_score(y_true, y_pred_proba),
        'precision': precision_score(y_true, y_pred, zero_division=0),
        'recall': recall_score(y_true, y_pred, zero_division=0),
        'f1': f1_score(y_true, y_pred, zero_division=0)
    }
    
    metrics['gini'] = 2 * metrics['roc_auc'] - 1
    
    # Fix the label set so the matrix is always 2x2; without it a sample in
    # which only one class occurs produces a 1x1 matrix and indexing fails.
    cm = confusion_matrix(y_true, y_pred, labels=[0, 1])
    metrics['tn'], metrics['fp'] = cm[0, 0], cm[0, 1]
    metrics['fn'], metrics['tp'] = cm[1, 0], cm[1, 1]
    
    return metrics


def train_catboost_classweight(X_train, y_train, X_val, y_val, random_seed=42):
    """Fit a CatBoost classifier with the positive class up-weighted.

    The positive-class weight is the negative-to-positive count ratio in
    ``y_train``. The validation set is passed as ``eval_set`` and drives
    early stopping / best-iteration selection on AUC.

    Args:
        X_train, y_train: training features and binary target.
        X_val, y_val: validation features and binary target.
        random_seed: seed passed to CatBoost.

    Returns:
        The fitted ``CatBoostClassifier``.
    """
    n_positive = y_train.sum()
    n_negative = len(y_train) - n_positive
    pos_weight = n_negative / n_positive

    hyperparams = {
        'iterations': 300,
        'depth': 6,
        'learning_rate': 0.05,
        'loss_function': 'Logloss',
        'eval_metric': 'AUC',
        'early_stopping_rounds': 50,
        'use_best_model': True,
        'random_seed': random_seed,
        'task_type': 'CPU',
        'verbose': False,
        'allow_writing_files': False,
        'class_weights': [1, pos_weight],
    }
    classifier = CatBoostClassifier(**hyperparams)

    classifier.fit(Pool(X_train, y_train), eval_set=Pool(X_val, y_val))
    return classifier


def train_xgboost(X_train, y_train, X_val, y_val, random_seed=42):
    """Fit an XGBoost binary classifier, evaluated on the validation set.

    Args:
        X_train, y_train: training features and binary target.
        X_val, y_val: validation features and target, passed as ``eval_set``.
        random_seed: seed passed to XGBoost as ``random_state``.

    Returns:
        The fitted ``XGBClassifier``.
    """
    hyperparams = {
        'n_estimators': 300,
        'max_depth': 6,
        'learning_rate': 0.05,
        'objective': 'binary:logistic',
        'eval_metric': 'auc',
        'early_stopping_rounds': 50,
        'random_state': random_seed,
        'n_jobs': -1,
        'verbosity': 0,
    }
    classifier = XGBClassifier(**hyperparams)

    validation_sets = [(X_val, y_val)]
    classifier.fit(X_train, y_train, eval_set=validation_sets, verbose=False)
    return classifier


def train_logistic(X_train, y_train, random_seed=42):
    """Fit a logistic regression baseline on the training data.

    Args:
        X_train, y_train: training features and binary target.
        random_seed: seed passed as ``random_state``.

    Returns:
        The fitted ``LogisticRegression`` model.
    """
    settings = {'max_iter': 1000, 'random_state': random_seed, 'n_jobs': -1}
    return LogisticRegression(**settings).fit(X_train, y_train)

In [None]:
# Train three candidate models per segment, score each on the test split,
# and register the CatBoost model as the final model for the segment.
print("\n" + "="*80)
print("ОБУЧЕНИЕ МОДЕЛЕЙ С CLASSWEIGHT")
print("="*80)

# final_models: per-segment artefacts of the model kept for later cells.
# comparison_results: per-segment {model name -> metrics dict}.
final_models = {}
comparison_results = {}

# Per-segment probability cut-offs used to turn scores into class labels.
# Keys must match the segment ids in `data`, otherwise the lookup below
# raises KeyError.
thresholds = {
    'Segment 1': 0.12,
    'Segment 2': 0.10
}

for seg_id, seg_data in data.items():
    print(f"\n{seg_id}: {segments_info[seg_id]}")
    print("-" * 80)
    
    # Separate features from the target for each of the three splits.
    X_train = seg_data['train'].drop(columns=[config.TARGET_COLUMN])
    y_train = seg_data['train'][config.TARGET_COLUMN]
    
    X_val = seg_data['val'].drop(columns=[config.TARGET_COLUMN])
    y_val = seg_data['val'][config.TARGET_COLUMN]
    
    X_test = seg_data['test'].drop(columns=[config.TARGET_COLUMN])
    y_test = seg_data['test'][config.TARGET_COLUMN]
    
    print(f"  Train shape: {X_train.shape}")
    # Printed value is negatives per positive in the training target.
    print(f"  Class ratio: 1:{(len(y_train) - y_train.sum()) / y_train.sum():.1f}")
    
    seg_results = {}
    
    # CatBoost: the only candidate trained with class weights.
    print(f"\n  Обучение CatBoost с class_weight...")
    start = time.time()
    catboost_model = train_catboost_classweight(X_train, y_train, X_val, y_val, config.RANDOM_SEED)
    catboost_time = time.time() - start
    y_pred_catboost = catboost_model.predict_proba(X_test)[:, 1]
    catboost_metrics = calculate_metrics(y_test, y_pred_catboost, thresholds[seg_id])
    catboost_metrics['train_time'] = catboost_time
    seg_results['CatBoost_classweight'] = catboost_metrics
    print(f"    Завершено за {catboost_time:.1f} сек | ROC-AUC: {catboost_metrics['roc_auc']:.4f}")
    
    # XGBoost: uses the validation split as its evaluation set.
    print(f"  Обучение XGBoost...")
    start = time.time()
    xgboost_model = train_xgboost(X_train, y_train, X_val, y_val, config.RANDOM_SEED)
    xgboost_time = time.time() - start
    y_pred_xgboost = xgboost_model.predict_proba(X_test)[:, 1]
    xgboost_metrics = calculate_metrics(y_test, y_pred_xgboost, thresholds[seg_id])
    xgboost_metrics['train_time'] = xgboost_time
    seg_results['XGBoost'] = xgboost_metrics
    print(f"    Завершено за {xgboost_time:.1f} сек | ROC-AUC: {xgboost_metrics['roc_auc']:.4f}")
    
    # Logistic regression baseline: fitted on the training split only.
    print(f"  Обучение LogisticRegression...")
    start = time.time()
    logistic_model = train_logistic(X_train, y_train, config.RANDOM_SEED)
    logistic_time = time.time() - start
    y_pred_logistic = logistic_model.predict_proba(X_test)[:, 1]
    logistic_metrics = calculate_metrics(y_test, y_pred_logistic, thresholds[seg_id])
    logistic_metrics['train_time'] = logistic_time
    seg_results['LogisticRegression'] = logistic_metrics
    print(f"    Завершено за {logistic_time:.1f} сек | ROC-AUC: {logistic_metrics['roc_auc']:.4f}")
    
    comparison_results[seg_id] = seg_results
    
    # NOTE: CatBoost is stored as the final model unconditionally; the
    # comparison metrics above are not used to pick the winner.
    final_models[seg_id] = {
        'model': catboost_model,
        'X_train': X_train,
        'X_test': X_test,
        'y_test': y_test,
        'y_test_proba': y_pred_catboost,
        'algorithm': 'CatBoost_classweight',
        'threshold': thresholds[seg_id]
    }

In [None]:
print("\n" + "="*80)
print("СРАВНЕНИЕ МОДЕЛЕЙ")
print("="*80)

# Report column label -> key in each model's metrics dict (order = column order).
report_columns = {
    'ROC-AUC': 'roc_auc',
    'Gini': 'gini',
    'F1': 'f1',
    'Precision': 'precision',
    'Recall': 'recall',
    'Train Time (s)': 'train_time',
}

for seg_id, seg_results in comparison_results.items():
    print(f"\n{seg_id}: {segments_info[seg_id]}")
    print("-" * 80)

    # One row per model, best ROC-AUC first.
    rows = [
        {'Model': model_name, **{label: scores[key] for label, key in report_columns.items()}}
        for model_name, scores in seg_results.items()
    ]
    comparison_df = pd.DataFrame(rows)
    comparison_df = comparison_df.sort_values('ROC-AUC', ascending=False)

    print("\n" + comparison_df.to_string(index=False))

    # Second token of the segment id is used as the file-name suffix.
    seg_num = seg_id.split()[1]
    comparison_file = config.OUTPUT_DIR / f'model_comparison_seg{seg_num}.csv'
    comparison_df.to_csv(comparison_file, index=False)
    print(f"\nСохранено: {comparison_file}")

# FEATURE IMPORTANCE

In [None]:
# For every final model: print, plot and save its feature importances.
for seg_id, model_data in final_models.items():
    print(f"\n{seg_id}: {segments_info[seg_id]}")
    print("="*80)

    model = model_data['model']

    # Models that expose no importances are skipped after the header is printed.
    if not hasattr(model, 'feature_importances_'):
        continue

    importance = model.feature_importances_
    feature_names = model_data['X_train'].columns

    importance_df = pd.DataFrame({'Feature': feature_names, 'Importance': importance})
    importance_df = importance_df.sort_values('Importance', ascending=False)
    importance_df = importance_df.reset_index(drop=True)

    top20 = importance_df.head(20)

    print("\nТОП-20 признаков по важности:")
    print("-" * 80)
    print(top20.to_string(index=False))

    # Horizontal bar chart, most important feature at the top.
    bar_positions = range(len(top20))
    fig, ax = plt.subplots(figsize=(10, 8))
    ax.barh(bar_positions, top20['Importance'], color='steelblue', alpha=0.7)
    ax.set_yticks(bar_positions)
    ax.set_yticklabels(top20['Feature'], fontsize=9)
    ax.set_xlabel('Importance', fontsize=11, fontweight='bold')
    ax.set_title(f'Feature Importance - {seg_id}', fontsize=13, fontweight='bold')
    ax.invert_yaxis()
    plt.tight_layout()

    seg_num = seg_id.split()[1]
    figure_file = config.FIGURES_DIR / f'feature_importance_seg{seg_num}.png'
    plt.savefig(figure_file, dpi=100)
    plt.show()

    importance_file = config.OUTPUT_DIR / f'feature_importance_seg{seg_num}.csv'
    importance_df.to_csv(importance_file, index=False)
    print(f"\nСохранено: {importance_file}")

# ROC CURVES

In [None]:
# Plot the test-set ROC curve of every final model on a single figure.
fig, ax = plt.subplots(figsize=(10, 8))

colors = ['#2E86AB', '#A23B72']

for idx, (seg_id, model_data) in enumerate(final_models.items()):
    y_test = model_data['y_test']
    y_pred_proba = model_data['y_test_proba']
    
    fpr, tpr, _ = roc_curve(y_test, y_pred_proba)
    roc_auc = roc_auc_score(y_test, y_pred_proba)
    algorithm = model_data['algorithm']
    
    label = f"{seg_id} | {algorithm} (AUC={roc_auc:.4f})"
    # Cycle through the palette so more segments than colours does not
    # raise IndexError.
    ax.plot(fpr, tpr, color=colors[idx % len(colors)], lw=2, label=label)

# Diagonal reference line of a random classifier.
ax.plot([0, 1], [0, 1], 'k--', lw=1, label='Random (AUC=0.5000)')
ax.set_xlim([0.0, 1.0])
ax.set_ylim([0.0, 1.05])
ax.set_xlabel('False Positive Rate', fontsize=12, fontweight='bold')
ax.set_ylabel('True Positive Rate', fontsize=12, fontweight='bold')
ax.set_title('ROC CURVES - ФИНАЛЬНЫЕ МОДЕЛИ', fontsize=14, fontweight='bold', pad=20)
ax.legend(loc='lower right', fontsize=10)
ax.grid(alpha=0.3)

plt.tight_layout()
plt.savefig(config.FIGURES_DIR / 'final_roc_curves.png', dpi=100)
plt.show()

# СОХРАНЕНИЕ МОДЕЛЕЙ

In [None]:
banner = "=" * 80

print("\n" + banner)
print("СОХРАНЕНИЕ ФИНАЛЬНЫХ МОДЕЛЕЙ")
print(banner)

# Pickle each segment's final model and print a short summary of the file.
for seg_id, model_data in final_models.items():
    seg_num = seg_id.split()[1]
    # File-name-friendly form of the algorithm label.
    algorithm = model_data['algorithm'].lower().replace(' ', '_')

    model_file = config.MODELS_DIR / f"final_model_seg{seg_num}_{algorithm}.pkl"

    with model_file.open('wb') as f:
        pickle.dump(model_data['model'], f)

    # Size on disk in kilobytes.
    file_size = model_file.stat().st_size / 1024
    test_auc = roc_auc_score(model_data['y_test'], model_data['y_test_proba'])

    print(f"\n{model_file.name}")
    print(f"  Сегмент: {seg_id}")
    print(f"  Алгоритм: {model_data['algorithm']}")
    print(f"  ROC-AUC: {test_auc:.4f}")
    print(f"  Размер: {file_size:.1f} KB")

print("\n" + banner)
print("ВАЛИДАЦИЯ ЗАВЕРШЕНА")
print(banner)