# Генерация блендов с помощью StratifiedKFold

## Imports

In [1]:
import os
import pathlib
import random
import typing as tp
import warnings

import numpy as np
import pandas as pd
from catboost import CatBoostClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from tqdm.notebook import tqdm

warnings.filterwarnings("ignore")


def set_seed(seed):
    """Seed the global random number generators used by the notebook.

    Seeds NumPy's legacy global RNG and Python's ``random`` module, and
    stores the seed in the ``PYTHONHASHSEED`` environment variable.

    Args:
        seed: Value passed to both seeders and written (as a string) to
            ``PYTHONHASHSEED``.
    """
    for seed_rng in (np.random.seed, random.seed):
        seed_rng(seed)
    os.environ.update({"PYTHONHASHSEED": str(seed)})


# Share of client ids held out of training as the validation set
# (passed to Blend as `test_size`).
TEST_SIZE = 0.11
# Seed for the global Python / NumPy RNGs; the Blend class uses the same
# value (560) as `random_state` for its splits and models.
SEED = 560
set_seed(SEED)


# Change these to your own directories / files.
# Directory with the train / test parquet files.
DATA_PATH = "data/"
# Excel sheet with per-cluster weights, read in Blend.train_cb_models.
WEIGHTS_PATH = "cluster_weights.xlsx"
# Sample submission that Blend.predict fills with predicted probabilities.
SAMPLE_SUBM_PATH = "submissions/sample_submission.csv"

## Класс для создания блендов

In [2]:
class Blend:
    """
    Класс для создания бленда
    """

    def __init__(
        self,
        blend_name: str,
        test_size: float,
        impute_type: tp.Literal["0_impute", "knn_impute"],
        use_clusters: bool,
        use_lags: bool,
        feats_to_agg: tp.List[str],
        feats_by_agg: tp.List[str],
        agg_func: tp.List[str],
        use_ohe_start_cluster: bool,
        use_smote_sampling: bool,
    ):
        """Load the data and build the train / validation / test feature frames.

        The train parquet is split by ``id`` into a training part and a
        validation part (stratified by the ``end_cluster`` of the ``month_3``
        rows). Group aggregates are computed on the training part and joined
        onto validation and test. Optional one-hot encoding of
        ``start_cluster`` and lag features are then added, and the columns in
        ``self.cat_cols`` are cast to ``category``. For the test file the
        ``start_cluster`` column is first replaced by the output of
        ``Blend.predict_start``.

        Args:
            blend_name: Name of the blend; also used as the output directory
                name by the other methods.
            test_size: Share of ids held out as the validation set.
            impute_type: Which pre-imputed parquet files to read. Ignored when
                ``use_clusters`` is true.
            use_clusters: Read the ``*_cluster_kmeans12`` files and rename the
                ``cluster_0`` .. ``cluster_15`` columns to ``cluster_a`` ..
                ``cluster_p``.
            use_lags: Add ``_lag1`` / ``_lag2`` columns, keep only the last
                month of each 3-month window and add the ``changed_*`` flags.
            feats_to_agg: Columns that are aggregated.
            feats_by_agg: Columns the aggregates are grouped by.
            agg_func: Aggregation function names passed to pandas ``agg``.
            use_ohe_start_cluster: Add indicator columns for the products
                listed in ``start_cluster``.
            use_smote_sampling: Accepted for interface compatibility; it is
                not used anywhere in this constructor.

        Raises:
            ValueError: If ``use_clusters`` is false and ``impute_type`` is
                not one of the supported values.

        Sets ``blend_name``, ``ids_and_clusters``, ``cat_cols``, ``X_train``,
        ``y_train``, ``X_val``, ``y_val`` and ``X_test``. ``X_train`` and
        ``X_val`` keep the ``id`` column (it is dropped at fit / predict
        time); ``X_test`` does not.
        """
        self.blend_name = blend_name
        train_path, test_path = Blend._resolve_data_paths(use_clusters, impute_type)

        cluster_cols = ["cluster_" + letter for letter in "abcdefghijklmnop"]
        cluster_renames = {
            "cluster_" + str(position): new_name
            for position, new_name in enumerate(cluster_cols)
        }
        # Columns that must never be lagged in addition to the generic ones.
        extra_not_to_lag = cluster_cols if use_clusters else []

        train = pd.read_parquet(train_path)
        if use_clusters:
            train = train.rename(columns=cluster_renames)

        # ---- hold-out split by id, stratified by the month_3 target ----
        ids_and_clusters = train[train["date"] == "month_3"][
            ["id", "end_cluster"]
        ].drop_duplicates()
        train_ids, val_ids, _, _ = train_test_split(
            ids_and_clusters["id"],
            ids_and_clusters["end_cluster"],
            stratify=ids_and_clusters["end_cluster"],
            test_size=test_size,
            random_state=560,
            shuffle=True,
        )
        self.ids_and_clusters = ids_and_clusters

        # ---- training part: aggregates are computed here ----
        dataset_train = Blend.generate_agg_features(
            train[train["id"].isin(train_ids)],
            feats_to_agg,
            feats_by_agg,
            agg_func_list=agg_func,
        )
        train_features = Blend._engineer_features(
            dataset_train,
            dataset_train,
            use_ohe_start_cluster,
            use_lags,
            extra_not_to_lag,
        )
        X_train, y_train = (
            train_features.drop(columns=["end_cluster"]),
            train_features["end_cluster"],
        )

        # ---- validation part: aggregates are looked up from the training part ----
        dataset_val = Blend.generate_agg_features(
            dataset_train,
            feats_to_agg,
            feats_by_agg,
            build_for_train=False,
            agg_func_list=agg_func,
            test_dataset=train[train["id"].isin(val_ids)],
        )
        val_features = Blend._engineer_features(
            dataset_val,
            dataset_train,
            use_ohe_start_cluster,
            use_lags,
            extra_not_to_lag,
        )
        X_val, y_val = (
            val_features.drop(columns=["end_cluster"]),
            val_features["end_cluster"],
        )

        cat_cols = [
            "channel_code",
            "city",
            "city_type",
            "okved",
            "segment",
            "start_cluster",
            "index_city_code",
            "ogrn_month",
            "ogrn_year",
        ]
        if use_lags:
            # Every categorical column also has categorical lag-1 / lag-2 copies.
            self.cat_cols = [
                col + suffix for col in cat_cols for suffix in ("", "_lag1", "_lag2")
            ]
            if use_clusters:
                self.cat_cols += cluster_cols
        else:
            self.cat_cols = cat_cols

        self.X_train = Blend._finalize_dtypes(
            X_train.drop(columns=["date"]), self.cat_cols
        )
        self.y_train = y_train
        self.X_val = Blend._finalize_dtypes(X_val.drop(columns=["date"]), self.cat_cols)
        self.y_val = y_val

        # ---- test part ----
        test = pd.read_parquet(test_path)
        if use_clusters:
            test = test.rename(columns=cluster_renames)

        # Only start_cluster is taken from the frame returned by predict_start.
        test_with_start, _ = Blend.predict_start(test, cat_cols)
        test["start_cluster"] = test_with_start["start_cluster"].copy()
        dataset_test = Blend.generate_agg_features(
            dataset_train,
            feats_to_agg,
            feats_by_agg,
            build_for_train=False,
            agg_func_list=agg_func,
            test_dataset=test,
        )
        test_features = Blend._engineer_features(
            dataset_test,
            dataset_train,
            use_ohe_start_cluster,
            use_lags,
            extra_not_to_lag,
        )
        self.X_test = Blend._finalize_dtypes(
            test_features.drop(columns=["id", "date"]), self.cat_cols
        )

        print("Train Data Summary:")
        print(f"    - Blend Name {self.blend_name}")
        print(f"    - Train Shape {self.X_train.shape}")
        print(f"    - Val Size {self.X_val.shape}")

    @staticmethod
    def _resolve_data_paths(use_clusters: bool, impute_type: str) -> tp.Tuple[str, str]:
        """Return the (train, test) parquet paths for the requested data variant."""
        if use_clusters:
            return (
                DATA_PATH + "train_cluster_kmeans12.pqt",
                DATA_PATH + "test_cluster_kmeans12.pqt",
            )
        if impute_type == "0_impute":
            return DATA_PATH + "train_data.pqt", DATA_PATH + "test_data.pqt"
        if impute_type == "knn_impute":
            return (
                DATA_PATH + "faiss_imputed_train_final_final.pqt",
                DATA_PATH + "imputed_test.pqt",
            )
        raise ValueError(
            f"Unknown impute_type {impute_type!r}; "
            "expected '0_impute' or 'knn_impute' when use_clusters is False"
        )

    @staticmethod
    def _engineer_features(
        frame: pd.DataFrame,
        ohe_fit_frame: pd.DataFrame,
        use_ohe_start_cluster: bool,
        use_lags: bool,
        extra_not_to_lag: tp.List[str],
    ) -> pd.DataFrame:
        """Apply the optional OHE / lag / changed-flag steps shared by all splits."""
        processed = frame.copy()
        if use_ohe_start_cluster:
            # The encoder is always fitted on the training part.
            processed = Blend.ohe_start_cluster(ohe_fit_frame, processed)
        if use_lags:
            not_to_lag = (
                list(processed.filter(like="agg_").columns)
                + ["id", "date", "end_cluster"]
                + list(extra_not_to_lag)
            )
            processed = Blend.make_lags(
                processed,
                columns_not_to_use=not_to_lag,
                drop_months=True,
                lags=True,
            )
            # 0/1 flags: did start_cluster differ between two months of the window?
            processed["changed_m1_m2"] = (
                processed["start_cluster_lag2"] != processed["start_cluster_lag1"]
            ) * 1
            processed["changed_m2_m3"] = (
                processed["start_cluster_lag1"] != processed["start_cluster"]
            ) * 1
            processed["changed_m1_m3"] = (
                processed["start_cluster_lag2"] != processed["start_cluster"]
            ) * 1
        return processed

    @staticmethod
    def _finalize_dtypes(features: pd.DataFrame, cat_cols: tp.List[str]) -> pd.DataFrame:
        """Fill gaps with "0", cast `cat_cols` to category and the remaining object columns to float."""
        features.fillna("0", inplace=True)
        features[cat_cols] = features[cat_cols].astype("category")
        # Filling with the string "0" leaves numeric columns that had gaps as
        # object dtype; convert those back to float.
        object_cols = features.dtypes[features.dtypes == "object"].index
        features[object_cols] = features[object_cols].astype(float)
        return features

    @staticmethod
    def generate_agg_features(
        dataset: pd.DataFrame,
        feats_to_agg_list: tp.List[str],
        feats_by_agg_list: tp.List[str],
        build_for_train: bool = True,
        agg_by_3rd_month: bool = True,
        agg_func_list: tp.List[str] = ["max", "mean"],
        test_dataset: pd.DataFrame | None = None,
    ) -> pd.DataFrame:
        if build_for_train:
            train_df = dataset.copy()
            to_agg = (
                dataset[dataset.date == "month_3"].copy()
                if agg_by_3rd_month
                else dataset.copy()
            )
            for agg_feat in feats_by_agg_list:
                grouped = (
                    to_agg.groupby(agg_feat)[feats_to_agg_list]
                    .agg(agg_func_list)
                    .reset_index()
                )
                grouped.columns = [
                    (
                        col[0]
                        if col[1] == ""
                        else "agg_{}_{}_{}".format(agg_feat, col[0], col[1])
                    )
                    for col in grouped.columns
                ]
                train_df = pd.merge(train_df, grouped, on=agg_feat, how="left")
            return train_df
        else:
            test_df = test_dataset.copy()
            for agg_feat in feats_by_agg_list:
                subset_cols = list(dataset.filter(like=("agg_" + agg_feat)).columns) + [
                    agg_feat
                ]
                grouped = dataset[subset_cols].groupby(agg_feat).max()
                test_df = pd.merge(test_df, grouped, on=agg_feat, how="left")
            return test_df

    @staticmethod
    def ohe_start_cluster(
        train_dataset: pd.DataFrame, test_dataset: pd.DataFrame
    ) -> pd.DataFrame:
        """Add one indicator column per product found in ``start_cluster``.

        The binarizer is fitted on ``train_dataset["start_cluster"]`` and
        applied to ``test_dataset["start_cluster"]``. The new columns are
        named ``start_cluster_product_<product>``.

        Args:
            train_dataset: Frame whose ``start_cluster`` values define the
                set of known products.
            test_dataset: Frame to encode; it is copied, not modified.

        Returns:
            A copy of ``test_dataset`` with the indicator columns appended.
        """

        def split_products(cluster_labels: pd.Series) -> pd.Series:
            # Presumably labels look like "{a, b}": strip the braces, remove
            # spaces and split on commas - TODO confirm the label format.
            without_braces = cluster_labels.str.lstrip("{").str.rstrip("}")
            return without_braces.str.replace(" ", "").str.split(",")

        # Fit the MultiLabelBinarizer on the reference (training) labels.
        binarizer = MultiLabelBinarizer()
        binarizer.fit(split_products(train_dataset["start_cluster"]))

        encoded = test_dataset.copy()
        indicator_matrix = binarizer.transform(split_products(encoded["start_cluster"]))
        for position, product_name in tqdm(enumerate(binarizer.classes_)):
            encoded["start_cluster_product_" + product_name] = indicator_matrix[
                :, position
            ]

        return encoded

    @staticmethod
    def make_lags(
        dataset: pd.DataFrame,
        columns_not_to_use: list[str],
        drop_months: bool = False,
        lags: bool = True,
    ) -> pd.DataFrame:

        result = dataset.copy()

        if lags:
            for column in tqdm(result.columns):

                if column not in columns_not_to_use:
                    result[column + "_lag1"] = result.groupby("id")[column].shift(1)
                    result[column + "_lag2"] = result.groupby("id")[column].shift(2)

        if drop_months:

            result = result.drop(
                result[
                    (result["date"] == "month_1") | (result["date"] == "month_2")
                ].index
            )
            result = result.drop(
                result[
                    (result["date"] == "month_4") | (result["date"] == "month_5")
                ].index
            )

        return result

    @staticmethod
    def predict_start(
        dataset_df: pd.DataFrame,
        cat_features: tp.List[str],
        task_type: str = "GPU",
        model=None,
        params=None,
    ):
        """
        Заполняем пропуски в start_cluster для 6 месяца, обучаемся на 2
        Возвращаем копию датасета и модель
        """
        dataset = dataset_df.copy()
        dataset.fillna(0, inplace=True)
        cat_features = [feat for feat in cat_features if feat != "start_cluster"]
        cat_features = cat_features + ["prev_month"]
        if not params:
            params = {
                "random_state": 560,
                "task_type": task_type,
                "cat_features": cat_features,
            }
        if not model:
            model = CatBoostClassifier(**params)
        month_4 = dataset[dataset.date == "month_4"].copy()
        month_5 = dataset[dataset.date == "month_5"].copy()
        month_6 = dataset[dataset.date == "month_6"].copy()
        train_dataset = month_5.merge(
            month_4[["id", "start_cluster"]], on="id", how="right"
        )
        pred_dataset = month_6.merge(
            month_5[["id", "start_cluster"]], on="id", how="right"
        )
        train_dataset.rename(
            columns={"start_cluster_x": "target", "start_cluster_y": "prev_month"},
            inplace=True,
        )
        pred_dataset.rename(
            columns={"start_cluster_x": "target", "start_cluster_y": "prev_month"},
            inplace=True,
        )
        X = train_dataset.drop(["id", "date", "target"], axis=1)
        y = train_dataset["target"]
        model.fit(X, y, verbose=False)
        dataset.loc[dataset.date == "month_6", "start_cluster"] = model.predict(
            pred_dataset.drop(["id", "date", "target"], axis=1)
        )
        return dataset, model

    def train_cb_models(
        self,
        n_splits: int,
        save_validation_result: bool = True,
        custom_cb_params: tp.Dict[str, tp.Any] | None = None,
    ):
        """Train one CatBoost model per stratified fold and evaluate the blend.

        The folds are drawn over ``self.ids_and_clusters`` (one row per id).
        Each fold model is fitted on the ``self.X_train`` rows whose id falls
        in the fold's training part, with the remaining ``X_train`` rows as
        the eval set. The fold models' averaged probabilities are then scored
        on ``self.X_val`` with the weighted one-vs-rest ROC AUC.

        Args:
            n_splits: Number of folds, i.e. number of models in the blend.
            save_validation_result: Write the averaged validation
                probabilities to
                ``<blend_name>/val_results/val_<blend_name>.csv``.
            custom_cb_params: CatBoost parameters used instead of the
                defaults when given and non-empty.

        Sets ``self.models`` and ``self.scores`` and saves every model to
        ``<blend_name>/models/model_<i>.cbm``.
        """
        skf = StratifiedKFold(n_splits=n_splits, random_state=560, shuffle=True)

        self.scores = []
        self.models = []
        X_train = self.X_train
        y_train = self.y_train
        X_val = self.X_val
        y_val = self.y_val

        cluster_weights = pd.read_excel(WEIGHTS_PATH).set_index("cluster")
        weights_dict = cluster_weights["unnorm_weight"].to_dict()

        def weighted_roc_auc(y_true, y_pred, labels, weights_dict):
            # Per-class one-vs-rest ROC AUC, averaged with normalized class weights.
            unnorm_weights = np.array([weights_dict[label] for label in labels])
            weights = unnorm_weights / unnorm_weights.sum()
            classes_roc_auc = roc_auc_score(
                y_true, y_pred, labels=labels, multi_class="ovr", average=None
            )
            return sum(weights * classes_roc_auc)

        # The parameters do not depend on the fold, so resolve them once.
        params_cat = (
            {
                "cat_features": self.cat_cols,
                "n_estimators": 400,
                "task_type": "GPU",
                "verbose": 50,
                "random_state": 560,
            }
            if not custom_cb_params
            else custom_cb_params
        )

        # split() yields positions into ids_and_clusters, not id values, so
        # they have to be translated to ids before being matched against the
        # "id" column. ids_and_clusters also holds the hold-out validation
        # ids; those simply match no row of X_train.
        split_ids = self.ids_and_clusters["id"].to_numpy()

        for i, (train_positions, val_positions) in enumerate(
            skf.split(self.ids_and_clusters["id"], self.ids_and_clusters["end_cluster"])
        ):

            # ------------[Fold train / eval samples]------------

            in_fold_train = X_train["id"].isin(split_ids[train_positions])
            in_fold_val = X_train["id"].isin(split_ids[val_positions])
            X_fold_train = X_train[in_fold_train].drop("id", axis=1)
            X_fold_val = X_train[in_fold_val].drop("id", axis=1)
            y_fold_train = y_train.loc[X_fold_train.index]
            y_fold_val = y_train.loc[X_fold_val.index]

            # ---------[Create and fit the model]-----------------

            print(f"\nFold {i + 1} Training...")
            clf = CatBoostClassifier(**params_cat)
            clf.fit(X_fold_train, y_fold_train, eval_set=(X_fold_val, y_fold_val))

            # -----------[Fold validation]--------------

            print(f"\nFold {i + 1} Validating...")
            y_pred_proba = clf.predict_proba(X_fold_val)
            score = weighted_roc_auc(
                y_fold_val, y_pred_proba, clf.classes_, weights_dict
            )
            print(f"AUC {score}")

            # -------------[Add the model to the blend]---------

            self.models.append(clf)
            self.scores.append(score)

        # Blend = plain average of the fold models' class probabilities.
        val_features = X_val.drop("id", axis=1)
        all_preds = np.array(
            [model.predict_proba(val_features) for model in self.models]
        ).mean(axis=0)
        # Class order of the last fitted model labels the probability columns.
        blend_classes = self.models[-1].classes_

        print(
            "Validation Score:",
            weighted_roc_auc(y_val, all_preds, blend_classes, weights_dict),
        )
        res_path = self.blend_name + "/"
        pathlib.Path(res_path).mkdir(parents=True, exist_ok=True)
        if save_validation_result:
            test_pred_proba_df = pd.DataFrame(all_preds, columns=blend_classes)
            sorted_classes = sorted(test_pred_proba_df.columns.to_list())
            test_pred_proba_df = test_pred_proba_df[sorted_classes]

            val_path = res_path + "val_results/"
            pathlib.Path(val_path).mkdir(parents=True, exist_ok=True)
            test_pred_proba_df.to_csv(
                val_path + "val_" + self.blend_name + ".csv", index=False
            )

        model_path = res_path + "models/"
        pathlib.Path(model_path).mkdir(parents=True, exist_ok=True)
        for i, model in enumerate(self.models):
            model.save_model(
                model_path + f"model_{i}.cbm",
                format="cbm",
                export_parameters=None,
                pool=None,
            )

    def predict(self):
        """Write the blend's test predictions in sample-submission format.

        Averages ``predict_proba`` of every model in ``self.models`` on
        ``self.X_test`` and copies the class columns (in sorted order) into
        the sample submission read from ``SAMPLE_SUBM_PATH``. The result is
        saved to ``<blend_name>/test_results/test_<blend_name>.csv``.
        """
        test_features = self.X_test
        fold_models = self.models

        fold_probas = [
            fold_model.predict_proba(test_features) for fold_model in fold_models
        ]
        mean_proba = np.array(fold_probas).mean(axis=0)
        proba_frame = pd.DataFrame(mean_proba, columns=fold_models[0].classes_)
        class_order = sorted(proba_frame.columns.to_list())

        # Rows are matched to the sample submission by index.
        submission = pd.read_csv(SAMPLE_SUBM_PATH)
        submission[class_order] = proba_frame[class_order]

        output_dir = self.blend_name + "/test_results/"
        pathlib.Path(output_dir).mkdir(parents=True, exist_ok=True)
        submission.to_csv(
            output_dir + "test_" + self.blend_name + ".csv",
            index=False,
        )

## Задаем параметры, создаем бленд

In [3]:
# Letters used to name the 16 cluster indicator columns (cluster_a .. cluster_p).
cluster_letters = list("abcdefghijklmnop")

# Numeric columns that get aggregated per group.
feats_to_agg = [
    "balance_amt_min",
    "balance_amt_max",
    "sum_cred_e_oper_3m",
    "sum_deb_h_oper_3m",
    "sum_of_paym_2m",
    "sum_of_paym_1y",
    "sum_cred_h_oper_3m",
]

# Grouping columns: two categorical features plus every cluster column.
feats_by_agg = ["okved", "channel_code"]
feats_by_agg = feats_by_agg + ["cluster_" + letter for letter in cluster_letters]

agg_func = ["max"]

# With use_clusters=True the cluster parquet files are read and
# impute_type does not affect which files are loaded.
blend_model = Blend(
    blend_name="kfold_blend_6",
    test_size=TEST_SIZE,
    impute_type="0_impute",
    use_clusters=True,
    use_lags=True,
    feats_to_agg=feats_to_agg,
    feats_by_agg=feats_by_agg,
    agg_func=agg_func,
    use_ohe_start_cluster=True,
    use_smote_sampling=False,
)

0it [00:00, ?it/s]

  0%|          | 0/248 [00:00<?, ?it/s]

0it [00:00, ?it/s]

  0%|          | 0/248 [00:00<?, ?it/s]

0it [00:00, ?it/s]

  0%|          | 0/247 [00:00<?, ?it/s]

Train Data Summary:
    - Blend Name kfold_blend_6
    - Train Shape (178000, 455)
    - Val Size (22000, 455)


In [4]:
# Train a 3-fold blend and save the averaged hold-out predictions.
blend_model.train_cb_models(n_splits=3, save_validation_result=True)


Fold 1 Training...
Learning rate set to 0.254241
0:	learn: 1.2860509	test: 1.2832675	best: 1.2832675 (0)	total: 57.6ms	remaining: 23s
50:	learn: 0.8076106	test: 0.8285881	best: 0.8285881 (50)	total: 2.24s	remaining: 15.3s
100:	learn: 0.7757242	test: 0.8176349	best: 0.8176349 (100)	total: 4.26s	remaining: 12.6s
150:	learn: 0.7561370	test: 0.8143276	best: 0.8143276 (150)	total: 6.23s	remaining: 10.3s
200:	learn: 0.7390112	test: 0.8129027	best: 0.8129027 (200)	total: 8.21s	remaining: 8.13s
250:	learn: 0.7235439	test: 0.8124438	best: 0.8124158 (220)	total: 10.2s	remaining: 6.03s
300:	learn: 0.7072038	test: 0.8126045	best: 0.8123506 (276)	total: 12.2s	remaining: 4s
350:	learn: 0.6936503	test: 0.8121343	best: 0.8120210 (341)	total: 14.2s	remaining: 1.98s
399:	learn: 0.6794237	test: 0.8127439	best: 0.8120210 (341)	total: 16.2s	remaining: 0us
bestTest = 0.8120210049
bestIteration = 341
Shrink model to first 342 iterations.

Fold 1 Validating...
AUC 0.9041702584733377

Fold 2 Training...
Learn

In [5]:
# Average the fold models on the test set and write the submission CSV
# under <blend_name>/test_results/.
blend_model.predict()

In [6]:
# Bundle the blend's output directory (models, validation and test results)
# into a single zip archive.
!zip -r kfold_blend_6.zip kfold_blend_6

  adding: kfold_blend_6/ (stored 0%)
  adding: kfold_blend_6/test_results/ (stored 0%)
  adding: kfold_blend_6/test_results/test_kfold_blend_6.csv (deflated 56%)
  adding: kfold_blend_6/models/ (stored 0%)
  adding: kfold_blend_6/models/model_1.cbm (deflated 63%)
  adding: kfold_blend_6/models/model_0.cbm (deflated 63%)
  adding: kfold_blend_6/models/model_2.cbm (deflated 63%)
  adding: kfold_blend_6/val_results/ (stored 0%)
  adding: kfold_blend_6/val_results/val_kfold_blend_6.csv (deflated 56%)
