# Импорт библиотек

In [3]:
import numpy as np
import pandas as pd
from pathlib import Path
import gc

from sklearn.model_selection import train_test_split, KFold, StratifiedKFold
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, \
    accuracy_score, roc_auc_score, precision_score, recall_score, f1_score, log_loss, confusion_matrix, \
    classification_report, roc_curve, auc, precision_recall_curve
from sklearn.base import BaseEstimator, ClassifierMixin

from tqdm import tqdm_notebook

from lightgbm import LGBMClassifier, early_stopping, log_evaluation
from catboost import CatBoostClassifier
from catboost import Pool

import optuna

import matplotlib.pyplot as plt
import seaborn as sns

# Seed passed as random_state / random_seed to the data split and the models below.
RAND = 25

In [4]:
# Customer statements (the target column is removed here; labels are read separately).
train_df = pd.read_feather('../input/amexfeather/train_data.ftr')
train_df = train_df.drop(columns='target')

# One label per customer.
train_labels = pd.read_csv('../input/amex-default-prediction/train_labels.csv')

train_df.head()

In [6]:
# Write the labels into a zip archive that contains a single train_labels.csv.
compression_opts = {
    'method': 'zip',
    'archive_name': 'train_labels.csv',
}
train_labels.to_csv(
    'train_labels.zip',
    index=False,
    compression=compression_opts,
)

# Метрика для Amex

[Ссылка на метрику](https://www.kaggle.com/code/inversion/amex-competition-metric-python)

In [None]:
def amex_metric(y_true: pd.DataFrame, y_pred: pd.DataFrame) -> float:
    """Compute the Amex competition score for a set of predictions.

    The score is the mean of two parts: the normalized weighted Gini
    coefficient and the share of positives found in the top 4% (by weight)
    of rows ranked by descending prediction. Rows with ``target == 0`` get
    weight 20, every other row weight 1.

    Args:
        y_true: frame with a ``target`` column.
        y_pred: frame with a ``prediction`` column; rows are paired with
            ``y_true`` by index (the frames are concatenated column-wise).

    Returns:
        ``0.5 * (normalized_gini + top_four_percent_capture)``.
    """

    def ranked_with_weights(truth: pd.DataFrame, scores: pd.DataFrame) -> pd.DataFrame:
        # Join truth and scores, rank by descending score, attach row weights.
        ranked = pd.concat([truth, scores], axis='columns')
        ranked = ranked.sort_values('prediction', ascending=False)
        ranked['weight'] = np.where(ranked['target'] == 0, 20, 1)
        return ranked

    def capture_rate(truth: pd.DataFrame, scores: pd.DataFrame) -> float:
        # Share of all positives that fall inside the top 4% of total weight.
        ranked = ranked_with_weights(truth, scores)
        weight_budget = int(0.04 * ranked['weight'].sum())
        inside = ranked.loc[ranked['weight'].cumsum() <= weight_budget]
        return (inside['target'] == 1).sum() / (ranked['target'] == 1).sum()

    def gini(truth: pd.DataFrame, scores: pd.DataFrame) -> float:
        # Weighted area between the cumulative-positives curve and the diagonal.
        ranked = ranked_with_weights(truth, scores)
        weight = ranked['weight']
        random_line = (weight / weight.sum()).cumsum()
        weighted_pos = ranked['target'] * weight
        lorentz = weighted_pos.cumsum() / weighted_pos.sum()
        return ((lorentz - random_line) * weight).sum()

    # Normalize by the Gini of a perfect ranking (the target used as prediction).
    ideal_scores = y_true.rename(columns={'target': 'prediction'})
    g = gini(y_true, y_pred) / gini(y_true, ideal_scores)
    d = capture_rate(y_true, y_pred)

    return 0.5 * (g + d)

In [None]:
def get_metrics(y_test, y_pred, y_score, name: str) -> pd.DataFrame:
    """Build a one-row frame with classification metrics for a model.

    Args:
        y_test: true binary labels (pandas Series or array-like).
        y_pred: predicted class labels.
        y_score: predicted probabilities of the positive class.
        name: model name stored in the ``model`` column.

    Returns:
        A single-row DataFrame with columns ``model``, ``Accuracy``,
        ``ROC_AUC``, ``Precision``, ``Recall``, ``f1``, ``Logloss`` and
        ``Target_Metr`` (the Amex competition metric).
    """
    # amex_metric pairs its inputs by index, so both frames are built from
    # plain positional arrays. This also lets y_test be an ndarray
    # (the previous code required a Series because it called .reset_index).
    y_true_df = pd.DataFrame(data={'target': np.asarray(y_test)})
    y_score_df = pd.DataFrame(data={'prediction': np.asarray(y_score)})

    row = {
        'model': name,
        # Label-based metrics
        'Accuracy': accuracy_score(y_test, y_pred),
        # ROC-AUC and log-loss take the true labels and the probabilities
        'ROC_AUC': roc_auc_score(y_test, y_score),
        'Precision': precision_score(y_test, y_pred),
        'Recall': recall_score(y_test, y_pred),
        'f1': f1_score(y_test, y_pred),
        'Logloss': log_loss(y_test, y_score),
        'Target_Metr': amex_metric(y_true_df, y_score_df),
    }
    return pd.DataFrame([row])

# Предобработка данных

In [None]:
# Drop columns where at least half of the values are missing; D_66 is always dropped.
null_share = train_df.isnull().mean()
many_passes_col = ['D_66'] + [
    # D_66 is already listed; skipping it here keeps the list (and the
    # printed count) free of duplicates when D_66 also crosses the threshold.
    col for col in null_share[null_share >= 0.5].index if col != 'D_66'
]

train_df.drop(columns=many_passes_col, inplace=True)
print('Количество удаленных столбцов:', len(many_passes_col))

In [None]:
# Categorical and numeric columns of the dataset
cat_col = list(train_df.select_dtypes(include=['category']).columns)
non_numeric = set(cat_col) | {'S_2', 'customer_ID'}
dig_col = [name for name in train_df.columns if name not in non_numeric]

# One LabelEncoder per categorical column, fitted on the training data
le_cat_col = {}
for name in cat_col:
    le_cat_col[name] = LabelEncoder().fit(train_df[name])

In [None]:
def preproc_cat_num(data: pd.DataFrame) -> pd.DataFrame:
    """Aggregate per-statement rows into one feature row per customer.

    Numeric columns (``dig_col``) are aggregated with mean/std/min/max/last
    and cast to float16. Categorical columns (``cat_col``) are label-encoded
    with the fitted ``le_cat_col`` encoders, aggregated with
    count/last/nunique and cast to ``category``.

    Args:
        data: statement-level frame containing ``customer_ID``, ``cat_col``
            and ``dig_col``. It is not modified.

    Returns:
        Frame indexed by ``customer_ID`` with the categorical aggregates
        first and the numeric aggregates after them; columns are named
        ``<column>_<aggregation>``.
    """
    # Encode into a separate frame instead of overwriting the caller's columns:
    # the input may be a slice of a bigger frame and must stay reusable.
    encoded_cat = data[cat_col].apply(lambda col: le_cat_col[col.name].transform(col))

    data_num = data.groupby("customer_ID")[dig_col].agg(['mean', 'std', 'min', 'max', 'last'])
    data_num.columns = [f'{i}_{j}' for i, j in data_num.columns]
    data_num = data_num.astype('float16')

    # Grouping by the customer_ID Series keeps 'customer_ID' as the index name.
    data_cat = encoded_cat.groupby(data["customer_ID"]).agg(['count', 'last', 'nunique'])
    data_cat.columns = [f'{i}_{j}' for i, j in data_cat.columns]
    data_cat = data_cat.astype('category')

    result = pd.concat([data_cat, data_num], axis=1)

    # Release the intermediate frames before returning.
    del data_num, data_cat, encoded_cat
    gc.collect()
    return result


# Build the per-customer training features and show the resulting shape.
df = preproc_cat_num(train_df)
df.shape

In [None]:
# Join the aggregated features with the target labels
df = df.merge(train_labels, on='customer_ID')
df.reset_index(drop=True, inplace=True)

# The statement-level frame is not used after this point; drop the reference
del train_df
gc.collect()

In [None]:
# Print the shape of the merged feature frame.
print('Размер фрейма после удаления:', df.shape)

In [None]:
def replacing_missing_values(date: pd.DataFrame, change_to: float = -100.) -> pd.DataFrame:
    """Fill missing values in the float16 columns with a constant.

    Args:
        date: input frame.
        change_to: value written in place of NaN in the float16 columns.

    Returns:
        A new frame with all non-float16 columns first (unchanged, original
        order) followed by the filled float16 columns.
    """
    half_cols = list(date.select_dtypes('float16').columns)
    other_cols = [name for name in date.columns.to_list() if name not in half_cols]

    untouched = date[other_cols]
    # Re-cast after filling so the filled columns stay float16.
    filled = date[half_cols].fillna(change_to).astype('float16')

    return pd.concat([untouched, filled], axis=1)

# Fill gaps in the float16 feature columns with the default value (-100).
df = replacing_missing_values(df)

# Baseline

In [None]:
# Keep only the first 300,000 rows so that the experiments fit in memory
df = df[:300000]

In [None]:
# Target vector and feature matrix
y = df['target']
X = df.drop(columns=['customer_ID', 'target'])

# Names of the categorical columns
cat_features = list(df.select_dtypes(include=['category']).columns)

# One-hot encoding for the categorical columns; all other columns pass through
one_hot = OneHotEncoder(drop='first')
transformer = make_column_transformer(
    (one_hot, cat_features),
    remainder='passthrough',
)

# `transformed` holds the return value of fit(), used later for .transform()
transformed = transformer.fit(X)

# Stratified 80/20 hold-out split
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=RAND,
)

del df

## Lightgbm

In [None]:
gbm = LGBMClassifier(random_state=RAND)

# Evaluate on both splits; stop after 200 rounds without improvement,
# print the evaluation every 100 rounds.
eval_sets = [(X_train, y_train), (X_test, y_test)]
fit_callbacks = [early_stopping(200), log_evaluation(100)]
gbm.fit(
    X_train,
    y_train,
    eval_set=eval_sets,
    eval_metric=['auc', 'binary_logloss'],
    callbacks=fit_callbacks,
)

preds_prob = gbm.predict_proba(X_test)[:, 1]
preds = gbm.predict(X_test)
metric_df = get_metrics(y_test, preds, preds_prob, 'Lightgbm Base')
metric_df

## LGBM + Optuna

In [None]:
def objective(trial):
    """Optuna objective for LightGBM.

    Samples a hyperparameter set from ``trial``, fits an ``LGBMClassifier``
    on the training split and returns the Amex metric on the hold-out split.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        Amex metric of the fitted model on ``X_test`` / ``y_test``.
    """
    param_grid = {
        # Single-value categoricals keep the value inside study.best_params.
        # GPU training: add "device_type": "gpu" here if a GPU is available.
        "n_estimators": trial.suggest_categorical("n_estimators", [1000]),
        "learning_rate": trial.suggest_float("learning_rate", 0.001, 0.3, log=True),
        "num_leaves": trial.suggest_int("num_leaves", 20, 3000, step=20),
        "max_depth": trial.suggest_int("max_depth", 3, 12),
        "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 200, 10000, step=100),
        "lambda_l1": trial.suggest_int("lambda_l1", 0, 100, step=5),
        "lambda_l2": trial.suggest_int("lambda_l2", 0, 100, step=5),
        "min_gain_to_split": trial.suggest_float("min_gain_to_split", 0, 15),
        "bagging_fraction": trial.suggest_float(
            "bagging_fraction", 0.2, 0.9, step=0.1
        ),
        "bagging_freq": trial.suggest_categorical("bagging_freq", [1]),
        "feature_fraction": trial.suggest_float("feature_fraction", 0.2, 0.9, step=0.1),
    }

    model = LGBMClassifier(**param_grid)
    model.fit(
        X_train,
        y_train,
        eval_set=[(X_test, y_test)],
        eval_metric="binary_logloss",
        # Silences per-iteration evaluation output. Replaces fit(verbose=0),
        # which newer LightGBM releases no longer accept.
        callbacks=[log_evaluation(0)],
    )

    preds_prob = model.predict_proba(X_test)[:, 1]

    y_pred = pd.DataFrame(data={'prediction': preds_prob})
    y_true = pd.DataFrame(data={'target': y_test.reset_index(drop=True)})
    scores = amex_metric(y_true, y_pred)

    return scores


# Search for the LightGBM hyperparameters that maximize the objective (5 trials).
study_lgb = optuna.create_study(direction="maximize", study_name="LGBM Classifier")
func = lambda trial: objective(trial)
study_lgb.optimize(func, n_trials=5)

In [None]:
# Refit LightGBM with the best hyperparameters found by Optuna
clf = LGBMClassifier(**study_lgb.best_params)

clf.fit(
    X_train,
    y_train,
    eval_set=[(X_test, y_test)],
    # Callback form of early stopping / silent logging, as in the baseline
    # cell; fit(verbose=..., early_stopping_rounds=...) is not accepted by
    # newer LightGBM releases.
    callbacks=[early_stopping(100, verbose=False), log_evaluation(0)],
)


preds = clf.predict(X_test)
preds_prob = clf.predict_proba(X_test)[:, 1]

metric_df = pd.concat([metric_df, get_metrics(y_test, preds, preds_prob, 'LGBM Optuna')], ignore_index=True)
metric_df

## CatBoost

In [None]:
# CatBoost with default hyperparameters, AUC as the evaluation metric
clf = CatBoostClassifier(
    cat_features=cat_features,
    eval_metric="AUC",
    random_state=RAND,
)

clf.fit(
    X_train,
    y_train,
    eval_set=[(X_test, y_test)],
    early_stopping_rounds=20,
    verbose=False,
)

preds_prob = clf.predict_proba(X_test)[:, 1]
preds = clf.predict(X_test)

catboost_base_row = get_metrics(y_test, preds, preds_prob, 'Catboost Base')
metric_df = pd.concat([metric_df, catboost_base_row], ignore_index=True)
metric_df

## CatBoost + Optuna

In [None]:
def objective(trial):
    """Optuna objective for CatBoost.

    Samples hyperparameters from ``trial``, fits a ``CatBoostClassifier`` on
    the training split with early stopping on the hold-out split and returns
    the Amex metric on the hold-out split.
    """
    # Hyperparameters sampled by Optuna
    # (GPU training would additionally need task_type='GPU').
    tuned = {
        'max_depth': trial.suggest_int('max_depth', 3, 16),
        'learning_rate': trial.suggest_float("learning_rate", 0.001, 0.3, log=True),
        'n_estimators': trial.suggest_categorical("n_estimators", [1000]),
        'max_bin': trial.suggest_int('max_bin', 200, 400),
        'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 1, 300),
        'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 0.0001, 1.0, log=True),
        'subsample': trial.suggest_float('subsample', 0.1, 0.8),
    }
    # Settings shared by every trial
    fixed = {
        'random_seed': RAND,
        'loss_function': 'Logloss',
        'eval_metric': 'AUC',
        'cat_features': cat_features,
    }

    model = CatBoostClassifier(**tuned, **fixed)
    model.fit(
        X_train,
        y_train,
        eval_set=[(X_test, y_test)],
        early_stopping_rounds=20,
        verbose=True,
    )

    hold_out_scores = model.predict_proba(X_test)[:, 1]
    predicted = pd.DataFrame(data={'prediction': hold_out_scores})
    truth = pd.DataFrame(data={'target': y_test.reset_index(drop=True)})

    return amex_metric(truth, predicted)
    

# Search for the CatBoost hyperparameters that maximize the objective (1 trial).
study_cat = optuna.create_study(direction = 'maximize', study_name="CatBoost Classifier")
study_cat.optimize(objective, n_trials = 1)
print('Best value:', study_cat.best_value)

In [None]:
# Refit CatBoost with the best hyperparameters found by Optuna.
# best_params holds only the values sampled through trial.suggest_*, so the
# fixed settings used during tuning (seed, loss, eval metric) are passed
# again here; otherwise the refit would not match the tuned configuration.
clf = CatBoostClassifier(
    **study_cat.best_params,
    random_seed=RAND,
    loss_function='Logloss',
    eval_metric='AUC',
)

clf.fit(X_train,
        y_train,
        eval_set=[(X_test, y_test)],
        verbose=False,
        early_stopping_rounds=100,
        cat_features=cat_features
       )


preds = clf.predict(X_test)
preds_prob = clf.predict_proba(X_test)[:, 1]

metric_df = pd.concat([metric_df, get_metrics(y_test, preds, preds_prob, 'Catboost Optuna')], ignore_index=True)
metric_df

## Stacking

In [None]:
class StackingClassifierOwn(BaseEstimator, ClassifierMixin):
    """Two-level stacking ensemble.

    Base models whose class is ``LGBMClassifier`` or ``CatBoostClassifier``
    are trained on the raw feature frame (categorical columns kept); every
    other base model is trained on the one-hot encoded frame. The meta-model
    is trained on the out-of-fold positive-class probabilities of the base
    models and, at prediction time, receives the same kind of input
    (positive-class probabilities of the refitted base models).
    """

    def __init__(self, models, ens_model, cat_feat, cv: int = 3):
        """
        models - base models of the stack
        ens_model - meta-model
        cat_feat - categorical feature names passed to CatBoost
        cv - number of folds
        """
        self.models = models
        self.ens_model = ens_model
        self.cv = cv
        self.cat_feat = cat_feat

    @staticmethod
    def _uses_raw_features(model) -> bool:
        """Return True for models trained on the raw (non one-hot) frame."""
        return model.__class__.__name__ in ('LGBMClassifier', 'CatBoostClassifier')

    def crossval_predict(self, model, X: pd.DataFrame, y: pd.Series) -> np.ndarray:
        """
        Build a meta-feature with cross-validation.

        Returns the out-of-fold positive-class probabilities as a 1-D array
        in which position ``i`` belongs to row ``i`` of ``X``.
        """
        folds = StratifiedKFold(n_splits=self.cv)
        model_name = model.__class__.__name__
        # Stratified folds are not contiguous blocks of rows, so predictions
        # are written back by position instead of concatenated in fold order;
        # this keeps the meta-feature aligned with y.
        oof = np.zeros(len(X), dtype=float)

        for train_index, test_index in folds.split(X, y):
            X_fold_train, X_fold_val = X.iloc[train_index], X.iloc[test_index]
            y_fold_train, y_fold_val = y.iloc[train_index], y.iloc[test_index]
            if model_name == 'LGBMClassifier':
                model.fit(X_fold_train,
                          y_fold_train,
                          eval_metric="auc",
                          eval_set=[(X_fold_val, y_fold_val)],
                          # callback form of verbose=False / early_stopping_rounds=100
                          callbacks=[early_stopping(100, verbose=False),
                                     log_evaluation(0)])
            elif model_name == 'CatBoostClassifier':
                train_data = Pool(data=X_fold_train,
                                  label=y_fold_train,
                                  cat_features=self.cat_feat)
                eval_data = Pool(data=X_fold_val,
                                 label=y_fold_val,
                                 cat_features=self.cat_feat)
                model.fit(train_data,
                          eval_set=eval_data,
                          use_best_model=True,
                          verbose=False,
                          early_stopping_rounds=100)
            else:
                model.fit(X_fold_train, y_fold_train)
            oof[test_index] = model.predict_proba(X_fold_val)[:, 1]
        return oof

    def fit(self, X: pd.DataFrame, y: pd.Series, X_bin: pd.DataFrame):
        """
        Train the base models and the meta-model.

        X - raw features (used by LightGBM / CatBoost models)
        y - target
        X_bin - one-hot encoded features (used by all other models)
        """
        meta_X = list()
        for model in tqdm_notebook(self.models):
            if self._uses_raw_features(model):
                # training on the data with categorical columns
                yhat = self.crossval_predict(model=model, X=X, y=y)
                meta_X.append(yhat.reshape(len(yhat), 1))

                # refit on the full data for later predictions
                if model.__class__.__name__ == 'CatBoostClassifier':
                    # No eval set is available for the full refit, so
                    # use_best_model (which needs one) is not requested.
                    model.fit(X, y, cat_features=self.cat_feat, silent=True)
                else:
                    model.fit(X, y)
            else:
                # training on the one-hot encoded data
                yhat = self.crossval_predict(model=model, X=X_bin, y=y)
                meta_X.append(yhat.reshape(len(yhat), 1))
                model.fit(X_bin, y)

        meta_X = np.hstack(meta_X)
        self.ens_model.fit(meta_X, y)

        return self

    def _meta_features(self, X: pd.DataFrame, X_bin: pd.DataFrame) -> np.ndarray:
        """Stack the base models' positive-class probabilities column-wise.

        The meta-model was trained on probabilities, so it must also be fed
        probabilities (not hard class labels) at prediction time.
        """
        columns = []
        for model in self.models:
            features = X if self._uses_raw_features(model) else X_bin
            yhat = model.predict_proba(features)[:, 1]
            columns.append(yhat.reshape(len(yhat), 1))
        return np.hstack(columns)

    def predict(self, X: pd.DataFrame, X_bin: pd.DataFrame) -> np.ndarray:
        """
        Predict class labels with the meta-model.
        """
        return self.ens_model.predict(self._meta_features(X, X_bin))

    def predict_proba(self, X: pd.DataFrame, X_bin: pd.DataFrame) -> np.ndarray:
        """
        Predict class probabilities with the meta-model.
        """
        return self.ens_model.predict_proba(self._meta_features(X, X_bin))

In [None]:
# Base algorithms (they should be uncorrelated)
clf_01 = LGBMClassifier(**study_lgb.best_params)
clf_02 = LGBMClassifier(**study_lgb.best_params, random_state=RAND+1)
# best_params holds only the sampled values, so the fixed settings used while
# tuning CatBoost (seed, loss, eval metric) are passed explicitly.
clf_03 = CatBoostClassifier(
    **study_cat.best_params,
    random_seed=RAND,
    loss_function='Logloss',
    eval_metric='AUC',
)

models = [clf_01, clf_02, clf_03]
stack_model = LogisticRegression(random_state=RAND)

# One-hot encoded training data
X_train_bin = pd.DataFrame(transformed.transform(X_train), columns=transformer.get_feature_names_out())
# Training
stack = StackingClassifierOwn(models, stack_model, cat_features, cv=5)
stack.fit(X_train, y_train, X_train_bin)

In [None]:
# One-hot encoded hold-out data (built from X_test; it was previously built
# from X_train, so it did not correspond to the rows being scored)
X_test_bin = pd.DataFrame(transformed.transform(X_test), columns=transformer.get_feature_names_out())

preds = stack.predict(X_test, X_test_bin)
preds_prob = stack.predict_proba(X_test, X_test_bin)[:, 1]

metric_df = pd.concat([metric_df, get_metrics(y_test, preds, preds_prob, 'Stacking')], ignore_index=True)
metric_df

In [None]:
# Plot the ROC curve of the stacking model on the hold-out split
fpr, tpr, thresholds = roc_curve(y_test, preds_prob)
roc_auc = auc(fpr, tpr)

# Legend label: the stray closing parenthesis after the value was removed
plt.plot(fpr, tpr, color='darkorange', label=f'AUC = {round(roc_auc, 3)}')
# Diagonal reference line (FPR == TPR)
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')

plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC curve')

plt.legend()
plt.show()

In [None]:
# Drop the training / hold-out objects before loading the test data.
del X_train_bin, X, y, X_train, y_train, X_test_bin, X_test, y_test
gc.collect()

# Предсказание тестовых данных

In [None]:
# Load the test data (the data to predict)

test_df = pd.read_feather('../input/amexfeather/test_data.ftr')
n_test_customers = len(test_df['customer_ID'].unique())
print('Количество уникальных ID:', n_test_customers)

# Remove the same columns that were removed from the training data
test_df.drop(columns=many_passes_col, inplace=True)

In [None]:
# Unique customer IDs and their number
uniq_ID = test_df['customer_ID'].unique()
uniq_len = uniq_ID.shape[0]

# Score the test set in about 20 chunks of customers. The step is at least 1
# so that range() never receives a zero step on a small input.
step = max(1, uniq_len // 20)

# Per-chunk results; concatenated once after the loop
submis_parts = []

for i in tqdm_notebook(range(0, uniq_len, step)):
    part_test_df = test_df[
        test_df['customer_ID'].isin(
            uniq_ID[i: i+step]
        )
    ]
    # Same feature pipeline as for the training data. The sparse columns were
    # already removed from test_df; the aggregated frame has only
    # '<column>_<aggregation>' names, so dropping the raw names again would fail.
    part_test_df = preproc_cat_num(part_test_df)
    part_test_df = replacing_missing_values(part_test_df).reset_index()

    customer_ID = part_test_df['customer_ID']
    X_test = part_test_df.drop(['customer_ID'], axis=1)
    X_test_bin = pd.DataFrame(transformed.transform(X_test),
                              columns=transformer.get_feature_names_out())

    # The Amex metric ranks customers by score, so the positive-class
    # probability is submitted rather than the hard class label.
    preds = stack.predict_proba(X_test, X_test_bin)[:, 1]
    submis_parts.append(
        pd.DataFrame({'customer_ID': customer_ID.to_numpy(), 'prediction': preds})
    )

# Frame with the predicted values
if submis_parts:
    submis = pd.concat(submis_parts, axis=0, ignore_index=True)
else:
    submis = pd.DataFrame({'customer_ID': pd.Series(dtype='object'),
                           'prediction': pd.Series(dtype='float64')})
    

In [None]:
# Preview the first rows of the submission frame.
submis.head()

In [None]:
# Write the submission file without the index column.
submis.to_csv('submission.csv', index=False)