# Libraries

In [29]:
import pandas as pd
import numpy as np

from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.metrics import roc_auc_score
from catboost import CatBoostClassifier
from sklearn.linear_model import LogisticRegression
from lightgbm import early_stopping, LGBMClassifier
from sklearn.ensemble import StackingClassifier

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

# Helper building blocks for assembling the preprocessing pipeline
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

from sklearn import set_config
from tqdm.notebook import tqdm
import warnings
warnings.filterwarnings("ignore")
set_config(transform_output="pandas")

# Data

In [2]:
# Read the train and test CSVs; the `id` column is dropped from each frame
# immediately after loading.
train, test = (
    pd.read_csv(csv_path).drop(columns=['id'])
    for csv_path in ('../data/train.csv', '../data/test.csv')
)
train

Unnamed: 0,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,15674932,Okwudilichukwu,668,France,Male,33.0,3,0.00,2,1.0,0.0,181449.97,0
1,15749177,Okwudiliolisa,627,France,Male,33.0,1,0.00,2,1.0,1.0,49503.50,0
2,15694510,Hsueh,678,France,Male,40.0,10,0.00,2,1.0,0.0,184866.69,0
3,15741417,Kao,581,France,Male,34.0,2,148882.54,1,1.0,1.0,84560.88,0
4,15766172,Chiemenam,716,Spain,Male,33.0,5,0.00,2,1.0,1.0,15068.83,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
165029,15667085,Meng,667,Spain,Female,33.0,2,0.00,1,1.0,1.0,131834.75,0
165030,15665521,Okechukwu,792,France,Male,35.0,3,0.00,1,0.0,0.0,131834.45,0
165031,15664752,Hsia,565,France,Male,31.0,5,0.00,1,1.0,1.0,127429.56,0
165032,15689614,Hsiung,554,Spain,Female,30.0,7,161533.00,1,0.0,1.0,71173.03,0


## Features

In [3]:
# Keep only the first row seen for each (CustomerId, Surname) pair.
# The original index labels are retained (no reset).
is_repeated_customer = train.duplicated(subset=['CustomerId', 'Surname'])
train_f = train[~is_repeated_customer]
train_f

Unnamed: 0,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,15674932,Okwudilichukwu,668,France,Male,33.0,3,0.00,2,1.0,0.0,181449.97,0
1,15749177,Okwudiliolisa,627,France,Male,33.0,1,0.00,2,1.0,1.0,49503.50,0
2,15694510,Hsueh,678,France,Male,40.0,10,0.00,2,1.0,0.0,184866.69,0
3,15741417,Kao,581,France,Male,34.0,2,148882.54,1,1.0,1.0,84560.88,0
4,15766172,Chiemenam,716,Spain,Male,33.0,5,0.00,2,1.0,1.0,15068.83,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
165028,15704770,Oluchukwu,630,France,Male,50.0,8,0.00,2,1.0,1.0,5962.50,0
165030,15665521,Okechukwu,792,France,Male,35.0,3,0.00,1,0.0,0.0,131834.45,0
165031,15664752,Hsia,565,France,Male,31.0,5,0.00,1,1.0,1.0,127429.56,0
165032,15689614,Hsiung,554,Spain,Female,30.0,7,161533.00,1,0.0,1.0,71173.03,0


In [4]:
# Columns routed to the categorical branch of the ColumnTransformer
# (imputed with the most frequent value, then one-hot encoded).
cat_features = [
    'Geography',
    'Gender',
    'Tenure',
    'NumOfProducts',
    'HasCrCard',
    'IsActiveMember',
]
# Columns routed to the numerical branch (imputation only).
num_features = [
    'CreditScore',
    'Age',
    'Balance',
    'EstimatedSalary',
]

In [5]:
# Categorical branch: fill gaps with the most frequent value, then one-hot
# encode. `drop='first'` removes one dummy column per feature and
# `handle_unknown="ignore"` keeps `transform` from failing on unseen categories.
# `sparse_output=False` replaces the old `sparse=False` spelling, which
# scikit-learn deprecated in 1.2 and removed in 1.4. Dense output is needed
# because the notebook sets transform_output="pandas".
categorical_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy='most_frequent')),
    ("onehot", OneHotEncoder(handle_unknown="ignore", drop='first', sparse_output=False))])
# Numerical branch: imputation with SimpleImputer's default strategy only;
# no scaling step is applied.
numerical_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer()),
])
# Route each feature list through its branch. Output columns are prefixed
# with the branch name ("numerical__" / "categorical__").
preprocessor = ColumnTransformer(transformers=[
    ("numerical", numerical_transformer, num_features),
    ("categorical", categorical_transformer, cat_features)])

preprocessor

In [6]:
# Fit the preprocessor on the de-duplicated training frame exactly once
# (the previous version ran fit_transform a second time only to read the
# column names), then strip the "<branch>__" prefix from each output column.
X = (
    preprocessor.fit_transform(train_f)
    .rename(columns=lambda column: column.split('__')[1])
)
# Target column of the same de-duplicated frame.
y = train_f['Exited']

In [7]:
# Apply the preprocessor that was fitted on the training frame. Calling
# fit_transform here would re-learn the imputation values and one-hot
# categories from the test set, so train and test features could end up
# encoded differently. The "<branch>__" prefix is stripped as for X.
X_test = (
    preprocessor.transform(test)
    .rename(columns=lambda column: column.split('__')[1])
)

# ML

In [8]:
# Keyword arguments unpacked into CatBoostClassifier.
# 0.9000956693941863 was recorded next to this configuration
# (presumably the validation AUC it reached - confirm).
catboost_params = dict(
    iterations=2000,
    eval_metric="AUC",
    learning_rate=0.1696310500969717,
    l2_leaf_reg=48,
    colsample_bylevel=0.5868476873585279,
    min_data_in_leaf=25,
    auto_class_weights='SqrtBalanced',
    depth=5,
    boosting_type='Ordered',
    bootstrap_type='Bernoulli',
    objective='Logloss',
    subsample=0.7402107645449161,
    random_seed=2024,
    verbose=0,
    thread_count=-1,
    early_stopping_rounds=50,
)

# Keyword arguments unpacked into LGBMClassifier.
# 0.9009227922388704 was recorded next to this configuration
# (presumably the validation AUC it reached - confirm).
lgbm_params = dict(
    lambda_l1=2.3772843324101983e-05,
    lambda_l2=3.669720725756965e-07,
    num_leaves=4,
    feature_fraction=0.7574838929544363,
    bagging_fraction=0.7330639363496431,
    bagging_freq=7,
    min_child_samples=51,
    n_jobs=-1,
    n_estimators=1000,
    verbosity=0,
)

In [9]:
# Base learners as (name, estimator) pairs. The names matter downstream:
# generate_oof_trainset picks the fit call by checking for the substrings
# 'Cat' and 'LGB' in the name.
catboost_clf = CatBoostClassifier(**catboost_params)
lgbm_clf = LGBMClassifier(**lgbm_params)
estimators = [('CatBoost', catboost_clf), ('LGBM', lgbm_clf)]

In [30]:
# Stacking ensemble over the base learners. The meta-learner is trained on
# their cross-validated `predict_proba` outputs from a shuffled, stratified
# 5-fold split. No final_estimator is passed, so the library default is used.
stacking_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
model = StackingClassifier(
    estimators=estimators,
    cv=stacking_cv,
    stack_method='predict_proba',
    n_jobs=-1,
    verbose=2,
)
model

In [31]:
# Fit the stacking ensemble on the preprocessed training features and target.
# The value returned by fit() is the cell's last expression, so it is displayed.
model.fit(X, y)



[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 32 concurrent workers.




[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 32 concurrent workers.




[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:   26.5s remaining:   39.8s




[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   32.1s finished
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:   53.6s remaining:  1.3min
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   54.4s finished


In [14]:
def generate_oof_trainset(train, test, target, strat_kfold, estimators):
    """Build out-of-fold (OOF) predictions for `train` and full-fit predictions for `test`.

    For every split produced by `strat_kfold`, each estimator is fitted on the
    training part of the split and its positive-class probability
    (`predict_proba(...)[:, 1]`) is recorded for the held-out part. Afterwards
    each estimator is refitted on all of `train` and used to predict `test`.

    Parameters
    ----------
    train : pandas.DataFrame
        Training features; rows are selected positionally with `.iloc`.
    test : pandas.DataFrame
        Features to predict after the full refit.
    target : pandas.Series
        Labels aligned positionally with `train`; must be castable to float.
    strat_kfold : object
        Splitter exposing `split(X, y)` that yields (train_positions,
        held_out_positions) pairs. It is called with the target cast to str.
    estimators : list of (str, estimator)
        Named models. The name selects the fit call: names containing 'Cat'
        are fitted with `eval_set` and `early_stopping_rounds=100`; names
        containing 'LGB' with `eval_set` and an `early_stopping(100)`
        callback; everything else with a plain `fit(X, y)`. Names containing
        'class' receive string labels, all others float labels.
        The estimator objects are fitted in place.

    Returns
    -------
    (oof_train, oof_test) : tuple of pandas.DataFrame
        `oof_train` has integer column labels: column i holds the held-out
        predictions of estimator i, and the last column holds the held-out
        target as float. Rows are stacked in fold order with a fresh
        0..n-1 index, not in the original row order of `train`.
        `oof_test` has one integer-labelled column per estimator.

    Notes
    -----
    The held-out fold is also the early-stopping evaluation set, so the
    printed AUC values and the OOF predictions may be optimistic.
    """
    def _labels_for(model_name, labels):
        # Always cast from the untouched labels, so the string form ('0', '1')
        # does not depend on which estimator was handled before this one.
        return labels.astype(str) if 'class' in model_name else labels.astype(float)

    print(train.shape, target.shape)

    fold_frames = []  # one small frame per fold, concatenated once at the end
    for (idx, (train_id, test_id)) in enumerate(strat_kfold.split(train, target.astype(str))):
        print(f"Fold {idx+1}")
        X_train, X_valid = train.iloc[train_id], train.iloc[test_id]
        y_train_raw, y_valid_raw = target.iloc[train_id], target.iloc[test_id]
        # Float view of the held-out labels, used for scoring and for the last column.
        y_valid_float = y_valid_raw.astype(float)

        fold_columns = {}
        for (i, (model_name, model)) in enumerate(tqdm(estimators)):
            print(model_name)
            y_train = _labels_for(model_name, y_train_raw)
            y_valid = _labels_for(model_name, y_valid_raw)

            if 'Cat' in model_name:
                model.fit(X_train, y_train, eval_set=(X_valid, y_valid), early_stopping_rounds=100)
            elif 'LGB' in model_name:
                model.fit(X_train, y_train, eval_set=(X_valid, y_valid), callbacks=[
                    early_stopping(100),
                ])
            else:
                model.fit(X_train, y_train)

            y_pred = model.predict_proba(X_valid)[:, 1].astype(float)

            print(roc_auc_score(y_valid_float, y_pred))
            fold_columns[i] = y_pred

        # Plain array: the original index labels of the fold must not drive alignment.
        fold_columns[len(estimators)] = y_valid_float.to_numpy()
        fold_frames.append(pd.DataFrame(fold_columns))

    # pd.concat rejects an empty list; keep the empty-frame result for that case.
    oof_train = pd.concat(fold_frames, ignore_index=True) if fold_frames else pd.DataFrame()

    test_columns = {}
    print('Making test')
    for (idx, (model_name, model)) in enumerate(estimators):
        print(model_name)
        full_target = _labels_for(model_name, target)
        if 'Cat' in model_name:
            # NOTE(review): no eval_set is passed here, so early stopping
            # presumably has nothing to monitor - confirm against CatBoost docs.
            model.fit(train, full_target, early_stopping_rounds=100)
        else:
            model.fit(train, full_target)
        test_columns[idx] = model.predict_proba(test)[:, 1].astype(float)

    oof_test = pd.DataFrame(test_columns)

    return oof_train, oof_test

In [None]:
# Produce OOF predictions for the training rows and full-fit predictions for
# the test rows, using a shuffled, stratified 5-fold split seeded with 2024.
oof_splitter = StratifiedKFold(n_splits=5, shuffle=True, random_state=2024)
oof_train, oof_test = generate_oof_trainset(
    X,
    X_test,
    y,
    oof_splitter,
    estimators=estimators,
)

# Submit

## Get Weights for OOF

In [53]:
# Preview the candidate blend weights: 101 evenly spaced values from 0 to 1 inclusive.
np.linspace(0, 1, 101)

array([0.  , 0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08, 0.09, 0.1 ,
       0.11, 0.12, 0.13, 0.14, 0.15, 0.16, 0.17, 0.18, 0.19, 0.2 , 0.21,
       0.22, 0.23, 0.24, 0.25, 0.26, 0.27, 0.28, 0.29, 0.3 , 0.31, 0.32,
       0.33, 0.34, 0.35, 0.36, 0.37, 0.38, 0.39, 0.4 , 0.41, 0.42, 0.43,
       0.44, 0.45, 0.46, 0.47, 0.48, 0.49, 0.5 , 0.51, 0.52, 0.53, 0.54,
       0.55, 0.56, 0.57, 0.58, 0.59, 0.6 , 0.61, 0.62, 0.63, 0.64, 0.65,
       0.66, 0.67, 0.68, 0.69, 0.7 , 0.71, 0.72, 0.73, 0.74, 0.75, 0.76,
       0.77, 0.78, 0.79, 0.8 , 0.81, 0.82, 0.83, 0.84, 0.85, 0.86, 0.87,
       0.88, 0.89, 0.9 , 0.91, 0.92, 0.93, 0.94, 0.95, 0.96, 0.97, 0.98,
       0.99, 1.  ])

In [45]:
# Grid-search the blend weight of the two base models on the OOF frame:
# column 0 and column 1 are the model predictions, column 2 is the target.
# Each AUC that matches or beats the running best is printed and stored,
# so on ties the last weight wins.
best_auc, best_params = 0, None
for w1 in np.linspace(0, 1, 101):
    w2 = 1 - w1
    ensemble_predict = w1 * oof_train[0] + w2 * oof_train[1]
    auc = roc_auc_score(oof_train[2], ensemble_predict)
    if not (auc >= best_auc):
        continue
    print(auc)
    best_auc, best_params = auc, (w1, w2)

0.9030232725838212
0.9030475231040603
0.903070441944419
0.9030923906285662
0.9031127270119329
0.9031319615605233
0.9031496643725967
0.9031662186361613
0.9031811294564575
0.9031953544650002
0.9032088828296199
0.9032216478647707
0.9032329640972052
0.9032432208080263
0.9032529101839522
0.903261583028439
0.9032694583544212
0.9032767223398177
0.9032825029949562
0.9032877308709273
0.9032924357561981
0.9032964544912103
0.903299250206546
0.9033010615484474
0.9033023475301108
0.9033031528342379
0.903303361015002


In [46]:
# Show the (w1, 1 - w1) weight pair kept by the grid search.
best_params

(0.26, 0.74)

In [47]:
# Blend the per-model test predictions with the selected weights; weights
# are paired with oof_test columns in column order.
y_pred = 0
for blend_weight, model_column in zip(best_params, oof_test.columns):
    y_pred = y_pred + blend_weight * oof_test[model_column]
y_pred

0         0.024081
1         0.850789
2         0.024517
3         0.288808
4         0.368992
            ...   
110018    0.028023
110019    0.072133
110020    0.027671
110021    0.158798
110022    0.176500
Length: 110023, dtype: float64

In [48]:
# Load the submission template and set its `Exited` column to the blended
# predictions (aligned on the row index).
sample = pd.read_csv('../data/sample_submission.csv').assign(Exited=y_pred)
sample

Unnamed: 0,id,Exited
0,165034,0.024081
1,165035,0.850789
2,165036,0.024517
3,165037,0.288808
4,165038,0.368992
...,...,...
110018,275052,0.028023
110019,275053,0.072133
110020,275054,0.027671
110021,275055,0.158798


In [49]:
# Write the submission without the index column. The "#0.9033" suffix in the
# file name presumably records the best OOF AUC of the blend - confirm.
sample.to_csv('../subs/dropped_dubl+oof_weights#0.9033.csv', index=False)