In [1]:
! pip install xgbimputer

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import catboost as cat
from xgbimputer import XGBImputer
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score

Collecting xgbimputer
  Downloading xgbimputer-0.2.0-py3-none-any.whl.metadata (9.2 kB)
Downloading xgbimputer-0.2.0-py3-none-any.whl (8.0 kB)
Installing collected packages: xgbimputer
Successfully installed xgbimputer-0.2.0


This experiment is inspired by [this](https://www.kaggle.com/code/sunilkumarmuduli/from-beginner-to-roc-star-the-a-to-z-guide) notebook. In summary, that study filled all the missing values with `None` and specified every column as categorical. I wanted to see whether imputing the missing values with `XGBImputer`, instead of filling them with `None`, could enhance the CatBoost algorithm.

In [2]:
# Competition files: the `id` column becomes the row index.
train1 = pd.read_csv('/kaggle/input/playground-series-s4e11/train.csv', index_col='id')
test = pd.read_csv('/kaggle/input/playground-series-s4e11/test.csv', index_col='id')

# Extra dataset whose target is stored as "Yes"/"No"; encode it as 1/0 so it
# can be cast to int together with the competition target below.
original = pd.read_csv(
    '/kaggle/input/depression-surveydataset-for-analysis/final_depression_dataset_1.csv'
).assign(Depression=lambda frame: frame['Depression'].map({"Yes": 1, "No": 0}))

# Stack both sources; ignore_index=True gives the result a fresh 0..n-1 index.
train = pd.concat([train1, original], axis=0, ignore_index=True)

# Split the target from the features.
y = train['Depression'].astype(int)
X = train.drop(columns=['Depression'])

# Names of the object-dtype columns, treated as categorical by the imputer.
cat_features = X.select_dtypes(include="object").columns.values

In [3]:
# Positional index of every categorical column within X, in the order
# the columns are listed in `cat_features`.
cat_feature_index = [X.columns.get_loc(column_name) for column_name in cat_features]

In [4]:
# Impute missing values with XGBImputer, telling it which column positions
# are categorical.
# NOTE(review): `replace_categorical_values_back=True` presumably restores the
# original category labels in the output -- confirm against the xgbimputer docs.
imputer = XGBImputer(
    categorical_features_index=cat_feature_index,
    replace_categorical_values_back=True,
)

# Fit on the training features, then apply the same imputer to the test set.
X_transformed = imputer.fit_transform(X)
test_transformed = imputer.transform(test)

XGBImputer - Epoch: 1 | Categorical gamma: inf/6479.6667 | Numerical gamma: inf/0.0049089356
XGBImputer - Epoch: 2 | Categorical gamma: 6479.6667/724.6667 | Numerical gamma: 0.0049089356/0.0019103858
XGBImputer - Epoch: 3 | Categorical gamma: 724.6667/0.6667 | Numerical gamma: 0.0019103858/0.000846897
XGBImputer - Epoch: 4 | Categorical gamma: 0.6667/0. | Numerical gamma: 0.000846897/0.
XGBImputer - Epoch: 5 | Categorical gamma: 0./0. | Numerical gamma: 0./0.
XGBImputer - Epoch: 1 | Categorical gamma: inf/3755.6667 | Numerical gamma: inf/0.0141308693
XGBImputer - Epoch: 2 | Categorical gamma: 3755.6667/685.6667 | Numerical gamma: 0.0141308693/0.003312343
XGBImputer - Epoch: 3 | Categorical gamma: 685.6667/0. | Numerical gamma: 0.003312343/0.0007575703
XGBImputer - Epoch: 4 | Categorical gamma: 0./0. | Numerical gamma: 0.0007575703/0.
XGBImputer - Epoch: 5 | Categorical gamma: 0./0. | Numerical gamma: 0./0.


In [5]:
# Wrap both imputed outputs in DataFrames that reuse X's column names and cast
# every column to the pandas "string" dtype, so that all features can later be
# passed to CatBoost as categorical.
X_transformed, test_transformed = (
    pd.DataFrame(imputed_values, columns=X.columns.values).astype("string")
    for imputed_values in (X_transformed, test_transformed)
)

In [6]:
# CatBoost training parameters: log-loss objective, AUC tracked on the eval set.
cat_params = {
    'loss_function': 'Logloss',
    'eval_metric': 'AUC',
    'learning_rate': 0.08114394459649094,
    'depth': 6,
    'random_strength':0,
    'l2_leaf_reg': 0.7047064221215757,
    'random_seed':42,
    # 'task_type': 'GPU'
}

# From here on X / test refer to the imputed, all-string frames.
X = X_transformed
test = test_transformed

cv = StratifiedKFold(5, shuffle=True, random_state=0)
cv_splits = cv.split(X, y)

# The test pool does not depend on the fold, so it is built once up front.
# Every column is declared categorical.
dtest = cat.Pool(data = test, cat_features = X.columns.values)

scores = []      # per-fold validation accuracy
test_preds = []  # per-fold positive-class probabilities for the test set
for i, (train_idx, val_idx) in enumerate(cv_splits):
    # StratifiedKFold yields *positional* indices, so select rows with .iloc;
    # .loc would only be correct while X and y carry a default RangeIndex.
    X_train_fold, X_val_fold = X.iloc[train_idx], X.iloc[val_idx]
    y_train_fold, y_val_fold = y.iloc[train_idx], y.iloc[val_idx]

    dtrain = cat.Pool(data = X_train_fold, label = y_train_fold, cat_features = X.columns.values)
    dval = cat.Pool(data = X_val_fold, label = y_val_fold, cat_features = X.columns.values)

    # NOTE(review): early stopping monitors the same fold that is scored below,
    # so the reported accuracy may be slightly optimistic.
    model = cat.train(pool = dtrain,
                      params = cat_params,
                      num_boost_round=1000,
                      verbose=0,
                      eval_set=[dval],
                      early_stopping_rounds = 50)

    # Predict on the Pool (not the raw frame) so the categorical feature
    # declaration is applied the same way as for the test predictions.
    val_pred = model.predict(dval, prediction_type="Class")
    score = accuracy_score(y_val_fold, val_pred)
    scores.append(score)

    # Column 1 of the probability output is kept as the positive-class probability.
    test_pred = model.predict(dtest, prediction_type = "Probability")[:, 1]
    test_preds.append(test_pred)
    print(f'Fold {i + 1} accuracy_score: {score}')

print(f'Cross-validated accuracy_score: {np.mean(scores):.3f} +/- {np.std(scores):.3f}')

Fold 1 accuracy_score: 0.9402484992321652
Fold 2 accuracy_score: 0.9400020941677428
Fold 3 accuracy_score: 0.9407699556734495
Fold 4 accuracy_score: 0.9405954416948797
Fold 5 accuracy_score: 0.9390597186834665
Cross-validated accuracy_score: 0.940 +/- 0.001


In [7]:
# Start from the sample submission so the `id` column and row order are kept.
sample_submission = pd.read_csv('/kaggle/input/playground-series-s4e11/sample_submission.csv')

# Average the per-fold test probabilities, then round to a 0.0 / 1.0 label.
fold_average = np.asarray(test_preds).mean(axis=0)
sample_submission['Depression'] = np.round(fold_average)
sample_submission.head()

Unnamed: 0,id,Depression
0,140700,0.0
1,140701,0.0
2,140702,0.0
3,140703,1.0
4,140704,0.0


In [8]:
# Write the predictions to submission.csv; index=False keeps the DataFrame's
# row index out of the file.
sample_submission.to_csv('submission.csv', index=False)