# Libraries 👨‍🏫

In [23]:
# Data
import polars as pl
import numpy as np

# ML
from catboost import CatBoostClassifier, Pool
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score

# Tools
from pathlib import Path
from tqdm.notebook import tqdm

# Data 🤹‍♀️

## Load your data 📦

In [2]:
# Directory holding the input CSV files, relative to the notebook location.
folder_path = Path('..') / 'data'

In [41]:
# Resolve the three CSV locations once, then read them.
train_path, test_path, sample_path = (
    folder_path / file_name
    for file_name in ('train.csv', 'test.csv', 'sample_submission.csv')
)

# The submission template is read as-is (its columns are kept untouched).
sample_df = pl.read_csv(sample_path)

# The 'id' column is dropped from both feature tables right after reading.
train_df = pl.read_csv(train_path).drop('id')
test_df = pl.read_csv(test_path).drop('id')

In [4]:
# Preview the first five rows of the training frame.
train_df.head(n=5)

person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length,loan_status
i64,i64,str,f64,str,str,i64,f64,f64,str,i64,i64
37,35000,"""RENT""",0.0,"""EDUCATION""","""B""",6000,11.49,0.17,"""N""",14,0
22,56000,"""OWN""",6.0,"""MEDICAL""","""C""",4000,13.35,0.07,"""N""",2,0
29,28800,"""OWN""",8.0,"""PERSONAL""","""A""",6000,8.9,0.21,"""N""",10,0
30,70000,"""RENT""",14.0,"""VENTURE""","""B""",12000,11.11,0.17,"""N""",5,0
22,60000,"""RENT""",2.0,"""MEDICAL""","""A""",6000,6.92,0.1,"""N""",3,0


# ML 🦾

In [5]:
# Split the training frame into features (everything except the target)
# and the target, which is kept as a one-column frame.
X = train_df.select(pl.exclude('loan_status'))
y = train_df.select(pl.col('loan_status'))

In [6]:
# Names of every string-typed column, in frame order; passed to CatBoost
# as the categorical features.
cat_features = [
    column_name
    for column_name, column_dtype in train_df.schema.items()
    if column_dtype == pl.String
]

In [7]:
# Stratified splitter with scikit-learn's defaults spelled out
# (five folds, no shuffling).
cv = StratifiedKFold(n_splits=5, shuffle=False)

In [1]:
# Cross-validated training: fit one CatBoost model per stratified fold and
# record its ROC-AUC on the held-out fold. The fitted models are collected in
# `models_list` and the per-fold scores in `metrics_list`.
models_list = []
metrics_list = []

pbar = tqdm(cv.split(X, y), total=cv.get_n_splits())
for train_idx, test_idx in pbar:
    # "test" here means the held-out fold of the training data, not test.csv.
    X_train, X_test = X[train_idx], X[test_idx]
    y_train, y_test = y[train_idx], y[test_idx]

    # The pools are built from pandas objects converted from the polars frames.
    train_pool = Pool(X_train.to_pandas(), y_train.to_pandas(), cat_features=cat_features)
    test_pool = Pool(X_test.to_pandas(), y_test.to_pandas(), cat_features=cat_features)

    model = CatBoostClassifier(
        iterations=5000,
        learning_rate=0.05,
        eval_metric='AUC',
        loss_function='CrossEntropy',
        random_seed=2024
    )

    # NOTE(review): the held-out fold drives early stopping and is also the
    # fold scored below, so the per-fold scores are likely somewhat optimistic.
    model.fit(
        train_pool,
        eval_set=test_pool,
        early_stopping_rounds=150,
        verbose=500
    )

    # Reuse the already-built pool instead of converting the fold to pandas again.
    y_pred = model.predict_proba(test_pool)[:, 1]
    # Hand the metric a flat label array rather than a one-column frame.
    score = roc_auc_score(y_test.to_numpy().ravel(), y_pred)
    pbar.set_description(f'ROC-AUC: {score:.4f}')

    models_list.append(model)
    metrics_list.append(score)

# Report the mean and the spread separately; the conservative "mean - std"
# figure is still printed, but under a label that says what it is.
mean_auc = np.mean(metrics_list)
std_auc = np.std(metrics_list)
print(f'All metrics: {metrics_list}')
print(f'Mean ROC-AUC: {mean_auc:.4f} (std: {std_auc:.4f})')
print(f'Mean - std ROC-AUC: {mean_auc - std_auc:.4f}')

NameError: name 'tqdm' is not defined

# Submission 🤖

In [42]:
# Zero-filled accumulator with one slot per row of the submission template.
y_subm = np.zeros(len(sample_df))

In [43]:
# Average the positive-class probabilities of all fold models on the test set.
# `y_subm` is rebuilt from scratch here, so re-running this cell cannot
# accumulate on top of a previous run's values.
if not models_list:
    raise ValueError('models_list is empty - run the cross-validation cell first')

# Convert the test frame once instead of once per model.
test_features = test_df.to_pandas()
fold_predictions = [
    model.predict_proba(test_features)[:, 1]
    for model in tqdm(models_list)
]
y_subm = np.mean(fold_predictions, axis=0)
y_subm

  0%|          | 0/5 [00:00<?, ?it/s]

array([0.99885418, 0.02028319, 0.60540859, ..., 0.00651037, 0.1138485 ,
       0.95386074])

In [48]:
# Attach the averaged predictions to the submission frame as 'loan_status'.
loan_status_column = pl.Series('loan_status', y_subm)
sample_df = sample_df.with_columns(loan_status_column)

In [49]:
# Directory where submission files are written, relative to the notebook.
folder_save = Path('..') / 'subs'

In [50]:
# Make sure the output directory exists so the write does not fail on a
# fresh checkout, then save the submission file.
folder_save.mkdir(parents=True, exist_ok=True)
sample_df.write_csv(folder_save / 'baseline.csv')