In [2]:
# Data
import polars as pl
import numpy as np
import phik

# ML
from catboost import CatBoostClassifier, Pool
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score

# Some tools
from pathlib import Path
from tqdm.notebook import tqdm

# Data 🤹‍♀️

## Load ur data 📦

In [3]:
folder_path = Path('../data/')

In [4]:
train_df = pl.read_csv(folder_path / 'concated.csv')
sample_df = pl.read_csv(folder_path / 'sample_submission.csv')
test_df = pl.read_csv(folder_path / 'test.csv').drop('id')

In [5]:
train_df.shape

(91061, 12)

In [19]:
train_df

person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_status,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length
i64,i64,str,f64,str,str,i64,f64,i64,f64,str,i64
24,63000,"""MORTGAGE""",7.0,"""VENTURE""","""B""",6000,10.59,0,0.1,"""N""",2
27,48000,"""MORTGAGE""",4.0,"""VENTURE""","""A""",6000,7.49,0,0.13,"""N""",7
35,60000,"""MORTGAGE""",12.0,"""PERSONAL""","""B""",7000,10.62,0,0.12,"""N""",8
32,65000,"""OWN""",3.0,"""EDUCATION""","""C""",12000,13.35,0,0.18,"""N""",10
23,60000,"""MORTGAGE""",1.0,"""MEDICAL""","""A""",10000,8.49,0,0.17,"""N""",2
…,…,…,…,…,…,…,…,…,…,…,…
23,31000,"""RENT""",3.0,"""DEBTCONSOLIDATION""","""A""",12000,7.29,1,0.39,"""N""",3
35,70000,"""MORTGAGE""",16.0,"""HOMEIMPROVEMENT""","""A""",6000,5.42,0,0.09,"""N""",10
21,51996,"""MORTGAGE""",5.0,"""EDUCATION""","""B""",6000,10.62,0,0.12,"""N""",3
30,70000,"""MORTGAGE""",5.0,"""DEBTCONSOLIDATION""","""A""",14000,7.88,0,0.19,"""N""",5


## Feature Engineering ⚡️

### Try already done features 🤓 

In [11]:
(
    train_df.select(
        # pl.col(pl.Int64, pl.Float64)
        '*'
    ).to_pandas()
    .phik_matrix()
    ['loan_status'].sort_values()
)

interval columns not set, guessing: ['person_age', 'person_income', 'person_emp_length', 'loan_amnt', 'loan_int_rate', 'loan_status', 'loan_percent_income', 'cb_person_cred_hist_length']


person_income                 0.004249
person_age                    0.010261
person_emp_length             0.025630
cb_person_cred_hist_length    0.033936
loan_intent                   0.158070
loan_amnt                     0.186542
cb_person_default_on_file     0.288304
person_home_ownership         0.355777
loan_grade                    0.414850
loan_int_rate                 0.518182
loan_percent_income           0.543314
loan_status                   1.000000
Name: loan_status, dtype: float64

In [27]:
feature_rate = train_df.with_columns(
    (pl.col('loan_percent_income') / pl.col('person_income')).alias('loan_int_rate_per_persone_income'),
    (pl.col('loan_percent_income') > 0.2).alias('check_loan_bool'),
    ((pl.col('cb_person_default_on_file') == 'Y') & (pl.col('loan_grade').is_in(['C', 'D', 'E']))).alias('risk_flag')
    # (pl.col('loan_amnt') / pl.col('person_income') - pl.col('loan_percent_income')).alias('loan_to_income')
)

In [29]:
(
    feature_rate.select(
        # pl.col(pl.Int64, pl.Float64)
        '*'
    ).to_pandas()
    .phik_matrix()
    ['loan_status'].sort_values()
)

interval columns not set, guessing: ['person_age', 'person_income', 'person_emp_length', 'loan_amnt', 'loan_int_rate', 'loan_status', 'loan_percent_income', 'cb_person_cred_hist_length', 'loan_int_rate_per_persone_income']


person_income                       0.004249
person_age                          0.010261
person_emp_length                   0.025630
cb_person_cred_hist_length          0.033936
loan_intent                         0.158070
loan_amnt                           0.186542
loan_int_rate_per_persone_income    0.219960
risk_flag                           0.273636
cb_person_default_on_file           0.288304
person_home_ownership               0.355777
loan_grade                          0.414850
check_loan_bool                     0.454090
loan_int_rate                       0.518182
loan_percent_income                 0.543314
loan_status                         1.000000
Name: loan_status, dtype: float64

In [None]:
def preprocess(df):
    df['loan_to_income'] = ((df['loan_amnt'] / df['person_income']) - df['loan_percent_income']).astype('string').astype('category')
    df['age_income_interaction'] = (df['person_age'] * df['person_income']).astype('string').astype('category')
    df['loan_to_emp_length_ratio'] = (df['loan_amnt'] / df['person_emp_length'].replace({'None': original['person_emp_length'].mean()}).astype('float')).astype('string').astype('category')
    monthly_income = df['person_income'] / 12
    df['monthly_debt'] = (df['loan_amnt'] * (1 + df['loan_int_rate'].replace({'None': original['loan_int_rate'].mean()})) / 12)
    df['dti_ratio'] = (df['monthly_debt'] / monthly_income).astype('string').astype('category')
    df['monthly_debt'] = df['monthly_debt'].astype('string').astype('category')
    df['risk_flag'] = (np.where((df['cb_person_default_on_file'] == 'Y') & (df['loan_grade'].isin(['C', 'D', 'E'])), 1, 0))
    df['risk_flag'] = df['risk_flag'].astype('category')
    df['person_home_ownership'] = df['person_home_ownership'].astype('category')
    df['loan_intent'] = df['loan_intent'].astype('category')
    df['loan_grade'] = df['loan_grade'].astype('category')
    df['cb_person_default_on_file'] = df['cb_person_default_on_file'].astype('category')
    df['person_emp_length'] = df['person_emp_length'].astype('string').astype('category')
    df['loan_int_rate'] = (df['loan_int_rate'] * 100).astype('string').astype('category')
    df['loan_percent_income'] = (df['loan_percent_income'] * 100).astype('string').astype('category')

In [34]:
train_df.with_columns(
    (pl.col('loan_amnt') / pl.col('person_income') - pl.col('loan_percent_income')).alias('loan_to_income'),
    (pl.col(''))
)

person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_status,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length,loan_to_income
i64,i64,str,f64,str,str,i64,f64,i64,f64,str,i64,f64
24,63000,"""MORTGAGE""",7.0,"""VENTURE""","""B""",6000,10.59,0,0.1,"""N""",2,-0.004762
27,48000,"""MORTGAGE""",4.0,"""VENTURE""","""A""",6000,7.49,0,0.13,"""N""",7,-0.005
35,60000,"""MORTGAGE""",12.0,"""PERSONAL""","""B""",7000,10.62,0,0.12,"""N""",8,-0.003333
32,65000,"""OWN""",3.0,"""EDUCATION""","""C""",12000,13.35,0,0.18,"""N""",10,0.004615
23,60000,"""MORTGAGE""",1.0,"""MEDICAL""","""A""",10000,8.49,0,0.17,"""N""",2,-0.003333
…,…,…,…,…,…,…,…,…,…,…,…,…
23,31000,"""RENT""",3.0,"""DEBTCONSOLIDATION""","""A""",12000,7.29,1,0.39,"""N""",3,-0.002903
35,70000,"""MORTGAGE""",16.0,"""HOMEIMPROVEMENT""","""A""",6000,5.42,0,0.09,"""N""",10,-0.004286
21,51996,"""MORTGAGE""",5.0,"""EDUCATION""","""B""",6000,10.62,0,0.12,"""N""",3,-0.004607
30,70000,"""MORTGAGE""",5.0,"""DEBTCONSOLIDATION""","""A""",14000,7.88,0,0.19,"""N""",5,0.01


In [30]:
def calculate_corr(df: pl.DataFrame):
    return (
        df.select(
            pl.col(pl.Int64, pl.Float64, pl.Boolean)
        ).to_pandas()
        .phik_matrix()
        ['loan_status'].sort_values()
    )

In [31]:
calculate_corr(feature_rate)

interval columns not set, guessing: ['person_age', 'person_income', 'person_emp_length', 'loan_amnt', 'loan_int_rate', 'loan_status', 'loan_percent_income', 'cb_person_cred_hist_length', 'loan_int_rate_per_persone_income']


person_income                       0.004249
person_age                          0.010261
person_emp_length                   0.025630
cb_person_cred_hist_length          0.033936
loan_amnt                           0.186542
loan_int_rate_per_persone_income    0.219960
risk_flag                           0.273636
check_loan_bool                     0.454090
loan_int_rate                       0.518182
loan_percent_income                 0.543314
loan_status                         1.000000
Name: loan_status, dtype: float64

In [34]:
filtered = feature_rate.drop(['person_income', 'person_age', 'person_emp_length', 'cb_person_cred_hist_length'])

## Test ur code 🤖

In [35]:
def calc_metrics(dataset: pl.DataFrame, cv=StratifiedKFold(), get_artifacts=False):
    models_list = []
    metrics_list = []

    X = dataset.drop('loan_status')
    y = dataset.select('loan_status')

    cat_features = X.select(pl.col(pl.String)).columns

    pbar = tqdm(cv.split(X, y), total=cv.get_n_splits())
    for train_idx, test_idx, in pbar:
        X_train, X_test = X[train_idx], X[test_idx]
        y_train, y_test = y[train_idx], y[test_idx]

        train_pool = Pool(X_train.to_pandas(), y_train.to_pandas(), cat_features=cat_features)
        test_pool = Pool(X_test.to_pandas(), y_test.to_pandas(), cat_features=cat_features)

        model = CatBoostClassifier(
            iterations=5000, 
            learning_rate=0.05,
            eval_metric='AUC',
            loss_function='Logloss',
            random_seed=2024
        )

        model.fit(
            train_pool, 
            eval_set=test_pool,
            early_stopping_rounds=150,
            verbose=500
        )

        y_pred = model.predict_proba(X_test.to_pandas())[:, 1]
        score = roc_auc_score(y_test, y_pred)
        pbar.set_description(f'ROC-AUC: {score:.4f}')

        models_list.append(model)
        metrics_list.append(score)

    print(f'All metrics: {metrics_list}')
    print(f'Mean ROC-AUC: {np.mean(metrics_list) - np.std(metrics_list)}')

    if get_artifacts:
        return models_list, metrics_list

In [36]:
models_list, metrics_list = calc_metrics(filtered, get_artifacts=True)

  0%|          | 0/5 [00:00<?, ?it/s]

0:	test: 0.8549280	best: 0.8549280 (0)	total: 118ms	remaining: 9m 49s
500:	test: 0.9290144	best: 0.9290144 (500)	total: 13.8s	remaining: 2m 3s
1000:	test: 0.9327038	best: 0.9327072 (999)	total: 28s	remaining: 1m 52s
1500:	test: 0.9348161	best: 0.9348212 (1490)	total: 42.3s	remaining: 1m 38s
2000:	test: 0.9362342	best: 0.9362646 (1987)	total: 56.8s	remaining: 1m 25s
2500:	test: 0.9374439	best: 0.9374453 (2488)	total: 1m 11s	remaining: 1m 11s
3000:	test: 0.9382619	best: 0.9382808 (2980)	total: 1m 25s	remaining: 57.2s
3500:	test: 0.9385677	best: 0.9387107 (3400)	total: 1m 40s	remaining: 43s
Stopped by overfitting detector  (150 iterations wait)

bestTest = 0.9387106582
bestIteration = 3400

Shrink model to first 3401 iterations.
0:	test: 0.8583233	best: 0.8583233 (0)	total: 43.9ms	remaining: 3m 39s
500:	test: 0.9324237	best: 0.9324237 (500)	total: 13.5s	remaining: 2m 1s
1000:	test: 0.9356292	best: 0.9356316 (991)	total: 27.8s	remaining: 1m 50s
1500:	test: 0.9377196	best: 0.9377231 (1497)	

KeyboardInterrupt: 

# Predict

In [40]:
weights = [weight / np.sum(metrics_list) for weight in metrics_list]
weights

[0.20036091832660638,
 0.20010007103956562,
 0.19969826526141335,
 0.20035778742864785,
 0.19948295794376675]

In [41]:
y_subm = np.zeros(sample_df.shape[0])

In [43]:
test_df = test_df.with_columns(
    (pl.col('loan_percent_income') / pl.col('person_income')).alias('loan_int_rate_per_persone_income')
)
cat_features = test_df.select(pl.col(pl.String)).columns
test_pool = Pool(test_df.to_pandas(), cat_features=cat_features)

In [44]:
for model, weight in tqdm(zip(models_list, weights), total=len(weights)):
    y_pred = model.predict_proba(test_pool)[:, 1] * weight
    y_subm += y_pred
y_subm

  0%|          | 0/5 [00:00<?, ?it/s]

array([0.99983442, 0.04043389, 0.48940756, ..., 0.0076375 , 0.22591011,
       0.97226121])

In [45]:
sample_df = sample_df.with_columns(loan_status = y_subm)

In [46]:
folder_save = Path('../subs')

In [47]:
sample_df.write_csv(folder_save / 'cleared_dataset_weigted_predict.csv')