In [None]:
# This notebook follows the approach suggested in the ChatGPT conversation linked below:
# https://chatgpt.com/share/693db66a-4118-8012-91a5-7db54a2aade9

In [151]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import optuna
from sklearn.model_selection import train_test_split, RandomizedSearchCV, cross_val_score, KFold,StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

# Read the labelled and the unlabelled CSVs; the first column of each file becomes the index.
loan_df, test_df = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in ('data/train.csv', 'data/test.csv')
)

In [152]:
# Column bookkeeping: every column except the last is a feature, the last one is the target.

cols = loan_df.columns
feats, target = cols[:-1], cols[-1]

# Feature groups. The ordinal columns are integer-encoded by `preprocess`;
# the nominal columns are passed to CatBoost as categorical features.
ordinal_cols = ['grade_subgrade', 'education_level']
nominal_cols = ['gender', 'marital_status', 'employment_status', 'loan_purpose']
numeric_cols = ['annual_income', 'debt_to_income_ratio', 'credit_score', 'loan_amount', 'interest_rate']

# Positions of the nominal columns within loan_df.columns. Because `feats` is a
# prefix of those columns, the same positions are valid for frames restricted to `feats`.
cat_feat_indices = list(map(loan_df.columns.get_loc, nominal_cols))
    # value recorded by the author: [5, 6, 8, 9]

In [153]:
# Stratified split of the labelled data: 85% for fitting, the rest as a hold-out set.

loan_train, loan_test = train_test_split(
    loan_df,
    train_size=0.85,
    stratify=loan_df[target],
    random_state=42,
)
loan_train.shape

(302936, 12)

In [154]:
# Integer encodings ("label books") for the ordinal features

grades = 'ABCDEFG'
grade_levels = '12345'
# Every letter/level pair in order: A1, A2, ..., A5, B1, ..., G5.
subgrades = [letter + level for letter in grades for level in grade_levels]
# Rank the subgrades starting at 1 (A1 -> 1, ..., G5 -> 35).
grade_mapping = dict(zip(subgrades, range(1, len(subgrades) + 1)))

# Education levels ranked from 1 to 5 in the order listed here.
edu_mapping = {
    level: rank
    for rank, level in enumerate(
        ['High School', 'Other', "Bachelor's", "Master's", 'PhD'],
        start=1,
    )
}

# Per-feature direction flags, keyed by feature name. Features that are not listed
# (including the two commented-out candidates) get 0 in the list built below.
# Presumably meant for CatBoost's `monotone_constraints` argument, which is currently
# commented out where the classifier is built — confirm before enabling.
_monotone_direction = {
    'grade_subgrade':-1,
    # 'education_level':1,
    # 'annual_income':1,
    'debt_to_income_ratio':-1,
    'credit_score':1,
    'interest_rate':-1
}
# One entry per feature, in the same order as `feats`.
monotone_constraints = [_monotone_direction.get(name, 0) for name in feats]

def preprocess(data):
    """Return a copy of ``data`` with the two ordinal columns integer-encoded.

    ``education_level`` is mapped through ``edu_mapping`` and ``grade_subgrade``
    through ``grade_mapping``; values missing from a mapping become NaN
    (``Series.map`` semantics). The input frame is left untouched and all other
    columns, as well as the column order, are preserved.
    """
    return data.assign(
        education_level=data['education_level'].map(edu_mapping),
        grade_subgrade=data['grade_subgrade'].map(grade_mapping),
    )

In [155]:
# Sanity check of the categorical value counts (snippet below): no issues so far
# for feat in ordinal_cols+nominal_cols:
#     print(X[feat].value_counts())

In [172]:
# Fit on the 85% split so that loan_test stays a genuine hold-out set.
# Previously X_train/y_train were built from the full loan_df, which contains every
# loan_test row, so the "test" accuracy was measured on rows the model had trained on.
# Set FIT_ON_FULL_DATA = True only for a final submission fit; the hold-out scores
# are then no longer an independent estimate.
FIT_ON_FULL_DATA = False
fit_df = loan_df if FIT_ON_FULL_DATA else loan_train

X_train = preprocess(fit_df)[feats]
X_test = preprocess(loan_test)[feats]
y_train = fit_df[target]
y_test = loan_test[target]

In [173]:
from catboost import CatBoostClassifier

# Settings for the baseline CatBoost model, collected in one place so they are easy to tweak.
cat_clf_params = dict(
    # monotone_constraints=monotone_constraints,
    iterations=4000,
    learning_rate=0.1,
    depth=5,
    loss_function='Logloss',
    eval_metric='AUC',
    random_state=42,
    verbose=100,  # log every 100th iteration
    task_type="CPU",
)
cat_clf = CatBoostClassifier(**cat_clf_params)

In [174]:
# Fit the baseline model, telling CatBoost which column positions are categorical.
cat_clf.fit(X_train, y_train, cat_features=cat_feat_indices)

# Hard-label predictions for the fitting data and for the hold-out data.
train_y_pred, test_y_pred = (cat_clf.predict(split) for split in (X_train, X_test))

# Prints train accuracy first, then hold-out accuracy.
for y_true, y_hat in ((y_train, train_y_pred), (y_test, test_y_pred)):
    print(accuracy_score(y_true, y_hat))

0:	total: 142ms	remaining: 9m 29s
100:	total: 16.2s	remaining: 10m 26s
200:	total: 33.3s	remaining: 10m 29s
300:	total: 51.5s	remaining: 10m 32s
400:	total: 1m 6s	remaining: 9m 55s
500:	total: 1m 20s	remaining: 9m 24s
600:	total: 1m 35s	remaining: 9m 1s
700:	total: 1m 50s	remaining: 8m 39s
800:	total: 2m 4s	remaining: 8m 18s
900:	total: 2m 19s	remaining: 8m
1000:	total: 2m 34s	remaining: 7m 42s
1100:	total: 2m 49s	remaining: 7m 26s
1200:	total: 3m 4s	remaining: 7m 10s
1300:	total: 3m 20s	remaining: 6m 54s
1400:	total: 3m 35s	remaining: 6m 39s
1500:	total: 3m 50s	remaining: 6m 24s
1600:	total: 4m 6s	remaining: 6m 10s
1700:	total: 4m 22s	remaining: 5m 55s
1800:	total: 4m 38s	remaining: 5m 40s
1900:	total: 4m 55s	remaining: 5m 26s
2000:	total: 5m 12s	remaining: 5m 12s
2100:	total: 5m 30s	remaining: 4m 58s
2200:	total: 5m 48s	remaining: 4m 44s
2300:	total: 6m 4s	remaining: 4m 29s
2400:	total: 6m 19s	remaining: 4m 12s
2500:	total: 6m 34s	remaining: 3m 56s
2600:	total: 6m 49s	remaining: 3m 4

In [175]:
# submission file: predict labels for the unlabelled test set and write them to CSV

X_sub = preprocess(test_df)[feats]
# NOTE(review): predict() yields hard class labels; if the leaderboard metric is
# rank-based (the model's eval_metric is AUC), probabilities may be wanted — confirm.
y_sub_pred = cat_clf.predict(X_sub)

# Single column named after the target, indexed like the test rows.
y_sub_pd = pd.DataFrame(data=y_sub_pred, index=X_sub.index, columns=[target])
y_sub_pd.to_csv("submission07.csv")
y_sub_pd.head()

Unnamed: 0_level_0,loan_paid_back
id,Unnamed: 1_level_1
404674,1.0
549728,1.0
125237,0.0
512666,1.0
101001,1.0


In [None]:
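# combos tried on the full data; each pair of numbers is train accuracy, then test accuracy
# (here the test rows are also part of the training data, so the second number is not a hold-out score)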
"""
    iterations=2000,
    learning_rate=0.03,
    depth=5,
0.9084164805441138
0.9083052749719417
    
    iterations=3000,
    learning_rate=0.03,
    depth=5,
0.9096117801546594
0.9093901982790872

    monotone_constraints=monotone_constraints,
    iterations=3000,
    learning_rate=0.03,
    depth=5,
result: not that good (accuracies were not recorded)

    iterations=3000,
    learning_rate=0.1,
    depth=5,
0.915052357489983
0.9145342312008978
"""

# combos on 85% data
"""
    iterations=3500,
    learning_rate=0.15,
    depth=5,
0.9205706815961127
0.9056677890011223

    iterations=2000,
    learning_rate=0.15,
    depth=5,
0.9162331317506008
0.9064721286943509

    iterations=2000,
    learning_rate=0.3,
    depth=5,
0.9217128370348853
0.9046389824167602

    iterations=3500,
    learning_rate=0.2,
    depth=5,
0.9236934534026989
0.9050130939019828

    iterations=3500,
    learning_rate=0.1,
    depth=5,
0.9174049964348905
0.9063224841002618

    iterations=3500,
    learning_rate=0.15,
    depth=5,
0.9205706815961127
0.9056677890011223

    iterations=4000,
    learning_rate=0.1,
    depth=5,
0.9183325851004833
0.9065095398428732
"""


# different combos tried on 0.15*0.8 = 12% of data
"""
    iterations=500,
    learning_rate=0.05,
    depth=5,
0.9113084252007781
0.9021324354657688

    iterations=2000,
    learning_rate=0.02,
    depth=5,
0.9142265675662193
0.9020576131687242

    iterations=2000,
    learning_rate=0.03,
    depth=5,
0.9187908415224223
0.9032547699214366

    iterations=2000,
    learning_rate=0.04,
    depth=5,
0.9221579288671622
0.9025813692480359

    iterations=2000,
    learning_rate=0.05,
    depth=4,
0.9191899037262433
0.9025065469509914

    iterations=2000,
    learning_rate=0.05,
    depth=5,
0.9252506609467751
0.9029554807332585

    iterations=3000,
    learning_rate=0.01,
    depth=5,
0.9126053773631965
0.9021324354657688

    iterations=3000,
    learning_rate=0.03,
    depth=5,
0.923529705192797
0.9033295922184811

    iterations=4000,
    learning_rate=0.025,
    depth=5,
0.9246520676410436
0.9020576131687242

    iterations=4000,
    learning_rate=0.03,
    depth=5,
0.9270464408639697
0.9030303030303031

    iterations=5000,
    learning_rate=0.01,
    depth=5,
0.9165959994014067
0.9029554807332585
"""
# depth 5 has been the best depth so far (only depths 4 and 5 were compared)
# best combo on this 12% subset looks like 3000 iterations with learning rate 0.03

In [None]:
def objective(trial):
    """Optuna objective: mean 5-fold cross-validated accuracy of a CatBoost model.

    Draws ``learning_rate``, ``depth``, ``l2_leaf_reg`` and ``subsample`` from
    ``trial``; every other setting is fixed. The folds are stratified splits of the
    module-level ``X_train`` / ``y_train``. In each fold the held-out part is used
    both as CatBoost's ``eval_set`` (early stopping / best-iteration selection) and
    for the reported accuracy, so the score is somewhat optimistic.

    Args:
        trial: the Optuna trial that supplies the hyperparameter suggestions.

    Returns:
        The mean accuracy over the five validation folds.
    """
    # Suggest hyperparameters
    params = {
        'iterations': 3000,
        # suggest_float(..., log=True) replaces the deprecated suggest_loguniform.
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1, log=True),
        'depth': trial.suggest_int('depth', 4, 10),
        'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 1, 10, log=True),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        #'colsample_bylevel': trial.suggest_float('colsample_bylevel', 0.6, 1.0),
        'loss_function': 'Logloss',
        'bootstrap_type': 'Bernoulli',
        'eval_metric': 'Accuracy',
        'random_seed': 42,
        # NOTE(review): the final-model cell has this option commented out, so the
        # search and the final fit use different class weighting — confirm intent.
        'auto_class_weights': 'Balanced',
        'early_stopping_rounds': 500,
        'verbose': 0,
        'use_best_model': True,
        "task_type": "GPU",
    }

    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    accuracies = []

    for train_idx, val_idx in skf.split(X_train, y_train):
        X_tr, X_val = X_train.iloc[train_idx], X_train.iloc[val_idx]
        y_tr, y_val = y_train.iloc[train_idx], y_train.iloc[val_idx]

        model = CatBoostClassifier(**params)
        model.fit(
            X_tr, y_tr,
            # `cat_columns` is not defined anywhere in the visible notebook (NameError);
            # use the nominal-column positions that the baseline model is fitted with.
            cat_features=cat_feat_indices,
            eval_set=(X_val, y_val)
        )

        y_pred = model.predict(X_val)
        accuracies.append(accuracy_score(y_val, y_pred))

    # Return mean accuracy across folds
    return np.mean(accuracies)


In [None]:
# Run the hyperparameter search. The sampler is created explicitly with a fixed seed
# so the sequence of suggested hyperparameters is repeatable across runs
# (model training itself may still vary slightly, e.g. on GPU).
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=50, show_progress_bar=True)  # increase n_trials for more exhaustive search

print("Best hyperparameters:", study.best_params)
print("Best CV accuracy:", study.best_value)


In [None]:
# Refit one model with the best hyperparameters from the study; the fixed settings
# below are added to (or override) the tuned values.
best_params = study.best_params
best_params.update({
    'iterations': 2000,
    'loss_function': 'Logloss',
    'eval_metric': 'Accuracy',
    'random_seed': 42,
    'bootstrap_type': 'Bernoulli',
   # 'auto_class_weights': 'Balanced',
    'early_stopping_rounds': 200,
    'verbose': 200,
    'use_best_model': True,
    'task_type': 'GPU'#if you use gpu
})

final_model = CatBoostClassifier(**best_params)
final_model.fit(
    X_train, y_train,
    # `cat_columns` is not defined anywhere in the visible notebook (NameError);
    # use the nominal-column positions that the baseline model is fitted with.
    cat_features=cat_feat_indices,
    # The hold-out set drives early stopping / best-iteration selection here.
    eval_set=(X_test, y_test)
)