In [None]:
# This notebook follows the approach suggested in this ChatGPT conversation:
# https://chatgpt.com/share/693db66a-4118-8012-91a5-7db54a2aade9

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

# Labelled training data and the unlabelled test set used for the submission.
# In both files the first CSV column becomes the DataFrame index.
loan_df, test_df = (
    pd.read_csv('data/train.csv', index_col=0),
    pd.read_csv('data/test.csv', index_col=0),
)

In [2]:
# Column roles: the last column of the training frame is the target and
# every column before it is a feature.
cols = loan_df.columns
feats, target = cols[:-1], cols[-1]

# Feature groups by how they are encoded downstream.
ordinal_cols = ['grade_subgrade', 'education_level']
nominal_cols = ['gender', 'marital_status', 'employment_status', 'loan_purpose']
numeric_cols = ['annual_income', 'debt_to_income_ratio', 'credit_score', 'loan_amount', 'interest_rate']

# Positional indices of the nominal features, passed to CatBoost as
# `cat_features` when fitting.
cat_feat_indices = list(map(cols.get_loc, nominal_cols))
# -> [5, 6, 8, 9]

In [3]:
# Stratified 85/15 split of the labelled data (stratified on the target),
# with a fixed seed so the split is reproducible.
loan_train, loan_test = train_test_split(
    loan_df,
    train_size=0.85,
    stratify=loan_df[target],
    random_state=42,
)
loan_train.shape

(302936, 12)

In [4]:
# Integer encodings for the two ordinal features.

# Sub-grades run A1, A2, ..., A5, B1, ..., G5 and are numbered 1..35 in that order.
grades = 'ABCDEFG'
grade_levels = '12345'
subgrades = [letter + digit for letter in grades for digit in grade_levels]
grade_mapping = dict(zip(subgrades, range(1, len(subgrades) + 1)))

# Education levels ranked from lowest (1) to highest (5).
edu_mapping = {
    level: rank
    for rank, level in enumerate(
        ['High School', 'Other', "Bachelor's", "Master's", 'PhD'],
        start=1,
    )
}

# Per-feature monotonic direction: -1 or +1 for the listed features, 0
# (unconstrained) for every feature not listed.
monotone_constraints = dict(
    grade_subgrade=-1,
    # education_level=1,
    # annual_income=1,
    debt_to_income_ratio=-1,
    credit_score=1,
    interest_rate=-1,
)
# Expand the lookup into a list aligned with the order of `feats`.
monotone_constraints = [monotone_constraints.get(name, 0) for name in feats]

def preprocess(data):
    """Return a copy of `data` with the ordinal columns integer-encoded.

    `education_level` is mapped through `edu_mapping` and `grade_subgrade`
    through `grade_mapping`; all other columns are passed through unchanged.
    The input frame is not modified. Values missing from a mapping become
    NaN (pandas `Series.map` semantics for dict arguments).
    """
    return data.assign(
        education_level=data['education_level'].map(edu_mapping),
        grade_subgrade=data['grade_subgrade'].map(grade_mapping),
    )

In [5]:
# no issues so far
# for feat in ordinal_cols+nominal_cols:
#     print(X[feat].value_counts())

In [6]:
# Fit on the training split only, so that loan_test stays a genuine hold-out.
# loan_test is a subset of loan_df, so fitting on loan_df would put every
# hold-out row into the training data and the hold-out accuracy would no
# longer estimate generalisation.
# NOTE(review): once hyperparameters are fixed, the final submission model
# can be refit on all of loan_df -- but then ignore the hold-out score.
X_train = preprocess(loan_train)[feats]
X_test = preprocess(loan_test)[feats]
y_train = loan_train[target]
y_test = loan_test[target]

In [7]:
from catboost import CatBoostClassifier

# Binary classifier trained with log-loss; AUC is the reported eval metric.
cat_clf = CatBoostClassifier(
    loss_function='Logloss',
    eval_metric='AUC',
    iterations=4000,
    learning_rate=0.1,
    depth=5,
    # monotone_constraints=monotone_constraints,
    task_type="CPU",
    random_state=42,
    verbose=100,  # log a progress line every 100 iterations
)

In [8]:
# Train, telling CatBoost which feature positions are categorical.
cat_clf.fit(X_train, y_train, cat_features=cat_feat_indices)

# Class predictions for both splits.
train_y_pred, test_y_pred = cat_clf.predict(X_train), cat_clf.predict(X_test)

# Accuracy on the training data first, then on the hold-out data.
print(
    accuracy_score(y_train, train_y_pred),
    accuracy_score(y_test, test_y_pred),
    sep='\n',
)

0:	total: 423ms	remaining: 28m 11s
100:	total: 31.6s	remaining: 20m 18s
200:	total: 1m 16s	remaining: 24m 5s
300:	total: 2m 1s	remaining: 24m 49s


KeyboardInterrupt: 

In [175]:
# Submission: encode the test set the same way as the training data, predict
# class labels, and write them to CSV keyed by the test set's index.
X_sub = preprocess(test_df)[feats]
y_sub_pred = cat_clf.predict(X_sub)

y_sub_pd = pd.DataFrame(
    data=y_sub_pred,
    index=X_sub.index,
    columns=[target],
)
y_sub_pd.to_csv("submission07.csv")
y_sub_pd.head()

Unnamed: 0_level_0,loan_paid_back
id,Unnamed: 1_level_1
404674,1.0
549728,1.0
125237,0.0
512666,1.0
101001,1.0


In [None]:
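# Hyperparameter combos fitted on the full data.
# The two numbers under each combo are presumably train accuracy, then hold-out
# accuracy (the order the fit cell prints them); here the hold-out rows were
# also part of the fit, so the second number is not an unbiased estimate.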
"""
    iterations=2000,
    learning_rate=0.03,
    depth=5,
0.9084164805441138
0.9083052749719417
    
    iterations=3000,
    learning_rate=0.03,
    depth=5,
0.9096117801546594
0.9093901982790872

    monotone_constraints=monotone_constraints,
    iterations=3000,
    learning_rate=0.03,
    depth=5,
isn't that good

    iterations=3000,
    learning_rate=0.1,
    depth=5,
0.915052357489983
0.9145342312008978
"""

# combos on 85% data
"""
    iterations=3500,
    learning_rate=0.15,
    depth=5,
0.9205706815961127
0.9056677890011223

            iterations=2000,
            learning_rate=0.15,
            depth=5,
        0.9162331317506008
        0.9064721286943509

    iterations=2000,
    learning_rate=0.3,
    depth=5,
0.9217128370348853
0.9046389824167602

    iterations=3500,
    learning_rate=0.2,
    depth=5,
0.9236934534026989
0.9050130939019828

            iterations=3500,
            learning_rate=0.1,
            depth=5,
        0.9174049964348905
        0.9063224841002618

    iterations=3500,
    learning_rate=0.15,
    depth=5,
0.9205706815961127
0.9056677890011223

            iterations=4000,
            learning_rate=0.1,
            depth=5,
        0.9183325851004833
        0.9065095398428732
"""


# different combos tried on 0.15*0.8 = 12% of data
"""
    iterations=500,
    learning_rate=0.05,
    depth=5,
0.9113084252007781
0.9021324354657688

    iterations=2000,
    learning_rate=0.02,
    depth=5,
0.9142265675662193
0.9020576131687242

    iterations=2000,
    learning_rate=0.03,
    depth=5,
0.9187908415224223
0.9032547699214366

    iterations=2000,
    learning_rate=0.04,
    depth=5,
0.9221579288671622
0.9025813692480359

    iterations=2000,
    learning_rate=0.05,
    depth=4,
0.9191899037262433
0.9025065469509914

    iterations=2000,
    learning_rate=0.05,
    depth=5,
0.9252506609467751
0.9029554807332585

    iterations=3000,
    learning_rate=0.01,
    depth=5,
0.9126053773631965
0.9021324354657688

    iterations=3000,
    learning_rate=0.03,
    depth=5,
0.923529705192797
0.9033295922184811

    iterations=4000,
    learning_rate=0.025,
    depth=5,
0.9246520676410436
0.9020576131687242

    iterations=4000,
    learning_rate=0.03,
    depth=5,
0.9270464408639697
0.9030303030303031

    iterations=5000,
    learning_rate=0.01,
    depth=5,
0.9165959994014067
0.9029554807332585
"""
# depth 5 is the best so far
# the best combo seems to be 3000 iterations with a learning rate of 0.03