# Load data and imports

In [1]:
import optuna
import pandas as pd
import numpy as np
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import OneHotEncoder
from catboost import CatBoostClassifier, Pool
import xgboost as xgb

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Training rows: the 'id' column is only a row identifier, so it is dropped
# right after loading.
train = pd.read_csv('./data/train.csv').drop(columns=['id'])

# Competition (test) rows: pull the 'id' column out of the frame and keep it
# as int32 so it can be attached to the predictions when the submission is built.
competition = pd.read_csv('./data/test.csv')
eval_ids = competition.pop('id').astype(np.int32)

# Data engineering

In [9]:
# Split the training frame into the feature matrix X (everything except the
# 'loan_status' column) and the target y, both re-indexed from 0.
X = train.drop(columns=['loan_status']).reset_index(drop=True)
y = train['loan_status'].reset_index(drop=True)

# Names of the numeric columns in the raw data.
num_cols = [
    'person_age',
    'person_income',
    'person_emp_length',
    'loan_amnt',
    'loan_int_rate',
    'loan_percent_income',
    'cb_person_cred_hist_length',
]

# Names of the categorical columns in the raw data.
# A comma was missing after 'loan_grade', so Python's implicit string-literal
# concatenation merged it with the next entry into the non-existent column name
# 'loan_gradecb_person_default_on_file'.
cat_cols = [
    'person_home_ownership',
    'loan_intent',
    'loan_grade',
    'cb_person_default_on_file',
]

def get_dummies(df):
    """Encode the categorical loan columns of ``df`` as numbers.

    * ``loan_grade`` (letters 'A'..'G') is replaced by the ordinal column
      ``loan_grade_numeric`` (1..7). Grades outside that mapping become NaN.
    * ``person_home_ownership``, ``loan_intent`` and
      ``cb_person_default_on_file`` are one-hot encoded with the first level
      of each dropped, and the resulting indicator columns are cast to int.

    The input frame is NOT modified; a new DataFrame is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the four categorical columns listed above.

    Returns
    -------
    pandas.DataFrame
        Encoded copy of ``df``.
    """
    loan_grade_mapping = {'A': 1, 'B': 2, 'C': 3, 'D': 4, 'E': 5, 'F': 6, 'G': 7}
    cat_cols_to_encode = ['person_home_ownership', 'loan_intent', 'cb_person_default_on_file']

    # Work on a copy: the previous version added/dropped columns on the
    # caller's frame in place before returning a different object, which left
    # the caller's original frame half-transformed.
    encoded = df.copy()
    encoded['loan_grade_numeric'] = encoded['loan_grade'].map(loan_grade_mapping)
    encoded = encoded.drop(columns=['loan_grade'])

    # NOTE(review): the dummy columns are derived from the levels present in
    # this particular frame, so encoding two frames separately can yield
    # different column sets -- align columns before feeding a model.
    encoded = pd.get_dummies(encoded, columns=cat_cols_to_encode, drop_first=True)

    # Depending on the pandas version the indicators may come back as bool;
    # cast those to 0/1 integers.
    bool_cols = encoded.select_dtypes(include='bool').columns
    encoded[bool_cols] = encoded[bool_cols].astype(int)
    return encoded

def data_eng(df):
    """Add engineered ratio/interaction features to ``df``.

    The new columns are written onto ``df`` in place, and ``df`` is also
    returned so the call can be chained or assigned.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``loan_amnt``, ``loan_int_rate``, ``person_income``,
        ``person_emp_length``, ``person_age``, ``cb_person_cred_hist_length``,
        ``loan_percent_income`` and ``loan_grade_numeric``.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in, with ten extra columns.

    Notes
    -----
    Denominators ``person_income``, ``person_age`` and
    ``cb_person_cred_hist_length`` are not guarded, so a zero in any of them
    yields ``inf``/``NaN`` in the derived column.
    """
    # Interest owed relative to income: loan amount * interest rate / income.
    # (loan_int_rate looks like a percentage in the sample data, e.g. 11.49 --
    # TODO confirm units.)
    df['dti'] = df['loan_amnt'] * df['loan_int_rate'] / df['person_income']
    # Loan-to-income ratio.
    df['lti'] = df['loan_amnt'] / df['person_income']
    # Same formula as 'lti'; kept because the column is part of the existing
    # feature set. NOTE(review): this is a duplicate feature.
    df['loan_amnt_person_income'] = df['loan_amnt'] / df['person_income']
    # Employment length as a fraction of age.
    df['emp_length_age_ratio'] = df['person_emp_length'] / df['person_age']
    # Loan amount per year of employment; +1 avoids dividing by zero.
    df['loan_amnt_emp_length'] = df['loan_amnt'] / (df['person_emp_length'] + 1)
    # Credit-history length as a fraction of age.
    df['cred_hist_age_ratio'] = df['cb_person_cred_hist_length'] / df['person_age']
    # Income scaled by the ordinal loan grade.
    df['income_loan_grade'] = df['person_income'] * df['loan_grade_numeric']
    # Loan amount * interest rate.
    df['annual_interest_burden'] = df['loan_amnt'] * df['loan_int_rate']
    # Loan amount / loan-percent-of-income; +0.001 avoids dividing by zero.
    df['loan_amnt_percent_income'] = df['loan_amnt'] / (df['loan_percent_income'] + 0.001)
    # Loan amount per unit of credit-history length.
    df['loan_amnt_cred_hist'] = df['loan_amnt'] / df['cb_person_cred_hist_length']
    return df


# Encode the categorical columns first: data_eng reads the
# 'loan_grade_numeric' column that get_dummies creates.
X = get_dummies(X)
# data_eng writes its new feature columns onto X in place, so the return
# value is not needed here.
data_eng(X)
# Last expression of the cell: display the engineered feature frame.
X

Unnamed: 0,person_age,person_income,person_emp_length,loan_amnt,loan_int_rate,loan_percent_income,cb_person_cred_hist_length,loan_grade_numeric,person_home_ownership_OTHER,person_home_ownership_OWN,...,dti,lti,loan_amnt_person_income,emp_length_age_ratio,loan_amnt_emp_length,cred_hist_age_ratio,income_loan_grade,annual_interest_burden,loan_amnt_percent_income,loan_amnt_cred_hist
0,37,35000,0.0,6000,11.49,0.17,14,2,0,0,...,1.969714,0.171429,0.171429,0.000000,6000.000000,0.378378,70000,68940.0,35087.719298,428.571429
1,22,56000,6.0,4000,13.35,0.07,2,3,0,1,...,0.953571,0.071429,0.071429,0.272727,571.428571,0.090909,168000,53400.0,56338.028169,2000.000000
2,29,28800,8.0,6000,8.90,0.21,10,1,0,1,...,1.854167,0.208333,0.208333,0.275862,666.666667,0.344828,28800,53400.0,28436.018957,600.000000
3,30,70000,14.0,12000,11.11,0.17,5,2,0,0,...,1.904571,0.171429,0.171429,0.466667,800.000000,0.166667,140000,133320.0,70175.438596,2400.000000
4,22,60000,2.0,6000,6.92,0.10,3,1,0,0,...,0.692000,0.100000,0.100000,0.090909,2000.000000,0.136364,60000,41520.0,59405.940594,2000.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
58640,34,120000,5.0,25000,15.95,0.21,10,4,0,0,...,3.322917,0.208333,0.208333,0.147059,4166.666667,0.294118,480000,398750.0,118483.412322,2500.000000
58641,28,28800,0.0,10000,12.73,0.35,8,3,0,0,...,4.420139,0.347222,0.347222,0.000000,10000.000000,0.285714,86400,127300.0,28490.028490,1250.000000
58642,23,44000,7.0,6800,16.00,0.15,2,4,0,0,...,2.472727,0.154545,0.154545,0.304348,850.000000,0.086957,176000,108800.0,45033.112583,3400.000000
58643,22,30000,2.0,5000,8.90,0.17,3,1,0,0,...,1.483333,0.166667,0.166667,0.090909,1666.666667,0.136364,30000,44500.0,29239.766082,1666.666667


# Train and validate model

In [None]:
# Baseline XGBoost classifier with the library's default hyperparameters.
model = xgb.XGBClassifier()
# The estimator was previously only constructed, never trained, so the
# predict_proba call in the submission cell would fail on a fresh kernel.
# Fit it on the engineered training features and the target.
model.fit(X, y);

# Generate submission.csv

In [6]:
# The competition rows must go through the same feature pipeline as the
# training features X; the raw frame still holds string categoricals and lacks
# the engineered columns, so it cannot be scored directly.
# A copy is passed so that re-running this cell starts from the raw frame again.
competition_features = get_dummies(competition.copy())
data_eng(competition_features)
# One-hot encoding each file separately can produce a different set/order of
# dummy columns; align to the training columns (absent indicators become 0).
competition_features = competition_features.reindex(columns=X.columns, fill_value=0)

# Probability of the positive class (second column of predict_proba).
to_eval_pred = model.predict_proba(competition_features)[:, 1]

# NOTE(review): the input files name these columns 'id' and 'loan_status';
# confirm that 'ID' / 'prediction' are the headers the submission format expects.
submission = pd.DataFrame({
    'ID': eval_ids,
    'prediction': to_eval_pred
})

submission.to_csv("submission.csv", index=False)