In [1]:
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
import pandas as pd
from itertools import product
import joblib  # To save the best model
from catboost import CatBoostClassifier


In [2]:
df_train = pd.read_csv("train.csv")
df_test = pd.read_csv("test.csv")

In [3]:
df_train = pd.get_dummies(df_train, columns=['person_home_ownership', 'loan_intent', 'cb_person_default_on_file'] ,drop_first=True)
df_test = pd.get_dummies(df_test, columns=['person_home_ownership', 'loan_intent', 'cb_person_default_on_file'], drop_first=True)

In [4]:
df_train.dtypes

id                               int64
person_age                       int64
person_income                    int64
person_emp_length              float64
loan_grade                      object
loan_amnt                        int64
loan_int_rate                  float64
loan_percent_income            float64
cb_person_cred_hist_length       int64
loan_status                      int64
person_home_ownership_OTHER       bool
person_home_ownership_OWN         bool
person_home_ownership_RENT        bool
loan_intent_EDUCATION             bool
loan_intent_HOMEIMPROVEMENT       bool
loan_intent_MEDICAL               bool
loan_intent_PERSONAL              bool
loan_intent_VENTURE               bool
cb_person_default_on_file_Y       bool
dtype: object

In [5]:
# Ordinal Encoding for loan_grade
loan_grade_mapping = {'A': 1, 'B': 2, 'C': 3, 'D': 4, 'E': 5, 'F': 6, 'G': 7}
df_train['loan_grade_numeric'] = df_train['loan_grade'].map(loan_grade_mapping)
df_test['loan_grade_numeric'] = df_test['loan_grade'].map(loan_grade_mapping)

In [6]:
df_train = df_train.drop(columns=['loan_grade'], axis=1)
df_test = df_test.drop(columns=['loan_grade'], axis=1)

In [7]:
# Feature Engineering
df_train['dti'] = df_train['loan_int_rate'] * df_train['loan_amnt']  / df_train['person_income']  # Debt-to-Income Ratio
df_train['ltv'] = df_train['loan_amnt'] / df_train['person_income']  # Loan-to-Income Ratio (if no collateral)
df_train['loan_amnt_person_income'] = df_train['loan_amnt'] / df_train['person_income']  # Loan Amount / Income
df_train['emp_length_age_ratio'] = df_train['person_emp_length'] / df_train['person_age']  # Employment Length / Age
df_train['loan_amnt_emp_length'] = df_train['loan_amnt'] / (df_train['person_emp_length'] + 1)  # Loan Amount / Employment Length
df_train['cred_hist_age_ratio'] = df_train['cb_person_cred_hist_length'] / df_train['person_age']  # Credit History Length / Age
df_train['income_loan_grade'] = df_train['person_income'] * df_train['loan_grade_numeric']  # Income * Loan Grade
df_train['annual_interest_burden'] = df_train['loan_amnt'] * df_train['loan_int_rate']  # Loan Amount * Interest Rate
df_train['loan_amnt_percent_income'] = df_train['loan_amnt'] / (df_train['loan_percent_income'] + 0.001)   # Loan Amount / Loan Percent Income
df_train['loan_amnt_cred_hist'] = df_train['loan_amnt'] / df_train['cb_person_cred_hist_length']  # Loan Amount / Credit History Length

# Feature Engineering
df_test['dti'] = df_test['loan_int_rate'] * df_test['loan_amnt']  / df_test['person_income']  # Debt-to-Income Ratio
df_test['ltv'] = df_test['loan_amnt'] / df_test['person_income']  # Loan-to-Income Ratio (if no collateral)
df_test['loan_amnt_person_income'] = df_test['loan_amnt'] / df_test['person_income']  # Loan Amount / Income
df_test['emp_length_age_ratio'] = df_test['person_emp_length'] / df_test['person_age']  # Employment Length / Age
df_test['loan_amnt_emp_length'] = df_test['loan_amnt'] / (df_test['person_emp_length'] + 1)  # Loan Amount / Employment Length
df_test['cred_hist_age_ratio'] = df_test['cb_person_cred_hist_length'] / df_test['person_age']  # Credit History Length / Age
df_test['income_loan_grade'] = df_test['person_income'] * df_test['loan_grade_numeric']  # Income * Loan Grade
df_test['annual_interest_burden'] = df_test['loan_amnt'] * df_test['loan_int_rate']  # Loan Amount * Interest Rate
df_test['loan_amnt_percent_income'] = df_test['loan_amnt'] / (df_test['loan_percent_income'] + 0.001)  # Loan Amount / Loan Percent Income
df_test['loan_amnt_cred_hist'] = df_test['loan_amnt'] / df_test['cb_person_cred_hist_length']  # Loan Amount / Credit History Length

In [9]:
# Define the columns to create interactions
columns = ['person_age', 'person_income', 'person_emp_length','loan_amnt', 'loan_int_rate', 'loan_percent_income', 'cb_person_cred_hist_length']

# Loop over the selected columns to create interaction features
for i, col1 in enumerate(columns):
    for col2 in columns[i+1:]:
        # Create new columns by multiplying
        df_train[f'{col1}_x_{col2}'] = df_train[col1] * df_train[col2]
        df_test[f'{col1}_x_{col2}'] = df_test[col1] * df_test[col2]
        # Create new columns by dividing (to avoid division by zero, add a small constant if needed)
        df_train[f'{col1}_div_{col2}'] = df_train[col1] / (df_train[col2] + 1e-10)  # Preventing division by zero
        df_test[f'{col1}_div_{col2}'] = df_test[col1] / (df_test[col2] + 1e-10)

In [10]:
df_train.dtypes

id                                                      int64
person_age                                              int64
person_income                                           int64
person_emp_length                                     float64
loan_amnt                                               int64
                                                       ...   
loan_int_rate_div_loan_percent_income                 float64
loan_int_rate_x_cb_person_cred_hist_length            float64
loan_int_rate_div_cb_person_cred_hist_length          float64
loan_percent_income_x_cb_person_cred_hist_length      float64
loan_percent_income_div_cb_person_cred_hist_length    float64
Length: 66, dtype: object

In [8]:
# Split the data into features (X) and target (y)
X = df_train.drop(columns=['loan_status', 'id'], axis=1)  # Features (drop the target column and 'id')
y = df_train['loan_status']                               # Target

In [9]:
import xgboost as xgb
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold
import pandas as pd
import json
import optuna

# Function to optimize using Optuna
def objective(trial, X, y):
    # Define the hyperparameter search space
    param = {
        'tree_method': 'hist',  # Using GPU
        'device': 'cuda',
        'objective': 'binary:logistic',
        'eval_metric': 'auc',
        'lambda': trial.suggest_float('lambda', 1e-3, 10.0, log=True),
        'alpha': trial.suggest_float('alpha', 1e-3, 10.0, log=True),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0)
    }

    # Define number of boosting rounds (equivalent to n_estimators)
    num_boost_round = trial.suggest_int('num_boost_round', 500, 2000)

    # Stratified K-Folds cross-validator
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    aucs = []

    for train_index, test_index in skf.split(X, y):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        dtrain = xgb.DMatrix(X_train, label=y_train)
        dtest = xgb.DMatrix(X_test, label=y_test)

        # Train model
        model = xgb.train(param, dtrain, num_boost_round=num_boost_round)

        # Predict and calculate ROC AUC score
        preds = model.predict(dtest)
        auc = roc_auc_score(y_test, preds)
        aucs.append(auc)

    return sum(aucs) / len(aucs)

# Run the optimization
def optimize_xgboost(X, y, n_trials, filename):
    study = optuna.create_study(direction='maximize')
    study.optimize(lambda trial: objective(trial, X, y), n_trials=100)

    # Save the study results (hyperparameters and AUC)
    best_params = study.best_trial.params
    results = study.trials_dataframe()

    # Save to CSV
    results.to_csv('optuna_xgboost_results.csv', index=False)

    # Save best parameters to JSON
    with open(filename, 'w') as f:
        json.dump(best_params, f)

    return best_params, study.best_value

# Example usage:
# Assuming X and y are your feature matrix and labels
best_params, best_auc = optimize_xgboost(X, y, 100, "best_params.json")
print(f"Best params: {best_params}, Best AUC: {best_auc}")

  from .autonotebook import tqdm as notebook_tqdm
[I 2024-10-04 21:13:32,954] A new study created in memory with name: no-name-0c051be0-90e1-4ad0-a5a9-df8f9329dafd
[I 2024-10-04 21:14:00,685] Trial 0 finished with value: 0.9428042453714701 and parameters: {'lambda': 0.001038113224886185, 'alpha': 2.968551754365886, 'learning_rate': 0.19371341698010308, 'max_depth': 9, 'min_child_weight': 7, 'subsample': 0.6482215701253844, 'colsample_bytree': 0.7760057288301152, 'num_boost_round': 1022}. Best is trial 0 with value: 0.9428042453714701.
[I 2024-10-04 21:14:38,856] Trial 1 finished with value: 0.9393745345574882 and parameters: {'lambda': 0.7164031101404582, 'alpha': 2.6346978302296393, 'learning_rate': 0.26387508456925635, 'max_depth': 7, 'min_child_weight': 1, 'subsample': 0.5198286847047855, 'colsample_bytree': 0.7874806619874197, 'num_boost_round': 1677}. Best is trial 0 with value: 0.9428042453714701.
[I 2024-10-04 21:14:56,092] Trial 2 finished with value: 0.9473614834155132 and par

KeyboardInterrupt: 

In [34]:
#RE TRAIN BEST MODEL
best_params.update({
    'tree_method': 'hist',  # Using GPU
    'device': 'cuda',
    'objective': 'binary:logistic',
    'eval_metric': 'auc'
})

num_boost_round = best_params.pop('num_boost_round', None)
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
aucs = []

for train_index, test_index in skf.split(X, y):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    dtrain = xgb.DMatrix(X_train, label=y_train)
    dtest = xgb.DMatrix(X_test, label=y_test)

    # Train model
    model = xgb.train(best_params, dtrain, num_boost_round=num_boost_round)

    # Predict and calculate ROC AUC score
    preds = model.predict(dtest)
    auc = roc_auc_score(y_test, preds)
    aucs.append(auc)

sum(aucs) / len(aucs)

0.9586547632441647

In [35]:
df_test_ids = df_test['id']

In [43]:
dtest = xgb.DMatrix(df_test.drop(columns=['id'], axis=1))

In [45]:
y_pred = model.predict(dtest)

In [61]:
submission = pd.DataFrame({'id': df_test_ids.values,         
                            'prediction': y_pred})

In [62]:
submission.to_csv("submission_baselinexgboost_0.9586.csv", index=False)

In [63]:
submission

Unnamed: 0,id,prediction
0,58645,0.998867
1,58646,0.007398
2,58647,0.738488
3,58648,0.008945
4,58649,0.076600
...,...,...
39093,97738,0.041863
39094,97739,0.002184
39095,97740,0.010456
39096,97741,0.450214
