In [26]:
import pandas as pd
import warnings
warnings.filterwarnings('ignore', category=UserWarning, module='xgboost')

In [27]:
# Load the labelled training rows and the unlabelled test rows from CSV.
# Both paths are relative to the current working directory.
train = pd.read_csv('kaggle/input/loan-payback/train.csv')
test = pd.read_csv('kaggle/input/loan-payback/test.csv')

In [28]:
# Preview the first rows of the training frame to check column names and value formats.
train.head()

Unnamed: 0,id,annual_income,debt_to_income_ratio,credit_score,loan_amount,interest_rate,gender,marital_status,education_level,employment_status,loan_purpose,grade_subgrade,loan_paid_back
0,0,29367.99,0.084,736,2528.42,13.67,Female,Single,High School,Self-employed,Other,C3,1.0
1,1,22108.02,0.166,636,4593.1,12.92,Male,Married,Master's,Employed,Debt consolidation,D3,0.0
2,2,49566.2,0.097,694,17005.15,9.76,Male,Single,High School,Employed,Debt consolidation,C5,1.0
3,3,46858.25,0.065,533,4682.48,16.1,Female,Single,High School,Employed,Debt consolidation,F1,1.0
4,4,25496.7,0.053,665,12184.43,10.21,Male,Married,High School,Employed,Other,D1,1.0


In [29]:
# Count missing values per column; the recorded output shows zero for every column,
# so no imputation step follows.
train.isnull().sum()

id                      0
annual_income           0
debt_to_income_ratio    0
credit_score            0
loan_amount             0
interest_rate           0
gender                  0
marital_status          0
education_level         0
employment_status       0
loan_purpose            0
grade_subgrade          0
loan_paid_back          0
dtype: int64

In [30]:
# Summary statistics (count, mean, std, quartiles, min/max) for the numeric columns.
train.describe()

Unnamed: 0,id,annual_income,debt_to_income_ratio,credit_score,loan_amount,interest_rate,loan_paid_back
count,593994.0,593994.0,593994.0,593994.0,593994.0,593994.0,593994.0
mean,296996.5,48212.202976,0.120696,680.916009,15020.297629,12.356345,0.79882
std,171471.442236,26711.942078,0.068573,55.424956,6926.530568,2.008959,0.400883
min,0.0,6002.43,0.011,395.0,500.09,3.2,0.0
25%,148498.25,27934.4,0.072,646.0,10279.62,10.99,1.0
50%,296996.5,46557.68,0.096,682.0,15000.22,12.37,1.0
75%,445494.75,60981.32,0.156,719.0,18858.58,13.68,1.0
max,593993.0,393381.74,0.627,849.0,48959.95,20.99,1.0


In [31]:
# Quantitative (numeric) input columns that the interaction features are built from.
quantitative_features = [
    'annual_income',
    'debt_to_income_ratio',
    'credit_score',
    'loan_amount',
    'interest_rate',
]

def create_interaction_features(df):
    """Return a copy of ``df`` with eight engineered ratio/product columns appended.

    The input frame is not modified. It must contain the columns
    ``annual_income``, ``debt_to_income_ratio``, ``credit_score``,
    ``loan_amount`` and ``interest_rate``.

    New columns, in the order they are added:
    ``income_to_loan_ratio``, ``monthly_payment_estimate``,
    ``credit_to_loan_ratio``, ``risk_score``, ``monthly_income``,
    ``affordability_ratio``, ``credit_income_score``, ``debt_burden``.
    """
    enriched = df.copy()

    income = enriched['annual_income']
    dti = enriched['debt_to_income_ratio']
    credit = enriched['credit_score']
    loan = enriched['loan_amount']
    rate = enriched['interest_rate']

    # Shared intermediates: one year of simple interest spread over 12 months,
    # and income expressed per month.
    monthly_payment = (loan * rate / 100) / 12
    monthly_income = income / 12

    # The "+ 1" in each denominator keeps the ratio finite when the base value is 0.
    engineered = {
        # Income-related ratios
        'income_to_loan_ratio': income / (loan + 1),
        'monthly_payment_estimate': monthly_payment,
        # Credit and risk interactions
        'credit_to_loan_ratio': credit / (loan + 1),
        'risk_score': dti * rate,
        # Affordability metrics
        'monthly_income': monthly_income,
        'affordability_ratio': monthly_income / (monthly_payment + 1),
        # Composite scores
        'credit_income_score': credit * (income / 100000),
        'debt_burden': income * dti,
    }
    for column_name, values in engineered.items():
        enriched[column_name] = values

    return enriched

# Add the engineered columns to both frames. Each call works on a copy of its
# input, so the names are rebound to the enriched frames.
train = create_interaction_features(train)
test = create_interaction_features(test)

In [32]:
# Create encoded features
from sklearn.preprocessing import LabelEncoder

# Remember where the training rows end so the stacked frame can be split back
# by position. This replaces the earlier approach of writing an all-None
# `loan_paid_back` column into `test` and splitting on its nulls, which
# mutated `test` and depended on a sentinel value.
n_train_rows = len(train)

# Stack train and test so every encoder sees the categories of both frames.
# A `loan_paid_back` column left on `test` by a previous run (e.g. predictions)
# is dropped first so it cannot change the dtype of the training target.
combined = pd.concat(
    [train, test.drop(columns=['loan_paid_back'], errors='ignore')],
    sort=False,
    ignore_index=True,
)

# Encode categorical columns: one LabelEncoder per column, kept under its own
# name so the integer codes can be mapped back to labels later.
le_gender = LabelEncoder()
le_marital_status = LabelEncoder()
le_education_level = LabelEncoder()
le_employment_status = LabelEncoder()
le_loan_purpose = LabelEncoder()
le_grade_subgrade = LabelEncoder()

encoders = {
    'gender': le_gender,
    'marital_status': le_marital_status,
    'education_level': le_education_level,
    'employment_status': le_employment_status,
    'loan_purpose': le_loan_purpose,
    'grade_subgrade': le_grade_subgrade,
}
for column, encoder in encoders.items():
    # Each source column gets a sibling `<column>_enc` holding integer codes.
    combined[f'{column}_enc'] = encoder.fit_transform(combined[column])

# Split back into train and test by row position; the test rows carry no
# target, so the (all-NaN) `loan_paid_back` column is removed from them.
train = combined.iloc[:n_train_rows].copy()
test = combined.iloc[n_train_rows:].drop(columns=['loan_paid_back'])

  combined = pd.concat([train, test], sort=False, ignore_index=True)


In [33]:
# Columns that must not be fed to the model: the row id, the target, and the
# raw categorical columns (their `_enc` counterparts are used instead).
non_feature_columns = [
    'id',
    'loan_paid_back',
    'gender',
    'marital_status',
    'education_level',
    'employment_status',
    'loan_purpose',
    'grade_subgrade',
]
features = train.columns.drop(non_feature_columns).tolist()
features

['annual_income',
 'debt_to_income_ratio',
 'credit_score',
 'loan_amount',
 'interest_rate',
 'income_to_loan_ratio',
 'monthly_payment_estimate',
 'credit_to_loan_ratio',
 'risk_score',
 'monthly_income',
 'affordability_ratio',
 'credit_income_score',
 'debt_burden',
 'gender_enc',
 'marital_status_enc',
 'education_level_enc',
 'employment_status_enc',
 'loan_purpose_enc',
 'grade_subgrade_enc']

In [34]:
# Training feature matrix, integer-cast target, and the feature matrix for the
# unlabelled test rows (same columns, same order).
X = train[features]
y = train['loan_paid_back'].astype(int)
X_test = test[features]

In [None]:
#feature selection
from sklearn.feature_selection import SequentialFeatureSelector
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split

# X = your features DataFrame, y = your target
# Hold out 20% of the labelled rows to score the selected subset. The hold-out
# is named X_valid / y_valid so it does not overwrite `X_test`, which holds the
# features of the unlabelled test rows.
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)

model = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)

# Backward elimination: start from all features and repeatedly remove one,
# scoring each candidate subset by 10-fold cross-validated accuracy.
sfs = SequentialFeatureSelector(
    model,
    n_features_to_select='auto',
    direction='backward',
    scoring='accuracy',
    cv=10,
    n_jobs=-1
)
sfs.fit(X_train, y_train)

# Get the best feature subset
selected_features = X_train.columns[sfs.get_support()]
print("Best features:", list(selected_features))

# Evaluate on the hold-out split (the printed label is kept as "Test accuracy"
# so it matches previously recorded output).
model.fit(X_train[selected_features], y_train)
print("Test accuracy:", model.score(X_valid[selected_features], y_valid))

# Best features: ['debt_to_income_ratio', 'credit_score', 'loan_amount', 'risk_score', 'monthly_income', 'marital_status_enc', 'education_level_enc', 'employment_status_enc', 'loan_purpose_enc', 'grade_subgrade_enc']


Best features: ['debt_to_income_ratio', 'credit_score', 'loan_amount', 'risk_score', 'monthly_income', 'marital_status_enc', 'education_level_enc', 'employment_status_enc', 'loan_purpose_enc', 'grade_subgrade_enc']
Test accuracy: 0.9044099697808904


In [None]:
from sklearn.model_selection import RandomizedSearchCV

# Rebuild the modelling matrices using only the columns kept by feature selection.
features = selected_features
X = train[features]
X_test = test[features]
y = train['loan_paid_back'].astype(int)

# Base estimator; the searched hyper-parameters below are applied on top of
# these fixed settings.
xgb_clf = XGBClassifier(
    random_state=42,
    eval_metric='logloss',     # or 'error' for accuracy
    use_label_encoder=False,   # suppresses warning if using older XGBoost
    tree_method='hist',        # speeds up training, optional
    enable_categorical=True,   # if using categoricals
)

# Candidate values sampled by the randomized search.
xgb_param_dist = dict(
    n_estimators=[300, 500, 700, 1000],
    learning_rate=[0.01, 0.02, 0.03, 0.05, 0.1],
    max_depth=[3, 4, 5, 6, 7, 8],
    min_child_weight=[1, 3, 5, 7],
    subsample=[0.6, 0.8, 1.0],
    colsample_bytree=[0.6, 0.8, 1.0],
    gamma=[0, 0.1, 0.2, 0.5],
    reg_alpha=[0, 0.01, 0.1, 1],
    reg_lambda=[0.5, 1, 2, 5],
)

# 100 sampled configurations, each scored by 5-fold cross-validated accuracy.
xgb_search = RandomizedSearchCV(
    xgb_clf,
    xgb_param_dist,
    n_iter=100,                # increase for a more thorough search
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
    random_state=42,
)
xgb_search.fit(X, y)

print('Best parameters:', xgb_search.best_params_)
print('Best cross-validation score:', xgb_search.best_score_)

# Best parameters: {'subsample': 1.0, 'reg_lambda': 5, 'reg_alpha': 0, 'n_estimators': 1000, 'min_child_weight': 7, 'max_depth': 4, 'learning_rate': 0.05, 'gamma': 0.1, 'colsample_bytree': 0.8}


Best parameters: {'subsample': 1.0, 'reg_lambda': 5, 'reg_alpha': 0, 'n_estimators': 1000, 'min_child_weight': 7, 'max_depth': 4, 'learning_rate': 0.05, 'gamma': 0.1, 'colsample_bytree': 0.8}
Best cross-validation score: 0.9051168854402427


In [38]:
# Refit an XGBoost classifier on all labelled rows using the hyper-parameters
# chosen by the randomized search.
xgb_model = XGBClassifier(**xgb_search.best_params_)
xgb_model.fit(X, y)

# Predict with the tuned model fitted just above. This previously called
# `model.predict`, i.e. the default-parameter classifier from the
# feature-selection cell, so the tuned model was never used for the submission.
test['loan_paid_back'] = xgb_model.predict(X_test)
# NOTE(review): the training target is stored as 0.0/1.0, so this cast writes
# True/False into the submission — confirm that is the expected format.
test['loan_paid_back'] = test['loan_paid_back'].astype(bool) # convert back

In [39]:
# Keep only the row id and the predicted label, then write them to CSV
# without the DataFrame index.
submission_columns = ['id', 'loan_paid_back']
submission = test.loc[:, submission_columns]
submission.to_csv('loan_payback_submission.csv', index=False)