In [1]:
import pandas as pd
import numpy as np

# Machine Learning libraries
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.metrics import precision_score, recall_score, classification_report
from sklearn.preprocessing import StandardScaler

# SMOTE for handling imbalanced datasets
from imblearn.over_sampling import SMOTE

In [3]:
# -------------------------------
# Step 1: Data Loading & Preprocessing
# -------------------------------

# Load dataset (update the path to where your CSV is stored)
data = pd.read_csv('/kaggle/input/GiveMeSomeCredit/cs-training.csv')

# Check for missing values (example for MonthlyIncome and NumberOfDependents)
print("Missing values before imputation:\n", data[['MonthlyIncome', 'NumberOfDependents']].isnull().sum())

# Impute missing values with the median
data['MonthlyIncome'].fillna(data['MonthlyIncome'].median(), inplace=True)
data['NumberOfDependents'].fillna(data['NumberOfDependents'].median(), inplace=True)


Missing values before imputation:
 MonthlyIncome         29731
NumberOfDependents     3924
dtype: int64


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['MonthlyIncome'].fillna(data['MonthlyIncome'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['NumberOfDependents'].fillna(data['NumberOfDependents'].median(), inplace=True)


In [9]:

# -------------------------------
# Step 2: Feature Engineering
# -------------------------------
# Example: Create a new feature "Income_per_Open_Credit"
data['Income_per_Open_Credit'] = data['MonthlyIncome'] / (data['NumberOfOpenCreditLinesAndLoans'] + 1)

# You could add other engineered features as needed

# Define features and target variable
X = data.drop(columns=['SeriousDlqin2yrs', 'Unnamed: 0'])
y = data['SeriousDlqin2yrs']


In [12]:
# -------------------------------
# Step 3: Train/Test Split and SMOTE
# -------------------------------
# Split the data into training and testing sets with stratification
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Apply SMOTE only on the training data to balance the classes
smote = SMOTE(random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train, y_train)

# Standardize the features
scaler = StandardScaler()
X_train_res_scaled = scaler.fit_transform(X_train_res)
X_test_scaled = scaler.transform(X_test)


In [13]:
# -------------------------------
# Step 4: Model Training and Evaluation
# -------------------------------
# Define the models
models = {
    'RandomForest': RandomForestClassifier(random_state=42),
    'GradientBoosting': GradientBoostingClassifier(random_state=42),
    'XGBoost': XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
}

# Train and evaluate each model
for name, model in models.items():
    model.fit(X_train_res_scaled, y_train_res)
    y_pred = model.predict(X_test_scaled)
    
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    
    print(f"--- {name} ---")
    print(f"Precision: {precision:.4f}")
    print(f"Recall:    {recall:.4f}")
    print(classification_report(y_test, y_pred))
    print("\n")


--- RandomForest ---
Precision: 0.3139
Recall:    0.4244
              precision    recall  f1-score   support

           0       0.96      0.93      0.95     27995
           1       0.31      0.42      0.36      2005

    accuracy                           0.90     30000
   macro avg       0.64      0.68      0.65     30000
weighted avg       0.91      0.90      0.91     30000



--- GradientBoosting ---
Precision: 0.2787
Recall:    0.5855
              precision    recall  f1-score   support

           0       0.97      0.89      0.93     27995
           1       0.28      0.59      0.38      2005

    accuracy                           0.87     30000
   macro avg       0.62      0.74      0.65     30000
weighted avg       0.92      0.87      0.89     30000



--- XGBoost ---
Precision: 0.2773
Recall:    0.4843
              precision    recall  f1-score   support

           0       0.96      0.91      0.93     27995
           1       0.28      0.48      0.35      2005

    accu

In [None]:
# -------------------------------
# Step 5: Hyperparameter Tuning for All Models
# -------------------------------

# ----- Random Forest Hyperparameter Tuning -----

rf_param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2],
}

rf = RandomForestClassifier(random_state=42)
rf_grid = GridSearchCV(rf, rf_param_grid, cv=5, scoring='recall', n_jobs=-1)
rf_grid.fit(X_train_res_scaled, y_train_res)

print("Best parameters for Random Forest:", rf_grid.best_params_)
rf_best = rf_grid.best_estimator_
y_pred_rf = rf_best.predict(X_test_scaled)

print("\n--- Random Forest Performance ---")
print("Precision:", precision_score(y_test, y_pred_rf))
print("Recall:   ", recall_score(y_test, y_pred_rf))
print(classification_report(y_test, y_pred_rf))

In [None]:
# ----- Gradient Boosting Hyperparameter Tuning -----

gb_param_grid = {
    'n_estimators': [100, 200],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 5, 7],
    'subsample': [0.8, 1.0]
}

gb = GradientBoostingClassifier(random_state=42)
gb_grid = GridSearchCV(gb, gb_param_grid, cv=5, scoring='recall', n_jobs=-1)
gb_grid.fit(X_train_res_scaled, y_train_res)

print("\nBest parameters for Gradient Boosting:", gb_grid.best_params_)
gb_best = gb_grid.best_estimator_
y_pred_gb = gb_best.predict(X_test_scaled)

print("\n--- Gradient Boosting Performance ---")
print("Precision:", precision_score(y_test, y_pred_gb))
print("Recall:   ", recall_score(y_test, y_pred_gb))
print(classification_report(y_test, y_pred_gb))

In [None]:
# ----- XGBoost Hyperparameter Tuning -----

xgb_param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 0.2],
    'subsample': [0.8, 1.0]
}

xgb = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
xgb_grid = GridSearchCV(xgb, xgb_param_grid, cv=5, scoring='recall', n_jobs=-1)
xgb_grid.fit(X_train_res_scaled, y_train_res)

print("\nBest parameters for XGBoost:", xgb_grid.best_params_)
xgb_best = xgb_grid.best_estimator_
y_pred_xgb = xgb_best.predict(X_test_scaled)

print("\n--- XGBoost Performance ---")
print("Precision:", precision_score(y_test, y_pred_xgb))
print("Recall:   ", recall_score(y_test, y_pred_xgb))
print(classification_report(y_test, y_pred_xgb))