In [None]:
from google.colab import drive
# Mount Google Drive inside the Colab runtime at /content/drive so files stored
# in Drive become reachable through the local filesystem.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, confusion_matrix

class BaseModel:
    """Shared load / preprocess / evaluate pipeline for loan-status classifiers.

    Subclasses are expected to put an estimator in ``self.model`` and to
    implement :meth:`train`. The preprocessing state (``scaler`` and
    ``label_encoders``) is learned from the training frame and then re-used
    unchanged on the test frame, so both frames are encoded and scaled with
    the same statistics.
    """

    # Identifier / raw-date columns removed before modelling.
    DROPPED_COLS = ['customer_id', 'transaction_date']

    # String-valued columns that are integer-encoded with one LabelEncoder each.
    CATEGORICAL_COLS = ['sub_grade', 'term', 'home_ownership', 'purpose',
                        'application_type', 'verification_status']

    # Numeric columns (including the two derived date parts) that are scaled.
    NUMERICAL_COLS = ['cibil_score', 'total_no_of_acc', 'annual_inc', 'int_rate',
                      'loan_amnt', 'installment', 'account_bal', 'emp_length',
                      'transaction_year', 'transaction_month']

    def __init__(self):
        self.model = None
        self.scaler = StandardScaler()
        self.label_encoders = {}
        # Populated by load(); replaced with processed frames by preprocess().
        self.train_data = None
        self.test_data = None

    def load(self, train_filepath, test_filepath):
        """Read the training and test workbooks into ``train_data`` / ``test_data``.

        Args:
            train_filepath: Path of the Excel file holding the training rows.
            test_filepath: Path of the Excel file holding the test rows.
        """
        self.train_data = pd.read_excel(train_filepath)
        self.test_data = pd.read_excel(test_filepath)
        print("Training and testing data loaded successfully.")

    def _process_data(self, data, fit):
        """Return an encoded and scaled copy of ``data``.

        Args:
            data: Raw frame containing the dropped, categorical and numerical
                columns named by the class constants.
            fit: When true the scaler is fitted on ``data`` before
                transforming it; when false the already fitted scaler is
                applied as is. Only the training frame should be passed with
                ``fit=True``.

        Returns:
            A new DataFrame; the frame passed in is not modified.
        """
        # Derive the date parts from a temporary Series so the input frame is
        # left untouched.
        dates = pd.to_datetime(data['transaction_date'])
        data = data.drop(self.DROPPED_COLS, axis=1)
        data['transaction_year'] = dates.dt.year
        data['transaction_month'] = dates.dt.month

        # An encoder is fitted the first time its column is seen (the training
        # frame, given the call order in preprocess()) and re-used afterwards.
        # NOTE(review): a category that appears only in a later frame is
        # expected to make transform() raise -- confirm against the data.
        for col in self.CATEGORICAL_COLS:
            encoder = self.label_encoders.get(col)
            if encoder is None:
                encoder = LabelEncoder()
                encoder.fit(data[col])
                self.label_encoders[col] = encoder
            data[col] = encoder.transform(data[col])

        # Refitting the scaler on the test frame would standardise it with its
        # own mean/variance instead of the ones the model was trained with.
        cols = self.NUMERICAL_COLS
        if fit:
            data[cols] = self.scaler.fit_transform(data[cols])
        else:
            data[cols] = self.scaler.transform(data[cols])

        return data

    def preprocess(self):
        """Encode and scale ``train_data`` and ``test_data`` in place.

        The training frame is processed first so that encoders and scaler are
        learned from it; the test frame is then transformed with that state.
        """
        self.train_data = self._process_data(self.train_data, fit=True)
        self.test_data = self._process_data(self.test_data, fit=False)
        print("Data preprocessing completed.")

    def split_data(self):
        """Separate features from the ``loan_status`` target.

        Returns:
            Tuple ``(X_train, X_test, y_train, y_test)``.
        """
        X_train = self.train_data.drop('loan_status', axis=1)
        y_train = self.train_data['loan_status']
        X_test = self.test_data.drop('loan_status', axis=1)
        y_test = self.test_data['loan_status']
        return X_train, X_test, y_train, y_test

    def train(self, X_train, y_train):
        """Fit ``self.model``; must be provided by subclasses.

        Raises:
            NotImplementedError: Always, in this base class.
        """
        raise NotImplementedError("Train method must be implemented by subclasses.")

    def test(self, X_test, y_test):
        """Print the classification report and confusion matrix for ``X_test``."""
        y_pred = self.model.predict(X_test)
        report = classification_report(y_test, y_pred)
        cm = confusion_matrix(y_test, y_pred)
        print("Classification Report:\n", report)
        print("Confusion Matrix:\n", cm)

    def predict(self, X):
        """Return ``self.model``'s predictions for the feature frame ``X``."""
        return self.model.predict(X)

class XGBoostModel(BaseModel):
    """BaseModel pipeline backed by an ``XGBClassifier``."""

    def __init__(self):
        super().__init__()
        # `use_label_encoder` is intentionally not passed: the installed
        # xgboost ignores it and only emits a
        # 'Parameters: { "use_label_encoder" } are not used.' warning.
        self.model = XGBClassifier(eval_metric='logloss')

    def train(self, X_train, y_train):
        """Fit the classifier on the full training set."""
        self.model.fit(X_train, y_train)
        print("XGBoost model trained successfully.")

    def tune_hyperparameters_random(self, X_train, y_train):
        """Tune hyperparameters by randomized search, then fit on all training data.

        The search itself runs on a 20% sample of the training data to keep it
        cheap (20 candidates, 5-fold cross-validation, accuracy scoring). The
        winning parameters are then applied to ``self.model`` and the model is
        fitted on the complete ``X_train`` / ``y_train``, so the final
        estimator is not limited to the tuning sample.

        Args:
            X_train: Training feature frame.
            y_train: Training target values.
        """
        xgb_param_grid = {
            'learning_rate': [0.01, 0.05, 0.1, 0.2],
            'n_estimators': [50, 100, 200, 300],
            'max_depth': [3, 5, 7, 9],
            'subsample': [0.6, 0.8, 0.9, 1.0],
            'colsample_bytree': [0.6, 0.7, 0.8, 1.0],
            'scale_pos_weight': [1, 2, 5]
        }

        # Sample 20% of training data for the search only.
        X_train_sample, _, y_train_sample, _ = train_test_split(
            X_train, y_train, test_size=0.8, random_state=42
        )

        random_search = RandomizedSearchCV(
            estimator=self.model,
            param_distributions=xgb_param_grid,
            n_iter=20,  # Number of parameter settings sampled
            cv=5,
            scoring='accuracy',
            n_jobs=-1,
            random_state=42,
            # Skip the automatic refit: it would train the final model on the
            # 20% sample only. The refit on the full data happens below.
            refit=False
        )
        random_search.fit(X_train_sample, y_train_sample)

        self.model.set_params(**random_search.best_params_)
        self.model.fit(X_train, y_train)
        print("XGBoost model hyperparameter tuning with RandomizedSearchCV completed. Best parameters:\n", random_search.best_params_)

# Example pipeline usage
if __name__ == "__main__":
    # Both experiments read the same pair of Excel workbooks.
    train_filepath, test_filepath = "/content/train_data.xlsx", "/content/test_data.xlsx"

    # Experiment 1: classifier with its constructor defaults, fitted directly.
    xgb_default_model = XGBoostModel()
    xgb_default_model.load(train_filepath=train_filepath, test_filepath=test_filepath)
    xgb_default_model.preprocess()
    X_train, X_test, y_train, y_test = xgb_default_model.split_data()
    xgb_default_model.train(X_train=X_train, y_train=y_train)
    print("XGBoost Default Model:")
    xgb_default_model.test(X_test=X_test, y_test=y_test)

    # Experiment 2: a fresh instance (own scaler/encoders) whose
    # hyperparameters are chosen by RandomizedSearchCV.
    xgb_random_model = XGBoostModel()
    xgb_random_model.load(train_filepath=train_filepath, test_filepath=test_filepath)
    xgb_random_model.preprocess()
    X_train, X_test, y_train, y_test = xgb_random_model.split_data()
    xgb_random_model.tune_hyperparameters_random(X_train=X_train, y_train=y_train)
    print("XGBoost RandomizedSearchCV Tuned Model:")
    xgb_random_model.test(X_test=X_test, y_test=y_test)


Training and testing data loaded successfully.
Data preprocessing completed.


Parameters: { "use_label_encoder" } are not used.



XGBoost model trained successfully.
XGBoost Default Model:
Classification Report:
               precision    recall  f1-score   support

           0       0.68      0.17      0.27      3055
           1       0.67      0.95      0.79      5400

    accuracy                           0.67      8455
   macro avg       0.67      0.56      0.53      8455
weighted avg       0.67      0.67      0.60      8455

Confusion Matrix:
 [[ 507 2548]
 [ 244 5156]]
Training and testing data loaded successfully.
Data preprocessing completed.


Parameters: { "use_label_encoder" } are not used.



XGBoost model hyperparameter tuning with RandomizedSearchCV completed. Best parameters:
 {'subsample': 1.0, 'scale_pos_weight': 1, 'n_estimators': 300, 'max_depth': 5, 'learning_rate': 0.01, 'colsample_bytree': 1.0}
XGBoost RandomizedSearchCV Tuned Model:
Classification Report:
               precision    recall  f1-score   support

           0       0.69      0.13      0.21      3055
           1       0.66      0.97      0.79      5400

    accuracy                           0.66      8455
   macro avg       0.68      0.55      0.50      8455
weighted avg       0.67      0.66      0.58      8455

Confusion Matrix:
 [[ 388 2667]
 [ 174 5226]]


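**The tuned XGBoost model is the best model for loan default prediction: it ensures that the majority of defaulters are predicted, achieving 97 percent recall on class 1 (versus 95 percent for the default model), while its accuracy (66 percent) is similar to that of the default XGBoost model (67 percent).**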

