In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, confusion_matrix

class BaseModel:
    def __init__(self):
        self.model = None
        self.scaler = StandardScaler()
        self.label_encoders = {}

    def load(self, train_filepath, test_filepath):
        self.train_data = pd.read_excel(train_filepath)
        self.test_data = pd.read_excel(test_filepath)
        print("Training and testing data loaded successfully.")

    def preprocess(self):
        def process_data(data):
            # Feature engineering for transaction_date
            data['transaction_date'] = pd.to_datetime(data['transaction_date'])
            data['transaction_year'] = data['transaction_date'].dt.year
            data['transaction_month'] = data['transaction_date'].dt.month

            # Drop unnecessary columns
            data = data.drop(['customer_id', 'transaction_date'], axis=1)

            # Encode categorical variables
            categorical_cols = ['sub_grade', 'term', 'home_ownership', 'purpose', 'application_type', 'verification_status']
            for col in categorical_cols:
                if col not in self.label_encoders:
                    le = LabelEncoder()
                    data[col] = le.fit_transform(data[col])
                    self.label_encoders[col] = le
                else:
                    data[col] = self.label_encoders[col].transform(data[col])

            # Scale numerical features
            numerical_cols = ['cibil_score', 'total_no_of_acc', 'annual_inc', 'int_rate',
                              'loan_amnt', 'installment', 'account_bal', 'emp_length', 'transaction_year', 'transaction_month']
            data[numerical_cols] = self.scaler.fit_transform(data[numerical_cols])

            return data

        self.train_data = process_data(self.train_data)
        self.test_data = process_data(self.test_data)
        print("Data preprocessing completed.")

    def split_data(self):
        X_train = self.train_data.drop('loan_status', axis=1)
        y_train = self.train_data['loan_status']
        X_test = self.test_data.drop('loan_status', axis=1)
        y_test = self.test_data['loan_status']
        return X_train, X_test, y_train, y_test

    def train(self, X_train, y_train):
        raise NotImplementedError("Train method must be implemented by subclasses.")

    def test(self, X_test, y_test):
        y_pred = self.model.predict(X_test)
        report = classification_report(y_test, y_pred)
        cm = confusion_matrix(y_test, y_pred)
        print("Classification Report:\n", report)
        print("Confusion Matrix:\n", cm)

    def predict(self, X):
        return self.model.predict(X)

class RandomForestModel(BaseModel):
    def __init__(self):
        super().__init__()
        self.model = RandomForestClassifier(n_estimators=100, class_weight='balanced', random_state=42)

    def train(self, X_train, y_train):
        self.model.fit(X_train, y_train)
        print("Random Forest model trained successfully.")

class XGBoostModel(BaseModel):
    def __init__(self):
        super().__init__()
        self.model = XGBClassifier(use_label_encoder=False, eval_metric='logloss')

    def train(self, X_train, y_train):
        self.model.fit(X_train, y_train)
        print("XGBoost model trained successfully.")

    def tune_hyperparameters(self, X_train, y_train):
        xgb_param_grid = {
            'learning_rate': [0.01, 0.05, 0.1],
            'n_estimators': [50, 100, 200],
            'max_depth': [3, 5, 7],
            'subsample': [0.8, 0.9, 1.0],
            'colsample_bytree': [0.7, 0.8, 1.0]
        }
        grid_search = GridSearchCV(estimator=self.model, param_grid=xgb_param_grid, cv=5, scoring='accuracy', n_jobs=-1)
        grid_search.fit(X_train, y_train)
        self.model = grid_search.best_estimator_
        print("XGBoost model hyperparameter tuning completed. Best parameters:\n", grid_search.best_params_)

# Example pipeline usage
if __name__ == "__main__":
    train_filepath = "/content/train_data.xlsx"
    test_filepath = "/content/test_data.xlsx"

    # Random Forest pipeline
    rf_model = RandomForestModel()
    rf_model.load(train_filepath, test_filepath)
    rf_model.preprocess()
    X_train, X_test, y_train, y_test = rf_model.split_data()
    rf_model.train(X_train, y_train)
    rf_model.test(X_test, y_test)

    # XGBoost pipeline
    xgb_model = XGBoostModel()
    xgb_model.load(train_filepath, test_filepath)
    xgb_model.preprocess()
    X_train, X_test, y_train, y_test = xgb_model.split_data()
    xgb_model.train(X_train, y_train)
    xgb_model.test(X_test, y_test)

    # Hyperparameter tuning for XGBoost
    xgb_model.tune_hyperparameters(X_train, y_train)
    xgb_model.test(X_test, y_test)


Training and testing data loaded successfully.
Data preprocessing completed.
Random Forest model trained successfully.
Classification Report:
               precision    recall  f1-score   support

           0       0.66      0.21      0.32      3055
           1       0.68      0.94      0.79      5400

    accuracy                           0.68      8455
   macro avg       0.67      0.58      0.55      8455
weighted avg       0.67      0.68      0.62      8455

Confusion Matrix:
 [[ 655 2400]
 [ 344 5056]]
Training and testing data loaded successfully.
Data preprocessing completed.


Parameters: { "use_label_encoder" } are not used.



XGBoost model trained successfully.
Classification Report:
               precision    recall  f1-score   support

           0       0.68      0.17      0.27      3055
           1       0.67      0.95      0.79      5400

    accuracy                           0.67      8455
   macro avg       0.67      0.56      0.53      8455
weighted avg       0.67      0.67      0.60      8455

Confusion Matrix:
 [[ 507 2548]
 [ 244 5156]]


Parameters: { "use_label_encoder" } are not used.



XGBoost model hyperparameter tuning completed. Best parameters:
 {'colsample_bytree': 0.7, 'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 100, 'subsample': 0.9}
Classification Report:
               precision    recall  f1-score   support

           0       0.00      0.00      0.00      3055
           1       0.64      1.00      0.78      5400

    accuracy                           0.64      8455
   macro avg       0.32      0.50      0.39      8455
weighted avg       0.41      0.64      0.50      8455

Confusion Matrix:
 [[   0 3055]
 [   0 5400]]


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


### Comparison of **Random Forest** and **XGBoost**



---

### **Random Forest**:
- **Accuracy**: 0.68
- **Recall (Class 1)**: 0.94

**Explanation**: Random Forest has a **high recall of 0.94**, meaning it identifies 94% of defaulters correctly, but still misses 6% (false negatives). This makes it a solid choice when prioritizing minimizing false negatives.

---

### **XGBoost (Before Hyperparameter Tuning)**:
- **Accuracy**: 0.67
- **Recall (Class 1)**: 0.95

**Explanation**: XGBoost outperforms Random Forest slightly in recall (**0.95**), meaning it identifies 95% of defaulters, missing only 5%. This is even better at minimizing false negatives compared to Random Forest.

---

### Conclusion:

1. **XGBoost** is the better model for identifying defaulters, as it has a higher **recall (0.95)** compared to **Random Forest (0.94)**. This means XGBoost is better at minimizing false negatives (missed defaulters), which is crucial when ensuring that no defaulters are granted loans.

2. **Hyperparameter tuning** for XGBoost has the potential to improve its performance. While the current tuned model achieves **perfect recall (1.00)**, the **precision drops**. Further fine-tuning of hyperparameters could help strike a better balance between recall and precision, ensuring optimal results for loan default prediction.