# Loan Approval Prediction System

## Project Overview

This project aims to build an automated loan approval prediction system using machine learning. The system analyzes various applicant features to predict whether a loan application should be approved or rejected.

### Objectives:
- Increase efficiency in loan processing
- Improve accuracy of loan decisions
- Enhance fairness and reduce bias
- Enable scalability for high-volume applications

## 1. Import Libraries

In [None]:
# Data manipulation
import pandas as pd
import numpy as np

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Preprocessing
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, StratifiedKFold
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer

# Models
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier

# Metrics
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    confusion_matrix, classification_report, roc_auc_score, roc_curve
)

# Imbalanced data handling
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline as ImbPipeline

# Utilities
import warnings
# Silence every warning for the rest of the session.
# NOTE(review): this also hides deprecation and convergence warnings raised by
# later cells - consider narrowing the filter once the notebook is stable.
warnings.filterwarnings('ignore')

# Set random seed for reproducibility
# (seeds NumPy's global RNG only; most estimators below receive their own
# explicit random_state=42)
np.random.seed(42)

# Plot styling
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette('husl')

print("All libraries imported successfully!")

## 2. Load and Explore Dataset

In [None]:
# Read the raw loan applications from disk
df = pd.read_csv('loan_approval_dataset.csv')

# Unpack the shape once so the row/column counts can be reported separately
row_count, column_count = df.shape

print("Dataset loaded successfully!")
print(f"\nDataset Shape: {df.shape}")
print(f"Number of rows: {row_count}")
print(f"Number of columns: {column_count}")

In [None]:
# Display first few rows
# (df.head() is the cell's last expression, so the notebook renders it)
print("\n=== First 5 rows of the dataset ===")
df.head()

In [None]:
# Dataset info: column names, non-null counts and dtypes
print("\n=== Dataset Information ===")
df.info()

In [None]:
# Statistical summary (describe() is called with defaults, so the table is
# limited to what pandas summarises by default for this frame)
print("\n=== Statistical Summary ===")
df.describe()

In [None]:
# Report, per column, how many values are missing and what share of all rows
# that represents; only columns with at least one gap are printed.
print("\n=== Missing Values ===")
missing_values = df.isna().sum()
missing_percentage = missing_values.div(len(df)).mul(100)
missing_df = pd.DataFrame({
    'Missing Count': missing_values,
    'Percentage': missing_percentage,
})
has_gaps = missing_df['Missing Count'] > 0
print(missing_df.loc[has_gaps].sort_values('Missing Count', ascending=False))

In [None]:
# Check for duplicates
# duplicated() flags every repeat of an earlier, fully identical row, so the
# sum is the number of rows that de-duplication would remove.
duplicates = df.duplicated().sum()
print(f"\nNumber of duplicate rows: {duplicates}")

## 3. Exploratory Data Analysis (EDA)

In [None]:
# Split the columns by dtype: 64-bit numbers versus object (string) columns.
# Columns of any other dtype end up in neither list.
numeric_frame = df.select_dtypes(include=['int64', 'float64'])
object_frame = df.select_dtypes(include=['object'])

numerical_cols = numeric_frame.columns.tolist()
categorical_cols = object_frame.columns.tolist()

print(f"Numerical columns ({len(numerical_cols)}): {numerical_cols}")
print(f"\nCategorical columns ({len(categorical_cols)}): {categorical_cols}")

In [None]:
# Check target variable distribution
if 'loan_status' in df.columns:
    target_col = 'loan_status'
elif 'Loan_Status' in df.columns:
    target_col = 'Loan_Status'
else:
    # Find likely target column
    target_col = [col for col in df.columns if 'status' in col.lower() or 'approved' in col.lower()]
    target_col = target_col[0] if target_col else df.columns[-1]

print(f"\n=== Target Variable: {target_col} ===")
print(df[target_col].value_counts())
print(f"\nClass Distribution:")
print(df[target_col].value_counts(normalize=True) * 100)

In [None]:
# Show the target distribution twice: absolute counts (bar) and shares (pie)
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
bar_ax, pie_ax = axes
class_palette = ['#FF6B6B', '#4ECDC4']

# Left panel: count per class
df[target_col].value_counts().plot(kind='bar', ax=bar_ax, color=class_palette)
bar_ax.set_title(f'Distribution of {target_col}', fontsize=14, fontweight='bold')
bar_ax.set_xlabel(target_col)
bar_ax.set_ylabel('Count')
bar_ax.tick_params(axis='x', rotation=45)

# Right panel: proportion per class
df[target_col].value_counts().plot(
    kind='pie',
    ax=pie_ax,
    autopct='%1.1f%%',
    colors=class_palette,
    startangle=90,
)
pie_ax.set_title(f'Proportion of {target_col}', fontsize=14, fontweight='bold')
pie_ax.set_ylabel('')

plt.tight_layout()
plt.show()

In [None]:
# Distribution of numerical features (one histogram per column, 3 per row)
numerical_features = [col for col in numerical_cols if col != target_col]

if numerical_features:
    n_cols = 3
    n_rows = (len(numerical_features) + n_cols - 1) // n_cols

    # squeeze=False always returns a 2-D array of Axes, so ravel() yields a
    # flat sequence of individual Axes for any grid size. The previous
    # `[axes]` wrapping for a single row produced a list holding one ndarray,
    # which broke plotting whenever there were three or fewer features.
    fig, axes = plt.subplots(n_rows, n_cols, figsize=(15, n_rows * 4), squeeze=False)
    axes = axes.ravel()

    for ax, col in zip(axes, numerical_features):
        df[col].hist(bins=30, ax=ax, edgecolor='black', alpha=0.7)
        ax.set_title(f'Distribution of {col}', fontweight='bold')
        ax.set_xlabel(col)
        ax.set_ylabel('Frequency')

    # Hide the unused cells of the grid
    for ax in axes[len(numerical_features):]:
        ax.set_visible(False)

    plt.tight_layout()
    plt.show()

In [None]:
# Distribution of categorical features (one bar chart per column, 2 per row)
categorical_features = [col for col in categorical_cols if col != target_col]

if categorical_features:
    n_cols = 2
    n_rows = (len(categorical_features) + n_cols - 1) // n_cols

    # squeeze=False always returns a 2-D array of Axes, so ravel() yields a
    # flat sequence of individual Axes for any grid size. The previous
    # `[axes]` wrapping for a single row produced a list holding one ndarray,
    # which broke plotting whenever there were one or two features.
    fig, axes = plt.subplots(n_rows, n_cols, figsize=(14, n_rows * 4), squeeze=False)
    axes = axes.ravel()

    for ax, col in zip(axes, categorical_features):
        df[col].value_counts().plot(kind='bar', ax=ax, color='skyblue', edgecolor='black')
        ax.set_title(f'Distribution of {col}', fontweight='bold')
        ax.set_xlabel(col)
        ax.set_ylabel('Count')
        ax.tick_params(axis='x', rotation=45)

    # Hide the unused cells of the grid
    for ax in axes[len(categorical_features):]:
        ax.set_visible(False)

    plt.tight_layout()
    plt.show()

In [None]:
# Pairwise correlation heatmap; needs at least two numerical features
if len(numerical_features) > 1:
    correlation_matrix = df[numerical_features].corr()

    heatmap_options = {
        'annot': True,
        'cmap': 'coolwarm',
        'center': 0,
        'square': True,
        'linewidths': 1,
        'cbar_kws': {"shrink": 0.8},
    }

    plt.figure(figsize=(12, 8))
    sns.heatmap(correlation_matrix, **heatmap_options)
    plt.title('Correlation Matrix of Numerical Features', fontsize=16, fontweight='bold')
    plt.tight_layout()
    plt.show()

In [None]:
# Box plots to detect outliers (one per numerical feature, 3 per row)
if numerical_features:
    n_cols = 3
    n_rows = (len(numerical_features) + n_cols - 1) // n_cols

    # squeeze=False always returns a 2-D array of Axes, so ravel() yields a
    # flat sequence of individual Axes for any grid size. The previous
    # `[axes]` wrapping for a single row produced a list holding one ndarray,
    # which broke plotting whenever there were three or fewer features.
    fig, axes = plt.subplots(n_rows, n_cols, figsize=(15, n_rows * 4), squeeze=False)
    axes = axes.ravel()

    for ax, col in zip(axes, numerical_features):
        df.boxplot(column=col, ax=ax)
        ax.set_title(f'Box Plot of {col}', fontweight='bold')
        ax.set_ylabel(col)

    # Hide the unused cells of the grid
    for ax in axes[len(numerical_features):]:
        ax.set_visible(False)

    plt.tight_layout()
    plt.show()

## 4. Data Preprocessing

In [None]:
# Create a copy for preprocessing
# All cleaning steps below operate on df_processed, leaving the raw df
# untouched for the EDA plots and the final summary.
df_processed = df.copy()

print("Starting data preprocessing...")
print(f"Initial shape: {df_processed.shape}")

In [None]:
# Handle missing values
print("\n=== Handling Missing Values ===")

# Rows without a label cannot be used for supervised training, and the target
# is excluded from both feature lists below, so drop them explicitly.
missing_target = int(df_processed[target_col].isna().sum())
if missing_target > 0:
    df_processed = df_processed.dropna(subset=[target_col])
    print(f"Dropped {missing_target} rows with missing target '{target_col}'")

# For numerical columns: fill with median.
# Assign the result back instead of calling fillna(inplace=True) on the
# column selection: the chained in-place form is deprecated and does not
# modify the frame when pandas Copy-on-Write is active.
for col in numerical_features:
    if df_processed[col].isna().any():
        median_value = df_processed[col].median()
        df_processed[col] = df_processed[col].fillna(median_value)
        print(f"Filled {col} missing values with median: {median_value}")

# For categorical columns: fill with mode
for col in categorical_features:
    if df_processed[col].isna().any():
        modes = df_processed[col].mode()
        if modes.empty:
            # Column holds no observed value at all, so there is no mode to use
            print(f"Skipped {col}: column is entirely missing")
            continue
        mode_value = modes[0]
        df_processed[col] = df_processed[col].fillna(mode_value)
        print(f"Filled {col} missing values with mode: {mode_value}")

print(f"\nMissing values after imputation: {df_processed.isnull().sum().sum()}")

In [None]:
# Drop fully identical rows and report how many disappeared
before_dup = len(df_processed)
df_processed = df_processed.drop_duplicates()
after_dup = len(df_processed)
removed_dup = before_dup - after_dup
print(f"\nRemoved {removed_dup} duplicate rows")

In [None]:
# Replace every categorical value by an integer code. The fitted encoders are
# kept in label_encoders (keyed by column name) so the mapping can be reused.
print("\n=== Encoding Categorical Variables ===")

label_encoders = {col: LabelEncoder() for col in categorical_features}

for col, encoder in label_encoders.items():
    df_processed[col] = encoder.fit_transform(df_processed[col])
    print(f"Encoded {col}: {list(encoder.classes_)}")

# The target gets its own encoder, but only when it is stored as text
target_is_text = df_processed[target_col].dtype == 'object'
if target_is_text:
    target_encoder = LabelEncoder()
    df_processed[target_col] = target_encoder.fit_transform(df_processed[target_col])
    label_encoders[target_col] = target_encoder
    print(f"\nEncoded target '{target_col}': {list(target_encoder.classes_)}")

In [None]:
# Handle outliers using IQR method (optional - can be commented out if needed)
print("\n=== Handling Outliers (IQR Method) ===")

def remove_outliers_iqr(df, columns, factor=1.5):
    """Drop rows that fall outside the IQR fences of the given columns.

    The columns are processed one after another, so the quartiles of a later
    column are computed on the rows that survived the earlier columns. Rows
    whose value is missing in a processed column fail the range check and are
    dropped as well.

    Parameters:
    df: DataFrame to filter (it is copied, never modified)
    columns: iterable of column names to check
    factor: multiple of the IQR added beyond Q1/Q3 to form the fences

    Returns:
    (filtered DataFrame, total number of rows removed)
    """
    filtered = df.copy()
    total_dropped = 0

    for name in columns:
        q1, q3 = (filtered[name].quantile(q) for q in (0.25, 0.75))
        spread = q3 - q1
        low = q1 - factor * spread
        high = q3 + factor * spread

        # between() is inclusive on both ends, i.e. low <= value <= high
        survivors = filtered[filtered[name].between(low, high)]
        dropped = len(filtered) - len(survivors)
        filtered = survivors

        if dropped > 0:
            total_dropped += dropped
            print(f"{name}: Removed {dropped} outliers (bounds: [{low:.2f}, {high:.2f}])")

    return filtered, total_dropped

# Outlier removal is disabled by default, so the shape printed below reflects
# imputation, de-duplication and encoding only.
# Uncomment the next line to remove outliers
# df_processed, total_outliers = remove_outliers_iqr(df_processed, numerical_features)
# print(f"\nTotal outliers removed: {total_outliers}")

print(f"Shape after preprocessing: {df_processed.shape}")

## 5. Feature Engineering

In [None]:
# Feature engineering - create new features if applicable
import re

print("=== Feature Engineering ===")


def _name_tokens(column_name):
    """Split a column name into lowercase words (snake_case, spaces, CamelCase)."""
    words = re.findall(r'[A-Z]?[a-z]+|[A-Z]+(?![a-z])|\d+', str(column_name))
    return [word.lower() for word in words]


# Example: Create debt-to-income ratio if income and loan amount columns exist
income_cols = [col for col in df_processed.columns if 'income' in col.lower()]
loan_cols = [col for col in df_processed.columns if 'loan' in col.lower() and 'amount' in col.lower()]

if income_cols and loan_cols:
    income_col = income_cols[0]
    loan_col = loan_cols[0]
    # +1 in the denominator avoids division by zero for applicants with no income
    df_processed['debt_to_income_ratio'] = df_processed[loan_col] / (df_processed[income_col] + 1)
    print(f"Created feature: debt_to_income_ratio using {loan_col} and {income_col}")

# Example: Create age groups if age column exists.
# Match "age" as a whole word: a plain substring test also picks up columns
# such as "percentage" or "mortgage".
age_cols = [col for col in df_processed.columns if 'age' in _name_tokens(col)]
if age_cols:
    age_col = age_cols[0]
    # Open-ended outer bins keep ages <= 0 or > 100 inside a group. With the
    # former [0, 100] limits such values became NaN and astype(int) raised.
    # Values within (0, 100] land in the same group as before.
    df_processed['age_group'] = pd.cut(
        df_processed[age_col],
        bins=[-np.inf, 25, 35, 50, np.inf],
        labels=[0, 1, 2, 3],
    ).astype(int)
    print(f"Created feature: age_group from {age_col}")

print(f"\nFinal dataset shape: {df_processed.shape}")
print(f"Feature columns: {df_processed.columns.tolist()}")

## 6. Prepare Data for Modeling

In [None]:
# Split the processed frame into the label vector and the feature matrix
y = df_processed[target_col]
X = df_processed.drop(target_col, axis=1)

print(f"Features shape: {X.shape}")
print(f"Target shape: {y.shape}")
target_counts = y.value_counts()
print(f"\nTarget distribution:\n{target_counts}")

In [None]:
# Hold out 20% of the rows for testing; stratifying on y keeps the class
# ratio the same in both parts.
split_options = {'test_size': 0.2, 'random_state': 42, 'stratify': y}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

total_rows = len(X)
train_rows, test_rows = X_train.shape[0], X_test.shape[0]

print(f"\nTraining set size: {train_rows} ({train_rows/total_rows*100:.1f}%)")
print(f"Testing set size: {test_rows} ({test_rows/total_rows*100:.1f}%)")
print(f"\nTraining target distribution:\n{y_train.value_counts()}")
print(f"\nTesting target distribution:\n{y_test.value_counts()}")

In [None]:
# Standardise the features. The scaler learns its statistics from the
# training rows only and is then applied unchanged to both parts.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

print("Feature scaling completed!")

In [None]:
# Oversample the minority class of the (scaled) training set with SMOTE;
# the test set is left as is.
print("\n=== Handling Class Imbalance ===")
counts_before = y_train.value_counts().to_dict()
print(f"Before SMOTE: {counts_before}")

smote = SMOTE(random_state=42)
X_train_balanced, y_train_balanced = smote.fit_resample(X_train_scaled, y_train)

counts_after = pd.Series(y_train_balanced).value_counts().to_dict()
print(f"After SMOTE: {counts_after}")
print(f"Training set shape after SMOTE: {X_train_balanced.shape}")

## 7. Model Training and Evaluation

In [None]:
# Candidate classifiers, keyed by display name. Every estimator that accepts
# a seed gets random_state=42; the ensembles additionally use 100 estimators.
seeded = {'random_state': 42}
seeded_ensemble = {'random_state': 42, 'n_estimators': 100}

models = {
    'Logistic Regression': LogisticRegression(max_iter=1000, **seeded),
    'Decision Tree': DecisionTreeClassifier(**seeded),
    'Random Forest': RandomForestClassifier(**seeded_ensemble),
    'Gradient Boosting': GradientBoostingClassifier(**seeded_ensemble),
    'XGBoost': XGBClassifier(eval_metric='logloss', **seeded_ensemble),
    'AdaBoost': AdaBoostClassifier(**seeded_ensemble),
    'SVM': SVC(probability=True, **seeded),
    'K-Nearest Neighbors': KNeighborsClassifier(),
    'Naive Bayes': GaussianNB(),
}

print("Initialized models:")
for name in models:
    print(f"  - {name}")

In [None]:
# Train and evaluate models
# For each candidate: fit on the balanced training set, score on the held-out
# test set, and estimate generalisation with 5-fold cross-validation.
results = []

print("\n=== Training Models ===")
print("This may take a few minutes...\n")

for name, model in models.items():
    print(f"Training {name}...")

    # Train model
    model.fit(X_train_balanced, y_train_balanced)

    # Make predictions
    y_pred = model.predict(X_test_scaled)
    if hasattr(model, 'predict_proba'):
        y_pred_proba = model.predict_proba(X_test_scaled)[:, 1]
    else:
        y_pred_proba = None

    # Calculate metrics
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted', zero_division=0)
    recall = recall_score(y_test, y_pred, average='weighted', zero_division=0)
    f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)

    # Calculate ROC AUC if probability predictions available
    roc_auc = None
    if y_pred_proba is not None:
        try:
            roc_auc = roc_auc_score(y_test, y_pred_proba)
        except ValueError:
            # Raised e.g. when the test labels contain a single class or the
            # target is not binary; report "no AUC" instead of aborting.
            roc_auc = None

    # Cross-validation score.
    # Scaling and SMOTE are wrapped in a pipeline so they are re-fitted on the
    # training part of every fold. Cross-validating the already resampled
    # matrix would put synthetic neighbours of training points into the
    # validation folds and inflate the score. cross_val_score clones the
    # pipeline, so the fitted `model` above is left untouched.
    cv_pipeline = ImbPipeline([
        ('scaler', StandardScaler()),
        ('smote', SMOTE(random_state=42)),
        ('model', model),
    ])
    cv_scores = cross_val_score(cv_pipeline, X_train, y_train, cv=5, scoring='accuracy')
    cv_mean = cv_scores.mean()
    cv_std = cv_scores.std()

    results.append({
        'Model': name,
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1-Score': f1,
        'ROC AUC': roc_auc,
        'CV Mean': cv_mean,
        'CV Std': cv_std
    })

    print(f"  Accuracy: {accuracy:.4f} | F1-Score: {f1:.4f} | CV: {cv_mean:.4f} (+/- {cv_std:.4f})\n")

print("Model training completed!")

In [None]:
# Tabulate the per-model metrics, best test accuracy first
results_df = pd.DataFrame(results).sort_values('Accuracy', ascending=False)

print("\n=== Model Performance Comparison ===")
comparison_table = results_df.to_string(index=False)
print(comparison_table)

In [None]:
# One bar chart per metric, arranged on a 2x2 grid and saved to disk
fig, axes = plt.subplots(2, 2, figsize=(16, 12))

# (metric column, bar colour) in row-major panel order
metric_panels = [
    ('Accuracy', 'skyblue'),
    ('Precision', 'lightgreen'),
    ('Recall', 'lightcoral'),
    ('F1-Score', 'plum'),
]

for panel, (metric, bar_color) in zip(axes.flatten(), metric_panels):
    results_df.plot(x='Model', y=metric, kind='bar', ax=panel, color=bar_color, legend=False)
    panel.set_title(f'Model {metric} Comparison', fontsize=14, fontweight='bold')
    panel.set_xlabel('Model')
    panel.set_ylabel(metric)
    panel.tick_params(axis='x', rotation=45)
    panel.set_ylim([0, 1])

plt.tight_layout()
plt.savefig('model_comparison.png', dpi=300, bbox_inches='tight')
plt.show()

print("Model comparison chart saved as 'model_comparison.png'")

In [None]:
# results_df is sorted by test accuracy, so its first row is the winner
top_row = results_df.iloc[0]
best_model_name = top_row['Model']
best_model = models[best_model_name]

banner = '=' * 50
print("\n" + banner)
print(f"BEST MODEL: {best_model_name}")
print(f"Accuracy: {top_row['Accuracy']:.4f}")
print(banner)

## 8. Detailed Analysis of Best Model

In [None]:
# Score the test set with the selected model; class-1 probabilities are kept
# only when the estimator exposes predict_proba.
y_pred_best = best_model.predict(X_test_scaled)
if hasattr(best_model, 'predict_proba'):
    y_pred_proba_best = best_model.predict_proba(X_test_scaled)[:, 1]
else:
    y_pred_proba_best = None

# Per-class precision / recall / F1
print(f"\n=== Classification Report for {best_model_name} ===")
report_text = classification_report(y_test, y_pred_best)
print(report_text)

In [None]:
# Confusion Matrix
# Derive the tick labels from the fitted target encoder instead of assuming
# 0 == Rejected / 1 == Approved: LabelEncoder assigns codes in sorted order of
# the original values, so e.g. "Approved" receives code 0 and "Rejected" 1.
target_encoder = label_encoders.get(target_col)
if target_encoder is not None:
    class_ids = list(range(len(target_encoder.classes_)))
    class_names = [str(label).strip() for label in target_encoder.classes_]
else:
    # Target was already numeric: keep the historical 0/1 naming when it
    # applies, otherwise show the raw class values.
    class_ids = np.unique(
        np.concatenate([np.asarray(y_test), np.asarray(y_pred_best)])
    ).tolist()
    if class_ids == [0, 1]:
        class_names = ['Rejected', 'Approved']
    else:
        class_names = [str(class_id) for class_id in class_ids]

# Passing labels= pins the row/column order to class_ids so the names line up
cm = confusion_matrix(y_test, y_pred_best, labels=class_ids)

plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', cbar=True, square=True,
            xticklabels=class_names,
            yticklabels=class_names)
plt.title(f'Confusion Matrix - {best_model_name}', fontsize=16, fontweight='bold')
plt.ylabel('Actual', fontsize=12)
plt.xlabel('Predicted', fontsize=12)
plt.savefig('confusion_matrix.png', dpi=300, bbox_inches='tight')
plt.show()

print("Confusion matrix saved as 'confusion_matrix.png'")

In [None]:
# ROC Curve
# Only drawn when the best model produced class-1 probabilities.
# NOTE(review): roc_curve/roc_auc_score are called without multiclass options,
# so this presumably expects a binary target - confirm for other datasets.
if y_pred_proba_best is not None:
    fpr, tpr, thresholds = roc_curve(y_test, y_pred_proba_best)
    roc_auc = roc_auc_score(y_test, y_pred_proba_best)
    
    plt.figure(figsize=(10, 6))
    plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (AUC = {roc_auc:.4f})')
    # Diagonal reference line: performance of a random classifier
    plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--', label='Random Classifier')
    plt.xlim([0.0, 1.0])
    # Slight headroom above 1.0 so the curve does not touch the frame
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate', fontsize=12)
    plt.ylabel('True Positive Rate', fontsize=12)
    plt.title(f'ROC Curve - {best_model_name}', fontsize=16, fontweight='bold')
    plt.legend(loc='lower right', fontsize=12)
    plt.grid(alpha=0.3)
    plt.savefig('roc_curve.png', dpi=300, bbox_inches='tight')
    plt.show()
    
    print(f"ROC AUC Score: {roc_auc:.4f}")
    print("ROC curve saved as 'roc_curve.png'")

In [None]:
# Explain the best model: impurity-based importances for tree ensembles,
# signed coefficients for linear models. Other estimators are skipped.
if hasattr(best_model, 'feature_importances_'):
    importance_table = pd.DataFrame({
        'Feature': X.columns,
        'Importance': best_model.feature_importances_
    })
    feature_importance = importance_table.sort_values('Importance', ascending=False)

    print(f"\n=== Feature Importance - {best_model_name} ===")
    print(feature_importance.to_string(index=False))

    # Horizontal bars for the 15 strongest features, strongest on top
    top_features = feature_importance.head(15)
    plt.figure(figsize=(12, 6))
    plt.barh(top_features['Feature'], top_features['Importance'], color='teal')
    plt.xlabel('Importance', fontsize=12)
    plt.ylabel('Feature', fontsize=12)
    plt.title(f'Top 15 Feature Importance - {best_model_name}', fontsize=16, fontweight='bold')
    plt.gca().invert_yaxis()
    plt.tight_layout()
    plt.savefig('feature_importance.png', dpi=300, bbox_inches='tight')
    plt.show()

    print("Feature importance chart saved as 'feature_importance.png'")
elif hasattr(best_model, 'coef_'):
    coefficient_table = pd.DataFrame({
        'Feature': X.columns,
        'Coefficient': best_model.coef_[0]
    })
    # Rank by magnitude so strong negative effects are not pushed to the end
    feature_importance = coefficient_table.sort_values('Coefficient', ascending=False, key=abs)

    print(f"\n=== Feature Coefficients - {best_model_name} ===")
    print(feature_importance.to_string(index=False))

    # Green bars for positive coefficients, red for the rest
    top_features = feature_importance.head(15)
    colors = ['green' if value > 0 else 'red' for value in top_features['Coefficient']]
    plt.figure(figsize=(12, 6))
    plt.barh(top_features['Feature'], top_features['Coefficient'], color=colors)
    plt.xlabel('Coefficient', fontsize=12)
    plt.ylabel('Feature', fontsize=12)
    plt.title(f'Top 15 Feature Coefficients - {best_model_name}', fontsize=16, fontweight='bold')
    plt.gca().invert_yaxis()
    plt.tight_layout()
    plt.savefig('feature_coefficients.png', dpi=300, bbox_inches='tight')
    plt.show()

    print("Feature coefficients chart saved as 'feature_coefficients.png'")

## 9. Hyperparameter Tuning

In [None]:
# Hyperparameter tuning for best model
print(f"\n=== Hyperparameter Tuning for {best_model_name} ===")

# Define parameter grids for different models
param_grids = {
    'Random Forest': {
        'n_estimators': [100, 200, 300],
        'max_depth': [10, 20, 30, None],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4]
    },
    'XGBoost': {
        'n_estimators': [100, 200, 300],
        'max_depth': [3, 5, 7],
        'learning_rate': [0.01, 0.1, 0.3],
        'subsample': [0.8, 0.9, 1.0]
    },
    'Gradient Boosting': {
        'n_estimators': [100, 200, 300],
        'max_depth': [3, 5, 7],
        'learning_rate': [0.01, 0.1, 0.3],
        'subsample': [0.8, 0.9, 1.0]
    },
    'Logistic Regression': {
        'C': [0.01, 0.1, 1, 10, 100],
        'penalty': ['l2'],
        'solver': ['lbfgs', 'liblinear']
    },
    'SVM': {
        'C': [0.1, 1, 10],
        'kernel': ['rbf', 'linear'],
        'gamma': ['scale', 'auto']
    }
}

if best_model_name in param_grids:
    print(f"Performing GridSearchCV for {best_model_name}...")
    print("This may take several minutes...\n")

    # Resample inside every CV fold: within the pipeline SMOTE is applied to
    # the training part of a fold only, so candidates are scored on real
    # applicants. Searching on the pre-balanced matrix would leak synthetic
    # neighbours of training points into the validation folds.
    tuning_pipeline = ImbPipeline([
        ('smote', SMOTE(random_state=42)),
        ('model', best_model),
    ])
    # Pipeline parameters are addressed as "<step>__<parameter>"
    pipeline_grid = {
        f'model__{param}': values
        for param, values in param_grids[best_model_name].items()
    }

    grid_search = GridSearchCV(
        estimator=tuning_pipeline,
        param_grid=pipeline_grid,
        cv=5,
        scoring='accuracy',
        n_jobs=-1,
        verbose=1
    )

    grid_search.fit(X_train_scaled, y_train)

    # Strip the step prefix again so the report shows plain estimator parameters
    best_params = {
        param.split('__', 1)[1]: value
        for param, value in grid_search.best_params_.items()
    }
    print(f"\nBest parameters: {best_params}")
    print(f"Best cross-validation score: {grid_search.best_score_:.4f}")

    # Use tuned model for final predictions. The refitted pipeline trained its
    # final step on the SMOTE-balanced, scaled training set; extracting that
    # step keeps `best_model` a plain estimator that takes scaled features.
    best_model_tuned = grid_search.best_estimator_.named_steps['model']
    y_pred_tuned = best_model_tuned.predict(X_test_scaled)

    accuracy_tuned = accuracy_score(y_test, y_pred_tuned)
    f1_tuned = f1_score(y_test, y_pred_tuned, average='weighted', zero_division=0)

    print("\nTuned model performance:")
    print(f"Accuracy: {accuracy_tuned:.4f}")
    print(f"F1-Score: {f1_tuned:.4f}")

    print("\nClassification Report (Tuned Model):")
    print(classification_report(y_test, y_pred_tuned))

    best_model = best_model_tuned
else:
    print(f"No predefined parameter grid for {best_model_name}")
    print("Using default parameters...")

## 10. Save the Model and Artifacts

In [None]:
# Persist the final model together with everything needed to reproduce its
# preprocessing (scaler, encoders, feature order) and a small metadata record.
import os
import pickle

# Create models directory
os.makedirs('models', exist_ok=True)

# (target path, object to pickle, confirmation message), written in this order
artifacts = [
    ('models/best_loan_model.pkl', best_model,
     f"Best model ({best_model_name}) saved to 'models/best_loan_model.pkl'"),
    ('models/scaler.pkl', scaler,
     "Scaler saved to 'models/scaler.pkl'"),
    ('models/label_encoders.pkl', label_encoders,
     "Label encoders saved to 'models/label_encoders.pkl'"),
    ('models/feature_names.pkl', X.columns.tolist(),
     "Feature names saved to 'models/feature_names.pkl'"),
]

for artifact_path, payload, confirmation in artifacts:
    with open(artifact_path, 'wb') as f:
        pickle.dump(payload, f)
    print(confirmation)

# Metadata: test-set metrics of the model that was just saved
final_predictions = best_model.predict(X_test_scaled)
metadata = {
    'model_name': best_model_name,
    'accuracy': accuracy_score(y_test, final_predictions),
    'f1_score': f1_score(y_test, final_predictions, average='weighted'),
    'features': X.columns.tolist(),
    'target_column': target_col,
    'training_date': pd.Timestamp.now().strftime('%Y-%m-%d %H:%M:%S')
}

with open('models/model_metadata.pkl', 'wb') as f:
    pickle.dump(metadata, f)
print("Model metadata saved to 'models/model_metadata.pkl'")

print("\nAll artifacts saved successfully!")

## 11. Prediction Pipeline Example

In [None]:
# Example: Make predictions on new data
def predict_loan_approval(new_data_dict):
    """
    Predict the loan decision for one applicant using the saved artifacts.

    The model, scaler, label encoders and feature list are read from the
    'models/' directory on every call.

    Parameters:
    new_data_dict: dict mapping feature names to values. Categorical features
        may be given either as their original text value or as the integer
        code produced during training. Features missing from the dict are
        filled with 0 before scaling.

    Returns:
    prediction: encoded class label returned by the model. Its meaning follows
        the target encoder's mapping when the target was label-encoded.
    probability: predicted probability of the class encoded as 1, or None if
        the model has no predict_proba

    Raises:
    ValueError: if a text value was not seen for that feature during training
    """
    # Local import so the function does not depend on the artifact-saving cell
    import pickle

    # NOTE: pickle.load can execute arbitrary code - only load artifact files
    # that were produced by this notebook or come from a trusted source.
    loaded = {}
    for artifact in ('best_loan_model', 'scaler', 'label_encoders', 'feature_names'):
        with open(f'models/{artifact}.pkl', 'rb') as f:
            loaded[artifact] = pickle.load(f)
    model = loaded['best_loan_model']
    scaler = loaded['scaler']
    encoders = loaded['label_encoders']
    features = loaded['feature_names']

    # Create dataframe
    new_data = pd.DataFrame([new_data_dict])

    # Encode categorical variables
    for col, encoder in encoders.items():
        # The encoder dict may also hold the target's encoder; only model
        # features that were actually supplied are encoded.
        if col not in features or col not in new_data.columns:
            continue
        # Numeric input is taken to be already encoded (e.g. a row of X_test)
        if pd.api.types.is_numeric_dtype(new_data[col]):
            continue
        try:
            new_data[col] = encoder.transform(new_data[col])
        except ValueError as err:
            # Fail with a clear message instead of passing raw text on to the
            # scaler, where it would surface as an obscure conversion error.
            raise ValueError(
                f"Unknown category {new_data[col].iloc[0]!r} for feature '{col}'; "
                f"expected one of {list(encoder.classes_)}"
            ) from err

    # Ensure all features are present and in training order; absent ones become 0
    new_data = new_data.reindex(columns=features, fill_value=0)

    # Scale features
    new_data_scaled = scaler.transform(new_data)

    # Make prediction
    prediction = model.predict(new_data_scaled)[0]
    if hasattr(model, 'predict_proba'):
        probability = model.predict_proba(new_data_scaled)[0][1]
    else:
        probability = None

    return prediction, probability

# Test with a sample from test set
sample_idx = 0
sample_data = X_test.iloc[sample_idx].to_dict()

pred, prob = predict_loan_approval(sample_data)
actual = y_test.iloc[sample_idx]

# Translate class codes back to their original names with the fitted target
# encoder. LabelEncoder numbers classes in sorted order, so code 1 is not
# necessarily "Approved"; the 0/1 naming is kept only for numeric targets.
target_encoder = label_encoders.get(target_col)
if target_encoder is not None:
    pred_label, actual_label = (
        str(name).strip() for name in target_encoder.inverse_transform([pred, actual])
    )
    positive_name = str(target_encoder.classes_[-1]).strip()
else:
    pred_label = 'Approved' if pred == 1 else 'Rejected'
    actual_label = 'Approved' if actual == 1 else 'Rejected'
    positive_name = 'Approved'

print("\n=== Sample Prediction ===")
print(f"Input features: {sample_data}")
print(f"\nPrediction: {pred_label}")
if prob is not None:
    # prob is the probability of the class encoded as 1
    print(f"Probability of '{positive_name}': {prob:.2%}")
print(f"Actual: {actual_label}")

## 12. Summary and Conclusions

In [None]:
# Final report. The metrics come from the first row of results_df, i.e. the
# best model as evaluated in the comparison step.
best_row = results_df.iloc[0]

print("="*70)
print("LOAN APPROVAL PREDICTION SYSTEM - PROJECT SUMMARY")
print("="*70)

print("\n1. DATASET OVERVIEW")
print(f"   - Total records: {len(df)}")
print(f"   - Number of features: {df.shape[1] - 1}")
print(f"   - Target variable: {target_col}")

print("\n2. BEST MODEL")
print(f"   - Algorithm: {best_model_name}")
print(f"   - Accuracy: {best_row['Accuracy']:.4f}")
print(f"   - Precision: {best_row['Precision']:.4f}")
print(f"   - Recall: {best_row['Recall']:.4f}")
print(f"   - F1-Score: {best_row['F1-Score']:.4f}")
# A missing ROC AUC is stored as NaN once it is inside a numeric DataFrame
# column, so an `is not None` test would never filter it out.
if pd.notna(best_row['ROC AUC']):
    print(f"   - ROC AUC: {best_row['ROC AUC']:.4f}")

print("\n3. KEY ACHIEVEMENTS")
print("   - Automated loan decision process")
print("   - Handled class imbalance using SMOTE")
print(f"   - Evaluated {len(models)} different algorithms")
print("   - Created comprehensive visualization and reporting")

print("\n4. DELIVERABLES")
print("   - Trained model saved in 'models/' directory")
print("   - Preprocessing artifacts (scaler, encoders)")
print("   - Model performance visualizations")
print("   - Prediction pipeline ready for deployment")

print("\n5. BUSINESS IMPACT")
print("   - Faster loan processing time")
print("   - Consistent and objective decision-making")
print("   - Reduced manual workload")
print("   - Improved customer satisfaction")

print("\n" + "="*70)
print("PROJECT COMPLETED SUCCESSFULLY!")
print("="*70)