In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Use one colour palette for every seaborn figure in the notebook.
sns.set_palette("Set2")

# Feature groups referenced by the imputation and preprocessing cells further down.
numerical = [
    'ApplicantIncome',
    'CoapplicantIncome',
    'LoanAmount',
    'Loan_Amount_Term',
]
categorical = [
    'Gender',
    'Married',
    'Dependents',
    'Education',
    'Self_Employed',
    'Credit_History',
    'Property_Area',
]

# Raw data, read from a path relative to the working directory.
df = pd.read_csv('Loan-Approval-Prediction.csv')

In [None]:
# Overview of the raw frame: column names, dtypes and non-null counts.
df.info()

### Split into training and test sets

Ideally, to avoid data leakage, no data from the test set should be used in preprocessing, so we'll split into training and test sets beforehand. Given that we have less than 1000 rows, we'll have a relatively small test set and make an 80:20 split. We can also drop the loan ID since this won't be used in modelling. 

In [None]:
# The first column is the loan ID, which is not a feature, so skip it positionally.
without_id = df.iloc[:, 1:]

# Pull the target out into its own Series and keep everything else as features.
y = without_id['Loan_Status'].copy()
X = without_id.drop(columns='Loan_Status')

We encode the target as binary, following the convention that the minority class (`'N'`) is labelled as positive (1).

In [None]:
# 'N' becomes 1 (positive class); every other value becomes 0.
y = (y == 'N').astype('int64')

In [None]:
from sklearn.model_selection import train_test_split

# 80:20 hold-out split, stratified on the target so both parts keep the same
# class proportions; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    stratify=y,
    random_state=21,
)

In [None]:
# Shapes in the order X_train, X_test, y_train, y_test
[part.shape for part in (X_train, X_test, y_train, y_test)]

In [None]:
# Class proportions in each target split, with a blank line between them.
train_balance = y_train.value_counts(normalize=True)
test_balance = y_test.value_counts(normalize=True)

print(train_balance)
print()
print(test_balance)

### Impute missing values

- For the categorical columns (including the loan term), we'll replace all nulls with the most frequent value. Given that 8% of the credit history values are null, this isn't ideal, but they mostly (~84%) take the value 1, so this seems reasonable in this case.
- Since the other numerical columns are right-skewed, we'll replace the nulls with the median rather than the mean.  

In [None]:
# Number of missing values in each feature column (full feature set, before imputation)
X.isnull().sum()

In [None]:
# The median is taken from the training split only and then applied to both
# splits, so no statistic from the test set is used.
loan_amount_median = X_train['LoanAmount'].median()
median_fill = {'LoanAmount': loan_amount_median}

X_train = X_train.fillna(median_fill)
X_test = X_test.fillna(median_fill)

In [None]:
# Most frequent training value for each categorical column and for the loan term.
# value_counts() sorts by frequency and ignores NaN, so index[0] is the mode.
mode_columns = categorical + ['Loan_Amount_Term']
most_frequent = {
    name: X_train[name].value_counts().index[0]
    for name in mode_columns
}

# Each fill value only touches its own column, so one pass per split is enough.
X_train = X_train.fillna(value=most_frequent)
X_test = X_test.fillna(value=most_frequent)

In [None]:
# Total remaining missing values: training split first, then test split
print(*[frame.isna().sum().sum() for frame in (X_train, X_test)])

### Preprocessing

First we'll recast our datatypes where appropriate.

In [None]:
# Recast the whole-number columns to int64 now that imputation has removed the NaNs.
# `astype` returns a new frame, so the result has to be assigned back to
# X_train / X_test; rebinding a loop variable would leave both splits unchanged.
# NOTE(review): assumes these columns hold whole numbers only, since the cast
# truncates any fractional part -- TODO confirm for CoapplicantIncome.
int_columns = ['CoapplicantIncome', 'LoanAmount', 'Loan_Amount_Term', 'Credit_History']
int_dtypes = dict.fromkeys(int_columns, 'int64')

X_train = X_train.astype(int_dtypes)
X_test = X_test.astype(int_dtypes)

# Show the resulting dtypes of both splits.
for data in X_train, X_test:
    print(data.dtypes, '\n')

We note the following with regard to encoding each feature. 
- Gender, Married, Education, Self_Employed and Credit_History are binary, so we'll one-hot encode these. Property_Area takes three values but there doesn't appear to be any inherent ordering, so we'll also one-hot encode this. (Dropping one redundant column per feature is the usual convention, but note that the encoder defined below does not set `drop`, so every category column is kept.)
- Dependents takes string values 0, 1, 2 and 3+. There is an ordering here, so we could cast these as integers 0, 1, 2, 3, respectively, but this doesn't take into account the fact that a label of 3 could then indicate any number of dependents from 3 upwards, while the other labels exactly match the data. Given this, we'll simply one-hot encode these as well.
- ApplicantIncome, CoapplicantIncome, LoanAmount and Loan_Amount_Term are all numeric and take only positive values, so we'll scale these using a MinMaxScaler().

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler

In [None]:
# One-hot encode the categorical features and scale the numerical ones to [0, 1].
# The encoder returns a sparse matrix by default, so the `sparse` flag is left
# unset: it was renamed to `sparse_output` in scikit-learn 1.2 and removed in
# 1.4, and passing it raises a TypeError on current releases.
# handle_unknown='ignore' encodes categories unseen during fit as all zeros
# instead of raising.
transformers = [('categorical', OneHotEncoder(handle_unknown='ignore'), categorical),
                ('numerical', MinMaxScaler(feature_range=(0,1)), numerical)]

# Any column not listed above is passed through unchanged.
col_transform = ColumnTransformer(transformers=transformers, remainder='passthrough')

### Modelling

We now test a suite of classifiers with default parameters using repeated stratified cross-validation with three repeats and three folds. We monitor both accuracy and F1 score to ensure that we have a good balance between overall accuracy, recall and precision.

In [None]:
from sklearn.metrics import (accuracy_score, fbeta_score, make_scorer, f1_score,
                             average_precision_score, get_scorer)

#Define metrics
# Accuracy and F1 are computed from hard class predictions.
accs = make_scorer(accuracy_score)
f1 = make_scorer(fbeta_score, beta=1)

# Average precision summarises the precision-recall curve, so it needs continuous
# scores (decision_function / predict_proba), not 0/1 predictions. Wrapping
# average_precision_score in a plain make_scorer would feed it the output of
# predict(); the built-in scorer requests the continuous scores instead.
aps = get_scorer('average_precision')

In [None]:
from sklearn.model_selection import RepeatedStratifiedKFold, GridSearchCV, cross_val_score, cross_validate
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

In [None]:
# Compare unweighted classifiers with repeated stratified cross-validation.
# Every stochastic estimator gets a fixed random_state so that Restart & Run All
# reproduces the same scores and plots.
classifiers = [
    ('LR', LogisticRegression(solver='lbfgs', max_iter=1500)),
    ('SVC', SVC(gamma='scale', max_iter=1500)),
    ('DT', DecisionTreeClassifier(max_depth=3, random_state=21)),
    ('RF', RandomForestClassifier(max_depth=3, random_state=21)),
    ('XGB', XGBClassifier(max_depth=3, random_state=21))
]

# 3 folds x 3 repeats = 9 fits per model
cv = RepeatedStratifiedKFold(n_splits=3, n_repeats=3, random_state=21)

results = []

for name, classifier in classifiers:
    # Preprocessing sits inside the pipeline so it is re-fitted on each training fold.
    pipeline = Pipeline(steps=[('preprocessing', col_transform), ('modelling', classifier)])

    # error_score='raise' surfaces a failed fit immediately instead of recording NaN.
    scores = cross_validate(pipeline, X_train, y_train,
                            scoring={'accuracy': accs, 'f1': f1, 'APS': aps},
                            cv=cv, n_jobs=-1, error_score='raise')
    result = pd.DataFrame(scores)
    result['model'] = name
    results.append(result)

# One row per (model, CV split); columns are fit/score times and test metrics.
final = pd.concat(results, ignore_index=True)
time_metrics = [col for col in final.columns if 'time' in col]
test_metrics = [col for col in final.columns if 'test' in col]

# Long format (model, metric, value) for seaborn.
final_test = pd.melt(final, id_vars=['model'], value_vars=test_metrics,
                     var_name='metric')
final_times = pd.melt(final, id_vars=['model'], value_vars=time_metrics,
                      var_name='metric')

#Plot results: metrics on the left, timings on the right
fig, ax = plt.subplots(ncols=2, figsize=(12,4))

sns.boxplot(x='model', y='value', data=final_test, hue='metric', palette='Set2', ax=ax[0])
sns.boxplot(x='model', y='value', data=final_times, hue='metric', palette='Set3', ax=ax[1])

ax[0].set_ylabel('Values')
ax[0].set_xlabel('Model')
ax[0].legend(bbox_to_anchor=(1.05, 1), loc='upper left', borderaxespad=0)
ax[0].set_title('Performance by model and metric')

ax[1].set_ylabel('Time')
ax[1].set_xlabel('Model')
ax[1].legend(bbox_to_anchor=(1.05, 1), loc='upper left', borderaxespad=0)
ax[1].set_title('Performance by model and fit/score time')

plt.tight_layout();


The models perform similarly in overall accuracy at about 80% (compared to a baseline of 69%) and it's difficult to distinguish visually which might be the best model, so we'll take a closer look at the numbers.

In [None]:
# Median of each cross-validation metric per model (unweighted classifiers),
# shown transposed so each (metric, model) pair is a column.
results_unbalanced = final_test.groupby(['metric', 'model']).agg('median')
results_unbalanced.transpose()

It's now easier to see that the random forest, logistic regression and SVC models perform equivalently, and in fact have exactly the same scores on some of the metrics.

Let's rerun the experiments with class weights balanced to see if the class imbalance is having a negative effect. For the XGBoost classifier, a good starting value for scale_pos_weight is the ratio of negative to positive examples in the dataset, which in our case is given as follows.

In [None]:
# Ratio of negative (label 0) to positive (label 1) training examples
class_counts = y_train.value_counts()
class_counts[0] / class_counts[1]

In [None]:
# Compare class-weighted classifiers with repeated stratified cross-validation.
# This uses the same CV seed (21) and the same iteration cap (1500) as the
# unweighted run, so both experiments see identical folds and solver budgets;
# any difference in scores then comes from the class weighting alone.

# Negative-to-positive ratio of the training labels, used as XGBoost's
# scale_pos_weight (computed here rather than hard-coded).
class_counts = y_train.value_counts()
neg_pos_ratio = float(class_counts[0] / class_counts[1])

# Stochastic estimators are seeded so that re-running reproduces the results.
classifiers = [
    ('LR', LogisticRegression(class_weight='balanced', solver='lbfgs', max_iter=1500)),
    ('SVC', SVC(gamma='scale', class_weight='balanced', max_iter=1500)),
    ('DT', DecisionTreeClassifier(class_weight='balanced', max_depth=3, random_state=21)),
    ('RF', RandomForestClassifier(class_weight='balanced', max_depth=3, random_state=21)),
    ('XGB', XGBClassifier(scale_pos_weight=neg_pos_ratio, max_depth=3, random_state=21))
]

# 3 folds x 3 repeats = 9 fits per model
cv = RepeatedStratifiedKFold(n_splits=3, n_repeats=3, random_state=21)

results = []

for name, classifier in classifiers:
    # Preprocessing sits inside the pipeline so it is re-fitted on each training fold.
    pipeline = Pipeline(steps=[('preprocessing', col_transform), ('modelling', classifier)])

    # error_score='raise' surfaces a failed fit immediately instead of recording NaN.
    scores = cross_validate(pipeline, X_train, y_train,
                            scoring={'accuracy': accs, 'f1': f1, 'APS': aps},
                            cv=cv, n_jobs=-1, error_score='raise')
    result = pd.DataFrame(scores)
    result['model'] = name
    results.append(result)

# One row per (model, CV split); columns are fit/score times and test metrics.
final = pd.concat(results, ignore_index=True)
time_metrics = [col for col in final.columns if 'time' in col]
test_metrics = [col for col in final.columns if 'test' in col]

# Long format (model, metric, value) for seaborn.
final_test = pd.melt(final, id_vars=['model'], value_vars=test_metrics,
                     var_name='metric')
final_times = pd.melt(final, id_vars=['model'], value_vars=time_metrics,
                      var_name='metric')

#Plot results: metrics on the left, timings on the right
fig, ax = plt.subplots(ncols=2, figsize=(12,4))

sns.boxplot(x='model', y='value', data=final_test, hue='metric', palette='Set2', ax=ax[0])
sns.boxplot(x='model', y='value', data=final_times, hue='metric', palette='Set3', ax=ax[1])

ax[0].set_ylabel('Values')
ax[0].set_xlabel('Model')
ax[0].legend(bbox_to_anchor=(1.05, 1), loc='upper left', borderaxespad=0)
ax[0].set_title('Performance by model and metric')

ax[1].set_ylabel('Time')
ax[1].set_xlabel('Model')
ax[1].legend(bbox_to_anchor=(1.05, 1), loc='upper left', borderaxespad=0)
ax[1].set_title('Performance by model and fit/score time')

plt.tight_layout();


In [None]:
# Median of each cross-validation metric per model (class-weighted classifiers),
# shown transposed so each (metric, model) pair is a column.
by_metric_and_model = final_test.groupby(['metric', 'model'])
results_balanced = by_metric_and_model.median()
results_balanced.T

We note that the cost-sensitive versions have a decrease in overall accuracy except for random forest, which is about the same. However, we should check the confusion matrices to evaluate how recall and precision vary for each.

We'll do this for the logistic regression, random forest and SVC models, since these all performed equivalently, at least in the non-cost-sensitive versions.

In [None]:
from sklearn.metrics import confusion_matrix, precision_recall_curve, ConfusionMatrixDisplay, classification_report

In [None]:
# Fit the three unweighted candidates on the training split and report
# test-set performance. The random forest is seeded so the report and
# confusion matrix are the same on every run.
models = [('LogisticRegression', LogisticRegression(solver='lbfgs', max_iter=1500)),
          ('SVC', SVC(gamma='scale', max_iter=1500)),
          ('RandomForest', RandomForestClassifier(max_depth=3, random_state=21))]

for title, model in models:
    pipe = Pipeline(steps=[('preprocessing', col_transform), (title.lower(), model)])
    pipe.fit(X_train, y_train)

    #Get predictions
    yhat = pipe.predict(X_test)

    acc = round(pipe.score(X_test, y_test), 3)
    fscore = round(f1_score(y_test, yhat), 3)
    # NOTE(review): average precision is computed from hard 0/1 predictions here,
    # not from continuous scores, so it reflects a single operating point only.
    apscore = round(average_precision_score(y_test, yhat), 3)

    # labels=[1,0] lists the positive class first in the report and the matrix.
    print(title + '\n', classification_report(y_test, yhat, labels=[1,0]), "\n")

    #Display confusion matrix
    disp = ConfusionMatrixDisplay(confusion_matrix(y_test, yhat, labels=[1,0]), display_labels=[1,0])
    disp.plot(cmap=plt.cm.Blues)
    # Title the display's own axes explicitly instead of relying on pyplot's
    # notion of the current axes.
    disp.ax_.set_title(f'{title}\n Accuracy: {acc}; F1: {fscore}; APS: {apscore}');

Interestingly, all three perform exactly the same, which is perhaps unsurprising given that the accuracy and F1 scores from cross-validation are the same in all three cases. We also note that all scores decrease from the training stage, which makes sense, and that recall is poor and precision is perfect (there are 25 false negatives and no false positives), i.e. the models are good at predicting loans that will be approved (the majority class) and poor at predicting if they will be rejected. Given that it is often more important to identify whether loans will be rejected than when they will be accepted, we can assume that the recall could be improved at the expense of some precision. 

In [None]:
# Reuse the preprocessor and the three estimators fitted in the previous cell:
# transform the test features once, then predict with each model directly.
X_test_prepared = col_transform.transform(X_test)
yhat_lr, yhat_svc, yhat_rf = (fitted.predict(X_test_prepared) for _, fitted in models)

# One line per pair: True when the two models agree on every test row.
for first, second in ((yhat_lr, yhat_svc), (yhat_lr, yhat_rf), (yhat_rf, yhat_svc)):
    print(np.array_equal(first, second))

So the models made exactly the same predictions.

Given that they perform the same and the dataset is small enough that computation time isn't an issue, we'll gridsearch all three to obtain the optimum hyperparameters. We should also check the cost-sensitive versions to see if recall is any better.

In [None]:
# Fit the three class-weighted candidates on the training split and report
# test-set performance. The random forest is seeded so the report and
# confusion matrix are the same on every run.
models = [('LogisticRegression', LogisticRegression(class_weight='balanced', solver='lbfgs', max_iter=1500)),
          ('SVC', SVC(class_weight='balanced', gamma='scale', max_iter=1500)),
          ('RandomForest', RandomForestClassifier(class_weight='balanced', max_depth=3, random_state=21))]

for title, model in models:
    pipe = Pipeline(steps=[('preprocessing', col_transform), (title.lower(), model)])
    pipe.fit(X_train, y_train)

    #Get predictions
    yhat = pipe.predict(X_test)

    acc = round(pipe.score(X_test, y_test), 3)
    fscore = round(f1_score(y_test, yhat), 3)
    # NOTE(review): average precision is computed from hard 0/1 predictions here,
    # not from continuous scores, so it reflects a single operating point only.
    apscore = round(average_precision_score(y_test, yhat), 3)

    # labels=[1,0] lists the positive class first in the report and the matrix.
    print(title + '\n', classification_report(y_test, yhat, labels=[1,0]), "\n")

    #Display confusion matrix
    disp = ConfusionMatrixDisplay(confusion_matrix(y_test, yhat, labels=[1,0]), display_labels=[1,0])
    disp.plot(cmap=plt.cm.Blues)
    # Title the display's own axes explicitly instead of relying on pyplot's
    # notion of the current axes.
    disp.ax_.set_title(f'{title}\n Accuracy: {acc}; F1: {fscore}; APS: {apscore}');

Before running the grid search, we'll explore some feature engineering to see if we can improve performance.

### Power transforms of numerical features

Some ML algorithms (e.g. logistic regression) assume that the input data is approximately normally distributed; if it is not, this can negatively affect performance. We can bring numerical distributions closer to normal by applying a power transform in the preprocessing step. By default, this also includes standardization after the transform is applied, but we'll forgo this and apply a MinMaxScaler() for consistency with our previous process. Note that we have omitted the loan term from this process and simply scaled it, since it is a kind of hybrid categorical/numerical feature. 

In [None]:
from sklearn.preprocessing import PowerTransformer

In [None]:
#numerical transforms: Yeo-Johnson power transform without its built-in
#standardisation, followed by scaling to [0, 1]
num_transformer = Pipeline(steps=[('power', PowerTransformer(method='yeo-johnson', standardize=False)),
                                  ('scaler', MinMaxScaler(feature_range=(0,1)))])

#categorical transforms
# The encoder returns a sparse matrix by default, so the `sparse` flag is left
# unset: it was renamed to `sparse_output` in scikit-learn 1.2 and removed in
# 1.4, and passing it raises a TypeError on current releases.
cat_transformer = OneHotEncoder(handle_unknown='ignore')

#transformer
# numerical[:-1] gets the power transform; numerical[-1] (the loan term) is only
# scaled. This rebinds `col_transform`, so later cells use this preprocessor.
col_transform = ColumnTransformer(transformers=[('categorical', cat_transformer, categorical),
                                                ('numerical', num_transformer, numerical[:-1]),
                                                ('scaler', MinMaxScaler(feature_range=(0,1)), [numerical[-1]])],
                                  remainder='passthrough')

In [None]:
# Re-evaluate the class-weighted candidates on the test split using whichever
# `col_transform` is currently defined. The random forest is seeded so the
# report and confusion matrix are the same on every run.
models = [('LogisticRegression', LogisticRegression(class_weight='balanced', solver='lbfgs', max_iter=1500)),
          ('SVC', SVC(class_weight='balanced', gamma='scale', max_iter=1500)),
          ('RandomForest', RandomForestClassifier(class_weight='balanced', max_depth=3, random_state=21))]

for title, model in models:
    pipe = Pipeline(steps=[('preprocessing', col_transform), (title.lower(), model)])
    pipe.fit(X_train, y_train)

    #Get predictions
    yhat = pipe.predict(X_test)

    acc = round(pipe.score(X_test, y_test), 3)
    fscore = round(f1_score(y_test, yhat), 3)
    # NOTE(review): average precision is computed from hard 0/1 predictions here,
    # not from continuous scores, so it reflects a single operating point only.
    apscore = round(average_precision_score(y_test, yhat), 3)

    # labels=[1,0] lists the positive class first in the report and the matrix.
    print(title + '\n', classification_report(y_test, yhat, labels=[1,0]), "\n")

    #Display confusion matrix
    disp = ConfusionMatrixDisplay(confusion_matrix(y_test, yhat, labels=[1,0]), display_labels=[1,0])
    disp.plot(cmap=plt.cm.Blues)
    # Title the display's own axes explicitly instead of relying on pyplot's
    # notion of the current axes.
    disp.ax_.set_title(f'{title}\n Accuracy: {acc}; F1: {fscore}; APS: {apscore}');

## Gridsearch

We'll gridsearch over hyperparameters for all three models and compare performance; given that we established earlier that the class imbalance isn't causing too many problems, we'll use the area under the ROC curve as an appropriate metric for performance evaluation.

### Logistic regression

We define a parameter grid, tune the hyperparameters and output the confusion matrix and scores to check performance.

In [None]:
#Instantiate pipeline
# random_state fixes the shuffling used by the liblinear / sag / saga solvers so
# the search is repeatable.
pipeline = Pipeline(steps=[('preprocessing', col_transform),
                           ('lr', LogisticRegression(random_state=1))])

C = np.logspace(-3, 2, 10)
max_iter = [500, 700, 1000, 1200]
# NOTE(review): label 1 is the minority class, yet every non-uniform option
# below up-weights class 0 -- confirm this is the intended direction.
class_weight = [{0:10,1:1}, {0:1,1:1}, {0:7,1:3}]

# define grid search
# Not every solver supports every penalty: lbfgs and sag only accept l2, while
# liblinear and saga accept both l1 and l2. Splitting the grid keeps only valid
# combinations, so no fits are wasted on configurations that raise and are
# scored as NaN.
shared = dict(lr__C=C,
              lr__max_iter=max_iter,
              lr__class_weight=class_weight)
grid = [
    dict(shared, lr__solver=['lbfgs', 'sag'], lr__penalty=['l2']),
    dict(shared, lr__solver=['liblinear', 'saga'], lr__penalty=['l1', 'l2']),
]

cv = RepeatedStratifiedKFold(n_splits=3, n_repeats=3, random_state=1)

grid_search = GridSearchCV(estimator=pipeline,
                           param_grid=grid,
                           n_jobs=-1,
                           cv=cv,
                           scoring='roc_auc',
                           verbose=2)

grid_result_lr = grid_search.fit(X_train, y_train)

# summarize results
print(f"Best: {grid_result_lr.best_score_} using {grid_result_lr.best_params_}")
print(f'Best estimator: {grid_result_lr.best_estimator_}')
means = grid_result_lr.cv_results_['mean_test_score']
stds = grid_result_lr.cv_results_['std_test_score']
params = grid_result_lr.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print(f"{mean} ({stdev}) with: {param}")

In [None]:
# Score the tuned logistic-regression pipeline on the held-out test split.
lr_best = grid_result_lr.best_estimator_
yhat = lr_best.predict(X_test)

# Accuracy, F1 and average precision, each rounded to three decimals.
acc, fscore, apscore = (
    round(score, 3)
    for score in (
        lr_best.score(X_test, y_test),
        f1_score(y_test, yhat),
        average_precision_score(y_test, yhat),
    )
)

print(acc, fscore, apscore, sep='\n')

In [None]:
# Display the best pipeline found by the logistic-regression grid search.
lr_best

In [None]:
# Cross-validation medians of the unweighted logistic regression, for comparison.
x = results_unbalanced.reset_index()
x.query("model == 'LR'")

### Random Forest

In [None]:
#Instantiate pipeline
# The forest is seeded: without a fixed random_state each fit draws different
# bootstrap samples and feature subsets, so the scores (and the selected
# hyperparameters) would change from run to run.
pipeline = Pipeline(steps=[('preprocessing', col_transform),
                           ('rf', RandomForestClassifier(random_state=1))])

bootstrap = [True]
max_depth = [9, 10, 20]
max_features = [2, 3, 4]
min_samples_leaf = [3, 4, 5]
min_samples_split = [8, 10, 12]
n_estimators = [50, 100, 500]
# NOTE(review): label 1 is the minority class, yet every non-uniform option
# below up-weights class 0 -- confirm this is the intended direction.
class_weight = [{0:10,1:1}, {0:1,1:1}, {0:2,1:1}, {0:7,1:3}]

# define grid search
grid = dict(rf__bootstrap=bootstrap,
            rf__max_depth=max_depth,
            rf__max_features=max_features,
            rf__min_samples_leaf=min_samples_leaf,
            rf__min_samples_split=min_samples_split,
            rf__n_estimators=n_estimators,
            rf__class_weight=class_weight)

cv = RepeatedStratifiedKFold(n_splits=3, n_repeats=3, random_state=1)

grid_search = GridSearchCV(estimator=pipeline,
                           param_grid=grid,
                           n_jobs=-1,
                           cv=cv,
                           scoring='roc_auc',
                           verbose=2)

grid_result_rf = grid_search.fit(X_train, y_train)

# summarize results
print(f"Best: {grid_result_rf.best_score_} using {grid_result_rf.best_params_}")
print(f'Best estimator: {grid_result_rf.best_estimator_}')
means = grid_result_rf.cv_results_['mean_test_score']
stds = grid_result_rf.cv_results_['std_test_score']
params = grid_result_rf.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print(f"{mean} ({stdev}) with: {param}")

In [None]:
# Score the tuned random-forest pipeline on the held-out test split.
rf_best = grid_result_rf.best_estimator_
yhat = rf_best.predict(X_test)

# Accuracy, F1 and average precision, each rounded to three decimals.
acc, fscore, apscore = (
    round(score, 3)
    for score in (
        rf_best.score(X_test, y_test),
        f1_score(y_test, yhat),
        average_precision_score(y_test, yhat),
    )
)

print(acc, fscore, apscore, sep='\n')