# Classification trials

In [None]:
%load_ext lab_black
%load_ext autoreload
%autoreload 2

In [None]:
import os
from io import StringIO

import numpy as np
import pandas as pd
import sklearn.metrics as mr
from azure.storage.blob import BlobServiceClient
from imblearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.dummy import DummyClassifier
from sklearn.ensemble import (
    AdaBoostClassifier,
    GradientBoostingClassifier,
    RandomForestClassifier,
)
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import (
    RepeatedKFold,
    StratifiedKFold,
    train_test_split,
)
from sklearn.naive_bayes import GaussianNB, BernoulliNB, MultinomialNB
from sklearn.preprocessing import (
    OneHotEncoder,
    StandardScaler,
)

In [None]:
%aimport src.custom_transformers
import src.custom_transformers as ct

%aimport src.ml_helpers
from src.ml_helpers import base_pipeline, multi_model_grid_search

%aimport src.ml_metrics
from src.ml_metrics import (
    recall_binary_scorer,
    threshold_fpr_score,
    threshold_recall_score,
    threshold_auc_score,
)

%aimport src.visualization_helpers
from src.visualization_helpers import (
    plot_learning_curve,
    builtin_plot_permutation_importances,
    plot_cross_validated_coefs,
    show_yb_grid,
    plot_grouped_bar_chart,
    plot_grouped_histogram,
)

In [None]:
# Widen pandas' display limits so the wide grid-search summary tables
# shown later in the notebook are not truncated.
pandas_display_options = {
    "display.max_rows": 500,
    "display.max_columns": 500,
    "display.width": 1000,
}
for option_name, option_value in pandas_display_options.items():
    pd.set_option(option_name, option_value)

## About

This notebook will walk through machine learning classification experiments on the raw [Lending Club loans data](https://www.lendingclub.com/auth/login?login_url=%2Fstatistics%2Fadditional-statistics%3F). A best-performing model will then be determined and assessed in the context of the business use case for this project, i.e. a conservative investor wanting to use the model developed here to predict which loan applications on the [Lending Club platform](https://www.lendingclub.com/) will not [default](https://en.wikipedia.org/wiki/Default_(finance)), and therefore should be funded by them.

## User Inputs

User inputs and helper functions, to be used later, are defined below

In [None]:
# --- Data source ---
# Local CSV that is read when cloud_storage is anything other than "yes".
raw_data_file_path = "data/raw/lending_club_loans.csv"
# "yes" downloads the raw data from Azure blob storage instead of the local file.
cloud_storage = "no"

# From Feature Reduction
# Float in [0, 1.0] controlling which mostly-missing columns are dropped.
nan_threshold = 0.5
# Column groups below are each removed by a DFColumnDropper step during processing.
non_useful_cols = ["url", "desc"]
datetime_cols = ["issue_d", "last_pymnt_d"]
cols_one_eighteen = [
    "id",
    "member_id",
    "funded_amnt",
    "funded_amnt_inv",
    "grade",
    "sub_grade",
    "emp_title",
]
cols_eighteen_thirtysix = [
    "zip_code",
    "out_prncp",
    "out_prncp_inv",
    "total_pymnt",
    "total_pymnt_inv",
    "total_rec_prncp",
]
cols_thirtyseven_end = [
    "total_rec_int",
    "total_rec_late_fee",
    "recoveries",
    "collection_recovery_fee",
    "last_pymnt_amnt",
]
# Only rows whose loan_status is one of these values are kept.
loan_status = ["Fully Paid", "Charged Off"]
# Initial numeric encoding of the raw label (re-mapped to is_default later on).
mapping_dictionary_labels = {"loan_status": {"Fully Paid": 1, "Charged Off": 0}}
four_or_less_value_columns = ["pymnt_plan"]

# From Feature Processing
more_than_one_pct_missing_columns = ["pub_rec_bankruptcies"]
datetime_cols_v2 = ["last_credit_pull_d", "earliest_cr_line"]
high_cardinality_cols = ["addr_state"]
# Text -> integer years of employment; note that "n/a" shares the value 0 with "< 1 year".
mapping_dict_emp_length = {
    "emp_length": {
        "10+ years": 10,
        "9 years": 9,
        "8 years": 8,
        "7 years": 7,
        "6 years": 6,
        "5 years": 5,
        "4 years": 4,
        "3 years": 3,
        "2 years": 2,
        "1 year": 1,
        "< 1 year": 0,
        "n/a": 0,
    }
}
# NOTE(review): nominal_columns is reassigned further down from the dtypes of the
# processed training data, so this initial value appears to be unused - confirm.
nominal_columns = ["home_ownership", "verification_status", "purpose", "term"]
repeated_data_cols = ["title"]
# Columns whose values carry a "%" suffix and are converted to numbers.
pct_to_numeric_cols = ["int_rate", "revol_util"]

# From Exploratory Data Analysis 1/2
correlated_features = [
    # "total_acc",
    "installment",
    "fico_range_low",
    "fico_range_high",
]
look_ahead_features = ["last_fico_range_low", "last_fico_range_high"]
raw_labels = ["loan_status"]
new_labels = ["is_default"]

# Columns (and their order) selected from each grid-search summary for display.
cols_to_show = [
    "preprocessor_type",
    "resamplers",
    "clf",
    "threshold",
    "params",
    "mean_test_recall_binary",
    "mean_test_fpr",
    "mean_test_auc",
    "mean_train_recall_binary",
    "mean_train_fpr",
    "mean_train_auc",
    "mean_fit_time",
    "std_train_recall_binary",
    "std_test_recall_binary",
    "std_train_fpr",
    "std_test_fpr",
    "mean_score_time",
    "clf_params",
]

In [None]:
# Coerce user inputs to the Python types the processing steps expect
# (useful when the values above are injected as strings).
nan_threshold = float(nan_threshold)

# Cast every mapped value to int, in place. The keys are taken from the
# dictionaries themselves rather than from a second hard-coded list, so a key
# added to (or removed from) a mapping can never be missed here.
for value_mapping in (
    mapping_dictionary_labels["loan_status"],
    mapping_dict_emp_length["emp_length"],
):
    for k in list(value_mapping):
        value_mapping[k] = int(value_mapping[k])

# From Exploratory Data Analysis 2/2
# Flips the initial label encoding so that 1 means "defaulted" (Charged Off).
mapping_dict_new_labels = {"is_default": {0: 1, 1: 0}}

## Load data

Raw data from Lending Club is loaded into memory

In [None]:
if cloud_storage == "yes":
    # Download the raw CSV from Azure blob storage. Credentials are read from
    # environment variables so that no secrets live in the notebook.
    az_storage_container_name = "myconedesx7"
    blob_name = "blobedesz38"

    # Fail early with a clear message: os.getenv() returns None for an unset
    # variable, which would otherwise be formatted into the connection string
    # as the text "None" and surface later as a confusing authentication error.
    azure_env_vars = ["AZURE_STORAGE_ACCOUNT", "AZURE_STORAGE_KEY", "ENDPOINT_SUFFIX"]
    missing_env_vars = [v for v in azure_env_vars if not os.getenv(v)]
    if missing_env_vars:
        raise EnvironmentError(
            "Missing environment variable(s) required for cloud storage: "
            + ", ".join(missing_env_vars)
        )

    conn_str = (
        "DefaultEndpointsProtocol=https;"
        f"AccountName={os.getenv('AZURE_STORAGE_ACCOUNT')};"
        f"AccountKey={os.getenv('AZURE_STORAGE_KEY')};"
        f"EndpointSuffix={os.getenv('ENDPOINT_SUFFIX')}"
    )
    blob_service_client = BlobServiceClient.from_connection_string(conn_str=conn_str)
    blob_client = blob_service_client.get_blob_client(
        container=az_storage_container_name, blob=blob_name
    )
    blobstring = blob_client.download_blob().content_as_text()
    # NOTE(review): the blob is read with skiprows=1 while the local file uses
    # skiprows=0 - presumably the blob carries an extra leading line; confirm.
    df = pd.read_csv(StringIO(blobstring), skiprows=1, low_memory=False)
else:
    # Read the raw data from the local file system.
    df = pd.read_csv(raw_data_file_path, skiprows=0, low_memory=False)

## Train-Test split

A hold-out set of the raw data will be set aside for model assessment

In [None]:
# Hold out 33% of the raw rows for final model assessment; the fixed
# random_state keeps the split reproducible across runs.
df_train, df_test = train_test_split(df, test_size=0.33, random_state=4321)

## Data processing

The raw data is cleaned, filtered and features are extracted/selected from this processed data in order to run experiments in classification

In [None]:
# Cleaning / feature-selection pipeline. It is fitted on the training split and
# then re-applied, unchanged, to the held-out split.
pipe_1_2_3 = Pipeline(
    [
        # nan_threshold (user input) is a float in [0, 1.0] - CAN CHANGE OR KEEP UNCHANGED.
        # The user-input variable is used here (rather than a literal 0.5) so that
        # changing it in the "User Inputs" section actually takes effect.
        ("nanthresh", ct.DFNanThresholdColumnDropper(nan_threshold)),
        ("nouse", ct.DFColumnDropper(non_useful_cols)),
        ("dtime", ct.DFColumnDropper(datetime_cols)),
        ("c1", ct.DFColumnDropper(cols_one_eighteen)),
        ("c2", ct.DFColumnDropper(cols_eighteen_thirtysix)),
        ("c3", ct.DFColumnDropper(cols_thirtyseven_end)),
        (
            "mapstatus",
            ct.DFColumnFilterList("loan_status", loan_status),
        ),
        ("colmap", ct.DFColumnMapper(mapping_dictionary_labels)),
        ("onevals", ct.DFNonUniqueValColDropper(num_non_unique_vals=1)),
        # four_or_less_value_columns=['pymnt_plan'] - 'pymnt_plan' is MOSTLY 'n', EXCLUDE
        ("fourvals", ct.DFColumnDropper(four_or_less_value_columns)),
        # more_than_one_pct_missing_columns = ['pub_rec_bankruptcies']
        ("morethan1pctnan", ct.DFColumnDropper(more_than_one_pct_missing_columns)),
        ("nan", ct.DFDropNaN()),
        # high_cardinality_cols=['addr_state'] - CAN INCLUDE OR EXCLUDE
        ("hcardcols", ct.DFColumnDropper(high_cardinality_cols)),
        ("dtimev2", ct.DFColumnDropper(datetime_cols_v2)),
        ("texttonum", ct.DFColumnMapper(mapping_dict_emp_length)),
        ("repeats", ct.DFColumnDropper(repeated_data_cols)),
        ("pctcols", ct.DFPctNumeric(pct_to_numeric_cols, "%")),
        (
            "singlecolmap",
            ct.DFSingleColumnMapper("loan_status", mapping_dict_new_labels),
        ),
        ("dtype", ct.DFSimpleDtypeChanger(new_labels, "int")),
        # n_std is an integer - CAN CHANGE OR KEEP UNCHANGED
        ("stdfilter", ct.DFColumnStdFilter("annual_inc", 3)),
        # correlated_features=['installment','fico_range_low','fico_range_high'] EXCLUDE
        # ('total_acc' is commented out of the list, so it is currently kept)
        ("corr", ct.DFColumnDropper(correlated_features)),
        ("lookahead", ct.DFColumnDropper(look_ahead_features)),
        ("label", ct.DFColumnDropper(raw_labels)),
        # requires ("corr", ...) to be removed; threshold=1, ideally chosen from dendogram
        # ("clusterselect", ct.DFHierarchicalClusterSpearmanRank(threshold=1)),
    ]
)
df_pipe_transformed_train = pipe_1_2_3.fit_transform(df_train)
df_pipe_transformed_test = pipe_1_2_3.transform(df_test)
print(df_pipe_transformed_train.shape)
print(df_pipe_transformed_test.shape)
display(df_pipe_transformed_train.head(2))
display(df_pipe_transformed_test.head(2))
# Class balance of the new label in the processed training split.
display(df_pipe_transformed_train["is_default"].squeeze().value_counts(dropna=False))

## Get features and class labels from processed data

The features and labels are now extracted from the processed training and testing splits

In [None]:
# Separate each processed split into a feature frame (every column except the
# label) and an integer label series.
X_train = df_pipe_transformed_train.drop(columns=new_labels)
X_test = df_pipe_transformed_test.drop(columns=new_labels)
y_train = df_pipe_transformed_train[new_labels].astype(int).squeeze()
y_test = df_pipe_transformed_test[new_labels].astype(int).squeeze()

# Preview the first two rows of each object, features first and then labels.
display(
    X_train.head(2),
    X_test.head(2),
    y_train.to_frame().head(2),
    y_test.to_frame().head(2),
)

## Assemble components for `GridSearchCV`

The pipeline inputs and other components required for hyperparameter optimization using `GridSearchCV` are extracted here

### Lists of features by type

A list of numerical and categorical features is extracted from the processed data
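- numerical features are those whose datatype is not `object` (with the exception of `emp_length`)
- categorical features are those with a datatype of `object`, plus `emp_length` (which is numerically encoded but treated as a category)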

In [None]:
# emp_length is stored as a number after mapping but is handled as a
# categorical (one-hot encoded) feature, so it is excluded from the numericals.
non_numerical_names = set(new_labels) | {"emp_length"}
numerical_columns = [
    c
    for c in list(X_train.select_dtypes(exclude="object"))
    if c not in non_numerical_names
]
nominal_columns = list(X_train.select_dtypes(include="object")) + ["emp_length"]

# Sanity check: together, the two lists must cover every feature column.
# A plain conditional is used instead of try/assert because assert statements
# are removed when Python runs with optimizations (-O) enabled.
if set(numerical_columns + nominal_columns) == set(X_train) - set(new_labels):
    print("Columns from training data match feature lists")
else:
    print("Some columns from training data are missing from feature lists")
print("Categoricals:\n-" + "\n-".join(nominal_columns))
print("Numericals:\n-" + "\n-".join(numerical_columns))

### Feature transformations

Below, feature transformations to be applied to all numerical columns are defined. With or without transformations, all numerical features will be normalized. All categorical features will be one-hot encoded.

In [None]:
# One independent (power transform -> standard scaling) pipeline per numerical
# column, keyed by the column name.
col_transformers = {
    column_name: Pipeline(
        steps=[
            ("trans", ct.DFPowerTransformer("yeo-johnson")),
            ("ss", ct.DFStandardScaler()),
        ]
    )
    for column_name in numerical_columns
}

# "no_trans": all numerical columns are only standard-scaled, as one group.
scaled_numericals_step = (
    "nums",
    Pipeline(steps=[("trans", StandardScaler())]),
    numerical_columns,
)
# "trans": every numerical column is routed through its own pipeline from above.
per_column_steps = [
    (column_name, column_pipe, [column_name])
    for column_name, column_pipe in col_transformers.items()
]

# Both preprocessors one-hot encode the categorical columns (each with its own
# encoder instance) and pass any remaining column through untouched.
preprocessors = {
    "no_trans": ColumnTransformer(
        transformers=[
            scaled_numericals_step,
            ("onehot", OneHotEncoder(handle_unknown="ignore"), nominal_columns),
        ],
        remainder="passthrough",
    ),
    "trans": ColumnTransformer(
        transformers=per_column_steps
        + [("onehot", OneHotEncoder(handle_unknown="ignore"), nominal_columns)],
        remainder="passthrough",
    ),
}

### Classifiers and hyper-parameters for optimization

The models to be compared, discrimination threshold(s) (to be applied to all listed models), and dictionaries of model hyper-parameters for tuning, are defined below
- for hyper-parameter dictionaries containing cost-function weights, for manual specification of the penalties used in the algorithm's cost function, the larger penalty should be assigned to the minority class (see [**Lesson 07. Cost-Sensitive Algorithms**](https://machinelearningmastery.com/imbalanced-classification-with-python-7-day-mini-course/))
  - here, this is for the positive class where `is_default`==1, and is explicitly shown below through the class-balance of the labels from the training data

In [None]:
# Class balance of the training labels: number of rows per is_default value.
y_train.value_counts().to_frame()

In [None]:
# Estimators to tune. Commented-out entries are disabled for this run.
clf_list = [
    # DummyClassifier(strategy="stratified", random_state=42),
    LogisticRegression(penalty="l2", solver="lbfgs", max_iter=500),
    # RandomForestClassifier(n_jobs=-1, random_state=42),
    # AdaBoostClassifier(random_state=42),
    # GaussianNB(),
    # BernoulliNB(),
]
# Discrimination thresholds to try for every model (0.5 is the default).
thresholds = [0.4, 0.5]
# Hyper-parameter grids keyed by estimator class name.
# NOTE(review): presumably multi_model_grid_search looks up each grid by the
# estimator's class name - confirm against src.ml_helpers.
parameters = {
    "DummyClassifier": {"strategy": ["stratified", "most_frequent", "uniform"]},
    "LogisticRegression": {
        "C": [1.0],
        # Manual weights put the larger penalty on class 1 (is_default == 1).
        "class_weight": [
            "balanced",
            {0: 1, 1: 5},
        ],
    },
    "RandomForestClassifier": {
        "n_estimators": [500],
        "min_samples_split": [10],
        "class_weight": [{0: 1, 1: 5}, "balanced"],
    },
    "AdaBoostClassifier": {"n_estimators": [500]},
    "GaussianNB": {"priors": [None]},
    "BernoulliNB": {"fit_prior": [True, False], "class_prior": [None]},
}

In [None]:
# installment, total_acc

## Run `GridSearchCV` without transformed features

Hyper-parameter optimization is performed without numerical feature transformations
- normalization (numerical) and one-hot encoding (categorical) are applied to the features before classification

Below, this is done while varying the discrimination threshold, including the default threshold of `0.5`

In [None]:
# Grid search over clf_list with the "no_trans" preprocessor and the custom
# discrimination thresholds; only the cols_to_show columns of the result are kept.
# NOTE(review): the positional False flag presumably selects the
# threshold-aware scoring path - confirm against src.ml_helpers.
df_gs_summary_no_trans = multi_model_grid_search(
    clf_list, X_train, y_train, preprocessors, parameters, thresholds, False, "no_trans"
)[cols_to_show]
display(df_gs_summary_no_trans)

Below, this is done without varying the discrimination threshold, and so only uses the default threshold of `0.5`

In [None]:
# Same grid search, but with an empty threshold list and the positional flag set
# to True, so that only the default threshold of 0.5 is in play.
# NOTE(review): the exact meaning of the True flag is defined in
# src.ml_helpers.multi_model_grid_search - confirm there.
df_gs_summary_no_trans_no_custom_threshold = multi_model_grid_search(
    clf_list, X_train, y_train, preprocessors, parameters, [], True, "no_trans"
)[cols_to_show]
display(df_gs_summary_no_trans_no_custom_threshold)

Here, it is shown that, for the case of a default threshold of `0.5`, both approaches are equivalent. So, the approach of varying the discrimination threshold, for non-default values higher or lower than `0.5`, can be used in further analysis

In [None]:
# Fit/score timings differ from run to run, so they are excluded from the comparison.
timing_columns = ["mean_fit_time", "mean_score_time"]

# Threshold-aware results, restricted to the default threshold of 0.5.
summary_at_default_threshold = (
    df_gs_summary_no_trans[df_gs_summary_no_trans["threshold"] == 0.5]
    .reset_index(drop=True)
    .drop(columns=timing_columns)
)
# Results from the run that never varied the threshold.
summary_threshold_agnostic = df_gs_summary_no_trans_no_custom_threshold.drop(
    columns=timing_columns
)

# A plain conditional replaces the previous try/assert construct: assert
# statements are removed when Python runs with optimizations (-O) enabled,
# which would make the check always report agreement.
if summary_at_default_threshold.equals(summary_threshold_agnostic):
    print(
        "Agreement between threshold sensitive and threshold agnostic "
        "approaches for default discrimination threshold of 0.5."
    )
else:
    print(
        "Disagreement between threshold sensitive and threshold agnostic "
        "approaches for default discrimination threshold of 0.5."
    )

## Run `GridSearchCV` with transformed features

Hyper-parameter optimization is performed with numerical feature transformations
- normalization (numerical) and one-hot encoding (categorical) are applied to the features before classification

In [None]:
# Grid search over clf_list with the "trans" preprocessor (per-column numerical
# transformations) and the custom discrimination thresholds; only the
# cols_to_show columns of the result are kept.
df_gs_summary_trans = multi_model_grid_search(
    clf_list, X_train, y_train, preprocessors, parameters, thresholds, False, "trans"
)[cols_to_show]
display(df_gs_summary_trans)

## Comparison of multi-model `GridSearchCV`

The combined results of the multi-model hyperparameter optimization, with and without feature transformations, are shown below

In [None]:
# Stack the summaries obtained with and without feature transformations.
stacked_gs_summaries = pd.concat([df_gs_summary_trans, df_gs_summary_no_trans])

# Rank the configurations: recall descending, then FPR ascending, then AUC
# descending; finally renumber the rows from zero.
df_gs_summary = stacked_gs_summaries.sort_values(
    by=["mean_test_recall_binary", "mean_test_fpr", "mean_test_auc"],
    ascending=[False, True, False],
)
df_gs_summary = df_gs_summary.reset_index(drop=True)
display(df_gs_summary)

**Observations**
1. The best `TPR` and `FPR` are approximately 66% and 39% respectively. If both types of loans are arbitrarily funded, the relative benefit of the higher `TPR` is that the best model found here will be better at avoiding loans that result in a default (and loss of money) at the expense of not allowing 34% of the total available loans that will not default (`FPR`) to be funded. These (`FPR`, 34%) are missed opportunities to earn returns and there is a stronger case for the `TPR` (64%) to be higher, than for the `FPR` to be lower, and thus save the business user (a risk-averse investor) money. In practice, further discussion with the end user (the investor) is warranted in order to determine their risk tolerance before picking between these two metrics.
2. (Training or validation) Scores with and without feature transformations are minimally different.
3. Since a shorter duration is required for training without transformations, a pipeline with no feature transformations (`preprocessor_type="no_trans"`) will be selected.

### Extract components of best pipeline

Below, the following four components of the pipelines tried
- name of model
- model hyper-parameters
- type of preprocessor
- discrimination threshold

are extracted from the best chosen pipeline, while keeping in mind the objectives of the business use case (i.e. maximize recall, minimize `FPR`)

In [None]:
# Row filter describing the chosen configuration: untransformed features,
# logistic regression with balanced class weights, default threshold of 0.5.
best_cfg_mask = (
    (df_gs_summary["preprocessor_type"] == "no_trans")
    & (df_gs_summary["clf"] == "LogisticRegression")
    # regex=False: "'balanced'" is a literal substring, not a pattern
    & df_gs_summary["params"].astype(str).str.contains("'balanced'", regex=False)
    & (df_gs_summary["threshold"] == 0.5)
)
matching_cfg_index = df_gs_summary.loc[best_cfg_mask].index
if len(matching_cfg_index) == 0:
    # Same exception type as taking .index[0] of an empty selection, but with
    # a message that explains what went wrong.
    raise IndexError(
        "No grid-search result matches the chosen best configuration "
        "(no_trans / LogisticRegression / balanced class weights / threshold 0.5)."
    )
# The summary is sorted best-first, so the first match is the one to keep.
best_cfg_idx = matching_cfg_index[0]
print(f"Best pipeline configuration occurs at row index: {best_cfg_idx}")
display(df_gs_summary.loc[best_cfg_idx].to_frame())

Below, the extracted pipeline components are assigned to variables for future use

In [None]:
# Pull the four components of the chosen configuration out of its summary row,
# in the same order as the variables they are unpacked into.
best_cfg_fields = ["clf", "params", "threshold", "preprocessor_type"]
best_cfg_values = df_gs_summary.loc[best_cfg_idx, best_cfg_fields].tolist()
best_clf_name, best_pipe_params_dict, best_threshold, best_preprocessor_type = (
    best_cfg_values
)

### Assemble best pipeline

A new pipeline is instantiated using the extracted
- model
- model hyper-parameters
- preprocessor

from the best pipeline chosen above

In [None]:
# Strip the pipeline-step prefix ("<step>__") from each tuned parameter name so
# the values can be passed straight to the estimator's constructor.
# - split("__", 1) removes only the first prefix, so nested parameter names
#   that themselves contain "__" are preserved in full
# - [-1] leaves names without a prefix unchanged, so re-running this cell
#   (after the dictionary has already been rewritten) no longer fails
best_pipe_params_dict = {
    name.split("__", 1)[-1]: value for name, value in best_pipe_params_dict.items()
}
# Re-apply the fixed (non-tuned) settings that were used during the grid search.
best_pipe_params_dict.update(dict(penalty="l2", solver="lbfgs", max_iter=500))
# The chosen configuration is a logistic regression (see the filter used to
# select the best configuration), so that class is instantiated directly.
clf = LogisticRegression(**best_pipe_params_dict)
pipe = base_pipeline(preprocessors[best_preprocessor_type])
# Pipeline steps are (name, estimator) tuples.
pipe.steps.append(("clf", clf))

New scorers are instantiated using the extracted discrimination threshold from the best pipeline chosen above

In [None]:
# Metric function and optimization direction for each scorer name. FPR is the
# only metric for which a smaller value is better.
scorer_specs = {
    "recall_binary": (threshold_recall_score, True),
    "fpr": (threshold_fpr_score, False),
    "auc": (threshold_auc_score, True),
}
# All scorers are computed from predicted probabilities and share the
# discrimination threshold of the chosen configuration.
multi_scorers = {
    scorer_name: mr.make_scorer(
        metric_func,
        greater_is_better=higher_is_better,
        needs_proba=True,
        threshold=best_threshold,
    )
    for scorer_name, (metric_func, higher_is_better) in scorer_specs.items()
}

## Model Evaluation

The best pipeline assembled above is trained on the training data

In [None]:
# Sanity check before fitting: show the feature lists next to the actual
# training columns.
print(numerical_columns, nominal_columns, list(X_train))

In [None]:
# Fit the assembled pipeline (preprocessing + classifier) on the training split only.
pipe.fit(X_train, y_train)

The evaluation metrics on the held-out data are shown below

In [None]:
def _threshold_metrics(X, y):
    """Return recall, FPR and AUC of the fitted pipeline on one data split.

    The positive-class probabilities are predicted once and shared by all three
    metrics. The discrimination threshold is the one extracted from the chosen
    configuration (best_threshold) rather than a hard-coded literal.
    """
    proba_positive = pipe.predict_proba(X)[:, 1]
    return {
        "recall_binary": threshold_recall_score(y, proba_positive, best_threshold),
        # multiplied by -1, as before, to flip the sign of the returned FPR score
        "fpr": -1 * threshold_fpr_score(y, proba_positive, best_threshold),
        "auc": threshold_auc_score(y, proba_positive, best_threshold),
    }


# One row per split: held-out (test) first, then in-sample (train).
pd.DataFrame.from_dict(
    {
        "test": _threshold_metrics(X_test, y_test),
        "train": _threshold_metrics(X_train, y_train),
    },
    orient="index",
)

### Diagnostic Plots

A grid of diagnostic plots are shown below for predictions, using the best pipeline found above, on OOS data (the test split)

In [None]:
# Grid of diagnostic plots for the fitted pipeline.
# NOTE(review): the role of each positional argument is defined by
# src.visualization_helpers.show_yb_grid - the notes below describe only the
# values passed here.
show_yb_grid(
    pipe,
    # held-out features and labels
    X_test,
    y_test,
    # sorted unique label values found in the held-out labels
    np.sort(y_test.unique()),
    # combined (train + test) features and labels
    pd.concat([X_train, X_test]),
    pd.concat([y_train, y_test]),
    # 5-fold stratified splitter, without shuffling
    StratifiedKFold(n_splits=5, shuffle=False),
    # values 0.0, 0.1, ..., 1.0
    np.round(np.arange(0, 1.1, 0.1), 3),
    wspace=0.1,
    hspace=0.3,
    fig_size=(12, 12),
)

**Notes**
1. The per-class ROC curve is computed on the held-out data only and, by definition, is not sensitive to changes in discrimination threshold.
2. 

### Feature Importances

#### Cross-Validation

For examining the importance of features (columns), cross-validation could be performed on the full data set (combined training and testing splits), as shown below. Multiple OOS splits are used and the average model coefficient, for each feature, across all splits would be taken as being indicative of overall OOS performance
- NOTE: metrics (`TPR` and `FPR`) computed on in-sample and held-out splits are computed but are not used in this plot; instead, only the model coefficients for each feature are extracted for each split and shown on the plot

In [None]:
# Plot the model coefficients obtained across repeated cross-validation splits.
# Both data splits are passed in; per the text above, cross-validation is
# performed on the combined data (5 repeats of 5 splits).
plot_cross_validated_coefs(
    pipe,
    numerical_columns,
    nominal_columns,
    X_train,
    X_test,
    y_train,
    y_test,
    multi_scorers,
    n_repeats=5,
    n_splits=5,
    axis_tick_label_fontsize=12,
    fig_size=(8, 12),
)

**Observations**
1. From this graph, the one-hot encoded version of the features are shown. Ideally, the reverse-encoded feature would be shown since that is how it appears in the data.
2. Problems with using such coefficients/importances of a modeling algorithm include
   - incorrect normalization of the data during pre-processing
   - incorrectly, or not at all, accounting for the influence of highly-correlated features on model coefficients/importances during
     - the modeling process
       - these problems are specific to each model
     - [interpretation of model coefficients](https://projecteuclid.org/euclid.ss/1009213726)

#### Model-Neutral Permutation Importance

In permutation importance ([1](https://academic.oup.com/bioinformatics/article/26/10/1340/193348), [2](https://docs.cloud.oracle.com/en-us/iaas/tools/ads-sdk/latest/user_guide/mlx/permutation_importance.html#description)), each column is iteratively randomized and used as an input for modeling. The difference in scoring metric with and without this randomization is taken as the importance of the column being randomized to the model. This process is repeated for each column individually. It provides a model agnostic indication of the importance of each feature, independent of how the algorithm's coefficients/importances are computed.

Below is a boxplot highlighting the impact on model performance, using `TPR` as the scoring metric, of randomizing columns from the data individually (i.e. as determined using permutation importance)

In [None]:
# Permutation importance using recall (TPR) as the scoring metric.
# The helper imported from src.visualization_helpers at the top of this
# notebook is builtin_plot_permutation_importances; the name previously called
# here (plot_permutation_importances) is never imported, so the cell raised a
# NameError on a fresh kernel.
# NOTE(review): confirm that the keyword arguments below match the signature of
# builtin_plot_permutation_importances.
builtin_plot_permutation_importances(
    pipe,
    X_train,
    X_test,
    y_train,
    y_test,
    scorer=multi_scorers["recall_binary"],
    n_repeats=10,
    wspace=0.4,
    fig_title_fontsize=16,
    fig_title_vertical_pos=0.97,
    axis_tick_label_fontsize=12,
    box_color="cyan",
    fig_size=(12, 6),
)

Below is a boxplot highlighting the impact on model performance, using `FPR` as the scoring metric, as determined using permutation importance

In [None]:
# Permutation importance using FPR as the scoring metric.
# The helper imported from src.visualization_helpers at the top of this
# notebook is builtin_plot_permutation_importances; the name previously called
# here (plot_permutation_importances) is never imported, so the cell raised a
# NameError on a fresh kernel.
# NOTE(review): confirm that the keyword arguments below match the signature of
# builtin_plot_permutation_importances.
builtin_plot_permutation_importances(
    pipe,
    X_train,
    X_test,
    y_train,
    y_test,
    scorer=multi_scorers["fpr"],
    n_repeats=10,
    wspace=0.4,
    fig_title_fontsize=16,
    fig_title_vertical_pos=0.97,
    axis_tick_label_fontsize=12,
    box_color="cyan",
    fig_size=(12, 6),
)

### Check of bias and variance

Bias and variance related to this dataset are explored below, in terms of `TPR`

In [None]:
# Learning curves (recall / TPR) are computed on the combined train and test
# splits, re-indexed from zero.
X_full = pd.concat([X_train, X_test]).reset_index(drop=True)
y_full = pd.concat([y_train, y_test]).reset_index(drop=True)
learning_curve_cv = RepeatedKFold(n_splits=5, n_repeats=5, random_state=42)

plot_learning_curve(
    pipe,
    f"Learning Curves for {type(pipe.named_steps['clf']).__name__}",
    X=X_full,
    y=y_full,
    cv=learning_curve_cv,
    scorer=multi_scorers["recall_binary"],
    n_jobs=-1,
    train_sizes=np.linspace(0.1, 1.0, 5),
    legend_coords=(0.7, 1),
    axis_tick_label_fontsize=12,
    fig_size=(8, 12),
)

Bias and variance related to this dataset are explored below, in terms of `FPR`

In [None]:
# Learning curves (FPR) are computed on the combined train and test splits,
# re-indexed from zero.
X_full = pd.concat([X_train, X_test]).reset_index(drop=True)
y_full = pd.concat([y_train, y_test]).reset_index(drop=True)
learning_curve_cv = RepeatedKFold(n_splits=5, n_repeats=5, random_state=42)

plot_learning_curve(
    pipe,
    f"Learning Curves for {type(pipe.named_steps['clf']).__name__}",
    X=X_full,
    y=y_full,
    cv=learning_curve_cv,
    scorer=multi_scorers["fpr"],
    n_jobs=-1,
    train_sizes=np.linspace(0.1, 1.0, 5),
    legend_coords=(0.7, 1),
    axis_tick_label_fontsize=12,
    fig_size=(8, 12),
)

**Notes**
1. The permutation importance (previous sub-section) and learning curve plotting codes used here are not sensitive to discrimination threshold and pick a default value of 0.5. However, since the best configuration used a threshold of 0.5, these plotting codes are used as-is. If a different threshold value were chosen, these plots would have to be generated manually.

**Observations**
1. Assessing bias
   - high validation and training errors indicate a high bias problem.
2. Assessing variance
   - a low training error and large gap would indicate overfitting and collecting more data is one approach to attempt at rectifying this problem - however, this is not the case here for the best model found above. High training error and a low gap between training and validation errors indicate a low variance problem.
3. Combined, this indicates that the best model is underfitting the training data. Adding data is unlikely to remedy this problem.
4. Further work should focus on extracting more features from the dataset - currently only a single feature `is_employed` (a binary feature indicating whether the applicant was employed or not at the time of applying for the loan on Lending Club) was extracted.

### Exploring incorrect predictions from the held-out data

Below is an exploration of predictions, from the unseen data, that the model got incorrect

#### Join dropped columns with held-out data

To explore misclassifications by grouping across a column not used in modeling, such as `addr_state`, the held-out data is merged with the raw data below

In [None]:
# Predicted labels for the held-out rows, aligned on the held-out index.
y_pred = pd.Series(pipe.predict(X_test), index=X_test.index, name="pred")

# Held-out features of interest, followed by the true and predicted labels.
explored_columns = [
    "purpose",
    "home_ownership",
    "emp_length",
    "term",
] + numerical_columns
test_with_predictions = pd.concat(
    [X_test[explored_columns], y_test, y_pred],
    axis=1,
)

# Bring back addr_state (dropped before modeling) from the raw data by joining
# on the row index; the inner join keeps only rows present in the held-out data.
df_tp = df[["addr_state"]].merge(
    test_with_predictions,
    left_index=True,
    right_index=True,
    how="inner",
)

A `misclassified` column is appended to the merged data in order to indicate a mis-match between the true and predicted label

In [None]:
# True wherever the predicted label differs from the true label.
df_tp["misclassified"] = df_tp["is_default"].ne(df_tp["pred"])
display(df_tp, df_tp.dtypes.to_frame())

#### Numerical columns

In [None]:
# One grouped histogram per numerical column of the merged held-out data.
# NOTE(review): the positional values (0.675, 1.1), 0.5, 0.15 and (12, 4) are
# presumably layout settings (legend position, spacing, figure size) - confirm
# against src.visualization_helpers.plot_grouped_histogram.
for c in numerical_columns:
    plot_grouped_histogram(df_tp, c, (0.675, 1.1), 0.5, 0.15, (12, 4))

**Observations**
1. For the chosen numerical columns, the distributions of correctly and incorrectly predicted loan status are similar to each other and follow that of the overall (true) held-out data.

#### Categorical columns

In [None]:
# (column, horizontal subplot spacing, figure size) for each categorical column.
# addr_state has many more categories, so it gets a taller figure.
bar_chart_layouts = [
    ("home_ownership", 0.25, (12, 4)),
    ("purpose", 0.4, (12, 4)),
    ("emp_length", 0.1, (12, 4)),
    ("term", 0.25, (12, 4)),
    ("addr_state", 0.1, (12, 8)),
]
for col, wspace, fig_size in bar_chart_layouts:
    plot_grouped_bar_chart(df_tp, col, "misclassified", wspace, fig_size=fig_size)

**Observations**
1. For the `Term` column of the data
   - although nearly 75% of the loans (RHS plot) required 36 monthly payments, the model found here has nearly the same difficulty (LHS plot) predicting loans requiring 36 or 60 monthly payments. The model has more trouble predicting infrequently occurring term loans (60 months) than those that occur more commonly (36 months).
2. In terms of the `Employment Length` and `Purpose` columns of the data, the incorrect model predictions follow the frequency of the data - more commonly occurring categories in each of these columns are misclassified.
3. The model has greater difficulty (45% misclassifications, LHS plot) in predicting the outcome for home owners who pay `Rent` than what is observed from the data (40%, RHS plot).

## Conclusion and Future work

The best model found here returns a `FPR` of approximately 34% and a `TPR` of approximately 65%. Since a conservative investor would want the `FPR` to be lowered before deploying this model to determine if new loan applications should be funded, in order to avoid funding riskier loans (ones that will default), further iterations of this project should focus on the following
- extracting more machine learning features from the raw Lending Club data
  - consider a less stringent threshold (currently 50%) for dropping features with missing data
    - this could possibly re-introduce some features into the dataset that hopefully add predictive power to the models tried
  - based on incorrect model predictions, further exploration (incl. outlier removal) of the home ownership and loan term columns from the data may be warranted
- additional model hyper-parameter, incl. threshold, optimization
- other techniques to remove outliers from features (currently filtering, based on univariate visualization, was used) in the dataset before machine learning