# Classification trials

In [None]:
%load_ext lab_black
%load_ext autoreload
%autoreload 2

In [None]:
import os
from io import StringIO

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sklearn.metrics as mr
from azure.storage.blob import BlobServiceClient
from imblearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import (
    GridSearchCV,
    RepeatedKFold,
    ShuffleSplit,
    StratifiedKFold,
    train_test_split,
)
from sklearn.preprocessing import (
    OneHotEncoder,
    StandardScaler,
)

In [None]:
%aimport src.altair_helpers
from src.altair_helpers import (
    plot_altair_grid,
    alt_plot_metric_based_threshold_tuning_plots,
)

%aimport src.business_helpers
from src.business_helpers import int_income_calculator

%aimport src.custom_learning_curve_helpers
from src.custom_learning_curve_helpers import learning_curve, manual_learning_curve

%aimport src.custom_permutation_importance_helpers
from src.custom_permutation_importance_helpers import manual_plot_permutation_importance

%aimport src.custom_threshold_tuning_plotting_helpers
from src.custom_threshold_tuning_plotting_helpers import (
    plot_cost_function_based_threshold_tuning_plots,
    plot_metric_based_threshold_tuning_plots,
)

%aimport src.custom_transformers
import src.custom_transformers as ct

%aimport src.custom_returns_plotter
from src.custom_returns_plotter import plot_returns

%aimport src.ml_helpers_v2
from src.ml_helpers_v2 import get_best_pipes, gridsearch

%aimport src.ml_metrics_v2
from src.ml_metrics_v2 import (
    threshold_roc_auc_score,
    threshold_fpr_score,
    threshold_f2_score,
    threshold_recall_score,
    pr_auc_score,
    get_scores,
    get_eval_metrics,
)

%aimport src.threshold_tuning_helpers
from src.threshold_tuning_helpers import (
    get_components_of_returns,
    threshold_tuning_reshaping,
)

%aimport src.visualization_helpers
from src.visualization_helpers import (
    plot_learning_curve,
    plot_permutation_importances,
    plot_grouped_bar_chart,
    plot_grouped_histogram,
    plot_pr_roc_curves,
)

In [None]:
# Raise the pandas display limits (rows, columns, cell width, total width)
# used when DataFrames are rendered in this notebook
pd.set_option("display.max_rows", 500)
pd.set_option("display.max_columns", 500)
pd.set_option("display.max_colwidth", 500)
pd.set_option("display.width", 1000)

## About

This notebook will walk through machine learning classification experiments on the raw [Lending Club loans data](https://www.lendingclub.com/auth/login?login_url=%2Fstatistics%2Fadditional-statistics%3F). A best-performing model will then be determined and assessed in the context of the business use case for this project, i.e. a conservative investor wanting to use the model developed here to predict which loan applications on the [Lending Club platform](https://www.lendingclub.com/) will not [default](https://en.wikipedia.org/wiki/Default_(finance)) and should therefore be funded by them.

## User Inputs

User inputs and helper functions, to be used later, are defined below

In [None]:
# Path of the local raw CSV; read by the "Load data" cell when cloud_storage is not "yes"
raw_data_file_path = "data/raw/lending_club_loans.csv"
# "yes" -> download the raw data from Azure blob storage; any other value -> read the local CSV
cloud_storage = "no"

# From Feature Reduction
nan_threshold = 0.5
non_useful_cols = ["url", "desc"]
datetime_cols = ["issue_d", "last_pymnt_d"]
cols_one_eighteen = [
    "id",
    "member_id",
    "funded_amnt",
    "funded_amnt_inv",
    "grade",
    "sub_grade",
    "emp_title",
]
cols_eighteen_thirtysix = [
    "zip_code",
    "out_prncp",
    "out_prncp_inv",
    "total_pymnt",
    "total_pymnt_inv",
    "total_rec_prncp",
]
cols_thirtyseven_end = [
    "total_rec_int",
    "total_rec_late_fee",
    "recoveries",
    "collection_recovery_fee",
    "last_pymnt_amnt",
]
loan_status = ["Fully Paid", "Charged Off"]
mapping_dictionary_labels = {"loan_status": {"Fully Paid": 1, "Charged Off": 0}}
four_or_less_value_columns = ["pymnt_plan"]

# From Feature Processing
more_than_one_pct_missing_columns = ["pub_rec_bankruptcies"]
datetime_cols_v2 = ["last_credit_pull_d", "earliest_cr_line"]
high_cardinality_cols = ["addr_state"]
mapping_dict_emp_length = {
    "emp_length": {
        "10+ years": 10,
        "9 years": 9,
        "8 years": 8,
        "7 years": 7,
        "6 years": 6,
        "5 years": 5,
        "4 years": 4,
        "3 years": 3,
        "2 years": 2,
        "1 year": 1,
        "< 1 year": 0,
        "n/a": 0,
    }
}
nominal_columns = ["home_ownership", "verification_status", "purpose", "term"]
repeated_data_cols = ["title"]
pct_to_numeric_cols = ["int_rate", "revol_util"]

# From Exploratory Data Analysis 1/2
correlated_features = [
    # "total_acc",
    "installment",
    "fico_range_low",
    "fico_range_high",
]
look_ahead_features = ["last_fico_range_low", "last_fico_range_high"]
raw_labels = ["loan_status"]
new_labels = ["is_default"]

# Columns of the grid-search results table to display
# NOTE(review): `cols_to_show` is reassigned in the cell that defines
# `multi_scorers`, before its first use, so this list appears to be
# superseded - confirm whether it is still needed
cols_to_show = [
    "preprocessor_type",
    "resamplers",
    "threshold",
    "params",
    "mean_test_recall_binary",
    "mean_test_fpr",
    "mean_test_auc",
    "mean_train_recall_binary",
    "mean_train_fpr",
    "mean_train_auc",
    "mean_fit_time",
    "std_train_recall_binary",
    "std_test_recall_binary",
    "std_train_fpr",
    "std_test_fpr",
    "mean_score_time",
    "clf_params",
]

# Candidate discrimination thresholds: from 0.01 up to (but excluding) 1, in steps of 0.01
thresholds_list = np.arange(0.01, 1, 0.01)

In [None]:
# Coerce the user inputs to the numeric types expected by the processing pipeline
nan_threshold = float(nan_threshold)

# Re-cast the integer code assigned to each loan status
mapping_dictionary_labels["loan_status"].update(
    {
        status: int(mapping_dictionary_labels["loan_status"][status])
        for status in ("Fully Paid", "Charged Off")
    }
)

# Re-cast the integer code assigned to each employment-length label
mapping_dict_emp_length["emp_length"].update(
    {
        label: int(mapping_dict_emp_length["emp_length"][label])
        for label in (
            "10+ years",
            "9 years",
            "8 years",
            "7 years",
            "6 years",
            "5 years",
            "4 years",
            "3 years",
            "2 years",
            "1 year",
            "< 1 year",
            "n/a",
        )
    }
)

# From Exploratory Data Analysis 2/2
mapping_dict_new_labels = {"is_default": {0: 1, 1: 0}}

In [None]:
# Scorers used during grid search. Every scorer works on predicted
# probabilities; the threshold-based ones are evaluated at a threshold of 0.5.
multi_scorers = {
    scorer_name: mr.make_scorer(
        metric_func,
        greater_is_better=higher_is_better,
        needs_proba=True,
        threshold=0.5,
    )
    for scorer_name, metric_func, higher_is_better in [
        ("recall_binary", threshold_recall_score, True),
        ("fpr", threshold_fpr_score, False),
        ("f2", threshold_f2_score, True),
        ("roc_auc_binary", threshold_roc_auc_score, True),
    ]
}
# The PR-AUC scorer takes no threshold argument
multi_scorers["pr_auc"] = mr.make_scorer(
    pr_auc_score,
    greater_is_better=True,
    needs_proba=True,
)
# Grid-search result columns to display: mean test/train scores, fit time,
# train/test standard deviations per metric, and score time
_displayed_metrics = ["recall_binary", "fpr", "roc_auc_binary", "pr_auc"]
cols_to_show = (
    [f"mean_test_{metric}" for metric in _displayed_metrics]
    + [f"mean_train_{metric}" for metric in _displayed_metrics]
    + ["mean_fit_time"]
    + [
        f"std_{split}_{metric}"
        for metric in _displayed_metrics
        for split in ("train", "test")
    ]
    + ["mean_score_time"]
)

## Load data

Raw data from Lending Club is loaded into memory

In [None]:
# Load the raw loans data into `df`, either from Azure blob storage or from
# the local CSV at `raw_data_file_path`
if cloud_storage == "yes":
    az_storage_container_name = "myconedesx7"
    # Storage credentials are read from environment variables
    # NOTE(review): os.getenv returns None for an unset variable, which would be
    # formatted into the connection string as the text "None" - confirm the
    # variables are always set when cloud_storage is "yes"
    conn_str = (
        "DefaultEndpointsProtocol=https;"
        f"AccountName={os.getenv('AZURE_STORAGE_ACCOUNT')};"
        f"AccountKey={os.getenv('AZURE_STORAGE_KEY')};"
        f"EndpointSuffix={os.getenv('ENDPOINT_SUFFIX')}"
    )
    blob_service_client = BlobServiceClient.from_connection_string(conn_str=conn_str)

    # NOTE(review): `blobstrings` is never filled in this cell - confirm it is unused
    blobstrings = {}
    for blob_name in ["blobedesz38"]:
        blob_client = blob_service_client.get_blob_client(
            container=az_storage_container_name, blob=blob_name
        )
        # Download the blob contents as text
        blobstring = blob_client.download_blob().content_as_text()
    # NOTE(review): the blob copy skips its first row (skiprows=1) while the
    # local copy does not (skiprows=0) - presumably the blob has an extra
    # leading line; confirm
    df = pd.read_csv(StringIO(blobstring), skiprows=1, low_memory=False)
else:
    df = pd.read_csv(raw_data_file_path, skiprows=0, low_memory=False)

## Train-Test split

A hold-out set of the raw data will be set aside for model assessment

In [None]:
# Hold out 33% of the raw data as the test split, then split the remaining
# train+validation data again (33% validation); both splits use a fixed seed
df_train_val, df_test = train_test_split(df, test_size=0.33, random_state=4321)
df_train, df_val = train_test_split(df_train_val, test_size=0.33, random_state=4321)

## Data processing

The raw data is cleaned, filtered and features are extracted/selected from this processed data in order to run experiments in classification

In [None]:
# Cleaning / filtering / feature-selection pipeline applied to every data split.
# Step inputs (column lists, mapping dictionaries, NaN threshold) come from the
# "User Inputs" cells above.
pipe_1_2_3 = Pipeline(
    [
        # nan_threshold is a float in [0, 1.0] taken from the user inputs
        # (previously hard-coded as 0.5, which ignored the user-supplied value)
        ("nanthresh", ct.DFNanThresholdColumnDropper(nan_threshold)),
        ("nouse", ct.DFColumnDropper(non_useful_cols)),
        ("dtime", ct.DFColumnDropper(datetime_cols)),
        ("c1", ct.DFColumnDropper(cols_one_eighteen)),
        ("c2", ct.DFColumnDropper(cols_eighteen_thirtysix)),
        ("c3", ct.DFColumnDropper(cols_thirtyseven_end)),
        (
            "mapstatus",
            ct.DFColumnFilterList("loan_status", loan_status),
        ),
        ("colmap", ct.DFColumnMapper(mapping_dictionary_labels)),
        ("onevals", ct.DFNonUniqueValColDropper(num_non_unique_vals=1)),
        # four_or_less_value_columns=['pymnt_plan'] - 'pymnt_plan' is MOSTLY 'n', EXCLUDE
        ("fourvals", ct.DFColumnDropper(four_or_less_value_columns)),
        # more_than_one_pct_missing_columns = ['pub_rec_bankruptcies']
        ("morethan1pctnan", ct.DFColumnDropper(more_than_one_pct_missing_columns)),
        ("nan", ct.DFDropNaN()),
        # high_cardinality_cols=['addr_state'] - CAN INCLUDE OR EXCLUDE
        ("hcardcols", ct.DFColumnDropper(high_cardinality_cols)),
        ("dtimev2", ct.DFColumnDropper(datetime_cols_v2)),
        ("texttonum", ct.DFColumnMapper(mapping_dict_emp_length)),
        ("repeats", ct.DFColumnDropper(repeated_data_cols)),
        ("pctcols", ct.DFPctNumeric(pct_to_numeric_cols, "%")),
        (
            "singlecolmap",
            ct.DFSingleColumnMapper("loan_status", mapping_dict_new_labels),
        ),
        ("dtype", ct.DFSimpleDtypeChanger(new_labels, "int")),
        # n_std is an integer - CAN CHANGE OR KEEP UNCHANGED
        ("stdfilter", ct.DFColumnStdFilter("annual_inc", 3)),
        # correlated_features=['installment','fico_range_low','fico_range_high'] EXCLUDE
        # ('total_acc' is commented out of correlated_features, so it is kept)
        ("corr", ct.DFColumnDropper(correlated_features)),
        ("lookahead", ct.DFColumnDropper(look_ahead_features)),
        ("label", ct.DFColumnDropper(raw_labels)),
        # requires ("corr", ...) to be removed; threshold=1, ideally chosen from dendogram
        # ("clusterselect", ct.DFHierarchicalClusterSpearmanRank(threshold=1)),
    ]
)
# Fit the processing pipeline on the training split only, then apply it to
# the validation, train+validation and test splits
df_pipe_transformed_train = pipe_1_2_3.fit_transform(df_train)
df_pipe_transformed_val = pipe_1_2_3.transform(df_val)
df_pipe_transformed_train_val = pipe_1_2_3.transform(df_train_val)
df_pipe_transformed_test = pipe_1_2_3.transform(df_test)

_transformed_splits = (
    df_pipe_transformed_train,
    df_pipe_transformed_val,
    df_pipe_transformed_train_val,
    df_pipe_transformed_test,
)
# Shapes first, then a two-row preview of every processed split
for _split in _transformed_splits:
    print(_split.shape)
for _split in _transformed_splits:
    display(_split.head(2))
# Class balance of the processed training labels
display(df_pipe_transformed_train["is_default"].squeeze().value_counts(dropna=False))

## Get features and class labels from processed data

The features and labels are now extracted from the processed training and testing splits

In [None]:
# Separate the features (X) from the binary label (y) for every processed split
X_train = df_pipe_transformed_train.drop(labels=new_labels, axis=1)
y_train = df_pipe_transformed_train[new_labels].astype(int).squeeze()
X_val = df_pipe_transformed_val.drop(labels=new_labels, axis=1)
y_val = df_pipe_transformed_val[new_labels].astype(int).squeeze()
# The train+validation features/labels must come from the processed
# train+validation frame. They were previously taken from the processed test
# frame, so the final refit on (X_train_val, y_train_val) trained on the very
# rows it was then evaluated on.
X_train_val = df_pipe_transformed_train_val.drop(labels=new_labels, axis=1)
y_train_val = df_pipe_transformed_train_val[new_labels].astype(int).squeeze()
X_test = df_pipe_transformed_test.drop(labels=new_labels, axis=1)
y_test = df_pipe_transformed_test[new_labels].astype(int).squeeze()
display(X_train.head(2))
display(X_val.head(2))
display(X_test.head(2))
display(X_train_val.head(2))
display(y_train.to_frame().head(2))
display(y_val.to_frame().head(2))
display(y_test.to_frame().head(2))
display(y_train_val.to_frame().head(2))

## Assemble components for `GridSearchCV`

The pipeline inputs and other components required for hyperparameter optimization using `GridSearchCV` are extracted here

### Lists of features by type

A list of numerical and categorical features is extracted from the processed data
- numerical features are those whose datatype is not `object`, with the exception of `emp_length`
- categorical features are those with a datatype of `object`, plus `emp_length`

In [None]:
# Numerical features: every non-object column except the label(s) and `emp_length`
numerical_columns = [
    col
    for col in X_train.select_dtypes(exclude="object").columns
    if col not in new_labels and col != "emp_length"
]
# Categorical features: every object column, plus `emp_length`
nominal_columns = [*X_train.select_dtypes(include="object").columns, "emp_length"]

# Sanity check: the two lists together must cover all non-label training columns
_listed_columns = set(numerical_columns) | set(nominal_columns)
_expected_columns = set(X_train.columns) - set(new_labels)
if _listed_columns == _expected_columns:
    print("Columns from training data match feature lists")
else:
    print("Some columns from training data are missing from feature lists")
print("Categoricals:\n-" + "\n-".join(nominal_columns))
print("Numericals:\n-" + "\n-".join(numerical_columns))

### Feature transformations

Below, feature transformations to be applied to all numerical columns are defined. With or without transformations, all numerical features will be normalized. All categorical features will be one-hot encoded.

In [None]:
# One pipeline per numerical column: Yeo-Johnson power transform followed by
# standard scaling
col_transformers = {
    c: Pipeline(
        steps=[
            ("trans", ct.DFPowerTransformer("yeo-johnson")),
            ("ss", ct.DFStandardScaler()),
        ]
    )
    for c in numerical_columns
}
# Two alternative preprocessors, both one-hot encoding the categorical columns
# (unknown categories are ignored) and passing any other column through
preprocessors = {
    # numerical columns are only standard-scaled
    "no_trans": ColumnTransformer(
        transformers=[
            (
                "nums",
                Pipeline(steps=[("trans", StandardScaler())]),
                numerical_columns,
            )
        ]
        + [("onehot", OneHotEncoder(handle_unknown="ignore"), nominal_columns)],
        remainder="passthrough",
    ),
    # each numerical column gets its own power-transform + scaling pipeline
    "trans": ColumnTransformer(
        transformers=[(k, v, [k]) for k, v in col_transformers.items()]
        + [("onehot", OneHotEncoder(handle_unknown="ignore"), nominal_columns)],
        remainder="passthrough",
    ),
}

### Classifiers and hyper-parameters for optimization

The models to be compared, discrimination threshold(s) (to be applied to all listed models), and dictionaries of model hyper-parameters for tuning, are defined below
- for hyper-parameter dictionaries containing cost-function weights, for manual specification of the penalties used in the algorithm's cost function, the larger penalty should be assigned to the minority class (see [**Lesson 07. Cost-Sensitive Algorithms**](https://machinelearningmastery.com/imbalanced-classification-with-python-7-day-mini-course/))
  - here, this is for the positive class where `is_default`==1, and is explicitly shown below through the class-balance of the labels from the training data

In [None]:
# Class balance of the training labels
y_train.value_counts().to_frame()

In [None]:
# Per-class counts of the training labels (np.unique orders them by sorted class value)
vc = np.unique(y_train, return_counts=True)[1]
# Count of the first class divided by the count of the second class
# NOTE(review): assumes exactly two classes (0 and 1) with class 1 being the
# minority - confirm against the class balance shown above
minority_weight = vc[0] / vc[1]
# Hyper-parameter grid per classifier name
parameters = {
    "LogisticRegression": {
        "C": [1.0],
        # Candidate class weightings, from none through to heavier penalties on class 1
        "class_weight": [
            "balanced",
            None,
            {0: 1, 1: 1},
            {0: 1, 1: minority_weight},
            {0: 1, 1: 8},
        ],
    },
    "DummyClassifier": {"strategy": ["stratified"]},
}

In [None]:
%%time
cv = StratifiedKFold(n_splits=5, shuffle=False)
df_gs = gridsearch(
    X_train,
    y_train,
    parameters,
    preprocessors["no_trans"],
    cv,
    multi_scorers,
    threshold=0.5,
)
param_cols = df_gs.columns[
    df_gs.columns.to_series().str.contains("param_")
].tolist()
display(df_gs[["clf"]+param_cols+cols_to_show])

In [None]:
# Indices identifying the chosen classifier configuration and the dummy
# baseline configuration in `df_gs`
# NOTE(review): both indices are hard-coded; presumably they refer to rows of
# the grid-search table displayed above - confirm after any change to `parameters`
best_cfg_idx = 0
best_dummy_cfg_idx = 5
best_pipe, best_dummy_pipe = get_best_pipes(
    best_cfg_idx, best_dummy_cfg_idx, df_gs, preprocessors["no_trans"], param_cols
)

In [None]:
# Components of returns computed from the validation features; `confs` is
# passed to the cost-function based threshold tuning further below
_, confs = get_components_of_returns(X_val)
confs

In [None]:
%%time
# Fit the selected pipeline and the dummy baseline on the training split
best_pipe.fit(X_train, y_train)
best_dummy_pipe.fit(X_train, y_train)
# Predicted probability of the positive class (column 1) on the validation split
y_probs_val = best_pipe.predict_proba(X_val)[:, 1]

In [None]:
%%time
# Metric-based threshold tuning on the validation split, evaluated over every
# value in `thresholds_list`; the plot itself is suppressed (show_plot=False)
# and only the two returned score tables are kept
(
    df_threshold_tuning_scores,
    df_all_threshold_tuning_scores
) = plot_metric_based_threshold_tuning_plots(
    y_val,
    y_probs_val,
    thresholds_list,
    f2_beta=2,
    legend_position=(1.01, 1),
    show_best_t_by_f1=False,
    show_plot=False,
    fig_size=(8, 4),
)
display(df_threshold_tuning_scores)

In [None]:
# Altair version of the metric-based threshold tuning plot, drawn from the
# full score table returned by the previous cell
alt_plot_metric_based_threshold_tuning_plots(
    df_all_threshold_tuning_scores,
    ptitle_offset=-5,
    legend_offset=5,
    figsize=(450, 300),
)

1. this is the sum of `n` [monthly returns](https://www.vertex42.com/ExcelArticles/amortization-calculation.html) that a prospective funder of the loan would expect to receive if the loan is paid off on time
2. Why divide by `len(y_test)`?
   - If every loan had these r, n, p, and other factors were consistent, and taking the model's predictive power into account, then the average return would be given by `ds /= len(y_test)`

In [None]:
%%time
# Cost-function based threshold tuning on the validation split, for the
# selected pipeline and the dummy baseline, over every value in `thresholds_list`
df_returns_t_tuned, df_returns_t_tuning_full = plot_cost_function_based_threshold_tuning_plots(best_pipe, best_dummy_pipe, X_val, y_val, confs, thresholds_list)
display(df_returns_t_tuned)
display(df_returns_t_tuning_full.head())

In [None]:
%%time
# Selected discrimination threshold
# NOTE(review): hard-coded; presumably read off the cost-function tuning
# results above - confirm after re-running
best_t = 0.75
# Re-run the cost-function evaluation on the validation split at this single threshold
df_returns_best_t, _ = plot_cost_function_based_threshold_tuning_plots(best_pipe, best_dummy_pipe, X_val, y_val, confs, [best_t])
df_returns_best_t = threshold_tuning_reshaping(df_returns_best_t)
display(df_returns_best_t)

In [None]:
# Plot the validation-split returns at `best_t`, sorted by classifier and
# return (both descending)
plot_returns(
    df_returns_best_t.sort_values(by=["clf", "return"], ascending=[False, False]),
    ptitle="Comparison of Theoretical (if paid on time) and Predicted* Returns",
    annotation_text=f"*using optimal discrimination threshold ({best_t})",
    axis_tick_fontsize=12,
    annotation_text_fontsize=10,
    annotation_loc=(0.99, 0.01),
    fig_size=(8, 4),
)

1. It is an average because the model will not always predict that you should fund such a loan. There are times when it incorrectly predicts that the loan should not be funded - in such a scenario, prospective return is lost. If you used an independent method of predicting loans to fund, and the loan was paid off on time, then the return you earn would be the theoretical return from the above graph/table.

In [None]:
%%time
# Refit the selected pipeline and the dummy baseline on the combined
# train+validation data before scoring the held-out test split
# NOTE(review): X_train_val / y_train_val must not contain any test rows - confirm
best_pipe.fit(X_train_val, y_train_val)
best_dummy_pipe.fit(X_train_val, y_train_val)
# Predicted probability of the positive class (column 1) on the test split
y_probs_test = best_pipe.predict_proba(X_test)[:, 1]

In [None]:
# Components of returns for the test split. They are derived from the same
# rows that are scored with them below (X_test), mirroring how `confs` is
# built from X_val for the validation split; they were previously derived
# from X_train_val.
_, confs_test = get_components_of_returns(X_test)
confs_test

In [None]:
%%time
# Cost-function evaluation on the test split at the selected threshold `best_t`
df_returns_best_t_test, _ = plot_cost_function_based_threshold_tuning_plots(best_pipe, best_dummy_pipe, X_test, y_test, confs_test, [best_t])
df_returns_best_t_test = threshold_tuning_reshaping(df_returns_best_t_test)
display(df_returns_best_t_test)

In [None]:
# Plot the test-split returns at `best_t`, sorted by classifier and return
# (both descending)
plot_returns(
    df_returns_best_t_test.sort_values(by=["clf", "return"], ascending=[False, False]),
    ptitle="Comparison of Theoretical (if paid on time) and Predicted* Returns",
    annotation_text=f"*using optimal discrimination threshold ({best_t})",
    axis_tick_fontsize=12,
    annotation_text_fontsize=10,
    annotation_loc=(0.99, 0.01),
    fig_size=(8, 4),
)

## Model Evaluation

### ML Diagnostic metrics and plots

In [None]:
# Evaluation metrics on the test split at the selected threshold; the second
# return value is reused below for the confusion matrix and classification report
df_scores, y_pred_test_selected_threshold = get_eval_metrics(
    y_test.to_numpy(), y_probs_test, split="test", threshold=best_t, beta=2
)
display(df_scores.T)

In [None]:
# Confusion matrix for the test split at the selected threshold, labelled with
# the sorted unique class values
# NOTE(review): scikit-learn's confusion_matrix puts the TRUE labels on the
# rows and the PREDICTED labels on the columns, but the axes below are named
# the other way round (columns="actual", rows="predicted"). Confirm whether
# plot_altair_grid relies on these names before swapping them.
df_cm = (
    pd.DataFrame(
        mr.confusion_matrix(
            y_test,
            y_pred_test_selected_threshold,
            labels=np.sort(np.unique(y_train_val)),
        ),
        index=np.sort(np.unique(y_train_val)),
        columns=np.sort(np.unique(y_train_val)),
    )
    .rename_axis("actual", axis="columns")
    .rename_axis("predicted", axis="rows")
)
# Classification report as a DataFrame (one row per class / aggregate)
df_cr = pd.DataFrame(
    mr.classification_report(
        y_test,
        y_pred_test_selected_threshold,
        target_names=np.sort(np.unique(y_train_val)),
        output_dict=True,
    )
).T
plot_altair_grid(
    df_cm,
    df_cr,
    ptitle_offset=-5,
    cpe_figsize=(150, 300),
    cm_figsize=(150, 300),
    cr_figsize=[(250, 300), (100, 300)],
)

**Observations**
1. The poor performance is not surprising since the threshold was optimized on the cost function of predicted loan return. Each of these metrics had an optimal threshold region that was different from the threshold that maximized the difference in the prediction error expressed as a cost function - the average error in the model's predicted returns.
2. By definition, `TPR` and `F2` are close to each other since they minimize `FN`. This was also seen earlier in the metric-based threshold tuning plots and is again observed here.

In [None]:
# Precision-Recall and ROC curves for the test split, titled with the class
# name of the pipeline's "clf" step
plot_pr_roc_curves(
    y_test,
    y_probs_test,
    type(best_pipe.named_steps["clf"]).__name__,
    axis_tick_label_fontsize=12,
    wspace=0.1,
    legend_position=(0.3, 1.1),
    f2_beta=2,
    fig_size=(12, 4),
)

**Observations**
1. Interpreting the ROC-AUC curve
   - On average, a model with skill gives a higher probability to a randomly chosen real positive outcome than a negative one.
   - A model with perfect skill is depicted as a point at `(0, 1)` (top left of the plot)
     - A model with skill produces a ROC-AUC curve that expands from the 45-degree line up to the top left of the plot.
2. Interpreting the Precision-Recall curve
   - A model with skill can discriminate between classes and does not predict a random class or a constant class in all cases.
   - The no-skill line is a horizontal line with the value of the ratio of positive cases in the dataset. Its value depends on the relative balance between positive and negative classes. For a perfectly balanced dataset, this ratio is 0.5, which is clearly not the case here.
     - for the test set here, `y_test.value_counts().to_dict()` gives `{0: 10698, 1: 1771}` and so the no-skill line is drawn at `1771/(10698 + 1771)` (approximately 0.14)
   - A model with perfect skill is depicted as a point at `(1, 1)` (top right of the plot)
   - A model with skill produces a Precision-Recall curve that expands from the horizontal line, at the bottom, to the top right of the plot and is well above the horizontal line of no skill.
3. Given the imbalance in the classes here, more importance should be given to the Precision-Recall curve than to the ROC-AUC curve.
4. While both plots are clearly sub-optimal, it is reassuring that the best model found here is better than one that has no skill.
5. The optimal threshold is marked as a circle with an annotation on the
   - ROC-AUC curve
     - This is the threshold with the optimal balance between false positive and true positive rates as determined by optimizing the Geometric Mean
   - Precision-Recall curve
     - This is the threshold with the best balance of precision and recall as determined by optimizing the [F2 score](https://clusteval.sdu.dk/1/clustering_quality_measures/14)

### Model-Neutral Permutation Importance

In permutation importance ([1](https://academic.oup.com/bioinformatics/article/26/10/1340/193348), [2](https://docs.cloud.oracle.com/en-us/iaas/tools/ads-sdk/latest/user_guide/mlx/permutation_importance.html#description)), each column is iteratively randomized and used as an input for modeling. The difference in scoring metric with and without this randomization is taken as the importance of the column being randomized to the model. This process is repeated for each column individually. It provides a model agnostic indication of the importance of each feature, independent of how the algorithm's coefficients/importances are computed. The method requires the absence of multi-collinearity between ML features and that is the case here since correlated features were manually removed during exploratory data analysis.

**Methodology Used**
1. For a given (`r`, `n`, `P`), calculate
   - predicted return per loan ( `A` )
     - calculate predicted return from confusion matrix, using formula for `ds`
       - this takes ML model's predictive power into account
   - true return per loan (`B`)
2. Calculate difference (`D`) between predicted and theoretical returns
   - `D` = `A` - `B`
3. Calculate mean of all differences in the testing data
   - this gives the average difference between the return predicted by the model and the return earned (depending on whether the loan was paid off on time or not), per loan ( $\overline{D}$ )
4. Shuffle a single ML feature and re-calculate the mean of differences ( $\overline{D}$ ) from step 3.
5. Calculate the difference (`D1`) between the mean values found in steps 3. (without shuffling) and 4. (with shuffling)
6. Repeat steps 4. and 5. `n_repeats` times
   - in other words, shuffle the same column `n_repeats` times and calculate the differences (`D2`, ..., `D10`) between the predicted and true returns
7. Plot all the differences `D1`, ..., `D10`
8. The higher the difference the more impactful the ML feature is to the predictive model developed here

Below is a boxplot highlighting the impact on model performance, separately using the TPR (minimizes `FN`), FPR and F2 score (the preferred metric for imbalanced data where `FN` is more important than `FP`) as the scoring metric, of randomizing columns from the testing data individually (i.e. as determined using the permutation importance)

In [None]:
%%time
plot_permutation_importances(
    best_pipe,
    X_test,
    y_test,
    scorer=multi_scorers["recall_binary"],
    n_repeats=10,
    fig_title_fontsize=14,
    fig_title_vertical_pos=0.97,
    axis_tick_label_fontsize=12,
    axis_label_fontsize=14,
    box_color="cyan",
    fig_size=(8, 8),
)

In [None]:
# %%time
# plot_permutation_importances(
#     best_pipe,
#     X_test,
#     y_test,
#     scorer=multi_scorers["fpr"],
#     n_repeats=10,
#     fig_title_fontsize=14,
#     fig_title_vertical_pos=0.97,
#     axis_tick_label_fontsize=12,
#     axis_label_fontsize=14,
#     box_color="cyan",
#     fig_size=(8, 8),
# )

In [None]:
%%time
plot_permutation_importances(
    best_pipe,
    X_test,
    y_test,
    scorer=multi_scorers["f2"],
    n_repeats=10,
    fig_title_fontsize=14,
    fig_title_vertical_pos=0.97,
    axis_tick_label_fontsize=12,
    axis_label_fontsize=14,
    box_color="cyan",
    fig_size=(8, 8),
)

The process is repeated using the average difference between the model's predicted return and the true return, per loan in the testing split, as the scoring metric instead of the TPR and FPR separately

In [None]:
%%time
manual_plot_permutation_importance(
    X_test,
    y_test,
    best_pipe,
    best_t,
    5,
    "test",
    "Permutation Importances",
    14,
    12,
    14,
    "cyan",
    (8,8),
)

**Observations**
1. The `int_rate` (loan's interest rate), `term` (duration of loan) and `loan_amnt` (principal) are required to calculate the interest income which is used in converting the model's predictions to the per-loan return. By definition, shuffling each of these columns individually will affect this conversion. So, it is not surprising that these three are the most important factors as determined by permutation importance.
2. Not surprisingly, annual income followed by purpose were the two most influential value-add variables in the data. The annual income is more important than the length of employment - a higher earner is more impactful on the ability to pay off a loan than an applicant who has been working longer (but not earning a high salary).
3. Since this is the **average** difference in dollars (horizontal axis) per loan, the marginal value increases in the per-loan return between pairs of features (eg. `annual_inc` vs `purpose`) should also be interpreted as an average rather than a discrete value that can be expected for every single loan.

In [None]:
# Rebuild the threshold-based scorers at the selected discrimination
# threshold `best_t` (the earlier versions used 0.5). Each key wraps the
# metric it is named after: the "f2" and "fpr" entries previously wrapped each
# other's metric function, so the "F2" learning curve was actually plotting a
# negated FPR.
multi_scorers = {
    # recall (TPR): higher is better
    "recall_binary": mr.make_scorer(
        threshold_recall_score,
        greater_is_better=True,
        needs_proba=True,
        threshold=best_t,
    ),
    # F2 score: higher is better
    "f2": mr.make_scorer(
        threshold_f2_score,
        greater_is_better=True,
        needs_proba=True,
        threshold=best_t,
    ),
    # false positive rate: lower is better, so the scorer negates it
    "fpr": mr.make_scorer(
        threshold_fpr_score,
        greater_is_better=False,
        needs_proba=True,
        threshold=best_t,
    ),
}

### Check of Bias and Variance using Learning Curves

The training and cross-validation learning curves are shown below.

In [None]:
# Rebind `cv` to five random 80/20 shuffle splits (fixed seed) for the learning curves
cv = ShuffleSplit(n_splits=5, test_size=0.2, random_state=0)
# Stack the train+validation and test data row-wise with a fresh 0..n-1 index
X_all = pd.concat([X_train_val, X_test], axis=0, ignore_index=True)
y_all = pd.concat([y_train_val, y_test], axis=0, ignore_index=True)

First, these are shown using the `TPR` (or `Recall`) as the scoring metric

In [None]:
# %%time
# plot_learning_curve(
#     best_pipe,
#     f"Learning Curves for {type(best_pipe.named_steps['clf']).__name__}",
#     X=X_all,
#     y=y_all,
#     cv=cv,
#     scorer=multi_scorers["recall_binary"],
#     n_jobs=-1,
#     train_sizes=np.linspace(0.1, 1.0, 5),
#     legend_coords=(0.7, 1),
#     axis_tick_label_fontsize=12,
#     fig_size=(8, 12),
# )

These are now shown using the `FPR` as the scoring metric

In [None]:
# %%time
# plot_learning_curve(
#     best_pipe,
#     f"Learning Curves for {type(best_pipe.named_steps['clf']).__name__}",
#     X=X_all,
#     y=y_all,
#     cv=cv,
#     scorer=multi_scorers["fpr"],
#     n_jobs=-1,
#     train_sizes=np.linspace(0.1, 1.0, 5),
#     legend_coords=(0.7, 1),
#     axis_tick_label_fontsize=12,
#     fig_size=(8, 12),
# )

Lastly, these are now shown using the `F2` score as the scoring metric

In [None]:
%%time
plot_learning_curve(
    best_pipe,
    f"Learning Curves for {type(best_pipe.named_steps['clf']).__name__}",
    X=X_all,
    y=y_all,
    cv=cv,
    scorer=multi_scorers["f2"],
    n_jobs=-1,
    train_sizes=np.linspace(0.1, 1.0, 5),
    legend_coords=(0.7, 1),
    axis_tick_label_fontsize=12,
    fig_size=(8, 12),
)

As with the ML feature permutation importances, poor performance is expected for these metrics since the discrimination threshold was not chosen to optimize any of them. Instead, the focus was on the cost function - here, this is the model's predicted return per loan. So, these learning curves are repeated using the per-loan difference between the predicted and true returns (i.e. the mean error of the model's predictions) as the scoring metric

In [None]:
# %%time
# scores, train_sizes = learning_curve(best_pipe, X_all, y_all, cv, 5, best_t)
# scores_grouped = scores.groupby(["train_size"]).agg({"train_err": ["mean", "std"], "test_err": ["mean", "std"], "fit_time": ["mean", "std"], "clf":"first"}).reset_index()
# scores_grouped.columns = scores_grouped.columns.map('|'.join)
# display(scores_grouped)
# manual_learning_curve(
#     scores_grouped,
#     alpha=0.2,
#     hspace=0.2,
#     wspace=0.2,
#     axis_tick_label_fontsize=12,
#     figsize=(7, 12),
# )

**Observations**
1. Both training and cross-validation errors are larger (more negative) than $\$$1,000. If the business objective was to predict the average return<sup>[1](#myfootnote1)</sup>
 per loan to within $\$$1,000 of the true return<sup>[1](#myfootnote1)</sup> earned per loan, then the error in the model's predictions (both on training and cross-validation splits of the data) would be considered high. This is an indication that the model's predictions suffer from high bias.
2. The gap between the two learning curves is small, indicating the presence of low variance in the model's predictions.
3. Combined, this indicates that the best model with the selected discrimination threshold is underfitting the training data. Adding data is unlikely to remedy this problem. This is seen from the convergence of the two learning curves as the size of the training data is increased.
4. Further work should focus on extracting more features from the dataset - currently only a single ML feature `is_employed` (a binary column indicating whether the applicant was employed or not at the time of applying for the loan on Lending Club) was extracted.

<a name="myfootnote1">1</a>: or loss, depending on whether the loan is paid off on time or not

In [None]:
# y_pred_test_selected_threshold_series = pd.Series(
#     y_pred_test_selected_threshold, index=X_test.index, name="pred"
# )
# df_tp = df[["addr_state"]].merge(
#     pd.concat(
#         [
#             X_test[
#                 [
#                     "purpose",
#                     "home_ownership",
#                     "emp_length",
#                     "term",
#                 ]
#                 + numerical_columns
#             ],
#             y_test,
#             y_pred_test_selected_threshold_series,
#         ],
#         axis=1,
#     ),
#     left_index=True,
#     right_index=True,
#     how="inner",
# )

In [None]:
# df_tp["misclassified"] = df_tp["is_default"] != df_tp["pred"]
# display(df_tp)
# display(df_tp.dtypes.to_frame())

In [None]:
# for c in numerical_columns:
#     plot_grouped_histogram(df_tp, c, (0.675, 1.1), 0.5, 0.15, (12, 4))

In [None]:
# for col, wspace, fig_size in zip(
#     ["home_ownership", "purpose", "emp_length", "term", "addr_state"],
#     [0.25, 0.4, 0.1, 0.25, 0.1],
#     [(12, 4), (12, 4), (12, 4), (12, 4), (12, 8)],
# ):
#     plot_grouped_bar_chart(df_tp, col, "misclassified", wspace, fig_size=fig_size)

## Links
1. [Find row closest to a value](https://stackoverflow.com/a/52587453/4057186)