# Classification trials

In [None]:
%load_ext lab_black
%load_ext autoreload
%autoreload 2

In [None]:
import os
from io import StringIO

import numpy as np
import pandas as pd
import sklearn.metrics as mr
from azure.storage.blob import BlobServiceClient
from imblearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.dummy import DummyClassifier
from sklearn.ensemble import (
    AdaBoostClassifier,
    GradientBoostingClassifier,
    RandomForestClassifier,
)
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import (
    RepeatedKFold,
    StratifiedKFold,
    train_test_split,
)
from sklearn.naive_bayes import GaussianNB, BernoulliNB, MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import (
    OneHotEncoder,
    StandardScaler,
)

In [None]:
%aimport src.custom_transformers
import src.custom_transformers as ct

%aimport src.ml_helpers
from src.ml_helpers import base_pipeline, multi_model_grid_search

%aimport src.ml_metrics
from src.ml_metrics import (
    recall_binary_scorer,
    threshold_fpr_score,
    threshold_recall_score,
    threshold_auc_score,
)

%aimport src.visualization_helpers
from src.visualization_helpers import (
    plot_learning_curve,
    plot_permutation_importances,
    plot_cross_validated_coefs,
    show_yb_grid,
    plot_grouped_bar_chart,
    plot_grouped_histogram,
)

In [None]:
# Widen pandas' display limits so wide/long frames are not truncated when shown
for option_name, option_value in {
    "display.max_rows": 500,
    "display.max_columns": 500,
    "display.max_colwidth": 500,
    "display.width": 1000,
}.items():
    pd.set_option(option_name, option_value)

## About

This notebook will walk through machine learning classification experiments on the raw [Lending Club loans data](https://www.lendingclub.com/auth/login?login_url=%2Fstatistics%2Fadditional-statistics%3F). A best-performing model will then be determined and assessed in the context of the business use case for this project, i.e. a conservative investor wanting to use the model developed here to predict which loan applications on the [Lending Club platform](https://www.lendingclub.com/) will not [default](https://en.wikipedia.org/wiki/Default_(finance)) and should therefore be funded by them.

## User Inputs

User inputs and helper functions, to be used later, are defined below

In [None]:
# Location of the raw loans CSV, read when `cloud_storage` is not "yes"
raw_data_file_path = "data/raw/lending_club_loans.csv"
# "yes" -> download the raw data from Azure blob storage; any other value -> read
# the local file at `raw_data_file_path`
cloud_storage = "no"

# From Feature Reduction
# Threshold for dropping columns by NaN content (coerced to float in the next cell)
nan_threshold = 0.5
# Column groups below are each handed to a column-dropper step of the processing pipeline
non_useful_cols = ["url", "desc"]
datetime_cols = ["issue_d", "last_pymnt_d"]
cols_one_eighteen = [
    "id",
    "member_id",
    "funded_amnt",
    "funded_amnt_inv",
    "grade",
    "sub_grade",
    "emp_title",
]
cols_eighteen_thirtysix = [
    "zip_code",
    "out_prncp",
    "out_prncp_inv",
    "total_pymnt",
    "total_pymnt_inv",
    "total_rec_prncp",
]
cols_thirtyseven_end = [
    "total_rec_int",
    "total_rec_late_fee",
    "recoveries",
    "collection_recovery_fee",
    "last_pymnt_amnt",
]
# `loan_status` values passed to the pipeline's "mapstatus" filter step
loan_status = ["Fully Paid", "Charged Off"]
# Integer encoding of the raw `loan_status` strings
mapping_dictionary_labels = {"loan_status": {"Fully Paid": 1, "Charged Off": 0}}
four_or_less_value_columns = ["pymnt_plan"]

# From Feature Processing
more_than_one_pct_missing_columns = ["pub_rec_bankruptcies"]
datetime_cols_v2 = ["last_credit_pull_d", "earliest_cr_line"]
high_cardinality_cols = ["addr_state"]
# Text -> integer encoding for `emp_length`; note "< 1 year" and "n/a" both map to 0
mapping_dict_emp_length = {
    "emp_length": {
        "10+ years": 10,
        "9 years": 9,
        "8 years": 8,
        "7 years": 7,
        "6 years": 6,
        "5 years": 5,
        "4 years": 4,
        "3 years": 3,
        "2 years": 2,
        "1 year": 1,
        "< 1 year": 0,
        "n/a": 0,
    }
}
# NOTE(review): `nominal_columns` is reassigned from the dtypes of X_train in the
# "Lists of features by type" section before it is used, so this value is overwritten
nominal_columns = ["home_ownership", "verification_status", "purpose", "term"]
repeated_data_cols = ["title"]
pct_to_numeric_cols = ["int_rate", "revol_util"]

# From Exploratory Data Analysis 1/2
correlated_features = [
    # "total_acc",
    "installment",
    "fico_range_low",
    "fico_range_high",
]
look_ahead_features = ["last_fico_range_low", "last_fico_range_high"]
# Name of the raw label column (dropped at the end of the processing pipeline)
raw_labels = ["loan_status"]
# Name of the label column used for modelling
new_labels = ["is_default"]

# NOTE(review): `cols_to_show` is reassigned alongside the scorers further down,
# before the grid-search results are displayed, so this list is overwritten
cols_to_show = [
    "preprocessor_type",
    "resamplers",
    "clf",
    "threshold",
    "params",
    "mean_test_recall_binary",
    "mean_test_fpr",
    "mean_test_auc",
    "mean_train_recall_binary",
    "mean_train_fpr",
    "mean_train_auc",
    "mean_fit_time",
    "std_train_recall_binary",
    "std_test_recall_binary",
    "std_train_fpr",
    "std_test_fpr",
    "mean_score_time",
    "clf_params",
]

In [None]:
# Coerce the user inputs to the numeric types used downstream
# (presumably needed when the inputs are injected externally as strings - TODO confirm)
nan_threshold = float(nan_threshold)

# Walk the mappings themselves instead of a hand-copied list of their keys, so the
# coercion cannot fall out of sync when a key is added to / removed from User Inputs.
# Only values of existing keys are reassigned, which is safe while iterating a dict.
for mapping in (
    mapping_dictionary_labels["loan_status"],
    mapping_dict_emp_length["emp_length"],
):
    for k in mapping:
        mapping[k] = int(mapping[k])

# From Exploratory Data Analysis 2/2
# Swaps the 0/1 encoding (0 -> 1, 1 -> 0) for the `is_default` label
mapping_dict_new_labels = {"is_default": {0: 1, 1: 0}}

## Load data

Raw data from Lending Club is loaded into memory

In [None]:
# Load the raw loans data, either from Azure blob storage or from the local CSV
if cloud_storage == "yes":
    az_storage_container_name = "myconedesx7"
    blob_name = "blobedesz38"

    # Fail fast with a readable message: os.getenv() returns None for an unset
    # variable, which would otherwise be formatted into the connection string as
    # the literal text "None"
    missing_env_vars = [
        env_var
        for env_var in ("AZURE_STORAGE_ACCOUNT", "AZURE_STORAGE_KEY", "ENDPOINT_SUFFIX")
        if not os.getenv(env_var)
    ]
    if missing_env_vars:
        raise RuntimeError(
            "Missing environment variable(s) required for Azure blob storage: "
            + ", ".join(missing_env_vars)
        )

    conn_str = (
        "DefaultEndpointsProtocol=https;"
        f"AccountName={os.getenv('AZURE_STORAGE_ACCOUNT')};"
        f"AccountKey={os.getenv('AZURE_STORAGE_KEY')};"
        f"EndpointSuffix={os.getenv('ENDPOINT_SUFFIX')}"
    )
    blob_service_client = BlobServiceClient.from_connection_string(conn_str=conn_str)

    # A single blob holds the raw data, so no loop/accumulator is needed
    blob_client = blob_service_client.get_blob_client(
        container=az_storage_container_name, blob=blob_name
    )
    blobstring = blob_client.download_blob().content_as_text()
    # NOTE(review): the blob is read with skiprows=1 but the local file with
    # skiprows=0 - presumably the blob carries an extra leading line; confirm
    df = pd.read_csv(StringIO(blobstring), skiprows=1, low_memory=False)
else:
    df = pd.read_csv(raw_data_file_path, skiprows=0, low_memory=False)

## Train-Test split

A hold-out set of the raw data will be set aside for model assessment

In [None]:
# Hold out 33% of the raw rows for final model assessment; the fixed seed keeps
# the split reproducible across runs
df_train, df_test = train_test_split(
    df,
    random_state=4321,
    test_size=0.33,
)

## Data processing

The raw data is cleaned, filtered and features are extracted/selected from this processed data in order to run experiments in classification

In [None]:
# Processing pipeline: the steps run in the order listed. It is fitted on the
# training split only and then applied, unchanged, to the hold-out split.
pipe_1_2_3 = Pipeline(
    [
        # nan_threshold is a float in [0, 1.0], taken from User Inputs (was
        # hard-coded to 0.5 here, so changing the user input had no effect)
        ("nanthresh", ct.DFNanThresholdColumnDropper(nan_threshold)),
        ("nouse", ct.DFColumnDropper(non_useful_cols)),
        ("dtime", ct.DFColumnDropper(datetime_cols)),
        ("c1", ct.DFColumnDropper(cols_one_eighteen)),
        ("c2", ct.DFColumnDropper(cols_eighteen_thirtysix)),
        ("c3", ct.DFColumnDropper(cols_thirtyseven_end)),
        (
            "mapstatus",
            ct.DFColumnFilterList("loan_status", loan_status),
        ),
        ("colmap", ct.DFColumnMapper(mapping_dictionary_labels)),
        ("onevals", ct.DFNonUniqueValColDropper(num_non_unique_vals=1)),
        # four_or_less_value_columns=['pymnt_plan'] - 'pymnt_plan' is MOSTLY 'n', EXCLUDE
        ("fourvals", ct.DFColumnDropper(four_or_less_value_columns)),
        # more_than_one_pct_missing_columns = ['pub_rec_bankruptcies']
        ("morethan1pctnan", ct.DFColumnDropper(more_than_one_pct_missing_columns)),
        ("nan", ct.DFDropNaN()),
        # high_cardinality_cols=['addr_state'] - CAN INCLUDE OR EXCLUDE
        ("hcardcols", ct.DFColumnDropper(high_cardinality_cols)),
        ("dtimev2", ct.DFColumnDropper(datetime_cols_v2)),
        ("texttonum", ct.DFColumnMapper(mapping_dict_emp_length)),
        ("repeats", ct.DFColumnDropper(repeated_data_cols)),
        ("pctcols", ct.DFPctNumeric(pct_to_numeric_cols, "%")),
        # presumably derives the new label column from 'loan_status' using the
        # swapped 0/1 encoding in mapping_dict_new_labels - verify in src.custom_transformers
        (
            "singlecolmap",
            ct.DFSingleColumnMapper("loan_status", mapping_dict_new_labels),
        ),
        ("dtype", ct.DFSimpleDtypeChanger(new_labels, "int")),
        # n_std is an integer - CAN CHANGE OR KEEP UNCHANGED
        ("stdfilter", ct.DFColumnStdFilter("annual_inc", 3)),
        # correlated_features=['installment','fico_range_low','fico_range_high'] EXCLUDE
        # ('total_acc' is currently commented out of the list in User Inputs)
        ("corr", ct.DFColumnDropper(correlated_features)),
        ("lookahead", ct.DFColumnDropper(look_ahead_features)),
        ("label", ct.DFColumnDropper(raw_labels)),
        # requires ("corr", ...) to be removed; threshold=1, ideally chosen from dendrogram
        # ("clusterselect", ct.DFHierarchicalClusterSpearmanRank(threshold=1)),
    ]
)
df_pipe_transformed_train = pipe_1_2_3.fit_transform(df_train)
df_pipe_transformed_test = pipe_1_2_3.transform(df_test)
print(df_pipe_transformed_train.shape)
print(df_pipe_transformed_test.shape)
display(df_pipe_transformed_train.head(2))
display(df_pipe_transformed_test.head(2))
# Class balance of the processed training labels
display(df_pipe_transformed_train["is_default"].squeeze().value_counts(dropna=False))

## Get features and class labels from processed data

The features and labels are now extracted from the processed training and testing splits

In [None]:
# Separate each processed split into a feature frame (everything except the label
# column) and an integer label Series (the single label column, squeezed)
X_train, y_train = (
    df_pipe_transformed_train.drop(labels=new_labels, axis=1),
    df_pipe_transformed_train[new_labels].astype(int).squeeze(),
)
X_test, y_test = (
    df_pipe_transformed_test.drop(labels=new_labels, axis=1),
    df_pipe_transformed_test[new_labels].astype(int).squeeze(),
)

# Preview the first two rows of the features, then of the labels
for feature_frame in (X_train, X_test):
    display(feature_frame.head(2))
for label_series in (y_train, y_test):
    display(label_series.to_frame().head(2))

## Assemble components for `GridSearchCV`

The pipeline inputs and other components required for hyperparameter optimization using `GridSearchCV` are extracted here

### Lists of features by type

A list of numerical and categorical features is extracted from the processed data
- numerical features are those whose datatype is not `object`, excluding `emp_length`
- categorical features are those with a datatype of `object`, plus `emp_length` (which is treated as categorical)

In [None]:
# `emp_length` is mapped to integers by `mapping_dict_emp_length`, but it is
# treated as a categorical feature, so it is moved from the numerical to the
# nominal list
forced_nominal_columns = ["emp_length"]
excluded_from_numerical = set(new_labels + forced_nominal_columns)

# Numerical features: every non-object column that is not a label / forced nominal
numerical_columns = [
    c
    for c in X_train.select_dtypes(exclude="object").columns
    if c not in excluded_from_numerical
]
# Nominal features: every object column, plus the forced nominal column(s)
nominal_columns = (
    list(X_train.select_dtypes(include="object")) + forced_nominal_columns
)

# Sanity check that the two lists cover exactly the feature columns. An explicit
# comparison is used instead of assert/except, since assert statements are
# stripped when Python runs with -O and the check would then always "pass".
expected_features = set(X_train.columns) - set(new_labels)
listed_features = set(numerical_columns + nominal_columns)
if listed_features == expected_features:
    print("Columns from training data match feature lists")
else:
    print("Some columns from training data are missing from feature lists")
    # Name the offending columns (present in only one of the two sets)
    print(
        "Mismatched columns: "
        + ", ".join(sorted(str(c) for c in listed_features ^ expected_features))
    )
print("Categoricals:\n-" + "\n-".join(nominal_columns))
print("Numericals:\n-" + "\n-".join(numerical_columns))

### Feature transformations

Below, feature transformations to be applied to all numerical columns are defined. With or without transformations, all numerical features will be normalized. All categorical features will be one-hot encoded.

In [None]:
def _one_hot_step():
    """Build a fresh ("onehot", encoder, columns) entry for a ColumnTransformer.

    A new OneHotEncoder is created on every call so that the two preprocessors
    below do not share an encoder instance.
    """
    return ("onehot", OneHotEncoder(handle_unknown="ignore"), nominal_columns)


# One power-transform + standard-scale pipeline per numerical column
col_transformers = {}
for numerical_column in numerical_columns:
    col_transformers[numerical_column] = Pipeline(
        steps=[
            ("trans", ct.DFPowerTransformer("yeo-johnson")),
            ("ss", ct.DFStandardScaler()),
        ]
    )

# "no_trans": all numerical columns are only standard-scaled, as one group
scaled_numericals_step = (
    "nums",
    Pipeline(steps=[("trans", StandardScaler())]),
    numerical_columns,
)
# "trans": each numerical column goes through its own pipeline from col_transformers
per_column_steps = [
    (column_name, column_pipe, [column_name])
    for column_name, column_pipe in col_transformers.items()
]

# Both variants one-hot encode the nominal columns and pass through anything else
preprocessors = {
    "no_trans": ColumnTransformer(
        transformers=[scaled_numericals_step, _one_hot_step()],
        remainder="passthrough",
    ),
    "trans": ColumnTransformer(
        transformers=per_column_steps + [_one_hot_step()],
        remainder="passthrough",
    ),
}

### Classifiers and hyper-parameters for optimization

The models to be compared, discrimination threshold(s) (to be applied to all listed models), and dictionaries of model hyper-parameters for tuning, are defined below
- for hyper-parameter dictionaries containing cost-function weights, for manual specification of the penalties used in the algorithm's cost function, the larger penalty should be assigned to the minority class (see [**Lesson 07. Cost-Sensitive Algorithms**](https://machinelearningmastery.com/imbalanced-classification-with-python-7-day-mini-course/))
  - here, this is for the positive class where `is_default`==1, and is explicitly shown below through the class-balance of the labels from the training data

In [None]:
# Class balance of the training labels: number of rows per `is_default` value
y_train.value_counts().to_frame()

In [None]:
# Models to compare (currently a single L2-penalised logistic regression)
logistic_regression = LogisticRegression(penalty="l2", solver="lbfgs", max_iter=500)
clf_list = [logistic_regression]

# Cost-function weights to try for class 1, with class 0 fixed at weight 1,
# in addition to scikit-learn's "balanced" heuristic
class_one_penalties = (5, 6, 8)

# Hyper-parameter grid, keyed by the classifier's class name
parameters = {
    "LogisticRegression": dict(
        C=[1.0],
        class_weight=["balanced"]
        + [{0: 1, 1: penalty} for penalty in class_one_penalties],
    ),
}

In [None]:
from sklearn.metrics import make_scorer, recall_score, roc_auc_score
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.utils.class_weight import compute_sample_weight

In [None]:
# Classifier under test and the pipeline that feeds it the untransformed
# ("no_trans") preprocessor output
selected_clf = clf_list[0]
pipe = Pipeline(
    steps=[
        ("preprocessor", preprocessors["no_trans"]),
        ("clf", selected_clf),
    ]
)

# Prefix each hyper-parameter with the pipeline step name so that GridSearchCV
# routes it to the "clf" step
params_dict = {}
for param_name, param_values in parameters[type(selected_clf).__name__].items():
    params_dict[f"clf__{param_name}"] = param_values

# 5-fold stratified cross-validation without shuffling
cv = StratifiedKFold(n_splits=5, shuffle=False)

In [None]:
def roc_auc_binary_scorer(y_true, y_pred):
    """Return the ROC AUC of ``y_pred`` against ``y_true``.

    Samples are weighted with the "balanced" class weights computed from
    ``y_true`` and the score is requested with ``average="weighted"``.

    Parameters
    ----------
    y_true : array-like
        Ground-truth class labels.
    y_pred : array-like
        Predicted scores or labels to evaluate.

    Returns
    -------
    float
        The value returned by ``roc_auc_score``.
    """
    balanced_weights = compute_sample_weight(class_weight="balanced", y=y_true)
    return roc_auc_score(
        y_true,
        y_pred,
        average="weighted",
        sample_weight=balanced_weights,
    )


def recall_binary_scorer(y_true, y_pred):
    """Return the binary recall of ``y_pred`` against ``y_true``.

    Samples are weighted with the "balanced" class weights computed from
    ``y_true``.

    NOTE(review): this definition replaces the ``recall_binary_scorer`` imported
    from ``src.ml_metrics`` at the top of the notebook - confirm that is intended.

    Parameters
    ----------
    y_true : array-like
        Ground-truth class labels.
    y_pred : array-like
        Predicted class labels.

    Returns
    -------
    float
        The value returned by ``recall_score`` with ``average="binary"``.
    """
    balanced_weights = compute_sample_weight(class_weight="balanced", y=y_true)
    return recall_score(
        y_true,
        y_pred,
        average="binary",
        sample_weight=balanced_weights,
    )


def false_positive_rate_scorer(y_true, y_pred):
    """Return the *negated* false positive rate, ``-FP / (FP + TN)``.

    Parameters
    ----------
    y_true : array-like of 0/1
        Ground-truth labels; must support element-wise ``==``.
    y_pred : array-like of 0/1
        Predicted labels; must support element-wise ``==``.

    Returns
    -------
    float
        Minus the fraction of true-0 samples that were predicted as 1.

    Raises
    ------
    ZeroDivisionError
        If no sample with ``y_true == 0`` is predicted as 0 or 1.
    """
    actual_negative = y_true == 0
    # Count the true-0 samples by the label they were predicted as
    true_negatives = int(((y_pred == 0) & actual_negative).sum())
    false_positives = int(((y_pred == 1) & actual_negative).sum())
    return -(false_positives / (false_positives + true_negatives))


def threshold_roc_auc_score(ground_truth, predictions, threshold=0.5):
    """Binarise ``predictions`` at ``threshold`` and score them with ``roc_auc_binary_scorer``.

    Values greater than or equal to ``threshold`` become 1, all others 0, so the
    ROC AUC is computed on hard 0/1 labels rather than on the raw values.
    """
    hard_labels = (predictions >= threshold).astype("int")
    return roc_auc_binary_scorer(ground_truth, hard_labels)


def threshold_recall_score(ground_truth, predictions, threshold=0.5):
    """Binarise ``predictions`` at ``threshold`` and score them with ``recall_binary_scorer``.

    Values greater than or equal to ``threshold`` become 1, all others 0.

    NOTE(review): this definition replaces the ``threshold_recall_score`` imported
    from ``src.ml_metrics`` at the top of the notebook - confirm that is intended.
    """
    hard_labels = (predictions >= threshold).astype("int")
    return recall_binary_scorer(ground_truth, hard_labels)


def threshold_fpr_score(ground_truth, predictions, threshold=0.5):
    """Binarise ``predictions`` at ``threshold`` and score them with ``false_positive_rate_scorer``.

    Values greater than or equal to ``threshold`` become 1, all others 0. The
    result is whatever ``false_positive_rate_scorer`` returns for those labels,
    i.e. the negated false positive rate.

    NOTE(review): this definition replaces the ``threshold_fpr_score`` imported
    from ``src.ml_metrics`` at the top of the notebook - confirm that is intended.
    """
    hard_labels = (predictions >= threshold).astype("int")
    return false_positive_rate_scorer(ground_truth, hard_labels)


def pr_auc_score(ground_truth, y_probs):
    """Return the area under the precision-recall curve.

    The curve is computed by ``precision_recall_curve`` from ``ground_truth`` and
    ``y_probs`` and integrated with ``auc``, using recall as the x-axis and
    precision as the y-axis.
    """
    precision_values, recall_values, _thresholds = mr.precision_recall_curve(
        ground_truth, y_probs
    )
    return mr.auc(recall_values, precision_values)

In [None]:
def _proba_scorer(score_func, greater_is_better=True, **score_kwargs):
    """Wrap ``score_func`` in a scorer that is fed predicted probabilities.

    Any extra keyword arguments are forwarded to ``make_scorer``.
    NOTE(review): newer scikit-learn releases deprecate ``needs_proba`` - confirm
    against the installed version.
    """
    return make_scorer(
        score_func,
        greater_is_better=greater_is_better,
        needs_proba=True,
        **score_kwargs,
    )


# Discrimination threshold applied by all threshold-based metrics
decision_threshold = 0.5

# Metrics evaluated for every grid-search candidate; the keys become the
# `*_test_<key>` / `*_train_<key>` column suffixes in `cv_results_`.
# NOTE(review): `threshold_fpr_score` already returns a negated rate and
# `greater_is_better=False` negates scores again, so the two sign flips
# presumably cancel and FPR is reported as a positive number - verify.
multi_scorers = {
    "recall_binary": _proba_scorer(
        threshold_recall_score, threshold=decision_threshold
    ),
    "fpr": _proba_scorer(
        threshold_fpr_score, greater_is_better=False, threshold=decision_threshold
    ),
    "roc_auc_binary": _proba_scorer(
        threshold_roc_auc_score, threshold=decision_threshold
    ),
    "pr_auc": _proba_scorer(pr_auc_score),
}
# Columns of the grid-search results to display, generated from the metric names
# so that the mean/std and train/test variants stay in a consistent order
_metric_names = ("recall_binary", "fpr", "roc_auc_binary", "pr_auc")
cols_to_show = (
    [
        # "preprocessor_type",
        # "resamplers",
        "clf",
        "params",
    ]
    + [f"mean_test_{metric}" for metric in _metric_names]
    + [f"mean_train_{metric}" for metric in _metric_names]
    + ["mean_fit_time"]
    # per metric: train spread first, then test spread
    + [
        f"std_{split}_{metric}"
        for metric in _metric_names
        for split in ("train", "test")
    ]
    + ["mean_score_time", "clf_params"]
)

In [None]:
%%time
# Exhaustive search over `params_dict`, scored with every metric in
# `multi_scorers`; the best candidate by "recall_binary" is refit on all of X_train
gs = GridSearchCV(
    estimator=pipe,
    param_grid=params_dict,
    cv=cv,
    scoring=multi_scorers,
    refit="recall_binary",
    return_train_score=True,
    n_jobs=-1,
)
gs.fit(X_train, y_train)

# Every pipeline step other than the preprocessor and the classifier is reported
# as a resampler; a comprehension keeps them in pipeline order (the previous set
# difference had no defined order)
resamplers = [
    step_name
    for step_name in pipe.named_steps
    if step_name not in ("preprocessor", "clf")
]
# Label of the preprocessor actually used in `pipe` (its key in `preprocessors`).
# Previously the ColumnTransformer object itself was written into every row of
# the `preprocessor_type` column instead of this label.
preprocessor_type = next(
    (
        name
        for name, preprocessor in preprocessors.items()
        if preprocessor is pipe.named_steps["preprocessor"]
    ),
    "unknown",
)
clf_name = type(clf_list[0]).__name__

# Annotate the cross-validation results with the experiment's settings
df_gs = (
    pd.DataFrame(gs.cv_results_)
    .assign(preprocessor_type=preprocessor_type)
    .assign(resamplers=",".join(resamplers))
    .assign(clf_params=f"{clf_name}_({params_dict})")
)
df_gs.insert(0, "clf", clf_name)
display(df_gs[cols_to_show])