# Exploratory Data Analysis

In [None]:
%load_ext lab_black
%load_ext autoreload
%autoreload 2

In [None]:
import os
from io import StringIO

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from azure.storage.blob import BlobServiceClient
from sklearn.base import TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import (
    FunctionTransformer,
    PowerTransformer,
    QuantileTransformer,
)

In [None]:
%aimport src.custom_transformers
from src.custom_transformers import (
    DFNanThresholdColumnDropper,
    DFColumnDropper,
    DFColumnFilterList,
    DFColumnMapper,
    DFNonUniqueValColDropper,
    DFDropNaN,
    DFOneHotEncoder,
    DFPctNumeric,
)

In [None]:
pd.set_option("display.max_rows", 2000)
pd.set_option("display.max_columns", 500)
pd.set_option("display.width", 1000)

## User Inputs

Inputs and helper functions are defined here

In [None]:
raw_data_path = "data/raw/lending_club_loans.csv"
cloud_storage = "no"

# From Feature Reduction
nan_threshold = 0.5
non_useful_cols = ["url", "desc"]
datetime_cols1 = ["issue_d", "last_pymnt_d"]
cols_one_eighteen = [
    "id",
    "member_id",
    "funded_amnt",
    "funded_amnt_inv",
    "grade",
    "sub_grade",
    "emp_title",
]
cols_eighteen_thirtysix = [
    "zip_code",
    "out_prncp",
    "out_prncp_inv",
    "total_pymnt",
    "total_pymnt_inv",
    "total_rec_prncp",
]
cols_thirtyseven_end = [
    "total_rec_int",
    "total_rec_late_fee",
    "recoveries",
    "collection_recovery_fee",
    "last_pymnt_amnt",
]
loan_status = ["Fully Paid", "Charged Off"]
mapping_dictionary = {"loan_status": {"Fully Paid": 1, "Charged Off": 0}}
four_or_less_value_columns = ["pymnt_plan"]

# From Feature Processing
more_than_one_pct_missing_columns = ["pub_rec_bankruptcies"]
datetime_cols2 = ["last_credit_pull_d", "earliest_cr_line"]
high_cardinality_cols = ["addr_state"]
mapping_dict = {
    "emp_length": {
        "10+ years": 10,
        "9 years": 9,
        "8 years": 8,
        "7 years": 7,
        "6 years": 6,
        "5 years": 5,
        "4 years": 4,
        "3 years": 3,
        "2 years": 2,
        "1 year": 1,
        "< 1 year": 0,
        "n/a": 0,
    }
}
nominal_columns = ["home_ownership", "verification_status", "purpose", "term"]
repeated_data_cols = ["title"]
pct_to_numeric_cols = ["int_rate", "revol_util"]

In [None]:
# Coerce the user inputs to the numeric types the pipeline expects.
# The dictionaries are iterated directly so there is no second, hand-maintained
# copy of their keys to keep in sync with the "User Inputs" cell.
nan_threshold = float(nan_threshold)
for k, v in mapping_dictionary["loan_status"].items():
    # only values are replaced, so the dict size is unchanged while iterating
    mapping_dictionary["loan_status"][k] = int(v)
for k, v in mapping_dict["emp_length"].items():
    mapping_dict["emp_length"][k] = int(v)

In [None]:
def customize_splines(ax: "plt.Axes") -> "plt.Axes":
    """Apply the notebook's spine styling to ``ax`` and return it.

    The left and bottom spines are drawn black with line width 2; the top and
    right spines are drawn light grey with line width 1.

    Parameters
    ----------
    ax : plt.Axes
        Axes whose ``spines`` are restyled in place.

    Returns
    -------
    plt.Axes
        The same ``ax`` object, to allow chaining.
    """
    # side -> (edge colour, line width)
    spine_styles = {
        "left": ("black", 2),
        "bottom": ("black", 2),
        "top": ("lightgrey", 1),
        "right": ("lightgrey", 1),
    }
    for side, (edge_color, line_width) in spine_styles.items():
        ax.spines[side].set_edgecolor(edge_color)
        ax.spines[side].set_linewidth(line_width)
    return ax

In [None]:
def get_cols_by_partial_name(df, partial_col_name):
    """Return the names of the columns of ``df`` that contain ``partial_col_name``.

    Matching is done with ``df.columns.str.contains``, so the text may appear
    anywhere in the column name, and pandas treats it as a regular expression
    by default.

    Parameters
    ----------
    df : pd.DataFrame
        Frame whose column names are searched.
    partial_col_name : str
        Text (pattern) to look for in each column name.

    Returns
    -------
    list
        Matching column names, in their original column order.
    """
    is_match = df.columns.str.contains(partial_col_name)
    matching_columns = df.columns[is_match]
    return matching_columns.tolist()

In [None]:
def reverse_get_dummies(df, partial_name):
    """Collapse a group of dummy columns back into a single Series.

    The columns whose name contains ``partial_name`` are selected and, for every
    row, the name of the column holding the row-wise maximum is returned (the
    full column name, including ``partial_name``).

    Example
    -------
    > reverse_get_dummies(df, "purpose_")
    """
    dummy_cols = get_cols_by_partial_name(df, partial_name)
    dummies = df[dummy_cols]
    return dummies.idxmax(axis=1)

In [None]:
def plot_lower_corr_heatmap(
    df_corr, ptitle, lw=1, annot_fmt=".2f", ptitle_y_loc=1, fig_size=(10, 10)
):
    """Draw the lower triangle of a correlation matrix as an annotated heatmap.

    Parameters
    ----------
    df_corr : array-like
        Square matrix of values to plot (colour scale is fixed to [-1, 1]).
    ptitle : str
        Title, drawn left-aligned and bold.
    lw : float
        Width of the lines separating cells.
    annot_fmt : str
        Format string for the cell annotations.
    ptitle_y_loc : float
        Vertical position of the title (``y`` of ``Axes.set_title``).
    fig_size : tuple
        Figure size passed to ``plt.subplots``.
    """
    _, ax = plt.subplots(figsize=fig_size)
    # True on and above the diagonal: those cells are hidden by the mask
    upper_triangle = np.triu(np.ones_like(df_corr, dtype=bool))
    heatmap_options = dict(
        mask=upper_triangle,
        vmin=-1,
        vmax=1,
        center=0,
        cmap=sns.diverging_palette(220, 10, as_cmap=True),
        square=True,
        ax=ax,
        annot=True,
        cbar=False,
        linewidths=lw,
        fmt=annot_fmt,
    )
    sns.heatmap(df_corr, **heatmap_options)
    ax.set_title(ptitle, loc="left", fontweight="bold", y=ptitle_y_loc)
    ax.tick_params(left=False, bottom=False)

In [None]:
def plot_is_default(df, by, default_col="is_default", is_default=1, ascending=True):
    """Plot, per value of ``by``, the percentage of rows with ``default_col == is_default``.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain the columns ``by`` and ``default_col``.
    by : str
        Column whose distinct values become the bars.
    default_col : str
        Column holding the default flag.
    is_default : scalar
        Value of ``default_col`` whose percentage is plotted.
    ascending : bool
        When true, bars are ordered by increasing percentage; otherwise the
        order produced by the group-by is kept.

    Returns
    -------
    plt.Axes
        The axes holding the horizontal bar chart.
    """
    # counts per (flag value, category) and per category
    counts_by_flag = df.groupby([default_col, by])[by].count()
    counts_by_category = df.groupby(by)[by].count()
    # rows: flag value, columns: category
    pct = counts_by_flag.unstack() * 100 / counts_by_category
    fig, ax = plt.subplots(figsize=(8, 6))
    if ascending:
        # sort the categories by the percentage in the requested flag row
        pct = pct.T.sort_values(by=is_default, ascending=True).T
    selected = pct.loc[is_default]
    ax = selected.plot.barh(ax=ax, zorder=3)
    ax.set_title("Percent of loans in default", loc="left", fontweight="bold")
    ax.grid(which="both", axis="both", color="lightgrey", zorder=0)
    customize_splines(ax)
    ax.set_ylabel(None)
    return ax

In [None]:
def plot_distribution(df, col, default_col="is_default"):
    """Show ``col`` as a histogram next to box plots split by ``default_col``.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain ``col`` and ``default_col``.
    col : str
        Numeric column to visualise.
    default_col : str
        Column used to split the horizontal box plots.
    """
    fig = plt.figure(figsize=(15, 5))
    layout = plt.GridSpec(1, 2, hspace=0.2, wspace=0.2)
    hist_ax = fig.add_subplot(layout[0, 0])
    box_ax = fig.add_subplot(layout[0, 1])
    df[col].plot.hist(ax=hist_ax, edgecolor="white", zorder=3)
    sns.boxplot(data=df, x=col, y=default_col, orient="h", ax=box_ax)
    for panel in (hist_ax, box_ax):
        panel.grid(which="both", axis="both", color="lightgrey", zorder=0)
        customize_splines(panel)

In [None]:
def plot_pct_of_data(df, col, wspace=0.9, fig_size=(8, 4), default_col="is_default"):
    """Plot the category shares of a one-hot encoded column, overall and for defaults.

    The dummy columns named ``f"{col}_..."`` are collapsed back into a single
    categorical column. The left panel shows the percentage of all rows in each
    category; the right panel shows the same percentages restricted to rows
    where ``default_col == 1``.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain the ``f"{col}_"`` dummy columns and ``default_col``.
    col : str
        Prefix (without trailing underscore) of the dummy columns.
    wspace : float
        Horizontal spacing between the two panels.
    fig_size : tuple
        Figure size passed to ``plt.figure``.
    default_col : str
        Column holding the default flag (matches the parameter of the same name
        on the other plotting helpers).
    """
    df_term = pd.concat(
        [reverse_get_dummies(df, f"{col}_").rename(col), df[default_col]], axis=1
    )
    # share of every category among all rows
    s1 = 100 * df_term[col].value_counts(normalize=True).sort_values(ascending=True)
    # share of every category among the rows flagged as default
    s2 = 100 * df_term.loc[df_term[default_col] == 1, col].value_counts(
        normalize=True
    ).sort_values(ascending=True)
    fig = plt.figure(figsize=fig_size)
    grid = plt.GridSpec(1, 2, wspace=wspace)
    ax1 = fig.add_subplot(grid[0, 0])
    ax2 = fig.add_subplot(grid[0, 1])
    s1.plot(ax=ax1, kind="barh", zorder=3)
    s2.plot(ax=ax2, kind="barh", zorder=3)
    for ax in [ax1, ax2]:
        ax.set_xlabel(None)
        ax.grid(which="both", axis="both", color="lightgrey", zorder=0)
        _ = customize_splines(ax)

In [None]:
def plot_multiple_barplots(df, cols, col_to_agg="loan_amnt"):
    """Draw one bar chart of row counts per column in ``cols``, two charts per row.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain every column in ``cols`` plus ``col_to_agg``.
    cols : list of str
        Columns to group by; each gets its own subplot.
    col_to_agg : str
        Column whose non-null values are counted within each group.
    """
    fig = plt.figure(figsize=(12, 12))
    # round up so an odd number of columns still gets a (half-filled) last row
    rows = (len(cols) + 1) // 2
    grid = plt.GridSpec(rows, 2, hspace=0.2, wspace=0.2)
    for i, col in enumerate(cols):
        ax = fig.add_subplot(grid[i // 2, i % 2])
        df.groupby(col)[col_to_agg].count().plot.bar(ax=ax, zorder=3)
        ax.grid(which="both", axis="both", color="lightgrey", zorder=0)
        ax.set_xticklabels(ax.get_xticklabels(), rotation=22, ha="right")
        _ = customize_splines(ax)
        ax.set_xlabel(None)

## Load Data

In [None]:
# Load the raw loans file either from Azure blob storage or from the local path.
# In both cases the first line of the file is skipped (skiprows=1).
if cloud_storage == "yes":
    az_storage_container_name = "myconedesx7"
    blob_name = "blobedesz38"
    # os.getenv returns None for an unset variable, which the f-strings below
    # would render as the text "None" - make sure all three are exported.
    conn_str = (
        "DefaultEndpointsProtocol=https;"
        f"AccountName={os.getenv('AZURE_STORAGE_ACCOUNT')};"
        f"AccountKey={os.getenv('AZURE_STORAGE_KEY')};"
        f"EndpointSuffix={os.getenv('ENDPOINT_SUFFIX')}"
    )
    blob_service_client = BlobServiceClient.from_connection_string(conn_str=conn_str)
    blob_client = blob_service_client.get_blob_client(
        container=az_storage_container_name, blob=blob_name
    )
    blobstring = blob_client.download_blob().content_as_text()
    loans_2007 = pd.read_csv(StringIO(blobstring), skiprows=1, low_memory=False)
else:
    loans_2007 = pd.read_csv(raw_data_path, skiprows=1, low_memory=False)

In [None]:
loans_2007, _ = train_test_split(loans_2007, test_size=0.33, random_state=4321)
loans_2007 = loans_2007.reset_index(drop=True)

In [None]:
pipe_part_1_and_2 = Pipeline(
    [
        ("nan1", DFNanThresholdColumnDropper(nan_threshold)),
        ("nouse", DFColumnDropper(non_useful_cols)),
        ("dtime1", DFColumnDropper(datetime_cols1)),
        ("c1", DFColumnDropper(cols_one_eighteen)),
        ("c2", DFColumnDropper(cols_eighteen_thirtysix)),
        ("c3", DFColumnDropper(cols_thirtyseven_end)),
        (
            "mapstatus",
            DFColumnFilterList("loan_status", loan_status),
        ),
        ("colmap", DFColumnMapper(mapping_dictionary)),
        ("onevals", DFNonUniqueValColDropper(1)),
        ("fourvals", DFColumnDropper(four_or_less_value_columns)),
        ("morethan1pctnan", DFColumnDropper(more_than_one_pct_missing_columns)),
        ("nan2", DFDropNaN()),
        ("hcardcols", DFColumnDropper(high_cardinality_cols)),
        ("dtime2", DFColumnDropper(datetime_cols2)),
        ("texttonum", DFColumnMapper(mapping_dict)),
        ("onehot", DFOneHotEncoder(nominal_columns)),
        ("repeats", DFColumnDropper(repeated_data_cols)),
        ("pctcols", DFPctNumeric(pct_to_numeric_cols, "%")),
    ]
)
df = pipe_part_1_and_2.fit_transform(loans_2007)

A full list of columns in this processed data is shown below

In [None]:
list(df)

## Identifying default loans

In [None]:
df["loan_status"].value_counts().to_frame()

In [None]:
class DFSingleColumnMapper(TransformerMixin):
    """Copy one column under a new name and remap the copy's values.

    Parameters
    ----------
    col : str
        Existing column to copy.
    mapping_dict : dict
        Nested mapping ``{new_column: {old_value: new_value, ...}}``. Its first
        key names the new column; the whole dictionary is passed to
        ``DataFrame.replace``.
    """

    def __init__(self, col, mapping_dict):
        self.col = col
        self.mapping_dict = mapping_dict

    def fit(self, X, y=None):
        """Nothing is learned; returns ``self``."""
        return self

    def transform(self, X):
        """Return a new DataFrame with the mapped column added.

        The input frame is copied first so the caller's DataFrame is not
        modified as a side effect.
        """
        # assumes X is a DataFrame
        new_col = next(iter(self.mapping_dict))
        X = X.copy()
        X[new_col] = X[self.col]
        return X.replace(self.mapping_dict)

    def fit_transform(self, X, y=None, **kwargs):
        """Fit (a no-op) and transform ``X`` in one call."""
        return self.fit(X, y).transform(X)

In [None]:
class DFSimpleDtypeChanger(TransformerMixin):
    """Cast one column of a DataFrame to a given dtype.

    Parameters
    ----------
    col : str
        Column to cast.
    datatype : str or type
        Target dtype, passed to ``astype``.
    """

    def __init__(self, col, datatype):
        self.col = col
        self.datatype = datatype

    def fit(self, X, y=None):
        """Nothing is learned; returns ``self``."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with ``col`` cast to ``datatype``.

        Working on a copy keeps the caller's DataFrame unchanged.
        """
        # assumes X is a DataFrame
        X = X.copy()
        X[self.col] = X[self.col].astype(self.datatype)
        return X

    def fit_transform(self, X, y=None, **kwargs):
        """Fit (a no-op) and transform ``X`` in one call."""
        return self.fit(X, y).transform(X)

In [None]:
# loan_status was mapped to 1 for "Fully Paid" and 0 for "Charged Off";
# every row whose status is not 1 is flagged as a default.
df["is_default"] = (df["loan_status"] != 1).astype(int)

In [None]:
# NOTE(review): this rebinds `mapping_dict`, which the "User Inputs" cell defines
# as the emp_length mapping consumed by the "texttonum" step of the pipeline.
# Re-running the pipeline cell after this cell would pick up this dictionary
# instead; restart the kernel (or re-run "User Inputs") before doing so.
mapping_dict = {"is_default": {0: 1, 1: 0}}

In [None]:
# pipe = Pipeline(
#     [
#         ("singlecolmap", DFSingleColumnMapper("loan_status", mapping_dict)),
#         ("dtype", DFSimpleDtypeChanger("is_default", "int")),
#     ]
# )
# df = pipe.fit_transform(df)

In [None]:
df.is_default.value_counts().to_frame()

## Exploratory Data Analysis

### Breakdown of default vs non-default loans

In [None]:
fig, ax = plt.subplots(figsize=(8, 4))
# `stacked=True` only stacks datasets that are passed together in one call, so
# both groups go into a single ax.hist call; this also gives them shared bins.
ax.hist(
    [
        df.loc[df.is_default == 0, "loan_amnt"],
        df.loc[df.is_default == 1, "loan_amnt"],
    ],
    bins=7,
    edgecolor="white",
    stacked=True,
    label=["non-default", "default"],
    zorder=3,
)
ax.set_title("Loan Amount ($)", loc="left", fontweight="bold")
ax.legend(frameon=False)
ax.grid(which="both", axis="both", color="lightgrey", zorder=0)
_ = customize_splines(ax)

### Loan purpose

In [None]:
# For every purpose dummy column: the percentage of loans with that purpose
# that did not default ("Non-default") and that did ("Default").
purpose_cols = get_cols_by_partial_name(df, "purpose_")
loans = []
for purpose_col in purpose_cols:
    has_purpose = df[purpose_col] == 1
    n_with_purpose = np.sum(has_purpose)
    good = 100 * np.sum([has_purpose & (df.is_default == 0)]) / n_with_purpose
    bad = 100 * np.sum([has_purpose & (df.is_default == 1)]) / n_with_purpose
    loans.append([purpose_col, good, bad])

loans = pd.DataFrame(loans)
loans.columns = ["Purpose", "Non-default", "Default"]
loans = loans.set_index("Purpose")
loans.sort_values("Non-default")

In [None]:
g = sns.catplot(
    data=loans.stack().reset_index(),
    kind="bar",
    orient="h",
    hue="level_1",
    y="Purpose",
    x=0,
    ci="sd",
    palette="dark",
    height=5,
    aspect=1.5,
    zorder=3,
)
g.set_axis_labels("", "")
g.legend.set_title("")
plt.grid(zorder=0)

### Loan Amount

In [None]:
plot_distribution(df, "loan_amnt", "is_default")

In [None]:
# Bucket the loan amount into 5k-wide bins (0-5k, 5-10k, ..., 30-35k) and plot
# the default percentage per bucket.
loan_amnt_edges = list(range(0, 36000, 5000))
loan_amnt_labels = [str(lo) + "-" + str(lo + 5) + "k" for lo in range(0, 35, 5)]
loan_amnt_bin = pd.cut(df["loan_amnt"], loan_amnt_edges, labels=loan_amnt_labels)
loan_amnt_bin = loan_amnt_bin.rename("loan_amnt_bin")
plot_is_default(
    pd.concat([df["is_default"], loan_amnt_bin], axis=1), "loan_amnt_bin", "is_default"
)

In [None]:
df.groupby("is_default")["loan_amnt"].describe()

In [None]:
transformers = {
    "bc": PowerTransformer("box-cox"),
    "yj": PowerTransformer("yeo-johnson"),
    "qn": QuantileTransformer(output_distribution="normal", n_quantiles=len(df) - 1),
    "qu": QuantileTransformer(output_distribution="uniform", n_quantiles=len(df) - 1),
    "l": FunctionTransformer(np.log1p, inverse_func=np.expm1),
}
pipe_trans = Pipeline([("trans", transformers["bc"])])
# Put the transformed values back on df's own index: pd.concat(axis=1) aligns
# on index labels, so a fresh 0..n-1 index would pair values with the wrong
# is_default flags whenever df's index has gaps.
loan_amnt_trans = pd.DataFrame(
    pipe_trans.fit_transform(df[["loan_amnt"]]),
    index=df.index,
    columns=["loan_amnt"],
)
plot_distribution(
    pd.concat([loan_amnt_trans, df["is_default"]], axis=1),
    "loan_amnt",
)

**Notes**
- all transformers can be used
- **all transformers should be compared**

## Various categorical columns

In [None]:
cats_cols = ["home_ownership", "term", "verification_status", "purpose"]
df_cats = pd.concat(
    [reverse_get_dummies(df, f"{c}_").rename(c) for c in cats_cols],
    axis=1,
)
plot_multiple_barplots(
    pd.concat([df_cats, df["loan_amnt"]], axis=1), cats_cols, col_to_agg="loan_amnt"
)

### Term

In [None]:
plot_pct_of_data(df, "term", 0.9, (6, 4))

### Purpose

In [None]:
plot_pct_of_data(df, "purpose", 0.7, (12, 4))

In [None]:
fig, ax = plt.subplots(figsize=(8, 4))
df[get_cols_by_partial_name(df, "purpose_")].sum().sort_values(ascending=True).plot(
    ax=ax, kind="barh", zorder=3
)
ax.grid(which="both", axis="both", color="lightgrey", zorder=0)
_ = customize_splines(ax)

In [None]:
# Long format: one row per (loan, purpose dummy equal to 1), keeping is_default.
df_proc = df.set_index("is_default")[get_cols_by_partial_name(df, "purpose_")]
# `columns=0` names the axis explicitly; the positional form `drop(0, 1)` is
# rejected by pandas 2.x.
plot_is_default(
    df_proc[df_proc == 1].stack().reset_index().drop(columns=0),
    "level_1",
    "is_default",
    1,
)

In [None]:
# Percentage of ALL loans that have a given purpose and did / did not default.
# The denominator is the total number of rows (the original expression
# `len(df[purpose_col] == 1)` is the length of a boolean mask, i.e. len(df)).
n_loans = len(df)
loans = []
for purpose_col in purpose_cols:
    has_purpose = df[purpose_col] == 1
    good = 100 * np.sum([has_purpose & (df.is_default == 0)]) / n_loans
    bad = 100 * np.sum([has_purpose & (df.is_default == 1)]) / n_loans
    loans.append([purpose_col, good, bad])

loans = pd.DataFrame(loans)
loans.columns = ["Purpose", "Non-default", "Default"]
loans.set_index("Purpose", inplace=True)

g = sns.catplot(
    data=loans.stack().sort_values(ascending=False).reset_index(),
    kind="bar",
    orient="h",
    hue="level_1",
    y="Purpose",
    x=0,
    ci="sd",
    palette="dark",
    height=6,
    aspect=1,
    zorder=3,
)
g.set_axis_labels("", "")
g.legend.set_title("")
plt.grid(zorder=0)

### Interest Rate

In [None]:
plot_distribution(df, "int_rate")

In [None]:
plot_is_default(
    pd.concat([df["is_default"], round(df["int_rate"])], axis=1),
    "int_rate",
    "is_default",
    1,
    False,
)

In [None]:
df.groupby("is_default")["int_rate"].describe()

In [None]:
transformers = {
    "bc": PowerTransformer("box-cox"),
    "yj": PowerTransformer("yeo-johnson"),
    "qn": QuantileTransformer(output_distribution="normal", n_quantiles=len(df) - 1),
    "qu": QuantileTransformer(output_distribution="uniform", n_quantiles=len(df) - 1),
    "l": FunctionTransformer(np.log1p, inverse_func=np.expm1),
}
pipe_trans = Pipeline([("trans", transformers["l"])])
# Put the transformed values back on df's own index: pd.concat(axis=1) aligns
# on index labels, so a fresh 0..n-1 index would pair values with the wrong
# is_default flags whenever df's index has gaps.
int_rate_trans = pd.DataFrame(
    pipe_trans.fit_transform(df[["int_rate"]]),
    index=df.index,
    columns=["int_rate"],
)
plot_distribution(
    pd.concat([int_rate_trans, df["is_default"]], axis=1),
    "int_rate",
)

**Notes**
- all transformers can be used
- **all transformers (incl. `log`) should be compared**

### Installment

In [None]:
plot_distribution(df, "installment", "is_default")

In [None]:
df.groupby("is_default")["installment"].describe()

### Employment Length

In [None]:
plot_distribution(df, "emp_length")

In [None]:
df_emp_length = pd.get_dummies(df["emp_length"], prefix="emp_length").assign(
    is_default=df["is_default"]
)
plot_pct_of_data(df_emp_length, "emp_length", 0.5)

In [None]:
df.groupby("is_default")["emp_length"].describe()

In [None]:
transformers = {
    "bc": PowerTransformer("box-cox"),
    "yj": PowerTransformer("yeo-johnson"),
    "qn": QuantileTransformer(output_distribution="normal", n_quantiles=len(df) - 1),
    "qu": QuantileTransformer(output_distribution="uniform", n_quantiles=len(df) - 1),
    "l": FunctionTransformer(np.log1p, inverse_func=np.expm1),
}
pipe_trans = Pipeline([("trans", transformers["qu"])])
# Put the transformed values back on df's own index: pd.concat(axis=1) aligns
# on index labels, so a fresh 0..n-1 index would pair values with the wrong
# is_default flags whenever df's index has gaps.
emp_length_trans = pd.DataFrame(
    pipe_trans.fit_transform(df[["emp_length"]]),
    index=df.index,
    columns=["emp_length"],
)
plot_distribution(
    pd.concat([emp_length_trans, df["is_default"]], axis=1),
    "emp_length",
)

**Notes**
- Box-Cox cannot be used
- none of the transformers seem effective, likely due to the large number of loans for individuals with 10 or more years of employment
  - it seems intuitive that these records should be retained and not filtered out
- **all transformers should be compared OR leave the column un-transformed**

### Home ownership

In [None]:
plot_pct_of_data(df, "home_ownership", 1.3)

### Annual Income

In [None]:
plot_distribution(df, "annual_inc", "is_default")

In [None]:
# Keep only the rows whose annual_inc lies within 3 standard deviations of the
# mean. The index is rebuilt afterwards because later cells concatenate freshly
# built frames (default 0..n-1 index) with columns of df, and pd.concat aligns
# on index labels.
# NOTE: df is overwritten, so re-running this cell filters again against the
# mean/std of the already filtered data.
within_3_std = np.abs(df["annual_inc"] - df["annual_inc"].mean()) <= (
    3 * df["annual_inc"].std()
)
df = df[within_3_std].reset_index(drop=True)

In [None]:
plot_distribution(
    df,
    "annual_inc",
    "is_default",
)

In [None]:
df.groupby("is_default")["annual_inc"].describe()

In [None]:
transformers = {
    "bc": PowerTransformer("box-cox"),
    "yj": PowerTransformer("yeo-johnson"),
    "qn": QuantileTransformer(output_distribution="normal", n_quantiles=len(df) - 1),
    "qu": QuantileTransformer(output_distribution="uniform", n_quantiles=len(df) - 1),
    "l": FunctionTransformer(np.log1p, inverse_func=np.expm1),
}
pipe_trans = Pipeline([("trans", transformers["yj"])])
# df was row-filtered above, so its index has gaps. Re-use that index for the
# transformed values; otherwise pd.concat(axis=1) would align a fresh 0..n-1
# index against df's labels and mismatch values and is_default flags.
annual_inc_trans = pd.DataFrame(
    pipe_trans.fit_transform(df[["annual_inc"]]),
    index=df.index,
    columns=["annual_inc"],
)
plot_distribution(
    pd.concat([annual_inc_trans, df["is_default"]], axis=1),
    "annual_inc",
)

**Notes**
- any transformer can be used
- **all transformers (incl. `log`) should be compared on filtered version of this column**

In [None]:
class DFColumnStdFilter(TransformerMixin):
    """Row filter keeping values of one column close to that column's mean.

    Parameters
    ----------
    col : str
        Column the filter is computed on.
    n_std : float
        Maximum allowed distance from the mean, in standard deviations.
    """

    def __init__(self, col, n_std):
        self.col, self.n_std = col, n_std

    def fit(self, X, y=None):
        """Nothing is learned; returns ``self``."""
        return self

    def transform(self, X):
        """Return the rows of ``X`` within ``n_std`` standard deviations of the mean.

        Mean and standard deviation are computed from the ``X`` passed in.
        """
        # assumes X is a DataFrame
        values = X[self.col]
        distance_from_mean = np.abs(values - values.mean())
        return X[distance_from_mean <= (self.n_std * values.std())]

    def fit_transform(self, X, y=None, **kwargs):
        """Fit (a no-op) and transform ``X`` in one call."""
        return self.fit(X, y).transform(X)

In [None]:
# pipe = Pipeline(
#     [
#         ("stdfilter", DFColumnStdFilter("annual_inc", 3)),
#     ]
# )
# df = pipe.fit_transform(df)

### Verification Status

In [None]:
plot_pct_of_data(df, "verification_status", 1.6)

### `dti` - Monthly payments on debt obligations (excl. this loan) divided by monthly income

In [None]:
plot_distribution(df, "dti", "is_default")

In [None]:
# Bucket dti into 5-wide bins (0-5, 5-10, ..., 25-30) and plot the default
# percentage per bucket. include_lowest=True keeps dti == 0 in the first bin;
# with the default (left-open) intervals those rows would become NaN.
dti_bin = pd.cut(
    df["dti"],
    list(range(0, 30 + 5, 5)),
    labels=[str(x) + "-" + str(x + 5) for x in range(0, 30, 5)],
    include_lowest=True,
).rename("dti_bin")
plot_is_default(
    pd.concat([df["is_default"], dti_bin], axis=1), "dti_bin", "is_default", 1, False
)

In [None]:
df.groupby("is_default")["dti"].describe()

In [None]:
transformers = {
    "bc": PowerTransformer("box-cox"),
    "yj": PowerTransformer("yeo-johnson"),
    "qn": QuantileTransformer(output_distribution="normal", n_quantiles=len(df) - 1),
    "qu": QuantileTransformer(output_distribution="uniform", n_quantiles=len(df) - 1),
    "l": FunctionTransformer(np.log1p, inverse_func=np.expm1),
}
pipe_trans = Pipeline([("trans", transformers["yj"])])
# Re-use df's index for the transformed values; pd.concat(axis=1) aligns on
# index labels, so a fresh 0..n-1 index would mismatch values and flags now
# that df has been row-filtered.
dti_trans = pd.DataFrame(
    pipe_trans.fit_transform(df[["dti"]]),
    index=df.index,
    columns=["dti"],
)
plot_distribution(
    pd.concat([dti_trans, df["is_default"]], axis=1),
    "dti",
)

**Notes**
- Box-Cox cannot be used
- power transformers seem more effective than Yeo-Johnson
- **all transformers should be compared (preference to power)**

### `delinq_2yrs` - Number of greater than one month delinquencies over the last two years

In [None]:
plot_distribution(df, "delinq_2yrs")

In [None]:
df.groupby("is_default")["delinq_2yrs"].describe()

In [None]:
transformers = {
    "bc": PowerTransformer("box-cox"),
    "yj": PowerTransformer("yeo-johnson"),
    "qn": QuantileTransformer(output_distribution="normal", n_quantiles=len(df) - 1),
    "qu": QuantileTransformer(output_distribution="uniform", n_quantiles=len(df) - 1),
    "l": FunctionTransformer(np.log1p, inverse_func=np.expm1),
}
pipe_trans = Pipeline([("trans", transformers["yj"])])
# Only rows with fewer than 2 delinquencies are transformed. The result keeps
# the subset's index and is paired with the is_default flags of exactly those
# rows; a fresh 0..n-1 index would be aligned against unrelated rows of df.
delinq_2yrs_subset = df.loc[df["delinq_2yrs"] < 2, ["delinq_2yrs"]]
delinq_2yrs_trans = pd.DataFrame(
    pipe_trans.fit_transform(delinq_2yrs_subset),
    index=delinq_2yrs_subset.index,
    columns=["delinq_2yrs"],
)
plot_distribution(
    pd.concat(
        [delinq_2yrs_trans, df.loc[delinq_2yrs_subset.index, "is_default"]],
        axis=1,
    ),
    "delinq_2yrs",
)

**Notes**
- Box-Cox cannot be used
- none of the transformers seem effective
  - without more exhaustive exploratory analysis, this seems like an important column so it should not be dropped
- **all transformers should be compared OR (preferred) leave column un-transformed**

In [None]:
plot_is_default(df, "delinq_2yrs", "is_default", 1, False)

### `inq_last_6mths` - Number of inquiries over the last six months

In [None]:
(100 * df["inq_last_6mths"].value_counts(normalize=True)).to_frame()

In [None]:
plot_is_default(df, "inq_last_6mths", "is_default", 1, False)

In [None]:
plot_distribution(df, "inq_last_6mths")

In [None]:
plot_distribution(df[df["inq_last_6mths"] <= 3], "inq_last_6mths")

In [None]:
df.groupby("is_default")["inq_last_6mths"].describe()

In [None]:
transformers = {
    "bc": PowerTransformer("box-cox"),
    "yj": PowerTransformer("yeo-johnson"),
    "qn": QuantileTransformer(output_distribution="normal", n_quantiles=len(df) - 1),
    "qu": QuantileTransformer(output_distribution="uniform", n_quantiles=len(df) - 1),
    "l": FunctionTransformer(np.log1p, inverse_func=np.expm1),
}
pipe_trans = Pipeline([("trans", transformers["yj"])])
# Only rows with at most 3 inquiries are transformed. The result keeps the
# subset's index and is paired with the is_default flags of exactly those rows;
# a fresh 0..n-1 index would be aligned against unrelated rows of df.
inq_last_6mths_subset = df.loc[df["inq_last_6mths"] <= 3, ["inq_last_6mths"]]
inq_last_6mths_trans = pd.DataFrame(
    pipe_trans.fit_transform(inq_last_6mths_subset),
    index=inq_last_6mths_subset.index,
    columns=["inq_last_6mths"],
)
plot_distribution(
    pd.concat(
        [inq_last_6mths_trans, df.loc[inq_last_6mths_subset.index, "is_default"]],
        axis=1,
    ),
    "inq_last_6mths",
)

**Notes**
- Box-Cox cannot be used
- none of the transformers seem effective
- **all transformers should be compared OR (preferred) leave column un-transformed**

### Open credit lines on file for borrower

In [None]:
plot_distribution(df, "open_acc")

In [None]:
df.groupby("is_default")["open_acc"].describe()

In [None]:
transformers = {
    "bc": PowerTransformer("box-cox"),
    "yj": PowerTransformer("yeo-johnson"),
    "qn": QuantileTransformer(output_distribution="normal", n_quantiles=len(df) - 1),
    "qu": QuantileTransformer(output_distribution="uniform", n_quantiles=len(df) - 1),
    "l": FunctionTransformer(np.log1p, inverse_func=np.expm1),
}
pipe_trans = Pipeline([("trans", transformers["qn"])])
# Re-use df's index for the transformed values; pd.concat(axis=1) aligns on
# index labels, so a fresh 0..n-1 index would mismatch values and flags now
# that df has been row-filtered.
open_acc_trans = pd.DataFrame(
    pipe_trans.fit_transform(df[["open_acc"]]),
    index=df.index,
    columns=["open_acc"],
)
plot_distribution(
    pd.concat([open_acc_trans, df["is_default"]], axis=1),
    "open_acc",
)

**Notes**
- with strictly positive values, any transformer can be used
- even without filtering outliers, all transformers seem effective
- **all transformers should be compared**

### Revolving Balance

In [None]:
plot_distribution(df, "revol_bal")

In [None]:
plot_distribution(df[df["revol_bal"] <= 30000], "revol_bal")

In [None]:
df.groupby("is_default")["revol_bal"].describe()

In [None]:
transformers = {
    "bc": PowerTransformer("box-cox"),
    "yj": PowerTransformer("yeo-johnson"),
    "qn": QuantileTransformer(output_distribution="normal", n_quantiles=len(df) - 1),
    "qu": QuantileTransformer(output_distribution="uniform", n_quantiles=len(df) - 1),
    "l": FunctionTransformer(np.log1p, inverse_func=np.expm1),
}
pipe_trans = Pipeline([("trans", transformers["yj"])])
# Re-use df's index for the transformed values; pd.concat(axis=1) aligns on
# index labels, so a fresh 0..n-1 index would mismatch values and flags now
# that df has been row-filtered.
revol_bal_trans = pd.DataFrame(
    pipe_trans.fit_transform(df[["revol_bal"]]),
    index=df.index,
    columns=["revol_bal"],
)
plot_distribution(
    pd.concat([revol_bal_trans, df["is_default"]], axis=1),
    "revol_bal",
)

**Notes**
- filtering out values greater than $30,000, from the un-transformed data in this column, eliminates outliers
  - more exhaustive analysis is needed to determine if this is acceptable
- for transforming the un-filtered data in this column
  - since the minimum value is zero, the Box-Cox transformation (requiring strictly positive values) [cannot be used](https://en.wikipedia.org/wiki/Power_transform#Box%E2%80%93Cox_transformation)
- **all transformers should be compared**

### Relative used credit (compared to available revolving credit)

In [None]:
plot_distribution(df, "revol_util")

In [None]:
df.groupby("is_default")["revol_util"].describe()

In [None]:
transformers = {
    "bc": PowerTransformer("box-cox"),
    "yj": PowerTransformer("yeo-johnson"),
    "qn": QuantileTransformer(output_distribution="normal", n_quantiles=len(df) - 1),
    "qu": QuantileTransformer(output_distribution="uniform", n_quantiles=len(df) - 1),
    "l": FunctionTransformer(np.log1p, inverse_func=np.expm1),
}
pipe_trans = Pipeline([("trans", transformers["qn"])])
# Re-use df's index for the transformed values; pd.concat(axis=1) aligns on
# index labels, so a fresh 0..n-1 index would mismatch values and flags now
# that df has been row-filtered.
revol_util_trans = pd.DataFrame(
    pipe_trans.fit_transform(df[["revol_util"]]),
    index=df.index,
    columns=["revol_util"],
)
plot_distribution(
    pd.concat([revol_util_trans, df["is_default"]], axis=1),
    "revol_util",
)

**Notes**
- filtering out values greater than $30,000, from the un-transformed data in this column, eliminates outliers
  - more exhaustive analysis is needed to determine if this is acceptable
- for transforming the un-filtered data in this column
  - Box-Cox transformation cannot be used due to zero values
  - log and Yeo-Johnson are not as effective, possibly due to the large number of values between 0 and 10
    - should preference be given to power transformations?
- **all transformers should be compared (preference to quantile)**

### `pub_rec`

In [None]:
plot_distribution(df, "pub_rec")

In [None]:
plot_is_default(df, "pub_rec", "is_default", 1, False)

In [None]:
df.groupby("is_default")["pub_rec"].describe()

In [None]:
transformers = {
    "bc": PowerTransformer("box-cox"),
    "yj": PowerTransformer("yeo-johnson"),
    "qn": QuantileTransformer(output_distribution="normal", n_quantiles=len(df) - 1),
    "qu": QuantileTransformer(output_distribution="uniform", n_quantiles=len(df) - 1),
    "l": FunctionTransformer(np.log1p, inverse_func=np.expm1),
}
pipe_trans = Pipeline([("trans", transformers["l"])])
# Re-use df's index for the transformed values; pd.concat(axis=1) aligns on
# index labels, so a fresh 0..n-1 index would mismatch values and flags now
# that df has been row-filtered.
pub_rec_trans = pd.DataFrame(
    pipe_trans.fit_transform(df[["pub_rec"]]),
    index=df.index,
    columns=["pub_rec"],
)
plot_distribution(
    pd.concat([pub_rec_trans, df["is_default"]], axis=1),
    "pub_rec",
)

**Notes**
- **leave un-transformed**

### Total number of credit lines on file

In [None]:
plot_distribution(df, "total_acc")

### FICO range (low)

In [None]:
plot_distribution(df, "fico_range_low")

In [None]:
df.groupby("is_default")["fico_range_low"].describe()

In [None]:
transformers = {
    "bc": PowerTransformer("box-cox"),
    "yj": PowerTransformer("yeo-johnson"),
    "qn": QuantileTransformer(output_distribution="normal", n_quantiles=len(df) - 1),
    "qu": QuantileTransformer(output_distribution="uniform", n_quantiles=len(df) - 1),
    "l": FunctionTransformer(np.log1p, inverse_func=np.expm1),
}
pipe_trans = Pipeline([("trans", transformers["bc"])])
# Re-use df's index for the transformed values; pd.concat(axis=1) aligns on
# index labels, so a fresh 0..n-1 index would mismatch values and flags now
# that df has been row-filtered.
fico_range_low_trans = pd.DataFrame(
    pipe_trans.fit_transform(df[["fico_range_low"]]),
    index=df.index,
    columns=["fico_range_low"],
)
plot_distribution(
    pd.concat([fico_range_low_trans, df["is_default"]], axis=1),
    "fico_range_low",
)

**Notes**
- **all transformers (except Yeo-Johnson, which produced division by zero), incl. `log`, should be compared**

### FICO range (high)

In [None]:
plot_distribution(df, "fico_range_high")

In [None]:
df.groupby("is_default")["fico_range_high"].describe()

In [None]:
transformers = {
    "bc": PowerTransformer("box-cox"),
    "yj": PowerTransformer("yeo-johnson"),
    "qn": QuantileTransformer(output_distribution="normal", n_quantiles=len(df) - 1),
    "qu": QuantileTransformer(output_distribution="uniform", n_quantiles=len(df) - 1),
    "l": FunctionTransformer(np.log1p, inverse_func=np.expm1),
}
pipe_trans = Pipeline([("trans", transformers["bc"])])
# Re-use df's index for the transformed values; pd.concat(axis=1) aligns on
# index labels, so a fresh 0..n-1 index would mismatch values and flags now
# that df has been row-filtered.
fico_range_high_trans = pd.DataFrame(
    pipe_trans.fit_transform(df[["fico_range_high"]]),
    index=df.index,
    columns=["fico_range_high"],
)
plot_distribution(
    pd.concat([fico_range_high_trans, df["is_default"]], axis=1),
    "fico_range_high",
)

**Notes**
- **(same as for `fico_range_low`) all transformers (except Yeo-Johnson, which produced division by zero), incl. `log`, should be compared**
  - this feature is likely correlated to `fico_range_low`, so only one of the two should be kept

### Loan amount compared to Annual income

In [None]:
with sns.axes_style("whitegrid"):
    g = sns.jointplot(
        x="annual_inc",
        y="loan_amnt",
        hue="is_default",
        data=df.loc[df["annual_inc"] < 200000],
        height=10,
        ratio=2,
        space=0,
    )

### Loan amount by Loan Verification status

In [None]:
dfp = pd.concat(
    [
        reverse_get_dummies(df, "verification_status").rename("verification_status"),
        df[["loan_amnt", "is_default"]],
    ],
    axis=1,
)
fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(
    data=dfp, x="verification_status", y="loan_amnt", hue="is_default", ax=ax, zorder=3
)
ax.grid(which="both", axis="both", color="lightgrey", zorder=0)
ax.legend(loc="best", ncol=2, frameon=False)
ax.set_xlabel(None)
ax.set_ylabel(None)
ax.set_title(
    "Loan Amount versus Status of Loan Verification", loc="left", fontweight="bold"
)
_ = customize_splines(ax)

### Interest Rate by Term

In [None]:
fig, ax = plt.subplots(figsize=(6, 4))
(
    pd.concat([reverse_get_dummies(df, "term").rename("term"), df["int_rate"]], axis=1)
).boxplot(column="int_rate", by="term", ax=ax)
ax.set_xlabel(None)
fig.suptitle(None)
_ = customize_splines(ax)

### Loan amount by Term of loan

In [None]:
# Loan amount per loan term, split by default status
fig, ax = plt.subplots(figsize=(8, 4))
dfp = pd.concat(
    objs=[
        reverse_get_dummies(df, "term").rename("term"),
        df[["loan_amnt", "is_default"]],
    ],
    axis="columns",
)
sns.barplot(x="term", y="loan_amnt", hue="is_default", data=dfp, zorder=3, ax=ax)
ax.set_title("Loan Amount versus Loan Term", fontweight="bold", loc="left")
# axis labels are cleared since the title already names both quantities
ax.set_xlabel(None)
ax.set_ylabel(None)
# bars use zorder=3 so that they are drawn on top of the grid (zorder=0)
ax.grid(color="lightgrey", which="both", axis="both", zorder=0)
ax.legend(frameon=False, ncol=2, loc="best")
_ = customize_splines(ax)

### Monthly payments on debt obligations (excl. this loan) divided by monthly income by Open credit lines on file for borrower

In [None]:
# Open credit lines per dti bin, split by default status
# (dti_bin is created in an earlier cell of the notebook)
fig, ax = plt.subplots(figsize=(8, 4))
sns.barplot(
    x="dti_bin",
    y="open_acc",
    hue="is_default",
    data=pd.concat(objs=[df[["is_default", "open_acc"]], dti_bin], axis="columns"),
    zorder=3,
    ax=ax,
)
ax.set_title(
    "Open credit lines versus Monthly payments/monthly income",
    fontweight="bold",
    loc="left",
)
# axis labels are cleared since the title already names both quantities
ax.set_xlabel(None)
ax.set_ylabel(None)
# bars use zorder=3 so that they are drawn on top of the grid (zorder=0)
ax.grid(color="lightgrey", which="both", axis="both", zorder=0)
ax.legend(frameon=False, ncol=2, loc="best")
_ = customize_splines(ax)

### Interest rate by Delinquency rate

In [None]:
# Interest rate for each value of delinq_2yrs, split by default status
fig, ax = plt.subplots(figsize=(8, 4))
sns.barplot(x="delinq_2yrs", y="int_rate", hue="is_default", data=df, zorder=3, ax=ax)
ax.set_title("Interest rate by Delinquency rate", fontweight="bold", loc="left")
# axis labels are cleared since the title already names both quantities
ax.set_xlabel(None)
ax.set_ylabel(None)
# bars use zorder=3 so that they are drawn on top of the grid (zorder=0)
ax.grid(color="lightgrey", which="both", axis="both", zorder=0)
ax.legend(frameon=False, ncol=2, loc="best")
_ = customize_splines(ax)

### Interest Rate versus Loan Amount

In [None]:
# Joint distribution of interest rate against loan amount, coloured by default
# status. Only loans with loan_amnt below 30000 are plotted
with sns.axes_style("whitegrid"):
    g = sns.jointplot(
        data=df.loc[df["loan_amnt"].lt(30000)],
        x="loan_amnt",
        y="int_rate",
        hue="is_default",
        space=0,
        ratio=2,
        height=10,
    )

### Interest Rate versus Loan Amount for Zero and Non-zero public derogatory records

For zero public derogatory records

In [None]:
# Same joint plot as above, restricted to borrowers with pub_rec equal to 0
# (and loan_amnt below 30000)
with sns.axes_style("whitegrid"):
    g = sns.jointplot(
        data=df.loc[df["pub_rec"].eq(0) & df["loan_amnt"].lt(30000)],
        x="loan_amnt",
        y="int_rate",
        hue="is_default",
        space=0,
        ratio=2,
        height=10,
    )

For non-zero derogatory records

In [None]:
# Same joint plot as above, restricted to borrowers with pub_rec greater than 0
# (and loan_amnt below 30000)
with sns.axes_style("whitegrid"):
    g = sns.jointplot(
        data=df.loc[df["pub_rec"].gt(0) & df["loan_amnt"].lt(30000)],
        x="loan_amnt",
        y="int_rate",
        hue="is_default",
        space=0,
        ratio=2,
        height=10,
    )

### Feature correlations between numerical columns

In [None]:
# Show the names of all columns currently in df
df.columns.tolist()

In [None]:
# Candidate columns to consider adding as features (per the variable name).
# Commented-out entries carry the reason for leaving them out
cols_to_consider_adding = [
    "grade",
    # "sub_grade",  # redundant if grade is also included
    # "issue_d",  # date loan was funded; leaks data from future
    "addr_state",  # will give 50 new dummy features
    # "recoveries",  # recovery after charging off; leaks data from future
    "acc_now_delinq",  # no. of accounts on which borrower is delinquent
]

In [None]:
# Numerical columns used for the correlation heatmap below
numerical_cols = [
    "loan_amnt",
    "int_rate",
    "emp_length",
    "annual_inc",
    "dti",
    "delinq_2yrs",
    "inq_last_6mths",
    "open_acc",
    "pub_rec",
    "revol_bal",
    "revol_util",
    "fico_range_low",  # dropped due to correlation (see correlated_features)
    "fico_range_high",  # dropped due to correlation (see correlated_features)
    "installment",  # dropped due to correlation
    "total_acc",  # dropped due to correlation
]
# Categorical columns, with a note on whether each appears to be ordinal
categorical_cols = [
    # does not seem to be ordinal
    "purpose",
    # # leaks data from future - DROP THIS
    # "last_fico_range",
    # rent, own, mortgage or other does not seem to be ordinal
    "home_ownership",
    # verified, not verified, or income source verified does not seem to be ordinal
    "verification_status",
    # consider label encoding (maybe an ordinal relationship)
    "term",
]

Some labels to consider retaining are shown below
- the objective here is to predict whether future loanees will default (label `1`) or not (label `0`)
- this requires a model to have knowledge of past (**completed**) loans (ones that have been either fully paid back (label `0`) or defaulted (label `1`))
  - a model cannot be trained on loans which are **not completed**, since these labels (`0` for a repaid loan, or `1` for a defaulted loan) are **not** known a priori, i.e. they are only known at some time in the future (after the loan application has been accepted or rejected)

In [None]:
# Raw loan_status values, grouped by whether they could be used as labels.
# Each entry must be followed by a comma: adjacent string literals are
# concatenated by Python, which would silently merge two statuses into one
labels_to_consider_including_for_completed_loans = [
    # following are completed loans (i.e. they are inactive)
    "Fully Paid",  # is_default=0, loan_status=1 (currently included)
    "Charged Off",  # is_default=1, loan_status=0 (currently included)
    "Default",  # is_default=1
    "Late (16-30 days)",  # is_default=1
    "Late (31-120 days)",  # is_default=1
    # following are not completed loans (i.e. they are still active)
    "Current",  # cannot use
    "Issued",  # cannot use
    "Does not meet the credit policy. Status:Fully Paid",  # cannot use
    "In Grace Period",  # cannot use
]

In [None]:
# df_all_cols_reversed_dummies = pd.concat(
#     [
#         pd.concat(
#             [reverse_get_dummies(df, col).rename(col) for col in categorical_cols],
#             axis=1,
#         ),
#         df[numerical_cols],
#     ],
#     axis=1,
# )

In [None]:
# Heatmap of pairwise correlations between the numerical columns.
# plot_lower_corr_heatmap is defined elsewhere in the notebook; the meaning of
# the positional arguments after the title is not visible here - presumably
# styling options (number format ".2f", figure size (10, 10)) - TODO confirm
plot_lower_corr_heatmap(
    df[numerical_cols].corr(),
    "Correlation between numerical columns",
    1,
    ".2f",
    0.975,
    (10, 10),
)

In [None]:
# Rank pairs of columns by absolute correlation. Only the strict upper
# triangle of the correlation matrix is kept (k=1 excludes the diagonal), so
# that each pair appears once and self-correlations are ignored.
# The builtin bool is used for the mask, since the np.bool alias was
# deprecated in NumPy 1.20 and removed in NumPy 1.24
c = df.corr().abs()
sol = (
    c.where(np.triu(np.ones(c.shape, dtype=bool), k=1))
    .stack()
    .sort_values(ascending=False)
)
display(sol.reset_index().head(10))

In [None]:
# Correlations between the columns involved in the strongest pairs found above
df.loc[
    :,
    [
        "installment",
        "loan_amnt",
        "open_acc",
        "total_acc",
        "fico_range_low",
        "fico_range_high",
    ],
].corr()

In [None]:
# Columns removed because they are strongly correlated with other columns
correlated_features = ["total_acc", "installment", "fico_range_low", "fico_range_high"]
# Columns removed because they are look-ahead features
look_ahead_features = ["last_fico_range_low", "last_fico_range_high"]
# df is modified in place
df.drop(columns=correlated_features + look_ahead_features, inplace=True)

## Drop `loan_status` column

In [None]:
# Name of the original labels column, which is dropped below
raw_labels = ["loan_status"]
# Name of the labels column that is kept
new_labels = ["is_default"]

In [None]:
# Remove the original labels column from df in place
df.drop(columns=raw_labels, inplace=True)

In [None]:
class DFColumnDropper(TransformerMixin):
    """Transformer that removes a fixed set of columns from a DataFrame.

    Note: this definition shadows the ``DFColumnDropper`` imported from
    ``src.custom_transformers`` at the top of the notebook.
    """

    def __init__(self, columns):
        # column label(s) that transform() removes
        self.columns = columns

    def fit(self, X, y=None):
        """Nothing is learned from X or y; returns this instance."""
        return self

    def transform(self, X):
        """Return X (assumed to be a DataFrame) without ``self.columns``."""
        return X.drop(columns=self.columns)

    def fit_transform(self, X, y=None, **kwargs):
        """Fit on X and y, then transform X; extra keyword arguments are unused."""
        return self.fit(X, y).transform(X)

In [None]:
# pipe = Pipeline(
#     [
#         ("label", DFColumnDropper(raw_labels)),
#     ]
# )
# df = pipe.fit_transform(df)

## Pipeline-based approach to add and remove columns from cleaned data

In [None]:
# Load the raw loans data, from the local CSV file unless cloud storage was
# requested. In both cases the first line of the file is skipped
if cloud_storage != "yes":
    loans_2007 = pd.read_csv(raw_data_path, skiprows=1, low_memory=False)
else:
    az_storage_container_name = "myconedesx7"
    # account name, key and endpoint suffix come from environment variables
    conn_str = (
        "DefaultEndpointsProtocol=https;"
        f"AccountName={os.getenv('AZURE_STORAGE_ACCOUNT')};"
        f"AccountKey={os.getenv('AZURE_STORAGE_KEY')};"
        f"EndpointSuffix={os.getenv('ENDPOINT_SUFFIX')}"
    )
    blob_service_client = BlobServiceClient.from_connection_string(conn_str=conn_str)

    blobstrings = {}
    # only the text of the last blob in the list is kept (there is just one)
    for blob_name in ["blobedesz38"]:
        blob_client = blob_service_client.get_blob_client(
            blob=blob_name, container=az_storage_container_name
        )
        blobstring = blob_client.download_blob().content_as_text()
    loans_2007 = pd.read_csv(StringIO(blobstring), skiprows=1, low_memory=False)

In [None]:
# Keep the training portion of a fixed-seed split (the 33% test portion is
# discarded into `_`) and renumber the index from 0.
# NOTE(review): loans_2007 is rebound here, so re-running this cell without
# re-running the load cell above subsamples the data a second time
loans_2007, _ = train_test_split(loans_2007, test_size=0.33, random_state=4321)
loans_2007 = loans_2007.reset_index(drop=True)
# pipe_part_1_and_2 is defined elsewhere in the notebook
df_reloaded = pipe_part_1_and_2.fit_transform(loans_2007)

First, we'll use a pipeline to
- create the new column for labels and change its dtype to integers
- drop correlated columns
- drop the old labels column

In [None]:
# Pipeline equivalent of the manual steps applied to df earlier in the notebook.
# NOTE(review): DFSingleColumnMapper, DFSimpleDtypeChanger and DFColumnStdFilter
# are not among the names imported from src.custom_transformers at the top of
# the notebook, and the User Inputs cell defines `mapping_dictionary` rather
# than `mapping_dict` - confirm that all four are defined in an earlier cell
pipe = Pipeline(
    [
        ("singlecolmap", DFSingleColumnMapper("loan_status", mapping_dict)),
        ("dtype", DFSimpleDtypeChanger(new_labels, "int")),
        ("stdfilter", DFColumnStdFilter("annual_inc", 3)),
        ("corr", DFColumnDropper(correlated_features)),
        ("lookahead", DFColumnDropper(look_ahead_features)),
        ("label", DFColumnDropper(raw_labels)),
    ]
)
df_pipe = pipe.fit_transform(df_reloaded)

In [None]:
# The pipeline output must be identical to the manually processed df
assert df_pipe.equals(df)

We'll then use a pipeline to apply a different transformation to each numerical column in the dataset
- this could also have been applied to all columns at once, or in groups of columns (if suitable groupings can be found such that all columns in each group are put through the same transformation)

In [None]:
# Columns that are put through a Yeo-Johnson power transformation
yeo_johnson_cols = [
    "loan_amnt",
    "int_rate",
    # no transform for emp_length?
    "emp_length",
    # NOTE(review): the original comment here said box-cox was used for
    # annual_inc (divide by 0 for yeo-johnson), but yeo-johnson is what this
    # cell applies - confirm which of the two is intended
    "annual_inc",
    "dti",
    # no transform for delinq_2yrs?
    "delinq_2yrs",
    # no transform for inq_last_6mths?
    "inq_last_6mths",
    "open_acc",
    "revol_bal",
]
# One single-step pipeline per column (a separate transformer instance each)
col_transformers = {
    col: Pipeline(steps=[("trans", PowerTransformer(method="yeo-johnson"))])
    for col in yeo_johnson_cols
}
# quantile transformation maybe preferred for revol_util?
col_transformers["revol_util"] = Pipeline(
    steps=[
        (
            "trans",
            QuantileTransformer(n_quantiles=len(df_pipe), output_distribution="normal"),
        )
    ]
)
# need passthrough to retain the following columns: ["pub_rec"]
preprocessor = ColumnTransformer(
    transformers=[(k, v, [k]) for k, v in col_transformers.items()],
    remainder="passthrough",
)
pipe_trans = Pipeline([("preprocessor", preprocessor)])


def fit_transform_to_frame(frame):
    """Run `pipe_trans.fit_transform` on `frame` and return a labelled DataFrame.

    ColumnTransformer places the transformed columns first (in the order of
    its `transformers` list) and appends the passthrough columns on the right,
    so the output array is not in the column order of `frame`. The array is
    labelled using the order it was produced in, and the columns are then put
    back into the order of `frame`.

    Parameters
    ----------
    frame : pd.DataFrame
        data containing every column named in `col_transformers`

    Returns
    -------
    pd.DataFrame
        transformed data with the same column names and column order as
        `frame`, and a default integer index
    """
    transformed_cols = list(col_transformers)
    passthrough_cols = [c for c in frame.columns if c not in col_transformers]
    transformed = pd.DataFrame(
        pipe_trans.fit_transform(frame), columns=transformed_cols + passthrough_cols
    )
    return transformed[list(frame)]


df_trans = fit_transform_to_frame(df)
df_trans_pipe = fit_transform_to_frame(df_pipe)
display(df_trans.head(2))
display(df_trans_pipe.head(2))

Since nearly all transformations are power transformers, it may be more convenient to use this for the last column as well. Such an approach would be easier to maintain as the number of numeric features is varied during further analysis. This strategy will be used in subsequent steps of the analysis.

In [None]:
# Both routes (manual df and pipeline df_pipe) must give identical transformed
# frames; the two shapes are printed on separate lines
print(df_trans.shape, df_trans_pipe.shape, sep="\n")
assert df_trans.equals(df_trans_pipe)

## Check column data types

As a reminder, we should not have any non-numeric columns in the processed data that will be passed to modeling algorithms

In [None]:
# Names of any columns still stored with the object dtype
df_trans_pipe.select_dtypes(include="object").columns.tolist()

We should only have numerical columns, and that is indeed the case here

In [None]:
# Data type of every column, shown as a single-column DataFrame
df_trans_pipe.dtypes.to_frame()