# Feature Processing

In [None]:
# Load the lab_black and autoreload IPython extensions.
%load_ext lab_black
%load_ext autoreload
# Mode 2 asks autoreload to reload modules before code is run, so edits to
# local modules are picked up without restarting the kernel.
%autoreload 2

In [None]:
import os
from io import StringIO

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from azure.storage.blob import BlobServiceClient
from sklearn.base import TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split

In [None]:
%aimport src.custom_transformers
from src.custom_transformers import (
    DFNanThresholdColumnDropper,
    DFColumnDropper,
    DFColumnFilterList,
    DFColumnMapper,
    DFNonUniqueValColDropper,
    DFDropNaN,
    DFOneHotEncoder,
    DFPctNumeric,
)

In [None]:
# Raise pandas' display limits for rows, columns and overall output width.
for _option_name, _option_value in (
    ("display.max_rows", 500),
    ("display.max_columns", 500),
    ("display.width", 1000),
):
    pd.set_option(_option_name, _option_value)

## User Inputs

Inputs are defined below

In [None]:
# Path of the raw loans CSV that is read when `cloud_storage` is not "yes"
raw_data_path = "data/raw/lending_club_loans.csv"
# Set to "yes" to download the data from Azure Blob Storage instead
cloud_storage = "no"

## Load Data

In [None]:
# Read the raw loans CSV into `loans_2007`, either from Azure Blob Storage
# (when `cloud_storage` is "yes") or from the local file at `raw_data_path`.
# In both cases the first line of the file is skipped (skiprows=1).
if cloud_storage == "yes":
    az_storage_container_name = "myconedesx7"
    blob_name = "blobedesz38"
    # Account name, key and endpoint suffix are taken from environment
    # variables so that no credential is stored in the notebook.
    conn_str = (
        "DefaultEndpointsProtocol=https;"
        f"AccountName={os.getenv('AZURE_STORAGE_ACCOUNT')};"
        f"AccountKey={os.getenv('AZURE_STORAGE_KEY')};"
        f"EndpointSuffix={os.getenv('ENDPOINT_SUFFIX')}"
    )
    blob_service_client = BlobServiceClient.from_connection_string(conn_str=conn_str)

    # A single blob is needed, so it is fetched directly rather than through
    # a one-item loop whose result was only reachable via the loop variable.
    blob_client = blob_service_client.get_blob_client(
        container=az_storage_container_name, blob=blob_name
    )
    blobstring = blob_client.download_blob().content_as_text()
    loans_2007 = pd.read_csv(StringIO(blobstring), skiprows=1, low_memory=False)
else:
    loans_2007 = pd.read_csv(raw_data_path, skiprows=1, low_memory=False)

In [None]:
# Set aside 33% as test data
# Only the training portion is kept (rebound to `loans_2007`); the held-out
# portion is assigned to `_` and is not used afterwards.
loans_2007, _ = train_test_split(loans_2007, test_size=0.33, random_state=4321)
# Replace the index with a fresh one, discarding the old row labels
loans_2007 = loans_2007.reset_index(drop=True)

The feature transformation pipeline from the first notebook will be applied here to transform the training split

In [None]:
# ---- Settings consumed by the part-1 pipeline steps ----
nan_threshold = 0.5
non_useful_cols = ["url", "desc"]
datetime_cols = ["issue_d", "last_pymnt_d"]

# Three groups of columns that the pipeline drops
cols_one_eighteen = [
    "id",
    "member_id",
    "funded_amnt",
    "funded_amnt_inv",
    "grade",
    "sub_grade",
    "emp_title",
]
cols_eighteen_thirtysix = [
    "zip_code",
    "out_prncp",
    "out_prncp_inv",
    "total_pymnt",
    "total_pymnt_inv",
    "total_rec_prncp",
]
cols_thirtyseven_end = [
    "total_rec_int",
    "total_rec_late_fee",
    "recoveries",
    "collection_recovery_fee",
    "last_pymnt_amnt",
]

# Target handling: the two loan_status values of interest and their 1/0 codes
loan_status = ["Fully Paid", "Charged Off"]
mapping_dictionary = {"loan_status": {"Fully Paid": 1, "Charged Off": 0}}
four_or_less_value_columns = ["pymnt_plan"]

# ---- Pipeline assembly: (step name, transformer) pairs in execution order ----
part1_steps = [
    ("nan", DFNanThresholdColumnDropper(nan_threshold)),
    ("nouse", DFColumnDropper(non_useful_cols)),
    ("dtime", DFColumnDropper(datetime_cols)),
    ("c1", DFColumnDropper(cols_one_eighteen)),
    ("c2", DFColumnDropper(cols_eighteen_thirtysix)),
    ("c3", DFColumnDropper(cols_thirtyseven_end)),
    ("mapstatus", DFColumnFilterList("loan_status", loan_status)),
    ("colmap", DFColumnMapper(mapping_dictionary)),
    ("onevals", DFNonUniqueValColDropper(1)),
    ("fourvals", DFColumnDropper(four_or_less_value_columns)),
]
pipe_part1 = Pipeline(part1_steps)

In [None]:
# Fit pipe_part1 on the training split and keep its transformed output
filtered_loans = pipe_part1.fit_transform(loans_2007)
print(filtered_loans.shape)
filtered_loans.head()

The transformed training data will now be used for further data processing.

## Drop Missing Values

We'll show a count of missing data by column

In [None]:
# Number of missing values in each column, largest count first
missing_per_column = filtered_loans.isnull().sum()
null_counts = missing_per_column.to_frame().sort_values(by=[0], ascending=False)
null_counts

We'll show the **fraction** of rows taken by each unique value (missing values included) for the first four columns above, since these are reported to contain the largest number of missing values. The fraction (a number between 0 and 1) allows us to see the share of rows that are missing in each of these columns

In [None]:
# For each inspected column, show the share of rows per value (NaN included)
for col in ["pub_rec_bankruptcies", "emp_length", "revol_util", "title"]:
    value_fractions = filtered_loans[col].value_counts(normalize=True, dropna=False)
    display(value_fractions.to_frame())

`pub_rec_bankruptcies` has both of the following
- missing in more than 1% of the data
- nearly 94% of its rows assigned to a single value

So we'll drop this column entirely.

In [None]:
# Columns that are dropped outright instead of having their missing rows removed
more_than_one_pct_missing_columns = ["pub_rec_bankruptcies"]

In [None]:
# Remove the mostly-uninformative column first, then every row that still
# contains a missing value.
filtered_loans = filtered_loans.drop(
    columns=more_than_one_pct_missing_columns
).dropna()

In [None]:
# pipe = Pipeline(
#     [
#         ("90pctnan", DFColumnDropper(more_than_one_pct_missing_columns)),
#         ("nan", DFDropNaN()),
#     ]
# )
# filtered_loans = pipe.fit_transform(filtered_loans)

In [None]:
# Shape and first rows after the missing-value handling above
print(filtered_loans.shape)
display(filtered_loans.head())

## Process Categorical Columns

We'll now explore the non-numeric columns in the data

In [None]:
# Number of columns per dtype
filtered_loans.dtypes.value_counts().to_frame()

In [None]:
# Keep only the object-dtype columns for closer inspection
object_columns_df = filtered_loans.select_dtypes(include=["object"])
display(object_columns_df.head())

In [None]:
# Print "<column>: <number of distinct values>" for every object column
for col in object_columns_df:
    distinct_count = object_columns_df[col].nunique()
    print(f"{col}: {distinct_count}")

### Investigate unique values

We'll first show the unique values in columns containing categorical values

In [None]:
# Frequency table of the values in each object column
for name in object_columns_df.columns:
    frequencies = object_columns_df[name].value_counts()
    display(frequencies.to_frame())

**Observations about `object` dtype features**
- `addr_state` has many unique values and will create nearly 50 one-hot encoded variables (one per state in the US)
  - for now, we'll drop it and add it back if required
- `purpose` and `title` have overlapping information but values in `title` are repeated so we'll drop it
- `home_ownership`, `verification_status`, `emp_length`, and `term` contain a small number of discrete categorical values, so we'll keep them; all of these except `emp_length` will be one-hot encoded
  - `emp_length` will be treated as a numerical column since its unique values have a natural ordering, i.e. 8 > 2 and 2 > 1.
  - it seems like the duration of employment `emp_length` and whether the borrower owns a home `home_ownership` should be important in predicting the level of risk associated with approving a loan to that borrower, so these could be useful to a model looking to make such a prediction. This is further reason to keep these columns.
- `datetime` attribute columns `earliest_cr_line` (month-year in which borrower opened their earliest reported credit line) and `last_credit_pull_d` (month-year in which Lending Club pulled credit for corresponding loan) will be dropped due to lookahead bias
- `int_rate` and `revol_util` are actually numeric features but contain a `%` sign which makes them appear as `object` dtype so we'll strip out the `%` in order to convert them to a numerical datatype

### Drop `datetime` and high cardinality columns

We'll drop the `datetime`-dtype and high cardinality columns (`addr_state` US state in which the loan borrower resides), as identified above

In [None]:
# Note: this rebinds `datetime_cols`, which earlier held the date columns
# dropped by pipe_part1.
datetime_cols = ["last_credit_pull_d", "earliest_cr_line"]
# Column dropped because of its large number of distinct values
high_cardinality_cols = ["addr_state"]

In [None]:
# pipe = Pipeline(
#     [
#         ("hcardcols", DFColumnDropper(high_cardinality_cols)),
#         ("dtime", DFColumnDropper(datetime_cols)),
#     ]
# )
# filtered_loans = pipe.fit_transform(filtered_loans)

In [None]:
# Drop the date columns and the high-cardinality column in one call
filtered_loans = filtered_loans.drop(columns=datetime_cols + high_cardinality_cols)

In [None]:
# Shape and first rows after dropping the date and high-cardinality columns
print(filtered_loans.shape)
display(filtered_loans.head())

### Clean by removing text from numerical columns

We'll map the employment length column from text describing the duration of employment to numerical values. We'll assume the following here
- fewer than one year of employment will be considered `0`
- ten years of employment or greater will be considered `10`
  - the actual number, greater than `10`, is not provided in the data so we'll take this as `10`, which could mean 14 years of employment gets converted to `10` for modeling purposes

Note that another strategy to process this column is to bucket the unique values based on some discrete window, e.g. for a window of 3, we could group 1, 2 and 3 years of employment into the same group, 4, 5 and 6 years into the same group, and so on. However, the exact choice of this window is critical because it could render this feature useless in terms of its predictive power over the status of the loan. With domain knowledge, we could make a more appropriate choice for such a window length if deemed appropriate. For now, we'll keep the one-to-one mapping between the text version of the employment duration and its numerical representation.

In [None]:
# Text-to-number lookup for `emp_length`: "< 1 year" and "n/a" both map to 0
# and "10+ years" maps to 10. The regular "N years" labels (9 down to 2) are
# generated rather than listed one by one.
mapping_dict = {
    "emp_length": {
        "10+ years": 10,
        **{f"{years} years": years for years in range(9, 1, -1)},
        "1 year": 1,
        "< 1 year": 0,
        "n/a": 0,
    }
}

In [None]:
# Apply the nested {column: {old value: new value}} mapping defined above
filtered_loans = filtered_loans.replace(mapping_dict)

In [None]:
# pipe = Pipeline(
#     [
#         ("texttonum", DFColumnMapper(mapping_dict)),
#     ]
# )
# filtered_loans = pipe.fit_transform(filtered_loans)

In [None]:
# Shape and first rows after mapping `emp_length` to numbers
print(filtered_loans.shape)
display(filtered_loans.head())

### One-Hot Encoding

Next, we'll one-hot encode the variables identified earlier for this type of processing

In [None]:
# Categorical columns that are one-hot encoded below
nominal_columns = ["home_ownership", "verification_status", "purpose", "term"]

In [None]:
# pipe = Pipeline(
#     [
#         ("onehot", DFOneHotEncoder(nominal_columns)),
#     ]
# )
# filtered_loans = pipe.fit_transform(filtered_loans)

In [None]:
# Build the indicator columns, then swap them in for the original nominal
# columns (remaining columns first, indicator columns appended at the end).
dummy_df = pd.get_dummies(filtered_loans[nominal_columns])
filtered_loans = pd.concat(
    [filtered_loans.drop(columns=nominal_columns), dummy_df], axis=1
)

In [None]:
# Shape and first rows after one-hot encoding
print(filtered_loans.shape)
display(filtered_loans.head())

### Drop columns with overlapping information

We'll drop the `title` (loan title given by the borrower) column and keep the `purpose` column (which contains the same information), as was indicated above

In [None]:
# Column whose information overlaps with `purpose` and is therefore dropped
repeated_data_cols = ["title"]

In [None]:
# pipe = Pipeline(
#     [
#         ("repeats", DFColumnDropper(repeated_data_cols)),
#     ]
# )
# filtered_loans = pipe.fit_transform(filtered_loans)

In [None]:
# Remove the column(s) that duplicate information held elsewhere
filtered_loans = filtered_loans.drop(columns=repeated_data_cols)

### Cleaning special characters

Finally, we'll remove the percentage sign from two of the `object` columns that should be treated as numeric columns

In [None]:
# Columns whose trailing "%" is stripped below before conversion to float
pct_to_numeric_cols = ["int_rate", "revol_util"]

In [None]:
# pipe = Pipeline(
#     [
#         ("pctcols", DFPctNumeric(pct_to_numeric_cols, "%")),
#     ]
# )
# filtered_loans = pipe.fit_transform(filtered_loans)

In [None]:
# Turn percentage text into floats by removing any trailing "%" characters
for col in pct_to_numeric_cols:
    as_text = filtered_loans[col].astype(str)
    filtered_loans[col] = as_text.str.rstrip("%").astype("float")

In [None]:
# Shape and first rows after converting the percentage columns
print(filtered_loans.shape)
display(filtered_loans.head())

After this step of processing, all columns in the data are now numeric

In [None]:
# dtype of every remaining column
filtered_loans.dtypes.to_frame()

## (Not Used) Processing columns containing a date

Columns containing a date have been dropped. Below, we'll first extract the year and month attribute from each of these columns, though these will not be added back to the data or used in any analysis

In [None]:
# Date columns that were removed earlier in this notebook. The commented-out
# pipeline cells refer to this list as `datetime_columns`.
datetime_columns = [
    "issue_d",  # dropped in phase 1 of processing
    "last_pymnt_d",  # dropped in phase 1 of processing
    "earliest_cr_line",  # dropped in phase 2 of processing
    "last_credit_pull_d",  # dropped in phase 2 of processing
]
# Backward-compatible alias for the original (misspelled) name
datetime_colmns = datetime_columns

In [None]:
# # Manual approach
# for col in datetime_colmns:
#     filtered_loans[col] = pd.to_datetime(filtered_loans[col], format='%b-%y')
#     filtered_loans[f'{col}_month'] = filtered_loans[col].dt.month
#     filtered_loans[f'{col}_year'] = filtered_loans[col].dt.year
#     filtered_loans = filtered_loans.drop(columns=datetime_colmns)
#     filtered_loans[f'{col}_month'] = pd.Categorical(filtered_loans[f"{col}_month"])
#     filtered_loans[f'{col}_year'] = pd.Categorical(filtered_loans[f"{col}_year"])

In [None]:
class DFDateTimeCols(TransformerMixin):
    """Replace month-year text columns with separate month and year columns.

    Every column named in ``cols`` is parsed with the ``"%b-%y"`` format
    (for example ``"Dec-11"``) and replaced by two new columns named
    ``<col>_month`` and ``<col>_year``.

    Parameters
    ----------
    cols : list of str
        Names of the DataFrame columns holding the date text.
    convert_to_categorical : bool, default False
        If True, the derived month and year columns are stored as
        ``pd.Categorical`` instead of plain integers.
    """

    def __init__(self, cols, convert_to_categorical=False):
        self.convert_to_categorical = convert_to_categorical
        self.cols = cols

    def fit(self, X, y=None):
        """Nothing is learned from the data; return the transformer itself."""
        return self

    def transform(self, X):
        """Return a new DataFrame with the date columns expanded.

        Parameters
        ----------
        X : pandas.DataFrame
            Must contain every column listed in ``cols``.

        Returns
        -------
        pandas.DataFrame
            A new frame without the original date columns and with the
            ``<col>_month`` / ``<col>_year`` columns appended. ``X`` itself
            is left unmodified.

        Raises
        ------
        KeyError
            If a column listed in ``cols`` is missing from ``X``.
        """
        derived = {}
        for col in self.cols:
            parsed = pd.to_datetime(X[col], format="%b-%y")
            month, year = parsed.dt.month, parsed.dt.year
            if self.convert_to_categorical:
                month, year = pd.Categorical(month), pd.Categorical(year)
            derived[f"{col}_month"] = month
            derived[f"{col}_year"] = year
        # The source columns are dropped once, after the loop. Dropping them
        # inside the loop removed columns that later iterations still needed.
        # `assign` returns a new frame, so the caller's data is not mutated.
        return X.assign(**derived).drop(columns=list(self.cols))

    def fit_transform(self, X, y=None, **kwargs):
        """Fit (a no-op) and transform ``X`` in one call; extra kwargs are ignored."""
        return self.fit(X, y).transform(X)

There are other approaches to treat the extracted year and month during modeling, but we could also one-hot encode each of these features as a first pass at preparing them for further analysis

In [None]:
# # Pipeline-based approach
# pipe = Pipeline(
#     [
#         ("dtime", DFDateTimeCols(datetime_columns, True)),
#         ("onehot", DFOneHotEncoder(datetime_columns)),
#     ]
# )
# filtered_loans = pipe.fit_transform(filtered_loans)

## Combine all processing steps in part 2

In [None]:
# Re-read the raw loans CSV into `loans_2007`, either from Azure Blob Storage
# (when `cloud_storage` is "yes") or from the local file at `raw_data_path`.
# In both cases the first line of the file is skipped (skiprows=1).
if cloud_storage == "yes":
    az_storage_container_name = "myconedesx7"
    blob_name = "blobedesz38"
    # Account name, key and endpoint suffix are taken from environment
    # variables so that no credential is stored in the notebook.
    conn_str = (
        "DefaultEndpointsProtocol=https;"
        f"AccountName={os.getenv('AZURE_STORAGE_ACCOUNT')};"
        f"AccountKey={os.getenv('AZURE_STORAGE_KEY')};"
        f"EndpointSuffix={os.getenv('ENDPOINT_SUFFIX')}"
    )
    blob_service_client = BlobServiceClient.from_connection_string(conn_str=conn_str)

    # A single blob is needed, so it is fetched directly rather than through
    # a one-item loop whose result was only reachable via the loop variable.
    blob_client = blob_service_client.get_blob_client(
        container=az_storage_container_name, blob=blob_name
    )
    blobstring = blob_client.download_blob().content_as_text()
    loans_2007 = pd.read_csv(StringIO(blobstring), skiprows=1, low_memory=False)
else:
    loans_2007 = pd.read_csv(raw_data_path, skiprows=1, low_memory=False)

In [None]:
# Set aside 33% as test data
# Same test_size and random_state as the split at the top of the notebook.
# Only the training portion is kept; the held-out portion goes to `_`.
loans_2007, _ = train_test_split(loans_2007, test_size=0.33, random_state=4321)
# Replace the index with a fresh one, discarding the old row labels
loans_2007 = loans_2007.reset_index(drop=True)

In [None]:
# Re-run the part-1 pipeline on the freshly loaded training split
filtered_loans_reloaded = pipe_part1.fit_transform(loans_2007)
print(filtered_loans_reloaded.shape)
display(filtered_loans_reloaded.head())

In [None]:
# Part-2 pipeline: (step name, transformer) pairs built from the lists and
# mappings defined in the cells above, in execution order.
part2_steps = [
    ("morethan1pctnan", DFColumnDropper(more_than_one_pct_missing_columns)),
    ("nan", DFDropNaN()),
    ("hcardcols", DFColumnDropper(high_cardinality_cols)),
    ("dtime", DFColumnDropper(datetime_cols)),
    ("texttonum", DFColumnMapper(mapping_dict)),
    ("onehot", DFOneHotEncoder(nominal_columns)),
    ("repeats", DFColumnDropper(repeated_data_cols)),
    ("pctcols", DFPctNumeric(pct_to_numeric_cols, "%")),
    # Optional date-processing steps (not used):
    # ("dtime", DFDateTimeCols(datetime_columns, True)),
    # ("onehot", DFOneHotEncoder(datetime_columns)),
]
pipe_part2 = Pipeline(part2_steps)

# Run it on the reloaded part-1 output and preview the result
filtered_loans_pipe = pipe_part2.fit_transform(filtered_loans_reloaded)
print(filtered_loans_pipe.shape)
display(filtered_loans_pipe.head())

## Combine all processing steps in parts 1 and 2

In [None]:
# ---- Part-1 settings ----
nan_threshold = 0.5
non_useful_cols = ["url", "desc"]
datetime_cols1 = ["issue_d", "last_pymnt_d"]
cols_one_eighteen = [
    "id",
    "member_id",
    "funded_amnt",
    "funded_amnt_inv",
    "grade",
    "sub_grade",
    "emp_title",
]
cols_eighteen_thirtysix = [
    "zip_code",
    "out_prncp",
    "out_prncp_inv",
    "total_pymnt",
    "total_pymnt_inv",
    "total_rec_prncp",
]
cols_thirtyseven_end = [
    "total_rec_int",
    "total_rec_late_fee",
    "recoveries",
    "collection_recovery_fee",
    "last_pymnt_amnt",
]
loan_status = ["Fully Paid", "Charged Off"]
mapping_dictionary = {"loan_status": {"Fully Paid": 1, "Charged Off": 0}}
four_or_less_value_columns = ["pymnt_plan"]

# ---- Part-2 settings ----
more_than_one_pct_missing_columns = ["pub_rec_bankruptcies"]
high_cardinality_cols = ["addr_state"]
datetime_cols2 = ["last_credit_pull_d", "earliest_cr_line"]
mapping_dict = {
    "emp_length": {
        "10+ years": 10,
        "9 years": 9,
        "8 years": 8,
        "7 years": 7,
        "6 years": 6,
        "5 years": 5,
        "4 years": 4,
        "3 years": 3,
        "2 years": 2,
        "1 year": 1,
        "< 1 year": 0,
        "n/a": 0,
    }
}
nominal_columns = ["home_ownership", "verification_status", "purpose", "term"]
repeated_data_cols = ["title"]
pct_to_numeric_cols = ["int_rate", "revol_util"]

# ---- Single pipeline: the part-1 steps followed by the part-2 steps ----
part_1_and_2_steps = [
    # part 1
    ("nan1", DFNanThresholdColumnDropper(nan_threshold)),
    ("nouse", DFColumnDropper(non_useful_cols)),
    ("dtime1", DFColumnDropper(datetime_cols1)),
    ("c1", DFColumnDropper(cols_one_eighteen)),
    ("c2", DFColumnDropper(cols_eighteen_thirtysix)),
    ("c3", DFColumnDropper(cols_thirtyseven_end)),
    ("mapstatus", DFColumnFilterList("loan_status", loan_status)),
    ("colmap", DFColumnMapper(mapping_dictionary)),
    ("onevals", DFNonUniqueValColDropper(1)),
    ("fourvals", DFColumnDropper(four_or_less_value_columns)),
    # part 2
    ("morethan1pctnan", DFColumnDropper(more_than_one_pct_missing_columns)),
    ("nan2", DFDropNaN()),
    ("hcardcols", DFColumnDropper(high_cardinality_cols)),
    ("dtime2", DFColumnDropper(datetime_cols2)),
    ("texttonum", DFColumnMapper(mapping_dict)),
    ("onehot", DFOneHotEncoder(nominal_columns)),
    ("repeats", DFColumnDropper(repeated_data_cols)),
    ("pctcols", DFPctNumeric(pct_to_numeric_cols, "%")),
]
pipe_part_1_and_2 = Pipeline(part_1_and_2_steps)

In [None]:
# Re-read the raw loans CSV into `loans_2007`, either from Azure Blob Storage
# (when `cloud_storage` is "yes") or from the local file at `raw_data_path`.
# In both cases the first line of the file is skipped (skiprows=1).
if cloud_storage == "yes":
    az_storage_container_name = "myconedesx7"
    blob_name = "blobedesz38"
    # Account name, key and endpoint suffix are taken from environment
    # variables so that no credential is stored in the notebook.
    conn_str = (
        "DefaultEndpointsProtocol=https;"
        f"AccountName={os.getenv('AZURE_STORAGE_ACCOUNT')};"
        f"AccountKey={os.getenv('AZURE_STORAGE_KEY')};"
        f"EndpointSuffix={os.getenv('ENDPOINT_SUFFIX')}"
    )
    blob_service_client = BlobServiceClient.from_connection_string(conn_str=conn_str)

    # A single blob is needed, so it is fetched directly rather than through
    # a one-item loop whose result was only reachable via the loop variable.
    blob_client = blob_service_client.get_blob_client(
        container=az_storage_container_name, blob=blob_name
    )
    blobstring = blob_client.download_blob().content_as_text()
    loans_2007 = pd.read_csv(StringIO(blobstring), skiprows=1, low_memory=False)
else:
    loans_2007 = pd.read_csv(raw_data_path, skiprows=1, low_memory=False)

In [None]:
# Same split as before (same test_size and random_state); keep only the
# training portion and give it a fresh index.
loans_2007, _ = train_test_split(loans_2007, test_size=0.33, random_state=4321)
loans_2007 = loans_2007.reset_index(drop=True)
# Run the combined part-1 + part-2 pipeline on the training split
filtered_loans_pipe_part_1_and_2 = pipe_part_1_and_2.fit_transform(loans_2007)

In [None]:
# Both pipeline outputs must be identical to the frame produced by the
# manual, cell-by-cell processing above.
for pipeline_output in (filtered_loans_pipe, filtered_loans_pipe_part_1_and_2):
    assert pipeline_output.equals(filtered_loans)