In [None]:
import pandas as pd
import numpy as np
import sklearn as sk
import matplotlib.pyplot as plt

%config InlineBackend.figure_format='retina'
plt.rcParams["figure.dpi"] = 70
import seaborn as sns

%load_ext autoreload
%autoreload 2

In [None]:
from utils import load_csv_compressed

# Load the CSV at data/accepted_2007_to_2018Q4.csv through the project's
# `load_csv_compressed` helper.
# NOTE(review): the helper's return type is not visible here; the following
# cells use `accepted` as a pandas DataFrame — confirm in utils.
accepted = load_csv_compressed("data/accepted_2007_to_2018Q4.csv")

In [None]:
# Binary target derived from `loan_status`: 0 for the "Fully Paid" statuses,
# 1 for the "Charged Off" / "Default" statuses. An explicit dict keeps every
# status next to its label, so the pairing cannot silently drift the way two
# parallel positional sequences can.
LOAN_STATUS_TO_LABEL = {
    "Fully Paid": 0,
    "Charged Off": 1,
    "Does not meet the credit policy. Status:Fully Paid": 0,
    "Does not meet the credit policy. Status:Charged Off": 1,
    "Default": 1,
}

# Statuses that are not in the mapping (and missing statuses) become NaN.
accepted["label"] = accepted["loan_status"].map(LOAN_STATUS_TO_LABEL)

# Discard every row whose status has no label; only 0/1 rows remain.
accepted.drop(index=accepted.index[accepted["label"].isna()], inplace=True)

# NaN forced a float column above; with the NaN rows gone it is safely integral.
accepted["label"] = accepted["label"].astype("int64")

In [None]:
# Columns whose share of missing values exceeds this threshold are removed.
MAX_NAN_SHARE = 0.1

cols_with_frequent_nan = accepted.columns[accepted.isna().mean() > MAX_NAN_SHARE]
accepted.drop(columns=cols_with_frequent_nan, inplace=True)

# `errors="ignore"` keeps this cell re-runnable and avoids a KeyError when
# "policy_code" was already removed by the NaN filter just above.
# NOTE(review): presumably "policy_code" carries no information — confirm.
accepted.drop(columns="policy_code", errors="ignore", inplace=True)

In [None]:
# Split the current column names by dtype: numeric columns vs. all the rest.
# The "number" selector is pandas' string alias for every numeric dtype.
numeric = accepted.select_dtypes(include="number").columns
categorical = accepted.select_dtypes(exclude="number").columns

In [None]:
def calcDrop(res):
    """Resolve a table of correlated variable pairs into a list of drops.

    Parameters
    ----------
    res : pandas.DataFrame
        One row per correlated pair. Only three columns are read: "v1" and
        "v2" (the two variable names) and "drop" (the member of the pair
        proposed for removal).

    Returns
    -------
    list
        Names of the variables to drop. The order comes from set iteration
        and is therefore unspecified.
    """
    first, second, proposed = res["v1"], res["v2"], res["drop"]

    involved = set(first.tolist()) | set(second.tolist())
    candidates = set(proposed.tolist())

    # A variable that was never proposed for removal is always kept.
    keep = involved - candidates

    # Every partner of a kept variable is dropped.
    touches_keep = first.isin(keep) | second.isin(keep)
    partners = set(first[touches_keep].tolist()) | set(second[touches_keep].tolist())
    drop = list(partners - keep)

    # Candidates not settled by the step above.
    undecided = candidates.difference(drop)

    # Among pairs that involve an undecided candidate, skip those that already
    # contain a dropped variable; for the rest, take the proposed drop.
    touches_undecided = first.isin(undecided) | second.isin(undecided)
    already_settled = first.isin(drop) | second.isin(drop)
    drop.extend(set(proposed[touches_undecided & ~already_settled].tolist()))

    return drop


def corrX(df, cut=0.9):
    """Suggest columns to drop so that no pair stays correlated above `cut`.

    Every pair of columns whose absolute correlation exceeds `cut` is
    recorded together with the member that has the larger mean absolute
    correlation with all columns (ties go to the second member); that member
    is the proposed drop. The pair table is then resolved by `calcDrop`.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data. Only numeric and boolean columns take part; other
        columns are ignored.
    cut : float, default 0.9
        Absolute-correlation threshold (strictly greater than).

    Returns
    -------
    list
        Whatever `calcDrop` returns for the pair table: the column names to
        drop.
    """
    # Select numeric/bool columns explicitly: recent pandas versions raise in
    # `corr()` when the frame contains non-numeric columns instead of
    # silently skipping them.
    corr_mtx = df.select_dtypes(include=["number", "bool"]).corr().abs()
    avg_corr = corr_mtx.mean(axis=1)
    names = corr_mtx.columns

    # Strict upper triangle: each unordered pair once, no self-pairs.
    # NaN correlations compare False and are therefore never selected.
    above_cut = np.triu(corr_mtx.to_numpy() > cut, k=1)

    # `np.nonzero` walks the matrix row by row, i.e. the same order as a
    # nested row/column loop.
    records = []
    for row, col in zip(*np.nonzero(above_cut)):
        # Positional access: `avg_corr` is indexed by column label.
        row_avg = avg_corr.iloc[row]
        col_avg = avg_corr.iloc[col]
        records.append(
            {
                "v1": names[row],
                "v2": names[col],
                # Despite the column names, these hold the mean absolute
                # correlation of each member, not a correlation with a target.
                "v1.target": row_avg,
                "v2.target": col_avg,
                "corr": corr_mtx.iat[row, col],
                "drop": names[row] if row_avg > col_avg else names[col],
            }
        )

    # Build the table once; `DataFrame.append` no longer exists in pandas 2.x
    # and growing a frame row by row is quadratic anyway.
    res = pd.DataFrame(
        records, columns=["v1", "v2", "v1.target", "v2.target", "corr", "drop"]
    )

    return calcDrop(res)

In [None]:
# Columns flagged by the pairwise-correlation filter. "loan_amnt" is always
# retained; filtering it out (rather than calling `list.remove`) also works
# when the filter did not flag it, where `remove` would raise ValueError.
to_drop = [col for col in corrX(accepted, cut=0.6) if col != "loan_amnt"]

# Scoring results
scoring_result_cols = [
    "installment",
    "total_rec_int",
    "last_pymnt_amnt",
    "id",
    "grade",
    "sub_grade",
]

# Skip names the correlation filter already returned, so the list (and the
# count displayed below) contains no duplicates.
to_drop += [col for col in scoring_result_cols if col not in to_drop]

len(to_drop)

In [None]:
# Column names by dtype: numeric columns vs. all the rest.
# The "number" selector is pandas' string alias for every numeric dtype.
numeric = accepted.select_dtypes(include="number").columns
categorical = accepted.select_dtypes(exclude="number").columns

In [None]:
# corr_new = accepted[numeric].drop(columns = to_drop).corr()
# plt.figure(figsize=(50, 50))
# mask = np.triu(np.ones_like(corr_new, dtype=bool))
# sns.heatmap(corr_new, vmin=-1, vmax=1, annot=True, mask=mask)

In [None]:
# Hand-picked column names that are dropped from `accepted` together with
# `to_drop` in a later cell.
# NOTE(review): the name suggests these are all non-numeric columns, and the
# reason each one is excluded is not recorded here — confirm and document.
to_drop_categorical = [
    "last_pymnt_d",
    "last_credit_pull_d",
    "url",
    "issue_d",
    "debt_settlement_flag",
    "hardship_flag",
    "zip_code",
    "title",
    "pymnt_plan",
]

In [None]:
# Remove both drop lists from the frame in a single in-place pass.
accepted.drop(columns=[*to_drop, *to_drop_categorical], inplace=True)

In [None]:
# Occurrences of each `emp_title` value, most frequent first.
count = (
    accepted["emp_title"]
    .value_counts()
    .sort_values(ascending=False)
)

# Display only the titles that occur more than 1000 times.
count.loc[count.gt(1000)]

In [None]:
# Display the column index of `accepted` as it stands at this point.
accepted.columns

In [None]:
import yaml

# Print all current column names as one block-style YAML document
# (leading "---", one list item per line).
print(
    yaml.dump(
        accepted.columns.tolist(),
        default_flow_style=False,
        explicit_start=True,
    )
)

In [None]:
# Final column-name lists by dtype, as plain Python lists.
numeric = accepted.select_dtypes(include="number").columns.tolist()
categorical = accepted.select_dtypes(exclude="number").columns.tolist()

# Print each list as its own block-style YAML document, numeric first.
print(
    *(
        yaml.dump(names, explicit_start=True, default_flow_style=False)
        for names in (numeric, categorical)
    ),
    sep="\n",
)

In [None]:
from pipeline import get_preprocessing_pipeline

# Build the preprocessing pipeline registered under the name "log_reg".
# NOTE(review): the override strings look like Hydra-style "+key=value"
# additions that set `verbose=10` on the inner estimators of steps 2
# (ImputeNumerical) and 3 (ImputeCategorical) — confirm against pipeline.py.
# NOTE(review): the effect of `debug=True` is not visible here — confirm.
pipeline = get_preprocessing_pipeline(
    name="log_reg",
    overrides=[
        "+preprocessing_pipeline.steps_config.2.ImputeNumerical.inner.verbose=10",
        "+preprocessing_pipeline.steps_config.3.ImputeCategorical.inner.verbose=10",
    ],
    debug=True,
)
# Display the constructed pipeline object as the cell output.
pipeline

In [None]:
# Fit the pipeline and keep its transformed output. Empty lists are passed as
# both X and y.
# NOTE(review): presumably an early pipeline step loads the data itself and
# ignores these inputs — confirm against pipeline.py.
X = pipeline.fit_transform([], y=[])