In [1]:
%reload_ext autoreload
%autoreload 1
import joblib
import numpy as np
import polars as pl
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import classification_report
import auxiliary.lists as aux_lists
import auxiliary.transformers as tr
import auxiliary.eda_functions as eda
import auxiliary.statistics as st
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from IPython.display import Markdown,display
import matplotlib.ticker as ticker
from tabulate import tabulate
%aimport auxiliary.lists
%aimport auxiliary.transformers
%aimport auxiliary.eda_functions
%aimport auxiliary.statistics

In [2]:
# Load the raw CSV tables from the local data/ directory into polars DataFrames.
application_train = pl.read_csv("data/application_train.csv")
# NOTE: the name `applicattion_test` (sic) is referenced with this exact spelling by the next cell.
applicattion_test = pl.read_csv("data/application_test.csv")
bureau_balance = pl.read_csv("data/bureau_balance.csv")
bureau = pl.read_csv("data/bureau.csv")
installments_payments = pl.read_csv("data/installments_payments.csv")
previous_application = pl.read_csv("data/previous_application.csv")
credit_card_balance = pl.read_csv("data/credit_card_balance.csv")

In [3]:
# The test split carries no label: add an all-null TARGET column, typed like the
# train label, so both splits share one schema and can be stacked.
applicattion_test = applicattion_test.with_columns(
    pl.lit(None).cast(application_train.schema["TARGET"]).alias("TARGET")
)
# Align the test columns with the train frame *by name*. The previous positional
# shuffle (first, last, middle) assumed TARGET is the second train column and was
# not idempotent: re-running the cell moved a different column into second place.
applicattion_test = applicattion_test.select(application_train.columns)
application_full = pl.concat([application_train, applicattion_test])

In [4]:
# One pipeline: keep the MONTHS_BALANCE == -1 rows of cards whose status is
# "Active", flag the rows where the balance exceeds the credit limit, then drop
# the two columns that are constant after the filter.
active_credit_cards = (
    credit_card_balance.filter(
        (pl.col("NAME_CONTRACT_STATUS") == "Active")
        & (pl.col("MONTHS_BALANCE") == -1)
    )
    .with_columns(
        # 1 when limit - balance is negative, 0 otherwise (including null inputs).
        pl.when((pl.col("AMT_CREDIT_LIMIT_ACTUAL") - pl.col("AMT_BALANCE")) < 0)
        .then(pl.lit(1, pl.Int16))
        .otherwise(pl.lit(0, pl.Int16))
        .alias("IS_OVER_LIMIT")
    )
    .drop(columns=["MONTHS_BALANCE", "NAME_CONTRACT_STATUS"])
)

### Historical Behavior

In [5]:
# Amount by which the balance exceeds the credit limit (negative when under the limit).
excess_over_limit = pl.col("AMT_BALANCE") - pl.col("AMT_CREDIT_LIMIT_ACTUAL")

# Every monthly record except the MONTHS_BALANCE == -1 snapshot, with two derived
# columns: a 0/1 over-limit flag and the over-limit amount floored at zero.
historical_credit_card = credit_card_balance.filter(
    pl.col("MONTHS_BALANCE") != -1
).with_columns(
    pl.when((pl.col("AMT_CREDIT_LIMIT_ACTUAL") - pl.col("AMT_BALANCE")) < 0)
    .then(pl.lit(1, pl.Int16))
    .otherwise(pl.lit(0, pl.Int16))
    .alias("IS_OVER_LIMIT"),
    pl.when(excess_over_limit > 0)
    .then(excess_over_limit)
    .otherwise(pl.lit(0))
    .alias("OVER_LIMIT"),
)

# Seed the aggregation frame with the ids of the active cards only.
historical_credit_card_agg = active_credit_cards.select("SK_ID_PREV")

History length:

In [6]:
# Number of historical monthly records available for each card.
entries_per_card = historical_credit_card.group_by("SK_ID_PREV").agg(
    pl.count().alias("NUM_ENTRIES")
)
# Inner join (the polars default): cards without any historical record drop out
# of the aggregation frame here.
historical_credit_card_agg = historical_credit_card_agg.join(
    entries_per_card, on="SK_ID_PREV", how="inner"
)

Over-limit aggregations:

In [7]:
# Aggregate both over-limit columns per card with the project helper, then
# discard the min/max of the 0/1 flag.
historical_credit_card_agg = eda.make_aggregations(
    historical_credit_card_agg,
    historical_credit_card,
    ["IS_OVER_LIMIT", "OVER_LIMIT"],
    "SK_ID_PREV",
).drop(columns=["IS_OVER_LIMIT_min", "IS_OVER_LIMIT_max"])

In [8]:
# Attach the per-card history aggregates; every column except the join key is
# prefixed with HISTORIC_. The left join keeps cards that have no aggregates.
historic_features = historical_credit_card_agg.select(
    [pl.col("SK_ID_PREV"), pl.exclude("SK_ID_PREV").prefix("HISTORIC_")]
)
active_credit_cards = active_credit_cards.join(
    historic_features, on="SK_ID_PREV", how="left"
)

### Application Attributes

In [9]:
# Keep only the previous applications whose SK_ID_PREV belongs to an active card.
credit_card_applications = previous_application.filter(
    pl.col("SK_ID_PREV").is_in(active_credit_cards.get_column("SK_ID_PREV"))
)

**Dropping features irrelevant to credit cards:**

In [10]:
# Application columns removed before the merge into the credit-card table.
# Each name appears exactly once ("RATE_DOWN_PAYMENT" was previously listed twice).
cols_to_drop = [
    "SK_ID_CURR",
    "NAME_CONTRACT_TYPE",
    "AMT_DOWN_PAYMENT",
    "AMT_GOODS_PRICE",
    "RATE_DOWN_PAYMENT",
    "RATE_INTEREST_PRIMARY",
    "RATE_INTEREST_PRIVILEGED",
    "NAME_CASH_LOAN_PURPOSE",
    "NAME_CONTRACT_STATUS",
    "DAYS_DECISION",
    "NAME_PAYMENT_TYPE",
    "FLAG_LAST_APPL_PER_CONTRACT",
    "CODE_REJECT_REASON",
    "NAME_GOODS_CATEGORY",
    "NAME_PORTFOLIO",
    "CNT_PAYMENT",
    "NAME_YIELD_GROUP",
    "DAYS_FIRST_DRAWING",
    "DAYS_FIRST_DUE",
    "DAYS_LAST_DUE_1ST_VERSION",
    "DAYS_LAST_DUE",
    "DAYS_TERMINATION",
]
credit_card_applications = credit_card_applications.drop(columns=cols_to_drop)

In [11]:
# Attach the remaining application attributes; every column except the join key
# is prefixed with APPLICATION_.
application_features = credit_card_applications.select(
    [pl.col("SK_ID_PREV"), pl.exclude("SK_ID_PREV").prefix("APPLICATION_")]
)
active_credit_cards = active_credit_cards.join(
    application_features, on="SK_ID_PREV", how="left"
)

### Payment Behavioral Data

In [12]:
# Installments of the active cards, restricted to records whose due day and
# payment day are both below -31. Filtering out new data to avoid leakage.
# Rows where either day is null fail the comparison and are dropped as well.
credit_card_installments = installments_payments.filter(
    pl.col("SK_ID_PREV").is_in(active_credit_cards.get_column("SK_ID_PREV"))
    & (pl.col("DAYS_INSTALMENT") < -31)
    & (pl.col("DAYS_ENTRY_PAYMENT") < -31)
)

In [13]:
# Seed the installment aggregation frame with one column: the active cards' ids.
installments_agg = active_credit_cards.select("SK_ID_PREV")

In [14]:
# Days between the scheduled due day and the actual payment day. Both columns are
# day offsets, so a payment made AFTER the due day has the larger value and
# (DAYS_ENTRY_PAYMENT - DAYS_INSTALMENT) is positive exactly when the payment is
# late. The previous operand order (due - paid) was positive for EARLY payments,
# so after clipping at zero the feature measured earliness instead of lateness.
credit_card_installments = credit_card_installments.with_columns(
    (pl.col("DAYS_ENTRY_PAYMENT") - pl.col("DAYS_INSTALMENT")).alias("DAYS_LATE")
)

# Early or on-time payments count as 0 days late; nulls stay null.
credit_card_installments = credit_card_installments.with_columns(
    pl.when(pl.col("DAYS_LATE") < 0)
    .then(pl.lit(0))
    .otherwise(pl.col("DAYS_LATE"))
    .alias("DAYS_LATE")
)


# Shortfall between the prescribed installment amount and the amount actually paid.
credit_card_installments = credit_card_installments.with_columns(
    (pl.col("AMT_INSTALMENT") - pl.col("AMT_PAYMENT")).alias("AMT_UNDERPAID")
)
# Overpayments count as 0 underpaid; nulls stay null.
credit_card_installments = credit_card_installments.with_columns(
    pl.when(pl.col("AMT_UNDERPAID") < 0)
    .then(pl.lit(0))
    .otherwise(pl.col("AMT_UNDERPAID"))
    .alias("AMT_UNDERPAID")
)

In [15]:
# Aggregate each payment-behaviour feature per card, one helper call per feature
# and in the same order as before: DAYS_LATE first, then AMT_UNDERPAID.
for installment_feature in ("DAYS_LATE", "AMT_UNDERPAID"):
    installments_agg = eda.make_aggregations(
        installments_agg,
        credit_card_installments,
        [installment_feature],
        id="SK_ID_PREV",
        aggregations=["mean", "sum", "max"],
        join_suffix="inst",
    )

In [16]:
# Attach the installment aggregates; every column except the join key is
# prefixed with instalment_.
installment_features = installments_agg.select(
    [pl.col("SK_ID_PREV"), pl.exclude("SK_ID_PREV").prefix("instalment_")]
)
active_credit_cards = active_credit_cards.join(
    installment_features, on="SK_ID_PREV", how="left"
)


### Current status

In [17]:
# Client-level attributes taken from the combined train + test application table;
# SK_ID_CURR is the join key.
application_cols_to_merge = [
    "SK_ID_CURR",
    "CODE_GENDER",
    "FLAG_OWN_REALTY",
    "CNT_CHILDREN",
    "AMT_INCOME_TOTAL",
    "NAME_TYPE_SUITE",
    "NAME_INCOME_TYPE",
    "NAME_EDUCATION_TYPE",
    "NAME_FAMILY_STATUS",
    "NAME_HOUSING_TYPE",
    "REGION_POPULATION_RELATIVE",
    "DAYS_BIRTH",
    "DAYS_EMPLOYED",
    "OWN_CAR_AGE",
    "OCCUPATION_TYPE",
    "CNT_FAM_MEMBERS",
    "REGION_RATING_CLIENT",
    "REGION_RATING_CLIENT_W_CITY",
    "REG_REGION_NOT_LIVE_REGION",
    "REG_REGION_NOT_WORK_REGION",
    "LIVE_REGION_NOT_WORK_REGION",
    "REG_CITY_NOT_LIVE_CITY",
    "REG_CITY_NOT_WORK_CITY",
    "LIVE_CITY_NOT_WORK_CITY",
    "ORGANIZATION_TYPE",
]
active_credit_cards = active_credit_cards.join(
    application_full.select(application_cols_to_merge),
    on="SK_ID_CURR",
    how="left",
)

In [18]:
# Both identifier columns have served as join keys; remove them from the feature table.
active_credit_cards = active_credit_cards.drop(["SK_ID_PREV", "SK_ID_CURR"])

## Feature Selection

### Variance

In [19]:
# Names of the string-typed columns, in frame order.
str_cols = [
    column_name
    for column_name, column_dtype in active_credit_cards.schema.items()
    if column_dtype == pl.Utf8
]

In [20]:
# Fit one project label encoder per string column against IS_OVER_LIMIT and
# replace that column with its encoded values; fitted encoders are kept by name.
home_target_encoders = {}
active_credit_cards_encoded = active_credit_cards.clone()
for col in str_cols:
    encoder = tr.TargetMeanOrderedLabeler(how="label")
    home_target_encoders[col] = encoder
    encoded_column = encoder.fit_transform(
        active_credit_cards_encoded[col],
        active_credit_cards_encoded["IS_OVER_LIMIT"],
    ).alias(col)
    active_credit_cards_encoded = active_credit_cards_encoded.with_columns(
        encoded_column
    )

# Min-max scale every column and wrap the result back into a polars frame that
# reuses the encoded frame's column names.
scaled_values = MinMaxScaler().fit_transform(active_credit_cards_encoded)
active_credit_cards_encoded_scaled = pl.DataFrame(
    scaled_values,
    schema=active_credit_cards_encoded.columns,
)

In [21]:
# Per-feature variance of the min-max scaled data.
# np.nanvar ignores missing values: with np.var a single NaN in a column makes
# that column's variance NaN, so it can never satisfy a `variance < threshold`
# filter and would silently escape the low-variance screen.
feature_variance = pl.DataFrame(
    {
        "feature": active_credit_cards_encoded_scaled.columns,
        "variance": np.nanvar(
            active_credit_cards_encoded_scaled.to_numpy(), axis=0
        ),
    }
)
# Show the 20 lowest-variance features.
feature_variance.sort("variance").head(20)

feature,variance
str,f64
"""SK_DPD_DEF""",4.5e-05
"""AMT_INCOME_TOT…",6.7e-05
"""AMT_PAYMENT_TO…",0.000445
"""AMT_DRAWINGS_C…",0.000465
"""SK_DPD""",0.000796
"""CNT_CHILDREN""",0.001457
"""CNT_DRAWINGS_C…",0.001846
"""CNT_FAM_MEMBER…",0.00221
"""REG_REGION_NOT…",0.012806
"""AMT_INST_MIN_R…",0.01394


In [23]:
# Names of the features whose scaled variance is below 0.001.
feature_variance.filter(pl.col("variance") < 0.001).get_column("feature")

feature
str
"""AMT_DRAWINGS_C…"
"""AMT_PAYMENT_TO…"
"""SK_DPD"""
"""SK_DPD_DEF"""
"""AMT_INCOME_TOT…"


In [24]:
# Among the features with variance below 0.001, collect those with fewer than 10
# distinct scaled values and report the size of their rarest value.
low_var_fatures = []
low_variance_names = feature_variance.filter(pl.col("variance") < 0.001)["feature"]
for feature in low_variance_names:
    scaled_feature = active_credit_cards_encoded_scaled[feature]
    if scaled_feature.n_unique() >= 10:
        # Too many distinct values to treat as a near-constant categorical.
        continue
    low_var_fatures.append(feature)
    # Ascending sort on the counts puts the rarest value first.
    minority_count = scaled_feature.value_counts().sort("counts")["counts"][0]
    print(
        f"{feature}: minority class count = {minority_count} feature will be removed"
    )

### Highly Correlated

In [25]:
# Run the project correlation helper on every encoded feature except the target,
# with thresholds of +/-0.999, and print the resulting clusters as a text table.
features_without_target = active_credit_cards_encoded.drop(columns="IS_OVER_LIMIT")
feature_correlation_test = st.get_correlation_pairs(
    features_without_target,
    max_threshold=0.999,
    min_threshold=-0.999,
)
print(tabulate(feature_correlation_test["clusters"]))

--------------------  -------------
AMT_TOTAL_RECEIVABLE  AMT_RECIVABLE
--------------------  -------------


In [26]:
# Display the 'pairs' entry of the correlation result (feature pairs with their correlation).
feature_correlation_test['pairs']

features,correlation
list[str],f64
"[""AMT_RECIVABLE"", ""AMT_TOTAL_RECEIVABLE""]",1.0


In [27]:
# AMT_TOTAL_RECEIVABLE correlates at 1.0 with AMT_RECIVABLE, so keep only the latter.
active_credit_cards = active_credit_cards.drop("AMT_TOTAL_RECEIVABLE")

In [28]:
# Persist the assembled feature table as parquet.
# NOTE(review): presumably requires the temp/ directory to already exist — confirm.
active_credit_cards.write_parquet('temp/active_credit_cards.parquet')