In [1]:
%reload_ext autoreload
%autoreload 1
import joblib
import numpy as np
import polars as pl
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import classification_report
import auxiliary.lists as aux_lists
import auxiliary.transformers as tr
import auxiliary.eda_functions as eda
import auxiliary.statistics as st
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from IPython.display import Markdown,display
import matplotlib.ticker as ticker
from tabulate import tabulate
%aimport auxiliary.lists
%aimport auxiliary.transformers
%aimport auxiliary.eda_functions
%aimport auxiliary.statistics

In [2]:
def _read_table(table_name):
    """Read ``data/<table_name>.csv`` into a polars DataFrame."""
    return pl.read_csv(f"data/{table_name}.csv")


# One DataFrame per raw CSV table; each variable is named after its file.
application_train = _read_table("application_train")
bureau_balance = _read_table("bureau_balance")
bureau = _read_table("bureau")
installments_payments = _read_table("installments_payments")
POS_CASH_balance = _read_table("POS_CASH_balance")
previous_application = _read_table("previous_application")
credit_card_balance = _read_table("credit_card_balance")

In [3]:
# Row-level predicates selecting the snapshot of currently active cards.
_is_snapshot_month = pl.col("MONTHS_BALANCE") == -1
_is_active_status = pl.col("NAME_CONTRACT_STATUS") == "Active"

# Credit limit minus balance: negative when the balance exceeds the limit.
_limit_headroom = pl.col("AMT_CREDIT_LIMIT_ACTUAL") - pl.col("AMT_BALANCE")

active_credit_cards = (
    credit_card_balance.filter(_is_snapshot_month & _is_active_status)
    .with_columns(
        # 1/0 flag (Int16): 1 when the balance is above the credit limit.
        pl.when(_limit_headroom < 0)
        .then(pl.lit(1, pl.Int16))
        .otherwise(pl.lit(0, pl.Int16))
        .alias("IS_OVER_LIMIT")
    )
    # MONTHS_BALANCE is constant (-1) after the filter above, so it is dropped.
    .drop(columns="MONTHS_BALANCE")
)

### Historic Behavior

In [4]:
# Credit limit minus balance: negative when the balance exceeds the limit.
_hist_headroom = pl.col("AMT_CREDIT_LIMIT_ACTUAL") - pl.col("AMT_BALANCE")
# Balance minus credit limit: positive amount by which the limit is exceeded.
_hist_excess = pl.col("AMT_BALANCE") - pl.col("AMT_CREDIT_LIMIT_ACTUAL")

# Every balance record except the snapshot month (MONTHS_BALANCE == -1),
# enriched with an over-limit flag and the over-limit amount.
historical_credit_card = credit_card_balance.filter(
    pl.col("MONTHS_BALANCE") != -1
).with_columns(
    [
        # 1/0 flag (Int16): 1 when the balance is above the credit limit.
        pl.when(_hist_headroom < 0)
        .then(pl.lit(1, pl.Int16))
        .otherwise(pl.lit(0, pl.Int16))
        .alias("IS_OVER_LIMIT"),
        # Amount above the limit, floored at 0 when the card is within its limit.
        pl.when(_hist_excess > 0)
        .then(_hist_excess)
        .otherwise(pl.lit(0))
        .alias("OVER_LIMIT"),
    ]
)

# Seed frame for per-card aggregates: one SK_ID_PREV column taken from the
# active-card snapshot.
historical_credit_card_agg = active_credit_cards.select("SK_ID_PREV")

History length (number of balance records per card outside the snapshot month):

In [5]:
# Number of historical balance rows recorded for each card.
_entries_per_card = historical_credit_card.group_by("SK_ID_PREV").agg(
    pl.count().alias("NUM_ENTRIES")
)

# Inner join (the polars default, spelled out here): cards that have no rows
# in `historical_credit_card` are dropped from the aggregate frame.
historical_credit_card_agg = historical_credit_card_agg.join(
    _entries_per_card,
    on="SK_ID_PREV",
    how="inner",
)

Over-limit aggregations:

In [6]:
# Columns of `historical_credit_card` to aggregate per SK_ID_PREV.
_over_limit_features = ["IS_OVER_LIMIT", "OVER_LIMIT"]

# NOTE(review): `eda.make_aggregations` is a project helper; it presumably adds
# `<column>_<aggregation>` columns to the first frame — confirm in
# auxiliary/eda_functions.py.
_with_over_limit_aggs = eda.make_aggregations(
    historical_credit_card_agg,
    historical_credit_card,
    _over_limit_features,
    "SK_ID_PREV",
)

# The min/max of the 0/1 flag are discarded; the remaining aggregates are kept.
historical_credit_card_agg = _with_over_limit_aggs.drop(
    columns=["IS_OVER_LIMIT_min", "IS_OVER_LIMIT_max"]
)

In [7]:
# Prefix every aggregate column with HISTORIC_ while leaving the join key as is.
_historic_features = historical_credit_card_agg.select(
    [
        pl.col("SK_ID_PREV"),
        pl.all().exclude("SK_ID_PREV").prefix("HISTORIC_"),
    ]
)

# Left join keeps every active card; cards without aggregates get nulls.
active_credit_cards = active_credit_cards.join(
    _historic_features, on="SK_ID_PREV", how="left"
)

### Application Attributes

In [8]:
# Keep only the previous applications whose SK_ID_PREV belongs to an active card.
_active_card_ids = active_credit_cards["SK_ID_PREV"]
_is_active_card = pl.col("SK_ID_PREV").is_in(_active_card_ids)
credit_card_applications = previous_application.filter(_is_active_card)

**Dropping features irrelevant to credit cards:**

In [9]:
# Application columns removed before joining onto the active-card frame.
# Each name appears exactly once: the original list repeated
# "RATE_DOWN_PAYMENT", which is redundant and, depending on the polars
# version, can raise when the already-dropped column is dropped again.
# NOTE(review): the duplicate sat right before "RATE_INTEREST_PRIVILEGED";
# it may have been meant to be "RATE_INTEREST_PRIMARY" — confirm whether that
# column should be dropped as well.
cols_to_drop = [
    "SK_ID_CURR",
    "NAME_CONTRACT_TYPE",
    "AMT_DOWN_PAYMENT",
    "AMT_GOODS_PRICE",
    "RATE_DOWN_PAYMENT",
    "RATE_INTEREST_PRIVILEGED",
    "NAME_CASH_LOAN_PURPOSE",
    "NAME_CONTRACT_STATUS",
    "DAYS_DECISION",
    "NAME_PAYMENT_TYPE",
    "FLAG_LAST_APPL_PER_CONTRACT",
    "CODE_REJECT_REASON",
    "NAME_GOODS_CATEGORY",
    "NAME_PORTFOLIO",
    "CNT_PAYMENT",
    "NAME_YIELD_GROUP",
    "DAYS_FIRST_DRAWING",
    "DAYS_FIRST_DUE",
    "DAYS_LAST_DUE_1ST_VERSION",
    "DAYS_LAST_DUE",
    "DAYS_TERMINATION",
]
credit_card_applications = credit_card_applications.drop(columns=cols_to_drop)

In [10]:
# Prefix every application attribute with APPLICATION_, keeping the join key.
_application_features = credit_card_applications.select(
    [
        pl.col("SK_ID_PREV"),
        pl.all().exclude("SK_ID_PREV").prefix("APPLICATION_"),
    ]
)

# Left join keeps every active card even when no application row matches.
active_credit_cards = active_credit_cards.join(
    _application_features,
    on="SK_ID_PREV",
    how="left",
)

### Payment Behavioral Data

In [16]:
# Day offsets at or after this cutoff are treated as too recent and removed
# to avoid leakage.
_LEAKAGE_CUTOFF_DAYS = -31

_belongs_to_active_card = pl.col("SK_ID_PREV").is_in(active_credit_cards["SK_ID_PREV"])
_is_before_cutoff = (pl.col("DAYS_INSTALMENT") < _LEAKAGE_CUTOFF_DAYS) & (
    pl.col("DAYS_ENTRY_PAYMENT") < _LEAKAGE_CUTOFF_DAYS
)

# Installment rows of active cards, restricted to records older than the cutoff.
credit_card_installments = installments_payments.filter(
    _belongs_to_active_card & _is_before_cutoff
)

In [18]:
# Seed frame for installment aggregates: one SK_ID_PREV column from the active cards.
installments_agg = active_credit_cards.select("SK_ID_PREV")

In [19]:
# Days between the actual payment and the scheduled date.
# Fix: the original computed DAYS_INSTALMENT - DAYS_ENTRY_PAYMENT, which is
# positive when the payment was entered BEFORE the scheduled date, so after
# clamping the column held "days early" rather than "days late".
# NOTE(review): this relies on DAYS_INSTALMENT being the due date and
# DAYS_ENTRY_PAYMENT the actual payment date (both as day offsets, larger =
# later), as in the dataset's column descriptions — confirm.
_days_late = pl.col("DAYS_ENTRY_PAYMENT") - pl.col("DAYS_INSTALMENT")

# Amount still owed on the installment after the payment.
_amt_underpaid = pl.col("AMT_INSTALMENT") - pl.col("AMT_PAYMENT")

credit_card_installments = credit_card_installments.with_columns(
    [
        # On-time or early payments count as 0 days late; nulls stay null.
        pl.when(_days_late < 0)
        .then(pl.lit(0))
        .otherwise(_days_late)
        .alias("DAYS_LATE"),
        # Overpayments count as 0 underpaid; nulls stay null.
        pl.when(_amt_underpaid < 0)
        .then(pl.lit(0))
        .otherwise(_amt_underpaid)
        .alias("AMT_UNDERPAID"),
    ]
)

In [21]:
# Aggregate each engineered installment feature per SK_ID_PREV, one helper
# call per feature, in the order DAYS_LATE then AMT_UNDERPAID.
# NOTE(review): `eda.make_aggregations` is a project helper; the meaning of
# `join_suffix` is not visible here — confirm in auxiliary/eda_functions.py.
for _feature in ("DAYS_LATE", "AMT_UNDERPAID"):
    installments_agg = eda.make_aggregations(
        installments_agg,
        credit_card_installments,
        [_feature],
        id="SK_ID_PREV",
        aggregations=["mean", "sum", "max"],
        join_suffix="inst",
    )

In [23]:
# Prefix every installment aggregate with instalment_, keeping the join key.
_installment_features = installments_agg.select(
    [
        pl.col("SK_ID_PREV"),
        pl.all().exclude("SK_ID_PREV").prefix("instalment_"),
    ]
)

# Left join keeps every active card; cards without installment data get nulls.
active_credit_cards = active_credit_cards.join(
    _installment_features, on="SK_ID_PREV", how="left"
)
