# Feature generation and filtering

## Intro

In [1]:
%reload_ext autoreload
%autoreload 1
import joblib
import numpy as np
import polars as pl
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import classification_report
import auxiliary.lists as aux_lists
import auxiliary.transformers as tr
import auxiliary.eda_functions as eda
import auxiliary.statistics as st
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from IPython.display import Markdown,display
import matplotlib.ticker as ticker
from tabulate import tabulate
%aimport auxiliary.lists
%aimport auxiliary.transformers
%aimport auxiliary.eda_functions
%aimport auxiliary.statistics

In [2]:
sns.set()
BASE_FIG_SIZE = (8.5, 4.5)

In [3]:
applicattion_test = pl.read_csv("data/application_test.csv")
application_train = pl.read_csv("data/application_train.csv")
bureau_balance = pl.read_csv("data/bureau_balance.csv")
bureau = pl.read_csv("data/bureau.csv")
installments_payments = pl.read_csv("data/installments_payments.csv")
POS_CASH_balance = pl.read_csv("data/POS_CASH_balance.csv")
previous_application = pl.read_csv("data/previous_application.csv")
credit_card_balance = pl.read_csv("data/credit_card_balance.csv")
id_and_target = ["SK_ID_CURR", "TARGET"]

In [4]:
applicattion_test = applicattion_test.with_columns(
    pl.lit(None).cast(pl.Int64).alias("TARGET")
)
applicattion_test = applicattion_test[
    [applicattion_test.columns[0]]
    + [applicattion_test.columns[-1]]
    + applicattion_test.columns[1:-1]
]
application_full = pl.concat([application_train, applicattion_test])

### Bureau

**Removing amount values for non-major currencies as three is no way of converting:**

In [5]:
for col in [col for col in bureau.columns if "AMT" in col]:
    bureau = bureau.with_columns(
        pl.when(pl.col("CREDIT_CURRENCY") != "currency 1")
        .then(pl.lit(None))
        .otherwise(pl.col(col))
        .alias(col)
    )

**Numeric feature aggregations:**

In [6]:
bureau_agg = pl.DataFrame(application_full["SK_ID_CURR"])

bureau_num_cols = (
    bureau.select([pl.col(pl.FLOAT_DTYPES), pl.col(pl.INTEGER_DTYPES)])
    .drop(columns=id_and_target)
    .columns
)
bureau_num_cols.remove("SK_ID_BUREAU")

bureau_agg = eda.make_aggregations(
    bureau_agg, bureau, bureau_num_cols, "SK_ID_CURR", join_suffix="TOT"
)


**Aggregations from active credits only**

In [7]:
bureau_agg = eda.make_aggregations(
    bureau_agg,
    bureau.filter(pl.col("CREDIT_ACTIVE") == "Active"),
    bureau_num_cols,
    "SK_ID_CURR",
    join_suffix="_Active",
)


**Credit Counts:**

In [8]:
bureau_agg = bureau_agg.join(
    bureau.group_by(pl.col("SK_ID_CURR")).count(),
    on="SK_ID_CURR",
    how="left",
    suffix="_tot",
)

for credit_activity in ["Active", "Closed", "Sold"]:
    bureau_agg = bureau_agg.join(
        bureau.filter(pl.col("CREDIT_ACTIVE") == credit_activity)
        .group_by(pl.col("SK_ID_CURR"))
        .count(),
        on="SK_ID_CURR",
        how="left",
        suffix=f"_{credit_activity}",
    )

**Credit Type**

In [9]:
bureau_agg = bureau_agg.join(
    bureau.group_by(pl.col("SK_ID_CURR")).agg(
        pl.col("CREDIT_TYPE").mode().first().suffix("_mode")
    ),
    on="SK_ID_CURR",
    how="left",
    suffix="_tot",
)

bureau_agg = bureau_agg.join(
    bureau.filter(pl.col("CREDIT_ACTIVE") == "Active")
    .group_by(pl.col("SK_ID_CURR"))
    .agg(pl.col("CREDIT_TYPE").mode().first().suffix("_mode")),
    on="SK_ID_CURR",
    how="left",
    suffix="_active",
)

**Adding last balance status to bureau data:**

In [10]:
bureau = bureau.join(
    bureau_balance.group_by("SK_ID_BUREAU").agg(
        [pl.all().sort_by("MONTHS_BALANCE").last()]
    )[["SK_ID_BUREAU", "STATUS"]],
    on="SK_ID_BUREAU",
    how="left",
    suffix="_last",
)

**Adding days-past-due for each credit entry (last status) to bureau:**

In [11]:
bureau = bureau.with_columns(
    pl.col("STATUS").str.to_decimal().cast(pl.Int16).alias("DPD_approx")
)


**Adding the data to the aggregated table**

In [12]:
bureau_agg = bureau_agg.join(
    bureau.group_by(pl.col("SK_ID_CURR")).agg(
        pl.col("STATUS").mode().first().suffix("_mode")
    ),
    on="SK_ID_CURR",
    how="left",
    suffix="_tot",
)

bureau_agg = bureau_agg.join(
    bureau.group_by(pl.col("SK_ID_CURR")).agg(pl.col("DPD_approx").sum()),
    on="SK_ID_CURR",
    how="left",
    suffix="_tot",
)

bureau_agg = bureau_agg.join(
    bureau.filter(pl.col("CREDIT_ACTIVE") == "Active")
    .group_by(pl.col("SK_ID_CURR"))
    .agg(pl.col("DPD_approx").sum()),
    on="SK_ID_CURR",
    how="left",
    suffix="_active",
)

bureau_agg = bureau_agg.join(
    bureau.filter(pl.col("CREDIT_ACTIVE") == "Active")
    .group_by(pl.col("SK_ID_CURR"))
    .agg(pl.col("STATUS").mode().first().suffix("_mode")),
    on="SK_ID_CURR",
    how="left",
    suffix="_active",
)


**Adding the bureau features to the application table:**

In [13]:
application_full = application_full.join(
    bureau_agg.select(
        [pl.col("SK_ID_CURR"), pl.all().exclude("SK_ID_CURR").prefix("bureau_")]
    ),
    on="SK_ID_CURR",
    how="left",
)


### Previous Applications

**Filter only last applications per contract**

In [14]:
previous_application_agg = pl.DataFrame(application_full["SK_ID_CURR"])
previous_application = previous_application.filter(
    pl.col("FLAG_LAST_APPL_PER_CONTRACT") == "Y"
)

#### Positive Cash Balance

**Filter to only last entries for active previous loans and calculate installments left**

In [15]:
POS_CASH_balance = POS_CASH_balance.group_by("SK_ID_PREV").agg(
    [pl.all().sort_by("MONTHS_BALANCE").last()]
)
POS_CASH_balance = POS_CASH_balance.filter(pl.col("NAME_CONTRACT_STATUS") == "Active")

POS_CASH_balance = POS_CASH_balance.with_columns(
    (pl.col("CNT_INSTALMENT_FUTURE") + pl.col("MONTHS_BALANCE")).alias(
        "installments_left"
    )
)

POS_CASH_balance = POS_CASH_balance.with_columns(
    pl.when(pl.col("installments_left") < 1)
    .then(pl.lit(0))
    .otherwise(pl.col("installments_left"))
    .alias("installments_left")
)

**Adding installments left to previous application data:**

In [16]:
previous_application = previous_application.join(
    POS_CASH_balance[["SK_ID_PREV", "installments_left"]],
    on="SK_ID_PREV",
    how="left",
)

#### Credit Card Balance

**Calculating credit card balance and limit difference:**

In [17]:
credit_card_balance = credit_card_balance.with_columns(
    (pl.col("AMT_CREDIT_LIMIT_ACTUAL") - pl.col("AMT_BALANCE")).alias(
        "balance_limit_diff"
    )
)


**Amount of months where credit balance was over limit**

In [18]:
previous_application = previous_application.join(
    credit_card_balance.filter(pl.col("balance_limit_diff") < 0)
    .group_by("SK_ID_PREV")
    .agg(pl.count().suffix("_count_neg")),
    on="SK_ID_PREV",
    how="left",
)


**Current Credit Card Balance and Balance under Limit**

In [19]:
previous_application = previous_application.join(
    credit_card_balance.filter(
        (pl.col("MONTHS_BALANCE") == -1) & (pl.col("NAME_CONTRACT_STATUS") == "Active")
    ).select(
        [
            pl.col("SK_ID_PREV"),
            pl.col("balance_limit_diff").alias("curr_bal_lim_diff"),
            pl.col("AMT_BALANCE").alias("AMT_BALANCE_CURR"),
        ]
    ),
    on="SK_ID_PREV",
    how="left",
)


#### Making aggregations by current application:

**Sum of previous credits for application with different status and type:**

In [20]:
previous_application_agg = eda.make_aggregations(
    previous_application_agg,
    previous_application,
    ["AMT_CREDIT"],
    "SK_ID_CURR",
    ["sum", "mean"],
    "_total",
)

for status in ["Approved", "Refused", "Canceled"]:
    previous_application_agg = eda.make_aggregations(
        previous_application_agg,
        previous_application.filter(pl.col("NAME_CONTRACT_STATUS") == status),
        ["AMT_CREDIT"],
        "SK_ID_CURR",
        ["sum", "mean"],
        f"_{status}",
    )

for type in ["Consumer loans", "Revolving loans", "Cash loans"]:
    previous_application_agg = eda.make_aggregations(
        previous_application_agg,
        previous_application.filter(pl.col("NAME_CONTRACT_TYPE") == type),
        ["AMT_CREDIT"],
        "SK_ID_CURR",
        ["sum", "mean"],
        f"_{type.replace(' ','_')}",
    )

**Adding counts of accepted and rejected previous applications**

In [21]:
for status in ["Approved", "Refused"]:
    previous_application_agg = previous_application_agg.join(
        previous_application.filter(pl.col("NAME_CONTRACT_STATUS") == status)
        .group_by("SK_ID_CURR")
        .agg(pl.count().suffix(f"_{status}")),
        on="SK_ID_CURR",
        how="left",
    )

**Amount of total payments left from previous applications:**

In [22]:
previous_application = previous_application.with_columns(
    (pl.col("AMT_ANNUITY") * pl.col("installments_left")).alias("payment_left")
)

previous_application_agg = previous_application_agg.join(
    previous_application.group_by("SK_ID_CURR").agg(pl.col("payment_left").sum()),
    on="SK_ID_CURR",
    how="left",
)

**Difference between credit limit and balance**

In [23]:
previous_application_agg = eda.make_aggregations(
    previous_application_agg,
    previous_application,
    ["AMT_BALANCE_CURR", "curr_bal_lim_diff"],
    "SK_ID_CURR",
    ["sum"],
    "_CURR",
)

In [24]:
application_full = application_full.join(
    previous_application_agg.select(
        [pl.col("SK_ID_CURR"), pl.all().exclude("SK_ID_CURR").prefix("prev_")]
    ),
    on="SK_ID_CURR",
    how="left",
)


# else

In [25]:
application_train = application_full.filter(pl.col("TARGET").is_not_null())
applicattion_test = application_full.filter(pl.col("TARGET").is_null())

In [26]:
# test_bureu_aggs = eda.test_with_catboost_crossval(
#     application_train.drop(columns=id_and_target),
#     application_train["TARGET"],
#     sample_size=10000,
#     cat_features=application_train.select(pl.col(pl.Utf8)).columns,
# )


In [27]:
# test_bureu_aggs["scores"]

In [28]:
# test_bureu_aggs["features"]

### Feature importance

### Common Sense Removal

In [29]:
application_edited_train = application_train.clone()
cols_to_remove = []

#### Home Feature Elimination

### Col cats

In [30]:
application_train.filter(pl.col("FLAG_OWN_CAR") == "Y")["OWN_CAR_AGE"].null_count()

5

In [31]:
cols_to_remove.append("FLAG_OWN_CAR")

In [32]:
application_edited_train = application_edited_train.drop(cols_to_remove)

string_cols_reduced = application_edited_train.select(pl.col(pl.Utf8)).columns

### Variance

In [33]:
home_target_encoders = {}
application_edited_encoded = application_edited_train.clone()
for col in string_cols_reduced:
    home_target_encoders[col] = tr.TargetMeanOrderedLabeler(how="label")
    application_edited_encoded = application_edited_encoded.with_columns(
        home_target_encoders[col]
        .fit_transform(
            application_edited_encoded[col], application_edited_encoded["TARGET"]
        )
        .alias(col)
    )
application_edited_encoded = pl.DataFrame(
    MinMaxScaler().fit_transform(application_edited_encoded),
    schema=application_edited_encoded.columns,
)

In [34]:
feature_variance = pl.DataFrame(
    {
        "feature": application_edited_encoded.columns,
        "variance": np.var(
            application_edited_encoded.to_numpy(), axis=0
        ),
    }
)
feature_variance.sort("variance")[:20]

feature,variance
str,f64
"""FLAG_MOBIL""",3e-06
"""AMT_INCOME_TOT…",4e-06
"""FLAG_DOCUMENT_…",7e-06
"""FLAG_DOCUMENT_…",2.3e-05
"""FLAG_DOCUMENT_…",4.2e-05
"""FLAG_DOCUMENT_…",8.1e-05
"""FLAG_DOCUMENT_…",0.000192
"""FLAG_DOCUMENT_…",0.000267
"""FLAG_DOCUMENT_…",0.000335
"""FLAG_DOCUMENT_…",0.000507


In [35]:
low_var_fatures = []
for feature in feature_variance.filter(pl.col("variance") < 0.001)["feature"]:
    if application_edited_train[feature].n_unique() < 10:
        low_var_fatures.append(feature)
        minority_count = (
            application_edited_train[feature].value_counts().sort("counts")["counts"][0]
        )
        print(
            f"{feature}: minority class count = {minority_count} feature will be removed"
        )

FLAG_MOBIL: minority class count = 1 feature will be removed
FLAG_DOCUMENT_2: minority class count = 13 feature will be removed
FLAG_DOCUMENT_4: minority class count = 25 feature will be removed
FLAG_DOCUMENT_7: minority class count = 59 feature will be removed
FLAG_DOCUMENT_10: minority class count = 7 feature will be removed
FLAG_DOCUMENT_12: minority class count = 2 feature will be removed
FLAG_DOCUMENT_17: minority class count = 82 feature will be removed
FLAG_DOCUMENT_19: minority class count = 183 feature will be removed
FLAG_DOCUMENT_20: minority class count = 156 feature will be removed
FLAG_DOCUMENT_21: minority class count = 103 feature will be removed


In [36]:
application_edited_train = application_edited_train.drop(columns=low_var_fatures)
cols_to_remove.extend(low_var_fatures)

In [37]:
# test_var_removal = eda.test_with_catboost_crossval(
#     application_edited_train.drop(columns=id_and_target),
#     application_edited_train["TARGET"],
#     string_cols_reduced,
#     sample_size=10000,
# )

# print(test_var_removal["scores"])


In [38]:
feature_correlation_test = st.get_correlation_pairs(
    application_edited_encoded.drop(columns=id_and_target),
    max_threshold=0.999,
    min_threshold=-0.999,
)
print(tabulate(feature_correlation_test['clusters']))

-----------------------------------------  ----------------------------------------  ----------------------------------------  ----------------------------------------  -----------------------------------------  ------------------------------------
APARTMENTS_MEDI                            APARTMENTS_AVG
BASEMENTAREA_MEDI                          BASEMENTAREA_AVG
YEARS_BEGINEXPLUATATION_AVG                YEARS_BEGINEXPLUATATION_MEDI
YEARS_BUILD_MEDI                           YEARS_BUILD_MODE                          YEARS_BUILD_AVG
COMMONAREA_MEDI                            COMMONAREA_AVG                            COMMONAREA_MODE
ELEVATORS_MEDI                             ELEVATORS_AVG
ENTRANCES_AVG                              ENTRANCES_MEDI
FLOORSMAX_MEDI                             FLOORSMAX_AVG
FLOORSMIN_MODE                             FLOORSMIN_MEDI                            FLOORSMIN_AVG
LANDAREA_MEDI                              LANDAREA_AVG
LIVINGAPARTMENTS_MODE           

In [39]:
highly_correlated_to_drop=[]
for cluster in feature_correlation_test['clusters']:
    highly_correlated_to_drop.extend(list(cluster)[1:])
application_edited_train = application_edited_train.drop(columns=highly_correlated_to_drop)
cols_to_remove.extend(highly_correlated_to_drop)

In [40]:
application_full=application_full.drop(columns=cols_to_remove)
application_full.filter(pl.col('TARGET').is_not_null()).write_parquet('temp/application_train_filtered.parquet')
application_full.filter(pl.col('TARGET').is_null()).write_parquet('temp/application_test_filtered.parquet')