In [1]:
import numpy as np
import polars as pl
import pandas as pd

from data_prep_utilities import *
from dataset_descriptions import dataset_full, dataset_example

from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import SimpleImputer, IterativeImputer
import copy

from imblearn.over_sampling import SMOTE 
from imblearn.combine import SMOTEENN # alternative: adds undersampling to remove noisy data, but much slower!

# Data Preparation Notebook

This notebook loads the data, performs feature selection and engineering, and joins the tables. The end result is a Train/Val/Test split, to be used for any model training.

## Data Explanation

A couple of notes on data interpretation:

Where predictors were transformed, the kind of transformation is indicated by a capital letter appended to the predictor name (e.g. `actualdpd_943P`):
* P - Transform DPD (Days past due)
* M - Masking categories
* A - Transform amount
* D - Transform date
* T - Unspecified Transform
* L - Unspecified Transform

On depths: depth of a table refers to how many num_group# columns are used to index. Each case_id is only featured once for each unique set of indices, although it may not have a listing for every set. The indexing is not necessarily chronological either; dates where num_group1 == 2 may be earlier than dates where num_group1 == 0. It may be useful to pull summary information for each case_id, e.g. min, max, median, fraction_empty.

In [2]:
# Exploration aid: load the table that pairs each feature name with its description.
dataPath = "/kaggle/input/home-credit-credit-risk-model-stability/"
feature_definitions = pl.read_csv(f"{dataPath}feature_definitions.csv")
print(feature_definitions.head())

shape: (5, 2)
┌─────────────────────────┬───────────────────────────────────┐
│ Variable                ┆ Description                       │
│ ---                     ┆ ---                               │
│ str                     ┆ str                               │
╞═════════════════════════╪═══════════════════════════════════╡
│ actualdpd_943P          ┆ Days Past Due (DPD) of previous … │
│ actualdpdtolerance_344P ┆ DPD of client with tolerance.     │
│ addres_district_368M    ┆ District of the person's address… │
│ addres_role_871L        ┆ Role of person's address.         │
│ addres_zip_823M         ┆ Zip code of the address.          │
└─────────────────────────┴───────────────────────────────────┘


In [3]:
# Exploration: pick a single table (or set of tables) to inspect on its own.
# The keys of df_info are forwarded to load_df as keyword arguments.
df_info = dict(
    name="tax_registry_c_1",
    depth=2,
    feature_types=["A", "M"],
)
train_df, submit_df = load_df(**df_info)
train_df.head()

case_id,max_employername_160M,max_pmtamount_36A
i64,str,f64
1550762,"""717ea6c3""",2550.0
691594,"""67b8af22""",1211.0
1487238,"""aaa40643""",6269.8003
1529617,"""67f075e6""",7302.4
1561301,"""7daec9f3""",6800.0


In [None]:
# Build a generator that steps through the loaded features and their descriptions.
cols = train_df.columns
if df_info['depth'] > 0:
    # Columns of aggregated tables carry a 4-character prefix (e.g. "max_");
    # drop it so the names line up with the 'Variable' column of feature_definitions.
    cols = [name[4:] for name in cols]
pl.Config.set_tbl_width_chars(100)
desc = feature_definitions.filter(pl.col('Variable').is_in(cols)).rows()


def next_row(desc):
    """Print one feature name and its description per step.

    `desc` is a sequence of rows whose first item is the feature name and
    whose second item is the description. The generator yields None; it is
    advanced only for its printing side effect.
    """
    for entry in desc:
        feature_name = entry[0]
        feature_text = entry[1]
        print(feature_name, ":")
        print(feature_text)
        yield


row = next_row(desc)
# number of features that have a description available
print(len(desc))

In [None]:
# Advance the generator by one step: prints the next feature name and its description.
# Re-run this cell to keep stepping; it raises StopIteration once every row has been shown.
next(row)

# Load Data

## Example: Generating splits from dataset descriptions

Below is a small example dataset description; in fact, it describes the same dataset used in the starter notebook.

In [None]:
# ---------------------------------------------------------------------------
# Dataset description: each entry holds the keyword arguments for load_df.
#   description   notes to self; ignored by the load functions
#   name          taken from the actual file name, ignoring extra info
#                 (e.g., train/train_{NAME}_1.csv)
#   features      (default: all) columns to keep; all others are ignored
#   feature_types (default: all) of the kept features, select only those
#                 ending with these tags
#   depth         (default: 0) from the kaggle description; if >0,
#                 aggregation will be performed
#   aggs          (default: ["agg_max"]) which aggregations to use
#                 (from agg_max, agg_min, agg_median)
# NOTE: this assignment rebinds the `dataset_example` name that the first
# cell imports from dataset_descriptions.
# ---------------------------------------------------------------------------
dataset_example = {
    "base": {
        "description": "links case_id to WEEK_NUM and target",
        "name": "base",
    },
    "static_0": {
        "description": "contains transaction history for each case_id (late payments, total debt, etc)",
        "name": "static_0",
        "feature_types": ["A", "M"],
    },
    "static_cb": {
        "description": "data from an external cb: demographic data, risk assessment, number of credit checks",
        "name": "static_cb",
        "feature_types": ["A", "M"],
    },
    "person_1_feats_1": {
        "description": " internal demographic information: zip code, marital status, gender etc (all hashed)",
        "name": "person_1",
        "features": [
            "mainoccupationinc_384A",
            "incometype_1044T",
        ],
        "depth": 1,
    },
    "person_1_feats_2": {
        "description": " internal demographic information: zip code, marital status, gender etc (all hashed)",
        "name": "person_1",
        "features": [
            "housetype_905L",
        ],
        "depth": 1,
    },
    "credit_bureau_b_2": {
        "description": "historical data from an external source, num and value of overdue payments",
        "name": "credit_bureau_b_2",
        "features": [
            "pmts_pmtsoverdue_635A",
            "pmts_dpdvalue_108P",
        ],
        "depth": 2,
    },
}

We call load_all_dfs to load the specified datasets from csv, select features, aggregate as indicated, then join all.

In [4]:
# Load every table listed in dataset_example and join them into one training frame and one submission frame.
# Note: this rebinds train_df, replacing the single-table exploration frame loaded earlier in the notebook.
train_df, submission_df = load_all_dfs(dataset_example)

We will only use `submission_df` at the end: we save our model's predictions on the `submission_df` data for Kaggle to evaluate. `train_df` is passed to our split function, which returns the splits ready for scaling and training. We also pass in `submission_df` so that it is put into the same format.

In [5]:
# Split the joined training frame; submission_df is passed along so it ends up in the same format as the splits.
# train_split=0.6 is presumably the fraction of rows assigned to the training split — confirm in data_prep_utilities.
data = train_val_test_split(train_df, submission_df, train_split=0.6)

"data" has attributes for each split (train, val, test, and submit), with each split containing 
* base (case_id, WEEK_NUM, target)
* X (all predictor columns)
* y (target only; not present for submission, of course)

In [6]:
# Preview the predictor columns of the training split.
data.train.X.head()

Unnamed: 0,amtinstpaidbefduel24m_4187115A,annuity_780A,annuitynextmonth_57A,avginstallast24m_3658937A,avglnamtstart24m_4525187A,avgoutstandbalancel6m_4187114A,avgpmtlast12m_4525200A,credamount_770A,currdebt_22A,currdebtcredtyperange_828A,...,maritalst_893M,pmtaverage_3A,pmtaverage_4527227A,pmtaverage_4955615A,pmtssum_45A,max_mainoccupationinc_384A,max_incometype_1044T,max_housetype_905L,max_pmts_pmtsoverdue_635A,max_pmts_dpdvalue_108P
0,,1917.6,0.0,,,,,30000.0,0.0,0.0,...,,,,,,10800.0,SALARIED_GOVT,,,
1,,4937.0,0.0,,,,,78000.0,0.0,0.0,...,,,,,,14000.0,EMPLOYED,,,
2,,3600.0,0.0,,,,,60000.0,0.0,0.0,...,,,,,,64000.0,PRIVATE_SECTOR_EMPLOYEE,,,
3,,3110.8,0.0,,,,,20000.0,0.0,0.0,...,,,,,,20000.0,EMPLOYED,,,
4,,1218.0,0.0,,,,,20300.0,0.0,0.0,...,,,,,,46000.0,SALARIED_GOVT,,,


## Feature Engineering
Now that our training data is isolated, we can begin feature engineering while minimizing data leakage.

### Missing Data
To start, we need to handle any missing data. We will drop features if they are below a certain completeness threshold in our training data. We may also impute missing data in more complete features if our model requires it.

In [7]:
# --- Missing-data policy (all thresholds are fractions) ---

# A feature is excluded when more than this fraction of its values is NaN.
MAX_MISSING_FEATURE = 0.45
# When at least this fraction of a feature is missing, an indicator column marks the imputed values.
MISSING_INDICATOR_THRESHOLD = 0.1
# A data point is excluded when more than this fraction of its values is missing.
MAX_MISSING_INSTANCE = 0.75

# Imputer that fills each feature's missing values with that feature's most frequent value.
imp = SimpleImputer(strategy='most_frequent', missing_values=np.nan)


In [8]:
# Clean the X sets according to the missing-data thresholds and imputer configured above.
# NOTE(review): the output recorded for this cell is "NameError: name 'copy' is not defined".
# Presumably handle_missing_data relies on `copy` being imported inside data_prep_utilities —
# verify and fix there before relying on data_no_na.
data_no_na = handle_missing_data(
    data,
    max_missing_feature=MAX_MISSING_FEATURE,
    max_missing_instance=MAX_MISSING_INSTANCE,
    missing_indicator_threshold=MISSING_INDICATOR_THRESHOLD,
    imputer=imp,
)

NameError: name 'copy' is not defined

In [None]:
# Columns dropped by the cleaning step, with the fraction of training rows in which each is missing.
X_train = data.train.X
X_clean = data_no_na.train.X
dropped_columns = list(set(X_train.columns) - set(X_clean.columns))
missing_fraction = X_train[dropped_columns].isna().sum() / len(X_train)
missing_fraction.sort_values()

### Categorical data to dummy variables
Next, convert the categorical columns in all splits to dummy variables:

In [None]:
# Maximum number of category dummies kept per categorical feature.
MAX_DUMMIES = 5
# Each categorical also gets two additional dummies: "Unknown" and "nan".
# If a feature has more categories than the maximum, the least common ones are compressed into "Unknown".

In [None]:
# Dummy creation must be performed for all splits in a single call:
# values in X_test that are not found in X_train need to be marked 'Unknown',
# even if they are the most frequent values in X_test.
data_with_dummies = cat_to_dummies(data_no_na, max_categories=MAX_DUMMIES)

In [None]:
# Preview the training predictors after dummy encoding.
data_with_dummies.train.X.head()

### SMOTE for target imbalance
We need to fix the dataset imbalance. For this we will use imbalanced-learn's implementation of SMOTE. Note that this implementation requires no NaN values, and cannot handle strings.

In [None]:
# Class counts of the training target before oversampling.
data_with_dummies.train.y.value_counts()

In [None]:
# Oversample the training split only; the other splits are not passed to the oversampler.
# A fixed random_state makes the synthetic samples reproducible across kernel restarts
# (an unseeded SMOTE produces different samples on every run).
SMOTE_RANDOM_STATE = 42
oversampler = SMOTE(random_state=SMOTE_RANDOM_STATE)
X_train_smote, y_train_smote = oversampler.fit_resample(
    data_with_dummies.train.X,
    data_with_dummies.train.y,
)
# NOTE: the training split is overwritten in place, so re-running this cell
# feeds the already-resampled data back into the oversampler.
data_with_dummies.train.X = X_train_smote
data_with_dummies.train.y = y_train_smote

In [None]:
# Class counts of the training target after oversampling.
y_train_smote.value_counts()

### Pipeline
Instead of following the above steps, we could pass the parameters to data_prep_pipeline:

In [None]:
# Keyword arguments for data_prep_pipeline, mirroring the values used in the step-by-step cells.
# (Dict entries use `key: value` separated by commas; the previous `key=value` form was a SyntaxError.)
parameters = {
    "train_split": 0.6,
    "max_missing_feature": 0.45,
    "max_missing_instance": 0.75,
    "missing_indicator_threshold": 0.1,
    "imputer": SimpleImputer(missing_values=np.nan, strategy='most_frequent'),
    "string_imputer": None,
    "max_categories": 5,
    "oversampler": SMOTE(),
}

# # this may take a while...
# data = data_prep_pipeline(dataset_example, **parameters)

    
    