- Read data
- Filter the data by too much missing values
- Aggregation pipeline
- Merge pipeline
- Train base line model

In [1]:
import pandas as pd
import polars as pl
import time

Config

In [4]:
train_path = './home-credit-credit-risk-model-stability/csv_files/train/'
test_path = './home-credit-credit-risk-model-stability/csv_files/test/'

### Help functions

In [5]:
def set_dtypes(df : pl.DataFrame) -> pl.DataFrame:
    for col in df.columns:
        if col[-1] in ("P", "A"):
            df = df.with_columns(pl.col(col).cast(pl.Float64))
        elif col[-1] in ("D"):
            df = df.with_columns(pl.col(col).cast(pl.Date))
        elif col[-1] in ("M"):
            df = df.with_columns(pl.col(col).cast(pl.String))
        elif col in ['date_decision']:
            df = df.with_columns(pl.col(col).cast(pl.Date))
            
    return df

In [6]:
# drop the column which has 95% null

def missing_value_pct(df : pl.DataFrame, base_columns : list, target_columns : list, threshold : float) -> pl.DataFrame:
    df_base = df.select(pl.col(base_columns).n_unique())
    
    df_sele = df.select(target_columns)
    df_missing = df_sele.select([pl.col(col).is_null().sum() for col in target_columns])
    
    result = pl.concat([df_base, df_missing], how='horizontal')
    
    uni_base = df.select(pl.col(base_columns)).n_unique()
    
    result = result.select([
        pl.col(col)/uni_base for col in result.columns
    ])
    
    result = result.transpose(include_header=True).rename({"column":"column_name", "column_0":"missing_pct"}) \
                    .sort(by='missing_pct', descending=True)
    
    sele_cols = result.filter(pl.col('missing_pct') < threshold).select(pl.col('column_name'))

    df_result = df.select(pl.col(base_columns + [col for col in target_columns if col in sele_cols.select(pl.col('column_name')).to_series()]))
    
    return df_result  

### Depth 0

base 

In [7]:
train_base = pl.read_csv(train_path +'train_base.csv')
train_base.head(5)

case_id,date_decision,MONTH,WEEK_NUM,target
i64,str,i64,i64,i64
0,"""2019-01-03""",201901,0,0
1,"""2019-01-03""",201901,0,0
2,"""2019-01-04""",201901,0,0
3,"""2019-01-03""",201901,0,0
4,"""2019-01-04""",201901,0,1


In [8]:
train_base.shape

(1526659, 5)

In [9]:
train_statistic_0 = pl.read_csv(train_path + 'train_static_0_0.csv').pipe(set_dtypes)
train_statistic_1 = pl.read_csv(train_path + 'train_static_0_1.csv').pipe(set_dtypes)
train_statistic = pl.concat([train_statistic_0, train_statistic_1], how='vertical_relaxed')

In [10]:
train_cb_0 = pl.read_csv(train_path + 'train_static_cb_0.csv')

In [11]:
train_statistic_miss_fil = missing_value_pct(train_statistic, ['case_id'], [col for col in train_statistic.columns if col != 'case_id'], 0.90)
train_cb_miss_fil = missing_value_pct(train_cb_0, ['case_id'], [col for col in train_cb_0.columns if col != "case_id"], 0.90)

### Depth 1

- aggregate by case_is, num_group1

applprev_1

In [12]:
train_applprev_1_0 = pl.read_csv(train_path + 'train_applprev_1_0.csv').pipe(set_dtypes)
train_applprev_1_1 = pl.read_csv(train_path + 'train_applprev_1_1.csv').pipe(set_dtypes)
train_applprev = pl.concat([train_applprev_1_0, train_applprev_1_1], how='vertical_relaxed')

In [13]:
base_columns = ['case_id', 'num_group1']
tar_columns = [col for col in train_applprev.columns if col not in base_columns]

train_applprev_miss_fil = missing_value_pct(train_applprev, base_columns, tar_columns, 0.90)

other

In [14]:
train_other = pl.read_csv(train_path + 'train_other_1.csv').pipe(set_dtypes)

In [15]:
base_columns = ['case_id', 'num_group1']
tar_columns = [col for col in train_other.columns if col not in base_columns]
train_other_miss_fil = missing_value_pct(train_other, base_columns, tar_columns, 0.90)

tax_registry_a_1

In [17]:
tax_registry_a_1 = pl.read_csv(train_path + 'train_tax_registry_a_1.csv').pipe(set_dtypes)

In [39]:
train_credit_bureau_a_1_0 = pl.read_csv(train_path + 'train_credit_bureau_a_1_0.csv').pipe(set_dtypes)

In [40]:
base_columns = ['case_id', 'num_group1']
tar_columns = [col for col in train_credit_bureau_a_1_0.columns if col not in base_columns]
train_credit_bureau_a_10_miss_fil = missing_value_pct(train_credit_bureau_a_1_0, base_columns, tar_columns, 0.90)

In [42]:
train_credit_bureau_a_10_miss_fil[['case_id']].n_unique()

335275

### Depth 2

- aggregate by case_is, num_group1

applprev_2

In [43]:
train_applprev_2 = pl.read_csv(train_path + 'train_applprev_2.csv').pipe(set_dtypes)

In [59]:
base_columns = ['case_id', 'num_group1', 'num_group2']
tar_columns = [col for col in train_applprev_2.columns if col not in base_columns]
train_applprev_2_miss_fil = missing_value_pct(train_applprev_2, base_columns, tar_columns, 0.90)

In [63]:
# conts_type_50L is Person contact type in previous application.
# Aggregate by the mode times of the type
train_applprev_2_miss_fil.group_by(['case_id','conts_type_509L']).agg(pl.count().alias('type_count'))

  train_applprev_2_miss_fil.group_by(['case_id','conts_type_509L']).agg(pl.count().alias('type_count'))


case_id,conts_type_509L,type_count
i64,str,u32
3,"""PRIMARY_EMAIL""",1
4,"""PRIMARY_MOBILE…",1
23,"""PRIMARY_MOBILE…",1
24,"""HOME_PHONE""",2
90,"""HOME_PHONE""",2
…,…,…
2703435,"""PHONE""",2
2703437,"""HOME_PHONE""",6
2703439,"""PRIMARY_EMAIL""",2
2703443,"""HOME_PHONE""",4
