- Read data
- Filter the data by too much missing values
- Aggregation pipeline
- Merge pipeline
- Train base line model

In [1]:
import pandas as pd
import polars as pl
import time

Config

In [2]:
train_path = './home-credit-credit-risk-model-stability/csv_files/train/'
test_path = './home-credit-credit-risk-model-stability/csv_files/test/'

### Help functions

In [3]:
def set_dtypes(df : pl.DataFrame) -> pl.DataFrame:
    for col in df.columns:
        if col[-1] in ("P", "A"):
            df = df.with_columns(pl.col(col).cast(pl.Float64))
        elif col[-1] in ("D"):
            df = df.with_columns(pl.col(col).cast(pl.Date))
        elif col[-1] in ("M"):
            df = df.with_columns(pl.col(col).cast(pl.String))
        elif col in ['date_decision']:
            df = df.with_columns(pl.col(col).cast(pl.Date))
            
    return df

In [4]:
# drop the column which has 95% null

def missing_value_pct(df : pl.DataFrame, base_columns : list, target_columns : list, threshold : float) -> pl.DataFrame:
    df_base = df.select(pl.col(base_columns).n_unique())
    
    df_sele = df.select(target_columns)
    df_missing = df_sele.select([pl.col(col).is_null().sum() for col in target_columns])
    
    result = pl.concat([df_base, df_missing], how='horizontal')
    
    uni_base = df.select(pl.col(base_columns)).n_unique()
    
    result = result.select([
        pl.col(col)/uni_base for col in result.columns
    ])
    
    result = result.transpose(include_header=True).rename({"column":"column_name", "column_0":"missing_pct"}) \
                    .sort(by='missing_pct', descending=True)
    
    sele_cols = result.filter(pl.col('missing_pct') < threshold).select(pl.col('column_name'))

    df_result = df.select(pl.col(base_columns + [col for col in target_columns if col in sele_cols.select(pl.col('column_name')).to_series()]))
    
    return df_result  

### Depth 0

base 

In [5]:
train_base = pl.read_csv(train_path +'train_base.csv')
train_base.head(5)

case_id,date_decision,MONTH,WEEK_NUM,target
i64,str,i64,i64,i64
0,"""2019-01-03""",201901,0,0
1,"""2019-01-03""",201901,0,0
2,"""2019-01-04""",201901,0,0
3,"""2019-01-03""",201901,0,0
4,"""2019-01-04""",201901,0,1


In [6]:
train_base.shape

(1526659, 5)

In [7]:
train_statistic_0 = pl.read_csv(train_path + 'train_static_0_0.csv').pipe(set_dtypes)
train_statistic_1 = pl.read_csv(train_path + 'train_static_0_1.csv').pipe(set_dtypes)
train_statistic = pl.concat([train_statistic_0, train_statistic_1], how='vertical_relaxed')

In [8]:
train_cb_0 = pl.read_csv(train_path + 'train_static_cb_0.csv')

In [9]:
train_statistic_miss_fil = missing_value_pct(train_statistic, ['case_id'], [col for col in train_statistic.columns if col != 'case_id'], 0.90)
train_cb_miss_fil = missing_value_pct(train_cb_0, ['case_id'], [col for col in train_cb_0.columns if col != "case_id"], 0.90)

### Depth 1

- aggregate by case_is, num_group1

applprev_1

In [10]:
train_applprev_1_0 = pl.read_csv(train_path + 'train_applprev_1_0.csv').pipe(set_dtypes)
train_applprev_1_1 = pl.read_csv(train_path + 'train_applprev_1_1.csv').pipe(set_dtypes)
train_applprev = pl.concat([train_applprev_1_0, train_applprev_1_1], how='vertical_relaxed')

In [11]:
base_columns = ['case_id', 'num_group1']
tar_columns = [col for col in train_applprev.columns if col not in base_columns]

train_applprev_miss_fil = missing_value_pct(train_applprev, base_columns, tar_columns, 0.90)

other

In [12]:
train_other = pl.read_csv(train_path + 'train_other_1.csv').pipe(set_dtypes)

In [13]:
base_columns = ['case_id', 'num_group1']
tar_columns = [col for col in train_other.columns if col not in base_columns]
train_other_miss_fil = missing_value_pct(train_other, base_columns, tar_columns, 0.90)

tax_registry_a_1

In [14]:
tax_registry_a_1 = pl.read_csv(train_path + 'train_tax_registry_a_1.csv').pipe(set_dtypes)

In [15]:
train_credit_bureau_a_1_0 = pl.read_csv(train_path + 'train_credit_bureau_a_1_0.csv').pipe(set_dtypes)

In [16]:
base_columns = ['case_id', 'num_group1']
tar_columns = [col for col in train_credit_bureau_a_1_0.columns if col not in base_columns]
train_credit_bureau_a_10_miss_fil = missing_value_pct(train_credit_bureau_a_1_0, base_columns, tar_columns, 0.90)

In [17]:
train_credit_bureau_a_10_miss_fil[['case_id']].n_unique()

335275

### Depth 2

- aggregate by case_is, num_group1

applprev_2

In [18]:
train_applprev_2 = pl.read_csv(train_path + 'train_applprev_2.csv').pipe(set_dtypes)

In [19]:
base_columns = ['case_id', 'num_group1', 'num_group2']
tar_columns = [col for col in train_applprev_2.columns if col not in base_columns]
train_applprev_2_miss_fil = missing_value_pct(train_applprev_2, base_columns, tar_columns, 0.90)

In [22]:
# conts_type_50L is Person contact type in previous application.
# Aggregate by the mode times of the type
train_applprev_2_agg = train_applprev_2_miss_fil.group_by(['case_id','conts_type_509L']).agg(pl.count().alias('type_count'))

  train_applprev_2_agg = train_applprev_2_miss_fil.group_by(['case_id','conts_type_509L']).agg(pl.count().alias('type_count'))


person_2

In [23]:
train_person_2 = pl.read_csv(train_path + 'train_person_2.csv').pipe(set_dtypes)

In [39]:
base_columns = ['case_id', 'num_group1', 'num_group2']
tar_columns = [col for col in train_person_2.columns if col not in base_columns]
train_person_2_miss_fil = missing_value_pct(train_person_2, base_columns, tar_columns, 0.90)

In [40]:
train_person_2_miss_fil.filter(pl.col('case_id') == 6)

case_id,num_group1,num_group2,addres_district_368M,addres_zip_823M,conts_role_79M,empls_economicalst_849M,empls_employer_name_740M
i64,i64,i64,str,str,str,str,str
6,0,0,"""P55_110_32""","""P10_68_40""","""P38_92_157""","""P164_110_33""","""a55475b1"""
6,0,1,"""P55_110_32""","""P10_68_40""","""a55475b1""","""a55475b1""","""a55475b1"""
6,1,0,"""P204_92_178""","""P65_136_169""","""P38_92_157""","""P164_110_33""","""a55475b1"""
6,1,1,"""P191_109_75""","""P10_68_40""","""P7_147_157""","""a55475b1""","""a55475b1"""
6,1,2,"""P204_92_178""","""P164_28_170""","""P38_92_157""","""a55475b1""","""a55475b1"""
6,1,3,"""P55_110_32""","""P10_68_40""","""P38_92_157""","""a55475b1""","""a55475b1"""
6,1,4,"""P204_92_178""","""P65_136_169""","""a55475b1""","""a55475b1""","""a55475b1"""
6,1,5,"""P204_92_178""","""P164_28_170""","""a55475b1""","""a55475b1""","""a55475b1"""


In [68]:
def get_mode_value(primary_keys, value):
    return (
        pl.col(value)
        .drop_nulls()
        .mode()
        .first()
        .over(primary_keys)
        .alias(f'mode_{value}')
    )

In [69]:
columns_to_process = ['addres_district_368M', 'addres_zip_823M', 'conts_role_79M', 'empls_economicalst_849M', 'empls_employer_name_740M']
train_person_2_agg = train_base.select(pl.col(['case_id']))
for col in columns_to_process:
    temp = (train_person_2_miss_fil.with_columns([
                get_mode_value(primary_keys=['case_id'], value=col)
            ])
            .select(pl.col(['case_id',f'mode_{col}']))
            .unique())
    
    train_person_2_agg = train_person_2_agg.join(temp, on='case_id', how='left')