- Read data
- Filter out columns with too many missing values
- Aggregation pipeline
- Merge pipeline
- Train baseline model

In [1]:
import pandas as pd
import polars as pl
import glob

Config

In [2]:
# Directory holding the training CSV files. File paths are built by plain
# string concatenation, so the trailing slash is required.
train_path = './home-credit-credit-risk-model-stability/csv_files/train/'
# Directory holding the test CSV files (same trailing-slash convention).
test_path = './home-credit-credit-risk-model-stability/csv_files/test/'
# Directory where the filtered / aggregated CSV files are written and later re-read.
result_path = './home-credit-credit-risk-model-stability/csv_files/results/'

# Missing-value threshold handed to data_cleaning(): a feature column is kept
# only when its missing ratio is strictly below this value.
missing_filter_threshold = 0.9

### Helper functions

In [3]:
def set_dtypes(df : pl.DataFrame) -> pl.DataFrame:
    """Cast columns to dtypes derived from their names.

    Rules, checked in this order for every column:

    - ``date_decision``            -> ``pl.Date``
    - ``WEEK_NUM``                 -> left unchanged
    - name ends with ``P`` or ``A`` -> ``pl.Float64``
    - name ends with ``D``          -> ``pl.Date``
    - name ends with ``M``          -> ``pl.String``

    Columns matching none of the rules keep their current dtype.

    Args:
        df: Frame whose columns should be cast.

    Returns:
        A new frame with the casts applied; ``df`` itself is not modified.
    """
    casts = []
    for col in df.columns:
        if col == 'date_decision':
            casts.append(pl.col(col).cast(pl.Date))
        elif col == 'WEEK_NUM':
            # WEEK_NUM is an integer bookkeeping column that merely happens to
            # end in "M"; the suffix rule below would turn it into a string.
            continue
        # str.endswith is used instead of col[-1] so an empty column name
        # does not raise IndexError.
        elif col.endswith(('P', 'A')):
            casts.append(pl.col(col).cast(pl.Float64))
        elif col.endswith('D'):
            casts.append(pl.col(col).cast(pl.Date))
        elif col.endswith('M'):
            casts.append(pl.col(col).cast(pl.String))

    # Apply every cast in one with_columns call instead of rebuilding the
    # frame once per column.
    return df.with_columns(casts) if casts else df

In [4]:
# Drop the feature columns whose share of null values reaches `threshold`.

def col_filter_by_high_missing_pct(df : pl.DataFrame, base_columns : list, target_columns : list, threshold : float) -> pl.DataFrame:
    """Keep the base columns plus the target columns that are not mostly null.

    The missing ratio of a target column is ``null_count / df.height``. A
    target column is kept only when that ratio is strictly below
    ``threshold``; base columns are always kept.

    Args:
        df: Input frame.
        base_columns: Key columns that are always retained.
        target_columns: Candidate columns to be filtered by missing ratio.
        threshold: Upper bound (exclusive) on the allowed missing ratio.

    Returns:
        A frame with ``base_columns`` followed by the surviving target
        columns, in their original ``target_columns`` order.
    """
    base_columns = list(base_columns)
    target_columns = list(target_columns)

    n_rows = df.height
    # With no rows (or no candidates) there is no evidence that any column is
    # missing, so nothing is dropped; this also avoids a 0/0 ratio.
    if n_rows == 0 or not target_columns:
        return df.select(base_columns + target_columns)

    # One-row frame of null counts, read back as a plain tuple aligned with
    # target_columns.
    null_counts = df.select(
        [pl.col(col).is_null().sum() for col in target_columns]
    ).row(0)

    kept_columns = [
        col
        for col, n_null in zip(target_columns, null_counts)
        if n_null / n_rows < threshold
    ]

    return df.select(base_columns + kept_columns)

In [5]:
def extract_tar_columns(df, base_columns):
    """Return the names in ``df.columns`` that are not listed in ``base_columns``.

    The original column order of ``df`` is preserved.
    """
    remaining = []
    for name in df.columns:
        if name in base_columns:
            continue
        remaining.append(name)
    return remaining

In [6]:
def data_cleaning(target_file, base_columns, missing_filter_threshold, train_path, result_path):
    """Load every CSV shard of a table, drop mostly-null columns, write the result.

    All files matching ``{train_path}{target_file}*.csv`` are read, cast with
    ``set_dtypes`` and stacked vertically. Columns other than ``base_columns``
    are then filtered with ``col_filter_by_high_missing_pct`` and the result
    is written to ``{result_path}{target_file}_miss_fil.csv``.

    Args:
        target_file: File name prefix of the table (without shard suffix / extension).
        base_columns: Key columns that are always kept.
        missing_filter_threshold: Missing-ratio threshold forwarded to the filter.
        train_path: Input directory prefix (concatenated as-is, so include the trailing slash).
        result_path: Output directory prefix (concatenated as-is, so include the trailing slash).

    Raises:
        FileNotFoundError: If no input file matches the pattern.
    """
    import os  # local import: only needed to create the output directory

    file_pattern = f'{train_path}{target_file}*.csv'
    # glob returns matches in arbitrary order; sorting makes the row order of
    # the stacked frame (and therefore the output file) reproducible.
    files = sorted(glob.glob(file_pattern))
    if not files:
        raise FileNotFoundError(f'No CSV files match pattern: {file_pattern}')

    df = pl.concat(
        [pl.read_csv(file).pipe(set_dtypes) for file in files],
        how='vertical_relaxed',
    )

    tar_columns = extract_tar_columns(df, base_columns)
    df_miss_fil = col_filter_by_high_missing_pct(df, base_columns, tar_columns, missing_filter_threshold)

    output_path = f'{result_path}{target_file}_miss_fil.csv'
    # Create the results directory on first use so the write does not fail.
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    df_miss_fil.write_csv(output_path)

### Depth 0

In [8]:
base_columns_depth0 = ['case_id']

base 

In [9]:
train_base = pl.read_csv(train_path +'train_base.csv')
train_base.head(5)

case_id,date_decision,MONTH,WEEK_NUM,target
i64,str,i64,i64,i64
0,"""2019-01-03""",201901,0,0
1,"""2019-01-03""",201901,0,0
2,"""2019-01-04""",201901,0,0
3,"""2019-01-03""",201901,0,0
4,"""2019-01-04""",201901,0,1


In [10]:
train_base.shape

(1526659, 5)

In [8]:
data_cleaning('train_static_0', base_columns_depth0, missing_filter_threshold, train_path, result_path)

In [9]:
data_cleaning('train_static_cb_0', base_columns_depth0, missing_filter_threshold, train_path, result_path)

### Depth 1

- aggregate by case_id, num_group1

In [42]:
base_columns_depth1 = ['case_id', 'num_group1']

applprev_1

In [13]:
data_cleaning('train_applprev_1', base_columns_depth1, missing_filter_threshold, train_path, result_path)

other

In [14]:
data_cleaning('train_other_1', base_columns_depth1, missing_filter_threshold, train_path, result_path)

credit_bureau_a_1

In [15]:
data_cleaning('train_credit_bureau_a_1', base_columns_depth1, missing_filter_threshold, train_path, result_path)

credit_bureau_a_2

In [None]:
# too big
# data_cleaning('train_credit_bureau_a_2', base_columns_depth1, missing_filter_threshold, train_path, result_path)

credit_bureau_b_1

In [16]:
data_cleaning('train_credit_bureau_b_1', base_columns_depth1, missing_filter_threshold, train_path, result_path)

tax_registry_a_1

In [17]:
data_cleaning('train_tax_registry_a_1', base_columns_depth1, missing_filter_threshold, train_path, result_path)

### Depth 2

- aggregate by case_id, num_group1, num_group2

In [18]:
base_columns_depth2 = ['case_id', 'num_group1', 'num_group2']

applprev_2

In [20]:
data_cleaning('train_applprev_2', base_columns_depth2, missing_filter_threshold, train_path, result_path)

person_2

In [21]:
data_cleaning('train_person_2', base_columns_depth2, missing_filter_threshold, train_path, result_path)

# Feature Engineering

### Depth 1

applprev_1

- rm: 
    - approvaldate_319D, credacc_credlmt_575A, dtlastpmt_581D, dtlastpmtallstes_3545839D
    - case_id and num_group1 where creationdate_885D == null
- fill_0: childnum_21L
- max, min, mean, std: 
    - actualdpd_943P, annuity_853A, byoccupationinc_3656910L, childnum_21L, credamount_590A, currdebt_94A, credacc_credlmt_575A, downpmt_134A
- mode: credtype_587L, district_544M
- last_value: currdebt_94A, childnum_21L, education_1138M

- check_later: employedfrom_700D, familystate_726L

In [28]:
train_applprev_1 = pl.read_csv(result_path + 'train_applprev_1_miss_fil.csv')

In [73]:
# train_applprev_1.select(pl.col('credacc_credlmt_575A')).describe()

In [78]:
train_applprev_1.filter(pl.col('case_id') == 2703450).join(train_base, on = ['case_id'], how='inner')

case_id,num_group1,actualdpd_943P,annuity_853A,approvaldate_319D,byoccupationinc_3656910L,cancelreason_3545846M,childnum_21L,creationdate_885D,credacc_credlmt_575A,credamount_590A,credtype_587L,currdebt_94A,dateactivated_425D,district_544M,downpmt_134A,dtlastpmt_581D,dtlastpmtallstes_3545839D,education_1138M,employedfrom_700D,familystate_726L,firstnonzeroinstldate_307D,inittransactioncode_279L,isbidproduct_390L,mainoccupationinc_437A,maxdpdtolerance_577P,outstandingdebt_522A,pmtnum_8L,postype_4733339M,profession_152M,rejectreason_755M,rejectreasonclient_4145042M,status_219L,tenor_203L,date_decision,MONTH,WEEK_NUM,target
i64,i64,f64,f64,str,f64,str,f64,str,f64,f64,str,f64,str,str,f64,str,str,str,str,str,str,str,bool,f64,f64,f64,f64,str,str,str,str,str,f64,str,i64,i64,i64
2703450,0,0.0,2102.2,"""2019-10-09""",,"""a55475b1""",,"""2019-10-09""",0.0,20020.0,"""COL""",0.0,"""2019-10-16""","""P123_39_170""",0.0,"""2020-09-08""","""2020-09-08""","""a55475b1""",,,"""2019-11-08""","""POS""",false,36000.0,0.0,0.0,12.0,"""P177_117_192""","""a55475b1""","""a55475b1""","""a55475b1""","""K""",12.0,"""2020-10-05""",202010,91,0
2703450,1,0.0,0.0,"""2019-01-07""",,"""a55475b1""",,"""2019-01-07""",0.0,0.0,"""REL""",0.0,"""2019-01-22""","""P123_39_170""",0.0,,"""2019-01-07""","""a55475b1""",,,,"""NDF""",false,,0.0,0.0,,"""P46_145_78""","""a55475b1""","""a55475b1""","""a55475b1""","""A""",,"""2020-10-05""",202010,91,0
2703450,2,0.0,3773.6,"""2019-01-07""",,"""a55475b1""",,"""2019-01-07""",0.0,60602.0,"""CAL""",0.0,"""2019-01-22""","""P123_39_170""",0.0,"""2020-10-08""","""2020-10-08""","""a55475b1""",,,"""2019-02-07""","""CASH""",false,24000.0,0.0,0.0,24.0,"""P46_145_78""","""a55475b1""","""P94_109_143""","""a55475b1""","""K""",24.0,"""2020-10-05""",202010,91,0
2703450,3,0.0,2474.6,"""2018-04-16""",,"""a55475b1""",,"""2018-04-16""",0.0,20000.0,"""CAL""",0.0,"""2018-04-20""","""P123_39_170""",0.0,"""2019-01-07""","""2019-01-07""","""a55475b1""",,"""MARRIED""","""2018-05-16""","""CASH""",false,26000.0,0.0,0.0,12.0,"""P46_145_78""","""a55475b1""","""P94_109_143""","""a55475b1""","""K""",12.0,"""2020-10-05""",202010,91,0
2703450,4,0.0,30875.0,,,"""P94_109_143""",,"""2017-11-30""",0.0,150000.0,"""CAL""",,,"""P123_39_170""",0.0,,,"""a55475b1""",,,"""2017-12-30""","""CASH""",true,20000.0,,,6.0,"""P46_145_78""","""a55475b1""","""P94_109_143""","""P94_109_143""","""D""",6.0,"""2020-10-05""",202010,91,0
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
2703450,8,0.0,3258.4001,"""2016-06-12""",,"""a55475b1""",2.0,"""2016-06-12""",0.0,24000.0,"""CAL""",0.0,"""2016-06-15""","""P123_39_170""",0.0,"""2017-05-08""","""2017-05-08""","""P97_36_170""","""1998-08-15""","""MARRIED""","""2016-07-15""","""CASH""",true,24000.0,6.0,0.0,12.0,"""P46_145_78""","""a55475b1""","""a55475b1""","""a55475b1""","""K""",12.0,"""2020-10-05""",202010,91,0
2703450,9,0.0,,,,"""P94_109_143""",,"""2016-02-02""",,,,,,"""P123_39_170""",,,,"""a55475b1""",,,,,false,20000.0,,,,"""P46_145_78""","""a55475b1""","""P99_56_166""","""P94_109_143""","""D""",,"""2020-10-05""",202010,91,0
2703450,10,0.0,2075.4001,"""2016-02-02""",0.0,"""a55475b1""",0.0,"""2016-01-31""",0.0,10000.0,"""CAL""",0.0,"""2016-02-08""","""P123_39_170""",0.0,"""2016-06-15""","""2016-06-15""","""P97_36_170""","""1998-08-15""","""MARRIED""","""2016-03-05""","""CASH""",false,24000.0,0.0,0.0,6.0,"""P46_145_78""","""P116_130_157""","""a55475b1""","""a55475b1""","""K""",6.0,"""2020-10-05""",202010,91,0
2703450,11,0.0,1225.4,"""2014-10-02""",1.0,"""a55475b1""",0.0,"""2014-10-02""",0.0,3276.0,"""COL""",0.0,"""2014-10-07""","""P123_39_170""",0.0,,,"""P97_36_170""","""1999-09-15""","""MARRIED""","""2014-11-01""","""POS""",false,7000.0,0.0,0.0,3.0,"""P149_40_170""","""a55475b1""","""a55475b1""","""a55475b1""","""K""",3.0,"""2020-10-05""",202010,91,0


In [82]:
train_applprev_1.filter(pl.col('cancelreason_3545846M') == 'P94_109_143').filter(pl.col('approvaldate_319D').is_not_null())

case_id,num_group1,actualdpd_943P,annuity_853A,approvaldate_319D,byoccupationinc_3656910L,cancelreason_3545846M,childnum_21L,creationdate_885D,credacc_credlmt_575A,credamount_590A,credtype_587L,currdebt_94A,dateactivated_425D,district_544M,downpmt_134A,dtlastpmt_581D,dtlastpmtallstes_3545839D,education_1138M,employedfrom_700D,familystate_726L,firstnonzeroinstldate_307D,inittransactioncode_279L,isbidproduct_390L,mainoccupationinc_437A,maxdpdtolerance_577P,outstandingdebt_522A,pmtnum_8L,postype_4733339M,profession_152M,rejectreason_755M,rejectreasonclient_4145042M,status_219L,tenor_203L
i64,i64,f64,f64,str,f64,str,f64,str,f64,f64,str,f64,str,str,f64,str,str,str,str,str,str,str,bool,f64,f64,f64,f64,str,str,str,str,str,f64
1419,1,0.0,6064.0,"""2018-10-15""",,"""P94_109_143""",,"""2018-12-27""",0.0,16400.0,"""COL""",0.0,"""2018-10-17""","""P2_93_127""",0.0,"""2018-12-21""","""2018-12-21""","""P33_146_175""",,"""MARRIED""","""2018-11-15""","""POS""",false,114000.0,0.0,0.0,3.0,"""a55475b1""","""a55475b1""","""P94_109_143""","""a55475b1""","""K""",3.0
1419,2,0.0,6064.0,"""2018-10-15""",,"""P94_109_143""",,"""2018-10-15""",0.0,16400.0,"""COL""",0.0,"""2018-10-17""","""P2_93_127""",0.0,"""2018-12-21""","""2018-12-21""","""P33_146_175""",,"""MARRIED""","""2018-11-15""","""POS""",false,40000.0,0.0,0.0,3.0,"""a55475b1""","""a55475b1""","""P94_109_143""","""a55475b1""","""K""",3.0
10582,1,0.0,5869.4,"""2019-01-30""",,"""P94_109_143""",,"""2019-01-30""",0.0,13680.0,"""COL""",0.0,"""2019-02-22""","""P169_111_32""",0.0,"""2019-04-10""","""2019-04-10""","""P97_36_170""","""2018-07-24""","""MARRIED""","""2019-03-02""","""POS""",false,52600.0,0.0,0.0,30.0,"""a55475b1""","""a55475b1""","""P99_56_166""","""P94_109_143""","""K""",30.0
27342,1,0.0,1742.0,"""2019-07-10""",0.0,"""P94_109_143""",0.0,"""2019-07-10""",0.0,14000.0,"""COL""",0.0,"""2019-07-19""","""P56_25_83""",0.0,"""2019-07-24""","""2019-07-24""","""P33_146_175""","""2017-01-15""","""SINGLE""","""2019-08-09""","""POS""",false,14000.0,0.0,0.0,3.0,"""a55475b1""","""P19_109_56""","""P45_84_106""","""P94_109_143""","""K""",3.0
36871,0,0.0,4932.2,"""2019-11-21""",,"""P94_109_143""",,"""2019-11-21""",0.0,90000.0,"""REL""",0.0,,"""P98_137_111""",0.0,,,"""a55475b1""",,,"""2019-01-28""","""NDF""",false,28000.0,,0.0,48.0,"""a55475b1""","""a55475b1""","""P198_131_9""","""P94_109_143""","""N""",48.0
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
2701442,0,0.0,14279.4,"""2019-01-07""",,"""P94_109_143""",,"""2020-10-12""",0.0,110000.0,"""CAL""",0.0,"""2019-01-30""","""P194_92_171""",0.0,"""2019-11-07""","""2019-11-07""","""a55475b1""",,,"""2020-11-11""","""CASH""",true,110000.0,0.0,0.0,12.0,"""P67_102_161""","""a55475b1""","""a55475b1""","""P94_109_143""","""K""",12.0
2701571,0,0.0,10641.4,"""2019-09-18""",,"""P94_109_143""",,"""2020-10-09""",0.0,150000.0,"""CAL""",84078.46,"""2019-09-25""","""P188_113_3""",0.0,,"""2020-09-24""","""a55475b1""",,,"""2020-11-08""","""CASH""",true,40000.0,0.0,102716.805,30.0,"""P67_102_161""","""a55475b1""","""a55475b1""","""P94_109_143""","""D""",30.0
2701980,3,0.0,0.0,"""2019-11-21""",,"""P94_109_143""",,"""2020-06-15""",100000.0,100000.0,"""REL""",0.0,"""2019-12-21""","""P31_88_142""",0.0,,"""2019-11-21""","""a55475b1""",,,,"""NDF""",false,60000.0,0.0,0.0,,"""P46_145_78""","""a55475b1""","""P99_56_166""","""P94_109_143""","""D""",
2701980,7,0.0,3059.8,"""2018-09-24""",,"""P94_109_143""",,"""2018-09-24""",0.0,22000.0,"""CAL""",0.0,"""2018-10-16""","""P31_88_142""",0.0,"""2019-09-24""","""2019-09-24""","""a55475b1""",,,"""2018-10-24""","""CASH""",false,50000.0,1.0,0.0,12.0,"""P177_117_192""","""a55475b1""","""P94_109_143""","""P94_109_143""","""K""",12.0


In [86]:
train_applprev_1.group_by(['cancelreason_3545846M']).agg(pl.col('case_id').n_unique()).sort('case_id')

cancelreason_3545846M,case_id
str,u32
"""P89_110_159""",1
"""P65_58_157""",1
"""P151_143_25""",2
"""P203_151_99""",2
"""P59_114_135""",3
…,…
"""P30_86_84""",55361
"""P180_60_137""",57583
"""P73_130_169""",59832
"""P94_109_143""",559178


In [83]:
train_base.filter(pl.col('case_id') == 1419)

case_id,date_decision,MONTH,WEEK_NUM,target
i64,str,i64,i64,i64
1419,"""2019-01-16""",201901,2,0


### Depth 2

train_applprev_2

In [23]:
train_applprev_2 = pl.read_csv(f'{result_path}train_applprev_2_miss_fil.csv')
# Number of rows per (case_id, conts_type_509L) pair.
# group_by does not guarantee the order of its output groups, so the result is
# sorted on the grouping keys to make the written CSV reproducible.
train_applprev_2_agg = (
    train_applprev_2
    .group_by(['case_id', 'conts_type_509L'])
    .agg(pl.len().alias('type_count'))
    .sort(['case_id', 'conts_type_509L'])
)
train_applprev_2_agg.write_csv(result_path + 'train_applprev_2_agg.csv')

person_2

In [10]:
train_person_2_miss_fil = pl.read_csv(result_path + 'train_person_2_miss_fil.csv')

In [23]:
# Per-case mode of selected person_2 columns, left-joined onto the full
# case_id list from train_base so cases without person_2 rows are kept
# (with nulls). Expects train_person_2_miss_fil to be loaded already.
columns_to_process = ['addres_district_368M', 'addres_zip_823M', 'conts_role_79M', 'empls_economicalst_849M', 'empls_employer_name_740M']

# A single group_by pass computes every mode at once instead of one
# group_by + join per column. mode() may return several tied values in no
# guaranteed order, so they are sorted before taking the first one to make
# the pick deterministic.
person_2_modes = train_person_2_miss_fil.group_by('case_id').agg([
    pl.col(col).mode().sort().first() for col in columns_to_process
])

train_person_2_agg = train_base.select(pl.col(['case_id'])).join(person_2_modes, on='case_id', how='left')

In [27]:
train_person_2_agg.write_csv(result_path + 'train_person_2_mode.csv')