# 1. LOAD DATA

In [1]:
import pandas as pd
import numpy as np
import os
import gc
from category_encoders import WOEEncoder
import matplotlib.pyplot as plt
from sklearn.impute import SimpleImputer

In [2]:
pd.set_option('display.max_rows', 150)
pd.set_option('display.max_columns', 150)

In [3]:
application_train = pd.read_csv("../data/raw/application_train.csv")
application_test = pd.read_csv("../data/raw/application_test.csv")

# 2. PREPROCESSING

## 2.1 Drop irrelevant/high missing features

In [4]:
application_train.replace({'DAYS_EMPLOYED': {365243:np.nan},
                           'CODE_GENDER': {'XNA': 'F'}},
                           inplace= True)

application_test.replace({'DAYS_EMPLOYED': {365243:np.nan},
                           'CODE_GENDER': {'XNA': 'F'}},
                           inplace= True)

In [5]:
# drop hosing variables due to high of missing values in EDA

start = application_train.columns.get_loc('APARTMENTS_AVG')
end = application_train.columns.get_loc('EMERGENCYSTATE_MODE')

housing_df = application_train.iloc[:, start : end+1]

drop_cols=housing_df.columns.to_list()

# keep these 4 cateogrial variables
for x in ['FONDKAPREMONT_MODE', 'HOUSETYPE_MODE', 'WALLSMATERIAL_MODE', 'EMERGENCYSTATE_MODE']:
    drop_cols.remove(x)

In [6]:
# drop flag_document_ variables
drop_doc_cols = [col for col in application_train.columns if col.startswith("FLAG_DOCUMENT_") and col != "FLAG_DOCUMENT_3"]
drop_cols = drop_cols + drop_doc_cols

In [7]:
# drop AMT_REQ_CREDIT_BUREAU_ variable
drop_credit_request_cols = [col for col in application_train.columns if col.startswith('AMT_REQ_CREDIT_BUREAU_') and col != 'AMT_REQ_CREDIT_BUREAU_YEAR']
drop_cols = drop_cols + drop_credit_request_cols

In [8]:
append_list = ['WEEKDAY_APPR_PROCESS_START', 'FLAG_MOBIL', 'FLAG_CONT_MOBILE', 'FLAG_EMAIL', 'OBS_30_CNT_SOCIAL_CIRCLE', 'OBS_60_CNT_SOCIAL_CIRCLE', 'DEF_30_CNT_SOCIAL_CIRCLE',
               'REG_REGION_NOT_LIVE_REGION', 'REG_REGION_NOT_WORK_REGION', 'LIVE_REGION_NOT_WORK_REGION', 'REG_CITY_NOT_WORK_CITY', 'LIVE_CITY_NOT_WORK_CITY', 'REGION_RATING_CLIENT']

for col in append_list:
    drop_cols.append(col)

len(drop_cols)

80

In [9]:
application_train.drop(columns=drop_cols, inplace=True, errors='ignore')
application_test.drop(columns=drop_cols, inplace=True, errors='ignore')

## 2.2 Handle outliners

## 2.3 Impute missing values

In [10]:
def check_nan(col):
    num_nan = col.isnull().sum()
    num_per = num_nan/len(col) * 100
    col_miss = pd.concat([num_nan, num_per], axis=1, keys=['number_of_NaN', 'percentage_of_NaN']).sort_values(by='percentage_of_NaN', ascending = False).round(1)
    col_miss = col_miss[col_miss['number_of_NaN'] != 0]
    print('Num fields: ', col.shape[1])
    print('Num missing fields: ', col_miss.shape[0])
    return col_miss

In [11]:
check_nan(application_train)

Num fields:  42
Num missing fields:  17


Unnamed: 0,number_of_NaN,percentage_of_NaN
FONDKAPREMONT_MODE,210295,68.4
OWN_CAR_AGE,202929,66.0
EXT_SOURCE_1,173378,56.4
WALLSMATERIAL_MODE,156341,50.8
HOUSETYPE_MODE,154297,50.2
EMERGENCYSTATE_MODE,145755,47.4
OCCUPATION_TYPE,96391,31.3
EXT_SOURCE_3,60965,19.8
DAYS_EMPLOYED,55374,18.0
AMT_REQ_CREDIT_BUREAU_YEAR,41519,13.5


In [12]:
check_nan(application_test)

Num fields:  41
Num missing fields:  14


Unnamed: 0,number_of_NaN,percentage_of_NaN
FONDKAPREMONT_MODE,32797,67.3
OWN_CAR_AGE,32312,66.3
WALLSMATERIAL_MODE,23893,49.0
HOUSETYPE_MODE,23619,48.5
EMERGENCYSTATE_MODE,22209,45.6
EXT_SOURCE_1,20532,42.1
OCCUPATION_TYPE,15605,32.0
DAYS_EMPLOYED,9274,19.0
EXT_SOURCE_3,8668,17.8
AMT_REQ_CREDIT_BUREAU_YEAR,6049,12.4


Replace N/A value of numerical varibles by train median

In [None]:
# # Impute numerical features
# target = application_train['TARGET']
# features_train = application_train.drop('TARGET', axis=1)

# numeric_imputer = SimpleImputer(strategy='median') 
# # Train
# numeric_features_train = features_train.select_dtypes(exclude='object')
# imputed_numeric_features_train = numeric_imputer.fit_transform(numeric_features_train)

# # Test 
# numeric_features_test = application_test.select_dtypes(exclude='object')
# imputed_numeric_features_test = numeric_imputer.transform(numeric_features_test)

# imputed_numeric_features_train = pd.DataFrame(imputed_numeric_features_train, columns=numeric_features_train.columns)
# imputed_numeric_features_test = pd.DataFrame(imputed_numeric_features_test, columns=numeric_features_test.columns)

# # Select categorical features
# categorical_features_train = features_train.select_dtypes(include='object')
# categorical_features_test = application_test.select_dtypes(include='object')

# # Impute categorical features using the most frequent value
# categorical_imputer = SimpleImputer(strategy='most_frequent')
# imputed_categorical_features_train = categorical_imputer.fit_transform(categorical_features_train)
# imputed_categorical_features_test = categorical_imputer.transform(categorical_features_test)

# # Convert the imputed data back to DataFrames with the original column names
# imputed_categorical_features_train = pd.DataFrame(imputed_categorical_features_train, columns=categorical_features_train.columns)
# imputed_categorical_features_test = pd.DataFrame(imputed_categorical_features_test, columns=categorical_features_test.columns)

# imputed_features_train = pd.concat([target, imputed_numeric_features_train, imputed_categorical_features_train], axis=1)

# imputed_features_test = pd.concat([imputed_numeric_features_test, imputed_categorical_features_test], axis=1)

In [16]:
target = application_train['TARGET'].reset_index(drop=True)

train_features = application_train.drop('TARGET', axis=1).reset_index(drop=True)
test_features  = application_test.reset_index(drop=True)

# ============================
# 2. IMPUTE NUMERIC FEATURES
# ============================
numeric_imputer = SimpleImputer(strategy='median')

numeric_cols = train_features.select_dtypes(exclude='object').columns

# Train
numeric_train = train_features[numeric_cols].reset_index(drop=True)
numeric_test  = test_features[numeric_cols].reset_index(drop=True)

imputed_numeric_train = pd.DataFrame(
    numeric_imputer.fit_transform(numeric_train),
    columns=numeric_cols
)

imputed_numeric_test = pd.DataFrame(
    numeric_imputer.transform(numeric_test),
    columns=numeric_cols
)

# ============================
# 3. IMPUTE CATEGORICAL FEATURES
# ============================
categorical_imputer = SimpleImputer(strategy='most_frequent')

categorical_cols = train_features.select_dtypes(include='object').columns

categorical_train = train_features[categorical_cols].reset_index(drop=True)
categorical_test  = test_features[categorical_cols].reset_index(drop=True)

imputed_categorical_train = pd.DataFrame(
    categorical_imputer.fit_transform(categorical_train),
    columns=categorical_cols
)

imputed_categorical_test = pd.DataFrame(
    categorical_imputer.transform(categorical_test),
    columns=categorical_cols
)

# ============================
# 4. GHÉP LẠI THÀNH BẢNG CUỐI
# ============================
imputed_features_train = pd.concat(
    [target, imputed_numeric_train, imputed_categorical_train],
    axis=1
).reset_index(drop=True)

imputed_features_test = pd.concat(
    [imputed_numeric_test, imputed_categorical_test],
    axis=1
).reset_index(drop=True)

In [17]:
imputed_features_train

Unnamed: 0,TARGET,SK_ID_CURR,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,REGION_POPULATION_RELATIVE,DAYS_BIRTH,DAYS_EMPLOYED,DAYS_REGISTRATION,DAYS_ID_PUBLISH,OWN_CAR_AGE,FLAG_EMP_PHONE,FLAG_WORK_PHONE,FLAG_PHONE,CNT_FAM_MEMBERS,REGION_RATING_CLIENT_W_CITY,HOUR_APPR_PROCESS_START,REG_CITY_NOT_LIVE_CITY,EXT_SOURCE_1,EXT_SOURCE_2,EXT_SOURCE_3,DEF_60_CNT_SOCIAL_CIRCLE,DAYS_LAST_PHONE_CHANGE,FLAG_DOCUMENT_3,AMT_REQ_CREDIT_BUREAU_YEAR,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,NAME_TYPE_SUITE,NAME_INCOME_TYPE,NAME_EDUCATION_TYPE,NAME_FAMILY_STATUS,NAME_HOUSING_TYPE,OCCUPATION_TYPE,ORGANIZATION_TYPE,FONDKAPREMONT_MODE,HOUSETYPE_MODE,WALLSMATERIAL_MODE,EMERGENCYSTATE_MODE
0,1,100002.0,0.0,202500.0,406597.5,24700.5,351000.0,0.018801,-9461.0,-637.0,-3648.0,-2120.0,9.0,1.0,0.0,1.0,1.0,2.0,10.0,0.0,0.083037,0.262949,0.139376,2.0,-1134.0,1.0,1.0,Cash loans,M,N,Y,Unaccompanied,Working,Secondary / secondary special,Single / not married,House / apartment,Laborers,Business Entity Type 3,reg oper account,block of flats,"Stone, brick",No
1,0,100003.0,0.0,270000.0,1293502.5,35698.5,1129500.0,0.003541,-16765.0,-1188.0,-1186.0,-291.0,9.0,1.0,0.0,1.0,2.0,1.0,11.0,0.0,0.311267,0.622246,0.535276,0.0,-828.0,1.0,0.0,Cash loans,F,N,N,Family,State servant,Higher education,Married,House / apartment,Core staff,School,reg oper account,block of flats,Block,No
2,0,100004.0,0.0,67500.0,135000.0,6750.0,135000.0,0.010032,-19046.0,-225.0,-4260.0,-2531.0,26.0,1.0,1.0,1.0,1.0,2.0,9.0,0.0,0.505998,0.555912,0.729567,0.0,-815.0,0.0,0.0,Revolving loans,M,Y,Y,Unaccompanied,Working,Secondary / secondary special,Single / not married,House / apartment,Laborers,Government,reg oper account,block of flats,Panel,No
3,0,100006.0,0.0,135000.0,312682.5,29686.5,297000.0,0.008019,-19005.0,-3039.0,-9833.0,-2437.0,9.0,1.0,0.0,0.0,2.0,2.0,17.0,0.0,0.505998,0.650442,0.535276,0.0,-617.0,1.0,1.0,Cash loans,F,N,Y,Unaccompanied,Working,Secondary / secondary special,Civil marriage,House / apartment,Laborers,Business Entity Type 3,reg oper account,block of flats,Panel,No
4,0,100007.0,0.0,121500.0,513000.0,21865.5,513000.0,0.028663,-19932.0,-3038.0,-4311.0,-3458.0,9.0,1.0,0.0,0.0,1.0,2.0,11.0,0.0,0.505998,0.322738,0.535276,0.0,-1106.0,0.0,0.0,Cash loans,M,N,Y,Unaccompanied,Working,Secondary / secondary special,Single / not married,House / apartment,Core staff,Religion,reg oper account,block of flats,Panel,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
307506,0,456251.0,0.0,157500.0,254700.0,27558.0,225000.0,0.032561,-9327.0,-236.0,-8456.0,-1982.0,9.0,1.0,0.0,0.0,1.0,1.0,15.0,0.0,0.145570,0.681632,0.535276,0.0,-273.0,0.0,1.0,Cash loans,M,N,N,Unaccompanied,Working,Secondary / secondary special,Separated,With parents,Sales staff,Services,reg oper account,block of flats,"Stone, brick",No
307507,0,456252.0,0.0,72000.0,269550.0,12001.5,225000.0,0.025164,-20775.0,-1648.0,-4388.0,-4090.0,9.0,0.0,0.0,1.0,1.0,2.0,8.0,0.0,0.505998,0.115992,0.535276,0.0,0.0,1.0,1.0,Cash loans,F,N,Y,Unaccompanied,Pensioner,Secondary / secondary special,Widow,House / apartment,Laborers,XNA,reg oper account,block of flats,"Stone, brick",No
307508,0,456253.0,0.0,153000.0,677664.0,29979.0,585000.0,0.005002,-14966.0,-7921.0,-6737.0,-5150.0,9.0,1.0,0.0,0.0,1.0,3.0,9.0,0.0,0.744026,0.535722,0.218859,0.0,-1909.0,1.0,1.0,Cash loans,F,N,Y,Unaccompanied,Working,Higher education,Separated,House / apartment,Managers,School,reg oper account,block of flats,Panel,No
307509,1,456254.0,0.0,171000.0,370107.0,20205.0,319500.0,0.005313,-11961.0,-4786.0,-2562.0,-931.0,9.0,1.0,0.0,0.0,2.0,2.0,9.0,1.0,0.505998,0.514163,0.661024,0.0,-322.0,1.0,0.0,Cash loans,F,N,Y,Unaccompanied,Commercial associate,Secondary / secondary special,Married,House / apartment,Laborers,Business Entity Type 1,reg oper account,block of flats,"Stone, brick",No


In [18]:
check_nan(imputed_features_train)

Num fields:  42
Num missing fields:  0


Unnamed: 0,number_of_NaN,percentage_of_NaN


In [19]:
check_nan(imputed_features_test)

Num fields:  41
Num missing fields:  0


Unnamed: 0,number_of_NaN,percentage_of_NaN


# 3. FEATURE ENGINEERING

## 3.1 Categorial variables

In [20]:
def get_occupation_risk_lable(x):
    high = ['Low-skill Laborers', 'Drivers', 'Waiters/barmen staff']
    medium = ['Laborers', 'Cooking staff', 'Cleaning staff',
              'Sales staff', 'Security staff']
    low = ['Accountants', 'Core staff', 'High skill tech staff',
           'Managers', 'HR staff', 'IT staff', 'Medicine staff',
           'Private service staff', 'Secretaries', 'Realty agents']

    if x in high:
        return 2   # high risk
    elif x in medium:
        return 1   # medium risk
    else:
        return 0   # low risk


def get_organization_risk_lable(train_df, col='ORGANIZATION_TYPE', target='TARGET',
                             min_samples=300,
                             bins=[0, 0.06, 0.09, 0.12, 1],
                             labels=[0, 1, 2, 3]):
    """
    Create an ordinal risk-group function for organization type variable
    based on default rate and sample size.
    """

    stats = train_df.groupby(col)[target].agg(['count', 'mean'])
    stats = stats.rename(columns={'count': 'n', 'mean': 'default_rate'})

    # Handle rare categories
    stats['category_clean'] = stats.index
    stats.loc[stats['n'] < min_samples, 'category_clean'] = 'OTHER'

    # Re-aggregate the rare categories
    stats_clean = train_df.copy()
    stats_clean[col] = stats_clean[col].replace(
        stats[stats['n'] < min_samples].index, 'OTHER'
    )
    stats2 = stats_clean.groupby(col)[target].agg(['count', 'mean'])
    stats2 = stats2.rename(columns={'count': 'n', 'mean': 'default_rate'})

    # Assign risk level based on default rate (ordinal)
    stats2['risk_level'] = pd.cut(
        stats2['default_rate'],
        bins=bins,
        labels=labels,
        include_lowest=True
    )

    # Convert to dictionary mapping
    risk_dict = stats2['risk_level'].astype(int).to_dict()

    #  Define the final function
    def risk_group_function(x):
        x_clean = x if x in stats2.index else 'OTHER'
        return risk_dict.get(x_clean, max(labels)) 

    return risk_group_function


def get_housing_state_lable(df, test_df):
    cols = [
        'FONDKAPREMONT_MODE',
        'HOUSETYPE_MODE',
        'WALLSMATERIAL_MODE',
        'EMERGENCYSTATE_MODE'
    ]

    for col in cols:
        # flag missing
        df[col + '_IS_NA'] = df[col].isna().astype(int)
        test_df[col + '_IS_NA'] = test_df[col].isna().astype(int)

        # fill missing with 'Unknown'
        df[col] = df[col].fillna('Unknown')
        test_df[col] = test_df[col].fillna('Unknown')

        # label encode
        df[col], uniques = pd.factorize(df[col])
        test_df[col] = test_df[col].map({v: i for i, v in enumerate(uniques)})

        # unseen categories in test → assign special code
        test_df[col] = test_df[col].fillna(-1).astype(int)

    return df, test_df

## 3.2 Numerical variables

In [21]:
def get_age_label(days_birth):
    """ Return the age group label (int).  - a type of label encoder"""
    age_years = days_birth
    if age_years < 27:
        return 1
    elif age_years < 40:
        return 2
    elif age_years < 50:
        return 3
    elif age_years < 65:
        return 4
    elif age_years < 99:
        return 5
    else:
        return 0
    

def get_car_age_label(car_age):
    """
    Return the ordinal age group for OWN_CAR_AGE.
    """
    # Missing or no car (0 or NaN)
    if pd.isna(car_age) or car_age == 0:
        return 0

    # New car (0–5 years)
    elif car_age <= 5:
        return 1

    # Mid-age car (5–15 years)
    elif car_age <= 15:
        return 2

    # Old car (> 15 years)
    else:
        return 3


def get_ext_source_lable(df):
    """
    Feature engineering for EXT_SOURCE_1, EXT_SOURCE_2, EXT_SOURCE_3.
    Normalizes sources to same scale and creates aggregated risk features.
    """

    ext_cols = ['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3']
    
    # Normalization for each column
    for col in ext_cols:
        min_val = df[col].min()
        max_val = df[col].max()
        df[col + '_MM'] = (df[col] - min_val) / (max_val - min_val)

    # Aggregated EXT features 
    mm_cols = [c + '_MM' for c in ext_cols]

    # Mean score
    df['EXT_MEAN'] = df[mm_cols].mean(axis=1)

    # Worst score
    df['EXT_MIN'] = df[mm_cols].min(axis=1)

    # Best score
    df['EXT_MAX'] = df[mm_cols].max(axis=1)

    # Variation between sources
    df['EXT_STD'] = df[mm_cols].std(axis=1)

    # Nonlinear interaction (strong in tree models)
    df['EXT_PROD'] = (
        df['EXT_SOURCE_1_MM'] *
        df['EXT_SOURCE_2_MM'] *
        df['EXT_SOURCE_3_MM']
    )

    # Weighted score (EXT_3 strongest, EXT_1 medium, EXT_2 weakest)
    df['EXT_WEIGHTED'] = (
        3 * df['EXT_SOURCE_3_MM'] +
        2 * df['EXT_SOURCE_1_MM'] +
        1 * df['EXT_SOURCE_2_MM']
    )

    return df


## 3.3 Apply function

In [22]:
def do_sum(dataframe, group_cols, counted, agg_name):
    gp = dataframe[group_cols + [counted]].groupby(
        group_cols)[counted].sum().reset_index().rename(columns={counted: agg_name})
    dataframe = dataframe.merge(gp, on=group_cols, how='left')
    return dataframe


def do_mean(df, group_cols, counted, agg_name):
    gp = df[group_cols + [counted]].groupby(group_cols)[counted].mean().reset_index().rename(
        columns={counted: agg_name})
    df = df.merge(gp, on=group_cols, how='left')
    del gp
    return df


def do_median(df, group_cols, counted, agg_name):
    gp = df[group_cols + [counted]].groupby(group_cols)[counted].median().reset_index().rename(
        columns={counted: agg_name})
    df = df.merge(gp, on=group_cols, how='left')
    del gp
    return df


def do_std(df, group_cols, counted, agg_name):
    gp = df[group_cols + [counted]].groupby(group_cols)[counted].std().reset_index().rename(
        columns={counted: agg_name})
    df = df.merge(gp, on=group_cols, how='left')
    del gp
    return df


def one_hot_encoder(df, categorical_columns=None, nan_as_category=True):
    """Create a new column for each categorical value in categorical columns using get dummies. """
    original_columns = list(df.columns)
    if not categorical_columns:
        categorical_columns = [
            col for col in df.columns if df[col].dtype == 'object']
    df = pd.get_dummies(df, columns=categorical_columns,
                        dummy_na=nan_as_category)
    categorical_columns = [c for c in df.columns if c not in original_columns]
    return df, categorical_columns


def label_encoder(df, categorical_columns=None):
    """Encode categorical values as integers (0,1,2,3...) with pandas.factorize. """
    if not categorical_columns:
        categorical_columns = [
            col for col in df.columns if df[col].dtype == 'object']
    for col in categorical_columns:
        df[col], uniques = pd.factorize(df[col])
    return df, categorical_columns


In [23]:
def apply_feature_engineer(train, test):

    df = train.copy()
    test_df = test.copy()

    # =====================================================
    # 1. OCCUPATION RISK
    # =====================================================
    df['OCCUPATION_RISK'] = df['OCCUPATION_TYPE'].apply(get_occupation_risk_lable)
    test_df['OCCUPATION_RISK'] = test_df['OCCUPATION_TYPE'].apply(get_occupation_risk_lable)

    # =====================================================
    # 2. ORGANIZATION RISK
    # =====================================================
    org_fn = get_organization_risk_lable(df)
    df['ORG_RISK'] = df['ORGANIZATION_TYPE'].apply(org_fn)
    test_df['ORG_RISK'] = test_df['ORGANIZATION_TYPE'].apply(org_fn)

    # =====================================================
    # 3. HOUSING STATE
    # =====================================================
    df, test_df = get_housing_state_lable(df, test_df)

    # =====================================================
    # 4. AGE + CAR AGE
    # =====================================================
    df['AGE_BIN'] = df['DAYS_BIRTH'].apply(get_age_label)
    test_df['AGE_BIN'] = test_df['DAYS_BIRTH'].apply(get_age_label)

    df['CAR_AGE_BIN'] = df['OWN_CAR_AGE'].apply(get_car_age_label)
    test_df['CAR_AGE_BIN'] = test_df['OWN_CAR_AGE'].apply(get_car_age_label)

    # =====================================================
    # 5. EXT SOURCE
    # =====================================================
    df = get_ext_source_lable(df)
    test_df = get_ext_source_lable(test_df)

    # =====================================================
    # 6. CREDIT / INCOME FEATURES
    # =====================================================
    def add_income_credit_features(d):
        d['CREDIT_TO_INCOME_RATIO'] = d['AMT_CREDIT'] / d['AMT_INCOME_TOTAL']
        d['CREDIT_TO_ANNUITY_RATIO'] = d['AMT_CREDIT'] / d['AMT_ANNUITY']
        d['ANNUITY_TO_INCOME_RATIO'] = d['AMT_ANNUITY'] / d['AMT_INCOME_TOTAL']
        d['PAYMENT_RATE'] = d['AMT_ANNUITY'] / d['AMT_CREDIT']
        d['GOODS_TO_INCOME_RATIO'] = d['AMT_GOODS_PRICE'] / d['AMT_INCOME_TOTAL']
        d['GOODS_TO_CREDIT_RATIO'] = d['AMT_GOODS_PRICE'] / d['AMT_CREDIT']
        d['CREDIT_BURDEN'] = d['AMT_CREDIT'] / d['CNT_FAM_MEMBERS']

    add_income_credit_features(df)
    add_income_credit_features(test_df)

    # =====================================================
    # 7. TIME FEATURES
    # =====================================================
    def time_features(d):
        cols = ['DAYS_EMPLOYED','DAYS_REGISTRATION','DAYS_ID_PUBLISH','DAYS_LAST_PHONE_CHANGE']
        for col in cols:
            d[col + '_YEARS'] = (d[col] / -365).clip(0, 60)

        d['REGISTRATION_TO_PUBLISH'] = d['DAYS_REGISTRATION'] - d['DAYS_ID_PUBLISH']
        d['EMPLOYED_TO_REGISTRATION'] = d['DAYS_EMPLOYED'] - d['DAYS_REGISTRATION']
        d['PHONECHANGE_TO_PUBLISH'] = d['DAYS_LAST_PHONE_CHANGE'] - d['DAYS_ID_PUBLISH']

        d['REGISTRATION_TO_PUBLISH_YEARS'] = d['REGISTRATION_TO_PUBLISH'] / 365
        d['EMPLOYED_TO_REGISTRATION_YEARS'] = d['EMPLOYED_TO_REGISTRATION'] / 365
        d['PHONECHANGE_TO_PUBLISH_YEARS'] = d['PHONECHANGE_TO_PUBLISH'] / 365

    time_features(df)
    time_features(test_df)

    # =====================================================
    # 8. FAMILY FEATURES
    # =====================================================
    def family_features(d):
        d['CNT_NON_CHILD'] = (d['CNT_FAM_MEMBERS'] - d['CNT_CHILDREN']).clip(lower=1)
        d['CHILDREN_RATIO'] = d['CNT_CHILDREN'] / d['CNT_FAM_MEMBERS']
        d['INCOME_PER_PERSON'] = d['AMT_INCOME_TOTAL'] / d['CNT_FAM_MEMBERS']
        d['CREDIT_PER_PERSON'] = d['AMT_CREDIT'] / d['CNT_FAM_MEMBERS']
        d['INCOME_PER_CHILD'] = d['AMT_INCOME_TOTAL'] / (1 + d['CNT_CHILDREN'])
        d['INCOME_PER_NON_CHILD'] = d['AMT_INCOME_TOTAL'] / d['CNT_NON_CHILD']
        d['CREDIT_PER_CHILD'] = d['AMT_CREDIT'] / (1 + d['CNT_CHILDREN'])
        d['CREDIT_PER_NON_CHILD'] = d['AMT_CREDIT'] / d['CNT_NON_CHILD']
        d['CHILD_TO_NON_CHILD_RATIO'] = d['CNT_CHILDREN'] / d['CNT_NON_CHILD']

    family_features(df)
    family_features(test_df)

    # =====================================================
    # 9. GROUP AGG FEATURES
    # =====================================================
    group = ['ORG_RISK', 'OCCUPATION_RISK', 'AGE_BIN',
             'NAME_EDUCATION_TYPE', 'CODE_GENDER']

    agg_cols = {
        'EXT_MEAN': ['median','std'],
        'AMT_INCOME_TOTAL': ['mean','std'],
        'CREDIT_TO_ANNUITY_RATIO': ['mean','std'],
        'AMT_CREDIT': ['mean'],
        'AMT_ANNUITY': ['mean','std']
    }

    group_stats = df.groupby(group).agg(agg_cols)
    group_stats.columns = ['_'.join(col).upper() for col in group_stats.columns]
    group_stats.reset_index(inplace=True)

    df = df.merge(group_stats, on=group, how='left')
    test_df = test_df.merge(group_stats, on=group, how='left')

    # Fill only numeric NaN
    num_cols = test_df.select_dtypes(include=[np.number]).columns
    test_df[num_cols] = test_df[num_cols].fillna(test_df[num_cols].median())

    # =====================================================
    # 10. WOE ENCODING
    # =====================================================
    feats = [f for f in df.columns if f not in ['TARGET','SK_ID_CURR']]
    target = df['TARGET']

    enc = WOEEncoder(return_df=True)
    df_enc = enc.fit_transform(df[feats], target)
    test_enc = enc.transform(test_df[feats])

    df_enc['TARGET'] = df['TARGET']
    df_enc['SK_ID_CURR'] = df['SK_ID_CURR']
    test_enc['SK_ID_CURR'] = test_df['SK_ID_CURR']

    df_enc["SK_ID_CURR"] = df_enc["SK_ID_CURR"].astype("int32")
    test_enc["SK_ID_CURR"] = test_enc["SK_ID_CURR"].astype("int32")

    return df_enc, test_enc


In [24]:
fe_train, fe_test = apply_feature_engineer(imputed_features_train, imputed_features_test)

In [25]:
fe_train.head()

Unnamed: 0,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,REGION_POPULATION_RELATIVE,DAYS_BIRTH,DAYS_EMPLOYED,DAYS_REGISTRATION,DAYS_ID_PUBLISH,OWN_CAR_AGE,FLAG_EMP_PHONE,FLAG_WORK_PHONE,FLAG_PHONE,CNT_FAM_MEMBERS,REGION_RATING_CLIENT_W_CITY,HOUR_APPR_PROCESS_START,REG_CITY_NOT_LIVE_CITY,EXT_SOURCE_1,EXT_SOURCE_2,EXT_SOURCE_3,DEF_60_CNT_SOCIAL_CIRCLE,DAYS_LAST_PHONE_CHANGE,FLAG_DOCUMENT_3,AMT_REQ_CREDIT_BUREAU_YEAR,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,NAME_TYPE_SUITE,NAME_INCOME_TYPE,NAME_EDUCATION_TYPE,NAME_FAMILY_STATUS,NAME_HOUSING_TYPE,OCCUPATION_TYPE,ORGANIZATION_TYPE,FONDKAPREMONT_MODE,HOUSETYPE_MODE,WALLSMATERIAL_MODE,EMERGENCYSTATE_MODE,OCCUPATION_RISK,ORG_RISK,FONDKAPREMONT_MODE_IS_NA,HOUSETYPE_MODE_IS_NA,WALLSMATERIAL_MODE_IS_NA,EMERGENCYSTATE_MODE_IS_NA,AGE_BIN,CAR_AGE_BIN,EXT_SOURCE_1_MM,EXT_SOURCE_2_MM,EXT_SOURCE_3_MM,EXT_MEAN,EXT_MIN,EXT_MAX,EXT_STD,EXT_PROD,EXT_WEIGHTED,CREDIT_TO_INCOME_RATIO,CREDIT_TO_ANNUITY_RATIO,ANNUITY_TO_INCOME_RATIO,PAYMENT_RATE,GOODS_TO_INCOME_RATIO,GOODS_TO_CREDIT_RATIO,CREDIT_BURDEN,DAYS_EMPLOYED_YEARS,DAYS_REGISTRATION_YEARS,DAYS_ID_PUBLISH_YEARS,DAYS_LAST_PHONE_CHANGE_YEARS,REGISTRATION_TO_PUBLISH,EMPLOYED_TO_REGISTRATION,PHONECHANGE_TO_PUBLISH,REGISTRATION_TO_PUBLISH_YEARS,EMPLOYED_TO_REGISTRATION_YEARS,PHONECHANGE_TO_PUBLISH_YEARS,CNT_NON_CHILD,CHILDREN_RATIO,INCOME_PER_PERSON,CREDIT_PER_PERSON,INCOME_PER_CHILD,INCOME_PER_NON_CHILD,CREDIT_PER_CHILD,CREDIT_PER_NON_CHILD,CHILD_TO_NON_CHILD_RATIO,EXT_MEAN_MEDIAN,EXT_MEAN_STD,AMT_INCOME_TOTAL_MEAN,AMT_INCOME_TOTAL_STD,CREDIT_TO_ANNUITY_RATIO_MEAN,CREDIT_TO_ANNUITY_RATIO_STD,AMT_CREDIT_MEAN,AMT_ANNUITY_MEAN,AMT_ANNUITY_STD,TARGET,SK_ID_CURR
0,0.0,202500.0,406597.5,24700.5,351000.0,0.018801,-9461.0,-637.0,-3648.0,-2120.0,9.0,1.0,0.0,1.0,1.0,2.0,10.0,0.0,0.083037,0.262949,0.139376,2.0,-1134.0,1.0,1.0,0.036202,0.250941,0.056221,-0.015113,0.01282,0.18866,0.111466,0.213833,-0.037978,-0.010771,0.154966,0,0,0,0,1,2,0,0,0,0,1,2,0.072215,0.307542,0.155054,0.178271,0.072215,0.307542,0.119369,0.003444,0.917136,2.007889,16.461104,0.121978,0.060749,1.733333,0.863262,406597.5,1.745205,9.994521,5.808219,3.106849,-1528.0,3011.0,986.0,-4.186301,8.249315,2.70137,1.0,0.0,202500.0,406597.5,202500.0,202500.0,406597.5,406597.5,0.0,0.541297,0.122254,174449.623865,136567.068935,20.363372,7.711365,565095.428951,27181.536788,13301.093616,1,100002
1,0.0,270000.0,1293502.5,35698.5,1129500.0,0.003541,-16765.0,-1188.0,-1186.0,-291.0,9.0,1.0,0.0,1.0,2.0,1.0,11.0,0.0,0.311267,0.622246,0.535276,0.0,-828.0,1.0,0.0,0.036202,-0.154336,0.056221,0.033532,-0.080369,-0.362672,-0.439432,-0.071234,-0.037978,-0.26593,-0.332558,0,0,1,0,0,0,0,0,0,0,1,2,0.312933,0.727773,0.597163,0.545956,0.312933,0.727773,0.212108,0.136,3.145128,4.79075,36.234085,0.132217,0.027598,4.183333,0.873211,646751.25,3.254795,3.249315,0.79726,2.268493,-895.0,-2.0,-537.0,-2.452055,-0.005479,-1.471233,2.0,0.0,135000.0,646751.25,270000.0,135000.0,1293502.5,646751.25,0.0,0.606316,0.124162,189616.163232,100363.378214,22.834405,8.152298,694796.951855,29648.056851,15950.608832,0,100003
2,0.0,67500.0,135000.0,6750.0,135000.0,0.010032,-19046.0,-225.0,-4260.0,-2531.0,26.0,1.0,1.0,1.0,1.0,2.0,9.0,0.0,0.505998,0.555912,0.729567,0.0,-815.0,0.0,0.0,-0.41503,0.250941,-0.117305,-0.015113,0.01282,0.18866,0.111466,0.213833,-0.037978,-0.010771,-0.156375,0,0,2,0,1,1,0,0,0,0,1,3,0.518318,0.65019,0.81413,0.660879,0.518318,0.81413,0.148196,0.274366,4.129217,2.0,20.0,0.1,0.05,2.0,1.0,135000.0,0.616438,11.671233,6.934247,2.232877,-1729.0,4035.0,1716.0,-4.736986,11.054795,4.70137,1.0,0.0,67500.0,135000.0,67500.0,67500.0,135000.0,135000.0,0.0,0.547649,0.122658,171450.796932,78486.366844,21.061912,7.855541,572069.601748,26445.670318,13232.56084,0,100004
3,0.0,135000.0,312682.5,29686.5,297000.0,0.008019,-19005.0,-3039.0,-9833.0,-2437.0,9.0,1.0,0.0,0.0,2.0,2.0,17.0,0.0,0.505998,0.650442,0.535276,0.0,-617.0,1.0,1.0,0.036202,-0.154336,0.056221,-0.015113,0.01282,0.18866,0.111466,0.229315,-0.037978,-0.010771,0.154966,0,0,2,0,1,2,0,0,0,0,1,2,0.518318,0.760751,0.597163,0.625411,0.518318,0.760751,0.12366,0.235468,3.588876,2.316167,10.532818,0.2199,0.094941,2.2,0.949845,156341.25,8.326027,26.939726,6.676712,1.690411,-7396.0,6794.0,1820.0,-20.263014,18.613699,4.986301,2.0,0.0,67500.0,156341.25,135000.0,67500.0,312682.5,156341.25,0.0,0.562373,0.120985,146320.43543,546498.772396,21.42137,7.471282,560072.71041,25489.359615,12741.675587,0,100006
4,0.0,121500.0,513000.0,21865.5,513000.0,0.028663,-19932.0,-3038.0,-4311.0,-3458.0,9.0,1.0,0.0,0.0,1.0,2.0,11.0,0.0,0.505998,0.322738,0.535276,0.0,-1106.0,0.0,0.0,0.036202,0.250941,0.056221,-0.015113,0.01282,0.18866,0.111466,0.213833,-0.037978,-0.26593,-0.170278,0,0,2,0,0,1,0,0,0,0,1,2,0.518318,0.377472,0.597163,0.497651,0.377472,0.597163,0.111294,0.116835,3.205597,4.222222,23.461618,0.179963,0.042623,4.222222,1.0,513000.0,8.323288,11.810959,9.473973,3.030137,-853.0,1273.0,2352.0,-2.336986,3.487671,6.443836,1.0,0.0,121500.0,513000.0,121500.0,121500.0,513000.0,513000.0,0.0,0.565726,0.123646,193591.901806,102921.782756,21.464402,8.189567,626961.774952,28651.786597,14746.048793,0,100007


In [26]:
fe_test.head()

Unnamed: 0,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,REGION_POPULATION_RELATIVE,DAYS_BIRTH,DAYS_EMPLOYED,DAYS_REGISTRATION,DAYS_ID_PUBLISH,OWN_CAR_AGE,FLAG_EMP_PHONE,FLAG_WORK_PHONE,FLAG_PHONE,CNT_FAM_MEMBERS,REGION_RATING_CLIENT_W_CITY,HOUR_APPR_PROCESS_START,REG_CITY_NOT_LIVE_CITY,EXT_SOURCE_1,EXT_SOURCE_2,EXT_SOURCE_3,DEF_60_CNT_SOCIAL_CIRCLE,DAYS_LAST_PHONE_CHANGE,FLAG_DOCUMENT_3,AMT_REQ_CREDIT_BUREAU_YEAR,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,NAME_TYPE_SUITE,NAME_INCOME_TYPE,NAME_EDUCATION_TYPE,NAME_FAMILY_STATUS,NAME_HOUSING_TYPE,OCCUPATION_TYPE,ORGANIZATION_TYPE,FONDKAPREMONT_MODE,HOUSETYPE_MODE,WALLSMATERIAL_MODE,EMERGENCYSTATE_MODE,OCCUPATION_RISK,ORG_RISK,FONDKAPREMONT_MODE_IS_NA,HOUSETYPE_MODE_IS_NA,WALLSMATERIAL_MODE_IS_NA,EMERGENCYSTATE_MODE_IS_NA,AGE_BIN,CAR_AGE_BIN,EXT_SOURCE_1_MM,EXT_SOURCE_2_MM,EXT_SOURCE_3_MM,EXT_MEAN,EXT_MIN,EXT_MAX,EXT_STD,EXT_PROD,EXT_WEIGHTED,CREDIT_TO_INCOME_RATIO,CREDIT_TO_ANNUITY_RATIO,ANNUITY_TO_INCOME_RATIO,PAYMENT_RATE,GOODS_TO_INCOME_RATIO,GOODS_TO_CREDIT_RATIO,CREDIT_BURDEN,DAYS_EMPLOYED_YEARS,DAYS_REGISTRATION_YEARS,DAYS_ID_PUBLISH_YEARS,DAYS_LAST_PHONE_CHANGE_YEARS,REGISTRATION_TO_PUBLISH,EMPLOYED_TO_REGISTRATION,PHONECHANGE_TO_PUBLISH,REGISTRATION_TO_PUBLISH_YEARS,EMPLOYED_TO_REGISTRATION_YEARS,PHONECHANGE_TO_PUBLISH_YEARS,CNT_NON_CHILD,CHILDREN_RATIO,INCOME_PER_PERSON,CREDIT_PER_PERSON,INCOME_PER_CHILD,INCOME_PER_NON_CHILD,CREDIT_PER_CHILD,CREDIT_PER_NON_CHILD,CHILD_TO_NON_CHILD_RATIO,EXT_MEAN_MEDIAN,EXT_MEAN_STD,AMT_INCOME_TOTAL_MEAN,AMT_INCOME_TOTAL_STD,CREDIT_TO_ANNUITY_RATIO_MEAN,CREDIT_TO_ANNUITY_RATIO_STD,AMT_CREDIT_MEAN,AMT_ANNUITY_MEAN,AMT_ANNUITY_STD,SK_ID_CURR
0,0.0,135000.0,568800.0,20560.5,450000.0,0.01885,-19241.0,-2329.0,-5170.0,-812.0,9.0,1.0,0.0,0.0,2.0,2.0,18.0,0.0,0.752614,0.789654,0.15952,0.0,-1740.0,1.0,0.0,0.036202,-0.154336,0.056221,-0.015113,0.01282,0.18866,-0.439432,-0.071234,-0.037978,-0.010771,-0.147023,0,0,0,0,1,1,0,0,0,0,1,2,0.798495,0.923572,0.180263,0.63411,0.180263,0.923572,0.397988,0.132938,3.061351,4.213333,27.664697,0.1523,0.036147,3.333333,0.791139,284400.0,6.380822,14.164384,2.224658,4.767123,-4358.0,2841.0,-928.0,-11.939726,7.783562,-2.542466,2.0,0.0,67500.0,284400.0,135000.0,67500.0,568800.0,284400.0,0.0,0.597291,0.123074,178538.889991,100068.581809,22.141536,7.921388,643042.884191,28578.523386,15950.354141,100001
1,0.0,99000.0,222768.0,17370.0,180000.0,0.035792,-18064.0,-4469.0,-9118.0,-1623.0,9.0,1.0,0.0,0.0,2.0,2.0,9.0,0.0,0.56499,0.291656,0.432962,0.0,0.0,1.0,3.0,0.036202,0.250941,0.056221,-0.015113,0.01282,0.18866,0.111466,-0.071234,-0.037978,0.859753,0.254589,0,0,2,0,2,2,0,0,0,0,1,2,0.595809,0.341111,0.490287,0.475736,0.341111,0.595809,0.127971,0.099645,3.003589,2.250182,12.82487,0.175455,0.077973,1.818182,0.808016,111384.0,12.243836,24.980822,4.446575,-0.0,-7495.0,4649.0,1623.0,-20.534247,12.736986,4.446575,2.0,0.0,49500.0,111384.0,99000.0,49500.0,222768.0,111384.0,0.0,0.553943,0.119757,182421.869725,80842.008094,20.415507,7.666245,593298.616569,28637.644566,13443.179326,100005
2,0.0,202500.0,663264.0,69777.0,630000.0,0.019101,-20038.0,-4458.0,-2175.0,-3503.0,5.0,1.0,0.0,0.0,2.0,2.0,14.0,0.0,0.505998,0.699787,0.610991,0.0,-856.0,0.0,4.0,0.036202,0.250941,-0.117305,-0.015113,0.01282,0.18866,-0.439432,-0.071234,-0.037978,0.374973,0.760099,0,0,2,0,2,3,0,0,0,0,1,1,0.532081,0.818463,0.692134,0.680892,0.532081,0.818463,0.143522,0.301416,3.959025,3.275378,9.505482,0.344578,0.105202,3.111111,0.949848,331632.0,12.213699,5.958904,9.59726,2.345205,1328.0,-2283.0,2647.0,3.638356,-6.254795,7.252055,2.0,0.0,101250.0,331632.0,202500.0,101250.0,663264.0,331632.0,0.0,0.565902,0.114045,192672.794118,76440.569476,19.545406,8.307266,532224.132353,26538.783088,12386.191146,100013
3,2.0,315000.0,1575000.0,49018.5,1575000.0,0.026392,-13976.0,-1866.0,-2000.0,-4208.0,9.0,1.0,0.0,1.0,4.0,2.0,11.0,0.0,0.525734,0.509677,0.612704,0.0,-1805.0,1.0,3.0,0.036202,-0.154336,0.056221,-0.015113,0.01282,0.18866,0.111466,-0.071234,-0.037978,0.193878,0.154966,0,0,2,0,1,2,0,0,0,0,1,2,0.553401,0.59611,0.694076,0.614529,0.553401,0.694076,0.072123,0.228967,3.78514,5.0,32.130726,0.155614,0.031123,5.0,1.0,393750.0,5.112329,5.479452,11.528767,4.945205,2208.0,134.0,2403.0,6.049315,0.367123,6.583562,2.0,0.5,78750.0,393750.0,105000.0,157500.0,525000.0,787500.0,1.0,0.562373,0.120985,146320.43543,546498.772396,21.42137,7.471282,560072.71041,25489.359615,12741.675587,100028
4,1.0,180000.0,625500.0,32067.0,625500.0,0.010032,-13040.0,-2191.0,-4000.0,-4262.0,16.0,1.0,1.0,0.0,3.0,2.0,5.0,0.0,0.202145,0.425687,0.535276,0.0,-821.0,1.0,1.0,0.036202,0.250941,-0.117305,0.033532,0.01282,0.18866,0.111466,-0.071234,-0.037978,-0.010771,0.154966,0,0,2,0,1,2,0,0,0,0,1,3,0.203835,0.497875,0.606289,0.436,0.203835,0.606289,0.20824,0.061529,2.724413,3.475,19.506034,0.17815,0.051266,3.475,1.0,208500.0,6.00274,10.958904,11.676712,2.249315,262.0,1809.0,3441.0,0.717808,4.956164,9.427397,2.0,0.333333,60000.0,208500.0,90000.0,90000.0,312750.0,312750.0,0.5,0.541297,0.122254,174449.623865,136567.068935,20.363372,7.711365,565095.428951,27181.536788,13301.093616,100038
