notebook's summary:
- feature selection based on information value and chi-squared test
- handling highly correlated features

In [1]:
# load packages
import pandas as pd
import numpy as np

from sklearn.feature_selection import mutual_info_classif
from sklearn.feature_selection import chi2
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import train_test_split

# pd.set_option('display.max_columns', None)
# pd.set_option('display.max_rows', None)
# pd.options.display.float_format = '{:.3f}'.format

In [2]:
# read the combined train/test applications table from disk
train_test = pd.read_csv("../datasets/raw/applications_train_test_set.csv")

# work on a copy so the frame as loaded stays available under `train_test`
raw_train_test = train_test.copy()

# show the dimensions first, then the first three rows
display(raw_train_test.shape)
display(raw_train_test.head(3))

(24004, 178)

Unnamed: 0,id,credit_event,name_contract_type,flag_own_car,flag_own_realty,cnt_children,amt_income_total,amt_credit,amt_annuity,amt_goods_price,...,prev_rate_interest_privileged_avg,prev_days_decision_avg,prev_name_payment_type_count,prev_cnt_payment_sum,prev_days_first_drawing_avg,prev_days_first_due_avg,prev_days_last_due_1st_version_avg,prev_days_last_due_avg,prev_days_termination_avg,prev_nflag_insured_on_approval_avg
0,288213,0,Cash loans,False,False,0,225000.0,953460.0,62703.0,900000.0,...,,933.666667,3.0,12.0,365243.0,-1326.0,-1176.0,-1176.0,-1170.0,0.0
1,400317,0,Cash loans,False,True,0,90000.0,284400.0,16011.0,225000.0,...,,1095.555556,9.0,118.0,365243.0,39646.888889,40013.555556,80502.333333,80507.222222,0.444444
2,191384,0,Cash loans,False,True,0,180000.0,454500.0,15151.5,454500.0,...,,,,,,,,,,


In [3]:
# set features, exclude id and target column
# id is just an identifier and holds no predictive power
# credit_event is the target; it is removed so no target information leaks into the features
features = raw_train_test.drop(['credit_event', 'id'], axis=1)

# binning continuous features
# binning turns a variable with many unique values into a small number of bins
# this can simplify the model, reduce overfitting on unseen data and improve interpretability
# the number of bins can be based on domain knowledge, experimentation, or a balance
# between retaining information and reducing complexity
#
# rule applied here: a numeric column (dtype kind int / float / complex) with more than
# `n_bins` distinct non-missing values is replaced by `n_bins` equal-width intervals (pd.cut)
#
# NOTE: the binned columns are written back into raw_train_test, NOT into `features`;
# `features` keeps the original values, so the display below still shows unbinned data
n_bins = 6
for feature in features.columns:
    column = raw_train_test[feature]
    if column.dtype.kind in 'ifc' and column.nunique() > n_bins:
        raw_train_test[feature] = pd.cut(column, n_bins)

display(features.shape, features.head(3))

(24004, 176)

Unnamed: 0,name_contract_type,flag_own_car,flag_own_realty,cnt_children,amt_income_total,amt_credit,amt_annuity,amt_goods_price,name_type_suite,name_income_type,...,prev_rate_interest_privileged_avg,prev_days_decision_avg,prev_name_payment_type_count,prev_cnt_payment_sum,prev_days_first_drawing_avg,prev_days_first_due_avg,prev_days_last_due_1st_version_avg,prev_days_last_due_avg,prev_days_termination_avg,prev_nflag_insured_on_approval_avg
0,Cash loans,False,False,0,225000.0,953460.0,62703.0,900000.0,Unaccompanied,Commercial associate,...,,933.666667,3.0,12.0,365243.0,-1326.0,-1176.0,-1176.0,-1170.0,0.0
1,Cash loans,False,True,0,90000.0,284400.0,16011.0,225000.0,"Spouse, partner",Pensioner,...,,1095.555556,9.0,118.0,365243.0,39646.888889,40013.555556,80502.333333,80507.222222,0.444444
2,Cash loans,False,True,0,180000.0,454500.0,15151.5,454500.0,Unaccompanied,Pensioner,...,,,,,,,,,,


In [4]:
# inspect raw_train_test: dimensions first, then the first three rows
display(raw_train_test.shape)
display(raw_train_test.head(3))

(24004, 178)

Unnamed: 0,id,credit_event,name_contract_type,flag_own_car,flag_own_realty,cnt_children,amt_income_total,amt_credit,amt_annuity,amt_goods_price,...,prev_rate_interest_privileged_avg,prev_days_decision_avg,prev_name_payment_type_count,prev_cnt_payment_sum,prev_days_first_drawing_avg,prev_days_first_due_avg,prev_days_last_due_1st_version_avg,prev_days_last_due_avg,prev_days_termination_avg,prev_nflag_insured_on_approval_avg
0,288213,0,Cash loans,False,False,"(-0.011, 1.833]","(22076.55, 772125.0]","(600000.0, 1155000.0]","(39412.5, 76530.0]","(600000.0, 1155000.0]",...,,"(490.333, 976.667]","(0.94, 11.0]","(-1.434, 239.0]","(303883.667, 365243.0]","(-3257.132, 58466.333]","(-3153.028, 58553.0]","(-3234.109, 58485.5]","(-3143.018, 58561.333]","(-0.001, 0.167]"
1,400317,0,Cash loans,False,True,"(-0.011, 1.833]","(22076.55, 772125.0]","(41670.0, 600000.0]","(2072.295, 39412.5]","(41670.0, 600000.0]",...,,"(976.667, 1463.0]","(0.94, 11.0]","(-1.434, 239.0]","(303883.667, 365243.0]","(-3257.132, 58466.333]","(-3153.028, 58553.0]","(58485.5, 119837.0]","(58561.333, 119897.667]","(0.333, 0.5]"
2,191384,0,Cash loans,False,True,"(-0.011, 1.833]","(22076.55, 772125.0]","(41670.0, 600000.0]","(2072.295, 39412.5]","(41670.0, 600000.0]",...,,,,,,,,,,


In [5]:
# target vector: the credit_event column as a flat 1D array
Y = raw_train_test['credit_event'].to_numpy()
# feature matrix: every column except the target and the id, as a 2D array (rows x features)
X = raw_train_test.drop(columns=['credit_event', 'id']).to_numpy()

print(Y, X)

[0 0 0 ... 0 0 0] [['Cash loans' False False ...
  Interval(-3234.109, 58485.5, closed='right')
  Interval(-3143.018, 58561.333, closed='right')
  Interval(-0.001, 0.167, closed='right')]
 ['Cash loans' False True ... Interval(58485.5, 119837.0, closed='right')
  Interval(58561.333, 119897.667, closed='right')
  Interval(0.333, 0.5, closed='right')]
 ['Cash loans' False True ... nan nan nan]
 ...
 ['Revolving loans' True False ...
  Interval(181188.5, 242540.0, closed='right')
  Interval(181234.0, 242570.333, closed='right')
  Interval(0.167, 0.333, closed='right')]
 ['Cash loans' False True ...
  Interval(181188.5, 242540.0, closed='right')
  Interval(181234.0, 242570.333, closed='right')
  Interval(0.333, 0.5, closed='right')]
 ['Cash loans' False False ... nan nan nan]]


In [6]:
def information_value(X, Y):
    """
    Compute the Information Value (IV) of one feature against a binary target.

    The observations are grouped by the distinct values of X. For every group the
    number of 'bads' (sum of Y) and 'goods' (count of Y minus the bads) is taken,
    each is turned into a share of the overall bads / goods, and the Weight of
    Evidence (WoE) is the natural log of goods share over bads share. The IV of a
    group is WoE * (goods share - bads share); the function returns the sum over
    all groups.

    A group count below 0.5 (i.e. an empty cell) is raised to 0.5 before the share
    is computed, so the logarithm never receives a zero.

    Parameters:
    - X (array-like): Feature values (list, numpy array or pandas Series). A
                      multidimensional input is flattened with np.ravel.
    - Y (array-like): Binary target (0 / 1) aligned with the flattened X, where 1
                      marks the event of interest (e.g. default on a loan).

    Returns:
    - float: Total IV of the feature. Common rule of thumb:
                < 0.02: Not useful for prediction
                0.02 to 0.1: Weak predictor
                0.1 to 0.3: Medium predictor
                0.3 to 0.5: Strong predictor
                > 0.5: Suspect or too good to be true

    """
    frame = pd.DataFrame({'X': np.ravel(X), 'Y': Y})
    target_by_bin = frame.groupby("X")["Y"]

    # per-bin counts: all observations, events (Y == 1) and non-events
    n_total = target_by_bin.count()
    n_bad = target_by_bin.sum()
    n_good = n_total - n_bad

    # share of all bads / goods that fall into each bin, with empty cells floored at 0.5
    bad_share = np.maximum(n_bad, 0.5) / n_bad.sum()
    good_share = np.maximum(n_good, 0.5) / n_good.sum()

    weight_of_evidence = np.log(good_share / bad_share)
    iv_per_bin = weight_of_evidence * (good_share - bad_share)
    return iv_per_bin.sum()

In [7]:
# score every feature against the target: information value and chi-squared p-value
oe = OrdinalEncoder()
score_list = []

# column i of X corresponds to the i-th column name of `features`
for col_idx, feature in enumerate(features):
    # cast the column to strings (missing values and intervals become labels) and
    # shape it as a single-column 2D array, which is what the encoder expects
    column_as_text = X[:, col_idx].astype(str).reshape(-1, 1)
    # map every distinct label to an ordinal code
    X_enc = oe.fit_transform(column_as_text)
    # NOTE(review): sklearn's chi2 is documented for non-negative features such as
    # booleans or frequencies; here it receives ordinal category codes, so the result
    # depends on how the codes were assigned - confirm this is intended
    _, p_values = chi2(X=X_enc, y=Y)
    score_list.append({
        "feature": feature,
        "information_value": information_value(X=X_enc, Y=Y),
        "chi2_pvalue": p_values[0],
    })

# one row per feature
scores = pd.DataFrame(score_list)

# display result
display(scores.shape, scores)

(176, 3)

Unnamed: 0,feature,information_value,chi2_pvalue
0,name_contract_type,0.026589,4.377965e-09
1,flag_own_car,0.010849,4.470969e-04
2,flag_own_realty,0.000016,9.265264e-01
3,cnt_children,0.013934,1.414532e-05
4,amt_income_total,0.001152,9.243649e-01
...,...,...,...
171,prev_days_first_due_avg,0.016740,8.703287e-20
172,prev_days_last_due_1st_version_avg,0.022134,1.290195e-01
173,prev_days_last_due_avg,0.026089,5.510357e-03
174,prev_days_termination_avg,0.024996,1.768974e-03


In [8]:
# attach the selection thresholds as constant columns
# - information_value_threshold: lower limit for a feature to have any predictive value
# - chi2_pvalue_threshold: common p-value cut-off in hypothesis testing; a feature with a
#   higher p-value fails to reject the null hypothesis of independence from the target
scores = scores.assign(
    information_value_threshold=0.02,
    chi2_pvalue_threshold=0.05,
)

# check the results
scores.head(10)

Unnamed: 0,feature,information_value,chi2_pvalue,information_value_threshold,chi2_pvalue_threshold
0,name_contract_type,0.026589,4.377965e-09,0.02,0.05
1,flag_own_car,0.010849,0.0004470969,0.02,0.05
2,flag_own_realty,1.6e-05,0.9265264,0.02,0.05
3,cnt_children,0.013934,1.414532e-05,0.02,0.05
4,amt_income_total,0.001152,0.9243649,0.02,0.05
5,amt_credit,0.026132,0.003119011,0.02,0.05
6,amt_annuity,0.011284,0.2796677,0.02,0.05
7,amt_goods_price,0.033593,0.1326699,0.02,0.05
8,name_type_suite,0.003627,0.1190774,0.02,0.05
9,name_income_type,0.053258,6.555411e-22,0.02,0.05


In [9]:
# keep a feature only when both criteria hold:
#   - its information value is above the IV threshold
#   - its chi-squared p-value is below the p-value threshold
iv_passes = scores["information_value"].gt(scores["information_value_threshold"])
chi2_passes = scores["chi2_pvalue"].lt(scores["chi2_pvalue_threshold"])
filtered_features = scores.loc[iv_passes & chi2_passes].reset_index(drop=True)

# display
display(filtered_features.shape, filtered_features.head())

(60, 5)

Unnamed: 0,feature,information_value,chi2_pvalue,information_value_threshold,chi2_pvalue_threshold
0,name_contract_type,0.026589,4.377965e-09,0.02,0.05
1,amt_credit,0.026132,0.003119011,0.02,0.05
2,name_income_type,0.053258,6.555411e-22,0.02,0.05
3,name_education_type,0.056981,2.162986e-11,0.02,0.05
4,region_population_relative,0.036178,0.0001280246,0.02,0.05


In [10]:
# columns to carry forward: id and credit_event first, followed by every feature
# that passed the IV and chi-squared p-value filter
kept_columns = ["id", "credit_event", *filtered_features["feature"]]

# take those columns from train_test, i.e. the frame as it was loaded
application_initial_filter = train_test[kept_columns]

# display results
display(application_initial_filter.shape, application_initial_filter.head(10))

(24004, 62)

Unnamed: 0,id,credit_event,name_contract_type,amt_credit,name_income_type,name_education_type,region_population_relative,days_employed,days_registration,days_id_publish,...,amt_req_credit_bureau_year,sk_id_prev_count,amt_credit_sum_overdue_sum,amt_drawings_atm_current_avg_avg,cnt_drawings_atm_current_avg_avg,cnt_drawings_pos_current_avg_avg,prev_days_decision_avg,prev_days_first_drawing_avg,prev_days_last_due_avg,prev_days_termination_avg
0,288213,0,Cash loans,953460.0,Commercial associate,Secondary / secondary special,0.04622,116,4106.0,912,...,0.0,14.0,0.0,,,,933.666667,365243.0,-1176.0,-1170.0
1,400317,0,Cash loans,284400.0,Pensioner,Secondary / secondary special,0.00702,-365243,13237.0,3346,...,5.0,96.0,,,,,1095.555556,365243.0,80502.333333,80507.222222
2,191384,0,Cash loans,454500.0,Pensioner,Secondary / secondary special,0.04622,-365243,425.0,1622,...,1.0,3.0,0.0,4809.375,0.3125,0.0,,,,
3,412414,0,Cash loans,900000.0,Working,Secondary / secondary special,0.010276,896,776.0,2544,...,2.0,35.0,0.0,,,,1382.0,365243.0,-1195.5,-874.5
4,429870,1,Cash loans,254700.0,Commercial associate,Secondary / secondary special,0.025164,1224,3112.0,201,...,0.0,10.0,0.0,,,,169.25,365243.0,365243.0,365243.0
5,207421,0,Cash loans,417024.0,Commercial associate,Higher education,0.031329,1683,5454.0,1563,...,0.0,,0.0,,,,,,,
6,231340,0,Revolving loans,247500.0,Working,Higher education,0.006671,79,1076.0,263,...,3.0,10.0,0.0,,,,398.0,365243.0,-271.0,-266.0
7,233359,0,Cash loans,675000.0,Working,Secondary / secondary special,0.028663,4216,7978.0,409,...,1.0,36.0,0.0,2050.862069,0.425287,0.011494,2781.333333,242629.666667,-2023.666667,-1631.0
8,363256,0,Revolving loans,540000.0,Commercial associate,Secondary / secondary special,0.04622,4809,4253.0,4362,...,1.0,37.0,0.0,,,,1011.333333,365243.0,121092.666667,121097.333333
9,360287,1,Cash loans,755190.0,Working,Higher education,0.031329,5544,1471.0,2370,...,3.0,30.0,0.0,,,,630.6,365243.0,-972.0,-965.0


In [11]:
# lookup table: feature name -> information value, for the features that passed the filter
ivs = dict(zip(filtered_features["feature"], filtered_features["information_value"]))

# display result
ivs

{'name_contract_type': 0.02658910337098841,
 'amt_credit': 0.02613212333721668,
 'name_income_type': 0.053258430146912006,
 'name_education_type': 0.05698126089959049,
 'region_population_relative': 0.0361779907338901,
 'days_employed': 0.03015392489076214,
 'days_registration': 0.033104228914476605,
 'days_id_publish': 0.040435410047729305,
 'own_car_age': 0.028289590889949714,
 'flag_emp_phone': 0.03015392489076214,
 'occupation_type': 0.09510520551929101,
 'region_rating_client': 0.055933235134114634,
 'region_rating_client_w_city': 0.05578738945448686,
 'reg_city_not_live_city': 0.025159863377590026,
 'reg_city_not_work_city': 0.04149190594329766,
 'organization_type': 0.09301232681233113,
 'ext_source_1': 0.11990883021514571,
 'ext_source_2': 0.3167334622798213,
 'ext_source_3': 0.37744926770953846,
 'apartments_avg': 0.028480942352783457,
 'basementarea_avg': 0.024410229837682484,
 'elevators_avg': 0.027361265216298964,
 'entrances_avg': 0.03111823424785643,
 'floorsmax_avg': 0.0

In [12]:
# feature-only frame: the filtered applications without the target and the id
application_features_initial = application_initial_filter.drop(columns=["credit_event", "id"])
application_features_initial.head()

Unnamed: 0,name_contract_type,amt_credit,name_income_type,name_education_type,region_population_relative,days_employed,days_registration,days_id_publish,own_car_age,flag_emp_phone,...,amt_req_credit_bureau_year,sk_id_prev_count,amt_credit_sum_overdue_sum,amt_drawings_atm_current_avg_avg,cnt_drawings_atm_current_avg_avg,cnt_drawings_pos_current_avg_avg,prev_days_decision_avg,prev_days_first_drawing_avg,prev_days_last_due_avg,prev_days_termination_avg
0,Cash loans,953460.0,Commercial associate,Secondary / secondary special,0.04622,116,4106.0,912,,1,...,0.0,14.0,0.0,,,,933.666667,365243.0,-1176.0,-1170.0
1,Cash loans,284400.0,Pensioner,Secondary / secondary special,0.00702,-365243,13237.0,3346,,0,...,5.0,96.0,,,,,1095.555556,365243.0,80502.333333,80507.222222
2,Cash loans,454500.0,Pensioner,Secondary / secondary special,0.04622,-365243,425.0,1622,,0,...,1.0,3.0,0.0,4809.375,0.3125,0.0,,,,
3,Cash loans,900000.0,Working,Secondary / secondary special,0.010276,896,776.0,2544,26.0,1,...,2.0,35.0,0.0,,,,1382.0,365243.0,-1195.5,-874.5
4,Cash loans,254700.0,Commercial associate,Secondary / secondary special,0.025164,1224,3112.0,201,,1,...,0.0,10.0,0.0,,,,169.25,365243.0,365243.0,365243.0


In [13]:
# # extract initial features
# application_features_initial = application_initial_filter.drop(["credit_event", "id"], axis=1)

# # identify categorical variables
# cat_cols = application_features_initial.select_dtypes(include=["object"]).columns

# # apply ordinalencoding
# # fit and transform categorical columns and replace it in the dataframe
# # in the Dataiku project the categorical columns are actually dropped; here we experiment with including them instead by converting them with an encoder
# oe = OrdinalEncoder()
# application_features_initial[cat_cols] = oe.fit_transform(application_features_initial[cat_cols])

# # calculate spearman correlation matrix
# correlation_matrix = application_features_initial.corr(method="spearman")

# # display result
# display(correlation_matrix.shape, correlation_matrix)

In [14]:
# identify categorical (object dtype) features and leave them out,
# so only the remaining columns take part in the correlation analysis
cat_cols = application_features_initial.select_dtypes(include=["object"]).columns
application_features_initial = application_features_initial.drop(columns=cat_cols)

# Spearman correlation coefficient for every pair of remaining features
# Spearman is used because it captures monotonic relationships between features
# and is more appropriate for ordinal data or data that is not normally distributed
correlation_matrix = application_features_initial.corr(method="spearman")

# display result
display(correlation_matrix.shape)
display(correlation_matrix)

(52, 52)

Unnamed: 0,amt_credit,region_population_relative,days_employed,days_registration,days_id_publish,own_car_age,flag_emp_phone,region_rating_client,region_rating_client_w_city,reg_city_not_live_city,...,amt_req_credit_bureau_year,sk_id_prev_count,amt_credit_sum_overdue_sum,amt_drawings_atm_current_avg_avg,cnt_drawings_atm_current_avg_avg,cnt_drawings_pos_current_avg_avg,prev_days_decision_avg,prev_days_first_drawing_avg,prev_days_last_due_avg,prev_days_termination_avg
amt_credit,1.0,0.052875,0.117695,-0.007268,0.001813,-0.122739,0.072391,-0.085607,-0.093278,-0.02329,...,-0.038963,0.041601,-0.010043,-0.035762,-0.089573,-0.030961,0.09442,-0.047868,-0.095807,-0.084267
region_population_relative,0.052875,1.0,0.006838,0.037532,0.014114,-0.129361,0.002901,-0.4315,-0.440724,-0.048198,...,0.008665,0.039775,-0.006726,-0.025391,-0.038781,-0.030273,0.060199,-0.00664,-0.032844,-0.032631
days_employed,0.117695,0.006838,1.0,-0.036705,-0.115309,-0.040561,0.667398,-0.012205,-0.016254,-0.03143,...,-0.011139,0.039325,0.005121,-0.030921,-0.030179,-0.032775,0.106152,-0.038205,-0.064111,-0.055784
days_registration,-0.007268,0.037532,-0.036705,1.0,0.09624,0.02134,-0.174074,-0.078175,-0.075387,-0.054797,...,0.009038,0.05456,-0.004932,-0.014965,-0.013628,-0.104054,0.064345,0.011963,0.004818,0.003694
days_id_publish,0.001813,0.014114,-0.115309,0.09624,1.0,-0.006372,-0.279055,0.011111,0.012231,-0.076901,...,0.043121,0.091854,-0.003519,-0.038121,-0.03834,-0.132396,0.097782,0.020484,-0.048576,-0.038591
own_car_age,-0.122739,-0.129361,-0.040561,0.02134,-0.006372,1.0,-0.035415,0.168945,0.169756,0.013337,...,0.013875,0.008394,0.018102,0.010357,0.032044,-0.096581,0.036131,-0.007887,-0.020093,-0.010942
flag_emp_phone,0.072391,0.002901,0.667398,-0.174074,-0.279055,-0.035415,1.0,-0.027647,-0.028595,0.092992,...,-0.045131,-0.065362,0.007598,0.008353,0.020702,0.136163,0.010728,-0.047803,-0.039708,-0.035537
region_rating_client,-0.085607,-0.4315,-0.012205,-0.078175,0.011111,0.168945,-0.027647,1.0,0.949763,0.029928,...,0.009635,0.010156,0.011722,-0.005878,0.008881,0.016453,-0.022617,0.018976,0.019408,0.018411
region_rating_client_w_city,-0.093278,-0.440724,-0.016254,-0.075387,0.012231,0.169756,-0.028595,0.949763,1.0,0.039574,...,0.008453,0.007481,0.013565,-0.003302,0.013613,0.012269,-0.023591,0.017052,0.020509,0.02013
reg_city_not_live_city,-0.02329,-0.048198,-0.03143,-0.054797,-0.076901,0.013337,0.092992,0.029928,0.039574,1.0,...,0.00382,-0.046387,-0.003802,0.001571,0.007898,0.085859,-0.049187,-0.010067,0.00767,0.008742


In [15]:
# find highly correlated feature pairs
# these are candidates for feature reduction, which could improve model performance later
# 0.7 is used as a starting threshold, a value that is common in practice
columns = correlation_matrix.columns
threshold = 0.7

# boolean mask of |correlation| > threshold, restricted to the strict upper triangle
# so every pair is reported once and the diagonal is skipped
above_threshold = np.triu(correlation_matrix.abs().to_numpy() > threshold, k=1)

# np.nonzero walks the mask row by row, i.e. in the same order as a nested i < j loop
# x and y are stored as single-element lists
high_corr_list = [
    {"x": [columns[row_pos]], "y": [columns[col_pos]]}
    for row_pos, col_pos in zip(*np.nonzero(above_threshold))
]

high_corr_pairs = pd.DataFrame(high_corr_list)

# display
high_corr_pairs.head()

Unnamed: 0,x,y
0,[region_rating_client],[region_rating_client_w_city]
1,[apartments_avg],[floorsmax_avg]
2,[apartments_avg],[livingarea_avg]
3,[apartments_avg],[apartments_mode]
4,[apartments_avg],[floorsmax_mode]


In [16]:
# decide, for every highly correlated pair, which feature to drop by comparing their IVs
# NOTE: each pair is judged on its own, so a feature can end up dropped because of a
# partner that is itself dropped through another pair
features_to_drop = set()

# loop thru the high_corr_pairs
for _, row in high_corr_pairs.iterrows():
    # x and y are stored as single-element lists, so unwrap the feature names first
    feature_x, feature_y = row["x"][0], row["y"][0]
    if ivs[feature_x] > ivs[feature_y]:
        # x is strictly more informative: remove y
        features_to_drop.add(feature_y)
    else:
        # y is more informative, or the IVs are equal: remove x
        features_to_drop.add(feature_x)

# unique, alphabetically sorted list of plain-str feature names
var_to_remove = sorted(features_to_drop)

# display result
var_to_remove

['amt_drawings_atm_current_avg_avg',
 'apartments_avg',
 'apartments_medi',
 'apartments_mode',
 'basementarea_avg',
 'basementarea_medi',
 'basementarea_mode',
 'elevators_avg',
 'elevators_medi',
 'elevators_mode',
 'entrances_avg',
 'entrances_mode',
 'floorsmax_avg',
 'floorsmax_medi',
 'livingarea_avg',
 'livingarea_medi',
 'livingarea_mode',
 'nonlivingarea_medi',
 'nonlivingarea_mode',
 'prev_days_termination_avg',
 'region_rating_client_w_city',
 'totalarea_mode']

In [18]:
# preview the first five rows of the IV / chi-squared filtered applications
application_initial_filter.head(n=5)

Unnamed: 0,id,credit_event,name_contract_type,amt_credit,name_income_type,name_education_type,region_population_relative,days_employed,days_registration,days_id_publish,...,amt_req_credit_bureau_year,sk_id_prev_count,amt_credit_sum_overdue_sum,amt_drawings_atm_current_avg_avg,cnt_drawings_atm_current_avg_avg,cnt_drawings_pos_current_avg_avg,prev_days_decision_avg,prev_days_first_drawing_avg,prev_days_last_due_avg,prev_days_termination_avg
0,288213,0,Cash loans,953460.0,Commercial associate,Secondary / secondary special,0.04622,116,4106.0,912,...,0.0,14.0,0.0,,,,933.666667,365243.0,-1176.0,-1170.0
1,400317,0,Cash loans,284400.0,Pensioner,Secondary / secondary special,0.00702,-365243,13237.0,3346,...,5.0,96.0,,,,,1095.555556,365243.0,80502.333333,80507.222222
2,191384,0,Cash loans,454500.0,Pensioner,Secondary / secondary special,0.04622,-365243,425.0,1622,...,1.0,3.0,0.0,4809.375,0.3125,0.0,,,,
3,412414,0,Cash loans,900000.0,Working,Secondary / secondary special,0.010276,896,776.0,2544,...,2.0,35.0,0.0,,,,1382.0,365243.0,-1195.5,-874.5
4,429870,1,Cash loans,254700.0,Commercial associate,Secondary / secondary special,0.025164,1224,3112.0,201,...,0.0,10.0,0.0,,,,169.25,365243.0,365243.0,365243.0


In [20]:
# final frame: from the filtered applications, drop every feature that was flagged
# as the less informative member of a highly correlated pair
final_application_features_filtered = application_initial_filter.drop(columns=var_to_remove)

# display it
final_application_features_filtered

Unnamed: 0,id,credit_event,name_contract_type,amt_credit,name_income_type,name_education_type,region_population_relative,days_employed,days_registration,days_id_publish,...,amt_req_credit_bureau_week,amt_req_credit_bureau_mon,amt_req_credit_bureau_year,sk_id_prev_count,amt_credit_sum_overdue_sum,cnt_drawings_atm_current_avg_avg,cnt_drawings_pos_current_avg_avg,prev_days_decision_avg,prev_days_first_drawing_avg,prev_days_last_due_avg
0,288213,0,Cash loans,953460.0,Commercial associate,Secondary / secondary special,0.046220,116,4106.0,912,...,0.0,0.0,0.0,14.0,0.0,,,933.666667,365243.0,-1176.000000
1,400317,0,Cash loans,284400.0,Pensioner,Secondary / secondary special,0.007020,-365243,13237.0,3346,...,0.0,0.0,5.0,96.0,,,,1095.555556,365243.0,80502.333333
2,191384,0,Cash loans,454500.0,Pensioner,Secondary / secondary special,0.046220,-365243,425.0,1622,...,0.0,0.0,1.0,3.0,0.0,0.3125,0.0,,,
3,412414,0,Cash loans,900000.0,Working,Secondary / secondary special,0.010276,896,776.0,2544,...,0.0,0.0,2.0,35.0,0.0,,,1382.000000,365243.0,-1195.500000
4,429870,1,Cash loans,254700.0,Commercial associate,Secondary / secondary special,0.025164,1224,3112.0,201,...,0.0,0.0,0.0,10.0,0.0,,,169.250000,365243.0,365243.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23999,407577,0,Cash loans,278712.0,Pensioner,Secondary / secondary special,0.020713,-365243,12485.0,568,...,,,,21.0,,,,444.000000,365243.0,182411.000000
24000,275756,0,Cash loans,601470.0,Commercial associate,Secondary / secondary special,0.031329,596,5548.0,2383,...,0.0,0.0,1.0,5.0,0.0,0.0000,0.0,1297.000000,365243.0,-1201.000000
24001,376333,0,Revolving loans,180000.0,Commercial associate,Higher education,0.031329,2716,404.0,3421,...,0.0,0.0,4.0,19.0,0.0,,,238.800000,365243.0,182534.750000
24002,305520,0,Cash loans,780363.0,Working,Secondary / secondary special,0.018029,4392,2980.0,1710,...,0.0,0.0,1.0,21.0,0.0,,,954.666667,365243.0,181750.000000


In [21]:
# persist the correlation-filtered applications table; the row index is not written
final_application_features_filtered.to_csv(
    path_or_buf="../datasets/preprocessed/application_correlation_filtered.csv",
    index=False,
)