In [1]:
import pandas as pd
from IPython.display import display
from collections import Counter
from scipy.stats import chi2_contingency

from utils import load_train

In [2]:
path_to_data = "../data/"

### load train

In [3]:
df_application_train = load_train(path_to_train_folder=path_to_data)

### binary features

In [4]:
binary_columns = df_application_train.columns[df_application_train.nunique() == 2].tolist()

In [5]:
binary_columns

['TARGET',
 'NAME_CONTRACT_TYPE',
 'FLAG_OWN_CAR',
 'FLAG_OWN_REALTY',
 'FLAG_MOBIL',
 'FLAG_EMP_PHONE',
 'FLAG_WORK_PHONE',
 'FLAG_CONT_MOBILE',
 'FLAG_PHONE',
 'FLAG_EMAIL',
 'REG_REGION_NOT_LIVE_REGION',
 'REG_REGION_NOT_WORK_REGION',
 'LIVE_REGION_NOT_WORK_REGION',
 'REG_CITY_NOT_LIVE_CITY',
 'REG_CITY_NOT_WORK_CITY',
 'LIVE_CITY_NOT_WORK_CITY',
 'EMERGENCYSTATE_MODE',
 'FLAG_DOCUMENT_2',
 'FLAG_DOCUMENT_3',
 'FLAG_DOCUMENT_4',
 'FLAG_DOCUMENT_5',
 'FLAG_DOCUMENT_6',
 'FLAG_DOCUMENT_7',
 'FLAG_DOCUMENT_8',
 'FLAG_DOCUMENT_9',
 'FLAG_DOCUMENT_10',
 'FLAG_DOCUMENT_11',
 'FLAG_DOCUMENT_12',
 'FLAG_DOCUMENT_13',
 'FLAG_DOCUMENT_14',
 'FLAG_DOCUMENT_15',
 'FLAG_DOCUMENT_16',
 'FLAG_DOCUMENT_17',
 'FLAG_DOCUMENT_18',
 'FLAG_DOCUMENT_19',
 'FLAG_DOCUMENT_20',
 'FLAG_DOCUMENT_21']

In [6]:
print(f"number of binary features: {len(binary_columns)}")

number of binary features: 37


### contingency matrix

In [7]:
# chi-square test of independence of variables in a contingency matrix
for column in binary_columns:
    
    target_crosstab = pd.crosstab(
        index=df_application_train[column],
        columns=df_application_train['TARGET'],
        margins=True,
    )
    display(target_crosstab)
    
    _, p_value, _, _ = chi2_contingency(target_crosstab)
    print(f"p-value: {p_value}\n")


TARGET,0,1,All
TARGET,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,282686,0,282686
1,0,24825,24825
All,282686,24825,307511


p-value: 0.0



TARGET,0,1,All
NAME_CONTRACT_TYPE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Cash loans,255011,23221,278232
Revolving loans,27675,1604,29279
All,282686,24825,307511


p-value: 2.6845456940854183e-62



TARGET,0,1,All
FLAG_OWN_CAR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
N,185675,17249,202924
Y,97011,7576,104587
All,282686,24825,307511


p-value: 9.748590738786988e-31



TARGET,0,1,All
FLAG_OWN_REALTY,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
N,86357,7842,94199
Y,196329,16983,213312
All,282686,24825,307511


p-value: 0.02037128923282243



TARGET,0,1,All
FLAG_MOBIL,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,1,0,1
1,282685,24825,307510
All,282686,24825,307511


p-value: 0.99906374744738



TARGET,0,1,All
FLAG_EMP_PHONE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,52395,2991,55386
1,230291,21834,252125
All,282686,24825,307511


p-value: 2.119842327116858e-139



TARGET,0,1,All
FLAG_WORK_PHONE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,227282,18921,246203
1,55404,5904,61308
All,282686,24825,307511


p-value: 5.888114280759174e-53



TARGET,0,1,All
FLAG_CONT_MOBILE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,529,45,574
1,282157,24780,306937
All,282686,24825,307511


p-value: 0.9997812539105887



TARGET,0,1,All
FLAG_PHONE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,202336,18744,221080
1,80350,6081,86431
All,282686,24825,307511


p-value: 1.262133206787528e-36



TARGET,0,1,All
FLAG_EMAIL,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,266618,23451,290069
1,16068,1374,17442
All,282686,24825,307511


p-value: 0.9171627113035286



TARGET,0,1,All
REG_REGION_NOT_LIVE_REGION,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,278462,24392,302854
1,4224,433,4657
All,282686,24825,307511


p-value: 0.04851136161171001



TARGET,0,1,All
REG_REGION_NOT_WORK_REGION,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,268462,23437,291899
1,14224,1388,15612
All,282686,24825,307511


p-value: 0.0050917734853862286



TARGET,0,1,All
LIVE_REGION_NOT_WORK_REGION,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,271239,23769,295008
1,11447,1056,12503
All,282686,24825,307511


p-value: 0.6545922180328126



TARGET,0,1,All
REG_CITY_NOT_LIVE_CITY,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,261586,21886,283472
1,21100,2939,24039
All,282686,24825,307511


p-value: 7.455408611968837e-130



TARGET,0,1,All
REG_CITY_NOT_WORK_CITY,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,219339,17305,236644
1,63347,7520,70867
All,282686,24825,307511


p-value: 9.087791669062205e-172



TARGET,0,1,All
LIVE_CITY_NOT_WORK_CITY,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,232974,19322,252296
1,49712,5503,55215
All,282686,24825,307511


p-value: 4.007303274167387e-69



TARGET,0,1,All
EMERGENCYSTATE_MODE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
No,148324,11104,159428
Yes,2105,223,2328
All,150429,11327,161756


p-value: 7.705359894297366e-05



TARGET,0,1,All
FLAG_DOCUMENT_2,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,282677,24821,307498
1,9,4,13
All,282686,24825,307511


p-value: 0.060501164963493136



TARGET,0,1,All
FLAG_DOCUMENT_3,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,83658,5513,89171
1,199028,19312,218340
All,282686,24825,307511


p-value: 1.452251791868989e-129



TARGET,0,1,All
FLAG_DOCUMENT_4,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,282661,24825,307486
1,25,0,25
All,282686,24825,307511


p-value: 0.6998283755865763



TARGET,0,1,All
FLAG_DOCUMENT_5,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,278410,24453,302863
1,4276,372,4648
All,282686,24825,307511


p-value: 0.9998836625809906



TARGET,0,1,All
FLAG_DOCUMENT_6,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,257115,23318,280433
1,25571,1507,27078
All,282686,24825,307511


p-value: 2.995096723773805e-53



TARGET,0,1,All
FLAG_DOCUMENT_7,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,282630,24822,307452
1,56,3,59
All,282686,24825,307511


p-value: 0.9500888919653433



TARGET,0,1,All
FLAG_DOCUMENT_8,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,259498,22989,282487
1,23188,1836,25024
All,282686,24825,307511


p-value: 0.0005274108654631586



TARGET,0,1,All
FLAG_DOCUMENT_9,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,281562,24751,306313
1,1124,74,1198
All,282686,24825,307511


p-value: 0.21257860163581757



TARGET,0,1,All
FLAG_DOCUMENT_10,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,282679,24825,307504
1,7,0,7
All,282686,24825,307511


p-value: 0.9614115123984657



TARGET,0,1,All
FLAG_DOCUMENT_11,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,281558,24750,306308
1,1128,75,1203
All,282686,24825,307511


p-value: 0.2396792229371515



TARGET,0,1,All
FLAG_DOCUMENT_12,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,282684,24825,307509
1,2,0,2
All,282686,24825,307511


p-value: 0.9963624179203493



TARGET,0,1,All
FLAG_DOCUMENT_13,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,281632,24795,306427
1,1054,30,1084
All,282686,24825,307511


p-value: 2.375493460034972e-08



TARGET,0,1,All
FLAG_DOCUMENT_14,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,281813,24795,306608
1,873,30,903
All,282686,24825,307511


p-value: 1.544409215710728e-05



TARGET,0,1,All
FLAG_DOCUMENT_15,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,282325,24814,307139
1,361,11,372
All,282686,24825,307511


p-value: 0.010633377018992666



TARGET,0,1,All
FLAG_DOCUMENT_16,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,279783,24675,304458
1,2903,150,3053
All,282686,24825,307511


p-value: 2.1344406270156996e-08



TARGET,0,1,All
FLAG_DOCUMENT_17,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,282606,24823,307429
1,80,2,82
All,282686,24825,307511


p-value: 0.4766512379093101



TARGET,0,1,All
FLAG_DOCUMENT_18,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,280328,24683,305011
1,2358,142,2500
All,282686,24825,307511


p-value: 0.0006418729273121938



TARGET,0,1,All
FLAG_DOCUMENT_19,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,282515,24813,307328
1,171,12,183
All,282686,24825,307511


p-value: 0.9666913795720936



TARGET,0,1,All
FLAG_DOCUMENT_20,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,282543,24812,307355
1,143,13,156
All,282686,24825,307511


p-value: 0.9999746780322938



TARGET,0,1,All
FLAG_DOCUMENT_21,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,282597,24811,307408
1,89,14,103
All,282686,24825,307511


p-value: 0.37583962179951236



#### gender

In [8]:
Counter(df_application_train['CODE_GENDER'])

Counter({'M': 105059, 'F': 202448, 'XNA': 4})

In [9]:
# filter XNA
gender_filter = (df_application_train['CODE_GENDER'] != 'XNA')

In [10]:
target_crosstab = pd.crosstab(
    index=df_application_train['CODE_GENDER'][gender_filter],
    columns=df_application_train['TARGET'][gender_filter],
    margins=True,
)
target_crosstab

TARGET,0,1,All
CODE_GENDER,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
F,188278,14170,202448
M,94404,10655,105059
All,282682,24825,307507


In [11]:
_, p_value, _, _ = chi2_contingency(target_crosstab)
print(f"p-value: {p_value}\n")

p-value: 6.241024167295911e-198

