In [1]:
import pandas as pd
from IPython.display import display
from collections import Counter
from scipy.stats import chi2_contingency
from sklearn.metrics import matthews_corrcoef
from sklearn.preprocessing import LabelEncoder

from stats_test import matthews_significance_test
from utils import path_join, load_train

In [2]:
train_filename = "application_train_binary.csv"
path_to_data = "../data/"

### load train binary

In [3]:
df_application_train_binary = load_train(
    filename=train_filename,
    path_to_train_folder=path_to_data,
)

### contingency matrix

In [4]:
# chi-square test of independence of variables in a contingency matrix
for column in df_application_train_binary.columns[1:]:  # exclude target
    
    feature = df_application_train_binary[column]
    target = df_application_train_binary['TARGET']
    
    target_crosstab = pd.crosstab(
        index=feature,
        columns=target,
        margins=True,
    )
    display(target_crosstab)
    
    _, p_value, _, _ = chi2_contingency(target_crosstab)
    print(f"p-value: {round(p_value, 3)}\n")

TARGET,0,1,All
NAME_CONTRACT_TYPE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Cash loans,255011,23221,278232
Revolving loans,27675,1604,29279
All,282686,24825,307511


p-value: 0.0



TARGET,0,1,All
FLAG_OWN_CAR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
N,185675,17249,202924
Y,97011,7576,104587
All,282686,24825,307511


p-value: 0.0



TARGET,0,1,All
FLAG_OWN_REALTY,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
N,86357,7842,94199
Y,196329,16983,213312
All,282686,24825,307511


p-value: 0.02



TARGET,0,1,All
FLAG_MOBIL,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,1,0,1
1,282685,24825,307510
All,282686,24825,307511


p-value: 0.999



TARGET,0,1,All
FLAG_EMP_PHONE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,52395,2991,55386
1,230291,21834,252125
All,282686,24825,307511


p-value: 0.0



TARGET,0,1,All
FLAG_WORK_PHONE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,227282,18921,246203
1,55404,5904,61308
All,282686,24825,307511


p-value: 0.0



TARGET,0,1,All
FLAG_CONT_MOBILE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,529,45,574
1,282157,24780,306937
All,282686,24825,307511


p-value: 1.0



TARGET,0,1,All
FLAG_PHONE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,202336,18744,221080
1,80350,6081,86431
All,282686,24825,307511


p-value: 0.0



TARGET,0,1,All
FLAG_EMAIL,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,266618,23451,290069
1,16068,1374,17442
All,282686,24825,307511


p-value: 0.917



TARGET,0,1,All
REG_REGION_NOT_LIVE_REGION,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,278462,24392,302854
1,4224,433,4657
All,282686,24825,307511


p-value: 0.049



TARGET,0,1,All
REG_REGION_NOT_WORK_REGION,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,268462,23437,291899
1,14224,1388,15612
All,282686,24825,307511


p-value: 0.005



TARGET,0,1,All
LIVE_REGION_NOT_WORK_REGION,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,271239,23769,295008
1,11447,1056,12503
All,282686,24825,307511


p-value: 0.655



TARGET,0,1,All
REG_CITY_NOT_LIVE_CITY,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,261586,21886,283472
1,21100,2939,24039
All,282686,24825,307511


p-value: 0.0



TARGET,0,1,All
REG_CITY_NOT_WORK_CITY,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,219339,17305,236644
1,63347,7520,70867
All,282686,24825,307511


p-value: 0.0



TARGET,0,1,All
LIVE_CITY_NOT_WORK_CITY,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,232974,19322,252296
1,49712,5503,55215
All,282686,24825,307511


p-value: 0.0



TARGET,0,1,All
EMERGENCYSTATE_MODE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
No,148324,11104,159428
Yes,2105,223,2328
All,150429,11327,161756


p-value: 0.0



TARGET,0,1,All
FLAG_DOCUMENT_2,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,282677,24821,307498
1,9,4,13
All,282686,24825,307511


p-value: 0.061



TARGET,0,1,All
FLAG_DOCUMENT_3,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,83658,5513,89171
1,199028,19312,218340
All,282686,24825,307511


p-value: 0.0



TARGET,0,1,All
FLAG_DOCUMENT_4,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,282661,24825,307486
1,25,0,25
All,282686,24825,307511


p-value: 0.7



TARGET,0,1,All
FLAG_DOCUMENT_5,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,278410,24453,302863
1,4276,372,4648
All,282686,24825,307511


p-value: 1.0



TARGET,0,1,All
FLAG_DOCUMENT_6,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,257115,23318,280433
1,25571,1507,27078
All,282686,24825,307511


p-value: 0.0



TARGET,0,1,All
FLAG_DOCUMENT_7,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,282630,24822,307452
1,56,3,59
All,282686,24825,307511


p-value: 0.95



TARGET,0,1,All
FLAG_DOCUMENT_8,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,259498,22989,282487
1,23188,1836,25024
All,282686,24825,307511


p-value: 0.001



TARGET,0,1,All
FLAG_DOCUMENT_9,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,281562,24751,306313
1,1124,74,1198
All,282686,24825,307511


p-value: 0.213



TARGET,0,1,All
FLAG_DOCUMENT_10,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,282679,24825,307504
1,7,0,7
All,282686,24825,307511


p-value: 0.961



TARGET,0,1,All
FLAG_DOCUMENT_11,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,281558,24750,306308
1,1128,75,1203
All,282686,24825,307511


p-value: 0.24



TARGET,0,1,All
FLAG_DOCUMENT_12,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,282684,24825,307509
1,2,0,2
All,282686,24825,307511


p-value: 0.996



TARGET,0,1,All
FLAG_DOCUMENT_13,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,281632,24795,306427
1,1054,30,1084
All,282686,24825,307511


p-value: 0.0



TARGET,0,1,All
FLAG_DOCUMENT_14,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,281813,24795,306608
1,873,30,903
All,282686,24825,307511


p-value: 0.0



TARGET,0,1,All
FLAG_DOCUMENT_15,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,282325,24814,307139
1,361,11,372
All,282686,24825,307511


p-value: 0.011



TARGET,0,1,All
FLAG_DOCUMENT_16,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,279783,24675,304458
1,2903,150,3053
All,282686,24825,307511


p-value: 0.0



TARGET,0,1,All
FLAG_DOCUMENT_17,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,282606,24823,307429
1,80,2,82
All,282686,24825,307511


p-value: 0.477



TARGET,0,1,All
FLAG_DOCUMENT_18,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,280328,24683,305011
1,2358,142,2500
All,282686,24825,307511


p-value: 0.001



TARGET,0,1,All
FLAG_DOCUMENT_19,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,282515,24813,307328
1,171,12,183
All,282686,24825,307511


p-value: 0.967



TARGET,0,1,All
FLAG_DOCUMENT_20,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,282543,24812,307355
1,143,13,156
All,282686,24825,307511


p-value: 1.0



TARGET,0,1,All
FLAG_DOCUMENT_21,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,282597,24811,307408
1,89,14,103
All,282686,24825,307511


p-value: 0.376



In [5]:
# chi-square test significance matthews_corrcoef
for column in df_application_train_binary.columns[1:]:  # exclude target
    
    n_samples = sum(~df_application_train_binary[column].isna())
    target = df_application_train_binary['TARGET']
    
    if pd.api.types.is_string_dtype(df_application_train_binary[column]):
        feature = LabelEncoder().fit_transform(df_application_train_binary[column])
    else:
        feature = df_application_train_binary[column]
    
    matthews_coef = matthews_corrcoef(
        y_pred=feature,
        y_true=target,
    )
    
    _, p_value = matthews_significance_test(
        matthews_coef=matthews_coef,
        n_samples=n_samples,
        alternative='two-sided',
    )
    print(f"{column} p-value: {round(p_value, 3)}")

NAME_CONTRACT_TYPE p-value: 0.0
FLAG_OWN_CAR p-value: 0.0
FLAG_OWN_REALTY p-value: 0.001
FLAG_MOBIL p-value: 1.534
FLAG_EMP_PHONE p-value: 0.0
FLAG_WORK_PHONE p-value: 0.0
FLAG_CONT_MOBILE p-value: 1.675
FLAG_PHONE p-value: 0.0
FLAG_EMAIL p-value: 0.659
REG_REGION_NOT_LIVE_REGION p-value: 0.004
REG_REGION_NOT_WORK_REGION p-value: 0.0
LIVE_REGION_NOT_WORK_REGION p-value: 0.236
REG_CITY_NOT_LIVE_CITY p-value: 0.0
REG_CITY_NOT_WORK_CITY p-value: 0.0
LIVE_CITY_NOT_WORK_CITY p-value: 0.0
EMERGENCYSTATE_MODE p-value: 0.0
FLAG_DOCUMENT_2 p-value: 0.005
FLAG_DOCUMENT_3 p-value: 0.0
FLAG_DOCUMENT_4 p-value: 0.277
FLAG_DOCUMENT_5 p-value: 1.722
FLAG_DOCUMENT_6 p-value: 0.0
FLAG_DOCUMENT_7 p-value: 0.799
FLAG_DOCUMENT_8 p-value: 0.0
FLAG_DOCUMENT_9 p-value: 0.032
FLAG_DOCUMENT_10 p-value: 0.866
FLAG_DOCUMENT_11 p-value: 0.038
FLAG_DOCUMENT_12 p-value: 1.35
FLAG_DOCUMENT_13 p-value: 0.0
FLAG_DOCUMENT_14 p-value: 0.0
FLAG_DOCUMENT_15 p-value: 0.001
FLAG_DOCUMENT_16 p-value: 0.0
FLAG_DOCUMENT_17 p-v

#### gender

In [6]:
gender = pd.read_csv(
    path_join("application_train.csv"),
    index_col="SK_ID_CURR",
    usecols=["SK_ID_CURR", "CODE_GENDER"],
).squeeze()

In [7]:
Counter(gender)

Counter({'M': 105059, 'F': 202448, 'XNA': 4})

In [8]:
# filter XNA
gender_filter = (gender != 'XNA')

In [9]:
target_crosstab = pd.crosstab(
    index=gender[gender_filter],
    columns=df_application_train_binary['TARGET'][gender_filter],
    margins=True,
)
target_crosstab

TARGET,0,1,All
CODE_GENDER,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
F,188278,14170,202448
M,94404,10655,105059
All,282682,24825,307507


In [10]:
_, p_value, _, _ = chi2_contingency(target_crosstab)
print(f"p-value: {round(p_value, 3)}\n")

p-value: 0.0



In [11]:
matthews_coef = matthews_corrcoef(
    y_pred=LabelEncoder().fit_transform(gender[gender_filter]),
    y_true=df_application_train_binary['TARGET'][gender_filter],
)
matthews_coef

0.05471012101772606

In [12]:
_, p_value = matthews_significance_test(
    matthews_coef=matthews_coef,
    n_samples=sum(~gender[gender_filter].isna()),
    alternative='two-sided',
)
print(f"gender p-value: {round(p_value, 3)}")

gender p-value: 0.0
