In [1]:
# imports
import pandas as pd
from scipy.stats import chi2_contingency

In [2]:
# Choosing relevant features is an important step in data analysis.
# The chi-square test and ANOVA are two popular feature-selection techniques;
# this notebook covers the chi-square test in detail for this assessment.

# Load the loan dataset from the Excel workbook (the path is resolved
# relative to the notebook's working directory).
loan_workbook = 'loan.xlsx'
loan_dataset = pd.read_excel(loan_workbook)

In [3]:
# Quick look at the data: show the first ten rows so the column names and
# typical values are visible before any analysis is run.
loan_dataset.head(n=10)

Unnamed: 0,Sex,Age,Time_at_address,Res_status,Telephone,Occupation,Job_status,Time_employed,Time_bank,Liab_ref,Acc_ref,Home_Expn,Balance,Decision
0,M,50.75,0.585,owner,given,unemploye,unemploye,0,0,f,given,145,0,reject
1,M,19.67,10.0,rent,not_given,labourer,governmen,0,0,t,given,140,0,reject
2,F,52.830002,15.0,owner,given,creative_,private_s,5,14,f,given,0,2200,accept
3,M,22.67,2.54,rent,not_given,creative_,governmen,2,0,f,given,0,0,accept
4,M,29.25,13.0,owner,given,driver,governmen,0,0,f,given,228,0,reject
5,F,16.08,0.335,owner,given,unemploye,unemploye,0,1,f,given,160,126,reject
6,M,23.17,11.125,owner,given,professio,governmen,0,1,f,given,100,0,accept
7,F,27.58,3.0,owner,given,manager,private_s,2,1,t,given,280,10,reject
8,F,19.17,5.415,owner,given,guard_etc,governmen,0,0,f,given,80,484,reject
9,F,27.25,0.29,owner,given,manager,governmen,0,1,t,given,272,108,reject


In [4]:
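# (disabled) label-encoding of the categorical columns to integers.
# The lines below are intentionally left commented out: pd.crosstab in the
# following cells works directly on the original string categories, so no
# encoding is needed for the chi-square tests. Re-enabling this cell would
# also require `from sklearn.preprocessing import LabelEncoder`, which is not
# imported in the imports cell.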

# encoder = LabelEncoder()
# loan_dataset['Sex'] = encoder.fit_transform(loan_dataset['Sex'])
# loan_dataset['Res_status'] = encoder.fit_transform(loan_dataset['Res_status'])
# loan_dataset['Telephone'] = encoder.fit_transform(loan_dataset['Telephone'])
# loan_dataset['Occupation'] = encoder.fit_transform(loan_dataset['Occupation'])
# loan_dataset['Job_status'] = encoder.fit_transform(loan_dataset['Job_status'])
# loan_dataset['Liab_ref'] = encoder.fit_transform(loan_dataset['Liab_ref'])
# loan_dataset['Acc_ref'] = encoder.fit_transform(loan_dataset['Acc_ref'])
# loan_dataset['Decision'] = encoder.fit_transform(loan_dataset['Decision'])

In [5]:
# We use chi2_contingency rather than scipy.stats.chisquare because the
# question here is whether two categorical variables are independent.
# Null hypothesis: the feature and the loan Decision are independent, i.e.
# there is no significant difference in Decision between the feature's groups.

# Observed counts of Decision for each Res_status category.
crosstable_Res_status = pd.crosstab(
    index=loan_dataset['Res_status'],
    columns=loan_dataset['Decision'],
)
print(crosstable_Res_status, '\n')

# chi2_contingency yields, in order: the test statistic, the p-value, the
# degrees of freedom and the expected counts under independence.
# `df_Res_status` is the degrees of freedom, not a DataFrame.
# When the table is 2x2 (one degree of freedom) SciPy applies Yates'
# continuity correction by default.
(
    chi_val_Res_status,
    p_val_Res_status,
    df_Res_status,
    exp_Res_status,
) = chi2_contingency(crosstable_Res_status)

# Bare tuple as the last expression so the notebook displays all four values.
(chi_val_Res_status, p_val_Res_status, df_Res_status, exp_Res_status)
################################

Decision    accept  reject
Res_status                
owner          161     171
rent            31      66 



(7.645830831133978,
 0.005690368175836289,
 1,
 array([[148.58741259, 183.41258741],
        [ 43.41258741,  53.58741259]]))

In [7]:
# Chi-square test of independence between Telephone and the loan Decision.
# NOTE(review): the recorded output of this cell is identical to the
# Res_status cell, and in the previewed rows owner/given and rent/not_given
# always coincide -- the two columns look perfectly correlated; confirm on the
# full dataset before treating them as separate features.
crosstable_Telephone = pd.crosstab(
    index=loan_dataset['Telephone'],
    columns=loan_dataset['Decision'],
)
print(crosstable_Telephone, '\n')

# Result order: statistic, p-value, degrees of freedom, expected counts.
# `df_Telephone` is the degrees of freedom, not a DataFrame.
(
    chi_val_Telephone,
    p_val_Telephone,
    df_Telephone,
    exp_Telephone,
) = chi2_contingency(crosstable_Telephone)

# Bare tuple as the last expression so the notebook displays all four values.
(chi_val_Telephone, p_val_Telephone, df_Telephone, exp_Telephone)
################################

Decision   accept  reject
Telephone                
given         161     171
not_given      31      66 



(7.645830831133978,
 0.005690368175836289,
 1,
 array([[148.58741259, 183.41258741],
        [ 43.41258741,  53.58741259]]))

In [7]:
# Chi-square test of independence between Occupation and the loan Decision.
crosstable_Occupation = pd.crosstab(
    index=loan_dataset['Occupation'],
    columns=loan_dataset['Decision'],
)
print(crosstable_Occupation, '\n')

# Result order: statistic, p-value, degrees of freedom, expected counts.
# `df_Occupation` is the degrees of freedom, not a DataFrame.
(
    chi_val_Occupation,
    p_val_Occupation,
    df_Occupation,
    exp_Occupation,
) = chi2_contingency(crosstable_Occupation)

# Bare tuple as the last expression so the notebook displays all four values.
(chi_val_Occupation, p_val_Occupation, df_Occupation, exp_Occupation)
################################

Decision    accept  reject
Occupation                
creative_       44      48
driver           4      12
executive       12       8
guard_etc        8      27
labourer        12      30
manager         11      17
office_st       31      18
productio       22      23
professio       17       4
sales           12      19
semi_pro        15       5
unemploye        4      26 



(54.399272667846525,
 9.980070508394483e-08,
 11,
 array([[41.17482517, 50.82517483],
        [ 7.16083916,  8.83916084],
        [ 8.95104895, 11.04895105],
        [15.66433566, 19.33566434],
        [18.7972028 , 23.2027972 ],
        [12.53146853, 15.46853147],
        [21.93006993, 27.06993007],
        [20.13986014, 24.86013986],
        [ 9.3986014 , 11.6013986 ],
        [13.87412587, 17.12587413],
        [ 8.95104895, 11.04895105],
        [13.42657343, 16.57342657]]))

In [9]:
# Chi-square test of independence between Job_status and the loan Decision.
crosstable_Job_status = pd.crosstab(loan_dataset['Job_status'], loan_dataset['Decision'])

print(crosstable_Job_status, '\n')

# chi2_contingency returns, in order: the test statistic, the p-value, the
# degrees of freedom and the expected counts under independence.
# `df_Job_status` is the degrees of freedom, not a DataFrame.
chi_val_Job_status, p_val_Job_status, df_Job_status, exp_Job_status = chi2_contingency(crosstable_Job_status)

# Validity check: the chi-square approximation is generally considered
# reliable only when every expected count is at least 5. Job_status has
# several very small categories, so flag sparse cells explicitly instead of
# silently reporting a p-value that may not be trustworthy.
n_sparse_Job_status = int((exp_Job_status < 5).sum())
if n_sparse_Job_status:
    print('WARNING:', n_sparse_Job_status, 'of', exp_Job_status.size,
          'expected counts are below 5; the chi-square p-value for Job_status may be unreliable.',
          '\n')

# Bare tuple as the last expression so the notebook displays all four values.
chi_val_Job_status, p_val_Job_status, df_Job_status, exp_Job_status
################################

Decision    accept  reject
Job_status                
governmen       50      32
military         0       1
private_s      114     151
retired          6       2
self_empl       16      21
student          2       3
unemploye        4      27 



(25.619055363910597,
 0.0002621149847778766,
 6,
 array([[ 36.6993007 ,  45.3006993 ],
        [  0.44755245,   0.55244755],
        [118.6013986 , 146.3986014 ],
        [  3.58041958,   4.41958042],
        [ 16.55944056,  20.44055944],
        [  2.23776224,   2.76223776],
        [ 13.87412587,  17.12587413]]))

In [10]:
# Chi-square test of independence between Liab_ref and the loan Decision.
crosstable_Liab_ref = pd.crosstab(
    index=loan_dataset['Liab_ref'],
    columns=loan_dataset['Decision'],
)
print(crosstable_Liab_ref, '\n')

# Result order: statistic, p-value, degrees of freedom, expected counts.
# `df_Liab_ref` is the degrees of freedom, not a DataFrame.
(
    chi_val_Liab_ref,
    p_val_Liab_ref,
    df_Liab_ref,
    exp_Liab_ref,
) = chi2_contingency(crosstable_Liab_ref)

# Bare tuple as the last expression so the notebook displays all four values.
(chi_val_Liab_ref, p_val_Liab_ref, df_Liab_ref, exp_Liab_ref)
################################

Decision  accept  reject
Liab_ref                
f             97     136
t             95     101 



(1.7463603456735195,
 0.1863349426112134,
 1,
 array([[104.27972028, 128.72027972],
        [ 87.72027972, 108.27972028]]))

In [11]:
# Chi-square test of independence between Acc_ref and the loan Decision.
crosstable_Acc_ref = pd.crosstab(
    index=loan_dataset['Acc_ref'],
    columns=loan_dataset['Decision'],
)
print(crosstable_Acc_ref, '\n')

# Result order: statistic, p-value, degrees of freedom, expected counts.
# `df_Acc_ref` is the degrees of freedom, not a DataFrame.
(
    chi_val_Acc_ref,
    p_val_Acc_ref,
    df_Acc_ref,
    exp_Acc_ref,
) = chi2_contingency(crosstable_Acc_ref)

# Bare tuple as the last expression so the notebook displays all four values.
(chi_val_Acc_ref, p_val_Acc_ref, df_Acc_ref, exp_Acc_ref)
################################

Decision   accept  reject
Acc_ref                  
given         184     214
oth_inst_       8      23 



(4.061563099061301,
 0.04386987339959904,
 1,
 array([[178.12587413, 219.87412587],
        [ 13.87412587,  17.12587413]]))