In [18]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy.stats import chi2_contingency
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split

%matplotlib inline



In [10]:
# Load the raw workbook. The real field names arrive as the first data row
# and are promoted to the header in a later cell.
credit_data = pd.read_excel('default of credit card clients.xls')
# Large data set (about 30,000 rows). To cut computation time the plan is to
# work on a random sample of 200 rows once the cleaning is done.
credit_data.shape

(30001, 25)

In [11]:
credit_data.head(2)  # peek at the first two rows; the real field names show up as row 0

Unnamed: 0.1,Unnamed: 0,X1,X2,X3,X4,X5,X6,X7,X8,X9,...,X15,X16,X17,X18,X19,X20,X21,X22,X23,Y
0,ID,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default payment next month
1,1,20000,2,2,1,24,2,2,-1,-1,...,0,0,0,0,689,0,0,0,0,1


In [3]:
# Promote the first row (it holds the real field names) to the header and
# drop it from the data. This builds an independent copy instead of renaming
# `credit_data` in place, so the raw frame stays exactly as it was loaded and
# later writes to `credit_data_2` cannot hit a slice of the raw frame.
credit_data_2 = credit_data.iloc[1:].copy()
credit_data_2.columns = credit_data.iloc[0].tolist()
print(credit_data_2.columns)
credit_data_2.head(2)

Index(['ID', 'LIMIT_BAL', 'SEX', 'EDUCATION', 'MARRIAGE', 'AGE', 'PAY_0',
       'PAY_2', 'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6', 'BILL_AMT1', 'BILL_AMT2',
       'BILL_AMT3', 'BILL_AMT4', 'BILL_AMT5', 'BILL_AMT6', 'PAY_AMT1',
       'PAY_AMT2', 'PAY_AMT3', 'PAY_AMT4', 'PAY_AMT5', 'PAY_AMT6',
       'default payment next month'],
      dtype='object')


Unnamed: 0,ID,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default payment next month
1,1,20000,2,2,1,24,2,2,-1,-1,...,0,0,0,0,689,0,0,0,0,1
2,2,120000,2,2,2,26,-1,2,0,0,...,3272,3455,3261,0,1000,1000,1000,0,2000,1


In [4]:
credit_data_2.dtypes  # every column is still dtype object, so a numeric conversion is needed

ID                            object
LIMIT_BAL                     object
SEX                           object
EDUCATION                     object
MARRIAGE                      object
AGE                           object
PAY_0                         object
PAY_2                         object
PAY_3                         object
PAY_4                         object
PAY_5                         object
PAY_6                         object
BILL_AMT1                     object
BILL_AMT2                     object
BILL_AMT3                     object
BILL_AMT4                     object
BILL_AMT5                     object
BILL_AMT6                     object
PAY_AMT1                      object
PAY_AMT2                      object
PAY_AMT3                      object
PAY_AMT4                      object
PAY_AMT5                      object
PAY_AMT6                      object
default payment next month    object
dtype: object

In [5]:
# Cast every column to a numeric dtype. With errors='coerce', any value that
# cannot be parsed becomes NaN instead of raising.
credit_data_3 = credit_data_2.apply(pd.to_numeric, errors='coerce')
credit_data_3.head(2)

Unnamed: 0,ID,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default payment next month
1,1,20000,2,2,1,24,2,2,-1,-1,...,0,0,0,0,689,0,0,0,0,1
2,2,120000,2,2,2,26,-1,2,0,0,...,3272,3455,3261,0,1000,1000,1000,0,2000,1


In [6]:
# Count missing values per column. The numeric conversion used
# errors='coerce', so any unparseable value would show up here as NaN.
# Every count is 0: there are no missing values.
credit_data_3.isna().sum()

ID                            0
LIMIT_BAL                     0
SEX                           0
EDUCATION                     0
MARRIAGE                      0
AGE                           0
PAY_0                         0
PAY_2                         0
PAY_3                         0
PAY_4                         0
PAY_5                         0
PAY_6                         0
BILL_AMT1                     0
BILL_AMT2                     0
BILL_AMT3                     0
BILL_AMT4                     0
BILL_AMT5                     0
BILL_AMT6                     0
PAY_AMT1                      0
PAY_AMT2                      0
PAY_AMT3                      0
PAY_AMT4                      0
PAY_AMT5                      0
PAY_AMT6                      0
default payment next month    0
dtype: int64

In [7]:
# Contingency table of credit limit (rows) against sex (columns); this is the
# input for the chi-square test of independence.
test_p_value = pd.crosstab(
    index=credit_data_3['LIMIT_BAL'],
    columns=credit_data_3['SEX'],
)
test_p_value.head(3)

SEX,1,2
LIMIT_BAL,Unnamed: 1_level_1,Unnamed: 2_level_1
10000,306,187
16000,1,1
20000,1110,866


In [8]:
# Chi-square test of independence between LIMIT_BAL and SEX.
# H0: LIMIT_BAL and SEX are independent.
# H1: they are dependent.
# The p-value is almost 0, far below 0.05, so H0 is rejected.
# NOTE(review): some LIMIT_BAL levels have very few observations (e.g. 16000
# has a count of 1 per sex); the chi-square approximation is weak for such
# sparse cells - confirm whether binning LIMIT_BAL is needed.
chi, p, dof, expected = chi2_contingency(test_p_value)
p

7.747572367063936e-109

In [9]:
# Empty square frame, one row and one column per field, to hold the p-value
# of each pairwise chi-square test.
colnames = credit_data_3.columns
P_crostab = pd.DataFrame(index=colnames, columns=colnames)
P_crostab.head(2)

Unnamed: 0,ID,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default payment next month
ID,,,,,,,,,,,...,,,,,,,,,,
LIMIT_BAL,,,,,,,,,,,...,,,,,,,,,,


In [14]:
# Target: the credit limit (LIMIT_BAL). Features: every other column.
# NOTE(review): X still contains the ID column, which looks like a plain row
# identifier - confirm whether it should be used as a feature.
y = credit_data_3['LIMIT_BAL']
X = credit_data_3.drop(columns=['LIMIT_BAL'])

# 80/20 split. The fixed random_state makes the split - and every result
# derived from it - identical on each Restart & Run All.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, shuffle=True, random_state=42
)



In [16]:
# Random forest with default hyper-parameters. Only random_state is set, so
# the fitted model is the same on every run.
clf = RandomForestClassifier(random_state=42)
# fit() returns the estimator itself, so credit_rf and clf are the same object.
credit_rf = clf.fit(X_train, y_train)



In [17]:
# Predict the held-out rows and show how many predictions came back.
y_pred_test_rf = credit_rf.predict(X_test)
y_pred_test_rf.shape[0]

6000

In [21]:
X_test.head(2)  # first two rows of the held-out features

Unnamed: 0,ID,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default payment next month
18476,18476,2,2,2,26,0,0,0,0,0,...,19693,19894,3473,1300,1500,1250,451,160,69,0
7402,7402,1,3,2,23,0,0,0,0,0,...,19221,18603,19075,1315,1297,672,675,781,734,0


In [24]:
y_test.head(2)  # first two true LIMIT_BAL values of the held-out split

18476    20000
7402     20000
Name: LIMIT_BAL, dtype: int64

In [19]:
# Compare the actual y_test values with the predicted ones: rows are the true
# classes, columns are the predicted classes.
y_pred_test_rf = credit_rf.predict(X_test)
confusion_matrix(y_true=y_test, y_pred=y_pred_test_rf)
# A confusion matrix only makes sense when the target is categorical.

array([[ 48,   0,  28, ...,   0,   0,   0],
       [  1,   0,   0, ...,   0,   0,   0],
       [ 21,   0, 240, ...,   0,   0,   0],
       ...,
       [  0,   0,   0, ...,   0,   0,   0],
       [  0,   0,   0, ...,   0,   0,   0],
       [  0,   0,   0, ...,   0,   0,   0]])

In [None]:
# (Re)create the empty field-by-field frame that will receive the pairwise
# chi-square p-values.
colnames = credit_data_3.columns
P_crostab = pd.DataFrame(index=colnames, columns=colnames)
P_crostab.head(2)

In [None]:
# Fill P_crostab with the chi-square p-value of every pair of columns.
#
# The tests run on a fixed random sample of 200 rows (the reduction planned
# when the data was loaded): a crosstab of two high-cardinality columns over
# the full frame would be far too large to build.
# NOTE(review): many columns are continuous amounts, so most crosstab cells
# are very sparse and these p-values are only a rough screen - confirm before
# drawing conclusions from them.
sample_size = min(200, len(credit_data_3))
credit_sample = credit_data_3.sample(n=sample_size, random_state=42)

for col in colnames:  # column of the result matrix
    for row in colnames:  # row of the result matrix
        # Plain arrays are passed so that comparing a column with itself does
        # not give the crosstab two axes with the same name.
        ct_sub = pd.crosstab(
            credit_sample[col].to_numpy(), credit_sample[row].to_numpy()
        )
        # chi2_contingency returns (statistic, p-value, dof, expected);
        # index 1 is the p-value. A single .loc write replaces the chained
        # P_crostab[col][row] assignment, which can end up writing to a
        # temporary copy.
        P_crostab.loc[row, col] = chi2_contingency(ct_sub)[1]

P_crostab