# This research aimed at the case of customers' default payments in Taiwan and compares the predictive accuracy of probability of default among six data mining methods. From the perspective of risk management, the result of predictive accuracy of the estimated probability of default will be more valuable than the binary result of classification - credible or not credible clients. Because the real probability of default is unknown, this study presented the novel "Sorting Smoothing Method" to estimate the real probability of default. With the real probability of default as the response variable (Y), and the predictive probability of default as the independent variable (X), the simple linear regression result (Y = A + BX) shows that the forecasting model produced by artificial neural network has the highest coefficient of determination; its regression intercept (A) is close to zero, and regression coefficient (B) to one. Therefore, among the six data mining techniques, artificial neural network is the only one that can accurately estimate the real probability of default.

In [None]:
import os
# Step up one directory so the relative data path used when loading the
# spreadsheet resolves from the parent folder.
os.chdir("..")
import matplotlib
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import warnings
# Silence every Python warning for the rest of the session. NOTE(review):
# this also hides deprecation and data-conversion warnings from the
# libraries used below.
warnings.filterwarnings("ignore")
import statsmodels.api as sm

# Apply seaborn's "white" theme to the plots drawn afterwards.
sns.set(style="white") 

In [None]:
# Load the credit-card default dataset. The path is assembled with
# os.path.join so it also resolves on systems where "\" is not a path
# separator (the original hard-coded a Windows-style separator).
# header=1 takes the column labels from the second spreadsheet row.
data_path = os.path.join(
    "ADS-Spring2-2020_Cohort 19", "default of credit card clients.xls"
)
data = pd.read_excel(data_path, header=1)
# Preview the first rows to confirm the columns were parsed as expected.
data.head()


In [None]:
# Show the dtype pandas inferred for each column.
data.dtypes

In [None]:
# Summary statistics for the columns of the frame.
data.describe()

In [None]:
# Switch matplotlib to the 'ggplot' style sheet before drawing.
matplotlib.style.use('ggplot')

# Box plot of every column on a wide 20x10 canvas.
data.plot.box(figsize=(20, 10))

In [None]:
# Semi-transparent histogram of the LIMIT_BAL column.
data['LIMIT_BAL'].plot.hist(alpha=0.5, figsize=(20, 10))

In [None]:
# Semi-transparent histogram of the SEX column.
data['SEX'].plot.hist(alpha=0.5, figsize=(20, 10))

In [None]:
# LIMIT_BAL: one box plot panel per value of 'default payment next month'.
g = sns.FacetGrid(
    data,
    col='default payment next month',
    height=10,
    aspect=.5,
).map(sns.boxplot, 'LIMIT_BAL')

In [None]:
# AGE: one box plot panel per value of 'default payment next month'.
h = sns.FacetGrid(
    data,
    col='default payment next month',
    height=10,
    aspect=.5,
).map(sns.boxplot, 'AGE')

In [None]:
# Baseline classifier: predict next-month default from the credit limit and
# the demographic columns only.
X = data[['LIMIT_BAL', 'SEX', 'EDUCATION', 'MARRIAGE', 'AGE']]
y = data[['default payment next month']]  # kept as a one-column frame
# Hold out half of the rows for evaluation. random_state pins the shuffle so
# the score printed below is reproducible from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.5, random_state=42
)
lr = LogisticRegression()
# scikit-learn expects a 1-D target; passing the one-column frame directly
# raises a DataConversionWarning (hidden by the global warnings filter), so
# the target is flattened explicitly.
model = lr.fit(X_train, y_train.values.ravel())
print("Score:")
# Mean accuracy on the held-out half.
print(model.score(X_test, y_test.values.ravel()))

In [None]:
# Per-class precision / recall / F1 on the held-out rows. `model` is the
# classifier already fitted on X_train / y_train in the preceding cell;
# refitting it here on the same data was redundant, so it is reused as-is.
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))

In [None]:
# Print the fitted coefficients followed by the intercept of `model`.
print(model.coef_, model.intercept_)

In [None]:
# Fit the same specification with statsmodels to get its summary table.
# sm.Logit does not add an intercept on its own, so a constant column is
# prepended explicitly; without it the model is forced through the origin
# and is not comparable to the scikit-learn fit, which estimates an
# intercept by default.
logit_model = sm.Logit(y_train, sm.add_constant(X_train))
result = logit_model.fit()
print(result.summary())

In [None]:
# Compute the pairwise correlation matrix of the columns
corr = data.corr()
# Build a boolean mask that hides the upper triangle (diagonal included).
# The builtin `bool` is used because the `np.bool` alias was deprecated in
# NumPy 1.20 and removed in 1.24, where it raises AttributeError.
mask = np.zeros_like(corr, dtype=bool)
mask[np.triu_indices_from(mask)] = True
# Set up the matplotlib figure
f, ax = plt.subplots(figsize=(11, 9))
# Generate a custom diverging colormap
cmap = sns.diverging_palette(220, 10, as_cmap=True)
# Draw the heatmap with the mask and a square aspect ratio; only every
# fifth tick label is shown on each axis to keep the labels readable.
sns.heatmap(corr, mask=mask, cmap=cmap, vmax=.3,
            square=True, xticklabels=5, yticklabels=5,
            linewidths=.5, cbar_kws={"shrink": .5}, ax=ax)