In [5]:
#!wget https://github.com/gastonstat/CreditScoring/raw/master/CreditScoring.csv
#mkdir data

In [25]:
# import relevant libraries
import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import roc_auc_score
from sklearn.tree import export_text
%matplotlib inline

In [26]:
# read the file
df = pd.read_csv('data/CreditScoring.csv')
df.columns = df.columns.str.lower()
df.head()

Unnamed: 0,status,seniority,home,time,age,marital,records,job,expenses,income,assets,debt,amount,price
0,1,9,1,60,30,2,1,3,73,129,0,0,800,846
1,1,17,1,60,58,3,1,1,48,131,0,0,1000,1658
2,2,10,2,36,46,2,2,3,90,200,3000,0,2000,2985
3,1,0,1,60,24,1,1,1,63,182,2500,0,900,1325
4,1,0,1,36,26,1,1,1,46,107,0,0,310,910


In [27]:
# convert diverse columns for providing more information 
status_values = {1: 'ok', 2: 'default', 0: 'unknown'}
df['status']= df['status'].map(status_values)

home_values = {1: 'rent', 2: 'owner', 3: 'private', 4: 'ignore', 5: 'parents', 6: 'other', 0: 'unknown'}
df['home'] = df['home'].map(home_values)

marital_values = {1: 'single', 2: 'married', 3: 'widow', 4: 'separated', 5: 'divorced', 0: 'unknown'}
df['marital'] = df['marital'].map(marital_values)

records_values = {1: 'no', 2: 'yes', 0: 'unknown'}
df['records'] = df['records'].map(records_values)

job_values = {1: 'fixed', 2: 'partime', 3: 'freelance', 4: 'others', 0: 'unknown'}
df['job'] = df['job'].map(job_values)

df.head()

Unnamed: 0,status,seniority,home,time,age,marital,records,job,expenses,income,assets,debt,amount,price
0,ok,9,rent,60,30,married,no,freelance,73,129,0,0,800,846
1,ok,17,rent,60,58,widow,no,fixed,48,131,0,0,1000,1658
2,default,10,owner,36,46,married,yes,freelance,90,200,3000,0,2000,2985
3,ok,0,rent,60,24,single,no,fixed,63,182,2500,0,900,1325
4,ok,0,rent,36,26,single,no,fixed,46,107,0,0,310,910


In [28]:
df.describe().round()

Unnamed: 0,seniority,time,age,expenses,income,assets,debt,amount,price
count,4455.0,4455.0,4455.0,4455.0,4455.0,4455.0,4455.0,4455.0,4455.0
mean,8.0,46.0,37.0,56.0,763317.0,1060341.0,404382.0,1039.0,1463.0
std,8.0,15.0,11.0,20.0,8703625.0,10217569.0,6344253.0,475.0,628.0
min,0.0,6.0,18.0,35.0,0.0,0.0,0.0,100.0,105.0
25%,2.0,36.0,28.0,35.0,80.0,0.0,0.0,700.0,1118.0
50%,5.0,48.0,36.0,51.0,120.0,3500.0,0.0,1000.0,1400.0
75%,12.0,60.0,45.0,72.0,166.0,6000.0,0.0,1300.0,1692.0
max,48.0,72.0,68.0,180.0,99999999.0,99999999.0,99999999.0,5000.0,11140.0


In [29]:
# max value in some columns is 99999999, we need to convert them to NA
for i in ['income', 'assets', 'debt']:
    df[i] = df[i].replace(to_replace = 99999999, value = np.nan)
df.describe().round()

Unnamed: 0,seniority,time,age,expenses,income,assets,debt,amount,price
count,4455.0,4455.0,4455.0,4455.0,4421.0,4408.0,4437.0,4455.0,4455.0
mean,8.0,46.0,37.0,56.0,131.0,5403.0,343.0,1039.0,1463.0
std,8.0,15.0,11.0,20.0,86.0,11573.0,1246.0,475.0,628.0
min,0.0,6.0,18.0,35.0,0.0,0.0,0.0,100.0,105.0
25%,2.0,36.0,28.0,35.0,80.0,0.0,0.0,700.0,1118.0
50%,5.0,48.0,36.0,51.0,120.0,3000.0,0.0,1000.0,1400.0
75%,12.0,60.0,45.0,72.0,165.0,6000.0,0.0,1300.0,1692.0
max,48.0,72.0,68.0,180.0,959.0,300000.0,30000.0,5000.0,11140.0


In [30]:
df['status'].value_counts()

ok         3200
default    1254
unknown       1
Name: status, dtype: int64

In [31]:
df = df[df['status'] != 'unknown']

In [32]:
df_train_full, df_test = train_test_split(df, test_size=0.2, random_state=442)
df_train, df_val = train_test_split(df_train_full, test_size=0.25, random_state=42)

In [33]:
len(df_train), len(df_val), len(df_test)

(2672, 891, 891)

In [34]:
y_train = (df_train['status'] == 'default').values
y_val = (df_val['status'] == 'default').values


In [35]:
del df_train['status']
del df_val['status']

In [36]:
df_train.isna().sum(), df_val.isna().sum()

(seniority     0
 home          0
 time          0
 age           0
 marital       0
 records       0
 job           0
 expenses      0
 income       18
 assets       27
 debt         10
 amount        0
 price         0
 dtype: int64,
 seniority     0
 home          0
 time          0
 age           0
 marital       0
 records       0
 job           0
 expenses      0
 income        8
 assets       11
 debt          5
 amount        0
 price         0
 dtype: int64)

In [37]:
df_train = df_train.fillna(0)
df_val = df_val.fillna(0)

In [38]:
dict_train = df_train.to_dict(orient='records')
dict_val = df_val.to_dict(orient='records')
dv = DictVectorizer(sparse=False)
X_train = dv.fit_transform(dict_train)
X_val = dv.transform(dict_val)

In [39]:
dt = DecisionTreeClassifier()
dt.fit(X_train, y_train)

DecisionTreeClassifier()

In [40]:
y_pred = dt.predict_proba(X_val)[:, 1]
roc_auc_score(y_val, y_pred)

0.665562423409957

In [41]:
tree_text = export_text(dt, feature_names=dv.feature_names_)
print(tree_text)

|--- records=yes <= 0.50
|   |--- income <= 83.50
|   |   |--- seniority <= 2.50
|   |   |   |--- amount <= 1120.00
|   |   |   |   |--- job=fixed <= 0.50
|   |   |   |   |   |--- seniority <= 0.50
|   |   |   |   |   |   |--- assets <= 4750.00
|   |   |   |   |   |   |   |--- home=parents <= 0.50
|   |   |   |   |   |   |   |   |--- expenses <= 38.50
|   |   |   |   |   |   |   |   |   |--- income <= 26.50
|   |   |   |   |   |   |   |   |   |   |--- class: True
|   |   |   |   |   |   |   |   |   |--- income >  26.50
|   |   |   |   |   |   |   |   |   |   |--- income <= 41.50
|   |   |   |   |   |   |   |   |   |   |   |--- class: False
|   |   |   |   |   |   |   |   |   |   |--- income >  41.50
|   |   |   |   |   |   |   |   |   |   |   |--- truncated branch of depth 3
|   |   |   |   |   |   |   |   |--- expenses >  38.50
|   |   |   |   |   |   |   |   |   |--- class: True
|   |   |   |   |   |   |   |--- home=parents >  0.50
|   |   |   |   |   |   |   |   |--- age <= 29.00
| 

In [42]:
for m in [4, 5, 6]:
    print('depth: %s' % m)
    for s in [1, 5, 10, 15, 20, 50, 100, 200]:
        dt = DecisionTreeClassifier(max_depth=m, min_samples_leaf=s)
        dt.fit(X_train, y_train)
        y_pred = dt.predict_proba(X_val)[:, 1]
        auc = roc_auc_score(y_val, y_pred)
        print('%s -> %.3f' % (s, auc))
    print()

depth: 4
1 -> 0.782
5 -> 0.782
10 -> 0.782
15 -> 0.782
20 -> 0.782
50 -> 0.772
100 -> 0.775
200 -> 0.771

depth: 5
1 -> 0.797
5 -> 0.795
10 -> 0.798
15 -> 0.793
20 -> 0.793
50 -> 0.783
100 -> 0.783
200 -> 0.775

depth: 6
1 -> 0.789
5 -> 0.792
10 -> 0.801
15 -> 0.792
20 -> 0.792
50 -> 0.786
100 -> 0.793
200 -> 0.775



In [43]:
dt = DecisionTreeClassifier(max_depth=6, min_samples_leaf=15)
dt.fit(X_train, y_train)

DecisionTreeClassifier(max_depth=6, min_samples_leaf=15)

In [44]:
#Gradient boosting
import xgboost as xgb

In [46]:
dtrain = xgb.DMatrix(X_train, label = y_train, feature_names= dv.feature_names_)
dval = xgb.DMatrix(X_val, label=y_val, feature_names=dv.feature_names_)

In [48]:
xgb_params = {
    'eta': 0.3,
    'max_depth': 6,
    'min_child_weight': 1,
    'objective': 'binary:logistic',
    'nthread': 8,
    'seed': 1,
    'silent': 1
}

In [49]:
model = xgb.train(xgb_params, dtrain, num_boost_round=10)

Parameters: { "silent" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.




In [50]:
y_pred = model.predict(dval)

In [51]:
roc_auc_score(y_val, y_pred)

0.8109724073849636

In [52]:
watchlist = [(dtrain, 'train'), (dval, 'val')]

In [53]:
xgb_params = {
    'eta': 0.3,
    'max_depth': 6,
    'min_child_weight': 1,
    'objective': 'binary:logistic',
    'eval_metric': 'auc', #A
    'nthread': 8,
    'seed': 1,
    'silent': 1
}

In [54]:
model = xgb.train(xgb_params, dtrain, num_boost_round=100, evals=watchlist, verbose_eval=10)

Parameters: { "silent" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


[0]	train-auc:0.85078	val-auc:0.78957
[10]	train-auc:0.94973	val-auc:0.81113
[20]	train-auc:0.97383	val-auc:0.80493
[30]	train-auc:0.98637	val-auc:0.79861
[40]	train-auc:0.99150	val-auc:0.79770
[50]	train-auc:0.99532	val-auc:0.79662
[60]	train-auc:0.99789	val-auc:0.79161
[70]	train-auc:0.99926	val-auc:0.79136
[80]	train-auc:0.99982	val-auc:0.78824
[90]	train-auc:0.99993	val-auc:0.78735
[99]	train-auc:0.99997	val-auc:0.78499
