In [56]:
!wget 'https://raw.githubusercontent.com/alexeygrigorev/mlbookcamp-code/master/chapter-06-trees/CreditScoring.csv'

--2025-10-28 16:34:54--  https://raw.githubusercontent.com/alexeygrigorev/mlbookcamp-code/master/chapter-06-trees/CreditScoring.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.109.133, 185.199.111.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 182489 (178K) [text/plain]
Saving to: ‘CreditScoring.csv.1’


2025-10-28 16:34:54 (25.5 MB/s) - ‘CreditScoring.csv.1’ saved [182489/182489]



In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the raw credit-scoring table from the CSV in the working directory.
df = pd.read_csv(filepath_or_buffer='CreditScoring.csv')

In [3]:
# Normalise every column label to lower case.
df.columns = [label.lower() for label in df.columns]

In [4]:
# Integer codes found in the raw CSV -> readable labels, one dict per categorical
# column. In every dict, code 0 is decoded as 'unk'.
status_values = {
    1: "ok",
    2: "default",
    0: "unk",
}

home_values = {
    1: 'rent',
    2: 'owner',
    3: 'private',
    4: 'ignore',
    5: 'parents',
    6: 'other',
    0: 'unk',
}

marital_values = {
    1: 'single',
    2: 'married',
    3: 'widow',
    4: 'separated',
    5: 'divorced',
    0: 'unk',
}

records_values = {
    1: 'no',
    2: 'yes',
    0: 'unk',
}

# The 'partime' spelling is kept as-is: it becomes the label stored in the job column.
job_values = {
    1: 'fixed',
    2: 'partime',
    3: 'freelance',
    4: 'others',
    0: 'unk',
}

category_labels = {
    'status': status_values,
    'home': home_values,
    'marital': marital_values,
    'records': records_values,
    'job': job_values,
}

# Series.map turns every value that is missing from the dict into NaN, so mapping
# a column that has already been decoded to strings would wipe it out. Only decode
# columns that still hold numeric codes, which makes re-running this cell a no-op.
for column, labels in category_labels.items():
    if pd.api.types.is_numeric_dtype(df[column]):
        df[column] = df[column].map(labels)

In [5]:
# Summary statistics of the numeric columns, rounded to whole numbers.
df.describe().round(decimals=0)

Unnamed: 0,seniority,time,age,expenses,income,assets,debt,amount,price
count,4455.0,4455.0,4455.0,4455.0,4455.0,4455.0,4455.0,4455.0,4455.0
mean,8.0,46.0,37.0,56.0,763317.0,1060341.0,404382.0,1039.0,1463.0
std,8.0,15.0,11.0,20.0,8703625.0,10217569.0,6344253.0,475.0,628.0
min,0.0,6.0,18.0,35.0,0.0,0.0,0.0,100.0,105.0
25%,2.0,36.0,28.0,35.0,80.0,0.0,0.0,700.0,1118.0
50%,5.0,48.0,36.0,51.0,120.0,3500.0,0.0,1000.0,1400.0
75%,12.0,60.0,45.0,72.0,166.0,6000.0,0.0,1300.0,1692.0
max,48.0,72.0,68.0,180.0,99999999.0,99999999.0,99999999.0,5000.0,11140.0


In [6]:
# 99999999 shows up as the max of income/assets/debt; treat it as a missing value.
MISSING_SENTINEL = np.int64(99999999)
for col in ('income', 'assets', 'debt'):
    cleaned = df[col].replace(to_replace=MISSING_SENTINEL, value=np.nan)
    df[col] = cleaned

In [7]:
# How many rows carry each status label.
df['status'].value_counts()

status
ok         3200
default    1254
unk           1
Name: count, dtype: int64

In [8]:
# Keep only the rows whose status label is known (i.e. not 'unk').
has_known_status = df['status'] != 'unk'
df = df[has_known_status]

In [9]:
from sklearn.model_selection import train_test_split

# Two successive 80/20 splits: first hold out the test set, then carve the
# validation set out of the remainder (64% train / 16% val / 20% test overall).
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=11)
df_train, df_val = train_test_split(df_full_train, test_size=0.2, random_state=11)

# Give each part a fresh 0..n-1 index.
df_train, df_val, df_test = (
    part.reset_index(drop=True) for part in (df_train, df_val, df_test)
)

# Pull the target column out of each part ...
y_train, y_val, y_test = (
    part['status'] for part in (df_train, df_val, df_test)
)

# ... and remove it from the feature frames.
df_train, df_val, df_test = (
    part.drop(columns='status') for part in (df_train, df_val, df_test)
)



In [13]:
# Binary target: 1 where the status label is 'default', 0 otherwise.
y_train = y_train.eq('default').astype('int64')
y_val = y_val.eq('default').astype('int64')
y_test = y_test.eq('default').astype('int64')

In [19]:
# Peek at the first five training rows.
df_train.head(n=5)

Unnamed: 0,seniority,home,time,age,marital,records,job,expenses,income,assets,debt,amount,price
0,15,owner,60,34,married,no,fixed,45,82.0,3500.0,0.0,750,1624
1,7,parents,60,30,single,no,fixed,35,95.0,0.0,0.0,900,1158
2,10,owner,36,47,married,no,fixed,60,133.0,3000.0,0.0,360,360
3,5,owner,48,39,married,yes,freelance,45,100.0,30000.0,0.0,1550,2294
4,14,owner,36,40,married,no,fixed,45,80.0,3000.0,0.0,900,1263


In [20]:
def access_risk(client):
    """Classify a client as 'ok' or 'default' using hand-written rules.

    Rules:
      * client has records ('records' == 'yes'): 'default' for part-time
        jobs, 'ok' for any other job;
      * otherwise: 'ok' when 'assets' is above 6000, else 'default'
        (a NaN 'assets' compares False and therefore yields 'default').

    Parameters
    ----------
    client : mapping
        Must provide the keys 'records', 'job' and 'assets'.

    Returns
    -------
    str
        Either 'ok' or 'default'.
    """
    if client['records'] == 'yes':
        # The job column is labelled 'partime' (sic) in this notebook's data, so
        # comparing against 'parttime' alone never matched. Accept both spellings.
        if client['job'] in ('partime', 'parttime'):
            return 'default'
        return 'ok'

    if client['assets'] > 6000:
        return 'ok'
    return 'default'

In [25]:
# Turn the first training row into a plain dict to use as a sample client.
first_row = df_train.iloc[0]
xi = first_row.to_dict()
xi

{'seniority': 15,
 'home': 'owner',
 'time': 60,
 'age': 34,
 'marital': 'married',
 'records': 'no',
 'job': 'fixed',
 'expenses': 45,
 'income': 82.0,
 'assets': 3500.0,
 'debt': 0.0,
 'amount': 750,
 'price': 1624}

In [26]:
# Apply the hand-written rules to the sample client.
access_risk(client=xi)

'default'

In [71]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_extraction import DictVectorizer


In [72]:
# Number of missing values per training column.
df_train.isnull().sum()

seniority     0
home          0
time          0
age           0
marital       0
records       0
job           0
expenses      0
income       26
assets       31
debt         11
amount        0
price         0
dtype: int64

In [73]:
# One dict per training row, the input format DictVectorizer is given below.
train_dicts = df_train.to_dict('records')

In [81]:
# Learn the feature layout from the training dicts, then encode them as a dense
# matrix; string-valued fields become 'name=value' indicator columns.
dv = DictVectorizer(sparse=False)
dv.fit(train_dicts)
X_train = dv.transform(train_dicts)
dv.get_feature_names_out()

array(['age', 'amount', 'assets', 'debt', 'expenses', 'home=ignore',
       'home=other', 'home=owner', 'home=parents', 'home=private',
       'home=rent', 'home=unk', 'income', 'job=fixed', 'job=freelance',
       'job=others', 'job=partime', 'job=unk', 'marital=divorced',
       'marital=married', 'marital=separated', 'marital=single',
       'marital=unk', 'marital=widow', 'price', 'records=no',
       'records=yes', 'seniority', 'time'], dtype=object)

In [78]:
# Tree with no depth limit. DecisionTreeClassifier permutes the features at every
# split, so two fits on the same data can differ unless random_state is fixed;
# the seed makes this cell (and the scores computed from `dt`) reproducible.
dt = DecisionTreeClassifier(random_state=11)
dt.fit(X_train, y_train)

0,1,2
,criterion,'gini'
,splitter,'best'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,
,random_state,
,max_leaf_nodes,
,min_impurity_decrease,0.0


In [90]:
# Encode the validation rows with the vectorizer fitted on the training data,
# then get per-class probabilities (column 0 = class 0, column 1 = class 1).
val_dicts = df_val.to_dict(orient='records')
X_val = dv.transform(val_dicts)
y_pred = dt.predict_proba(X_val)
y_pred

array([[1., 0.],
       [1., 0.],
       [1., 0.],
       ...,
       [0., 1.],
       [1., 0.],
       [1., 0.]], shape=(713, 2))

In [87]:
from sklearn.metrics import roc_auc_score

In [91]:
# Validation ROC AUC, scored on the predicted probability of class 1.
roc_auc_score(y_true=y_val, y_score=y_pred[:, 1])

0.6505211267605634

In [93]:
# ROC AUC of the same tree on the data it was fitted on.
train_scores = dt.predict_proba(X_train)[:, 1]
roc_auc_score(y_train, train_scores)

1.0

In [96]:
# Tree limited to depth 3. random_state is fixed because the classifier permutes
# features at every split, which can otherwise change the fitted tree between runs.
dt_3 = DecisionTreeClassifier(max_depth=3, random_state=11)
dt_3.fit(X_train, y_train)

0,1,2
,criterion,'gini'
,splitter,'best'
,max_depth,3
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,
,random_state,
,max_leaf_nodes,
,min_impurity_decrease,0.0


In [97]:
# Training ROC AUC of the depth-3 tree.
train_scores_3 = dt_3.predict_proba(X_train)[:, 1]
roc_auc_score(y_train, train_scores_3)

0.7700033886231307

In [98]:
# Validation ROC AUC of the depth-3 tree.
val_scores_3 = dt_3.predict_proba(X_val)[:, 1]
roc_auc_score(y_val, val_scores_3)

0.7477511737089203

In [108]:
# First five training rows again, for reference while reading the tree rules.
df_train.head(n=5)

Unnamed: 0,seniority,home,time,age,marital,records,job,expenses,income,assets,debt,amount,price
0,15,owner,60,34,married,no,fixed,45,82.0,3500.0,0.0,750,1624
1,7,parents,60,30,single,no,fixed,35,95.0,0.0,0.0,900,1158
2,10,owner,36,47,married,no,fixed,60,133.0,3000.0,0.0,360,360
3,5,owner,48,39,married,yes,freelance,45,100.0,30000.0,0.0,1550,2294
4,14,owner,36,40,married,no,fixed,45,80.0,3000.0,0.0,900,1263


In [105]:
from sklearn.tree import export_text

# Render the depth-3 tree as indented text, labelling splits with the
# vectorizer's feature names.
tree_rules = export_text(dt_3, feature_names=dv.get_feature_names_out())
print(tree_rules)

|--- records=no <= 0.50
|   |--- seniority <= 6.50
|   |   |--- seniority <= 1.50
|   |   |   |--- class: 1
|   |   |--- seniority >  1.50
|   |   |   |--- class: 1
|   |--- seniority >  6.50
|   |   |--- income <= 103.50
|   |   |   |--- class: 1
|   |   |--- income >  103.50
|   |   |   |--- class: 0
|--- records=no >  0.50
|   |--- job=partime <= 0.50
|   |   |--- income <= 74.50
|   |   |   |--- class: 0
|   |   |--- income >  74.50
|   |   |   |--- class: 0
|   |--- job=partime >  0.50
|   |   |--- assets <= 8750.00
|   |   |   |--- class: 1
|   |   |--- assets >  8750.00
|   |   |   |--- class: 0



In [111]:
# Validation ROC AUC for a range of depth limits (None = no limit).
# random_state is fixed so that repeated runs of this sweep give the same scores;
# unseeded trees can differ between fits because features are permuted at each split.
# Note: the loop rebinds `dt`, so afterwards `dt` is the last (unlimited) tree.
md_roc_auc = []
for md in [1, 2, 3, 5, 10, 15, 50, None]:
    dt = DecisionTreeClassifier(max_depth=md, random_state=11)
    dt.fit(X_train, y_train)
    ras = roc_auc_score(y_val, dt.predict_proba(X_val)[:, 1])
    md_roc_auc.append((md, ras))

print(md_roc_auc)

[(1, 0.6073192488262912), (2, 0.6759624413145541), (3, 0.7477511737089203), (5, 0.7712488262910798), (10, 0.7008779342723005), (15, 0.6543098591549296), (50, 0.643131455399061), (None, 0.6407417840375588)]
