In [2]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

import seaborn as sns


In [3]:
data = 'https://raw.githubusercontent.com/alexeygrigorev/mlbookcamp-code/master/chapter-03-churn-prediction/WA_Fn-UseC_-Telco-Customer-Churn.csv'

### 6.2 Data cleaning and preparation

#### - Downloading the dataset
#### - Re-encoding the categorical variables
#### - Doing the train/validation/test split

In [4]:
data = 'https://raw.githubusercontent.com/alexeygrigorev/mlbookcamp-code/master/chapter-06-trees/CreditScoring.csv'

In [5]:
!wget $data

--2025-11-09 19:18:54--  https://raw.githubusercontent.com/alexeygrigorev/mlbookcamp-code/master/chapter-06-trees/CreditScoring.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.110.133, 185.199.109.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 182489 (178K) [text/plain]
Saving to: ‘CreditScoring.csv.3’


2025-11-09 19:18:54 (3.41 MB/s) - ‘CreditScoring.csv.3’ saved [182489/182489]



In [6]:
!head CreditScoring.csv

"Status","Seniority","Home","Time","Age","Marital","Records","Job","Expenses","Income","Assets","Debt","Amount","Price"
1,9,1,60,30,2,1,3,73,129,0,0,800,846
1,17,1,60,58,3,1,1,48,131,0,0,1000,1658
2,10,2,36,46,2,2,3,90,200,3000,0,2000,2985
1,0,1,60,24,1,1,1,63,182,2500,0,900,1325
1,0,1,36,26,1,1,1,46,107,0,0,310,910
1,1,2,60,36,2,1,1,75,214,3500,0,650,1645
1,29,2,60,44,2,1,1,75,125,10000,0,1600,1800
1,9,5,12,27,1,1,1,35,80,0,0,200,1093
1,0,2,60,32,2,1,3,90,107,15000,0,1200,1957


In [7]:
df = pd.read_csv(data)
df.head()

Unnamed: 0,Status,Seniority,Home,Time,Age,Marital,Records,Job,Expenses,Income,Assets,Debt,Amount,Price
0,1,9,1,60,30,2,1,3,73,129,0,0,800,846
1,1,17,1,60,58,3,1,1,48,131,0,0,1000,1658
2,2,10,2,36,46,2,2,3,90,200,3000,0,2000,2985
3,1,0,1,60,24,1,1,1,63,182,2500,0,900,1325
4,1,0,1,36,26,1,1,1,46,107,0,0,310,910


In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4455 entries, 0 to 4454
Data columns (total 14 columns):
 #   Column     Non-Null Count  Dtype
---  ------     --------------  -----
 0   Status     4455 non-null   int64
 1   Seniority  4455 non-null   int64
 2   Home       4455 non-null   int64
 3   Time       4455 non-null   int64
 4   Age        4455 non-null   int64
 5   Marital    4455 non-null   int64
 6   Records    4455 non-null   int64
 7   Job        4455 non-null   int64
 8   Expenses   4455 non-null   int64
 9   Income     4455 non-null   int64
 10  Assets     4455 non-null   int64
 11  Debt       4455 non-null   int64
 12  Amount     4455 non-null   int64
 13  Price      4455 non-null   int64
dtypes: int64(14)
memory usage: 487.4 KB


In [9]:
df.columns = df.columns.str.lower()

In [10]:
df.status.value_counts()

status
1    3200
2    1254
0       1
Name: count, dtype: int64

In [11]:
status_values = {1:"ok", 2:"default", 3:"unk"}

df.status = df.status.map(status_values)

In [12]:
df.head()

Unnamed: 0,status,seniority,home,time,age,marital,records,job,expenses,income,assets,debt,amount,price
0,ok,9,1,60,30,2,1,3,73,129,0,0,800,846
1,ok,17,1,60,58,3,1,1,48,131,0,0,1000,1658
2,default,10,2,36,46,2,2,3,90,200,3000,0,2000,2985
3,ok,0,1,60,24,1,1,1,63,182,2500,0,900,1325
4,ok,0,1,36,26,1,1,1,46,107,0,0,310,910


In [13]:
df.describe()

Unnamed: 0,seniority,home,time,age,marital,records,job,expenses,income,assets,debt,amount,price
count,4455.0,4455.0,4455.0,4455.0,4455.0,4455.0,4455.0,4455.0,4455.0,4455.0,4455.0,4455.0,4455.0
mean,7.987205,2.657015,46.441751,37.077666,1.879012,1.173513,1.67587,55.568799,763317.0,1060341.0,404382.0,1039.021773,1462.875645
std,8.173444,1.610467,14.655225,10.984856,0.643748,0.378733,0.954035,19.515878,8703625.0,10217570.0,6344253.0,474.543007,628.089913
min,0.0,0.0,6.0,18.0,0.0,1.0,0.0,35.0,0.0,0.0,0.0,100.0,105.0
25%,2.0,2.0,36.0,28.0,2.0,1.0,1.0,35.0,80.0,0.0,0.0,700.0,1117.5
50%,5.0,2.0,48.0,36.0,2.0,1.0,1.0,51.0,120.0,3500.0,0.0,1000.0,1400.0
75%,12.0,4.0,60.0,45.0,2.0,1.0,3.0,72.0,166.0,6000.0,0.0,1300.0,1692.0
max,48.0,6.0,72.0,68.0,5.0,2.0,4.0,180.0,100000000.0,100000000.0,100000000.0,5000.0,11140.0


In [14]:
df.describe().round(2)

Unnamed: 0,seniority,home,time,age,marital,records,job,expenses,income,assets,debt,amount,price
count,4455.0,4455.0,4455.0,4455.0,4455.0,4455.0,4455.0,4455.0,4455.0,4455.0,4455.0,4455.0,4455.0
mean,7.99,2.66,46.44,37.08,1.88,1.17,1.68,55.57,763316.99,1060340.81,404381.96,1039.02,1462.88
std,8.17,1.61,14.66,10.98,0.64,0.38,0.95,19.52,8703625.26,10217568.67,6344253.4,474.54,628.09
min,0.0,0.0,6.0,18.0,0.0,1.0,0.0,35.0,0.0,0.0,0.0,100.0,105.0
25%,2.0,2.0,36.0,28.0,2.0,1.0,1.0,35.0,80.0,0.0,0.0,700.0,1117.5
50%,5.0,2.0,48.0,36.0,2.0,1.0,1.0,51.0,120.0,3500.0,0.0,1000.0,1400.0
75%,12.0,4.0,60.0,45.0,2.0,1.0,3.0,72.0,166.0,6000.0,0.0,1300.0,1692.0
max,48.0,6.0,72.0,68.0,5.0,2.0,4.0,180.0,99999999.0,99999999.0,99999999.0,5000.0,11140.0


In [15]:
for c in ["income", "assets", "debt"]:
    df[c] = df[c].replace(to_replace=999999,value=np.nan)
#df.income.replace(to_replace=999999,value=np.nan).max()

In [16]:
df.describe().round()

Unnamed: 0,seniority,home,time,age,marital,records,job,expenses,income,assets,debt,amount,price
count,4455.0,4455.0,4455.0,4455.0,4455.0,4455.0,4455.0,4455.0,4455.0,4455.0,4455.0,4455.0,4455.0
mean,8.0,3.0,46.0,37.0,2.0,1.0,2.0,56.0,763317.0,1060341.0,404382.0,1039.0,1463.0
std,8.0,2.0,15.0,11.0,1.0,0.0,1.0,20.0,8703625.0,10217569.0,6344253.0,475.0,628.0
min,0.0,0.0,6.0,18.0,0.0,1.0,0.0,35.0,0.0,0.0,0.0,100.0,105.0
25%,2.0,2.0,36.0,28.0,2.0,1.0,1.0,35.0,80.0,0.0,0.0,700.0,1118.0
50%,5.0,2.0,48.0,36.0,2.0,1.0,1.0,51.0,120.0,3500.0,0.0,1000.0,1400.0
75%,12.0,4.0,60.0,45.0,2.0,1.0,3.0,72.0,166.0,6000.0,0.0,1300.0,1692.0
max,48.0,6.0,72.0,68.0,5.0,2.0,4.0,180.0,99999999.0,99999999.0,99999999.0,5000.0,11140.0


In [17]:
df.status.value_counts()

status
ok         3200
default    1254
Name: count, dtype: int64

In [18]:
df = df[df.status != 'unk'].reset_index(drop=True)
df.head()

Unnamed: 0,status,seniority,home,time,age,marital,records,job,expenses,income,assets,debt,amount,price
0,ok,9,1,60,30,2,1,3,73,129,0,0,800,846
1,ok,17,1,60,58,3,1,1,48,131,0,0,1000,1658
2,default,10,2,36,46,2,2,3,90,200,3000,0,2000,2985
3,ok,0,1,60,24,1,1,1,63,182,2500,0,900,1325
4,ok,0,1,36,26,1,1,1,46,107,0,0,310,910


In [19]:
from sklearn.model_selection import train_test_split

df_full_train, df_test = train_test_split(df, test_size=0.2, random_state =11)

df_train, df_val = train_test_split(df_full_train,test_size=0.25, random_state = 11)

In [20]:
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)


In [21]:
y_train = (df_train.status == 'default').astype('int').values
y_val = (df_val.status == 'default').astype('int').values
y_test = (df_test.status == 'default').astype('int').values

In [22]:
del df_train['status']
del df_val['status']
del df_test['status']

In [23]:
df_train

Unnamed: 0,seniority,home,time,age,marital,records,job,expenses,income,assets,debt,amount,price
0,3,1,36,61,2,1,1,41,57,12000,0,2500,3559
1,5,1,48,30,4,2,2,39,41,0,0,1300,1600
2,15,2,60,32,5,1,3,35,0,6000,3300,1550,1612
3,2,1,12,47,2,2,1,103,426,0,0,350,450
4,6,5,48,32,1,2,1,35,85,0,0,1100,1330
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2668,1,5,60,21,1,1,2,35,45,0,0,400,904
2669,5,1,60,47,1,2,1,46,106,8500,0,2500,2664
2670,23,1,24,47,1,1,1,44,86,0,0,350,975
2671,5,2,60,58,2,1,3,45,93,45000,0,1500,2090


### 6.3 Decision trees

In [24]:
def assess_risk(client):
    if client['records'] == 'yes':
        if client['job'] == 'partime':
            return 'default'
        else:
            return 'ok'
    else:
        if client['assets'] > 6000:
            return 'ok'
        else:
            return 'default'

In [25]:
xi = df_train.iloc[0].to_dict()
xi

{'seniority': 3,
 'home': 1,
 'time': 36,
 'age': 61,
 'marital': 2,
 'records': 1,
 'job': 1,
 'expenses': 41,
 'income': 57,
 'assets': 12000,
 'debt': 0,
 'amount': 2500,
 'price': 3559}

In [26]:
assess_risk(xi)

'ok'

In [27]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import roc_auc_score

In [28]:
from sklearn.feature_extraction import DictVectorizer
v = DictVectorizer(sparse=False)
D = [{'foo': 1, 'bar': 2}, {'foo': 3, 'baz': 1}]
X = v.fit_transform(D)
X
#v.inverse_transform(X) == [{'bar': 2.0, 'foo': 1.0},
#                           {'baz': 1.0, 'foo': 3.0}]
#v.transform({'foo': 4, 'unseen_feature': 3})

array([[2., 0., 1.],
       [0., 1., 3.]])

In [29]:
train_dicts = df_train.to_dict(orient='records')
#print(train_dicts)

In [30]:
dv = DictVectorizer(sparse = False)
X_train = dv.fit_transform(train_dicts)
X_train

array([[6.10e+01, 2.50e+03, 1.20e+04, ..., 1.00e+00, 3.00e+00, 3.60e+01],
       [3.00e+01, 1.30e+03, 0.00e+00, ..., 2.00e+00, 5.00e+00, 4.80e+01],
       [3.20e+01, 1.55e+03, 6.00e+03, ..., 1.00e+00, 1.50e+01, 6.00e+01],
       ...,
       [4.70e+01, 3.50e+02, 0.00e+00, ..., 1.00e+00, 2.30e+01, 2.40e+01],
       [5.80e+01, 1.50e+03, 4.50e+04, ..., 1.00e+00, 5.00e+00, 6.00e+01],
       [2.20e+01, 1.25e+03, 1.00e+04, ..., 1.00e+00, 4.00e+00, 4.80e+01]],
      shape=(2673, 13))

In [31]:
dt = DecisionTreeClassifier()
dt.fit(X_train , y_train)

0,1,2
,criterion,'gini'
,splitter,'best'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,
,random_state,
,max_leaf_nodes,
,min_impurity_decrease,0.0


In [32]:
val_dicts = df_val.fillna(0).to_dict(orient='records')
X_val = dv.transform(val_dicts)

In [33]:
#y_pred = dt.predict_proba(X_val)[:,1]
y_pred = dt.predict_proba(X_train)[:,1]
roc_auc_score(y_train,y_pred)

1.0

In [34]:
roc_auc_score(y_val, y_pred)

ValueError: Found input variables with inconsistent numbers of samples: [891, 2673]

In [None]:
## https://www.youtube.com/watch?v=YGiQvFbSIg8&list=PL3MmuxUbc_hIhxl5Ji8t4O6lPAOpHaCLR&index=62
## minute 10:26

In [36]:
#### Changing the model defining the depth
dt = DecisionTreeClassifier(max_depth=2)
dt.fit(X_train,y_train)

#y_pred = dt.predict_proba(X_val)[:,1]
#roc_auc_score(y_val,y_pred)

y_pred = dt.predict_proba(X_train)[:,1]
auc = roc_auc_score(y_train, y_pred)
print('train:', auc)

y_pred = dt.predict_proba(X_val)[:,1]
auc = roc_auc_score(y_val, y_pred)
print('val:', auc)


train: 0.7228170289982685
val: 0.6895511687412701


In [37]:
from sklearn.tree import export_text
#print(export_text(dt, feature_names = dv.get_feature_names()))
print(export_text(dt, feature_names=dv.get_feature_names_out()))


|--- records <= 1.50
|   |--- seniority <= 2.50
|   |   |--- class: 0
|   |--- seniority >  2.50
|   |   |--- class: 0
|--- records >  1.50
|   |--- seniority <= 6.50
|   |   |--- class: 1
|   |--- seniority >  6.50
|   |   |--- class: 0



### 6.4 Decision tree learning algorithm

#### - Fiding the best split for one column
#### - Finding the best split for the entire dataset
#### - Stopping criteria

In [50]:
data = [ 
    [8000, 'default'],
    [2000, 'default'],
    [0 , 'default'],
    [5000, 'ok'],
    [4000, 'ok'],
    [9000, 'ok'],
    [3000, 'default'],
]

df_example = pd.DataFrame(data, columns=['assets','status'])
df_example.head()

Unnamed: 0,assets,status
0,8000,default
1,2000,default
2,0,default
3,5000,ok
4,4000,ok


In [51]:
df_example.sort_values('assets')

Unnamed: 0,assets,status
2,0,default
1,2000,default
6,3000,default
4,4000,ok
3,5000,ok
0,8000,default
5,9000,ok


In [52]:
Ts = [2000, 3000 , 4000 ,   5000 , 8000 , 9000]

In [53]:
for t in Ts:
    df_left = df_example[df_example.assets <= t]
    df_right = df_example[df_example.assets > t]
    display(df_left)
    display(df_right)
    print()

Unnamed: 0,assets,status
1,2000,default
2,0,default


Unnamed: 0,assets,status
0,8000,default
3,5000,ok
4,4000,ok
5,9000,ok
6,3000,default





Unnamed: 0,assets,status
1,2000,default
2,0,default
6,3000,default


Unnamed: 0,assets,status
0,8000,default
3,5000,ok
4,4000,ok
5,9000,ok





Unnamed: 0,assets,status
1,2000,default
2,0,default
4,4000,ok
6,3000,default


Unnamed: 0,assets,status
0,8000,default
3,5000,ok
5,9000,ok





Unnamed: 0,assets,status
1,2000,default
2,0,default
3,5000,ok
4,4000,ok
6,3000,default


Unnamed: 0,assets,status
0,8000,default
5,9000,ok





Unnamed: 0,assets,status
0,8000,default
1,2000,default
2,0,default
3,5000,ok
4,4000,ok
6,3000,default


Unnamed: 0,assets,status
5,9000,ok





Unnamed: 0,assets,status
0,8000,default
1,2000,default
2,0,default
3,5000,ok
4,4000,ok
5,9000,ok
6,3000,default


Unnamed: 0,assets,status



