# A Data Scientist's Toolkit
**Data source:** 
- The data is provided by Home Credit, a service dedicated to provided lines of credit (loans) to the unbanked population. Predicting whether or not a client will repay a loan or have difficulty is a critical business need.
- Please download the data from this [Kaggle webpage](https://www.kaggle.com/code/willkoehrsen/start-here-a-gentle-introduction/input) and save to your local directory.

## (0) Load a dataset

In [18]:
# data
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split                                                    
# https://www.kaggle.com/code/willkoehrsen/start-here-a-gentle-introduction/input
path = '/Users/chriskuo/Downloads/data'
#data =  pd.read_csv(path + "/home_loan_selected.csv") 
data =  pd.read_csv(path + "/application_train.csv") 
train, test = train_test_split(data, test_size=0.2, random_state=42)

# Print sample
df_train = train[['OCCUPATION_TYPE','TARGET']].copy()
df_train['OCCUPATION_TYPE'].value_counts(dropna=False)

OCCUPATION_TYPE
NaN                      77106
Laborers                 44201
Sales staff              25622
Core staff               22042
Managers                 17165
Drivers                  14886
High skill tech staff     9180
Accountants               7848
Medicine staff            6817
Security staff            5375
Cooking staff             4688
Cleaning staff            3722
Private service staff     2115
Low-skill Laborers        1657
Waiters/barmen staff      1071
Secretaries               1059
Realty agents              596
HR staff                   447
IT staff                   411
Name: count, dtype: int64

In [19]:
df_train['OCCUPATION_TYPE'] = df_train['OCCUPATION_TYPE'].fillna('NoData')
df_train['OCCUPATION_TYPE'].value_counts(dropna=False)

OCCUPATION_TYPE
NoData                   77106
Laborers                 44201
Sales staff              25622
Core staff               22042
Managers                 17165
Drivers                  14886
High skill tech staff     9180
Accountants               7848
Medicine staff            6817
Security staff            5375
Cooking staff             4688
Cleaning staff            3722
Private service staff     2115
Low-skill Laborers        1657
Waiters/barmen staff      1071
Secretaries               1059
Realty agents              596
HR staff                   447
IT staff                   411
Name: count, dtype: int64

In [40]:
# Apply to test
df_test = test[['OCCUPATION_TYPE','TARGET']].copy()
df_test['OCCUPATION_TYPE'].value_counts(dropna=False)

df_test['OCCUPATION_TYPE'] = df_test['OCCUPATION_TYPE'].fillna('NoData')
df_test['OCCUPATION_TYPE'].value_counts(dropna=False)

OCCUPATION_TYPE
NoData                   19285
Laborers                 10985
Sales staff               6480
Core staff                5528
Managers                  4206
Drivers                   3717
High skill tech staff     2200
Accountants               1965
Medicine staff            1720
Security staff            1346
Cooking staff             1258
Cleaning staff             931
Private service staff      537
Low-skill Laborers         436
Waiters/barmen staff       277
Secretaries                246
Realty agents              155
HR staff                   116
IT staff                   115
Name: count, dtype: int64

## (1) Dummy/One-hot Encoding
### (1.1) Get_dummy

In [20]:
dummies = pd.get_dummies(df_train['OCCUPATION_TYPE'],dtype=float,dummy_na=False)
print(dummies.shape) # (246008, 19)
dummies.head()

(246008, 19)


Unnamed: 0,Accountants,Cleaning staff,Cooking staff,Core staff,Drivers,HR staff,High skill tech staff,IT staff,Laborers,Low-skill Laborers,Managers,Medicine staff,NoData,Private service staff,Realty agents,Sales staff,Secretaries,Security staff,Waiters/barmen staff
123473,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
10118,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
64716,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
234940,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
236051,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [21]:
new_df = pd.concat([df_train,dummies],axis=1).drop('OCCUPATION_TYPE',axis=1)
new_df.head()

Unnamed: 0,TARGET,Accountants,Cleaning staff,Cooking staff,Core staff,Drivers,HR staff,High skill tech staff,IT staff,Laborers,Low-skill Laborers,Managers,Medicine staff,NoData,Private service staff,Realty agents,Sales staff,Secretaries,Security staff,Waiters/barmen staff
123473,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
10118,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
64716,1,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
234940,0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
236051,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### (1.2) Category_encoders

In [42]:
import category_encoders as ce
ec = ce.OneHotEncoder(cols='OCCUPATION_TYPE', use_cat_names=True,
     handle_unknown='indicator').fit(df_train)
onehot = ec.fit_transform(df_train['OCCUPATION_TYPE'])
new_df = pd.concat([df_train,onehot],axis=1).drop('OCCUPATION_TYPE',axis=1)
new_df.head()

Unnamed: 0,TARGET,OCCUPATION_TYPE_Mean_Encoded,OCCUPATION_TYPE_NoData,OCCUPATION_TYPE_Laborers,OCCUPATION_TYPE_Drivers,OCCUPATION_TYPE_High skill tech staff,OCCUPATION_TYPE_Accountants,OCCUPATION_TYPE_Private service staff,OCCUPATION_TYPE_Sales staff,OCCUPATION_TYPE_Managers,...,OCCUPATION_TYPE_Cooking staff,OCCUPATION_TYPE_Medicine staff,OCCUPATION_TYPE_Cleaning staff,OCCUPATION_TYPE_IT staff,OCCUPATION_TYPE_Realty agents,OCCUPATION_TYPE_Low-skill Laborers,OCCUPATION_TYPE_Waiters/barmen staff,OCCUPATION_TYPE_Secretaries,OCCUPATION_TYPE_HR staff,OCCUPATION_TYPE_-1
123473,0,0.065144,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10118,0,0.105722,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
64716,1,0.111447,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
234940,0,0.060893,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
236051,0,0.105722,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## (2) Mean Encoding
### (2.1) Manual

In [24]:
mean_encoded = df_train.groupby('OCCUPATION_TYPE')['TARGET'].mean()
mean_encoded

OCCUPATION_TYPE
Accountants              0.048930
Cleaning staff           0.096185
Cooking staff            0.106015
Core staff               0.064014
Drivers                  0.111447
HR staff                 0.060403
High skill tech staff    0.060893
IT staff                 0.072993
Laborers                 0.105722
Low-skill Laborers       0.171998
Managers                 0.063210
Medicine staff           0.067332
NoData                   0.065144
Private service staff    0.069031
Realty agents            0.087248
Sales staff              0.096206
Secretaries              0.064212
Security staff           0.106791
Waiters/barmen staff     0.112979
Name: TARGET, dtype: float64

In [25]:
# Map the mean values to the original 'Country' column
df_train['OCCUPATION_TYPE_Mean_Encoded'] = df_train['OCCUPATION_TYPE'].map(mean_encoded)
df_train[['OCCUPATION_TYPE','OCCUPATION_TYPE_Mean_Encoded']].head()

Unnamed: 0,OCCUPATION_TYPE,OCCUPATION_TYPE_Mean_Encoded
123473,NoData,0.065144
10118,Laborers,0.105722
64716,Drivers,0.111447
234940,High skill tech staff,0.060893
236051,Laborers,0.105722


In [43]:
# Apply to the test data
df_test['OCCUPATION_TYPE_Mean_Encoded'] = df_test['OCCUPATION_TYPE'].map(mean_encoded)
df_test[['OCCUPATION_TYPE','OCCUPATION_TYPE_Mean_Encoded']].head()

Unnamed: 0,OCCUPATION_TYPE,OCCUPATION_TYPE_Mean_Encoded
245895,Sales staff,0.096206
98194,Managers,0.06321
36463,Sales staff,0.096206
249923,High skill tech staff,0.060893
158389,Laborers,0.105722


### (2.2) Target_encoder in category_encoders

In [26]:
from category_encoders import target_encoder as te
ec = te.TargetEncoder()
X_TE = ec.fit_transform(df_train['OCCUPATION_TYPE'],df_train['TARGET'])
outf = pd.concat([df_train['OCCUPATION_TYPE'],X_TE],axis=1)
outf.columns = ['OCCUPATION_TYPE','mean']
outf.head()

Unnamed: 0,OCCUPATION_TYPE,mean
123473,NoData,0.065144
10118,Laborers,0.105722
64716,Drivers,0.111447
234940,High skill tech staff,0.060893
236051,Laborers,0.105722


In [27]:
# add some noises
cntrl = 0.3
capped = df_train['TARGET'].mean() * cntrl
num_obs = df_train.shape[0]
noise = np.random.uniform(0,capped,num_obs) 
outf['mean'] = outf['mean'] + noise
outf.head(10)

Unnamed: 0,OCCUPATION_TYPE,mean
123473,NoData,0.083057
10118,Laborers,0.126582
64716,Drivers,0.114187
234940,High skill tech staff,0.074347
236051,Laborers,0.128866
30611,Accountants,0.064236
871,High skill tech staff,0.063119
153082,Private service staff,0.084866
188110,High skill tech staff,0.063593
278046,Laborers,0.112015


## (3) Weight of Evidence

### (3.1) WOE with Binary Target Variable
#### (3.1.1) Manual

In [28]:
var = 'OCCUPATION_TYPE'
df_train[var] = df_train[var].fillna('NoData')
k = df_train[[var,'TARGET']].groupby(var)['TARGET'].agg(['count','sum']).reset_index()
k.columns = [var,'Count','Bad']
k

Unnamed: 0,OCCUPATION_TYPE,Count,Bad
0,Accountants,7848,384
1,Cleaning staff,3722,358
2,Cooking staff,4688,497
3,Core staff,22042,1411
4,Drivers,14886,1659
5,HR staff,447,27
6,High skill tech staff,9180,559
7,IT staff,411,30
8,Laborers,44201,4673
9,Low-skill Laborers,1657,285


In [29]:
k['Good'] = k['Count'] - k['Bad']
k['Good %'] = (k['Good']/k['Good'].sum()*100).round(2)
k['Bad %'] = (k['Bad']/k['Bad'].sum()*100).round(2)
k[var + '_WOE'] = np.log(k['Good %'] / k['Bad %']).round(2)
k = k.sort_values(by=var + '_WOE', ascending=False)
k

Unnamed: 0,OCCUPATION_TYPE,Count,Bad,Good,Good %,Bad %,OCCUPATION_TYPE_WOE
0,Accountants,7848,384,7464,3.3,1.93,0.54
5,HR staff,447,27,420,0.19,0.14,0.31
6,High skill tech staff,9180,559,8621,3.81,2.81,0.3
10,Managers,17165,1085,16080,7.11,5.46,0.26
16,Secretaries,1059,68,991,0.44,0.34,0.26
3,Core staff,22042,1411,20631,9.12,7.1,0.25
12,NoData,77106,5023,72083,31.88,25.27,0.23
11,Medicine staff,6817,459,6358,2.81,2.31,0.2
13,Private service staff,2115,146,1969,0.87,0.73,0.18
7,IT staff,411,30,381,0.17,0.15,0.13


In [30]:
var = 'OCCUPATION_TYPE'
def WOE(var):
    d = df.copy()
    d[var] = d[var].fillna('NoData')
    k = d[[var,'TARGET']].groupby(var)['TARGET'].agg(['count','sum']).reset_index()
    k.columns = [var,'Count','Bad']
    k['Good'] = k['Count'] - k['Bad']
    k['Good %'] = (k['Good']/k['Good'].sum()*100).round(2)
    k['Bad %'] = (k['Bad']/k['Bad'].sum()*100).round(2)
    k[var + '_WOE'] = np.log(k['Good %'] / k['Bad %']).round(2)
    k = k.sort_values(by=var + '_WOE', ascending=False)
    return (k)

In [31]:
df2 = df_train[['TARGET','OCCUPATION_TYPE']].merge(k,left_on=var,right_on=var,how='left')
df2.head()

Unnamed: 0,TARGET,OCCUPATION_TYPE,Count,Bad,Good,Good %,Bad %,OCCUPATION_TYPE_WOE
0,0,NoData,77106,5023,72083,31.88,25.27,0.23
1,0,Laborers,44201,4673,39528,17.48,23.51,-0.3
2,1,Drivers,14886,1659,13227,5.85,8.35,-0.36
3,0,High skill tech staff,9180,559,8621,3.81,2.81,0.3
4,0,Laborers,44201,4673,39528,17.48,23.51,-0.3


#### (3.1.2) Use Category_encoders for WOE Binary Target

In [33]:
#########################
# Category_Encoders WOE #
#########################
ec = ce.WOEEncoder()
df['OCCUPATION_TYPE'] = df['OCCUPATION_TYPE'].fillna('NoData')
X_WOE = ec.fit(df_train['OCCUPATION_TYPE'], df_train['TARGET'])
df_train_cleaned = ec.transform(df_train['OCCUPATION_TYPE'])
df_train.round(2)

Unnamed: 0,OCCUPATION_TYPE,TARGET,OCCUPATION_TYPE_Mean_Encoded
123473,NoData,0,0.07
10118,Laborers,0,0.11
64716,Drivers,1,0.11
234940,High skill tech staff,0,0.06
236051,Laborers,0,0.11
...,...,...,...
119879,Cooking staff,0,0.11
259178,Sales staff,0,0.10
131932,Managers,0,0.06
146867,Drivers,0,0.11


In [41]:
# Apply to test
df_test_cleaned = ec.transform(df_test['OCCUPATION_TYPE'])
df_test_cleaned.round(2)

Unnamed: 0,OCCUPATION_TYPE
245895,0.19
98194,-0.26
36463,0.19
249923,-0.30
158389,0.30
...,...
256564,0.30
278889,-0.25
221828,-0.23
190245,0.36


### (3.2) WOE for Continuous target

#### (3.2.1) Manual

In [35]:
########################################
# My Function for Continous-Target WOE #
########################################
def WOE_continous(df,var,target):
    df[var] = df[var].fillna('NoData')
    k = df[[var,target]].groupby(var)[target].agg(['count','sum']).reset_index()
    k.columns = [var,'Count','Sum']
    k['Sum %'] = (k['Sum'] / k['Sum'].sum()*100).round(2)
    k['Count %'] = (k['Count'] / k['Count'].sum()*100).round(2)
    k[var+'_WOE'] = np.log(k['Sum %'] / k['Count %']).round(2)
    k = k.sort_values(by=var+'_WOE')
    return(k)
k = WOE_continous(data, 'OCCUPATION_TYPE','AMT_INCOME_TOTAL')
k

Unnamed: 0,OCCUPATION_TYPE,Count,Sum,Sum %,Count %,OCCUPATION_TYPE_WOE
1,Cleaning staff,4653,608570000.0,1.17,1.51,-0.26
9,Low-skill Laborers,2093,278846200.0,0.54,0.68,-0.23
2,Cooking staff,5946,822905600.0,1.59,1.93,-0.19
18,Waiters/barmen staff,1348,194479400.0,0.37,0.44,-0.17
17,Security staff,6721,1005883000.0,1.94,2.19,-0.12
11,Medicine staff,8537,1278071000.0,2.46,2.78,-0.12
15,Sales staff,32102,4889227000.0,9.42,10.44,-0.1
12,NoData,96391,14797560000.0,28.51,31.35,-0.09
16,Secretaries,1305,209506900.0,0.4,0.42,-0.05
8,Laborers,55186,9180604000.0,17.69,17.95,-0.01


## (4) Leave-One-Out (LOO)

### (4.1) Manual LOO

In [53]:
#################################
# My Function for Leave-One-Out #
#################################
def LOO(var,target):
    # Get the count and the sum statistics by category
    df['OCCUPATION_TYPE'] = df['OCCUPATION_TYPE'].fillna('NoData')
    h = df[['OCCUPATION_TYPE','TARGET']].groupby('OCCUPATION_TYPE')['TARGET'].agg(['count','sum']).reset_index()
    h.columns = ['OCCUPATION_TYPE','Count','Sum']
    # Append to the data
    df2 = pd.merge(df[[var,target]],h,left_on='OCCUPATION_TYPE',right_on='OCCUPATION_TYPE',how='left')
    # Get the mean excluding the row itself to avoid direct target leakage
    df2[var + '_LOO'] = ((df2['Sum'] - df2[target])/(df2['Count'] - 1)).round(2)
    df2 = df2.drop([target,'Count','Sum'],axis=1)
    return(df2)
    
k = LOO('OCCUPATION_TYPE','TARGET')
k.head()

Unnamed: 0,OCCUPATION_TYPE,OCCUPATION_TYPE_LOO
0,Laborers,0.11
1,Core staff,0.06
2,Laborers,0.11
3,Laborers,0.11
4,Core staff,0.06


### (4.2) Category_encoders for LOO

In [54]:
from category_encoders import leave_one_out as loo
ec = loo.LeaveOneOutEncoder()
df['OCCUPATION_TYPE'] = df['OCCUPATION_TYPE'].fillna('NoData')
X = df['OCCUPATION_TYPE']
y = df['TARGET']

LOO = ec.fit(X, y)
X_LOO = ec.transform(X).round(2)
X_LOO.columns = ['OCCUPATION_TYPE_LOO']
X_LOO.head()

Unnamed: 0,OCCUPATION_TYPE_LOO
0,0.11
1,0.06
2,0.11
3,0.11
4,0.06
