## Exploring Different Styles of Categorical Encoding
- https://towardsdatascience.com/all-about-categorical-variable-encoding-305f3361fd02

In [1]:
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import KNNImputer
from tools.supporting_scripts import *

In [2]:
fname = '../data/clients.csv'
df = pd.read_csv(fname)

In [3]:
label_enc = LabelEncoder()
knn = KNNImputer(n_neighbors=3)

df = pd.read_csv(fname)
df.loc[:, 'loan_approval'] = label_enc.fit_transform(df['loan_approval'])
y = df['loan_approval']
df.drop(['id', 'loan_approval'], axis=1, inplace=True)
df = cast_spouse_income(df)
df = df.replace({'credit_history': {1.0: 'Yes', 0.0: 'No'}})
df.loc[:, 'monthly_payment'] = df.astype({'monthly_payment': 'object'})
num_feats = df.columns[df.dtypes != 'object'].tolist()
df_cat = df.loc[:, df.dtypes=='object'].copy()
df_num = df.loc[:, num_feats].copy()
df_cat.fillna(value='MISSING', axis=0, inplace=True)
df_num_transformed = knn.fit_transform(df_num)
df_num = pd.DataFrame(df_num_transformed, columns=num_feats)
df_num['income/loan'] = df_num['income'] / (df_num['loan_in_thousands'] * 1000)
df_num['income'] = df_num['income'] + df_num['spouse_income']
df_num.drop(['spouse_income'], axis=1, inplace=True)
df = pd.concat([df_num, df_cat, y], axis=1)

In [4]:
df

Unnamed: 0,income,loan_in_thousands,income/loan,sex,married,dependents,education,working,monthly_payment,credit_history,property_type,loan_approval
0,5849.0,138.0,0.042384,Male,No,0,Graduate,No,360.0,Yes,Urban,1
1,6091.0,128.0,0.035805,Male,Yes,1,Graduate,No,360.0,Yes,Rural,0
2,3000.0,66.0,0.045455,Male,Yes,0,Graduate,Yes,360.0,Yes,Urban,1
3,4941.0,120.0,0.021525,Male,Yes,0,Not Graduate,No,360.0,Yes,Urban,1
4,6000.0,141.0,0.042553,Male,No,0,Graduate,No,360.0,Yes,Urban,1
...,...,...,...,...,...,...,...,...,...,...,...,...
609,2900.0,71.0,0.040845,Female,No,0,Graduate,No,360.0,Yes,Rural,1
610,4106.0,40.0,0.102650,Male,Yes,3+,Graduate,No,180.0,Yes,Rural,1
611,8312.0,253.0,0.031905,Male,Yes,1,Graduate,No,360.0,Yes,Urban,1
612,7583.0,187.0,0.040551,Male,Yes,2,Graduate,No,360.0,Yes,Urban,1


### Frequency Encoding

In [5]:
freq = df.groupby('sex').size() / len(df)
df.loc[:, 'sex'] = df['sex'].map(freq)

In [6]:
df.loc[:5, 'sex']

0    0.796417
1    0.796417
2    0.796417
3    0.796417
4    0.796417
5    0.796417
Name: sex, dtype: float64

### Mean Encoding or Target Encoding
* does not affect the volume of the data
* helps in faster learning
* NOTE: notorious for over-fitting, to be used with regularization and cross-validation

In [7]:
mean_encode = df.groupby('dependents')['loan_approval'].mean()
print(mean_encode)
df.loc[:, 'dependents'] = df['dependents'].map(mean_encode)

dependents
0          0.689855
1          0.647059
2          0.752475
3+         0.647059
MISSING    0.600000
Name: loan_approval, dtype: float64


In [8]:
df.loc[:5, 'dependents']

0    0.689855
1    0.647059
2    0.689855
3    0.689855
4    0.689855
5    0.752475
Name: dependents, dtype: float64

In [9]:
# another variation of target encoding using smoothing
mean = df['loan_approval'].mean()
agg = df.groupby('dependents')['loan_approval'].agg(['count', 'mean'])
counts = agg['count']
means = agg['mean']
weight = 100
# compute smoothed means
smooth = (counts * means + weight * mean) / (counts + weight)
print(smooth)
df.loc[:, 'dependents'] = df['dependents'].map(smooth)

dependents
0.600000    0.675910
0.647059    0.662963
0.689855    0.689280
0.752475    0.720048
dtype: float64


In [10]:
df.loc[:5, 'dependents']

0    0.689280
1    0.662963
2    0.689280
3    0.689280
4    0.689280
5    0.720048
Name: dependents, dtype: float64

### Weight of Evidence Encoding (Woe)
##### https://towardsdatascience.com/attribute-relevance-analysis-in-python-iv-and-woe-b5651443fc04
##### https://www.listendata.com/2015/03/weight-of-evidence-woe-and-information.html
##### see also IV (Information Value)
* measures strength of grouping technique to separate good from bad
* primarily developed for loan evaluation
* WoE = (ln(distr good / distr bad)) * 100 NOTE: WoE will be 0 if the P(Goods) / P(Bads) = 1, If P(Bads) > P(Goods) the odds ratio will be < 1 and the WoE will be < 0; if, on the other hand, P(Goods) > P(Bads) in a group, then WoE > 0
* Well suited for Logistic Regression (binary classification)
#### Advantages
1. It can transform an independent variable to establish a monotonic relationship to the dependent variable. It does more than this — to secure a monotonic relationship it would be enough to “recode” it to any ordered measure (for example 1,2,3,4…), but the WoE transformation orders the categories on a “logistic” scale which is natural for Logistic Regression
2. For variables with too many (sparsely populated) discrete values, these can be grouped into categories (densely populated), and the WoE can be used to express information for the whole category
3. The (univariate) effect of each category on the dependent variable can be compared across categories and variables because WoE is a standardized value (for example, you can compare WoE of married people to WoE of manual workers)
#### Caveats
1. Loss of information (variation) due to binning to a few categories
2. It is a “univariate” measure, so it does not take into account the correlation between independent variables
3. It is easy to manipulate (over-fit) the effect of variables according to how categories are created
#### Prerequisites
1. Data must be clean (nan could be substituted with "MISSING" to see how it affects target as well
2. There should not be any continuous features (use qcut)

In [11]:
# WoE = (ln(Relative Frequency of Good / Relative Frequency of Bad)) * 100
# IV = Σ(DistributedGood_i - DistributedBad_i) * WoE_i

In [12]:
num_feats = ['income', 'spouse_income', 'loan_in_thousands', 'monthly_payment']
CONTINUOUS = ['income', 'loan_in_thousands', 'income/loan']

In [13]:
def calculate_woe_iv(dataset, feature, target):
    lst = []
    for i in range(dataset[feature].nunique()):
        val = list(dataset[feature].unique())[i]
        lst.append({
            'Value': val,
            'All': dataset[dataset[feature] == val].count()[feature],
            'Good': dataset[(dataset[feature] == val) & (dataset[target] == 1)].count()[feature],
            'Bad': dataset[(dataset[feature] == val) & (dataset[target] == 0)].count()[feature]
        })

    dset = pd.DataFrame(lst)
    dset['Distr_Good'] = dset['Good'] / dset['Good'].sum()
    dset['Distr_Bad'] = dset['Bad'] / dset['Bad'].sum()
    np.seterr(divide='ignore')                                  # lifting-up the warning because it is handled below
    dset['WoE'] = np.log(dset['Distr_Good'] / dset['Distr_Bad'])
    np.seterr(divide='warn')
    dset = dset.replace({'WoE': {np.inf: 0, -np.inf: 0}})
    dset['IV'] = (dset['Distr_Good'] - dset['Distr_Bad']) * dset['WoE']
    iv = dset['IV'].sum()

    dset = dset.sort_values(by='WoE')

    return dset, iv

In [14]:
target = 'loan_approval'
df_train = df.copy(deep=True)
# df_train['income'] = pd.qcut(df_train['income'], 20)
# calculate_woe_iv(df_train, 'income', target)

In [15]:
for col in df_train.columns:
    if col == target: continue
    woe, iv = calculate_woe_iv(df_train, col, target)
    print(f'WoE and IV for {col}:')
    values_names = woe['Value'].tolist()
    values_vals = woe['WoE'].tolist()
    values_to_replace = {k: v for k, v in zip(values_names, values_vals)}
    if iv > 0.02:
        df_train = df_train.replace({col: values_to_replace})
    print(woe)
    print(f'IV score is : {iv}')
    print('''
    ''')

WoE and IV for income:
      Value  All  Good  Bad  Distr_Good  Distr_Bad       WoE        IV
133  4583.0    4     1    3    0.002370   0.015625 -1.886122  0.025001
43   6277.0    3     1    2    0.002370   0.010417 -1.480657  0.011915
208  5000.0    3     1    2    0.002370   0.010417 -1.480657  0.011915
64   4166.0    3     1    2    0.002370   0.010417 -1.480657  0.011915
181  4885.0    2     1    1    0.002370   0.005208 -0.787510  0.002235
..      ...  ...   ...  ...         ...        ...       ...       ...
197  6506.0    1     1    0    0.002370   0.000000  0.000000  0.000000
191  5100.0    1     1    0    0.002370   0.000000  0.000000  0.000000
71   3750.0    4     3    1    0.007109   0.005208  0.311102  0.000591
312  2500.0    4     3    1    0.007109   0.005208  0.311102  0.000591
4    6000.0    5     4    1    0.009479   0.005208  0.598784  0.002557

[554 rows x 8 columns]
IV score is : 0.10034147531267246

    
WoE and IV for loan_in_thousands:
     Value  All  Good  Bad 