# 1. import libraries

In [1]:
import io
import warnings
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
from sklearn import tree

In [3]:
warnings.filterwarnings("ignore")
%matplotlib inline
plt.style.use('seaborn-white')

# 2. read data

In [4]:
df = pd.read_csv("../outputs/cleaned.csv")
df.shape

(1000, 21)

# 3. categorical

In [5]:
def shortlist_var(x):
    gb = df.groupby(x)['Y'].agg(['sum','count'])
    gb['bad'] = gb['sum']
    gb['good'] = gb['count'] - gb['sum']
    gb['pct_bad'] = gb['bad'] / gb['bad'].sum()
    gb['pct_good'] = gb['good'] / gb['good'].sum()
    gb['WOE'] = np.log(gb['pct_bad'] / gb['pct_good'])
    gb['IV'] = gb['WOE'] * (gb['pct_bad'] - gb['pct_good'])
    print(x,': ',round(gb['IV'].sum(),3))

In [6]:
shortlist_var('sex')
shortlist_var('status')
shortlist_var('dependents')
shortlist_var('house')
shortlist_var('house_years')
shortlist_var('employ')
shortlist_var('employ_year')
shortlist_var('balance')
shortlist_var('purpose')
shortlist_var('emi_pct')
shortlist_var('emi_plan')
shortlist_var('property')
shortlist_var('coapplicant')
shortlist_var('guarantor')
shortlist_var('history')

sex :  0.027
status :  0.045
dependents :  0.0
house :  0.083
house_years :  0.004
employ :  0.009
employ_year :  0.086
balance :  0.069
purpose :  0.169
emi_pct :  0.026
emi_plan :  0.058
property :  0.113
coapplicant :  0.017
guarantor :  0.016
history :  0.293


# 4. numerical

In [7]:
def shortlist_var(x,y,z,i,j):
    df[x] = df[x].fillna(-1)
    dtree = tree.DecisionTreeClassifier(random_state=42, max_depth=i, min_samples_leaf=j)
    dtree = dtree.fit(df[[x]],df[y])
    
    df[z] = dtree.apply(df[[x]])
    gb = df.groupby(z).agg({'loan':'count', x:['min','max'], y:'mean'}).reset_index()
    gb.columns = ['bin', 'count', 'min', 'max', 'bad']
    
    gb['good'] = 1 - gb['bad']
    gb['pct_bad'] = gb['bad'] / gb['bad'].sum()
    gb['pct_good'] = gb['good'] / gb['good'].sum()
    gb['WOE'] = np.log(gb['pct_bad'] / gb['pct_good'])
    gb['IV'] = gb['WOE'] * (gb['pct_bad'] - gb['pct_good'])
    print(x,': ',round(gb['IV'].sum(),3))

In [8]:
shortlist_var('age', 'Y', 'bin', 3, 50)
shortlist_var('months', 'Y', 'bin', 3, 50)
shortlist_var('loan_amount', 'Y', 'bin', 3, 50)

age :  0.223
months :  0.541
loan_amount :  0.548


# 5. Export Dataset

In [9]:
df = df[['account', 'loan', 'loan_amount', 'months', 'history', 'age', 'purpose', 'property', 'Y']]
df.to_csv('../outputs/master.csv', index=False)
df.shape

(1000, 9)