# 最优分箱

In [2]:
from sklearn.tree import DecisionTreeClassifier
import pandas as pd
import numpy as np

  return f(*args, **kwds)


In [5]:
# Load the data set from the current working directory and report its shape.
dat=pd.read_csv('day08_rankingcard.csv')
print(dat.shape)
# Drop the first column by position (presumably a saved row index - confirm against the CSV).
dat=dat.iloc[:,1:]

(150000, 12)


In [15]:
# Preview the first five rows of the trimmed frame.
dat.head(5)

Unnamed: 0,SeriousDlqin2yrs,RevolvingUtilizationOfUnsecuredLines,age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60-89DaysPastDueNotWorse,NumberOfDependents
0,1,0.766127,45,2,0.802982,9120.0,13,0,6,0,2.0
1,0,0.957151,40,0,0.121876,2600.0,4,0,0,0,1.0
2,0,0.65818,38,1,0.085113,3042.0,2,1,0,0,0.0
3,0,0.23381,30,0,0.03605,3300.0,5,0,0,0,0.0
4,0,0.907239,49,1,0.024926,63588.0,7,0,1,0,0.0


In [8]:
def optimal_binning_boundary(x: pd.Series, y: pd.Series, nan: float = -999.) -> list:
    """
    Derive bin edges for a single feature from the splits of a decision tree.

    Missing values in ``x`` are replaced by ``nan`` before a one-feature
    ``DecisionTreeClassifier`` (entropy criterion, at most 6 leaves, each leaf
    holding at least 5% of the samples) is fitted against ``y``.

    Returns a list made of the smallest filled feature value, the tree's split
    thresholds in ascending order, and the largest filled feature value plus 0.1.
    """
    features = x.fillna(nan).values  # missing values become the sentinel
    labels = y.values

    tree_model = DecisionTreeClassifier(
        criterion='entropy',     # split by minimising information entropy
        max_leaf_nodes=6,        # upper limit on the number of leaves
        min_samples_leaf=0.05,   # minimum fraction of samples per leaf
    )
    tree_model.fit(features.reshape(-1, 1), labels)

    fitted = tree_model.tree_
    # Only nodes whose left and right child ids differ contribute a threshold.
    split_thresholds = sorted(
        fitted.threshold[node]
        for node in range(fitted.node_count)
        if fitted.children_left[node] != fitted.children_right[node]
    )

    lower_edge = features.min()
    # The +0.1 keeps the maximum value inside the last bin when the edges are
    # later used for half-open grouping.
    upper_edge = features.max() + 0.1
    return [lower_edge, *split_thresholds, upper_edge]

In [11]:
# Bin edges for one feature, using SeriousDlqin2yrs as the target.
optimal_binning_boundary(x=dat['RevolvingUtilizationOfUnsecuredLines'],y=dat['SeriousDlqin2yrs'])

[0.0,
 0.11458224803209305,
 0.21776090562343597,
 0.49497613310813904,
 0.6981423199176788,
 0.8596274554729462,
 50708.1]

In [13]:
def feature_woe_iv(x: pd.Series, y: pd.Series, nan: float = -999.) -> pd.DataFrame:
    """
    Compute the WOE and IV of every bin of one feature.

    Parameters
    ----------
    x : pd.Series
        Feature values; missing entries are replaced by ``nan`` before binning.
    y : pd.Series
        Target, where 0 is counted as a good customer and 1 as a bad customer.
    nan : float
        Sentinel used to fill missing feature values.

    Returns
    -------
    pd.DataFrame
        One row per bin (left-closed intervals built from
        ``optimal_binning_boundary``) with the columns ``good``, ``bad``,
        ``total``, ``good_pct``, ``bad_pct``, ``total_pct``, ``bad_rate``,
        ``woe`` and ``iv``. The feature's total IV is also printed.

    Notes
    -----
    A bin with no good or no bad customers would make the WOE infinite (and the
    total IV with it). For the WOE only, a zero count is therefore replaced by
    0.5; the reported counts and percentages are left untouched. If ``y``
    contains no good or no bad customers at all, the percentages are undefined
    and the result contains NaN/inf values.
    """
    x = x.fillna(nan)
    boundary = optimal_binning_boundary(x, y, nan)        # bin edges from the decision tree
    df = pd.concat([x, y], axis=1)                        # feature and target side by side
    df.columns = ['x', 'y']
    df['bins'] = pd.cut(x=x, bins=boundary, right=False)  # left-closed interval of every sample

    # observed=False is spelled out so every interval keeps its row regardless
    # of the pandas default for categorical groupers.
    grouped = df.groupby('bins', observed=False)['y']
    result_df = grouped.agg([('good',  lambda labels: (labels == 0).sum()),
                             ('bad',   lambda labels: (labels == 1).sum()),
                             ('total', 'count')])

    good_total = result_df['good'].sum()
    bad_total = result_df['bad'].sum()

    result_df['good_pct'] = result_df['good'] / good_total                    # share of all good customers
    result_df['bad_pct'] = result_df['bad'] / bad_total                       # share of all bad customers
    result_df['total_pct'] = result_df['total'] / result_df['total'].sum()    # share of all customers
    result_df['bad_rate'] = result_df['bad'] / result_df['total']             # bad rate inside the bin

    # Guard against log(0) and division by zero in bins that lack one class.
    good_for_woe = result_df['good'].where(result_df['good'] > 0, 0.5)
    bad_for_woe = result_df['bad'].where(result_df['bad'] > 0, 0.5)
    result_df['woe'] = np.log((good_for_woe / good_total) / (bad_for_woe / bad_total))
    result_df['iv'] = (result_df['good_pct'] - result_df['bad_pct']) * result_df['woe']
    print(f"该变量IV = {result_df['iv'].sum()}")
    return result_df

In [14]:
# WOE/IV table for one feature, using SeriousDlqin2yrs as the target.
feature_woe_iv(x=dat['RevolvingUtilizationOfUnsecuredLines'], y=dat['SeriousDlqin2yrs'])

该变量IV = 1.1025918750620314


Unnamed: 0_level_0,good,bad,total,good_pct,bad_pct,total_pct,bad_rate,woe,iv
bins,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
"[0.0, 0.115)",66466,1226,67692,0.474845,0.122282,0.45128,0.018111,1.356659,0.478308
"[0.115, 0.218)",15776,486,16262,0.112707,0.048474,0.108413,0.029886,0.843761,0.054197
"[0.218, 0.495)",23162,1245,24407,0.165474,0.124177,0.162713,0.05101,0.287103,0.011856
"[0.495, 0.698)",10499,1100,11599,0.075007,0.109715,0.077327,0.094836,-0.380305,0.0132
"[0.698, 0.86)",6716,1097,7813,0.04798,0.109416,0.052087,0.140407,-0.824361,0.050645
"[0.86, 50708.1)",17355,4872,22227,0.123987,0.485937,0.14818,0.219193,-1.365899,0.494386


In [17]:
# Print the IV of every column except the target itself.
for i in dat.columns:
    if i == 'SeriousDlqin2yrs':
        continue
    feature_woe_iv(dat[i], dat['SeriousDlqin2yrs'])

该变量IV = 1.1025918750620314
该变量IV = 0.2588002937275757
该变量IV = 0.7404812872794013
该变量IV = 0.07500037832078503
该变量IV = 0.08951432210475875
该变量IV = 0.08339038139056673
该变量IV = 0.8375513427285136
该变量IV = 0.05535386543277276
该变量IV = 0.5723728876090994
该变量IV = 0.033818251077554096


In [24]:

# Load the second data set. NOTE(review): the path is absolute and machine-specific,
# so this cell only runs where that directory exists.
dd=pd.read_csv('/Users/gengbh/code/vscode/Data-Analysis-Notes/caicai-sklearn/1决策树/day08_data.csv')

## 类别转换

In [27]:
def cate_var_transform(X, Y):
    """
    Replace every object-dtype column of ``X`` by a rank based on its bad rate.

    For each object column, the bad rate of a category is the share of its rows
    whose target equals 1. Categories are sorted by ascending bad rate and
    numbered ``n, n-1, ..., 1`` (``n`` = number of categories), so the category
    with the lowest bad rate receives the largest number. Missing values stay
    NaN in the encoded column.

    Parameters
    ----------
    X : pd.DataFrame
        Input features. Columns whose dtype is not ``object`` are passed through.
    Y : pd.Series or array-like
        Target with one entry per row of ``X``. A Series whose index differs
        from ``X.index`` is aligned to it by label; anything else is matched by
        position.

    Returns
    -------
    tuple
        ``(X_transformed, object_transfer_rule)``. ``X_transformed`` holds the
        non-object columns followed by the encoded object columns and keeps the
        index of ``X``. ``object_transfer_rule`` is a list with one DataFrame
        per object column (columns ``'raw data'``, ``'transform data'``,
        ``'bad rate'``), sorted by ascending bad rate.

    Raises
    ------
    ValueError
        If ``Y`` does not provide exactly one value per row of ``X``.
    """
    if isinstance(Y, pd.Series) and not Y.index.equals(X.index):
        Y = Y.reindex(X.index)
    is_bad = np.asarray(Y) == 1
    if is_bad.shape != (len(X),):
        raise ValueError("Y must contain exactly one value per row of X")

    # Split the columns into object-dtype ones and all the others.
    is_object = (X.dtypes == "object").to_numpy()
    object_var = X.iloc[:, np.flatnonzero(is_object)]
    num_var = X.iloc[:, np.flatnonzero(~is_object)]

    # One rule table per object column.
    object_transfer_rule = []

    # Encoded columns live on X's own index so that the final concat lines the
    # rows up instead of aligning against an unrelated 0..n-1 index.
    object_transform = pd.DataFrame(np.nan, index=X.index,
                                    columns=object_var.columns)

    for i in range(object_var.shape[1]):
        temp_var = object_var.iloc[:, i]

        # Distinct categories, ignoring missing values.
        unique_value = np.unique(temp_var.dropna().to_numpy())

        # Every category occurs at least once, so the mean is always defined.
        bad_rate = np.array(
            [is_bad[(temp_var == value).to_numpy()].mean() for value in unique_value],
            dtype=float,
        )

        transform_rule = pd.DataFrame({
            'raw data': unique_value,
            'transform data': np.zeros(len(unique_value)),
            'bad rate': bad_rate,
        })
        # Rank by bad rate: the lowest rate gets n, the highest gets 1.
        transform_rule = transform_rule.sort_values(by='bad rate')
        transform_rule['transform data'] = np.arange(len(unique_value), 0, -1,
                                                     dtype=float)
        object_transfer_rule.append(transform_rule)

        # Categories absent from the mapping (missing values) become NaN.
        mapping = dict(zip(transform_rule['raw data'],
                           transform_rule['transform data']))
        object_transform.iloc[:, i] = temp_var.map(mapping).astype(float).to_numpy()

    X_transformed = pd.concat([num_var, object_transform], axis=1)
    return (X_transformed, object_transfer_rule)

In [49]:
# a,b=cate_var_transform(dd.loc[:,['Sex','Embarked']],dd.Survived)
# Keep only the Sex and Embarked columns of dd.
X=dd.loc[:,['Sex','Embarked']]

In [54]:
d_type = X.dtypes # dtype of every column, as a Series
object_var = X.iloc[:,np.where(d_type == "object")[0]]# keep only the object-dtype columns
object_var.shape

(891, 2)

In [55]:
# Placeholder list with one slot (initially 0.0) per object column.
object_transfer_rule = list(np.zeros([len(object_var.columns)]))
object_transfer_rule

[0.0, 0.0]

In [59]:
# Pick the object column at position 1 and check its length.
temp_var = object_var.iloc[:,1]
temp_var.shape

(891,)

In [61]:
# Distinct values of temp_var, with missing entries removed first.
unique_value=np.unique(temp_var.iloc[np.where(~temp_var.isna() )[0]])
unique_value

array(['C', 'Q', 'S'], dtype=object)

In [73]:
# The non-missing entries of temp_var, selected by position.
temp_var.iloc[np.where(~temp_var.isna() )[0]]

0      S
1      C
2      S
3      S
4      S
      ..
886    S
887    S
888    S
889    C
890    Q
Name: Embarked, Length: 889, dtype: object

In [75]:
# Zero block with one row per distinct value and two columns.
np.zeros([len(unique_value),2])

array([[0., 0.],
       [0., 0.],
       [0., 0.]])

In [None]:
# Scratch walk-through of one pass of the per-column loop of cate_var_transform,
# run on the notebook-level variables instead of inside the function.
# NOTE(review): the four bindings below were reconstructed so the cell can run
# on its own - confirm they match what was intended.
Y = dd.Survived                  # target column
i = 1                            # temp_var was taken from object_var.iloc[:, 1]
num_var = X.iloc[:, np.where(d_type != "object")[0]]
object_transform = pd.DataFrame(np.zeros(object_var.shape),
                                columns=object_var.columns,
                                index=object_var.index)

# Distinct values, ignoring missing entries.
unique_value = np.unique(temp_var.iloc[np.where(~temp_var.isna())[0]])

transform_rule = pd.concat([pd.DataFrame(unique_value, columns=['raw data']),
                            pd.DataFrame(np.zeros([len(unique_value), 2]),
                                         columns=['transform data', 'bad rate'])], axis=1)
for j in range(0, len(unique_value)):
    bad_num = len(np.where((Y == 1) & (temp_var == unique_value[j]))[0])
    all_num = len(np.where(temp_var == unique_value[j])[0])

    # Bad rate of this value; the 0.5 only guards against a zero denominator.
    if all_num == 0:
        all_num = 0.5
    transform_rule.iloc[j, 2] = 1.0000000*bad_num/all_num

# Sort by bad rate and number the values n, n-1, ..., 1 in that order.
transform_rule = transform_rule.sort_values(by='bad rate')
transform_rule.iloc[:, 1] = list(range(len(unique_value), 0, -1))

# Store the rule for this column.
object_transfer_rule[i] = transform_rule
# Write the encoded value for every row of the column.
for k in range(0, len(unique_value)):
    transfer_value = transform_rule.iloc[np.where(transform_rule.iloc[:, 0] == unique_value[k])[0], 1]
    # .iloc[0] extracts the single matching value before the float conversion.
    object_transform.iloc[np.where(temp_var == unique_value[k])[0], i] = float(transfer_value.iloc[0])
# Rows still at the initial 0 were missing in the raw data.
object_transform.iloc[np.where(object_transform.iloc[:, i] == 0)[0], i] = np.nan

X_transformed = pd.concat([num_var, object_transform], axis=1)
