In [1]:
import pandas as pd, numpy as np, os, re, math, time

# to check monotonicity of a series
def is_monotonic(temp_series):
    return all(temp_series[i] <= temp_series[i + 1] for i in range(len(temp_series) - 1)) or all(temp_series[i] >= temp_series[i + 1] for i in range(len(temp_series) - 1))

def prepare_bins(bin_data, c_i, target_col, max_bins):
    force_bin = True
    binned = False
    remarks = np.nan
    # ----------------- Monotonic binning -----------------
    for n_bins in range(max_bins, 2, -1):
        try:
            bin_data[c_i + "_bins"] = pd.qcut(bin_data[c_i], n_bins, duplicates="drop")
            monotonic_series = bin_data.groupby(c_i + "_bins")[target_col].mean().reset_index(drop=True)
            if is_monotonic(monotonic_series):
                force_bin = False
                binned = True
                remarks = "binned monotonically"
                break
        except:
            pass
    # ----------------- Force binning -----------------
    # creating 2 bins forcefully because 2 bins will always be monotonic
    if force_bin or (c_i + "_bins" in bin_data and bin_data[c_i + "_bins"].nunique() < 2):
        _min=bin_data[c_i].min()
        _mean=bin_data[c_i].mean()
        _max=bin_data[c_i].max()
        bin_data[c_i + "_bins"] = pd.cut(bin_data[c_i], [_min, _mean, _max], include_lowest=True)
        if bin_data[c_i + "_bins"].nunique() == 2:
            binned = True
            remarks = "binned forcefully"
    
    if binned:
        return c_i + "_bins", remarks, bin_data[[c_i, c_i+"_bins", target_col]].copy()
    else:
        remarks = "couldn't bin"
        return c_i, remarks, bin_data[[c_i, target_col]].copy()

# calculate WOE and IV for every group/bin/class for a provided feature
def iv_woe_4iter(binned_data, target_col, class_col):
    if "_bins" in class_col:
        binned_data[class_col] = binned_data[class_col].cat.add_categories(['Missing'])
        binned_data[class_col] = binned_data[class_col].fillna("Missing")
        temp_groupby = binned_data.groupby(class_col).agg({class_col.replace("_bins", ""):["min", "max"],
                                                           target_col: ["count", "sum", "mean"]}).reset_index()
    else:
        binned_data[class_col] = binned_data[class_col].fillna("Missing")
        temp_groupby = binned_data.groupby(class_col).agg({class_col:["first", "first"],
                                                           target_col: ["count", "sum", "mean"]}).reset_index()
    
    temp_groupby.columns = ["sample_class", "min_value", "max_value", "sample_count", "event_count", "event_rate"]
    temp_groupby["non_event_count"] = temp_groupby["sample_count"] - temp_groupby["event_count"]
    temp_groupby["non_event_rate"] = 1 - temp_groupby["event_rate"]
    temp_groupby = temp_groupby[["sample_class", "min_value", "max_value", "sample_count",
                                 "non_event_count", "non_event_rate", "event_count", "event_rate"]]
    
    if "_bins" not in class_col and "Missing" in temp_groupby["min_value"]:
        temp_groupby["min_value"] = temp_groupby["min_value"].replace({"Missing": np.nan})
        temp_groupby["max_value"] = temp_groupby["max_value"].replace({"Missing": np.nan})
    temp_groupby["feature"] = class_col
    if "_bins" in class_col:
        temp_groupby["sample_class_label"]=temp_groupby["sample_class"].replace({"Missing": np.nan}).astype('category').cat.codes.replace({-1: np.nan})
    else:
        temp_groupby["sample_class_label"]=np.nan
    temp_groupby = temp_groupby[["feature", "sample_class", "sample_class_label", "sample_count", "min_value", "max_value",
                                 "non_event_count", "non_event_rate", "event_count", "event_rate"]]
    
    """
    **********get distribution of good and bad
    """
    temp_groupby['distbn_non_event'] = temp_groupby["non_event_count"]/temp_groupby["non_event_count"].sum()
    temp_groupby['distbn_event'] = temp_groupby["event_count"]/temp_groupby["event_count"].sum()

    temp_groupby['woe'] = np.log(temp_groupby['distbn_non_event'] / temp_groupby['distbn_event'])
    temp_groupby['iv'] = (temp_groupby['distbn_non_event'] - temp_groupby['distbn_event']) * temp_groupby['woe']
    
    temp_groupby["woe"] = temp_groupby["woe"].replace([np.inf,-np.inf],0)
    temp_groupby["iv"] = temp_groupby["iv"].replace([np.inf,-np.inf],0)
    
    return temp_groupby

"""
- iterate over all features.
- calculate WOE & IV for there classes.
- append to one DataFrame woe_iv.
"""
def var_iter(data, target_col, max_bins):
    woe_iv = pd.DataFrame()
    remarks_list = []
    for c_i in data.columns:
        if c_i not in [target_col]:
            # check if binning is required. if yes, then prepare bins and calculate woe and iv.
            """
            ----logic---
            binning is done only when feature is continuous and non-binary.
            Note: Make sure dtype of continuous columns in dataframe is not object.
            """
            c_i_start_time=time.time()
            if np.issubdtype(data[c_i], np.number) and data[c_i].nunique() > 2:
                class_col, remarks, binned_data = prepare_bins(data[[c_i, target_col]].copy(), c_i, target_col, max_bins)
                agg_data = iv_woe_4iter(binned_data.copy(), target_col, class_col)
                remarks_list.append({"feature": c_i, "remarks": remarks})
            else:
                agg_data = iv_woe_4iter(data[[c_i, target_col]].copy(), target_col, c_i)
                remarks_list.append({"feature": c_i, "remarks": "categorical"})
            # print("---{} seconds. c_i: {}----".format(round(time.time() - c_i_start_time, 2), c_i))
            woe_iv = woe_iv._append(agg_data)
    return woe_iv, pd.DataFrame(remarks_list)

# after getting woe and iv for all classes of features calculate aggregated IV values for features.
def get_iv_woe(data, target_col, max_bins):
    func_start_time = time.time()
    woe_iv, binning_remarks = var_iter(data, target_col, max_bins)
    print("------------------IV and WOE calculated for individual groups.------------------")
    print("Total time elapsed: {} minutes".format(round((time.time() - func_start_time) / 60, 3)))
    
    woe_iv["feature"] = woe_iv["feature"].replace("_bins", "", regex=True)    
    woe_iv = woe_iv[["feature", "sample_class", "sample_class_label", "sample_count", "min_value", "max_value",
                     "non_event_count", "non_event_rate", "event_count", "event_rate", 'distbn_non_event',
                     'distbn_event', 'woe', 'iv']]
    
    iv = woe_iv.groupby("feature")[["iv"]].agg(["sum", "count"]).reset_index()
    print("------------------Aggregated IV values for features calculated.------------------")
    print("Total time elapsed: {} minutes".format(round((time.time() - func_start_time) / 60, 3)))
    
    iv.columns = ["feature", "iv", "number_of_classes"]
    null_percent_data=pd.DataFrame(data.isnull().mean()).reset_index()
    null_percent_data.columns=["feature", "feature_null_percent"]
    iv=iv.merge(null_percent_data, on="feature", how="left")
    print("------------------Null percent calculated in features.------------------")
    print("Total time elapsed: {} minutes".format(round((time.time() - func_start_time) / 60, 3)))
    iv = iv.merge(binning_remarks, on="feature", how="left")
    woe_iv = woe_iv.merge(iv[["feature", "iv", "remarks"]].rename(columns={"iv": "iv_sum"}), on="feature", how="left")
    print("------------------Binning remarks added and process is complete.------------------")
    print("Total time elapsed: {} minutes".format(round((time.time() - func_start_time) / 60, 3)))
    return iv, woe_iv.replace({"Missing": np.nan})

In [2]:
master_data=pd.read_csv('../artifacts/training_data.csv')


In [3]:
master_data.isnull().sum().sum()
'Var196_1K8T'
master_data['Var196_1K8T'].isnull().sum()

np.int64(0)

In [4]:
iv, woe_iv = get_iv_woe(master_data.loc[:,~master_data.columns.isin(['Acct id'])].copy(), target_col="Label", max_bins=20)

  target_col: ["count", "sum", "mean"]}).reset_index()
  target_col: ["count", "sum", "mean"]}).reset_index()
  target_col: ["count", "sum", "mean"]}).reset_index()
  target_col: ["count", "sum", "mean"]}).reset_index()
  target_col: ["count", "sum", "mean"]}).reset_index()
  target_col: ["count", "sum", "mean"]}).reset_index()
  target_col: ["count", "sum", "mean"]}).reset_index()
  target_col: ["count", "sum", "mean"]}).reset_index()
  target_col: ["count", "sum", "mean"]}).reset_index()
  target_col: ["count", "sum", "mean"]}).reset_index()
  target_col: ["count", "sum", "mean"]}).reset_index()
  target_col: ["count", "sum", "mean"]}).reset_index()
  target_col: ["count", "sum", "mean"]}).reset_index()
  target_col: ["count", "sum", "mean"]}).reset_index()
  target_col: ["count", "sum", "mean"]}).reset_index()
  target_col: ["count", "sum", "mean"]}).reset_index()
  target_col: ["count", "sum", "mean"]}).reset_index()
  target_col: ["count", "sum", "mean"]}).reset_index()
  target_c

------------------IV and WOE calculated for individual groups.------------------
Total time elapsed: 1.822 minutes
------------------Aggregated IV values for features calculated.------------------
Total time elapsed: 1.822 minutes
------------------Null percent calculated in features.------------------
Total time elapsed: 1.824 minutes
------------------Binning remarks added and process is complete.------------------
Total time elapsed: 1.824 minutes


In [5]:
iv.sort_values(['iv'],ascending=False)

Unnamed: 0,feature,iv,number_of_classes,feature_null_percent,remarks
515,Var217_embed_27,0.436326,11,0.0,binned monotonically
537,Var217_embed_47,0.428020,14,0.0,binned monotonically
511,Var217_embed_23,0.421952,12,0.0,binned monotonically
508,Var217_embed_20,0.412461,8,0.0,binned monotonically
499,Var217_embed_12,0.405638,8,0.0,binned monotonically
...,...,...,...,...,...
18,Var173,0.000000,1,0.0,categorical
713,Var44,0.000000,1,0.0,categorical
711,Var35,0.000000,1,0.0,categorical
19,Var181,0.000000,1,0.0,categorical


In [6]:
iv.sort_values(['iv'],ascending=False).to_csv('../data-analysis/iv.csv',index=False)
woe_iv.sort_values(['iv'],ascending=False).to_csv('../data-analysis/woe.csv',index=False)

In [7]:
woe_iv[woe_iv['feature']=='Var74']

Unnamed: 0,feature,sample_class,sample_class_label,sample_count,min_value,max_value,non_event_count,non_event_rate,event_count,event_rate,distbn_non_event,distbn_event,woe,iv,iv_sum,remarks
191,Var74,"(-0.001, 21.0]",0.0,21627,0.0,21.0,19775,0.914366,1852,0.085634,0.6097,0.721746,-0.168707,0.018903,0.060601,binned monotonically
192,Var74,"(21.0, 105.0]",1.0,6388,22.4,105.0,6011,0.940983,377,0.059017,0.18533,0.146921,0.232242,0.00892,0.060601,binned monotonically
193,Var74,"(105.0, 227.5]",2.0,6985,105.7,227.5,6648,0.951754,337,0.048246,0.20497,0.131333,0.445129,0.032778,0.060601,binned monotonically
194,Var74,,,0,,,0,,0,,0.0,0.0,,,0.060601,binned monotonically


In [8]:
woe_iv.sort_values(['iv'],ascending=False)

Unnamed: 0,feature,sample_class,sample_class_label,sample_count,min_value,max_value,non_event_count,non_event_rate,event_count,event_rate,distbn_non_event,distbn_event,woe,iv,iv_sum,remarks
2824,Var217_embed_15,"(-0.112, -0.048]",0.0,7003,-0.110800,-0.047982,6860,0.979580,143,0.020420,0.211506,0.055729,1.333759,0.207770,0.332239,binned monotonically
1742,Var202_embed_11,"(-0.108, -0.0544]",0.0,5001,-0.107425,-0.054402,4927,0.985203,74,0.014797,0.151908,0.028839,1.661561,0.204488,0.312231,binned monotonically
1664,Var202_embed_0,"(0.051, 0.137]",4.0,7000,0.050987,0.136648,6853,0.979000,147,0.021000,0.211291,0.057288,1.305150,0.200997,0.318270,binned monotonically
2897,Var217_embed_23,"(-0.0785, -0.0313]",0.0,2921,-0.077547,-0.031343,2322,0.794933,599,0.205067,0.071592,0.233437,-1.181936,0.191291,0.421952,binned monotonically
1689,Var202_embed_4,"(-0.119, -0.0579]",0.0,4377,-0.117868,-0.057931,4317,0.986292,60,0.013708,0.133101,0.023383,1.739112,0.190813,0.328376,binned monotonically
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3884,Var228_embed_10,,,0,,,0,,0,,0.000000,0.000000,,,0.063840,binned monotonically
3887,Var228_embed_11,,,0,,,0,,0,,0.000000,0.000000,,,0.030437,binned monotonically
3890,Var228_embed_12,,,0,,,0,,0,,0.000000,0.000000,,,0.066139,binned monotonically
3893,Var228_embed_13,,,0,,,0,,0,,0.000000,0.000000,,,0.003107,binned monotonically
