In [2]:
import pandas as pd
import numpy as np
import time
import warnings

In [3]:
def _merge_cut_into_next(Cuts, idx):
    """
        Fold cut `idx` into cut `idx + 1` and return the re-indexed frame.

        The surviving cut receives the summed Count/Target and inherits the
        lower bound of the absorbed cut; Rate and Coverage are refreshed afterwards.
    """
    nxt = idx + 1
    # Explicit .loc writes: chained indexing (Cuts['Count'][nxt] = ...) assigns to a
    # temporary object under pandas Copy-on-Write and would silently lose the merge.
    Cuts.loc[nxt, 'Count'] = Cuts.loc[idx, 'Count'] + Cuts.loc[nxt, 'Count']
    Cuts.loc[nxt, 'Target'] = Cuts.loc[idx, 'Target'] + Cuts.loc[nxt, 'Target']
    Cuts.loc[nxt, 'LowerBound'] = Cuts.loc[idx, 'LowerBound']
    Cuts = Cuts.drop(idx).reset_index(drop=True)
    Cuts['Rate'] = Cuts['Target'] / Cuts['Count']
    Cuts['Coverage'] = Cuts['Count'] / Cuts['Count'].sum()
    return Cuts


def WOETransformation(Data, Target, CoverBound = 0.01, MeanDiffBound = 0.01, IVBound = 0.02, EE=0):

    """
        Bin every column of the given data except the target column, then
        calculate WOE and IV for each bin and the total IV for each column.

        Binning starts from the distinct values a column takes on rows where
        Target == 1 (or from quantile-like groups of all distinct values when there
        are too many), builds half-open cuts (LowerBound, UpperBound], and merges
        neighbouring cuts until every cut covers more than CoverBound of the rows
        and differs from its neighbours by more than MeanDiffBound in target rate.
        Rows with a missing value get their own cut; when that cut holds fewer than
        round(len(Data) * CoverBound) rows it is assigned the Bin number of the
        non-null cut with the closest target rate.

        Parameters:
            Data          : DataFrame to be binned. Columns are compared against numeric
                            bounds and passed through np.isnan, so they are presumably
                            expected to be numeric - TODO confirm with callers.
            Target        : Target column name in Data (rows with value 1 count as targets).
            CoverBound    : Minimum share of rows a cut must cover to stay un-merged.
            MeanDiffBound : Minimum difference between the target rates of adjacent cuts.
            IVBound       : A column is selected when its total IV is greater than this value.
            EE            : When 1, export all results and the execution log to
                            'WOETransformationNew.xlsx'.

        Returns:
         AllBinning(DataFrame): Bin, WOE, IV values for each cut of each column.
        ,FeatureSelection(DataFrame): One row per column with its total IV and Selection
                                      (-1 if IV is infinite, 1 if IV > IVBound, else 0).
        ,BinResult(DataFrame): Rows of AllBinning for the columns whose Selection is 1.

        Side effects:
            Prints progress messages and silences all warnings process-wide via
            warnings.simplefilter('ignore').

        Example:
        AllBinning, FeatureSelection, BinResult = WOETransformation(Train, 'Target')
    """
    warnings.simplefilter('ignore')

    # (ColumnName, Action, StartTime, EndTime) records; only used by the Excel export.
    ExecutionLog = []

    # Empty, typed template that fixes the column order of the final result.
    AllBinning = pd.DataFrame({  'ColumnName': pd.Series(dtype='str')
                                ,'Bin': pd.Series(dtype='int')
                                ,'LowerBound': pd.Series(dtype='float')
                                ,'UpperBound': pd.Series(dtype='float')
                                ,'Coverage': pd.Series(dtype='float')
                                ,'Rate': pd.Series(dtype='float')
                                ,'Freq': pd.Series(dtype='int')
                                ,'Target': pd.Series(dtype='int')
                                ,'NonTarget': pd.Series(dtype='int')
                                ,'BinFreq': pd.Series(dtype='int')
                                ,'BinTarget': pd.Series(dtype='int')
                                ,'BinNonTarget': pd.Series(dtype='int')
                                ,'BinPercTarget': pd.Series(dtype='float')
                                ,'BinPercNonTarget': pd.Series(dtype='float')
                                ,'WOE': pd.Series(dtype='float')
                                ,'IV': pd.Series(dtype='float')
                                ,'BinClass': pd.Series(dtype='str')
                            })
    BinnedColumns = []

    print('WOE Calculation...')

    for ColumnName in Data.columns.drop(Target):

        StartTime = time.strftime('%H:%M:%S', time.localtime())

        # Candidate cut points: distinct non-null values seen on target rows.
        Bounds = Data[ColumnName][Data[Target]==1].unique()
        Bounds = np.sort(Bounds[~np.isnan(Bounds)])

        if len(Bounds) > round(len(Data)*CoverBound):
            # Too many candidates: pre-group all distinct values into GroupNumber chunks.
            # Clamp to 1 because round() yields 0 on small datasets, which used to
            # raise ZeroDivisionError in the division below.
            GroupNumber = max(1, round(len(Data)*(CoverBound**2)))
            Bounds = pd.DataFrame({ColumnName: np.sort(Data[ColumnName].unique())}).dropna()
            Bounds['SortNumber'] = np.arange(1, len(Bounds) + 1)
            Bounds['Group'] = np.floor(Bounds['SortNumber']/(len(Bounds)/GroupNumber)) + 1
            BoundsAgg = Bounds.groupby('Group').aggregate(Count=('SortNumber','count'), Max=(ColumnName,'max'))
            # A trailing group holding a single value is dropped.
            if BoundsAgg.iloc[len(BoundsAgg)-1,0] == 1:
                BoundsAgg = BoundsAgg.drop(BoundsAgg.index.max())
            Bounds = list(BoundsAgg['Max'].unique())

        # Turn n bounds into n cuts (lower, upper]: the first cut starts at -inf, the
        # last one ends at +inf, and the largest bound is therefore never an edge.
        BoundCount = len(Bounds)
        if BoundCount == 0:
            Lowers, Uppers = [], []
        else:
            Inner = list(Bounds[:BoundCount - 1])
            Lowers = [-np.inf] + Inner
            Uppers = Inner + [np.inf]

        Counts, Targets = [], []
        for Lower, Upper in zip(Lowers, Uppers):
            InCut = Data.loc[(Data[ColumnName] > Lower) & (Data[ColumnName] <= Upper), Target]
            Counts.append(InCut.count())
            Targets.append(InCut.sum())

        Cuts = pd.DataFrame({'LowerBound': np.asarray(Lowers, dtype=float)
                            ,'UpperBound': np.asarray(Uppers, dtype=float)
                            ,'Count': np.asarray(Counts, dtype=float)
                            ,'Target': np.asarray(Targets, dtype=float)
                            })

        Cuts['Rate'] = Cuts['Target'] / Cuts['Count']
        Cuts['Coverage'] = Cuts['Count'] / Cuts['Count'].sum()

        # Merge adjacent cuts until each one is big enough and distinct enough.
        # After every merge the scan restarts from the first cut.
        i = 0
        while i < len(Cuts) and len(Cuts) > 1:
            Last = len(Cuts) - 1
            Rates = Cuts['Rate'].to_numpy()
            Sparse = Cuts['Coverage'].to_numpy()[i] <= CoverBound
            # Comparisons involving a NaN rate evaluate to False, i.e. "not close".
            NearPrev = i > 0 and abs(Rates[i] - Rates[i-1]) <= MeanDiffBound
            NearNext = i < Last and abs(Rates[i] - Rates[i+1]) <= MeanDiffBound

            if Sparse or NearPrev or NearNext:
                if i == 0:
                    Absorbed = 0            # first cut can only merge upwards
                elif i == Last:
                    Absorbed = i - 1        # last cut can only merge downwards
                elif abs(Rates[i] - Rates[i+1]) < abs(Rates[i] - Rates[i-1]):
                    Absorbed = i            # next cut has the closer rate
                else:
                    Absorbed = i - 1        # previous cut has the closer (or equal) rate
                Cuts = _merge_cut_into_next(Cuts, Absorbed)
                i = 0
            else:
                i = i + 1

        #Add Null Value
        NullTargets = Data.loc[Data[ColumnName].isnull(), Target]
        NullFreq = NullTargets.count()
        NullTarget = NullTargets.sum()
        if NullFreq > 0:
            # Rate/Coverage placeholders (0, 0) are overwritten right below.
            Cuts.loc[len(Cuts.index)] = [np.nan, np.nan, NullFreq, NullTarget, 0, 0]
            Cuts['Rate'] = Cuts['Target'] / Cuts['Count']
            Cuts['Coverage'] = Cuts['Count'] / Cuts['Count'].sum()

        ExecutionLog.append([ColumnName,'Binning',StartTime,time.strftime('%H:%M:%S', time.localtime())])

        #Calculate WOE
        StartTime = time.strftime('%H:%M:%S', time.localtime())

        Cuts = Cuts.rename(columns={'Count': 'Freq'})[['LowerBound','UpperBound','Coverage','Rate','Freq','Target']].copy()
        Cuts.insert(loc=0, column='Bin', value=np.arange(1, len(Cuts) + 1, dtype='int64'))
        Cuts.insert(loc=0, column='ColumnName', value=ColumnName)

        if (NullFreq > 0) and (NullFreq < round(len(Data)*CoverBound)):
            # Small null group: give it the Bin number of the non-null cut whose target
            # rate is closest. The null cut is the last row, so it is excluded from the scan.
            NullRate = NullTarget / NullFreq
            CutBin = 1
            Diff = abs(Cuts.loc[0, 'Rate'] - NullRate)
            for j in range(1, Cuts.index.max()):
                Gap = abs(Cuts.loc[j, 'Rate'] - NullRate)
                if Diff > Gap:
                    CutBin = Cuts.loc[j, 'Bin']
                    Diff = Gap

            Cuts.loc[Cuts.index[-1], 'Bin'] = CutBin

        Cuts['NonTarget'] = Cuts['Freq'] - Cuts['Target']

        # Bin-level statistics (a Bin may span the null cut plus one regular cut).
        Bin = Cuts.groupby('Bin').aggregate(BinFreq=('Freq','sum'), BinTarget=('Target','sum')).reset_index()
        Bin['BinNonTarget'] = Bin['BinFreq'] - Bin['BinTarget']
        Bin['BinPercTarget'] = Bin['BinTarget'] / Bin['BinTarget'].sum()
        Bin['BinPercNonTarget'] = Bin['BinNonTarget'] / Bin['BinNonTarget'].sum()
        Bin['WOE'] = np.log(Bin['BinPercNonTarget']/Bin['BinPercTarget'])
        Bin['IV'] = (Bin['BinPercNonTarget']-Bin['BinPercTarget'])*Bin['WOE']

        # BinClass: -1 for the third of bins with the lowest WOE, 1 for the third with
        # the highest, 0 otherwise. Built as an array and assigned in one step so the
        # values are actually stored (no chained .iloc assignment).
        Bin = Bin.sort_values(by=['WOE'])
        BinClass = np.zeros(len(Bin), dtype='int64')
        Proportion = round(len(Bin)/3)
        if Proportion >= 1:
            BinClass[:Proportion] = -1
            BinClass[-Proportion:] = 1
        Bin['BinClass'] = BinClass
        Bin = Bin.sort_values(by=['Bin'])

        Cuts = Cuts.merge(Bin, on='Bin', how='left')

        ExecutionLog.append([ColumnName,'WOE and IV Calculation',StartTime,time.strftime('%H:%M:%S', time.localtime())])

        BinnedColumns.append(Cuts)

    # DataFrame.append was removed in pandas 2.0; concatenate once instead of per column.
    AllBinning = pd.concat([AllBinning] + BinnedColumns, ignore_index=True)

    #Feature Selection
    print('Feature Selection...')
    StartTime = time.strftime('%H:%M:%S', time.localtime())

    # One row per (column, bin) so a bin shared by the null cut is counted once.
    Bin = AllBinning[['ColumnName','Bin','IV']].drop_duplicates()

    FeatureSelection = Bin.groupby('ColumnName').aggregate(IV=('IV','sum')).sort_values(by=['IV'], ascending=False).reset_index()
    FeatureSelection['Selection']=[-1 if x == np.inf else 1 if x > IVBound else 0 for x in FeatureSelection['IV']]

    ExecutionLog.append(['All','Feature Selection',StartTime,time.strftime('%H:%M:%S', time.localtime())])

    BinResult = AllBinning[AllBinning.ColumnName.isin(FeatureSelection['ColumnName'][FeatureSelection['Selection']==1])]

    if EE == 1:
        print('Excel Export...')
        ExecutionTime = pd.DataFrame(ExecutionLog, columns=['ColumnName','Action','StartTime','EndTime'])
        with pd.ExcelWriter('WOETransformationNew.xlsx') as writer:
            AllBinning.to_excel(writer, sheet_name='AllBinning', index=False)
            FeatureSelection.to_excel(writer, sheet_name='FeatureSelection', index=False)
            BinResult.to_excel(writer, sheet_name='BinResult', index=False)
            ExecutionTime.to_excel(writer, sheet_name='ExecutionTime', index=False)

    print('WOETransformation Completed... \n')

    print(str(len(AllBinning.ColumnName.unique())) + ' değişkenden ' + str(len(BinResult.ColumnName.unique())) + ' değişken seçildi.\n' + 'Değişken listesi: ' + str(BinResult.ColumnName.unique())) 

    return AllBinning, FeatureSelection, BinResult

In [4]:
def ApplyWOE(Data, BinResult):
    """
        Apply a WOETransformation result to the given data.
        Creates 2 new columns for each column in BinResult (ColumnName_Bin and ColumnName_WOE).

        A value v is assigned to the bin whose bounds satisfy LowerBound < v <= UpperBound.
        Missing values take the Bin/WOE of the row with null bounds; when BinResult has
        no such row for a column, they fall back to the bin with the lowest LowerBound.

        Parameters:
        Data      : DataFrame to transform. It is copied, not modified.
        BinResult : BinResult of WOETransformation (needs the columns ColumnName,
                    LowerBound, UpperBound, Bin and WOE).

        Returns:
         Sample(DataFrame): Copy of the given data + 2 new columns for each column in BinResult.

        Side effects:
            Silences all warnings process-wide via warnings.simplefilter('ignore').

        Example:
        Data = ApplyWOE(Train, BinResult)
    """
    warnings.simplefilter('ignore')

    Sample = Data.copy()

    for ColumnName in BinResult['ColumnName'].unique():

        BinColumn = ColumnName + '_Bin'
        WOEColumn = ColumnName + '_WOE'
        Sample[BinColumn] = np.nan
        Sample[WOEColumn] = np.nan

        # Sorting by LowerBound puts the lowest bin first and any null-bound row last.
        BinValues = (BinResult.loc[BinResult['ColumnName'] == ColumnName, ['LowerBound','UpperBound','Bin','WOE']]
                     .sort_values(by=['LowerBound'])
                     .reset_index(drop=True))

        # No dedicated null bin: route missing values to the first (lowest) bin.
        if BinValues['LowerBound'].isnull().sum() == 0:
            BinValues.loc[len(BinValues)] = [np.nan, np.nan, BinValues.iloc[0, 2], BinValues.iloc[0, 3]]

        Values = Sample[ColumnName]
        for Lower, Upper, BinId, WOE in BinValues.itertuples(index=False, name=None):
            if pd.isna(Lower):
                Mask = Values.isna()
            else:
                Mask = (Values > Lower) & (Values <= Upper)
            # Single-step .loc assignment: the chained form Sample[col][Mask] = value
            # writes to a temporary under pandas Copy-on-Write and leaves the column NaN.
            Sample.loc[Mask, BinColumn] = BinId
            Sample.loc[Mask, WOEColumn] = WOE

    return Sample