WOE - Weight of Evidence

IV - Information Value

In [7]:
# import Libraries here
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.ml.feature import QuantileDiscretizer, VectorAssembler
import scipy.stats.stats as stats

import numpy as np
import pandas as pd

In [2]:
spark = SparkSession.builder.getOrCreate()
dataset = 2 # load either Melbourne Housing dataset (2) or Bank dataset(1)

# Both branches read a CSV with a header row and recode the target column
# to a binary 0/1 flag so that it can be summed and averaged later on.
# if option is set as 1, load bank dataset
if dataset == 1:
    filename = 'bank/bank-full.csv'
    target_variable_name = 'y'
    df = spark.read.csv(filename, header=True, inferSchema=True, sep=';')
    # 'no' -> 0, everything else -> 1.  The column must be looked up through
    # the variable, not the literal string 'target_variable_name'.
    df = df.withColumn(target_variable_name, F.when(df[target_variable_name] == 'no',
                                            0).otherwise(1))
else:
    filename = 'melb_data.csv'
    target_variable_name = 'type'
    df = spark.read.csv(filename, header=True, inferSchema=True, sep=',')
    # 'h' -> 0, everything else -> 1
    df = df.withColumn(target_variable_name, F.when(df[target_variable_name] == 'h',
                                             0).otherwise(1))

df.show(5, truncate=False)


+----------+----------------+-----+----+---------+------+-------+---------+--------+--------+--------+--------+---+--------+------------+---------+-----------+---------+----------+---------------------+-------------+
|Suburb    |Address         |Rooms|type|Price    |Method|SellerG|Date     |Distance|Postcode|Bedroom2|Bathroom|Car|Landsize|BuildingArea|YearBuilt|CouncilArea|Lattitude|Longtitude|Regionname           |Propertycount|
+----------+----------------+-----+----+---------+------+-------+---------+--------+--------+--------+--------+---+--------+------------+---------+-----------+---------+----------+---------------------+-------------+
|Abbotsford|85 Turner St    |2    |0   |1480000.0|S     |Biggin |3/12/2016|2.5     |3067.0  |2.0     |1.0     |1.0|202.0   |null        |null     |Yarra      |-37.7996 |144.9984  |Northern Metropolitan|4019.0       |
|Abbotsford|25 Bloomburg St |2    |0   |1035000.0|S     |Biggin |4/02/2016|2.5     |3067.0  |2.0     |1.0     |0.0|156.0   |79.0    

In [3]:
# number of rows per value of the recoded target column
df.groupBy(target_variable_name).count().show()

+----+-----+
|type|count|
+----+-----+
|   1| 4131|
|   0| 9449|
+----+-----+



In [5]:
# identify variable types and perform some operations
def variable_type(df):
    """Split the columns of ``df`` into string-typed and all other columns.

    Args:
        df: object exposing ``dtypes`` as an iterable of
            ``(column_name, dtype_name)`` pairs.

    Returns:
        A ``(char_vars, num_vars)`` tuple of column-name lists, each in the
        order the columns appear in ``df.dtypes``.  ``char_vars`` holds the
        columns whose dtype name is exactly ``'string'``; ``num_vars`` holds
        every other column.
    """
    char_vars, num_vars = [], []
    for name, dtype in df.dtypes:
        # exact comparison: ``dtype in ('string')`` would be a substring test
        # against the str 'string', not a tuple membership test
        if dtype == 'string':
            char_vars.append(name)
        else:
            num_vars.append(name)
    return char_vars, num_vars

# split the columns of df by dtype and show the string-typed ones
char_vars, num_vars = variable_type(df)
print('Character Variable: {}'.format(char_vars)) 

Character Variable: ['Suburb', 'Address', 'Method', 'SellerG', 'Date', 'CouncilArea', 'Regionname']


In [8]:
# assemble the list of variables that go into the WOE / IV computation
if dataset != 1:
    # these housing columns are dropped from the character variables
    for dropped in ('Address', 'SellerG', 'Date', 'Suburb'):
        char_vars.remove(dropped)

# the target itself must not be treated as a predictor
num_vars.remove(target_variable_name)
final_vars = [*char_vars, *num_vars]

final_vars

['Method',
 'CouncilArea',
 'Regionname',
 'Rooms',
 'Price',
 'Distance',
 'Postcode',
 'Bedroom2',
 'Bathroom',
 'Car',
 'Landsize',
 'BuildingArea',
 'YearBuilt',
 'Lattitude',
 'Longtitude',
 'Propertycount']

In [None]:
# threshold compared against |Spearman r| in mono_bin's loop condition
custom_rho = 1
# default / starting number of quantile buckets passed to mono_bin
max_bin = 20

def calculcate_woe(count_df, event_df, min_value, max_value, feature):
    """Build the per-group WOE table of one variable and attach its total IV.

    Args:
        count_df: pandas frame with the group key and a ``count`` column.
        event_df: pandas frame with the group key and an ``event`` column.
        min_value: values stored in the ``min_value`` column.
        max_value: values stored in the ``max_value`` column.
        feature: variable name written to the ``varname`` column.

    Returns:
        pandas frame with one row per group and the columns ``varname``,
        ``min_value``, ``max_value``, ``count``, ``event``, ``non_event``,
        ``event_rate``, ``nonevent_rate``, ``dist_event``, ``dist_nonevent``,
        ``woe`` and ``iv``.  Infinite values are replaced by zero and ``iv``
        carries the same summed value on every row.
    """
    table = pd.merge(left=count_df, right=event_df)
    table['min_value'] = min_value
    table['max_value'] = max_value

    # raw counts and within-group rates
    events = table['event']
    non_events = table['count'] - events
    table['non_event'] = non_events
    table['event_rate'] = events / table['count']
    table['nonevent_rate'] = non_events / table['count']

    # share of all events / non-events that falls into each group
    share_event = events / events.sum()
    share_nonevent = non_events / non_events.sum()
    table['dist_event'] = share_event
    table['dist_nonevent'] = share_nonevent

    log_ratio = np.log(share_event / share_nonevent)
    table['woe'] = log_ratio
    table['iv'] = (share_event - share_nonevent) * log_ratio
    table['varname'] = [feature] * len(table)

    ordered_columns = ['varname', 'min_value', 'max_value', 'count', 'event', 'non_event',
                       'event_rate', 'nonevent_rate', 'dist_event', 'dist_nonevent', 'woe', 'iv']
    report = table[ordered_columns]
    # groups without events or without non-events yield +/- infinity; zero them
    report = report.replace([np.inf, -np.inf], 0)
    # every row carries the variable's total information value
    report['iv'] = report['iv'].sum()
    return report

# monotonic binning function implemented along with Spearman correlation
def mono_bin(temp_df, feature, target, n = max_bin):
    """Quantile-bin a numeric feature, shrinking the bin count until monotonic.

    Starting with ``n`` buckets, ``feature`` is discretised with
    ``QuantileDiscretizer`` and the Spearman correlation between the
    per-bucket average of ``feature`` and the per-bucket average of
    ``target`` is computed.  The bucket count is reduced by one per attempt
    until the absolute correlation reaches ``custom_rho`` or fewer than two
    buckets would remain.

    Args:
        temp_df: Spark DataFrame holding the ``feature`` and ``target`` columns.
        feature: name of the numeric column to discretise.
        target: name of the target column.
        n: number of buckets of the first attempt.

    Returns:
        ``temp_df`` with an additional ``buckets`` column taken from the last
        successful binning attempt.  If no attempt succeeds, or ``n`` is not
        greater than 1, ``temp_df`` is returned unchanged.
    """
    import warnings

    binned_df = temp_df
    r = 0
    while np.abs(r) < custom_rho and n > 1:
        try:
            # Quantile Discretizer cuts data into equal no of observations
            qds = QuantileDiscretizer(numBuckets=n, inputCol=feature, outputCol='buckets',
                                      relativeError=0.01)
            # Always bin the untouched input frame: transforming a frame that
            # already has a 'buckets' column would fail on every retry.
            candidate_df = qds.fit(temp_df).transform(temp_df)
            # explicit aliases keep the column order independent of dict ordering
            corr_df = candidate_df.groupBy('buckets').agg(
                F.avg(feature).alias(feature),
                F.avg(target).alias(target)).toPandas()
            r, _ = stats.spearmanr(corr_df[feature], corr_df[target])
            binned_df = candidate_df
        except Exception as exc:
            # best effort: report the failed attempt and retry with fewer buckets
            warnings.warn('binning {} into {} buckets failed: {}'.format(feature, n, exc))
        n = n - 1
    # return only after the search has finished, not after the first attempt
    return binned_df

# execute WOE for all the variables in the dataset
def execute_woe(df, target):
    """Compute the WOE table and the IV summary for every variable in ``final_vars``.

    Variables listed in ``num_vars`` are first bucketed with ``mono_bin`` and
    grouped by bucket; all other variables are grouped by their own values.

    Args:
        df: Spark DataFrame containing the variables and the target column.
        target: name of the target column; it is skipped as a predictor.

    Returns:
        A ``(final_woe_df, iv)`` tuple of pandas frames: the concatenated
        per-group WOE rows of all variables, and one row per variable with
        the columns ``varname`` and ``IV``.

    Raises:
        ValueError: if ``final_vars`` holds no variable other than ``target``.
    """
    woe_frames = []
    for feature in final_vars:
        if feature == target:
            continue
        temp_df = df.select([feature, target])
        is_numeric = feature in num_vars
        if is_numeric:
            # perform monotonic binning for numeric variables before WOE calc
            temp_df = mono_bin(temp_df, feature, target, n=max_bin)
            group_col = 'buckets'
        else:
            # just group categories in categorical
            group_col = feature

        # One aggregation per variable: count, events and (for numeric
        # variables) the bucket bounds come from the same result, so their
        # rows are guaranteed to line up.
        aggregations = [F.count(target).alias('count'), F.sum(target).alias('event')]
        if is_numeric:
            aggregations += [F.min(feature).alias('min'), F.max(feature).alias('max')]
        summary = temp_df.groupBy(group_col).agg(*aggregations).toPandas()

        count_df = summary[[group_col, 'count']]
        event_df = summary[[group_col, 'event']]
        # store min and max values; for a category both take the category value
        if is_numeric:
            min_value, max_value = summary['min'], summary['max']
        else:
            min_value, max_value = summary[group_col], summary[group_col]

        # calc WOE & IV
        woe_frames.append(calculcate_woe(count_df, event_df, min_value, max_value, feature))

    if not woe_frames:
        raise ValueError('no variables other than the target to compute WOE for')

    # DataFrame.append was removed in pandas 2.0; concatenate once at the end
    final_woe_df = pd.concat(woe_frames, ignore_index=True)
    # separate IV dataset creation, built once after all variables are processed
    iv = pd.DataFrame({'IV': final_woe_df.groupby('varname').iv.max()})
    iv = iv.reset_index()
    return final_woe_df, iv

# run the WOE / IV computation: `output` receives the first frame returned by
# execute_woe (the WOE rows) and `iv` the second (the per-variable IV summary)
output, iv = execute_woe(df, target_variable_name)