### Reduce item_type_code_name levels by using weight of evidence

In [1]:
%%capture

%run '../lib/init.ipynb'

import pandas as pd
import numpy as np

from lib import utilities as util
from lib.woe import WOE_IV

from pyspark import SparkContext
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

sc = SparkContext.getOrCreate()
spark = SparkSession( sc )

In [2]:
filterDf  = util.load_df('filterDf')
types = util.get_var_types()

###### Display levels

In [26]:
util.count_values(filterDf, 'item_type_code_name').show(50, False)

+----------------------+-----+
|item_type_code_name   |count|
+----------------------+-----+
|multipleChoice        |26704|
|fillInTheBlank        |10497|
|equationEntry         |7841 |
|trueFalse             |4561 |
|cloze                 |3024 |
|multipleSelect        |2295 |
|graphing              |2076 |
|MultipleChoiceResponse|1550 |
|choiceMatrix          |1203 |
|matching              |924  |
|shortAnswer           |853  |
|selectText            |813  |
|bucketing             |799  |
|sortable              |549  |
|essay                 |531  |
|numberLine            |302  |
|aheAlgo               |219  |
|imageLabel            |80   |
|RubricResponse        |11   |
|FillinBlankResponse   |5    |
|fileUpload            |1    |
+----------------------+-----+



###### Display item_type_code_name and scoring_type_code Corelation

In [27]:
dfPd = filterDf.select('item_type_code_name','scoring_type_code').toPandas()
pd.crosstab(dfPd.item_type_code_name.fillna('null'), dfPd.scoring_type_code.fillna('null'), margins=True, margins_name="Total")

scoring_type_code,[unassigned],automatic,external,manual,Total
item_type_code_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
FillinBlankResponse,5,0,0,0,5
MultipleChoiceResponse,1550,0,0,0,1550
RubricResponse,11,0,0,0,11
aheAlgo,0,0,219,0,219
bucketing,0,799,0,0,799
choiceMatrix,0,1203,0,0,1203
cloze,0,3024,0,0,3024
equationEntry,0,7841,0,0,7841
essay,9,0,0,522,531
fileUpload,0,0,0,1,1


- unassigned
    - FillinBlankResponse
    - MultipleChoiceResponse
    - shortAnswer
- manual
  - aheAlg

###### Demo approach 1 to of weight of evidence calculation

In [28]:
df = pd.DataFrame(
    {'cat': ['a', 'b', 'a', 'b', 'a', 'a', 'b', 'c', 'c'],
     'target': [1, 0, 0, 1, 0, 0, 1, 1, 0]
     })

feature,target = 'cat','target'
df_woe_iv = (pd.crosstab(df[feature],df[target],
                      normalize='columns')
             .assign(woe=lambda dfx: np.log(dfx[1] / dfx[0]))
             .assign(iv=lambda dfx: np.sum(dfx['woe']*
                                           (dfx[1]-dfx[0]))))

In [29]:
df_woe_iv

target,0,1,woe,iv
cat,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
a,0.6,0.25,-0.875469,0.592458
b,0.2,0.5,0.916291,0.592458
c,0.2,0.25,0.223144,0.592458


###### Demo approach 2 of Weight of evidence calculation using existing library

In [30]:
df = pd.DataFrame({'cat': ['a', 'b', 'a', 'b', 'a', 'a', 'b', 'c', 'c'],
     'target': [1, 0, 0, 1, 0, 0, 1, 1, 0]
     })

spDf = spark.createDataFrame(df)

spDf.show()

+---+------+
|cat|target|
+---+------+
|  a|     1|
|  b|     0|
|  a|     0|
|  b|     1|
|  a|     0|
|  a|     0|
|  b|     1|
|  c|     1|
|  c|     0|
+---+------+



In [31]:
woe = WOE_IV(spDf, ['cat'], 'target', 1, 0)

In [32]:
woe.fit()

In [33]:
df_woe_iv

target,0,1,woe,iv
cat,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
a,0.6,0.25,-0.875469,0.592458
b,0.2,0.5,0.916291,0.592458
c,0.2,0.25,0.223144,0.592458


In [34]:
woe.transform(spDf).show()

+---+------+-------------------+
|cat|target|            cat_woe|
+---+------+-------------------+
|  a|     1|-0.8754687373538999|
|  b|     0| 0.9162907318741551|
|  a|     0|-0.8754687373538999|
|  b|     1| 0.9162907318741551|
|  a|     0|-0.8754687373538999|
|  a|     0|-0.8754687373538999|
|  b|     1| 0.9162907318741551|
|  c|     1|0.22314355131420976|
|  c|     0|0.22314355131420976|
+---+------+-------------------+



In [35]:
woe.compute_iv()


{'cat': 0.5924584552018219}

##### Actual weight of evidence calculation

###### Add target variable

In [3]:
targetDf = util.create_target_var_from_raw_score(filterDf)
targetDf.show()

+------------------------+------------------+------------------+---------------------------------+-----------------------+--------------------------+--------------------------------+---------------------+-------------------+--------------------------+-------------------------+-------------------------+-------------------------+-----------------------------+------------------------+---------------+--------------------------+----------------------+---------------------+----------------------+----------------------+--------------------------+----------------------+-------------------+-------------------+-----------------+---------+--------------------+-------------------------------+-----------------------------+--------------------+----------+------+----------+-------------+----------------------+-----------------------------+------------------------------+----------+--------------------------------+---------------------------+--------------------------+---------------------+------------

###### Calculate weight of evidence

In [37]:
util.add_weight_of_evidence(targetDf, 'item_type_code_name', 'target').show()

+------------------------+------------------+------------------+---------------------------------+-----------------------+--------------------------+--------------------------------+---------------------+-------------------+--------------------------+-------------------------+-------------------------+-------------------------+-----------------------------+------------------------+---------------+--------------------------+----------------------+---------------------+----------------------+----------------------+--------------------------+----------------------+-------------------+-------------------+-----------------+---------+--------------------+-------------------------------+-----------------------------+--------------------+----------+------+----------+-------------+----------------------+-----------------------------+------------------------------+----------+--------------------------------+---------------------------+--------------------------+---------------------+------------

###### Calculate smoothed weight of evidence

In [38]:
util.add_weight_of_evidence(targetDf, 'item_type_code_name', 'target', 24).show()

+------------------------+------------------+------------------+---------------------------------+-----------------------+--------------------------+--------------------------------+---------------------+-------------------+--------------------------+-------------------------+-------------------------+-------------------------+-----------------------------+------------------------+---------------+--------------------------+----------------------+---------------------+----------------------+----------------------+--------------------------+----------------------+-------------------+-------------------+-----------------+---------+--------------------+-------------------------------+-----------------------------+--------------------+----------+------+----------+-------------+----------------------+-----------------------------+------------------------------+----------+--------------------------------+---------------------------+--------------------------+---------------------+------------

In [39]:
targetDf.agg( F.avg( F.col('target') ).alias('mean') ).collect()[0]['mean']

0.7112033067028595

###### Smoothed weight of evidence test

###### Create data

In [40]:
# Data and test results from SAS Certification course Part 2, Lesson 3: Preparing the Input Variables
#
#vResults Should be
# +---+---+---+--------------------+
# |cat|  1|  0|                swoe|
# +---+---+---+--------------------+
# |  A|  7| 28| -1.3990550617101052|
# |  B|  0| 16|  -2.021151265806204|
# |  C| 11| 94|  -1.977829660040054|
# |  D| 21| 23|-0.49954730872828346|
# +---+---+---+--------------------+

df2 = pd.DataFrame(columns = ['cat', 'target'])

for m in range(28):
    df2 = df2.append({'cat' : 'A', 'target' : 0}, ignore_index = True)

for m in range(7):
    df2 = df2.append({'cat' : 'A', 'target' : 1}, ignore_index = True)

for m in range(16):
    df2 = df2.append({'cat' : 'B', 'target' : 0}, ignore_index = True)

for m in range(94):
    df2 = df2.append({'cat' : 'C', 'target' : 0}, ignore_index = True)

for m in range(11):
    df2 = df2.append({'cat' : 'C', 'target' : 1}, ignore_index = True)

for m in range(23):
    df2 = df2.append({'cat' : 'D', 'target' : 0}, ignore_index = True)

for m in range(21):
    df2 = df2.append({'cat' : 'D', 'target' : 1}, ignore_index = True)


pd.crosstab(df2.cat, df2.target)

target,0,1
cat,Unnamed: 1_level_1,Unnamed: 2_level_1
A,28,7
B,16,0
C,94,11
D,23,21


###### Create dataframe

In [41]:
testDf = spark.createDataFrame(df2)

In [42]:
# util.add_weight_of_evidence(testDf, 'cat', 'target', 24).distinct().show()

woe = WOE_IV(testDf, ['cat'], 'target', 1, 24)
woe.fit()
woe.transform(testDf).distinct().show()

+---+------+------------------+
|cat|target|           cat_woe|
+---+------+------------------+
|  A|     0|3.8557076750075017|
|  A|     1|3.8557076750075017|
|  D|     1|4.0226557522642405|
|  B|     0| 4.197825909502335|
|  C|     1| 3.325713219942633|
|  C|     0| 3.325713219942633|
|  D|     0|4.0226557522642405|
+---+------+------------------+



In [43]:
woe.rhol

0.195

###### Use pandas crosstabe to calculate swoe

In [44]:
util.add_swoe(testDf, 'target', 'cat', 24).distinct().show()

+---+------+--------------------+
|cat|target|            cat_swoe|
+---+------+--------------------+
|  B|     0|  -2.021151265806204|
|  D|     0|-0.49954730872828346|
|  D|     1|-0.49954730872828346|
|  C|     0|  -1.977829660040054|
|  C|     1|  -1.977829660040054|
|  A|     0| -1.3990550617101052|
|  A|     1| -1.3990550617101052|
+---+------+--------------------+



In [45]:
# Should be
# +---+---+---+--------------------+
# |cat|  1|  0|                swoe|
# +---+---+---+--------------------+
# |  A|  7| 28| -1.3990550617101052|
# |  B|  0| 16|  -2.021151265806204|
# |  C| 11| 94|  -1.977829660040054|
# |  D| 21| 23|-0.49954730872828346|
# +---+---+---+--------------------+

###### Test with real data

In [4]:
util.add_swoe(targetDf, 'target', 'item_type_code_name', 24).select('target', 'item_type_code_name', 'item_type_code_name_swoe').distinct().show()

+------+--------------------+------------------------+
|target| item_type_code_name|item_type_code_name_swoe|
+------+--------------------+------------------------+
|     0|            matching|      1.6484411234822027|
|     0|       equationEntry|      0.6998692812113955|
|     0|          selectText|      0.1944883042850973|
|     0|               essay|      2.4538814070726778|
|     0|MultipleChoiceRes...|      0.4954550053599087|
|     0|          numberLine|      0.6484141415912662|
|     1|       equationEntry|      0.6998692812113955|
|     1|             aheAlgo|     0.34187369559511316|
|     0|           bucketing|      0.5092389215192233|
|     1|           trueFalse|      2.1096355178584147|
|     1|      multipleChoice|      1.2219898236832512|
|     1|            matching|      1.6484411234822027|
|     0|         shortAnswer|      1.7673389419782077|
|     1| FillinBlankResponse|      0.6523843277995943|
|     0|           trueFalse|      2.1096355178584147|
|     1|  