# !!! DEPRECATION NOTICE !!!
The scripts contained in this notebook were meant to work with exported local feature contribution values from early versions of Driverless AI. These scripts may not be applicable to newer versions of local contribution files exported from Driverless AI.
***

## License 

Copyright 2019 H2O.ai team

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

# Format reason codes from Driverless AI

This example notebook reads in Shapley and k-LIME reason code data files downloaded from the MLI GUI and formats them for an Excel or Desktop analysis workflow.

#### Imports and global constants

In [1]:
# Import libraries
import multiprocessing
import pandas as pd
import numpy as np
import datatable as dt

# Global variables
NUM_PARTITIONS = multiprocessing.cpu_count() - 1
NUM_CORES = multiprocessing.cpu_count() - 1

#### Utility function to parallize actions across a dataframe

In [2]:
def parallelize_dataframe(df, func):
    """
    Compute function (func) on a Pandas dataframe (df) in parallel
    
    :param df: Input dataframe
    :func: Function to call in parallel
    :return: 
    """
    df_split = np.array_split(df, NUM_PARTITIONS)
    pool = multiprocessing.Pool(NUM_CORES)
    df = pd.concat(pool.map(func, df_split))
    pool.close()
    pool.join()
    return df

#### Read in Shapley reason codes (downloaded from MLI GUI)

In [3]:
# Look at original data
shapley = dt.fread('data/shapley.csv')
shapley

Unnamed: 0_level_0,0_CVTE:AGE.0,2_CVTE:LIMIT_BAL.0,4_CVTE:PAY_0.0,5_CVTE:PAY_2.0,6_CVTE:PAY_3.0,8_CVTE:PAY_5.0,12_BILL_AMT1,13_BILL_AMT2,19_LIMIT_BAL,21_PAY_0,…,27_PAY_AMT1,28_PAY_AMT2,29_PAY_AMT3,30_PAY_AMT4,bias
Unnamed: 0_level_1,▪▪▪▪▪▪▪▪,▪▪▪▪▪▪▪▪,▪▪▪▪▪▪▪▪,▪▪▪▪▪▪▪▪,▪▪▪▪▪▪▪▪,▪▪▪▪▪▪▪▪,▪▪▪▪▪▪▪▪,▪▪▪▪▪▪▪▪,▪▪▪▪▪▪▪▪,▪▪▪▪▪▪▪▪,…,▪▪▪▪▪▪▪▪,▪▪▪▪▪▪▪▪,▪▪▪▪▪▪▪▪,▪▪▪▪▪▪▪▪,▪▪▪▪▪▪▪▪
0,0.125763,0.107709,0.728396,0.147137,−0.127325,−0.0840966,0.108975,0.0229046,0.0992958,0.688592,…,0.0823605,0.0231967,0.157784,0.105548,−1.5067
1,−0.0797229,−0.0173814,0.065187,0.284838,−0.033062,−0.0301014,0.166342,−0.00763347,0.0708706,−0.0707393,…,0.214859,0.0981565,−0.0938981,−0.014702,−1.5067
2,−0.0986447,−0.00916571,−0.246197,0.00820618,−0.0593971,−0.0365887,−0.199944,−0.172448,0.106319,−0.168737,…,0.113145,0.253127,−0.13106,−0.0285687,−1.5067
3,0.113559,0.0717613,−0.242494,0.0382579,−0.0438202,−0.036579,−0.117394,0.108876,0.309207,−0.159504,…,0.127461,0.0264272,−0.090039,−0.109301,−1.5067
4,−0.0563555,−0.0193644,−0.0185628,−0.00557991,−0.102132,−0.0398685,0.0221133,−0.0663479,0.118359,−0.115901,…,0.138267,−0.372892,−0.128766,−0.0976108,−1.5067
5,0.105329,0.0536787,−0.228283,0.0119653,−0.0291124,−0.0211087,−0.102646,0.0145059,0.314049,−0.157391,…,0.179562,−0.0202132,0.0594024,−0.018746,−1.5067
6,−0.0825688,0.0401524,−0.18247,−0.0434143,−0.032737,−0.049051,0.368337,0.27116,−0.290624,−0.167167,…,−0.182792,−0.314513,−0.122729,0.0494329,−1.5067
7,0.102416,−0.036149,−0.252761,−0.0533035,−0.08321,−0.0107705,−0.113317,−0.134469,0.0168813,−0.162415,…,0.0469942,0.0092096,0.263542,−0.00285059,−1.5067
8,−0.0535641,0.00715208,−0.183782,0.00138597,0.605251,9.87028e-05,−0.0259589,−0.0400683,0.00790348,−0.141193,…,0.0631202,0.163502,0.0823208,−0.0379284,−1.5067
9,−0.0595466,0.0696821,−0.183448,−0.0506932,−0.0980245,−0.00508782,0.134332,−0.0999866,0.11591,−0.113923,…,0.111696,0.0199954,0.153728,−0.125614,−1.5067


#### Function to reformat downloaded Shapley reason code file

In [4]:
def format_rc_shapley(orig):
    """
    
    Format Shapley reason codes into format:
    
    rc_1_var_name | rc_1_contrib | rc_2_var_name | rc_2_contrib | ... | rc_p_var_name | rc_p_contrib

    :param orig: Original Shapley reason code file downloaded from MLI GUI.
    :return: Reformatted reason code frame.
    """
    names = ['name_' + str(i) for i in range(1, len(orig.columns))]
    contribs = ['contrib_' + str(i) for i in range(1, len(orig.columns))]
    columns_ = [elem for pair in zip(names, contribs) for elem in pair]
    bias = orig['bias'].values[0] # Should always be the same bias for Shapley

    data = []
    for row in range(0, orig.shape[0]):
        names = list(orig.iloc[row, :-1].sort_values(axis=0, ascending=False).index)
        contribs = list(orig.iloc[row, :-1].sort_values(axis=0, ascending=False))
        vals = [elem for pair in zip(names, contribs) for elem in pair]
        data.append(dict(zip(columns_, vals)))

    formatted_frame = pd.DataFrame(data, columns=columns_)
    formatted_frame['bias'] = bias

    return formatted_frame

#### Reformat Shapley reason code file

In [5]:
%time shapley_formatted_rc = parallelize_dataframe(shapley.to_pandas(), format_rc_shapley)
shapley_formatted_rc

CPU times: user 312 ms, sys: 208 ms, total: 520 ms
Wall time: 2 s


Unnamed: 0,name_1,contrib_1,name_2,contrib_2,name_3,contrib_3,name_4,contrib_4,name_5,contrib_5,...,contrib_11,name_12,contrib_12,name_13,contrib_13,name_14,contrib_14,name_15,contrib_15,bias
0,4_CVTE:PAY_0.0,0.728396,21_PAY_0,0.688592,29_PAY_AMT3,0.157784,22_PAY_2,0.156285,5_CVTE:PAY_2.0,0.147137,...,0.082361,28_PAY_AMT2,0.023197,13_BILL_AMT2,0.022905,8_CVTE:PAY_5.0,-0.084097,6_CVTE:PAY_3.0,-0.127325,-1.506704
1,5_CVTE:PAY_2.0,0.284838,22_PAY_2,0.246963,27_PAY_AMT1,0.214859,12_BILL_AMT1,0.166342,28_PAY_AMT2,0.098157,...,-0.030101,6_CVTE:PAY_3.0,-0.033062,21_PAY_0,-0.070739,0_CVTE:AGE.0,-0.079723,29_PAY_AMT3,-0.093898,-1.506704
2,28_PAY_AMT2,0.253127,27_PAY_AMT1,0.113145,19_LIMIT_BAL,0.106319,5_CVTE:PAY_2.0,0.008206,2_CVTE:LIMIT_BAL.0,-0.009166,...,-0.131060,21_PAY_0,-0.168737,13_BILL_AMT2,-0.172448,12_BILL_AMT1,-0.199944,4_CVTE:PAY_0.0,-0.246197,-1.506704
3,19_LIMIT_BAL,0.309207,27_PAY_AMT1,0.127461,0_CVTE:AGE.0,0.113559,13_BILL_AMT2,0.108876,2_CVTE:LIMIT_BAL.0,0.071761,...,-0.090039,30_PAY_AMT4,-0.109301,12_BILL_AMT1,-0.117394,21_PAY_0,-0.159504,4_CVTE:PAY_0.0,-0.242494,-1.506704
4,27_PAY_AMT1,0.138267,19_LIMIT_BAL,0.118359,12_BILL_AMT1,0.022113,5_CVTE:PAY_2.0,-0.005580,22_PAY_2,-0.007813,...,-0.097611,6_CVTE:PAY_3.0,-0.102132,21_PAY_0,-0.115901,29_PAY_AMT3,-0.128766,28_PAY_AMT2,-0.372892,-1.506704
5,19_LIMIT_BAL,0.314049,27_PAY_AMT1,0.179562,0_CVTE:AGE.0,0.105329,29_PAY_AMT3,0.059402,2_CVTE:LIMIT_BAL.0,0.053679,...,-0.021109,6_CVTE:PAY_3.0,-0.029112,12_BILL_AMT1,-0.102646,21_PAY_0,-0.157391,4_CVTE:PAY_0.0,-0.228283,-1.506704
6,12_BILL_AMT1,0.368337,13_BILL_AMT2,0.271160,30_PAY_AMT4,0.049433,2_CVTE:LIMIT_BAL.0,0.040152,22_PAY_2,-0.024015,...,-0.167167,4_CVTE:PAY_0.0,-0.182470,27_PAY_AMT1,-0.182792,19_LIMIT_BAL,-0.290624,28_PAY_AMT2,-0.314513,-1.506704
7,29_PAY_AMT3,0.263542,0_CVTE:AGE.0,0.102416,27_PAY_AMT1,0.046994,19_LIMIT_BAL,0.016881,28_PAY_AMT2,0.009210,...,-0.083210,12_BILL_AMT1,-0.113317,13_BILL_AMT2,-0.134469,21_PAY_0,-0.162415,4_CVTE:PAY_0.0,-0.252761,-1.506704
8,6_CVTE:PAY_3.0,0.605251,28_PAY_AMT2,0.163502,29_PAY_AMT3,0.082321,27_PAY_AMT1,0.063120,19_LIMIT_BAL,0.007903,...,-0.037928,13_BILL_AMT2,-0.040068,0_CVTE:AGE.0,-0.053564,21_PAY_0,-0.141193,4_CVTE:PAY_0.0,-0.183782,-1.506704
9,29_PAY_AMT3,0.153728,12_BILL_AMT1,0.134332,19_LIMIT_BAL,0.115910,27_PAY_AMT1,0.111696,2_CVTE:LIMIT_BAL.0,0.069682,...,-0.098024,13_BILL_AMT2,-0.099987,21_PAY_0,-0.113923,30_PAY_AMT4,-0.125614,4_CVTE:PAY_0.0,-0.183448,-1.506704


#### Read in LIME reason codes (downloaded from MLI GUI)

In [6]:
# Read in lime data
lime = dt.fread("data/klime_frame.csv")
lime

Unnamed: 0_level_0,predict_klime,cluster_klime,rc_PAY_3,rc_PAY_0,rc_PAY_2,rc_PAY_5,rc_PAY_AMT2,rc_BILL_AMT1,rc_AGE,rc_PAY_AMT1,…,rc_BILL_AMT2,rc_PAY_AMT3,model_pred,h2oframe_idx,pred_actual
Unnamed: 0_level_1,▪▪▪▪▪▪▪▪,▪,▪▪▪▪▪▪▪▪,▪▪▪▪▪▪▪▪,▪▪▪▪▪▪▪▪,▪▪▪▪▪▪▪▪,▪▪▪▪▪▪▪▪,▪▪▪▪▪▪▪▪,▪▪▪▪▪▪▪▪,▪▪▪▪▪▪▪▪,…,▪▪▪▪▪▪▪▪,▪▪▪▪▪▪▪▪,▪▪▪▪▪▪▪▪,▪▪▪▪,▪
0,0.656323,1,−0.0137582,0.352129,0.0488794,0.0205709,−0.000149512,−0.000219406,0.0167853,0,…,0.000544234,0,0.697584,0,1
1,0.181074,0,−0.0124922,−0.0257828,0.00494531,−0.0193726,−0.000234622,−8.65119e-05,0.0190327,0,…,0.000273639,−0.000190309,0.330322,1,1
2,0.128555,0,−0.0124922,−0.0940895,0.0076832,−0.0193726,−0.000351932,−0.000943148,0.0248889,−0.000687155,…,0.00222512,−0.000190309,0.100091,2,0
3,0.14442,0,−0.0124922,−0.0940895,0.0076832,−0.0193726,−0.000473701,−0.00151573,0.0270849,−0.000905343,…,0.00765125,−0.00022837,0.177475,3,0
4,0.205841,0,−0.017863,−0.0257828,0.0076832,−0.0193726,−0.00860615,−0.000277954,0.0417254,−0.000905343,…,0.000899438,−0.00190309,0.0945625,4,0
5,0.145196,0,−0.0124922,−0.0940895,0.0076832,−0.0193726,−0.000425838,−0.00207732,0.0270849,−0.00113168,…,0.00905292,−0.000125033,0.203448,5,0
6,0.0404454,0,−0.0124922,−0.0940895,0.0076832,−0.0193726,−0.00938486,−0.0118693,0.0212287,−0.0248969,…,0.0653597,−0.00723173,0.0936635,6,0
7,0.075818,0,−0.017863,−0.0940895,−0.0286855,−0.0193726,−0.000141008,−0.000383078,0.0168366,−0.000172015,…,6.02798e-05,-0,0.123857,7,0
8,0.194741,0,0.0693608,−0.0940895,0.0076832,−0.0193726,-0,−0.000364015,0.0204967,−0.00150694,…,0.00223606,−8.22134e-05,0.254229,8,0
9,0.151561,1,−0.0147773,−0.0544787,−0.0178864,−0.015101,-0,-0,0.0244786,-0,…,0,-0,0.157877,9,0


#### Function to reformat downloaded LIME reason code file

In [7]:
def format_rc_lime(orig):
    
    """
    
    Format LIME reason codes into format:
    
    rc_1_var_name | rc_1_contrib | rc_2_var_name | rc_2_contrib | ... | rc_p_var_name | rc_p_contrib | cluster_klime | predict_klime | model_pred | pred_actual
    
    where:
    
    cluster_klime = k-LIME cluster this row belongs to
    pred_klime = k-LIME prediction
    model_pred = Model prediction
    pred_actual = Actual value of target
    
    :param orig: Original LIME reason code file downloaded from MLI GUI.
    :return: Reformatted reason code frame.
    """
    
    names = ['name_' + str(i) for i in range(1, len(orig.columns[orig.columns.str.contains("rc_")]))]
    contribs = ['contrib_' + str(i) for i in range(1, len(orig.columns[orig.columns.str.contains("rc_")]))]
    columns_ = [elem for pair in zip(names, contribs) for elem in pair]
    cluster_klime = orig['cluster_klime'].values
    predict_klime = orig['predict_klime'].values
    model_pred = orig['model_pred'].values
    pred_actual = orig['pred_actual'].values

    data = []
    for row in range(0, orig.shape[0]):
        names = list(orig[orig.columns[orig.columns.str.contains("rc_")]].iloc[row].sort_values(axis=0,
                                                                                                     ascending=False).index)
        names = [name[3:] for name in names] 
        contribs = list(
            orig[orig.columns[orig.columns.str.contains("rc_")]].iloc[row].sort_values(axis=0, ascending=False))
        vals = [elem for pair in zip(names, contribs) for elem in pair]
        data.append(dict(zip(columns_, vals)))

    formatted_frame = pd.DataFrame(data, columns=columns_)
    formatted_frame['cluster_klime'] = cluster_klime
    formatted_frame['predict_klime'] = predict_klime
    formatted_frame['model_pred'] = model_pred
    formatted_frame['pred_actual'] = pred_actual

    return formatted_frame

#### Reformat LIME reason code file

In [8]:
%time klime_formatted_rc = parallelize_dataframe(lime.to_pandas(), format_rc_lime)
klime_formatted_rc

CPU times: user 376 ms, sys: 136 ms, total: 512 ms
Wall time: 3.62 s


Unnamed: 0,name_1,contrib_1,name_2,contrib_2,name_3,contrib_3,name_4,contrib_4,name_5,contrib_5,...,name_9,contrib_9,name_10,contrib_10,name_11,contrib_11,cluster_klime,predict_klime,model_pred,pred_actual
0,PAY_0,0.352129,PAY_2,0.048879,PAY_5,0.020571,AGE,1.678531e-02,BILL_AMT2,0.000544,...,PAY_AMT2,-0.000150,BILL_AMT1,-0.000219,LIMIT_BAL,-0.004386,1,0.656323,0.697584,1
1,AGE,0.019033,PAY_2,0.004945,BILL_AMT2,0.000274,PAY_AMT1,0.000000e+00,BILL_AMT1,-0.000087,...,PAY_3,-0.012492,PAY_5,-0.019373,PAY_0,-0.025783,0,0.181074,0.330322,1
2,AGE,0.024889,PAY_2,0.007683,BILL_AMT2,0.002225,PAY_AMT4,-1.055745e-04,PAY_AMT3,-0.000190,...,PAY_3,-0.012492,PAY_5,-0.019373,LIMIT_BAL,-0.020709,0,0.128555,0.100091,0
3,AGE,0.027085,PAY_2,0.007683,BILL_AMT2,0.007651,PAY_AMT4,-1.161319e-04,PAY_AMT3,-0.000228,...,LIMIT_BAL,-0.011505,PAY_3,-0.012492,PAY_5,-0.019373,0,0.144420,0.177475,0
4,AGE,0.041725,PAY_2,0.007683,BILL_AMT2,0.000899,BILL_AMT1,-2.779543e-04,PAY_AMT1,-0.000905,...,LIMIT_BAL,-0.011505,PAY_3,-0.017863,PAY_5,-0.019373,0,0.205841,0.094563,0
5,AGE,0.027085,BILL_AMT2,0.009053,PAY_2,0.007683,PAY_AMT4,-1.055745e-04,PAY_AMT3,-0.000125,...,LIMIT_BAL,-0.011505,PAY_3,-0.012492,PAY_5,-0.019373,0,0.145196,0.203448,0
6,BILL_AMT2,0.065360,AGE,0.021229,PAY_2,0.007683,PAY_AMT4,-2.136722e-03,PAY_AMT3,-0.007232,...,PAY_5,-0.019373,PAY_AMT1,-0.024897,PAY_0,-0.094089,0,0.040445,0.093663,0
7,AGE,0.016837,BILL_AMT2,0.000060,PAY_AMT3,-0.000000,PAY_AMT4,-6.133878e-05,PAY_AMT2,-0.000141,...,PAY_5,-0.019373,LIMIT_BAL,-0.023010,PAY_2,-0.028685,0,0.075818,0.123857,0
8,PAY_3,0.069361,AGE,0.020497,PAY_2,0.007683,BILL_AMT2,2.236064e-03,PAY_AMT2,-0.000000,...,PAY_AMT1,-0.001507,PAY_5,-0.019373,LIMIT_BAL,-0.032215,0,0.194741,0.254229,0
9,AGE,0.024479,PAY_AMT3,-0.000000,BILL_AMT2,0.000000,PAY_AMT1,-0.000000e+00,BILL_AMT1,-0.000000,...,PAY_3,-0.014777,PAY_5,-0.015101,PAY_2,-0.017886,1,0.151561,0.157877,0
