In [1]:
import cohortintgrad as csig
import pandas as pd
import numpy as np
import shap

#### Data load

In [2]:
from sklearn.datasets import fetch_california_housing
# Fetch the California housing dataset; the returned object is indexed below
# by the keys 'data' and 'target' (and by 'feature_names' in the next cell).
data = fetch_california_housing()

# Work with the first N_ROWS samples only.
N_ROWS = 10_000
x = data['data'][:N_ROWS]
y = data['target'][:N_ROWS]

In [3]:
# Summary statistics of the features and of the target, shown side by side.
feature_summary = pd.DataFrame(x, columns=data['feature_names']).describe()
target_summary = pd.DataFrame(y, columns=['Target']).describe()
pd.concat([feature_summary, target_summary], axis=1)

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,Target
count,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0
mean,3.71804,31.8471,5.212102,1.090389,1395.5887,3.061855,35.49382,-119.472328,2.04949
std,1.916912,11.821967,2.752832,0.547035,1090.838717,6.098183,1.959545,1.808913,1.16595
min,0.4999,1.0,0.846154,0.5,3.0,0.75,32.67,-124.35,0.14999
25%,2.4119,23.0,4.253385,1.007078,779.75,2.45283,34.01,-121.59,1.17975
50%,3.3289,33.0,5.031476,1.049645,1137.5,2.851168,34.17,-118.41,1.766
75%,4.544825,40.0,5.830935,1.097466,1687.0,3.373184,37.63,-118.21,2.58025
max,15.0001,52.0,141.909091,34.066667,28566.0,599.714286,41.95,-114.55,5.00001


#### Construct a Linear Model

In [4]:
from sklearn.model_selection import train_test_split
# Hold out a fraction of the rows as the test set; the fixed seed makes the
# split reproducible across runs.
TEST_FRACTION = 0.2
SPLIT_SEED = 1018
train_x, test_x, train_y, test_y = train_test_split(
    x,
    y,
    test_size=TEST_FRACTION,
    random_state=SPLIT_SEED,
)

In [5]:
# Row index (into test_x / test_y) of the single sample that the following
# cells explain; the particular value is an arbitrary pick.
data_id = 1018

In [6]:
from sklearn.linear_model import LinearRegression
# Fit an ordinary linear regression on the training split
# (fit() returns the fitted estimator, so construction and fitting are chained).
lr = LinearRegression().fit(train_x, train_y)

# Show the learned parameters: one coefficient per feature, plus the intercept.
print(f'coefficient of the model = {lr.coef_}')
print(f'intercept of the model = {lr.intercept_}')

coefficient of the model = [ 4.24955434e-01  6.18867650e-03 -9.62249735e-02  5.35832074e-01
  2.29219637e-05 -2.57747854e-01 -4.52738626e-01 -3.76962637e-01]
intercept of the model = -28.034035536407274


In [7]:
# Sanity check for the chosen sample: the model prediction, the annotated
# (ground-truth) target, and the linear formula coef * x + intercept evaluated
# by hand. The first and the last printed numbers should agree.
print(f'Predicted value for Data id={data_id}: {lr.predict(test_x[data_id].reshape(1,-1)).item()}')
print(f'Annotated value = {test_y[data_id]}')
print(np.sum(lr.coef_ * test_x[data_id]) + lr.intercept_)

Preficted value for Data id=1018: 2.16016976283116
Annotated value = 3.611
2.1601697628311634


#### Usual Kernel SHAP

In [8]:
# Background for Kernel SHAP: a single reference point, the feature-wise mean
# of the test set, reshaped to one row.
background = np.average(test_x, axis=0)
explainer = shap.KernelExplainer(lr.predict, data=background.reshape(1, test_x.shape[1]))

# `nsamples` is the evaluation-budget keyword of KernelExplainer.shap_values.
# (`max_evals` belongs to the newer `Explainer.__call__` API and is not a
# documented argument of this method.) A budget of 2**(number of features)
# is enough to cover every feature coalition.
sv = explainer.shap_values(test_x[data_id], nsamples=2**x.shape[1], silent=False)
print(f'Kernel Shapley value for Data id={data_id}: {sv}')

# Reference values for comparison: coefficient times the sample's deviation
# from the background mean, feature by feature.
print(f'Linear coefficient times the data values comparing to the average data: {lr.coef_ * (test_x[data_id] - background)}')

Kernel Shapley value for Data id=1018: [ 0.46804745 -0.02325395 -0.12718621  0.00272706 -0.01220642  0.15034828
 -1.30177295  1.06308176]
Linear coefficient times the data values comparing to the average data: [ 0.46804745 -0.02325395 -0.12718621  0.00272706 -0.01220642  0.15034828
 -1.30177295  1.06308176]


#### IGCS for annotated value

In [9]:
# Build the explainer from the test features, using the annotated targets
# (test_y) as the value to be explained.
IG = csig.CohortIntGrad(test_x, test_y)

# Compute the IGCS attribution for the single sample `data_id`.
# NOTE(review): the original comment also mentioned a "Remaining Delta"
# (residue of the efficiency axiom), but only one value is bound here --
# confirm against the cohortintgrad API what igcs_single returns.
ig = IG.igcs_single(t_id=data_id)

# Move the result to the CPU and convert it to NumPy for printing.
ig_numpy = ig.to("cpu").detach().numpy()
print(f'IGCS for annotated y for Data id={data_id}: {ig_numpy}')

IGCS for annotated y for Data id=1018: [ 2.3998897e-01  2.5853822e-02 -1.3655618e-04  7.6993988e-03
 -3.1481896e-02  3.7488531e-04  2.2337019e-02  1.3618467e-02]


#### IGCS for predicted value

In [10]:
# Here the model predictions on the test set, rather than the annotated
# targets, are the value to be explained.
pred = lr.predict(test_x)
IG_pred = csig.CohortIntGrad(test_x, pred)

# IGCS for all data: every row index of test_x is passed in one stacked call.
# The second returned value is not used in this notebook.
every_row = list(range(test_x.shape[0]))
ig_pred, _ = IG_pred.igcs_stack(stack_target=every_row)

100%|██████████| 2000/2000 [00:25<00:00, 77.44it/s]


In [11]:
# Shape of the stacked attributions: (number of explained samples, number of
# features), i.e. the same shape as the test_x array that was explained.
ig_pred.shape

torch.Size([2000, 8])

In [12]:
# Take the row of the chosen sample out of the stacked result and convert it
# to NumPy for printing.
ig_pred_row = ig_pred[data_id].to("cpu").detach().numpy()
print(f'IGCS for predicted y for Data id={data_id}: {ig_pred_row}')

IGCS for predicted y for Data id=1018: [ 0.25093773 -0.02425705  0.00229068 -0.00126117  0.0124348   0.03522481
  0.04133091  0.04580621]


#### Cohort Shapley for predicted value

In [16]:
# Cohort Shapley for the same sample, computed on IG_pred (the instance built
# from the model predictions), hence the "predicted y" label below.
# Author's note: when the number of features is very small and n_step is
# large, CS can be faster than IGCS.
ks = IG_pred.cohort_kernel_shap(t_id=data_id)
print(f'CS for predicted y for Data id={data_id}: {ks}')

CS for annotated y for Data id=1018: [ 0.26491452 -0.01957971  0.00210659 -0.00432399 -0.00153431  0.02176702
  0.04633841  0.05284106]
