In [1]:
import pandas as pd
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

from cohortowen import cohortowen as co
from cohortowen.similarity import generate_similarity_in_distance
from cohortowen.dataset import titanic

In [2]:
# Load the Titanic data through cohortowen's dataset helper. `categorical` is used
# below as a 0/1 mask over X's columns (1 = one-hot encode, 0 = standardise).
X, Y, categorical = titanic()

# subsample < 1 fits each boosting stage on a random subset of rows, so the fit is
# stochastic. Fix the seed so Restart & Run All reproduces the same model and the
# same predictions that every later cell explains.
# NOTE(review): outputs saved from an earlier, unseeded run may differ slightly.
RANDOM_STATE = 0
model = GradientBoostingClassifier(
    n_estimators=10000,
    max_depth=4,
    learning_rate=0.1,
    subsample=0.75,
    random_state=RANDOM_STATE,
)

# Standardise the continuous columns and one-hot encode the categorical ones;
# anything not selected by either mask is dropped.
ct = ColumnTransformer(
    [
        ("cont", StandardScaler(), X.columns[categorical == 0]),
        (
            "cat",
            OneHotEncoder(handle_unknown='infrequent_if_exist', sparse_output=False),
            X.columns[categorical == 1],
        ),
    ],
    remainder='drop',
    verbose_feature_names_out=True,
    verbose=True,
)

pipe = Pipeline([('col', ct), ('classifier', model)])
pipe.fit(X, Y.flatten())

# In-sample predicted probabilities; these are the values the cohort explainers
# attribute to features further down.
Yhat = pipe.predict_proba(X)

# Transformed design matrix with the transformer-prefixed feature names
# (e.g. 'cont__age', 'cat__sex_0') as column labels.
Z = pd.DataFrame(
    pipe['col'].transform(X),
    columns=pipe['col'].get_feature_names_out(),
)


[ColumnTransformer] .......... (1 of 2) Processing cont, total=   0.0s
[ColumnTransformer] ........... (2 of 2) Processing cat, total=   0.0s


In [3]:
# Show the transformed feature names; the positional union structure defined
# later in the notebook indexes into exactly this ordering.
transformed_feature_names = Z.columns
print(transformed_feature_names)

Index(['cont__age', 'cont__sibsp', 'cont__parch', 'cont__fare',
       'cat__pclass_1', 'cat__pclass_2', 'cat__pclass_3', 'cat__sex_0',
       'cat__sex_1'],
      dtype='object')


In [5]:
# Three attributions for the same two subjects (rows 0 and 1). Every explainer is
# given the pipeline's predictions directly (model=None, y=Yhat); the run output
# confirms this with "use given y values instead of model prediction."
SUBJECT_INDICES = (0, 1)


def _attribution_frame(explainer, feature_names):
    """Run `explainer` and return its `shapley_values` as a labelled DataFrame.

    Parameters
    ----------
    explainer : object exposing `compute_cohort_shapley()` and `shapley_values`
        (here a `co.CohortShapley` or `co.CohortOwen` instance).
    feature_names : column labels to assign, one per attributed feature.

    Returns
    -------
    pandas.DataFrame with `feature_names` as its columns.
    """
    # The result is read from the attribute afterwards; the method's return value
    # is not used.
    explainer.compute_cohort_shapley()
    frame = pd.DataFrame(explainer.shapley_values)
    frame.columns = feature_names
    return frame


# Cohort Shapley on the original features
cs1 = co.CohortShapley(
    model=None,
    similarity=generate_similarity_in_distance,
    subject_indices=list(SUBJECT_INDICES),
    data=X.values,
    y=Yhat,
)
vals1 = _attribution_frame(cs1, X.columns)

# Cohort Shapley on the transformed features (each one-hot column is its own player)
cs2 = co.CohortShapley(
    model=None,
    similarity=generate_similarity_in_distance,
    subject_indices=list(SUBJECT_INDICES),
    data=Z.values,
    y=Yhat,
)
vals2 = _attribution_frame(cs2, Z.columns)

# Cohort Owen on the transformed features: column positions of Z grouped so that the
# one-hot columns belonging to one original feature form a single union.
union_structure = [[0], [1], [2], [3], [4, 5, 6], [7, 8]]

# The indices are hard-coded, so fail loudly if they stop matching Z's columns
# (e.g. after an encoder or dataset change) instead of silently mis-grouping.
assert sorted(pos for union in union_structure for pos in union) == list(range(Z.shape[1])), \
    "union_structure must cover every column of Z exactly once"

co1 = co.CohortOwen(
    union_structure=union_structure,
    model=None,
    similarity=generate_similarity_in_distance,
    subject_indices=list(SUBJECT_INDICES),
    data=Z.values,
    y=Yhat,
)
vals3 = _attribution_frame(co1, Z.columns)

print('Union Structure:')
for union in union_structure:
    print(Z.columns[union].values)


use given y values instead of model prediction.
use given y values instead of model prediction.
use given y values instead of model prediction.
Union Structure:
['cont__age']
['cont__sibsp']
['cont__parch']
['cont__fare']
['cat__pclass_1' 'cat__pclass_2' 'cat__pclass_3']
['cat__sex_0' 'cat__sex_1']


In [6]:
# Put the original-feature attributions in the same order as the unions
# (age, sibsp, parch, fare, pclass, sex) so the tables line up position by position.
comparison_order = ['age', 'sibsp', 'parch', 'fare', 'pclass', 'sex']
vals1 = vals1[comparison_order]

# Original-feature Shapley, transformed-feature Shapley, transformed-feature Owen,
# each rounded to two decimals.
for attribution_table in (vals1, vals2, vals3):
    print(attribution_table.round(2))

    age  sibsp  parch  fare  pclass   sex
0  0.02  -0.00   0.00  0.04    0.01 -0.07
1 -0.03   0.04   0.01  0.22    0.03  0.22
   cont__age  cont__sibsp  cont__parch  cont__fare  cat__pclass_1  \
0       0.01        -0.00         0.01        0.01           0.01   
1       0.02         0.06         0.02        0.23           0.01   

   cat__pclass_2  cat__pclass_3  cat__sex_0  cat__sex_1  
0           -0.0           0.00       -0.02       -0.02  
1            0.0           0.01        0.07        0.07  
   cont__age  cont__sibsp  cont__parch  cont__fare  cat__pclass_1  \
0       0.02        -0.00         0.00        0.04           0.00   
1      -0.03         0.04         0.01        0.22           0.02   

   cat__pclass_2  cat__pclass_3  cat__sex_0  cat__sex_1  
0           -0.0           0.00       -0.03       -0.03  
1            0.0           0.01        0.11        0.11  


In [7]:
# Subject 0: for every union print the original feature name, then three numbers:
#   1. Cohort Shapley value of the original feature,
#   2. Cohort Shapley values of the transformed columns, summed over the union,
#   3. Cohort Owen values of the transformed columns, summed over the union.
subject_row = 0
for position, members in enumerate(union_structure):
    print(vals1.columns.values[position])
    original_value = vals1.iloc[subject_row, position]
    print(round(original_value, 2))
    shapley_union_total = vals2.iloc[subject_row, members].sum()
    print(round(shapley_union_total, 2))
    owen_union_total = vals3.iloc[subject_row, members].sum()
    print(round(owen_union_total, 2))

age
0.02
0.01
0.02
sibsp
-0.0
-0.0
-0.0
parch
0.0
0.01
0.0
fare
0.04
0.01
0.04
pclass
0.01
0.01
0.01
sex
-0.07
-0.03
-0.07


In [8]:
# Subject 1: for every union print the original feature name, then three numbers:
#   1. Cohort Shapley value of the original feature,
#   2. Cohort Shapley values of the transformed columns, summed over the union,
#   3. Cohort Owen values of the transformed columns, summed over the union.
subject_row = 1
for position, members in enumerate(union_structure):
    print(vals1.columns.values[position])
    original_value = vals1.iloc[subject_row, position]
    print(round(original_value, 2))
    shapley_union_total = vals2.iloc[subject_row, members].sum()
    print(round(shapley_union_total, 2))
    owen_union_total = vals3.iloc[subject_row, members].sum()
    print(round(owen_union_total, 2))

age
-0.03
0.02
-0.03
sibsp
0.04
0.06
0.04
parch
0.01
0.02
0.01
fare
0.22
0.23
0.22
pclass
0.03
0.03
0.03
sex
0.22
0.13
0.22
