In [1]:

import numpy as np
import pandas as pd
import time
import warnings

from sklearn.metrics import roc_auc_score, accuracy_score, f1_score, confusion_matrix
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import KBinsDiscretizer, StandardScaler, MinMaxScaler, Normalizer, OrdinalEncoder
from sklearn.naive_bayes import CategoricalNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.dummy import DummyClassifier

from tableshift import get_dataset
from tableshift.core.features import PreprocessorConfig
from tableshift.core.tasks import get_task_config
from ud_bagging import UDBaggingClassifier, balanced_weight_vector
from ud_naive_bayes import InterpretableBernoulliNB, InterpretableMultinomialNB, InterpretableCategoricalNB


from util import *

drop_columns = {
    'anes': ['VCF0901b'],
    'acsfoodstamps': ['ST'],
    'acsincome': ['ST']
}

data = [
    [ 'ASSISTments',             'assistments'             ],
    [ 'Childhood Lead',          'nhanes_lead'             ],
    [ 'College Scorecard',       'college_scorecard'       ], 
    [ 'Diabetes',                'brfss_diabetes'          ],
    [ 'FICO HELOC',              'heloc'                   ],
    [ 'Food Stamps',             'acsfoodstamps'           ],
    [ 'Hospital Readmission',    'diabetes_readmission'    ],    
    [ 'Hypertension',            'brfss_blood_pressure'    ],    
    #[ 'ICU Length of Stay'       'mimic_extract_los_3'     ],    
    #[ 'ICU Mortality',           'mimic_extract_mort_hosp' ],        
    [ 'Income',                  'acsincome'               ],
    #[ 'Public Health Insurance', 'acspubcov'               ],
    [ 'Sepsis',                  'physionet'               ],
    [ 'Unemployment',            'acsunemployment'         ],
    [ 'Voting',                  'anes'                    ]
    ]

df_result = {
    'acc': create_table(np.array(data)[:,1]),
    'f-1': create_table(np.array(data)[:,1])}

df_result['acc'].iloc[:,0] = np.array(data)[:,0]
df_result['f-1'].iloc[:,0] = np.array(data)[:,0]

cols = df_result['acc'].columns

dataset,identifier = ['Hospital Readmission','diabetes_readmission']

dset = get_dataset(
    name=identifier,
    initialize_data=False,
    use_cached=True
)

X_a, y_a, _, _ = dset.get_pandas('train')
X_id, y_id, _, _ = dset.get_pandas('id_test')
X_b, y_b, _, _ = dset.get_pandas('ood_test')

drop=False
fillna = False
sqrt_=int(np.sqrt(X_a.shape[1]))
log2_=int(np.log2(X_a.shape[1]))

balanced = True

X_a.reset_index(drop=True, inplace=True)
X_id.reset_index(drop=True, inplace=True)
X_b.reset_index(drop=True, inplace=True)

y_a.reset_index(drop=True, inplace=True)
y_id.reset_index(drop=True, inplace=True)
y_b.reset_index(drop=True, inplace=True)

if drop and identifier in drop_columns.keys():
    X_a.drop(drop_columns[identifier], inplace=True, axis=1, errors='ignore')
    X_id.drop(drop_columns[identifier], inplace=True, axis=1, errors='ignore')
    X_b.drop(drop_columns[identifier], inplace=True, axis=1, errors='ignore')

cat_dict = {
    f.name: f.value_mapping
    for f in get_task_config(identifier).feature_list.features 
    if f.kind.__name__ == 'CategoricalDtype' 
    and not f.is_target 
    and f.name in X_a.columns
}

cat_hidden = {
    f.name
    for f in get_task_config(identifier).feature_list.features 
    if f.kind.__name__ != 'CategoricalDtype' 
    and not f.is_target 
    and f.value_mapping 
    and f.name in X_a.columns
}

cat_feats = set(cat_dict.keys())
obj_feats = set(X_a.columns[X_a.dtypes == 'object']) - cat_feats
num_feats = set(X_a.columns) - obj_feats - cat_feats - cat_hidden

for feat in num_feats:
    try:
        out,bins = pd.qcut(X_a[feat],5,retbins=True,duplicates='drop')
        X_a[feat] = out
        X_id[feat] = pd.cut(X_id[feat],bins)
        X_b[feat] = pd.cut(X_b[feat],bins)
    except:
        print(feat)

for feat in cat_feats:
    if cat_dict[feat]:
        try:
            cat_type = pd.CategoricalDtype(categories={v: float(k) for k,v in cat_dict[feat].items()})
        except:
            cat_type = pd.CategoricalDtype(categories={k: v for k,v in cat_dict[feat].items()})
    else:
        cat_type = X_a[feat].astype(str).astype('category').dtype
    X_a[feat] = X_a[feat].astype(cat_type)
    X_id[feat] = X_id[feat].astype(cat_type)
    X_b[feat] = X_b[feat].astype(cat_type)

# Verify if all features are categories
verify = X_a.columns[X_a.dtypes != 'category']
if len(verify) > 0:
    for feat in verify:
        try:
            out,bins = pd.qcut(X_a[feat],5,retbins=True,duplicates='drop')
            X_a[feat] = out
            X_id[feat] = pd.cut(X_id[feat],bins)
            X_b[feat] = pd.cut(X_b[feat],bins)
        except:
            print(feat)


In [2]:

X_a = X_a.apply(lambda Xj: Xj.cat.codes)
X_id = X_id.apply(lambda Xj: Xj.cat.codes)
X_b = X_b.apply(lambda Xj: Xj.cat.codes)


In [3]:

model = UDBaggingClassifier(
    estimator=InterpretableCategoricalNB(
        min_categories=2, 
        alpha=1e-10, 
        force_alpha=True
    ),
    verbose=0,
    n_estimators=500, max_features=sqrt_, 
    n_jobs=1, random_state=2
)

if balanced:
    sample_weight=balanced_weight_vector(y_a)
else:
    sample_weight=None

model.fit(X_a.values, y_a.values, sample_weight=sample_weight)

y_hat_id = model.predict(X_id.values)
acc_id = accuracy_score(y_id.values, y_hat_id)
f1_id = f1_score(y_id.values, y_hat_id)
f1_ood = None

df_result['acc'].loc[identifier,cols[1]] = acc_id
df_result['f-1'].loc[identifier,cols[1]] = f1_id

y_hat_b = model.predict(X_b.values)

acc_ood = accuracy_score(y_b.values, y_hat_b)
f1_ood = f1_score(y_b.values, y_hat_b)

df_result['acc'].loc[identifier,cols[2]] = acc_ood
df_result['f-1'].loc[identifier,cols[2]] = f1_ood


In [4]:
mssf_b = model.sufficiency_based_feature_importances(X_b.values)

In [5]:
mssf_b

array([0.00059143, 0.00066936, 0.03686262, 0.02896756, 0.02339971,
       0.04173934, 0.03555199, 0.03867279, 0.04046682, 0.04056893,
       0.02839015, 0.05291466, 0.00092993, 0.00106483, 0.03228752,
       0.04830259, 0.03842892, 0.05352635, 0.04212519, 0.01630744,
       0.02427614, 0.03265881, 0.0079149 , 0.01130695, 0.00476952,
       0.00821202, 0.00171018, 0.01779479, 0.01533159, 0.00759551,
       0.01670381, 0.0228021 , 0.00700467, 0.00702443, 0.00254166,
       0.00846683, 0.00175452, 0.00311284, 0.04785347, 0.00911868,
       0.0032886 , 0.00122142, 0.00182876, 0.00413538, 0.05496418,
       0.07484013])

In [None]:

if f1_ood > 0:
    
    dbcp_a = model._dbcp(X_id.values)
    dbcp_b = model._dbcp(X_b.values)
    
    mssf_a = model.sufficiency_based_feature_importances(X_id.values)
    mssf_b = model.sufficiency_based_feature_importances(X_b.values)
    
    strategies = [dbcp_b, dbcp_b-dbcp_a, mssf_b, mssf_b-mssf_a]
    
    for i,s in enumerate(strategies):

        min, max = s.min(), s.max()
        w = (s-min)/(max-min)
        p = w/w.sum()
        model_adapted_brs = UDBaggingClassifier(
            estimator=InterpretableCategoricalNB(
                min_categories=2, 
                alpha=1e-10, 
                force_alpha=True
            ),
            n_estimators=500, max_features=sqrt_, 
            n_jobs=-1, random_state=2, 
            biased_subspaces=True, feature_bias=p
        )
        
        model_adapted_brs.fit(X_a.values, y_a, sample_weight=sample_weight)
        y_hat = model_adapted_brs.predict(X_b.values)
        acc_oodi = accuracy_score(y_b.values, y_hat)
        f1_oodi = f1_score(y_b.values, y_hat)
        
        df_result['acc'].loc[identifier,cols[i+3]] = acc_oodi
        df_result['f-1'].loc[identifier,cols[i+3]] = f1_oodi
        


In [None]:
np.ones((5)).sum(axis=1)

In [128]:
df_result['acc']

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Strategy,Strategy,Strategy,Strategy
Unnamed: 0_level_1,Dataset,ID,OOD,1,2,3,4
assistments,ASSISTments,,,,,,
nhanes_lead,Childhood Lead,,,,,,
college_scorecard,College Scorecard,,,,,,
brfss_diabetes,Diabetes,,,,,,
heloc,FICO HELOC,,,,,,
acsfoodstamps,Food Stamps,,,,,,
diabetes_readmission,Hospital Readmission,,,,,,
brfss_blood_pressure,Hypertension,,,,,,
acsincome,Income,,,,,,
physionet,Sepsis,,,,,,


In [129]:
df_result['f-1']

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Strategy,Strategy,Strategy,Strategy
Unnamed: 0_level_1,Dataset,ID,OOD,1,2,3,4
assistments,ASSISTments,,,,,,
nhanes_lead,Childhood Lead,,,,,,
college_scorecard,College Scorecard,,,,,,
brfss_diabetes,Diabetes,,,,,,
heloc,FICO HELOC,,,,,,
acsfoodstamps,Food Stamps,,,,,,
diabetes_readmission,Hospital Readmission,,,,,,
brfss_blood_pressure,Hypertension,,,,,,
acsincome,Income,,,,,,
physionet,Sepsis,,,,,,
