In [1]:
#importing libraries
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder, MinMaxScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn import preprocessing
from sksurv.metrics import concordance_index_censored
from sksurv.svm import FastSurvivalSVM
import lifelines
from matplotlib import pyplot as plt
from scipy import stats
from sklearn_pandas import DataFrameMapper
# from sklearn_pandas import CategoricalImputer
import lime
import lime.lime_tabular
import kendall_w as kw
from numpy.random import default_rng

In [2]:
#setting seed
random_state = 20

In [3]:
#importing data
data = pd.read_table('../../data/brca_metabric_clinical_data.tsv')
data.head()

Unnamed: 0,Study ID,Patient ID,Sample ID,Age at Diagnosis,Type of Breast Surgery,Cancer Type,Cancer Type Detailed,Cellularity,Chemotherapy,Pam50 + Claudin-low subtype,...,Radio Therapy,Relapse Free Status (Months),Relapse Free Status,Number of Samples Per Patient,Sample Type,Sex,3-Gene classifier subtype,Tumor Size,Tumor Stage,Patient's Vital Status
0,brca_metabric,MB-0000,MB-0000,75.65,MASTECTOMY,Breast Cancer,Breast Invasive Ductal Carcinoma,,NO,claudin-low,...,YES,138.65,0:Not Recurred,1,Primary,Female,ER-/HER2-,22.0,2.0,Living
1,brca_metabric,MB-0002,MB-0002,43.19,BREAST CONSERVING,Breast Cancer,Breast Invasive Ductal Carcinoma,High,NO,LumA,...,YES,83.52,0:Not Recurred,1,Primary,Female,ER+/HER2- High Prolif,10.0,1.0,Living
2,brca_metabric,MB-0005,MB-0005,48.87,MASTECTOMY,Breast Cancer,Breast Invasive Ductal Carcinoma,High,YES,LumB,...,NO,151.28,1:Recurred,1,Primary,Female,,15.0,2.0,Died of Disease
3,brca_metabric,MB-0006,MB-0006,47.68,MASTECTOMY,Breast Cancer,Breast Mixed Ductal and Lobular Carcinoma,Moderate,YES,LumB,...,YES,162.76,0:Not Recurred,1,Primary,Female,,25.0,2.0,Living
4,brca_metabric,MB-0008,MB-0008,76.97,MASTECTOMY,Breast Cancer,Breast Mixed Ductal and Lobular Carcinoma,High,YES,LumB,...,YES,18.55,1:Recurred,1,Primary,Female,ER+/HER2- High Prolif,40.0,2.0,Died of Disease


In [4]:
data.shape

(2509, 38)

In [5]:
data.columns

Index(['Study ID', 'Patient ID', 'Sample ID', 'Age at Diagnosis',
       'Type of Breast Surgery', 'Cancer Type', 'Cancer Type Detailed',
       'Cellularity', 'Chemotherapy', 'Pam50 + Claudin-low subtype', 'Cohort',
       'ER status measured by IHC', 'ER Status', 'Neoplasm Histologic Grade',
       'HER2 status measured by SNP6', 'HER2 Status',
       'Tumor Other Histologic Subtype', 'Hormone Therapy',
       'Inferred Menopausal State', 'Integrative Cluster',
       'Primary Tumor Laterality', 'Lymph nodes examined positive',
       'Mutation Count', 'Nottingham prognostic index', 'Oncotree Code',
       'Overall Survival (Months)', 'Overall Survival Status', 'PR Status',
       'Radio Therapy', 'Relapse Free Status (Months)', 'Relapse Free Status',
       'Number of Samples Per Patient', 'Sample Type', 'Sex',
       '3-Gene classifier subtype', 'Tumor Size', 'Tumor Stage',
       'Patient's Vital Status'],
      dtype='object')

In [6]:
#Removing non informative columns
data_ = data.drop(['Study ID', 'Patient ID', 'Sample ID', 'Type of Breast Surgery', 'Cancer Type Detailed', 'Cohort'
                  , 'HER2 status measured by SNP6', 'Hormone Therapy', 'Integrative Cluster', 'Oncotree Code', 'Pam50 + Claudin-low subtype'
                  , 'ER status measured by IHC', 'Number of Samples Per Patient', 'Patient\'s Vital Status', 'Radio Therapy'
                   , 'Sex', 'Cancer Type', 'Tumor Stage', 'Sample Type', '3-Gene classifier subtype', 'Tumor Other Histologic Subtype'], axis = 1)

In [7]:
data_.shape

(2509, 17)

In [8]:
data_['Overall Survival Status'] 

0         0:LIVING
1         0:LIVING
2       1:DECEASED
3         0:LIVING
4       1:DECEASED
           ...    
2504           NaN
2505           NaN
2506           NaN
2507           NaN
2508           NaN
Name: Overall Survival Status, Length: 2509, dtype: object

In [9]:
#Categorizing Columns
leave_columns = ['Cellularity', 'Chemotherapy', 'ER Status', 'HER2 Status', 
                 'Inferred Menopausal State', 'Primary Tumor Laterality', 'PR Status', 'Neoplasm Histologic Grade']
numerical_columns = ['Age at Diagnosis', 'Lymph nodes examined positive', 'Mutation Count',
                    'Nottingham prognostic index', 'Relapse Free Status (Months)', 'Tumor Size']
labels = ['Overall Survival Status', 'Overall Survival (Months)']


In [10]:
#Selecting those rows for which Overall Survival Status is not NA
data_ = data_[data_['Overall Survival Status'].notna()]
#Selecting those samples for which Overall Survival Time is greater than 0
data_ = data_[data_['Overall Survival (Months)'] > 0]

In [11]:
d = {'0:LIVING': False, '1:DECEASED': True}
data_['Overall Survival Status'] = data_['Overall Survival Status'].map(d)

In [12]:
#Separating X and Y
X = data_[numerical_columns+leave_columns]
Y = data_[labels]

In [13]:
#Dividing training and testing data
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.20, random_state=random_state)

In [14]:
# cat = X_train[leave_columns].to_numpy()
X_train[leave_columns]

Unnamed: 0,Cellularity,Chemotherapy,ER Status,HER2 Status,Inferred Menopausal State,Primary Tumor Laterality,PR Status,Neoplasm Histologic Grade
99,Low,YES,Positive,Negative,Pre,Right,Positive,2.0
1534,Moderate,NO,Positive,Negative,Pre,Left,Positive,2.0
909,Moderate,NO,Positive,Negative,Post,Left,Positive,3.0
1645,Moderate,NO,Negative,Negative,Post,Right,Negative,3.0
285,Low,YES,Negative,Negative,Post,Left,Negative,3.0
...,...,...,...,...,...,...,...,...
929,High,YES,Negative,Negative,Pre,Right,Negative,3.0
1252,High,NO,Positive,Negative,Post,Right,Negative,2.0
275,High,YES,Positive,Positive,Pre,Right,Positive,3.0
479,Moderate,NO,Negative,Negative,Post,Right,Negative,3.0


In [15]:
numer_imputer = [([col], [SimpleImputer(missing_values = np.nan, strategy = 'mean')]) for col in numerical_columns]
col_imputer = [([col], [SimpleImputer(missing_values = np.nan, strategy = 'most_frequent')]) for col in leave_columns]
imputer_mapper = DataFrameMapper(numer_imputer + col_imputer, df_out = True)

In [16]:
X_train = imputer_mapper.fit_transform(X_train)
X_test = imputer_mapper.transform(X_test)
# x_train_double_temp = inference_mapper.fit_transform(X_train)
# categorical_features = [6, 7, 8, 9, 10, 11, 12, 13]
categorical_names = {}
i = 6
for feature in leave_columns:
    le = LabelEncoder()
    le.fit(X_train[feature])
    X_train[feature] = le.transform(X_train[feature])
    X_test[feature] = le.transform(X_test[feature])
    categorical_names[i] = le.classes_
    i += 1
categorical_names

{6: array(['High', 'Low', 'Moderate'], dtype=object),
 7: array(['NO', 'YES'], dtype=object),
 8: array(['Negative', 'Positive'], dtype=object),
 9: array(['Negative', 'Positive'], dtype=object),
 10: array(['Post', 'Pre'], dtype=object),
 11: array(['Left', 'Right'], dtype=object),
 12: array(['Negative', 'Positive'], dtype=object),
 13: array([1., 2., 3.])}

In [17]:
numer_preprocess = [([col], [MinMaxScaler()]) for col in numerical_columns]
leave_preprocess = [([col], [OneHotEncoder()]) for col in leave_columns]
encoder_mapper = DataFrameMapper(numer_preprocess+leave_preprocess, df_out = False)
# numer_preprocess_1 = [([col], [SimpleImputer(missing_values = np.nan, strategy = 'mean')]) for col in numerical_columns]
# leave_preprocess_1 = [([col], [SimpleImputer(missing_values = np.nan, strategy = 'most_frequent')]) for col in leave_columns]
# inference_mapper = DataFrameMapper(numer_preprocess_1+leave_preprocess_1, df_out = False)

In [18]:
x_mapper_temp = DataFrameMapper(numer_preprocess+leave_preprocess, df_out = True)
# inf_mapper_temp = DataFrameMapper(numer_preprocess_1+leave_preprocess_1, df_out = True)

In [19]:
x_temp = x_mapper_temp.fit_transform(X_train)
# inf_temp = inf_mapper_temp.fit_transform(X_train)

In [20]:
x_temp.head()

Unnamed: 0,Age at Diagnosis,Lymph nodes examined positive,Mutation Count,Nottingham prognostic index,Relapse Free Status (Months),Tumor Size,Cellularity_x0_0,Cellularity_x0_1,Cellularity_x0_2,Chemotherapy_x0_0,...,HER2 Status_x0_1,Inferred Menopausal State_x0_0,Inferred Menopausal State_x0_1,Primary Tumor Laterality_x0_0,Primary Tumor Laterality_x0_1,PR Status_x0_0,PR Status_x0_1,Neoplasm Histologic Grade_x0_0,Neoplasm Histologic Grade_x0_1,Neoplasm Histologic Grade_x0_2
99,0.372881,0.022222,0.075949,0.567164,0.393441,0.104972,0.0,1.0,0.0,0.0,...,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0
1534,0.346817,0.0,0.164557,0.378731,0.066488,0.077348,0.0,0.0,1.0,1.0,...,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0
909,0.696624,0.022222,0.101266,0.76306,0.486893,0.243094,0.0,0.0,1.0,1.0,...,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0
1645,0.550776,0.0,0.063291,0.576866,0.728218,0.248619,0.0,0.0,1.0,1.0,...,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0
285,0.662869,0.244444,0.012658,0.949627,0.192794,0.243094,0.0,1.0,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0


In [21]:
# inf_temp.head()

In [22]:
x_train = encoder_mapper.fit_transform(X_train)

In [23]:
x_train.shape

(1584, 24)

In [24]:
y_train

Unnamed: 0,Overall Survival Status,Overall Survival (Months)
99,False,138.100000
1534,True,155.366667
909,True,170.900000
1645,False,255.600000
285,True,137.100000
...,...,...
929,True,282.833333
1252,True,112.633333
275,False,96.966667
479,False,88.000000


In [25]:
y_train_final = y_train.to_records(index = False, column_dtypes = {'Overall Survival' : 'u1'})
y_train_final

rec.array([(False, 138.1       ), ( True, 155.3666667 ),
           ( True, 170.9       ), ..., (False,  96.96666667),
           (False,  88.        ), (False,   2.53333333)],
          dtype=[('Overall Survival Status', '?'), ('Overall Survival (Months)', '<f8')])

In [26]:
x_train.shape

(1584, 24)

In [27]:
#Converting survival time to log scale for survival svm model
y_train_log_t = y_train_final.copy()
y_train_log_t['Overall Survival (Months)'] = np.log1p(y_train_final['Overall Survival (Months)'])

In [28]:
y_train_log_t

rec.array([(False, 4.9351931 ), ( True, 5.05220368), ( True, 5.14691291),
           ..., (False, 4.58462728), (False, 4.48863637),
           (False, 1.26224171)],
          dtype=[('Overall Survival Status', '?'), ('Overall Survival (Months)', '<f8')])

In [29]:
ref_estimator = FastSurvivalSVM(rank_ratio=0.0, max_iter=1000, tol=1e-5, random_state=0)
ref_estimator.fit(x_train, y_train_log_t)

cindex = concordance_index_censored(
    y_train_final['Overall Survival Status'],
    y_train_final['Overall Survival (Months)'],
    -ref_estimator.predict(x_train),  # flip sign to obtain risk scores
)
print(round(cindex[0], 3))

0.915


In [30]:
y_test_final = y_test.to_records(index = False, column_dtypes = {'Overall Survival' : 'u1'})
y_test_final
x_test = encoder_mapper.transform(X_test)

In [31]:
#generating predictions
pred_y = np.expm1(ref_estimator.predict(x_test))

In [32]:
cindex = concordance_index_censored(
    y_test_final['Overall Survival Status'],
    y_test_final['Overall Survival (Months)'],
    -ref_estimator.predict(x_test),  # flip sign to obtain risk scores
)
print(round(cindex[0], 3))

0.896


In [33]:
x_train_dummy = x_mapper_temp.fit_transform(X_train)

In [34]:
x_train_dummy.head()

Unnamed: 0,Age at Diagnosis,Lymph nodes examined positive,Mutation Count,Nottingham prognostic index,Relapse Free Status (Months),Tumor Size,Cellularity_x0_0,Cellularity_x0_1,Cellularity_x0_2,Chemotherapy_x0_0,...,HER2 Status_x0_1,Inferred Menopausal State_x0_0,Inferred Menopausal State_x0_1,Primary Tumor Laterality_x0_0,Primary Tumor Laterality_x0_1,PR Status_x0_0,PR Status_x0_1,Neoplasm Histologic Grade_x0_0,Neoplasm Histologic Grade_x0_1,Neoplasm Histologic Grade_x0_2
99,0.372881,0.022222,0.075949,0.567164,0.393441,0.104972,0.0,1.0,0.0,0.0,...,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0
1534,0.346817,0.0,0.164557,0.378731,0.066488,0.077348,0.0,0.0,1.0,1.0,...,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0
909,0.696624,0.022222,0.101266,0.76306,0.486893,0.243094,0.0,0.0,1.0,1.0,...,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0
1645,0.550776,0.0,0.063291,0.576866,0.728218,0.248619,0.0,0.0,1.0,1.0,...,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0
285,0.662869,0.244444,0.012658,0.949627,0.192794,0.243094,0.0,1.0,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0


In [35]:
'''

Lime explainer

'''
# numer_preprocess = [([col], None) for col in numerical_columns]
# leave_preprocess = [([col], [SimpleImputer(missing_values = np.nan, strategy = 'most_frequent')]) for col in leave_columns]
# x_mapper_double_temp = DataFrameMapper(numer_preprocess+leave_preprocess, df_out = False)
# x_train_double_temp = x_mapper_double_temp.fit_transform(X_train)
# x_train_double_temp = inference_mapper.fit_transform(X_train)
categorical_features = [6, 7, 8, 9, 10, 11, 12, 13]
# categorical_names = {}
# for feature in categorical_features:
#     le = LabelEncoder()
#     le.fit(x_train_double_temp[:, feature])
#     x_train_double_temp[:, feature] = le.transform(x_train_double_temp[:, feature])
#     categorical_names[feature] = le.classes_
# categorical_names

In [36]:
# predict_fn = lambda x : ref_estimator.predict(encoder_mapper.transform(x))
def predict_fn(x):
    df = pd.DataFrame(x, columns = numerical_columns+leave_columns)
    return ref_estimator.predict(encoder_mapper.transform(df))

In [37]:
print(categorical_names)

{6: array(['High', 'Low', 'Moderate'], dtype=object), 7: array(['NO', 'YES'], dtype=object), 8: array(['Negative', 'Positive'], dtype=object), 9: array(['Negative', 'Positive'], dtype=object), 10: array(['Post', 'Pre'], dtype=object), 11: array(['Left', 'Right'], dtype=object), 12: array(['Negative', 'Positive'], dtype=object), 13: array([1., 2., 3.])}


In [38]:
explainer = lime.lime_tabular.LimeTabularExplainer(X_train.to_numpy() ,feature_names = numerical_columns+leave_columns,
                                                   categorical_features=categorical_features, 
                                                   categorical_names=categorical_names, kernel_width=3, mode='regression', feature_selection = 'none')

In [46]:
#Active Learning based sampling
# x_test = X_test.to_numpy()
# i = 5
# samples, y, distances = explainer.generate_samples(x_test[i], predict_fn, 5000)

14


In [None]:
# y_test_final['Overall Survival Status'][i]

In [39]:
X_test.head()

Unnamed: 0,Age at Diagnosis,Lymph nodes examined positive,Mutation Count,Nottingham prognostic index,Relapse Free Status (Months),Tumor Size,Cellularity,Chemotherapy,ER Status,HER2 Status,Inferred Menopausal State,Primary Tumor Laterality,PR Status,Neoplasm Histologic Grade
1510,57.75,0.0,5.0,2.052,226.81,26.0,2,0,1,0,0,1,1,0
601,60.48,0.0,6.0,3.028,265.46,14.0,2,0,1,0,0,0,0,1
117,75.18,21.0,2.0,6.056,92.43,28.0,0,0,1,0,0,1,0,2
1204,59.12,0.0,8.0,1.05,12.43,25.0,2,0,1,0,0,0,0,2
687,43.68,0.0,4.0,3.05,139.84,25.0,2,0,1,0,1,1,1,1


In [53]:
import warnings
import arviz as az
import numpy as np
import pymc3 as pm
import scipy as sp
import seaborn as sns
from pymc3.distributions import Interpolated
from matplotlib import pyplot as plt
from matplotlib.ticker import StrMethodFormatter
from statsmodels import datasets
from theano import shared
from theano import tensor as tt

def softmax(x):
    y = np.exp(x)
    f_x = y / np.sum(np.exp(x))
    return f_x

def censored_distances(survival_status, event, distances):
    cens_dist = np.abs(survival_status - event)
    return np.sqrt(np.square(distances)+cens_dist)

def gumbel_sf(y, μ, σ):
    return 1.0 - tt.exp(-tt.exp(-(y - μ) / σ))

def train_model(X, y_std, cens, distances):
#     cens = event == 0
    model = pm.Model()
    cens_ = shared(cens)
    with model:
        distances_ = pm.Data("distances", distances)
        X_ = pm.Data("data", X)
#         cens_ = pm.Data("cens", cens)
        sigma_squared = pm.HalfNormal("sigma_squared", 5)
        beta = pm.Normal("beta", 0.0, sigma_squared, shape = X.shape[1])
        eta = beta.dot(X_.T)
        y_obs = pm.Gumbel("y_obs", eta[~cens_], sigma_squared/distances_[~cens_], observed=y_std[~cens])
        y_cens = pm.Potential("y_cens", gumbel_sf(y_std[cens], eta[cens_], sigma_squared/distances_[cens_]))
    SEED = 845199  # from random.org, for reproducibility
    SAMPLE_KWARGS = {"chains": 3, "tune": 1000, "random_seed": [SEED, SEED + 1, SEED + 2], "target_accept" : 0.9}
    with model:
        weibull_trace = pm.sample(**SAMPLE_KWARGS)
    return model, weibull_trace, cens_

def censored_focussed_sampling(explainer, point, i, y_train, y_test_final, S, N, A, batch_size, predict_fn):
#     x_test = X_test.to_numpy()
    samples, y_std, distances = explainer.generate_samples(point, predict_fn, S)
    unique, counts = np.unique(y_train['Overall Survival Status'], return_counts=True)
    p1 = counts[0]/sum(counts)
    p2 = counts[1]/sum(counts)
    event = np.random.choice([0, 1], size = S-1, p = [p1, p2])
    X = np.empty((samples.shape[0], samples.shape[1]+1))
    X[:, 0] = 1
    for i in range(samples.shape[1]):
        X[:, i+1] = samples[:, i]
    X = X[1:, :]
    y_std = y_std[1:]
    distances = distances[1:]
    distances = censored_distances(y_test_final['Overall Survival Status'][i], event, distances)
    model, weibull_trace, cens_ = train_model(X, y_std, event==0, distances)
    for _ in range(0, N-S, batch_size):
        samples, y_std_sampled, distances_sampled = explainer.generate_samples(point, predict_fn, A)
        X_sampled = np.empty((samples.shape[0], samples.shape[1]+1))
        X_sampled[:, 0] = 1
        for i in range(samples.shape[1]):
            X_sampled[:, i+1] = samples[:, i]
        X_sampled = X_sampled[1:, :]
        y_std_sampled = y_std_sampled[1:]
        rng = default_rng(random_state)
        event_sampled = rng.choice([0, 1], size = A-1, p = [p1, p2])
        distances_sampled = censored_distances(y_test_final['Overall Survival Status'][i], event_sampled, distances_sampled[1:])
        rng_ = default_rng(random_state)
        cens_pp = rng.choice([False, True], size = A-1, p = [1, 0])
        cens_.set_value(cens_pp)
        with model:
            pm.set_data({"data" : X_sampled, "distances" : distances_sampled})
            pp_weibull_trace = pm.sample_posterior_predictive(weibull_trace, samples=1500)
        p_test_pred = np.square(pp_weibull_trace["y_obs"].std(axis=0))
        normalized_sd = softmax(p_test_pred)
        rng__ = default_rng(random_state)
        top_k = rng__.choice([i for i in range(p_test_pred.shape[0])], size = batch_size, p = normalized_sd, replace = False)
        selected_X = X_sampled[top_k]
        selected_dist = distances_sampled[top_k]
        selected_event = event_sampled[top_k]
#         cens_.set_value(selected_event == 0)
        selected_y_std = y_std_sampled[top_k]
        X = np.vstack([X, selected_X])
        y_std = np.hstack([y_std, selected_y_std])
        distances = np.hstack([distances, selected_dist])
        event = np.hstack([event, selected_event])
        model, weibull_trace, cens_ = train_model(X, y_std, event==0, distances)
    df = az.summary(weibull_trace)
    return df['mean'].to_numpy()


In [56]:
# censored_focussed_sampling(explainer, X_test, 3, y_train, y_test_final, 100, 5000, 1000, 50, predict_fn)

import time

def kendall_w(expt_ratings):
    if expt_ratings.ndim!=2:
        raise 'ratings matrix must be 2-dimensional'
    m = expt_ratings.shape[0] #raters
    n = expt_ratings.shape[1] # items rated
    denom = m**2*(n**3-n)
    rating_sums = np.sum(expt_ratings, axis=0)
    S = n*np.var(rating_sums)
    return 12*S/denom

'''
Kendall's W
'''

S = 50
A = 100
batch_size = 10
index = 3
k = 20
val = []

start_time = time.time()
for N in [70, 80, 90, 100]:
    print("Num of Samples : "+str(N))
    rankings = {}
    point = X_test.to_numpy()[index]
    for j in range(0, k):
        print("Kendall Iteration : "+str(j))
#         importance_score_bias = censored_focussed_sampling(explainer, X_test, i, y_train, y_test_final, N//5, N, N*4, batch_size, predict_fn)
        importance_score_bias = censored_focussed_sampling(explainer, point, index, y_train, y_test_final, S, N, A, batch_size, predict_fn)
        importance_score = importance_score_bias[1:]
        print(np.abs(importance_score))
        indices_sort = np.argsort(-np.abs(importance_score)).argsort()
        for j1 in range(len(indices_sort)):
            if j1 not in rankings:
                rankings[j1] = [indices_sort[j1]]
            else:
                rankings[j1].append(indices_sort[j1])
    final_list = []
#     print(rankings)
    for key in rankings:
        final_list.append(rankings[key])
    value = kendall_w(np.array(final_list))
    print(final_list)
    print(value)
    val.append(value)
print("--- %s seconds ---" % (time.time() - start_time))

# '''
# max senstivity
# '''

# def max_sensitivity(radius, num_points, index, X_test):
#     for i in range(num_points):
#         max_diff = -math.inf
#         point = X_test.to_numpy()[index]
# #         expl = censored_focussed_sampling(explainer, point, index, y_train, y_test_final, S, N, A, batch_size, predict_fn)
# #         norm = np.linalg.norm(expl)
#         for _ in range(num_points):
#             noise = np.random.uniform(-1 * radius, radius, point.shape[0])
#             point_ = point+noise
# #             print(point_.shape)
#             expl_ = censored_focussed_sampling(explainer, point_, index, y_train, y_test_final, S, N, A, batch_size, predict_fn)
#             max_diff = max(max_diff, np.linalg.norm(expl-expl_))/norm
#     return max_diff
            

# metric = max_sensitivity(0.2, 10, 3, X_test)

# '''
# Shap Values Baseline - global
# kernel shap - local 
# insertion deletion
# '''


Num of Samples : 70
Kendall Iteration : 0


Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (3 chains in 4 jobs)
NUTS: [beta, sigma_squared]


Sampling 3 chains for 1_000 tune and 1_000 draw iterations (3_000 + 3_000 draws total) took 29 seconds.
There were 7 divergences after tuning. Increase `target_accept` or reparameterize.
There were 4 divergences after tuning. Increase `target_accept` or reparameterize.
  "samples parameter is smaller than nchains times ndraws, some draws "


Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (3 chains in 4 jobs)
NUTS: [beta, sigma_squared]


Sampling 3 chains for 1_000 tune and 1_000 draw iterations (3_000 + 3_000 draws total) took 33 seconds.
  "samples parameter is smaller than nchains times ndraws, some draws "


Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (3 chains in 4 jobs)
NUTS: [beta, sigma_squared]


Sampling 3 chains for 1_000 tune and 1_000 draw iterations (3_000 + 3_000 draws total) took 29 seconds.
There were 3 divergences after tuning. Increase `target_accept` or reparameterize.
There was 1 divergence after tuning. Increase `target_accept` or reparameterize.


[ 0.577  0.437  0.229  0.879 -1.464  0.082 -0.466 -0.123 -0.402  1.573
  0.971 -0.134  0.332  0.558  1.594]
Kendall Iteration : 1


Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (3 chains in 4 jobs)
NUTS: [beta, sigma_squared]


Sampling 3 chains for 1_000 tune and 1_000 draw iterations (3_000 + 3_000 draws total) took 28 seconds.
  "samples parameter is smaller than nchains times ndraws, some draws "


Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (3 chains in 4 jobs)
NUTS: [beta, sigma_squared]


Sampling 3 chains for 1_000 tune and 1_000 draw iterations (3_000 + 3_000 draws total) took 31 seconds.
There were 3 divergences after tuning. Increase `target_accept` or reparameterize.
  "samples parameter is smaller than nchains times ndraws, some draws "


Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (3 chains in 4 jobs)
NUTS: [beta, sigma_squared]


Sampling 3 chains for 1_000 tune and 1_000 draw iterations (3_000 + 3_000 draws total) took 28 seconds.


[ 0.337  0.036  0.44  -0.465 -1.386 -0.146  0.044  0.31   0.504 -0.091
  0.295  0.031 -0.275 -0.109  1.473]
[[5, 5], [8, 13], [11, 4], [4, 3], [2, 1], [14, 9], [7, 12], [13, 6], [9, 2], [1, 11], [3, 7], [12, 14], [10, 8], [6, 10], [0, 0]]
0.0
--- 220.91158390045166 seconds ---


In [None]:
az.summary(trace)

In [None]:
az.summary(trace_)

In [None]:
values = model["data"].get_value()

In [None]:
values_ = model_["data"].get_value()
comparison = values == values_
print(comparison.all())

In [None]:
unique, counts = np.unique(y_train['Overall Survival Status'], return_counts=True)
p1 = counts[0]/sum(counts)
p2 = counts[1]/sum(counts)

In [None]:
# x_double_test = X_test.to_numpy()
x_test = X_test.to_numpy()
i = 5
event = np.random.choice([False, True], size = 5000, p = [p1, p2])
event_distances = np.abs(event*1 - np.array(y_test_final['Overall Survival Status'][i])*1)
exp = explainer.explain_instance(x_test[i], predict_fn, event_distances, num_features=5)
exp.show_in_notebook(show_all=False)

In [None]:
exp.local_exp[0]

In [None]:
explainer = lime.lime_tabular.LimeTabularExplainer(X_train.to_numpy() ,feature_names = numerical_columns+leave_columns,
                                                   categorical_features=categorical_features, 
                                                   categorical_names=categorical_names, kernel_width=3, mode='regression', feature_selection = 'none')
x_test = X_test.to_numpy()
i = 5
event = np.random.choice([False, True], size = 5000, p = [p1, p2])
event_distances = 0
exp = explainer.explain_instance(x_test[i], predict_fn, event_distances, num_features=5)
exp.show_in_notebook(show_all=False)

In [None]:
exp.local_exp[0]

In [None]:
from lifelines import WeibullAFTFitter, LogNormalAFTFitter, LogLogisticAFTFitter

In [None]:
class aft_model:
    def __init__(self, p1, p2, event, penalizer = 0.01, aft_fitter = 'weibull', num_samples = 5000, fit_intercept = True, ancillary=False):
        self.model = WeibullAFTFitter(fit_intercept, penalizer = penalizer)
        if aft_fitter == 'lognormal':
            self.model = LogNormalAFTFitter(fit_intercept)
        elif aft_fitter == 'loglogistic':
            self.model = LogLogisticAFTFitter(fit_intercept)
        self.num_samples = num_samples
        self.coef_ = None
        self.intercept_ = None
        self.p1 = p1
        self.p2 = p2
        self.ancillary = ancillary
        self.event = event
        
    def generate_event(self):
        return np.random.choice([False, True], size = self.num_samples, p = [self.p1, self.p2])
        
    def fit(self, X, y, sample_weight):
        
        
#         sample_weight = np.sqrt(sample_weight)
#         event = generate_event()
#         print(event.shape)
#         print(self.real_event)
#         print(self.real_event.shape)
#         sample_weight = np.sqrt(np.exp(-(np.log(sample_weight) + np.abs(event*1 - self.real_event*1))))
        X = np.multiply(X, sample_weight[:, np.newaxis])
        y = y*sample_weight
        df = pd.DataFrame(X)
        df[str(X.shape[1])] = self.event
        df[str(X.shape[1]+1)] = y
        self.model.fit(df, str(X.shape[1]+1), str(X.shape[1]), ancillary = self.ancillary)
        params_ = self.model.params_['lambda_']
        self.intercept_ = params_['Intercept']
        self.coef_ = params_.drop('Intercept', inplace=False)
        
    def score(self, X, y, sample_weight):
        return self.model.concordance_index_
    
    def predict(self, X):
        return 20
    
    def print_here(self):
        print("Here I am!!")

In [None]:
i = 5

In [None]:
event = np.random.choice([False, True], size = 100, p = [p1, p2])
event_distances = np.abs(event*1 - np.array(y_test_final['Overall Survival Status'][i])*1)

In [None]:
regressor = aft_model(p1 = p1, p2 = p2, event = event)

In [None]:
explainer_1 = lime.lime_tabular.LimeTabularExplainer(X_train.to_numpy() ,feature_names = numerical_columns+leave_columns,
                                                   categorical_features=categorical_features, 
                                                   categorical_names=categorical_names, kernel_width=3, mode='regression', feature_selection = 'none')

In [None]:
exp = explainer_1.explain_instance(x_test[i], predict_fn, event_distances, num_features=5, model_regressor = regressor, num_samples = 100)
exp.show_in_notebook(show_all=False)

In [None]:
import warnings

import arviz as az
import numpy as np
import pymc3 as pm
import scipy as sp
import seaborn as sns

from matplotlib import pyplot as plt
from matplotlib.ticker import StrMethodFormatter
from statsmodels import datasets
from theano import shared
from theano import tensor as tt

In [None]:
class bayesian_aft_model:
    def __init__(self, p1, p2, event, prior_mean = 0):
        self.p1 = p1
        self.p2 = p2
        self.event = event
        self.prior_mean = prior_mean
        
    def generate_event(self):
        return np.random.choice([False, True], size = self.num_samples, p = [self.p1, self.p2])
    
    def gumbel_sf(self, y, mu, sigma):
        return 1.0 - tt.exp(-tt.exp(-(y - mu) / sigma))
        
    def fit(self, X, y, sample_weight):
        
        
#         sample_weight = np.sqrt(sample_weight)
#         event = generate_event()
#         print(event.shape)
#         print(self.real_event)
#         print(self.real_event.shape)
#         sample_weight = np.sqrt(np.exp(-(np.log(sample_weight) + np.abs(event*1 - self.real_event*1))))
        X = np.multiply(X, sample_weight[:, np.newaxis])
        y = y*sample_weight
#         df = pd.DataFrame(X)
        VAGUE_PRIOR_SD = 5.0
        with pm.Model() as weibull_model:
            beta = pm.Normal("beta", self.prior_mean, VAGUE_PRIOR_SD, shape=X.shape[1])
        X_ = shared(X)
        with weibull_model:
            eta = beta.dot(X_.T)
        with weibull_model:
            s = pm.HalfNormal("s", 5.0)
        cens = event == 0
        cens_ = shared(cens)
        with weibull_model:
            y_obs = pm.Gumbel("y_obs", eta[~cens_], s, observed=y[~cens])
        
        with weibull_model:
            y_cens = pm.Potential("y_cens", self.gumbel_sf(y[cens], eta[cens_], s))
            
        SEED = 31415  # from random.org, for reproducibility

        SAMPLE_KWARGS = {"chains": 3, "tune": 1000, "random_seed": [SEED, SEED + 1, SEED + 2]}
        with weibull_model:
            weibull_trace = pm.sample(**SAMPLE_KWARGS)
        self.intercept_ = 0
        self.coef_ = az.summary(weibull_trace)['mean'][:-1].to_numpy()
        self.trace = weibull_trace
#         self.coef_ = 
#         y = y_train_log_t['Overall Survival (Months)']
#         y_std = (y - y.mean()) / y.std()
#         df[str(X.shape[1])] = self.event
#         df[str(X.shape[1]+1)] = y
#         self.model.fit(df, str(X.shape[1]+1), str(X.shape[1]), ancillary = self.ancillary)
#         params_ = self.model.params_['lambda_']
#         self.intercept_ = params_['Intercept']
#         self.coef_ = params_.drop('Intercept', inplace=False)
        
    def score(self, X, y, sample_weight):
        return 9
    
    def predict(self, X):
        return 20
    
    def print_here(self):
        print("Here I am!!")

In [None]:
event = np.random.choice([False, True], size = 100, p = [p1, p2])
event_distances = np.abs(event*1 - np.array(y_test_final['Overall Survival Status'][i])*1)
regressor = bayesian_aft_model(p1 = p1, p2 = p2, event = event)
explainer = lime.lime_tabular.LimeTabularExplainer(X_train.to_numpy() ,feature_names = numerical_columns+leave_columns,
                                                   categorical_features=categorical_features, 
                                                   categorical_names=categorical_names, kernel_width=3, mode='regression', feature_selection = 'none')
x_test = X_test.to_numpy()
i = 5
exp = explainer.explain_instance(x_test[i], predict_fn, event_distances, num_features=5, model_regressor = regressor, num_samples = 100)
exp.show_in_notebook(show_all=False)

In [None]:
trace = regressor.trace
az.summary(trace)
# az.plot_posterior(weibull_trace, lw=0, alpha=0.5);

In [None]:
(exp.local_exp[0])

In [None]:
import kendall_w as kw
import time
samples = [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 120, 140, 160, 180, 200]

In [None]:
'''
LIME with censoring
'''
k = 200
n = 200
val = []
explainer = lime.lime_tabular.LimeTabularExplainer(X_train.to_numpy() ,feature_names = numerical_columns+leave_columns,
                                                   categorical_features=categorical_features, 
                                                   categorical_names=categorical_names, kernel_width=3, mode='regression', feature_selection = 'none')
start_time = time.time()
for i in samples:
    print(i)
    rankings = {}
    for j in range(k):
        i1 = 0
        event = np.random.choice([False, True], size = i, p = [p1, p2])
        event_distances = np.abs(event*1 - np.array(y_test_final['Overall Survival Status'][i1])*1)
        exp = explainer.explain_instance(x_test[i1], predict_fn, event_distances = event_distances, num_features=5, num_samples = i)
#         exp.show_in_notebook(show_all=False)
        tuples = exp.local_exp[0]
        for j1 in range(len(tuples)):
            if tuples[j1][0] in rankings:
                rankings[tuples[j1][0]].append(j1)
            else:
                rankings[tuples[j1][0]] = [j1]
    final_list = []
    for key in rankings:
        final_list.append(rankings[key])
    value = kw.compute_w(final_list)
    val.append(value)
print("--- %s seconds ---" % (time.time() - start_time))

In [None]:
plt.plot(samples, val, label = 'lime')

In [None]:
'''
LIME without censoring
'''
k = 200
n = 200
val_noncen = []
explainer = lime.lime_tabular.LimeTabularExplainer(X_train.to_numpy() ,feature_names = numerical_columns+leave_columns,
                                                   categorical_features=categorical_features, 
                                                   categorical_names=categorical_names, kernel_width=3, mode='regression', feature_selection = 'none')
start_time = time.time()
for i in samples:
    print(i)
    rankings = {}
    for j in range(k):
        i1 = 0
#         event = np.random.choice([False, True], size = 5000, p = [p1, p2])
        event_distances = 0
        exp = explainer.explain_instance(x_test[i1], predict_fn, event_distances = event_distances, num_features=5, num_samples = i)
#         exp.show_in_notebook(show_all=False)
        tuples = exp.local_exp[0]
        for j1 in range(len(tuples)):
            if tuples[j1][0] in rankings:
                rankings[tuples[j1][0]].append(j1)
            else:
                rankings[tuples[j1][0]] = [j1]
    final_list = []
    for key in rankings:
        final_list.append(rankings[key])
    value = kw.compute_w(final_list)
    val_noncen.append(value)
print("--- %s seconds ---" % (time.time() - start_time))

In [None]:
# val = kendall_values
plt.plot(samples, val_noncen, label = 'lime without censored')

In [None]:
'''
AFT with censoring
'''
k = 200
n = 200
kendall_values_aft = []
explainer = lime.lime_tabular.LimeTabularExplainer(X_train.to_numpy() ,feature_names = numerical_columns+leave_columns,
                                                   categorical_features=categorical_features, 
                                                   categorical_names=categorical_names, kernel_width=3, mode='regression', feature_selection = 'none')
for i in samples:
    print(i)
    rankings = {}
    for j in range(k):
#         print(j)
        i1 = 0
        event = np.random.choice([False, True], size = i, p = [p1, p2])
        event_distances = np.abs(event*1 - np.array(y_test_final['Overall Survival Status'][i1])*1)
        regressor = aft_model(num_samples = i, p1 = p1, p2 = p2, event = event, penalizer = 0.1)
        exp = explainer.explain_instance(x_test[i1], predict_fn, event_distances = event_distances, num_features=5, num_samples = i, model_regressor = regressor)
#         exp.show_in_notebook(show_all=False)
        tuples = exp.local_exp[0]
        for j1 in range(len(tuples)):
            if tuples[j1][0] in rankings:
                rankings[tuples[j1][0]].append(j1)
            else:
                rankings[tuples[j1][0]] = [j1]
    final_list = []
    for key in rankings:
        final_list.append(rankings[key])
    value = kw.compute_w(final_list)
    kendall_values_aft.append(value)

In [None]:
plt.plot(samples, kendall_values_aft, label = 'aft weibull')

In [None]:
# '''
# Bayesian AFT with censoring
# '''
# k = 200
# n = 200
# kendall_values_bay = []
# explainer = lime.lime_tabular.LimeTabularExplainer(X_train.to_numpy() ,feature_names = numerical_columns+leave_columns,
#                                                    categorical_features=categorical_features, 
#                                                    categorical_names=categorical_names, kernel_width=3, mode='regression', feature_selection = 'none')
# for i in samples:
#     print(i)
#     rankings = {}
#     for j in range(k):
# #         print(j)
#         i1 = 0
#         event = np.random.choice([False, True], size = i, p = [p1, p2])
#         event_distances = np.abs(event*1 - np.array(y_test_final['Overall Survival Status'][i1])*1)
#         regressor = bayesian_aft_model(p1 = p1, p2 = p2, event = event)
#         exp = explainer.explain_instance(x_test[i1], predict_fn, event_distances = event_distances, num_features=5, num_samples = i, model_regressor = regressor)
# #         exp.show_in_notebook(show_all=False)
#         tuples = exp.local_exp[0]
#         for j1 in range(len(tuples)):
#             if tuples[j1][0] in rankings:
#                 rankings[tuples[j1][0]].append(j1)
#             else:
#                 rankings[tuples[j1][0]] = [j1]
#     final_list = []
#     for key in rankings:
#         final_list.append(rankings[key])
#     value = kw.compute_w(final_list)
#     kendall_values_bay.append(value)

In [None]:
# plt.plot(samples, kendall_values_bay, label = 'bayesian aft weibull')

In [None]:
plt.plot(samples, val, label = 'lime with censoring')
plt.plot(samples, val_noncen, label = 'lime without censoring')
plt.plot(samples, kendall_values_aft, label = 'aft_weibull')
# plt.plot(samples, kendall_values_bay, label = 'bayesian_aft_weibull')
plt.xlabel("num samples")
plt.ylabel("Kendall's W")
plt.title("lime versus aft")
plt.legend()
plt.show()

In [None]:

'''
LIME with censoring
'''
explainer = lime.lime_tabular.LimeTabularExplainer(X_train.to_numpy() ,feature_names = numerical_columns+leave_columns,
                                                   categorical_features=categorical_features, 
                                                   categorical_names=categorical_names, kernel_width=3, mode='regression', feature_selection = 'none')
y_pred = predict_fn(X_train.to_numpy())
i = 5
y_cap = predict_fn(x_test[i].reshape((1, 14)))
y_cap
diff = -np.abs(y_cap-y_pred)
indices = np.argsort(diff)
top_k = 10
print(indices)
final_set = X_train.iloc[indices[:top_k], :].to_numpy()
explanations = []
for j in range(top_k):
#     sample = final_set[j].reshape((1, 14))
    event = np.random.choice([False, True], size = 100, p = [p1, p2])
    event_distances = np.abs(event*1 - np.array(y_train_final['Overall Survival Status'][j])*1)
    exp = explainer.explain_instance(final_set[j], predict_fn, event_distances, num_features=5, num_samples = 100)
    explanations.append(exp.local_exp[0])
#     exp.show_in_notebook(show_all=False)

In [None]:
average_explanations = []
for i in range(X_train.shape[1]):
    ex = []
    for j in range(len(explanations)):
        ex.append(explanations[j][i])
    average_explanations.append(np.mean(ex))
average_explanations

In [None]:
event = np.random.choice([False, True], size = 100, p = [p1, p2])
regressor = bayesian_aft_model(p1 = p1, p2 = p2, event = event, prior_mean = average_explanations)
explainer = lime.lime_tabular.LimeTabularExplainer(X_train.to_numpy() ,feature_names = numerical_columns+leave_columns,
                                                   categorical_features=categorical_features, 
                                                   categorical_names=categorical_names, kernel_width=3, mode='regression', feature_selection = 'none')
x_test = X_test.to_numpy()
i = 5
event_distances = 0
exp = explainer.explain_instance(x_test[i], predict_fn, event_distances, num_features=5, model_regressor = regressor, num_samples = 100)
exp.show_in_notebook(show_all=False)

In [None]:
exp.show_in_notebook(show_all=False)

In [None]:
'''
AFT censoring
'''
explainer = lime.lime_tabular.LimeTabularExplainer(X_train.to_numpy() ,feature_names = numerical_columns+leave_columns,
                                                   categorical_features=categorical_features, 
                                                   categorical_names=categorical_names, kernel_width=3, mode='regression', feature_selection = 'none')
y_pred = predict_fn(X_train.to_numpy())
i = 5
y_cap = predict_fn(x_test[i].reshape((1, 14)))
y_cap
diff = -np.abs(y_cap-y_pred)
indices = np.argsort(diff)
top_k = 10
print(indices)
final_set = X_train.iloc[indices[:top_k], :].to_numpy()
explanations = []
for j in range(top_k):
#     sample = final_set[j].reshape((1, 14))
    event = np.random.choice([False, True], size = 100, p = [p1, p2])
    event_distances = np.abs(event*1 - np.array(y_train_final['Overall Survival Status'][j])*1)
    regressor = aft_model(p1 = p1, p2 = p2, event = event)
    exp = explainer.explain_instance(final_set[j], predict_fn, event_distances, model_regressor = regressor, num_features=5, num_samples = 100)
    explanations.append(exp.local_exp[0])
#     exp.show_in_notebook(show_all=False)

In [None]:
average_explanations = []
for i in range(X_train.shape[1]):
    ex = []
    for j in range(len(explanations)):
        ex.append(explanations[j][i])
    average_explanations.append(np.mean(ex))
average_explanations

In [None]:
event = np.random.choice([False, True], size = 100, p = [p1, p2])
regressor = bayesian_aft_model(p1 = p1, p2 = p2, event = event, prior_mean = average_explanations)
explainer = lime.lime_tabular.LimeTabularExplainer(X_train.to_numpy() ,feature_names = numerical_columns+leave_columns,
                                                   categorical_features=categorical_features, 
                                                   categorical_names=categorical_names, kernel_width=3, mode='regression', feature_selection = 'none')
x_test = X_test.to_numpy()
i = 5
event_distances = np.abs(event*1 - np.array(y_test_final['Overall Survival Status'][i])*1)
exp = explainer.explain_instance(x_test[i], predict_fn, event_distances, num_features=5, model_regressor = regressor, num_samples = 100)
exp.show_in_notebook(show_all=False)

In [None]:
print(val)

In [None]:
print(kendall_values)

In [None]:
'''
To Do
1) Generate explanations with Kendall's w for censoring case.
2) Remove censoring case the generate explanations.
'''

In [None]:
# predict_fn = lambda x: ref_estimator.predict(x)

In [None]:
# columns = []
# for feature in feature_index:
#     if feature in categorical_features:
#         for i in range(len(categorical_features[feature]["values"])):
#             columns.append(feature+"_"+str(i))
#     else:
#         columns.append(feature)
# df = pd.DataFrame(X_train_final, columns = columns)
# df['time'] = y_train_log_t['Overall Survival (Months)']
# df['censoring'] = y_train_log_t['Overall Survival Status']
# df.head()

In [None]:
# aft = lifelines.WeibullAFTFitter()
# aft.fit(df, duration_col = 'time', event_col = 'censoring')

In [None]:
# class explainable_ai_aft:
    
#     def __init__(self, sampling_strategy, regression_model, model):
#         self.sampling_strategy = sampling_strategy
#         self.model = model
#         self.regression_model = regression_model
    
#     def generate_samples(self, categorical_features, numerical_features, feature_index, censoring_proba, num_samples):
#         list = []
#         for feature in feature_index:
#             if feature in categorical_features:
#                 sample = np.random.choice(categorical_features[feature]["values"], size = num_samples, p = categorical_features[feature]["freq"])
#                 enc = OneHotEncoder()
#                 sample_transform = enc.fit_transform(np.expand_dims(sample, axis = 1)).toarray()
#                 print(feature)
#                 print(sample_transform.shape)
#                 list.append(sample_transform)
#             else:
#                 sample = numerical_features[feature]["mean"]+np.random.normal(size = num_samples)*numerical_features[feature]["sigma"]
#                 scaler = MinMaxScaler()
#                 list.append(scaler.fit_transform(np.expand_dims(sample, axis = 1)))
#         censoring = np.random.choice([0, 1], size = num_samples, p = censoring_proba)
#         return np.hstack(list), censoring
    
#     def explain(self, test_data, index, categorical_features, numerical_features, feature_index, censoring_proba, num_samples):
#         samples, censoring = self.generate_samples(categorical_features, numerical_features, feature_index, censoring_proba, num_samples)
#         weights = np.exp(-np.sqrt((np.sum(np.square(samples-test_data[index]), axis = 1))))
#         print(samples.shape)
#         print(censoring.shape)
#         explainer = lifelines.WeibullAFTFitter()
#         if self.regression_model == "weibull":
#             explainer = lifelines.WeibullAFTFitter(penalizer = 0.05)
#         elif self.regression_model == "loglogistic":
#             explainer = lifelines.LogNormalAFTFitter(penalizer = 0.05)
#         elif self.regression_model == "LogLogisticAFTFitter":
#             explainer = lifelines.LogLogisticAFTFitter(penalizer = 0.05)
#         columns = []
#         for feature in feature_index:
#             if feature in categorical_features:
#                 for i in range(len(categorical_features[feature]["values"])):
#                     columns.append(feature+"_"+str(i))    
#             else:
#                 columns.append(feature)
#         print(len(columns))
#         df = pd.DataFrame(samples, columns = columns)
#         df['weights'] = weights
#         df['time'] = self.model(samples)
#         df['censoring'] = censoring
#         explainer.fit(df, duration_col = 'time', event_col = 'censoring', weights_col = 'weights', robust = True)
#         explainer.plot()
#         plt.show()
#         return df

In [None]:
# explainer = explainable_ai_aft(sampling_strategy = "standard_normal", regression_model = "weibull", model = predict_fn)
# df = explainer.explain(X_test_final, 0, categorical_features, numerical_features, feature_index, censoring_proba, 5000)
# df.head()

In [None]:
# categorical_features

In [None]:
# Cellularity
# (100, 3)
# Chemotherapy
# (100, 2)
# ER Status
# (100, 2)
# HER2 Status
# (100, 2)
# Tumor Other Histologic Subtype
# (100, 6)
# Inferred Menopausal State
# (100, 2)
# Primary Tumor Laterality
# (100, 2)
# PR Status
# (100, 2)
# Neoplasm Histologic Grade
# (100, 3)
# (100, 30)
# (100,)
# 32
# # numerical_features

In [None]:
y_train_final['Overall Survival Status']

In [None]:
x_train

In [None]:
x_train.shape

In [None]:
np.append(x_train, y_train_final['Overall Survival Status'], axis = 1)
x_train.shape