In [25]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder, MinMaxScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn import preprocessing
from sksurv.metrics import concordance_index_censored
from sksurv.svm import FastSurvivalSVM
import lifelines
from matplotlib import pyplot as plt
from scipy import stats
from sklearn_pandas import DataFrameMapper
# from sklearn_pandas import CategoricalImputer
import lime
import lime.lime_tabular
import kendall_w as kw

import warnings
import arviz as az
import numpy as np
import pymc3 as pm
import scipy as sp
import seaborn as sns
from pymc3.distributions import Interpolated
from matplotlib import pyplot as plt
from matplotlib.ticker import StrMethodFormatter
from statsmodels import datasets
from theano import shared
from theano import tensor as tt

from numpy.random import default_rng

In [2]:
random_state = 20

In [3]:
data = pd.read_table('../../data/brca_metabric_clinical_data.tsv')

In [4]:
data_ = data.drop(['Study ID', 'Patient ID', 'Sample ID', 'Type of Breast Surgery', 'Cancer Type Detailed', 'Cohort'
                  , 'HER2 status measured by SNP6', 'Hormone Therapy', 'Integrative Cluster', 'Oncotree Code', 'Pam50 + Claudin-low subtype'
                  , 'ER status measured by IHC', 'Number of Samples Per Patient', 'Patient\'s Vital Status', 'Radio Therapy'
                   , 'Sex', 'Cancer Type', 'Tumor Stage', 'Sample Type', '3-Gene classifier subtype', 'Tumor Other Histologic Subtype'], axis = 1)

In [5]:
leave_columns = ['Cellularity', 'Chemotherapy', 'ER Status', 'HER2 Status', 
                 'Inferred Menopausal State', 'Primary Tumor Laterality', 'PR Status', 'Neoplasm Histologic Grade']
numerical_columns = ['Age at Diagnosis', 'Lymph nodes examined positive', 'Mutation Count',
                    'Nottingham prognostic index', 'Relapse Free Status (Months)', 'Tumor Size']
labels = ['Overall Survival Status', 'Overall Survival (Months)']

In [6]:
data_ = data_[data_['Overall Survival Status'].notna()]
data_ = data_[data_['Overall Survival (Months)'] > 0]

In [7]:
d = {'0:LIVING': False, '1:DECEASED': True}
data_['Overall Survival Status'] = data_['Overall Survival Status'].map(d)

In [8]:
X_data = data_[numerical_columns+leave_columns]
Y_data = data_[labels]

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X_data, Y_data, test_size=0.20, random_state=random_state)

In [10]:
numer_imputer = [([col], [SimpleImputer(missing_values = np.nan, strategy = 'mean')]) for col in numerical_columns]
col_imputer = [([col], [SimpleImputer(missing_values = np.nan, strategy = 'most_frequent')]) for col in leave_columns]
imputer_mapper = DataFrameMapper(numer_imputer + col_imputer, df_out = True)

In [11]:
X_train = imputer_mapper.fit_transform(X_train)
X_test = imputer_mapper.transform(X_test)
# x_train_double_temp = inference_mapper.fit_transform(X_train)
# categorical_features = [6, 7, 8, 9, 10, 11, 12, 13]
categorical_names = {}
i = 6
for feature in leave_columns:
    le = LabelEncoder()
    le.fit(X_train[feature])
    X_train[feature] = le.transform(X_train[feature])
    X_test[feature] = le.transform(X_test[feature])
    categorical_names[i] = le.classes_
    i += 1

In [12]:
numer_preprocess = [([col], [MinMaxScaler()]) for col in numerical_columns]
leave_preprocess = [([col], [OneHotEncoder()]) for col in leave_columns]
encoder_mapper = DataFrameMapper(numer_preprocess+leave_preprocess, df_out = False)

In [13]:
x_mapper_temp = DataFrameMapper(numer_preprocess+leave_preprocess, df_out = True)
x_temp = x_mapper_temp.fit_transform(X_train)

In [14]:
x_train = encoder_mapper.fit_transform(X_train)

In [15]:
y_train_final = y_train.to_records(index = False, column_dtypes = {'Overall Survival' : 'u1'})

In [16]:
y_train_log_t = y_train_final.copy()
y_train_log_t['Overall Survival (Months)'] = np.log1p(y_train_final['Overall Survival (Months)'])

In [17]:
ref_estimator = FastSurvivalSVM(rank_ratio=0.0, max_iter=1000, tol=1e-5, random_state=0)
ref_estimator.fit(x_train, y_train_log_t)

cindex = concordance_index_censored(
    y_train_final['Overall Survival Status'],
    y_train_final['Overall Survival (Months)'],
    -ref_estimator.predict(x_train),  # flip sign to obtain risk scores
)
print(round(cindex[0], 3))

0.915


In [18]:
y_test_final = y_test.to_records(index = False, column_dtypes = {'Overall Survival' : 'u1'})
y_test_final
x_test = encoder_mapper.transform(X_test)

In [19]:
# pred_y = np.expm1(ref_estimator.predict(x_test))
cindex = concordance_index_censored(
    y_test_final['Overall Survival Status'],
    y_test_final['Overall Survival (Months)'],
    -ref_estimator.predict(x_test),  # flip sign to obtain risk scores
)
print(round(cindex[0], 3))

0.896


In [20]:
categorical_features = [6, 7, 8, 9, 10, 11, 12, 13]
def predict_fn(x):
    df = pd.DataFrame(x, columns = numerical_columns+leave_columns)
    return ref_estimator.predict(encoder_mapper.transform(df))

In [21]:
explainer = lime.lime_tabular.LimeTabularExplainer(X_train.to_numpy() ,feature_names = numerical_columns+leave_columns,
                                                   categorical_features=categorical_features, 
                                                   categorical_names=categorical_names, kernel_width=3, mode='regression', feature_selection = 'none', random_state = random_state)

In [22]:
def softmax(x):
    y = np.exp(x)
    f_x = y / np.sum(np.exp(x))
    return f_x

def censored_distances(survival_status, event, distances):
    cens_dist = np.abs(survival_status - event)
    return np.sqrt(np.square(distances)+cens_dist)

def gumbel_sf(y, μ, σ):
    return 1.0 - tt.exp(-tt.exp(-(y - μ) / σ))

def train_model(sample_X, y_std, cens, distances):
    cens_ = shared(cens)
    with pm.Model() as model:
        distances_ = pm.Data("distance", distances)
        sample_X_ = pm.Data("data", sample_X)
        sigma_squared = pm.HalfNormal("sigma_squared", 5)
        beta = pm.Normal("beta", 0.0, sigma_squared, shape = sample_X.shape[1])
        eta = beta.dot(sample_X_.T)
        y_obs = pm.Gumbel("y_obs", eta[~cens_], sigma_squared/distances_[~cens_], observed=y_std[~cens])
        y_cens = pm.Potential("y_cens", gumbel_sf(y_std[cens], eta[cens_], sigma_squared/distances_[cens_]))
    SEED = 845199  # from random.org, for reproducibility
    SAMPLE_KWARGS = {"chains": 3, "tune": 100, "random_seed": [SEED, SEED + 1, SEED + 2], "target_accept" : 0.9}
    with model:
        weibull_trace = pm.sample(**SAMPLE_KWARGS)
    return model, weibull_trace, cens_

In [23]:
def censored_focussed_sampling(explainer, point, i, y_train, y_test_final, S, N, A, batch_size, predict_fn):
#     x_test = X_test.to_numpy()
    samples, y_std, distances = explainer.generate_samples(point, predict_fn, S)
    unique, counts = np.unique(y_train['Overall Survival Status'], return_counts=True)
    p1 = counts[0]/sum(counts)
    p2 = counts[1]/sum(counts)
    rng = default_rng(random_state)
    event = rng.choice([0, 1], size = S-1, p = [p1, p2])
    X_sample = np.empty((samples.shape[0], samples.shape[1]+1))
    X_sample[:, 0] = 1
    for i in range(samples.shape[1]):
        X_sample[:, i+1] = samples[:, i]
    X_sample = X_sample[1:, :]
    y_std = y_std[1:]
    distances = distances[1:]
    distances = censored_distances(y_test_final['Overall Survival Status'][i], event, distances)
    model, weibull_trace, cens_ = train_model(X_sample, y_std, event==0, distances)
#     return model, weibull_trace, cens_
    for _ in range(0, N-S, batch_size):
        samples, y_std_sampled, distances_sampled = explainer.generate_samples(x_test[i], predict_fn, A)
        X_sampled = np.empty((samples.shape[0], samples.shape[1]+1))
        X_sampled[:, 0] = 1
        for i in range(samples.shape[1]):
            X_sampled[:, i+1] = samples[:, i]
        X_sampled = X_sampled[1:, :]
        y_std_sampled = y_std_sampled[1:]
        rng_ = default_rng(random_state)
        event_sampled = rng_.choice([0, 1], size = A-1, p = [p1, p2])
        distances_sampled = censored_distances(y_test_final['Overall Survival Status'][i], event_sampled, distances_sampled[1:])
        rng__ = default_rng(random_state)
        cens_pp = rng__.choice([False, True], size = A-1, p = [1, 0])
        cens_.set(cens_pp)
        with model:
            pm.set_data({"data" : X_sampled, "distances" : distances_sampled})
            pp_weibull_trace = pm.sample_posterior_predictive(weibull_trace, samples=1500)
        p_test_pred = np.square(pp_weibull_trace["y_obs"].std(axis=0))
        normalized_sd = softmax(p_test_pred)
        rng___ = default_rng(random_state)
        top_k = rng___.choice([i for i in range(p_test_pred.shape[0])], size = batch_size, p = normalized_sd, replace = False)
        selected_X = X_sampled[top_k]
        selected_dist = distances_sampled[top_k]
        selected_event = event_sampled[top_k]
#         cens_.set_value(selected_event == 0)
        selected_y_std = y_std_sampled[top_k]
        X = np.vstack([X, selected_X])
        y_std = np.hstack([y_std, selected_y_std])
        distances = np.hstack([distances, selected_dist])
        event = np.hstack([event, selected_event])
        model, weibull_trace = train_model(X, y_std, event==0, distances)
    df = az.summary(weibull_trace)
    return df['mean'].to_numpy()


In [26]:
batch_size = 10
S = 50
A = 100
N = 50
index = 5
point = X_test.to_numpy()[index]
model, trace, _  = censored_focussed_sampling(explainer, point, index, y_train, y_test_final, S, N, A, batch_size, predict_fn)

Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (3 chains in 4 jobs)
NUTS: [beta, sigma_squared]


Sampling 3 chains for 100 tune and 1_000 draw iterations (300 + 3_000 draws total) took 23 seconds.
The acceptance probability does not match the target. It is 0.9621464759480016, but should be close to 0.9. Try to increase the number of tuning steps.
The number of effective samples is smaller than 25% for some parameters.


ValueError: too many values to unpack (expected 3)

In [None]:
comparison = model['data'].get_value() == model_['data'].get_value()
print(comparison.all())
# az.summary(trace)

In [None]:
az.summary(trace)

In [None]:
az.summary(trace_)

In [None]:
batch_size = 10
S = 50
A = 100
N = 50
index = 5
point = X_test.to_numpy()[index]
model_, trace_, _  = censored_focussed_sampling(explainer, point, index, y_train, y_test_final, S, N, A, batch_size, predict_fn)
model__, trace__, _ = censored_focussed_sampling(explainer, point, index, y_train, y_test_final, S, N, A, batch_size, predict_fn)

In [None]:
az.summary(trace_)

In [None]:
az.summary(trace__)

In [None]:
values = model_["data"].get_value()
values_ = model__["data"].get_value()
comparison = values == values_
print(comparison.all())

In [None]:
values = model["distance"].get_value()
values_ = model_["distance"].get_value()
comparison = values == values_
print(comparison.all())

In [None]:
values = c.get_value(borrow = True)
values_ = c.get_value(borrow = True)
comparison = values == values_
print(comparison.all())