In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder, MinMaxScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn import preprocessing
from sksurv.metrics import concordance_index_censored
from sksurv.svm import FastSurvivalSVM
import lifelines
from matplotlib import pyplot as plt
from scipy import stats
from sklearn_pandas import DataFrameMapper
# from sklearn_pandas import CategoricalImputer
import lime
import lime.lime_tabular

In [2]:
data = pd.read_table('../../data/brca_metabric_clinical_data.tsv')
data_ = data.drop(['Study ID', 'Patient ID', 'Sample ID', 'Type of Breast Surgery', 'Cancer Type Detailed', 'Cohort'
                  , 'HER2 status measured by SNP6', 'Hormone Therapy', 'Integrative Cluster', 'Oncotree Code', 'Pam50 + Claudin-low subtype'
                  , 'ER status measured by IHC', 'Number of Samples Per Patient', 'Patient\'s Vital Status', 'Radio Therapy'
                   , 'Sex', 'Cancer Type', 'Tumor Stage', 'Sample Type', '3-Gene classifier subtype', 'Tumor Other Histologic Subtype'], axis = 1)

leave_columns = ['Cellularity', 'Chemotherapy', 'ER Status', 'HER2 Status', 
                 'Inferred Menopausal State', 'Primary Tumor Laterality', 'PR Status', 'Neoplasm Histologic Grade']
numerical_columns = ['Age at Diagnosis', 'Lymph nodes examined positive', 'Mutation Count',
                    'Nottingham prognostic index', 'Relapse Free Status (Months)', 'Tumor Size']
labels = ['Overall Survival Status', 'Overall Survival (Months)']

data_ = data_[data_['Overall Survival Status'].notna()]
data_ = data_[data_['Overall Survival (Months)'] > 0]

d = {'0:LIVING': False, '1:DECEASED': True}
data_['Overall Survival Status'] = data_['Overall Survival Status'].map(d)

X_data = data_[numerical_columns+leave_columns]
Y_data = data_[labels]

X_train, X_test, y_train, y_test = train_test_split(X_data, Y_data, test_size=0.20, random_state=20)

numer_imputer = [([col], [SimpleImputer(missing_values = np.nan, strategy = 'mean')]) for col in numerical_columns]
col_imputer = [([col], [SimpleImputer(missing_values = np.nan, strategy = 'most_frequent')]) for col in leave_columns]
imputer_mapper = DataFrameMapper(numer_imputer + col_imputer, df_out = True)

In [3]:
X_train = imputer_mapper.fit_transform(X_train)
X_test = imputer_mapper.transform(X_test)
# x_train_double_temp = inference_mapper.fit_transform(X_train)
# categorical_features = [6, 7, 8, 9, 10, 11, 12, 13]
categorical_names = {}
i = 6
for feature in leave_columns:
    le = LabelEncoder()
    le.fit(X_train[feature])
    X_train[feature] = le.transform(X_train[feature])
    X_test[feature] = le.transform(X_test[feature])
    categorical_names[i] = le.classes_
    i += 1

In [4]:
numer_preprocess = [([col], [MinMaxScaler()]) for col in numerical_columns]
leave_preprocess = [([col], [OneHotEncoder()]) for col in leave_columns]
encoder_mapper = DataFrameMapper(numer_preprocess+leave_preprocess, df_out = False)

x_mapper_temp = DataFrameMapper(numer_preprocess+leave_preprocess, df_out = True)
x_temp = x_mapper_temp.fit_transform(X_train)

x_train = encoder_mapper.fit_transform(X_train)
y_train_final = y_train.to_records(index = False, column_dtypes = {'Overall Survival' : 'u1'})

y_train_log_t = y_train_final.copy()
y_train_log_t['Overall Survival (Months)'] = np.log1p(y_train_final['Overall Survival (Months)'])

In [5]:
ref_estimator = FastSurvivalSVM(rank_ratio=0.0, max_iter=1000, tol=1e-5, random_state=0)
ref_estimator.fit(x_train, y_train_log_t)

cindex = concordance_index_censored(
    y_train_final['Overall Survival Status'],
    y_train_final['Overall Survival (Months)'],
    -ref_estimator.predict(x_train),  # flip sign to obtain risk scores
)
print(round(cindex[0], 3))

y_test_final = y_test.to_records(index = False, column_dtypes = {'Overall Survival' : 'u1'})
y_test_final
x_test = encoder_mapper.transform(X_test)

cindex = concordance_index_censored(
    y_test_final['Overall Survival Status'],
    y_test_final['Overall Survival (Months)'],
    -ref_estimator.predict(x_test),  # flip sign to obtain risk scores
)
print(round(cindex[0], 3))

0.915
0.896


In [6]:
categorical_features = [6, 7, 8, 9, 10, 11, 12, 13]
def predict_fn(x):
    df = pd.DataFrame(x, columns = numerical_columns+leave_columns)
    return ref_estimator.predict(encoder_mapper.transform(df))

In [7]:
explainer = lime.lime_tabular.LimeTabularExplainer(X_train.to_numpy() ,feature_names = numerical_columns+leave_columns,
                                                   categorical_features=categorical_features, 
                                                   categorical_names=categorical_names, kernel_width=3, mode='regression', feature_selection = 'none')

In [8]:
def censored_distances(survival_status, event, distances):
    cens_dist = np.abs(survival_status - event)
    return np.sqrt(np.square(distances)+cens_dist)

def generate_samples(explainer, point, i, y_train, y_test_final, S, N, A, batch_size, predict_fn):
#     x_test = X_test.to_numpy()
    samples, y_std, distances = explainer.generate_samples(point, predict_fn, S)
    unique, counts = np.unique(y_train['Overall Survival Status'], return_counts=True)
    p1 = counts[0]/sum(counts)
    p2 = counts[1]/sum(counts)
    event = np.random.choice([0, 1], size = S-1, p = [p1, p2])
    X_sample = np.empty((samples.shape[0], samples.shape[1]+1))
    X_sample[:, 0] = 1
    for i in range(samples.shape[1]):
        X_sample[:, i+1] = samples[:, i]
    X_sample = X_sample[1:, :]
    y_std = y_std[1:]
    distances = distances[1:]
    distances = censored_distances(y_test_final['Overall Survival Status'][i], event, distances)
#     return samples, X_sample, y_std, distances
#     model, weibull_trace, cens_ = train_model(X_sample, y_std, event==0, distances)
    return samples, X_sample, y_std, event==0, distances

In [154]:
from pyro.nn import PyroSample, PyroModule
import pyro.distributions as dist 
import torch
import pyro


class BayesianRegression(PyroModule):
    def __init__(self, in_features, out_features):
        super().__init__()
        self.sigma_squared = PyroSample(dist.HalfNormal(5))
        self.linear = PyroModule[torch.nn.Linear](in_features, out_features)
        self.linear.weight = PyroSample(dist.Normal(0., self.sigma_squared).expand([out_features, in_features]).to_event(2))
        self.linear.bias = PyroSample(dist.Normal(0., self.sigma_squared).expand([out_features]).to_event(1))

    def forward(self, x, y, distances, truncation_label):
        eta = self.linear(x).squeeze(-1)
        with pyro.plate("data", x.shape[0]) as ind:
#             y_hidden_dist = dist.Gumbel(eta, self.sigma_squared/distances)
            with pyro.poutine.mask(mask = (truncation_label[ind] == False)):
                print("NON CENSORED")
                y_hidden_dist = dist.Gumbel(eta, self.sigma_squared/distances[ind])
                obs = pyro.sample("obs", y_hidden_dist, obs = y[ind])
            with pyro.poutine.mask(mask = (truncation_label[ind] == True)):
                print("CENSORED")
                y_hidden_dist_ = dist.Gumbel(eta, self.sigma_squared/distances[ind])
                truncation_prob = 1 - y_hidden_dist_.cdf(y)
                obs = pyro.sample("truncation_label", dist.Bernoulli(truncation_prob), obs = torch.tensor(1.))
        return eta
# #         sigma = pyro.sample("sigma", dist.Uniform(0., 10.))
# #         mean = self.linear(x).squeeze(-1)
# #         with pyro.plate("data", x.shape[0]):
# #             obs = pyro.sample("obs", dist.Normal(mean, sigma), obs=y)

In [155]:
from pyro.infer.autoguide import AutoDiagonalNormal

model = BayesianRegression(15, 1)
guide = AutoDiagonalNormal(model)

In [156]:
from pyro.infer import SVI, Trace_ELBO


adam = pyro.optim.Adam({"lr": 0.03})
svi = SVI(model, guide, adam, loss=Trace_ELBO())

In [157]:
batch_size = 10
S = 50
A = 100
N = 50
index = 5
point = X_test.to_numpy()[index]
_, x_data, y_data, truncation_label, distances = generate_samples(explainer, point, index, y_train, y_test_final, S, N, A, batch_size, predict_fn)
pyro.clear_param_store()
for j in range(3):
    # calculate the loss and take a gradient step
    loss = svi.step(torch.tensor(x_data.astype(np.float32)), torch.tensor(y_data.astype(np.float32)), torch.tensor(distances.astype(np.float32)), torch.tensor(truncation_label.astype(np.bool)))
    if j % 100 == 0:
        print("[iteration %04d] loss: %.4f" % (j + 1, loss / len(data)))

NON CENSORED
CENSORED


ValueError: Expected value argument (Tensor of shape (49,)) to be within the support (Interval(lower_bound=tensor([1.1755e-38, 1.1755e-38, 1.1755e-38, 1.1755e-38, 1.1755e-38, 1.1755e-38,
        1.1755e-38, 1.1755e-38, 1.1755e-38, 1.1755e-38, 1.1755e-38, 1.1755e-38,
        1.1755e-38, 1.1755e-38, 1.1755e-38, 1.1755e-38, 1.1755e-38, 1.1755e-38,
        1.1755e-38, 1.1755e-38, 1.1755e-38, 1.1755e-38, 1.1755e-38, 1.1755e-38,
        1.1755e-38, 1.1755e-38, 1.1755e-38, 1.1755e-38, 1.1755e-38, 1.1755e-38,
        1.1755e-38, 1.1755e-38, 1.1755e-38, 1.1755e-38, 1.1755e-38, 1.1755e-38,
        1.1755e-38, 1.1755e-38, 1.1755e-38, 1.1755e-38, 1.1755e-38, 1.1755e-38,
        1.1755e-38, 1.1755e-38, 1.1755e-38, 1.1755e-38, 1.1755e-38, 1.1755e-38,
        1.1755e-38]), upper_bound=tensor([1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000,
        1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000,
        1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000,
        1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000,
        1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000,
        1.0000, 1.0000, 1.0000, 1.0000]))) of the distribution Uniform(low: torch.Size([49]), high: torch.Size([49])), but found invalid values:
tensor([6.3356e-02, 8.9421e-01, 9.3722e-01, 2.1201e-01, 3.4173e-01, 3.1594e-01,
        1.0000e+00, 8.4589e-01, 7.7460e-01, 2.2909e-21, 1.6701e-10, 9.0370e-01,
        9.8008e-01, 9.1565e-05, 9.6218e-01, 9.5452e-01, 1.5473e-07, 8.2344e-01,
        7.1049e-08, 4.0661e-04, 6.7408e-08, 2.1187e-03, 9.4497e-01, 9.9607e-01,
        9.5679e-01, 5.1922e-01, 8.8276e-01, 0.0000e+00, 8.8064e-01, 9.9985e-01,
        0.0000e+00, 8.7743e-01, 9.8375e-01, 9.3684e-01, 9.9998e-01, 3.4777e-01,
        8.0287e-02, 7.8010e-01, 9.9907e-01, 9.5035e-01, 3.0288e-01, 2.9452e-01,
        9.1218e-01, 9.8967e-01, 9.8519e-01, 8.1034e-01, 9.3482e-01, 9.8511e-43,
        9.7306e-01])
     Trace Shapes:          
      Param Sites:          
     Sample Sites:          
linear.weight dist    | 1 15
             value    | 1 15
  linear.bias dist    | 1   
             value    | 1   
sigma_squared dist 49 |     
             value 49 |     
Trace Shapes:
 Param Sites:
Sample Sites:

In [158]:
import probflow as pf
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
import tensorflow_probability as tfp

class CensoredSurvivalModel(pf.ContinuousModel):

    def __init__(self, d):
        self.layer = pf.Dense(d)

    def __call__(self, x):
        return tfd.Exponential(tf.nn.softplus(self.layer(x)))

    def log_likelihood(self, x, y):
        """If y>=0, that's the time to the observed event.
        If y<0, it has not yet been observed after -y time!"""

        # Predicted distributions
        dist = self(x)

        # Likelihoods of observed time-to-events
        obs_ll = dist.log_prob(y)[y>=0]

        # Likelihoods of events not yet observed
        non_ll = dist.log_survival_function(-y)[y<0]

        # Return the sum of log likelihoods
        return tf.reduce_sum(obs_ll) + tf.reduce_sum(non_ll)

NameError: name 'pf' is not defined