In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
import sys
sys.path.append('/home/ruyogagp/medical_interpretability')
import numpy as np
from pysurvival.models import BaseModel
from pysurvival import utils
import scipy
import pandas as pandas
import copy
import random
from sklearn.preprocessing import StandardScaler
from source.utils import create_correlated_var
from pysurvival.models.simulations import SimulationModel
from lifelines import CoxPHFitter
import pandas as pd
from sklearn.model_selection import train_test_split
import networkx as nx
from cga import cga
from itertools import cycle
import tqdm as tqdm
import matplotlib.pyplot as plt
import seaborn as sns
from typing import Optional, Callable, TypeVar

In [4]:
# Output directory used by the cells below that write CSV files via DATA_DIRECTORY.
# NOTE(review): absolute path on a shared analysis volume — presumably only
# valid on the original host; confirm before re-running elsewhere.
DATA_DIRECTORY = '/data/analysis/ag-reils/ag-reils-shared/cardioRS/data/interpretability/resample_multiplicities'

## Helper Functions

In [5]:
def fit_coxph(df):
    """
    Fit a lifelines `CoxPHFitter` on `df` and print the fitted summary.

    :param df: dataframe with the covariates plus a `time` column (used as
        duration) and an `event` column (used as event indicator)
    :return: None; the summary is printed
    """
    model = CoxPHFitter()
    model.fit(df, duration_col='time', event_col='event')
    model.print_summary()

def fit_coxph_norm(df):
    """
    Standardise the covariates of `df`, fit a Cox model and print its summary.

    Only the covariate columns are scaled; `time` and `event` are passed to
    the fitter unchanged. (The previous version used `pass` instead of
    `continue` and therefore standardised the duration and event columns too.)
    Scaling is done on a copy, so the caller's dataframe is left untouched.

    :param df: dataframe with covariates plus `time` and `event` columns
    :return: None; the summary is printed
    """
    scaled = df.copy()
    covariates = [col for col in scaled.columns if col not in ('time', 'event')]
    if covariates:
        # StandardScaler works column-wise, so one call is equivalent to
        # fitting a scaler on each covariate separately.
        scaled[covariates] = StandardScaler().fit_transform(scaled[covariates])
    cph = CoxPHFitter()
    cph.fit(scaled, 'time', 'event')
    cph.print_summary()


def save_orig(df, name, output_dir):
    """
    Split `df` 70/30 into train/validation parts and write both as CSV.

    :param df: dataframe to split
    :param name: file-name prefix
    :param output_dir: directory receiving `{name}_train.csv` and `{name}_valid.csv`
    :return: None; the written paths are printed
    """
    train, valid = train_test_split(df, test_size=0.3)
    train_path = f"{output_dir}/{name}_train.csv"
    valid_path = f"{output_dir}/{name}_valid.csv"
    train.to_csv(train_path, index=False)
    valid.to_csv(valid_path, index=False)
    for written_path in (train_path, valid_path):
        print(f"Saved {written_path}")

def df2csv(
        df: pd.DataFrame,
        name: str,
        output_dir: str,
):
    """
    Split `df` 70/30 and write four CSV files.

    `{name}_train_details.csv` / `{name}_valid_details.csv` contain every
    column of the split. `{name}_train.csv` / `{name}_valid.csv` contain only
    the `x_orig`, `y_orig`, `time_orig` and `event_orig` columns, renamed to
    `x`, `y`, `time` and `event`.
    """
    train, valid = train_test_split(df, test_size=0.3)
    train.to_csv(f"{output_dir}/{name}_train_details.csv", index=False)
    valid.to_csv(f"{output_dir}/{name}_valid_details.csv", index=False)

    # Mapping from the "*_orig" detail columns to the plain model-input names.
    short_names = {
        'x_orig': 'x',
        'y_orig': 'y',
        'time_orig': 'time',
        'event_orig': 'event',
    }
    selected = list(short_names)
    train_df = train.loc[:, selected].rename(columns=short_names)
    valid_df = valid.loc[:, selected].rename(columns=short_names)

    train_df.to_csv(f"{output_dir}/{name}_train.csv", index=False)
    valid_df.to_csv(f"{output_dir}/{name}_valid.csv", index=False)

## Simulation Model with correlations

In [6]:
class SimulationModelWithCorrelations(SimulationModel):
    """
    Subclasses `SimulationModel` to generate survival data from the feature
    columns of an existing dataframe.
    """

    def generate_data(self,
                      df: pd.DataFrame,
                      feature_weights: list,
                      feature_names: list,
                      include_hazard: bool = False
                      ):
        """
        Simulate survival times and event indicators for the rows of `df`.

        :param df: dataframe containing (at least) the columns in `feature_names`
        :param feature_weights: one weight per entry of `feature_names`; the
            risk score is the dot product of the scaled features and these weights
        :param feature_names: columns of `df` that are used as features
        :param include_hazard: if True, the risk score is stored in a `hazard` column
        :return: deep copy of `df` with added `time` and `event` (and optionally
            `hazard`) columns; the same object is kept in `self.dataset`
        """

        def risk_function(x_std, feature_weights):
            """ Calculating the risk function based on the given risk type """

            # Dot product
            risk = np.dot(x_std, feature_weights)

            # Choosing the type of risk
            risk_type = self.risk_type.lower()
            if risk_type == 'square':
                risk = np.square(risk * self.risk_parameter)
            elif risk_type == 'gaussian':
                risk = np.square(risk)
                risk = np.exp(-risk * self.risk_parameter)

            # 'linear' (and any unrecognised type) keeps the raw dot product
            return risk.reshape(-1, 1)

        input_data = df.loc[:, feature_names].to_numpy()
        num_samples = input_data.shape[0]
        # `self.scaler` is not defined in this class — presumably provided by
        # the `SimulationModel` base class.
        X_std = self.scaler.fit_transform(input_data)
        BX = risk_function(X_std, feature_weights)

        # Building the survival times
        T = self.time_function(BX)
        # Censoring times: normal around `censored_parameter`, clipped at zero
        C = np.random.normal(loc=self.censored_parameter, scale=5, size=num_samples)
        C = np.maximum(C, 0.0)
        time = np.minimum(T, C)
        # The event is observed (1.0) when the survival time was not censored
        E = 1.0 * (T == time)

        # Building dataset (single deep copy; the earlier redundant copy that
        # was immediately overwritten has been removed)
        self.dataset = copy.deepcopy(df)
        self.dataset['time'] = time
        self.dataset['event'] = E
        if include_hazard:
            self.dataset['hazard'] = BX

        # Building the time axis and time buckets
        self.times = np.linspace(0.0, max(self.dataset["time"]), self.bins)
        self.get_time_buckets()

        # Building baseline functions
        self.baseline_hazard = self.hazard_function(self.times, 0)
        self.baseline_survival = self.survival_function(self.times, 0)

        # Printing summary message
        message_to_print = "Number of data-points: {} - Number of events: {}"
        print(message_to_print.format(num_samples, sum(E)))
        return self.dataset

## Correlation Case Graph

In [7]:
@cga.node
def correlation_000(x: float, noise: float) -> float:
    """
    Build a variable from `x` with `create_correlated_var` (r=0.000) and add noise.

    :param x: existing data to correlate with
    :param noise: noise added to the generated variable
    :return: result of `create_correlated_var`, called with the mean and
        standard deviation of `x`, plus `noise`
    """
    settings = dict(mu=np.mean(x), sd=np.std(x), empirical=True, r=0.000)
    correlated = create_correlated_var(x, **settings)
    return correlated + noise

@cga.node
def correlation_025(x: float, noise: float) -> float:
    """
    Build a variable from `x` with `create_correlated_var` (r=0.250) and add noise.

    :param x: existing data to correlate with
    :param noise: noise added to the generated variable
    :return: result of `create_correlated_var`, called with the mean and
        standard deviation of `x`, plus `noise`
    """
    settings = dict(mu=np.mean(x), sd=np.std(x), empirical=True, r=0.250)
    correlated = create_correlated_var(x, **settings)
    return correlated + noise

@cga.node
def correlation_050(x: float, noise: float) -> float:
    """
    Build a variable from `x` with `create_correlated_var` (r=0.500) and add noise.

    :param x: existing data to correlate with
    :param noise: noise added to the generated variable
    :return: result of `create_correlated_var`, called with the mean and
        standard deviation of `x`, plus `noise`
    """
    settings = dict(mu=np.mean(x), sd=np.std(x), empirical=True, r=0.500)
    correlated = create_correlated_var(x, **settings)
    return correlated + noise

@cga.node
def correlation_075(x: float, noise: float) -> float:
    """
    Build a variable from `x` with `create_correlated_var` (r=0.750) and add noise.

    :param x: existing data to correlate with
    :param noise: noise added to the generated variable
    :return: result of `create_correlated_var`, called with the mean and
        standard deviation of `x`, plus `noise`
    """
    settings = dict(mu=np.mean(x), sd=np.std(x), empirical=True, r=0.750)
    correlated = create_correlated_var(x, **settings)
    return correlated + noise

@cga.node
def sample_random_normal(noise: float) -> float:
    """
    Draw 100 standard-normal values and add `noise` to them.

    :param noise: noise variable added to the draws
    :return: the 100 random normal draws plus `noise`
    """
    draws = np.random.normal(size=100)
    return draws + noise

@cga.node
def correlation_coefficient(coeff:float) -> float:
    """
    Identity node: returns `coeff` unchanged.

    :param coeff: value to pass through
    :return: `coeff`
    """
    return coeff

class CorrelationCaseGraph(cga.Graph):
    """
    Graph with one base feature (`feature0`) and four features derived from it
    through the `correlation_000` ... `correlation_075` nodes, each with its
    own noise node.
    """

    def __init__(self):
        """
        Build the noise and feature nodes of the correlation case graph.

        Every noise node draws 100 normal values (scale 0.1); the graph is
        registered with the five feature nodes.
        """
        noise = cga.node(lambda: np.random.normal(scale=0.1, size=100))
        rnorm_vector = cga.node(lambda: np.random.normal(size=100))
        # `rnorm` is created but not wired into any feature of this graph.
        self.rnorm = rnorm_vector(name="rnorm")
        self.noise0 = noise(name="noise0")
        self.noise1 = noise(name="noise1")
        self.noise2 = noise(name="noise2")
        self.noise3 = noise(name="noise3")
        self.noise4 = noise(name="noise4")
        self.feature0 = sample_random_normal(self.noise0, name='feature0')
        self.feature1 = correlation_000(self.feature0, self.noise1, name='feature1')
        self.feature2 = correlation_025(self.feature0, self.noise2, name='feature2')
        self.feature3 = correlation_050(self.feature0, self.noise3, name='feature3')
        self.feature4 = correlation_075(self.feature0, self.noise4, name='feature4')
        super().__init__([self.feature0, self.feature1, self.feature2, self.feature3, self.feature4])

    def get_interventions(self,
                          sim: SimulationModelWithCorrelations,
                          n_iterations: int,
                          feature_weights: list,
                          ) -> tuple:
        """
        Resample each noise node `n_iterations` times and simulate survival data.

        :param sim: simulation model used to generate `time`, `event` and `hazard`
        :param n_iterations: number of `sample_do` calls per noise node
        :param feature_weights: weights passed on to `sim.generate_data`
        :return: `(attribution_dfs, intervention_df)` — one attribution
            dataframe per modified noise node, and the full table of original
            and intervened values
        """
        # Column name -> accumulated values. Each value list is created fresh
        # here; previously the first sampled `row` itself was reused as the
        # accumulator, so extending it with its own lists duplicated the
        # first sample.
        columns = {}
        for node in [self.noise0, self.noise1, self.noise2, self.noise3, self.noise4]:
            for _ in tqdm.trange(n_iterations, desc=f"Intervention {node.name}"):
                # resample noise
                orig, interventions, *_ = self.sample_do(action=cga.Resample(node), n_samples=100)
                row = {'modified_attribute': [node.name] * 100}
                # add orig + do to the dictionary
                row.update({
                    n.name + "_orig": v
                    for n, v in orig.items()
                })
                for idx, intervention in enumerate(interventions):
                    row.update({
                        n.name + f"_intervention{idx}": v
                        for n, v in intervention.items()})

                for key, value in row.items():
                    values = value.tolist() if isinstance(value, np.ndarray) else value
                    columns.setdefault(key, []).extend(values)
        intervention_df = pd.DataFrame(columns)

        orig_cols = ['feature0_orig', 'feature1_orig', 'feature2_orig', 'feature3_orig', 'feature4_orig']
        modified_attributes = [f'noise{i}' for i in range(len(orig_cols))]
        orig_df = sim.generate_data(intervention_df, feature_names=orig_cols,
                                    feature_weights=feature_weights,
                                    include_hazard=True)

        # split by modified attribute, to get the input data for attribution
        attribution_dfs = self.slice_dataframe(orig_df, modified_attributes)

        intervention_df['hazard_orig'] = orig_df.hazard
        intervention_df['event_orig'] = orig_df.event
        intervention_df['time_orig'] = orig_df.time

        return attribution_dfs, intervention_df

    def slice_dataframe(self, orig_df, modified_attributes):
        """
        Split `orig_df` into one model-input dataframe per modified attribute.

        :param orig_df: dataframe with a `modified_attribute` column, the five
            `feature*_orig` columns and `time` / `event`
        :param modified_attributes: attribute names to select, in output order
        :return: list of dataframes with columns `feature0`..`feature4`, `time`, `event`
        """
        selected = ['feature0_orig', 'feature1_orig', 'feature2_orig',
                    'feature3_orig', 'feature4_orig', 'time', 'event']
        renamed = {f'feature{i}_orig': f'feature{i}' for i in range(5)}
        df_list = []
        for modified_attribute in modified_attributes:
            rows = orig_df.loc[orig_df.modified_attribute == modified_attribute]
            df_list.append(rows.loc[:, selected].rename(columns=renamed))
        return df_list

    def test_intervention(self, n_iterations):
        """
        Repeatedly resample `noise0` and return the result of the last call.

        The previous version iterated over `self.noise_x`, an attribute this
        class never defines, so it raised AttributeError.
        NOTE(review): the three-way unpacking assumes `sample_do` without
        `n_samples` returns exactly three items — confirm against `cga`.
        """
        for node in [self.noise0]:
            for _ in tqdm.trange(n_iterations, desc=f"Intervention {node.name}"):
                # resample noise
                orig, intervention0, intervention1 = self.sample_do(action=cga.Resample(node))
        return orig, intervention0, intervention1

## Sample from Graph

In [None]:
# Sample features
correlation_graph = CorrelationCaseGraph()
# Accumulate into fresh lists. Previously `data` was bound to the first
# `result` itself, so extending it with its own values stored the first
# sample twice.
data = {}
for _ in tqdm.trange(100, desc='sampling'):
    result = correlation_graph.sample()
    for key in result.keys():
        values = result[key]
        if isinstance(values, np.ndarray):
            values = values.tolist()
        data.setdefault(key, []).extend(values)

# Keep only the feature nodes; the noise nodes are not model inputs
for noise_node in (correlation_graph.noise0,
                   correlation_graph.noise1,
                   correlation_graph.noise2,
                   correlation_graph.noise3,
                   correlation_graph.noise4):
    del data[noise_node]

# Generate data
training_features = pd.DataFrame(data)
sim = SimulationModelWithCorrelations(risk_type='linear', alpha=1.0, beta=5.0, censored_parameter=5.0, survival_distribution='weibull')
feature_weights = [np.log(2), np.log(1.5), np.log(1.5), np.log(1.5), np.log(1.5)]
# The columns of `training_features` are keyed by the graph nodes themselves
feature_names = [correlation_graph.feature0, correlation_graph.feature1, correlation_graph.feature2, correlation_graph.feature3, correlation_graph.feature4]
training_df = sim.generate_data(training_features, feature_weights=feature_weights, feature_names=feature_names)

# Check correlations
training_df.corr()

In [None]:
# Correlation matrix of the simulated training data (same call as the last
# line of the previous cell).
training_df.corr()

## Save Training Data

In [None]:
# Reuse the notebook-wide output directory instead of re-typing the path
directory = DATA_DIRECTORY
save_orig(training_df, name='multi_correlation', output_dir=directory)

## Resample Features

In [None]:
# Resample every noise node of a fresh graph and simulate survival data
correlation_graph = CorrelationCaseGraph()
feature_weights = [np.log(2)] + [np.log(1.5)] * 4
sim = SimulationModelWithCorrelations(
    risk_type='linear',
    alpha=1.0,
    beta=5.0,
    censored_parameter=5.0,
    survival_distribution='weibull',
)
attribution_dfs, intervention_details = correlation_graph.get_interventions(
    sim=sim,
    n_iterations=30,
    feature_weights=feature_weights,
)

## Save Attribution Data

In [None]:
# File-name prefix for the attribution CSVs written in the next cell.
# NOTE(review): this uses a hyphen, while the training split above was saved
# with name='multi_correlation' (underscore) — confirm the mismatch is intended.
experiment_name = 'multi-correlation'

In [None]:
# One CSV per modified noise node, plus the full intervention table
for idx, df in enumerate(attribution_dfs):
    feature_path = f'{directory}/{experiment_name}_attribute_feature{idx}.csv'
    df.to_csv(feature_path, index=False)
details_path = f'{directory}/{experiment_name}_attribute_details.csv'
intervention_details.to_csv(details_path, index=False)

In [None]:
# Display the combined table of original and intervened values.
intervention_details

In [None]:
# Read back the first attribution file written above to inspect it.
pd.read_csv(f'{directory}/{experiment_name}_attribute_feature0.csv')

# Simpson's Paradox Graph

In [8]:
def get_simpsons_paradox(
        p: float = 2,
        q: float = 1,
        n: int = 500,
        n_groups: int = 5,
):
    """
    Sample `n` (x, y) points that exhibit Simpson's paradox.

    Every point gets an integer group offset `k` in `[0, n_groups)` that is
    added to both coordinates, a shared standard-normal `scaling` term
    (multiplied by cos(p/q) for x and sin(p/q) for y) and independent
    normal noise with scale 0.25.

    :param p: numerator of the angle `p / q`
    :param q: denominator of the angle `p / q`
    :param n: number of points to draw
    :param n_groups: number of groups; previously ignored in favour of a
        hard-coded 5, which is still the default
    :return: tuple `(x, y)` of arrays with shape `(n,)`
    """
    k = np.random.choice(n_groups, size=n)
    scaling = np.random.normal(size=n)

    noise_x = np.random.normal(scale=0.25, size=n)
    noise_y = np.random.normal(scale=0.25, size=n)
    y = scaling * np.sin(p / q) + k + noise_y
    x = scaling * np.cos(p / q) + k + noise_x
    return x, y


T = TypeVar("T")


def ifnone(maybe: Optional[T], default: T) -> T:
    """Return `default` when `maybe` is None, otherwise return `maybe`."""
    return default if maybe is None else maybe


@cga.node
def simpson_x(
        scaling: float,
        group: float,
        noise: float,
) -> float:
    """Return `scaling * cos(2)` shifted by the group offset plus `noise`."""
    projected = scaling * np.cos(2 / 1)
    return projected + group + noise


@cga.node
def simpson_y(
        scaling: float,
        group: float,
        noise: float,
) -> float:
    """Return `scaling * sin(2)` shifted by the group offset plus `noise`."""
    projected = scaling * np.sin(2 / 1)
    return projected + group + noise


@cga.node
def simpson_hazzard(
        scaling: float,
        group: float,
) -> float:
    """Return `scaling` for group 2 and `-scaling` for every other group."""
    signed = np.where(group == 2, scaling, -scaling)
    return signed.item()


class SimpsonsParadoxGraph(cga.Graph):
    """
    Graph producing `x`, `y` and `hazzard` from a shared `scaling` node, a
    `group` node and separate noise nodes for `x` and `y`.
    """

    def __init__(self):
        """Create the noise, group and scaling nodes and wire up x, y and hazzard."""
        # define functions
        noise = cga.node(lambda: np.random.normal(scale=0.27))
        get_group = cga.node(lambda: np.random.choice(5))
        get_scaling = cga.node(lambda: np.random.normal())

        self.noise_x = noise(name="noise_x")
        self.noise_y = noise(name="noise_y")
        self.group = get_group(name="group")

        self.scaling = get_scaling(name="scaling")

        self.x = simpson_x(self.scaling, self.group, self.noise_x, name="x")
        self.y = simpson_y(self.scaling, self.group, self.noise_y, name="y")

        self.hazzard = simpson_hazzard(self.scaling, self.group, name="hazzard")
        super().__init__([self.x, self.y, self.hazzard])

    @staticmethod
    def _attribute_frame(df, modified_attribute):
        """Select the rows of one modified attribute as an (x, y, time, event) frame."""
        return df.loc[df.modified_attribute == modified_attribute]\
            .loc[:, ['x_orig', 'y_orig', 'time_orig', 'event_orig']]\
            .rename(columns=dict(x_orig='x', y_orig='y', time_orig='time', event_orig='event'))

    def get_interventions(self, g, sim, n_samples):
        """
        Resample `noise_x` and `noise_y` of `g` and simulate survival data.

        :param g: graph whose noise nodes are resampled and sampled from
        :param sim: simulation model; its `generate_data` is called with
            `hazzard_column='hazzard_orig'`
        :param n_samples: number of `sample_do` calls per noise node
        :return: `(xdf, ydf, df)` — attribution frames for the `noise_x` and
            `noise_y` rows, and the full table of original and intervened values
        """
        data = []
        for node in [g.noise_x, g.noise_y]:
            for _ in tqdm.auto.trange(n_samples,
                                      desc=f"Intervention {node.name}"):
                orig, interventions = g.sample_do(
                    action=cga.Resample(node),
                    n_samples=100,
                )
                row = {'modified_attribute': node.name}
                row.update({
                    n.name + "_orig": v
                    for n, v in orig.items()
                })
                for idx, intervention in enumerate(interventions):
                    row.update({
                        n.name + f"_intervention{idx}": v
                        for n, v in intervention.items()})
                data.append(row)
        # Build the frame once, after all nodes were processed (it used to be
        # rebuilt inside the node loop).
        df = pd.DataFrame(data)
        sim_df = sim.generate_data(df, hazzard_column='hazzard_orig')
        df['event_orig'] = sim_df.event
        df['time_orig'] = sim_df.time

        # separate x and y to build attribute dataframe
        xdf = self._attribute_frame(df, 'noise_x')
        ydf = self._attribute_frame(df, 'noise_y')

        return xdf, ydf, df

    def transform(
            self,
            dataset_row: pd.Series,
            set_values: Optional[dict] = None,
            replace_nodes: Optional[dict] = None,
    ) -> pd.Series:
        """
        Map one dataset row onto the graph and return the sampled x / y.

        `scaling` is fixed to the row's `predictive0` value and `group` to the
        bin index of `nonpredictive0`.

        :param dataset_row: row with `predictive0`, `nonpredictive0`, `event`, `time`
        :param set_values: extra node values to fix; they override the values
            derived from the row (this argument used to be ignored)
        :param replace_nodes: forwarded to `self.sample(replace=...)`;
            defaults to an empty dict
        :return: series with index `x`, `y`, `event`, `time`
        """
        # NOTE(review): np.digitize with five bin edges yields indices 0..5,
        # while the `group` node draws from 0..4 — confirm this is intended.
        fixed_values = {
            self.scaling: dataset_row["predictive0"],
            self.group: np.digitize(
                dataset_row["nonpredictive0"], [-1.5, -1, 0, 1, 1.5]
            ),
        }
        if set_values:
            fixed_values.update(set_values)
        result = self.sample(
            set_values=fixed_values,
            replace={} if replace_nodes is None else replace_nodes,
        )
        return pd.Series(
            index=["x", "y", "event", "time"],
            data=[
                result[self.x],
                result[self.y],
                dataset_row["event"],
                dataset_row["time"],
            ],
        )

## Simulation Model with Risk

In [9]:
class SimulationModelWithRisk(SimulationModel):
    """
    Subclasses `SimulationModel` to generate survival data from a risk
    column that is already present in the input dataframe.
    """

    def generate_data(
            self,
            dataframe: pd.DataFrame,
            hazzard_column="hazzard",
    ):
        """
        Simulate survival times and event indicators from a given risk column.

        Parameters:
        -----------

        * `dataframe`: **pd.DataFrame** --
            A pandas dataframe with a risk column.

        * `hazzard_column`: **str** *(default="hazzard")* --
            Name of the risk column.

        Returns:
        --------
        * dataset: pandas.DataFrame
            deep copy of `dataframe` with added `time` and `event` columns;
            the same object is stored in `self.dataset`
        """

        def risk_function(risk: np.ndarray) -> np.ndarray:
            # Transform the raw risk according to the configured risk type;
            # "linear" and unrecognised types keep the raw values.
            kind = self.risk_type.lower()
            if kind == "square":
                risk = np.square(risk * self.risk_parameter)
            elif kind == "gaussian":
                squared = np.square(risk)
                risk = np.exp(-squared * self.risk_parameter)
            return risk.reshape(-1, 1)

        num_samples = len(dataframe)
        raw_risk = np.array(dataframe[hazzard_column])
        BX = risk_function(raw_risk)

        # Survival times first, then censoring times (normal around
        # `censored_parameter`, clipped at zero)
        survival_times = self.time_function(BX)
        censoring_times = np.maximum(
            np.random.normal(loc=self.censored_parameter, scale=5, size=num_samples),
            0.0,
        )
        observed = np.minimum(survival_times, censoring_times)
        # 1.0 where the survival time was observed, 0.0 where it was censored
        events = 1.0 * (survival_times == observed)

        # Building dataset
        dataset = copy.deepcopy(dataframe)
        dataset["time"] = observed
        dataset["event"] = events
        self.dataset = dataset

        # Building the time axis and time buckets
        self.times = np.linspace(0.0, max(self.dataset["time"]), self.bins)
        self.get_time_buckets()

        # Building baseline functions
        self.baseline_hazard = self.hazard_function(self.times, 0)
        self.baseline_survival = self.survival_function(self.times, 0)

        # Printing summary message
        message_to_print = "Number of data-points: {} - Number of events: {}"
        print(message_to_print.format(num_samples, sum(events)))
        return self.dataset

In [None]:
# Build the Simpson's paradox graph and draw it with node names as labels
simpson_graph = SimpsonsParadoxGraph()
gnx = simpson_graph.to_networkx()
node_labels = {n: n.name for n in gnx.nodes}
node_positions = nx.layout.spring_layout(gnx, k=3)
nx.draw_networkx(gnx, labels=node_labels, pos=node_positions, node_size=2_000)

In [None]:
# Draw samples from the graph, one record per sample
n_samples = 10_000
data = []
for _ in tqdm.auto.trange(n_samples):
    result = simpson_graph.sample()
    record = {
        column: result[getattr(simpson_graph, column)]
        for column in ("x", "y", "hazzard", "group", "scaling")
    }
    record["hazzard"] = float(record["hazzard"])
    data.append(record)

# Simulate survival times from the sampled hazzard column
sim = SimulationModelWithRisk(
    risk_type='linear',
    alpha=1.0,
    beta=5.0,
    censored_parameter=5.0,
    survival_distribution='weibull',
)
df = pd.DataFrame(data)
df = sim.generate_data(df)

In [None]:
# Reuse the notebook-wide output directory instead of re-typing the path
directory = DATA_DIRECTORY
# Only the model inputs and the survival targets are saved for training
training_df = df.loc[:, ['x', 'y', 'time', 'event']]
save_orig(training_df, name='simpsons', output_dir=directory)

In [None]:
# Resample noise_x / noise_y 5000 times each; note that `df` is rebound here
# from the training table to the intervention details table.
xdf, ydf, df = simpson_graph.get_interventions(simpson_graph, sim, 5000)

In [None]:
# Write the two attribution frames and the full intervention table
simpson_outputs = (
    (xdf, f'{directory}/simpsons_attribute_x.csv'),
    (ydf, f'{directory}/simpsons_attribute_y.csv'),
    (df, f'{directory}/simpsons_attribute_details.csv'),
)
for frame, csv_path in simpson_outputs:
    frame.to_csv(csv_path, index=False)

In [None]:
# NOTE(review): `attribute_x` is not assigned in any visible cell of this
# notebook, so this presumably raises NameError on a fresh kernel — confirm
# whether the cell is a leftover and whether `xdf` was meant instead.
eval('attribute_x')

In [99]:
@cga.node
def relu(x: float, noise: float) -> float:
    """
    Rectified linear unit of `x` (zero for non-positive input) plus `noise`.
    """
    rectified = x * (x > 0)
    return rectified + noise

@cga.node
def tanh(x: float, noise: float) -> float:
    """
    Hyperbolic tangent of `x` plus `noise`.
    """
    squashed = np.tanh(x)
    return squashed + noise

@cga.node
def sigmoid(x: float, noise: float) -> float:
    """
    Logistic sigmoid of `x` plus `noise`.
    """
    denominator = 1 + np.exp(-x)
    return 1 / denominator + noise

@cga.node
def square(x: float, noise: float) -> float:
    """
    Square of `x` plus `noise`.
    """
    squared = np.square(x)
    return squared + noise

@cga.node
def sample_random_normal(noise: float) -> float:
    """
    Draw a single standard-normal value and add `noise`.

    NOTE(review): this rebinds the name `sample_random_normal`, which an
    earlier cell defines as a 100-value version; graphs constructed after
    this cell runs pick up this scalar version.

    :param noise: noise variable added to the draw
    :return: random normal draw plus `noise`
    """
    draw = np.random.normal()
    return draw + noise


class NonLinearCaseGraph(cga.Graph):
    """
    Graph with one base feature (`feature0`) and four features derived from it
    through the `relu`, `tanh`, `sigmoid` and `square` nodes, each with its
    own noise node.
    """

    def __init__(self):
        """
        Build the noise and feature nodes of the non-linear case graph.

        Every noise node draws one normal value (scale 0.1); the graph is
        registered with the five feature nodes.
        """
        noise = cga.node(lambda: np.random.normal(scale=0.1))
        self.noise0 = noise(name="noise0")
        self.noise1 = noise(name="noise1")
        self.noise2 = noise(name="noise2")
        self.noise3 = noise(name="noise3")
        self.noise4 = noise(name="noise4")
        self.feature0 = sample_random_normal(self.noise0, name='feature0')
        self.feature1 = relu(self.feature0, self.noise1, name='feature1')
        self.feature2 = tanh(self.feature0, self.noise2, name='feature2')
        self.feature3 = sigmoid(self.feature0, self.noise3, name='feature3')
        self.feature4 = square(self.feature0, self.noise4, name='feature4')
        super().__init__([self.feature0, self.feature1, self.feature2, self.feature3, self.feature4])

    # The `graph` annotation is quoted: the class name is not bound yet while
    # the class body executes, so the bare name raised NameError on a fresh
    # kernel.
    def get_interventions(self,
                          graph: "NonLinearCaseGraph",
                          sim: SimulationModelWithCorrelations,
                          n_samples: int,
                          feature_weights: list,
                          ) -> tuple:
        """
        Resample each noise node of `graph` and simulate survival data.

        :param graph: graph whose noise nodes are resampled and sampled from
        :param sim: simulation model used to generate `time`, `event` and `hazard`
        :param n_samples: number of `sample_do` calls per noise node
        :param feature_weights: weights passed on to `sim.generate_data`
        :return: `(attribution_dfs, intervention_df)` — one attribution
            dataframe per modified noise node, and the full table of original
            and intervened values
        """
        data = []
        for node in [graph.noise0, graph.noise1, graph.noise2, graph.noise3, graph.noise4]:
            for _ in tqdm.auto.trange(n_samples,
                                      desc=f"Intervention {node.name}"):
                orig, interventions = graph.sample_do(
                    action=cga.Resample(node),
                    n_samples=100,
                )
                row = {'modified_attribute': node.name}
                row.update({
                    n.name + "_orig": v
                    for n, v in orig.items()
                })
                for idx, intervention in enumerate(interventions):
                    row.update({
                        n.name + f"_intervention{idx}": v
                        for n, v in intervention.items()})
                data.append(row)
        intervention_df = pd.DataFrame(data)

        orig_cols = ['feature0_orig', 'feature1_orig', 'feature2_orig', 'feature3_orig', 'feature4_orig']
        modified_attributes = [f'noise{i}' for i in range(len(orig_cols))]
        orig_df = sim.generate_data(intervention_df, feature_names=orig_cols,
                                    feature_weights=feature_weights,
                                    include_hazard=True)

        # split by modified attribute, to get the input data for attribution
        attribution_dfs = self.slice_dataframe(orig_df, modified_attributes)

        intervention_df['hazard_orig'] = orig_df.hazard
        intervention_df['event_orig'] = orig_df.event
        intervention_df['time_orig'] = orig_df.time

        return attribution_dfs, intervention_df

    def slice_dataframe(self, orig_df, modified_attributes):
        """
        Split `orig_df` into one model-input dataframe per modified attribute.

        :param orig_df: dataframe with a `modified_attribute` column, the five
            `feature*_orig` columns and `time` / `event`
        :param modified_attributes: attribute names to select, in output order
        :return: list of dataframes with columns `feature0`..`feature4`, `time`, `event`
        """
        selected = ['feature0_orig', 'feature1_orig', 'feature2_orig',
                    'feature3_orig', 'feature4_orig', 'time', 'event']
        renamed = {f'feature{i}_orig': f'feature{i}' for i in range(5)}
        df_list = []
        for modified_attribute in modified_attributes:
            rows = orig_df.loc[orig_df.modified_attribute == modified_attribute]
            df_list.append(rows.loc[:, selected].rename(columns=renamed))
        return df_list

    def test_intervention(self, n_iterations):
        """
        Repeatedly resample `noise0` and return the result of the last call.

        The previous version iterated over `self.noise_x`, an attribute this
        class never defines, so it raised AttributeError.
        NOTE(review): the three-way unpacking assumes `sample_do` without
        `n_samples` returns exactly three items — confirm against `cga`.
        """
        for node in [self.noise0]:
            for _ in tqdm.trange(n_iterations, desc=f"Intervention {node.name}"):
                # resample noise
                orig, intervention0, intervention1 = self.sample_do(action=cga.Resample(node))
        return orig, intervention0, intervention1

In [106]:
# Sample features for training data, one record per graph sample
nonlinear_graph = NonLinearCaseGraph()
feature_names = [f'feature{i}' for i in range(5)]
data = []
for _ in tqdm.trange(10000, desc='sampling'):
    result = nonlinear_graph.sample()
    data.append({
        column: result[getattr(nonlinear_graph, column)]
        for column in feature_names
    })

# Generate survival data from the sampled features
training_features = pd.DataFrame(data)
sim = SimulationModelWithCorrelations(
    risk_type='linear',
    alpha=1.0,
    beta=5.0,
    censored_parameter=5.0,
    survival_distribution='weibull',
)
feature_weights = [np.log(ratio) for ratio in (2, 1.4, 1.3, 1.2, 1.1)]
training_df = sim.generate_data(
    training_features,
    feature_weights=feature_weights,
    feature_names=feature_names,
    include_hazard=False,
)

sampling: 100%|██████████| 10000/10000 [00:01<00:00, 6478.16it/s]

Number of data-points: 10000 - Number of events: 7830.0





In [107]:
# Write the 70/30 train/validation split of the nonlinear training data.
save_orig(training_df, name='nonlinear', output_dir=DATA_DIRECTORY)

Saved /data/analysis/ag-reils/ag-reils-shared/cardioRS/data/interpretability/resample_multiplicities/nonlinear_train.csv
Saved /data/analysis/ag-reils/ag-reils-shared/cardioRS/data/interpretability/resample_multiplicities/nonlinear_valid.csv


In [109]:
# Resample every noise node of the nonlinear graph with a fresh simulation model
sim = SimulationModelWithCorrelations(
    risk_type='linear',
    alpha=1.0,
    beta=5.0,
    censored_parameter=5.0,
    survival_distribution='weibull',
)
attribution_dfs, intervention_details = nonlinear_graph.get_interventions(
    graph=nonlinear_graph,
    sim=sim,
    n_samples=3000,
    feature_weights=feature_weights,
)

Intervention noise0:   0%|          | 0/3000 [00:00<?, ?it/s]

Intervention noise1:   0%|          | 0/3000 [00:00<?, ?it/s]

Intervention noise2:   0%|          | 0/3000 [00:00<?, ?it/s]

Intervention noise3:   0%|          | 0/3000 [00:00<?, ?it/s]

Intervention noise4:   0%|          | 0/3000 [00:00<?, ?it/s]

Number of data-points: 15000 - Number of events: 11823.0


In [110]:
# One CSV per modified noise node, plus the full intervention table
experiment_name = 'nonlinear'
for idx, df in enumerate(attribution_dfs):
    feature_path = f'{DATA_DIRECTORY}/{experiment_name}_attribute_feature{idx}.csv'
    df.to_csv(feature_path, index=False)
details_path = f'{DATA_DIRECTORY}/{experiment_name}_attribute_details.csv'
intervention_details.to_csv(details_path, index=False)