In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import tensorflow as tf
from tensorflow import keras

from tabularmaml import ClinicalDataModel, TabularMAML, generate_fake_datasets, oversample

In [None]:
# Load the prepared train/test splits (features and labels) from CSV.
# The first CSV column is used as the row index for every file.
X_train, y_train, X_test, y_test = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in ('X_train.csv', 'y_train.csv', 'X_test.csv', 'y_test.csv')
)

# Derive the synthetic training set from the real training split.
synthetic_train_x, synthetic_train_y = oversample(X_train, y_train)

# Bundle the synthetic, real-train and test splits into the data/model container.
# NOTE(review): argument order is positional — confirm against ClinicalDataModel's signature.
clinical_data_model = ClinicalDataModel(
    synthetic_train_x, synthetic_train_y,
    X_train, y_train,
    X_test, y_test,
)

# Wrap the container in the MAML driver, configured for 100 epochs.
maml = TabularMAML(clinical_data_model, num_epochs=100)

In [None]:
# Grab the model exposed by the MAML wrapper; later cells clone it, fit the
# clone, and also use `base_model` directly for prediction.
# NOTE(review): presumably `intermediate_model` carries the meta-learned
# weights — confirm in tabularmaml.
base_model = maml.intermediate_model
# Mark the whole model as trainable.
base_model.trainable = True

In [None]:
#base_model.trainable = True
#last_dense_layer = [it for it,layer in enumerate(base_model.layers) if isinstance(layer,keras.layers.Dense)][-2]
#for layer in base_model.layers[:last_dense_layer]:
#    layer.trainable = False

In [None]:
# Build the model that will be fine-tuned on the task-specific training data.
#
# keras.models.clone_model() only reproduces the architecture: the clone gets
# newly initialised weights, NOT the weights of `base_model`. Copy them across
# explicitly so training starts from the weights held by `base_model` rather
# than from scratch.
model = keras.models.clone_model(base_model)
model.set_weights(base_model.get_weights())

# Binary classification set-up. Each confusion-matrix count is tracked exactly
# once (the original list registered FalseNegatives twice, which only produced
# a redundant `false_negatives_1` column in the logs).
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
              loss=tf.keras.losses.BinaryCrossentropy(),
              metrics=[tf.keras.metrics.BinaryAccuracy(),
                       tf.keras.metrics.AUC(),
                       tf.keras.metrics.Precision(),
                       tf.keras.metrics.Recall(),
                       tf.keras.metrics.TruePositives(),
                       tf.keras.metrics.TrueNegatives(),
                       tf.keras.metrics.FalsePositives(),
                       tf.keras.metrics.FalseNegatives(),
                       ])

In [None]:
# Fit the compiled model on the task-specific training split for 500 epochs.
#
# Only non-default arguments are spelled out (plus shuffle=True for clarity).
# `max_queue_size`, `workers` and `use_multiprocessing` were dropped: they were
# set to their defaults and are no longer accepted by Model.fit in Keras 3,
# where passing them raises a TypeError.
#
# NOTE(review): the test split is passed as validation data. No callback acts
# on it here, so it is used for monitoring only — keep it that way, or switch
# to a dedicated validation split before adding early stopping / checkpoints.
model.fit(
    x=maml.clinical_data.specific_train_x,
    y=maml.clinical_data.specific_train_y,
    epochs=500,
    validation_data=(maml.clinical_data.test_x, maml.clinical_data.test_y),
    shuffle=True,
)

In [None]:
# Report the loss and the compiled metrics of the fitted model on the test split.
model.evaluate(
    maml.clinical_data.test_x,
    maml.clinical_data.test_y,
)

In [None]:
# Plot the loss values recorded by the MAML wrapper, one point per entry of
# `maml.loss_per_epoch`. Each entry is converted with `.numpy()` first.
# (numpy / seaborn are imported in the notebook's top import cell.)
data = np.array([epoch_loss.numpy() for epoch_loss in maml.loss_per_epoch])
ax = sns.lineplot(x=range(len(data)), y=data)
# Label the figure so it can be read on its own; the trailing `;` hides the repr.
ax.set(xlabel='epoch', ylabel='loss', title='MAML loss per epoch');

In [None]:
# Build the per-row prediction table consumed by the LEEP cells below.
#
# Columns produced:
#   source_labels  - the `clinical_benefit` label read from y_test.csv
#   probability=1  - first output column of model.predict(X_test)
#   probability=0  - 1 - probability=1
#   target_labels  - probability=1 rounded to 0/1
#   pair_str       - two-character string: source label followed by target label
#
# The labels are re-read from the same relative 'y_test.csv' that the loading
# cell uses (instead of a hard-coded home-directory path), so the labels always
# belong to the X_test that is being scored. Reading into a fresh frame also
# avoids aliasing `y_test`, which the original code then mutated in place.
prediction_df = (
    pd.read_csv('y_test.csv', index_col=0)
    .reset_index(drop=True)  # 0..n-1 index; later cells address rows by position
    .rename(columns={'clinical_benefit': 'source_labels'})
)
n = prediction_df.shape[0]

# Assign the raw array (not an index-aligned Series): a length mismatch between
# the predictions and the labels now raises instead of silently producing NaN.
prediction_df['probability=1'] = model.predict(X_test)[:, 0]
prediction_df['probability=0'] = 1 - prediction_df['probability=1']
prediction_df['target_labels'] = prediction_df['probability=1'].round(0)
prediction_df['pair_str'] = (
    prediction_df['source_labels'].astype(int).astype(str)
    + prediction_df['target_labels'].astype(int).astype(str)
)
prediction_df

In [None]:
# Compute the empirical joint distribution P(y, z) used by LEEP
# (Nguyen et al., 2020):
#
#     P(y, z) = (1/n) * sum over rows i with label y_i == y of theta(x_i)_z
#
# Keys are two-character strings "yz": the first character is the row's
# `source_labels` value (y), the second is the model output class z whose
# probability column `probability=z` is summed.
#
# The sum runs over ALL rows carrying label y. The previous version additionally
# filtered on the hard-predicted label and summed the probability of y rather
# than z, which does not match the definition (and does not sum to 1).
joint_dist_dict = dict()
label_set = ['0', '1']
conditional_pairs = ['00', '01', '10', '11']
true_label_str = prediction_df['source_labels'].astype(int).astype(str)
for pair in conditional_pairs:
    true_label, model_label = pair
    has_true_label = true_label_str == true_label
    joint_dist_dict[pair] = prediction_df.loc[has_true_label, 'probability={}'.format(model_label)].sum()/n

In [None]:
# Empirical marginal distribution: for every label, add up the joint entries
# whose pair string ends with that label (i.e. marginalise over the first
# character of the pair).
marginal_dist_dict = {
    label: sum(joint_dist_dict[pair_key]
               for pair_key in conditional_pairs
               if pair_key[-1] == label)
    for label in label_set
}

In [None]:
# Empirical conditional distribution: each joint entry divided by the marginal
# of the label given by the LAST character of its pair string.
conditional_dist = {}
for pair_key, joint_prob in joint_dist_dict.items():
    conditional_dist[pair_key] = joint_prob / marginal_dist_dict[pair_key[-1]]

In [None]:
# Display the conditional distribution; its keys are the same two-character
# pair strings used as keys of joint_dist_dict.
conditional_dist

In [None]:
# LEEP score: the mean over rows of
#
#     log( sum over z in label_set of  conditional_dist[y_i + z] * probability=z )
#
# where y_i is the row's `source_labels` value as a string. Computed column-wise
# instead of materialising every row with `.loc[i]` inside a Python loop.
true_label_str = prediction_df['source_labels'].astype(int).astype(str)
expected_prob = sum(
    (true_label_str + z).map(conditional_dist) * prediction_df['probability={}'.format(z)]
    for z in label_set
)
outer_sum = np.log(expected_prob).sum()
# "From its definition, the LEEP measure is always negative and larger values
# (i.e., smaller absolute values) indicate better transferability."
# LEEP scores tend to range from (-4, -0.5).
leep_value = outer_sum/n
leep_value

In [None]:
# Hard 0/1 predictions of `base_model` on X_test: take the first output value
# of every prediction row and round it to the nearest integer value.
base_hard_labels = [np.round(row[0], 0) for row in base_model.predict(X_test)]
pd.Series(base_hard_labels, name='clinical_benefit')