In [1]:
from pathlib import Path
import json
import datetime
import os

In [2]:
# Restrict CUDA to device 0. This cell runs before the `import tensorflow`
# cell — presumably so the restriction is already in place when TF starts.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

In [3]:
import tensorflow as tf
import pandas as pd
import numpy as np

In [4]:
from tpp_tensorflow.config import get_params_semisparse, get_params_sparse, parser
from tpp_tensorflow.datautils import get_tfrecords_input_fn
from tpp_tensorflow.models.semisparse import SemiSparseInput
from tpp_tensorflow.models.sparse import SparseInput
from tpp_tensorflow.models.dense import DenseHead
from tpp_tensorflow.utils import compute_mean_auc_fold, print_config, export_run_logs

from tpp_tensorflow.steps import train_step, eval_step

In [5]:
# Port for the TensorBoard server; the commented-out `%tensorboard` magic
# further down targets this port.
TENSORBOARD_PORT = 6711

In [6]:
# Input feature variant for this run. It selects the records directory, the
# parser sub-command and the extra CLI flags; the cells below handle
# "semisparse" and "sparse".
FEATURE_TYPE = "semisparse"

In [7]:
# Project data roots on two local mounts. DISK0 is used below for models and
# run logs, DISK1 for the TFRecords, label masks and metadata.
DATA_ROOT_DISK0 = Path("/local00/bioinf/tpp/")
DATA_ROOT_DISK1 = Path("/local01/bioinf/tpp/")

In [8]:
# Output locations, converted to POSIX strings because they are later
# interpolated into the command-line text handed to the argument parser.
MODEL_DIR = (DATA_ROOT_DISK0 / "models").as_posix()
RUN_LOG_DIR = (DATA_ROOT_DISK0 / "run_logs").as_posix()

In [9]:
# TFRecord files for the selected feature type (train and test).
RECORDS_TRAIN = (DATA_ROOT_DISK1 / f"runs/thesis_chembl25/records_{FEATURE_TYPE}/train.tfrecords").as_posix()
RECORDS_TEST = (DATA_ROOT_DISK1 / f"runs/thesis_chembl25/records_{FEATURE_TYPE}/test.tfrecords").as_posix()

In [10]:
# Parquet files holding the label masks that are later passed to the mean-AUC
# computation, one for the train records and one for the test records.
AUC_MASK_TRAIN_PATH = (DATA_ROOT_DISK1 / f"runs/thesis_chembl25/records_{FEATURE_TYPE}/label_mask_train.parquet/").as_posix()
AUC_MASK_TEST_PATH = (DATA_ROOT_DISK1 / f"runs/thesis_chembl25/records_{FEATURE_TYPE}/label_mask_test.parquet/").as_posix()

In [11]:
# Parquet file with the "common assays" mask; it is intersected with the test
# label mask before the final test evaluation.
COMMON_ASSAYS_MASK_PATH = Path("/publicdata/tpp/runs/thesis_chembl25/common_assays_mask.parquet").as_posix()

In [12]:
# Read the JSON metadata stored next to the records (label/feature sizes and
# train/test item counts).
metadata_path = DATA_ROOT_DISK1 / f"runs/thesis_chembl25/records_{FEATURE_TYPE}/metadata.json"
metadata = json.loads(metadata_path.read_text())

In [13]:
# Display the loaded metadata for inspection.
metadata

{'labels_size': 2193,
 'CATS2D_clean_size': 29,
 'SHED_clean_size': 3,
 'num_items_test': 189716,
 'num_items_train': 397969}

In [14]:
# Somewhat arbitrary number < nitems for train / val split
TRAIN_SET_SIZE = int(metadata["num_items_train"] * 0.6)
NUM_CLASSES = metadata["labels_size"]

if FEATURE_TYPE == "semisparse":
    CATS2D_SIZE = metadata["CATS2D_clean_size"]
    SHED_SIZE = metadata["SHED_clean_size"]
    
    custom_args = f"""
    --cats2d-size {CATS2D_SIZE} --shed-size {SHED_SIZE} 
    """
    
elif FEATURE_TYPE == "sparse":
    DFS8_SIZE = metadata["DFS8_clean_size"]
    ECFC4_SIZE = metadata["ECFC4_clean_size"]
    ECFC6_SIZE = metadata["ECFC6_clean_size"]
    
    FEATURE = "dfs8"
    FEATURE_SIZE = DFS8_SIZE
    EMBEDDING_SIZE = 2048
    
    custom_args = f"""
    --feature {FEATURE} --feature-size {FEATURE_SIZE} --embedding-size {EMBEDDING_SIZE} 
    --ecfc4-size {ECFC4_SIZE} --ecfc6-size {ECFC6_SIZE} --dfs8-size {DFS8_SIZE} 
    """

In [15]:
# Display the resulting number of training items.
TRAIN_SET_SIZE

238781

In [16]:
# Training hyperparameters. They are interpolated into the command-line text
# and reach the model code through the parsed arguments.
NUM_EPOCHS = 5
BATCH_SIZE = 64
# Dropout rates: one general rate and a separate rate for the input.
DROPOUT_RATE = 0.4
INPUT_DROPOUT_RATE = 0.2
ACTIVATION = "selu"
REG_L2_RATE = 0.01
# Initial learning rate and exponential-decay settings.
# NOTE(review): with TRAIN_SET_SIZE ~238k, batch size 64 and 5 epochs the run
# appears to take roughly 19k optimizer steps, far fewer than LR_DECAY_STEPS,
# so a staircase decay would never trigger — confirm this is intended.
LR = 0.1
LR_DECAY_STEPS = 500000
LR_DECAY_RATE = 0.96

In [17]:
# Flags shared by every feature type, written as command-line text. The first
# token is the feature type itself (the parser's positional/sub-command slot).
common_args = f"""
    {FEATURE_TYPE} 
    --model-dir {MODEL_DIR} 
    --run-log-dir {RUN_LOG_DIR} 
    --records-train {RECORDS_TRAIN} 
    --records-test {RECORDS_TEST} 
    --auc-mask-train-path {AUC_MASK_TRAIN_PATH} 
    --auc-mask-test-path {AUC_MASK_TEST_PATH} 
    --common-assays-mask-path {COMMON_ASSAYS_MASK_PATH} 
    --num-epochs {NUM_EPOCHS} --batch-size {BATCH_SIZE} --dropout-rate {DROPOUT_RATE} 
    --lr {LR} --lr-decay-steps {LR_DECAY_STEPS} --lr-decay-rate {LR_DECAY_RATE}
    --input-dropout-rate {INPUT_DROPOUT_RATE} --activation {ACTIVATION} --reg-l2-rate {REG_L2_RATE}
    --train-set-size {TRAIN_SET_SIZE} 
    """

# Append the feature-type specific flags and split on whitespace into
# argv-style tokens (so none of the values may contain spaces).
cli_tokens = (common_args + custom_args).split()
args = parser.parse_args(cli_tokens)

In [18]:
# Input-pipeline factory matching the run type.
input_fn = get_tfrecords_input_fn(args.run_type)

# Parameter builder matching the run type; anything other than "semisparse"
# falls through to the sparse builder.
if args.run_type == "semisparse":
    get_params = get_params_semisparse
else:
    get_params = get_params_sparse

In [19]:
# Arg Namespace to NamedTuple for autocomplete in IDE / Jupyter
rparam, hparam = get_params(args)

In [20]:
# Arguments common to all three datasets built from the train records: cached,
# split into train/validation parts at `train_set_size`, one pass per iteration.
split_kwargs = dict(
    cache=True,
    split_train_eval=True,
    train_set_size=rparam.train_set_size,
    num_epochs=1,
    batch_size=hparam.batch_size,
    rparams=rparam,
)

# Shuffled training part, used for the optimisation steps.
train_ds = input_fn(rparam.records_train, mode="train", shuffle=True, **split_kwargs)
# Validation part of the same records file.
val_ds = input_fn(rparam.records_train, mode="eval", **split_kwargs)

# Training part again, without the explicit shuffle flag; used only to compute
# the training mean AUC after each epoch.
train_ds_auc = input_fn(rparam.records_train, mode="train", **split_kwargs)

In [21]:
# Input network matching the feature type; any run type other than
# "semisparse" gets the sparse input model.
if args.run_type == "semisparse":
    input_model = SemiSparseInput(hparam)
else:
    input_model = SparseInput(hparam)

# Dense head built from the same hyperparameters.
output_model = DenseHead(hparam)

In [22]:
# (input model, output head) pair, passed as one argument to the train/eval
# steps and to the mean-AUC helper.
models = (input_model, output_model)

In [23]:
# Staircase exponential decay: the rate is multiplied by `lr_decay_rate` once
# every `lr_decay_steps` optimizer steps.
lr_scheduler = tf.keras.optimizers.schedules.ExponentialDecay(
    initial_learning_rate=hparam.lr,
    decay_steps=hparam.lr_decay_steps,
    decay_rate=hparam.lr_decay_rate,
    staircase=True,
)

# Plain SGD starting at the initial rate; the training loop overwrites
# `optimizer.learning_rate` from `lr_scheduler` before every step.
# `learning_rate=` replaces the deprecated `lr=` alias.
optimizer = tf.keras.optimizers.SGD(learning_rate=hparam.lr)

# Running metrics for the training steps: mean loss and accuracy.
# NOTE(review): `Accuracy` counts exact matches between predictions and
# labels — confirm the train/eval steps feed it thresholded predictions.
train_metrics = (
    tf.keras.metrics.Mean(name='train_loss'),
    tf.keras.metrics.Accuracy(name='train_accuracy'),
)
train_loss, train_accuracy = train_metrics

# Same pair of metrics for the validation pass.
val_metrics = (
    tf.keras.metrics.Mean(name='val_loss'),
    tf.keras.metrics.Accuracy(name='val_accuracy'),
)
val_loss, val_accuracy = val_metrics

In [24]:
# Root directory for the TensorBoard event files, inside the model directory.
logdir = Path(rparam.model_dir) / "logs"

In [25]:
#%load_ext tensorboard

In [26]:
#%tensorboard --logdir $logdir --port $TENSORBOARD_PORT

In [27]:
# NOTE(review): `run_id` is not used in the log paths below, so every run
# writes to the same "train"/"val" directories unless `model_dir` is already
# run-specific — confirm.
run_id = rparam.run_id

# Separate event-file directories for the training and validation curves.
train_log_dir = logdir / "train"
train_summary_writer = tf.summary.create_file_writer(train_log_dir.as_posix())

val_log_dir = logdir / "val"
val_summary_writer = tf.summary.create_file_writer(val_log_dir.as_posix())

In [28]:
# `key` column of the train label-mask file; passed to the mean-AUC helper
# (presumably the label indices to include — confirm against the helper).
label_mask_train = pd.read_parquet(rparam.auc_mask_train_path).key.values
# The validation fold is cut from the train records, so it uses the same mask.
# Copy the array rather than reading the same parquet file a second time; the
# two names still refer to independent arrays.
label_mask_val = label_mask_train.copy()

In [29]:
# Manual training loop. Each epoch: one pass over `train_ds`, mean AUC on the
# training part, one pass over `val_ds`, mean AUC on the validation part,
# TensorBoard scalars and a one-line console summary.
global_step = 0
for epoch in range(hparam.num_epochs):
    # Reset the accumulators at the START of the epoch. If an earlier
    # execution of this cell was interrupted mid-epoch, the partial sums it
    # left behind would otherwise leak into the first epoch of the next run.
    # After the loop the metrics keep the values of the last epoch.
    train_loss.reset_states()
    val_loss.reset_states()
    train_accuracy.reset_states()
    val_accuracy.reset_states()

    for features, labels in train_ds:
        # Apply the scheduled learning rate for the current optimizer step.
        optimizer.learning_rate = lr_scheduler(global_step)
        train_step(features, labels, models, train_metrics, optimizer, hparam)
        global_step += 1

    # Mean AUC on the (non-shuffled) training part with the current weights.
    train_mean_auc, _ = compute_mean_auc_fold(models, train_ds_auc, eval_step, hparam, label_mask_train)

    with train_summary_writer.as_default():
        tf.summary.scalar('loss', train_loss.result(), step=epoch)
        tf.summary.scalar('accuracy', train_accuracy.result(), step=epoch)
        tf.summary.scalar("mean_auc", train_mean_auc, step=epoch)

    # Validation loss/accuracy accumulate in `val_metrics` during this pass.
    for features, labels in val_ds:
        eval_step(features, labels, models, val_metrics, hparam)

    val_mean_auc, _ = compute_mean_auc_fold(models, val_ds, eval_step, hparam, label_mask_val)

    with val_summary_writer.as_default():
        tf.summary.scalar('loss', val_loss.result(), step=epoch)
        tf.summary.scalar('accuracy', val_accuracy.result(), step=epoch)
        tf.summary.scalar("mean_auc", val_mean_auc, step=epoch)

    template = 'Epoch {0}, Loss: {1:.5g}, Accuracy: {2:.5g}, meanAUC: {3:.8g}, Val Loss: {4:.5g}, Val Accuracy: {5:.5g}, Val meanAUC: {6:.8g}'
    print(template.format(
        epoch,
        train_loss.result(),
        train_accuracy.result(),
        train_mean_auc,
        val_loss.result(),
        val_accuracy.result(),
        val_mean_auc))

Epoch 0, Loss: 0.32434, Accuracy: 0.86412, meanAUC: 0.53704286, Val Loss: 0.29737, Val Accuracy: 0.87284, Val meanAUC: 0.52347714


ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "/system/user/ruch/miniconda3/envs/tpp_tf_v2.0.0_py3.7/lib/python3.7/site-packages/IPython/core/interactiveshell.py", line 3326, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-29-8596fee39612>", line 5, in <module>
    train_step(features, labels, models, train_metrics, optimizer, hparam)
  File "/system/user/ruch/miniconda3/envs/tpp_tf_v2.0.0_py3.7/lib/python3.7/site-packages/tensorflow_core/python/eager/def_function.py", line 457, in __call__
    result = self._call(*args, **kwds)
  File "/system/user/ruch/miniconda3/envs/tpp_tf_v2.0.0_py3.7/lib/python3.7/site-packages/tensorflow_core/python/eager/def_function.py", line 487, in _call
    return self._stateless_fn(*args, **kwds)  # pylint: disable=not-callable
  File "/system/user/ruch/miniconda3/envs/tpp_tf_v2.0.0_py3.7/lib/python3.7/site-packages/tensorflow_core/python/eager/function.py", line 1823, in __call__
    return graph_function._filtered_cal

KeyboardInterrupt: 

In [30]:
# Export the run's parameters (run settings and hyperparameters) to the
# run-log directory via the project helper.
export_run_logs(args.run_type, args.run_log_dir, rparam, hparam)

In [31]:
# `key` column of the test label-mask file, as a NumPy array.
label_mask_test = pd.read_parquet(rparam.auc_mask_test_path).key.values

In [32]:
# Table describing the common assays; its `assay_id` column is used to
# restrict the test label mask.
common_assays_mask = pd.read_parquet(rparam.common_assays_mask_path)

In [33]:
# Keep only the test-mask entries that are also positions of non-zero
# `assay_id` values in the common-assays table.
# `np.intersect1d` returns a sorted, de-duplicated ndarray, so the mask keeps
# the same type as the train/validation masks (the earlier `set` version
# changed it to an unordered set) and re-running this cell is idempotent.
common_assay_positions = np.nonzero(common_assays_mask.assay_id.values)[0]
label_mask_test = np.intersect1d(label_mask_test, common_assay_positions)

In [34]:
# Test pipeline: the whole test records file, no caching and no
# train/validation split, a single pass.
test_ds = input_fn(
    rparam.records_test,
    mode="eval",
    cache=False,
    split_train_eval=False,
    num_epochs=1,
    batch_size=hparam.batch_size,
    rparams=rparam,
)

# Mean AUC over the labels selected by the restricted test mask; the second
# return value of the helper is not needed here.
test_mean_auc, _ = compute_mean_auc_fold(
    models,
    test_ds,
    eval_step,
    hparam,
    label_mask_test,
)

In [35]:
# Report the final test-set mean AUC.
print(f"Test meanAUC: {test_mean_auc}")

Test meanAUC: 0.526199460029602
