# Basic Response Prediction Using an MLP Architecture

In [65]:
import itertools

import numpy as np
import pandas as pd

from pathlib import Path
from tensorflow import keras

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from cdrpy.data import Dataset
from cdrpy.data.preprocess import normalize_responses
from cdrpy.mapper import BatchedResponseGenerator
from cdrpy.metrics import tf_metrics

In [66]:
# Single source of randomness for the notebook: passed explicitly to the
# train/validation/test split and to the training batch generator.
SEED = 1771

# Also seed the global Python, NumPy, and TensorFlow generators so that
# anything relying on them (e.g. Keras weight initialization) is repeatable
# when the notebook is re-run from a fresh kernel.
keras.utils.set_random_seed(SEED)

## 1. Data preparation

### Loading the drug response dataset

We can load a saved `cdrpy` dataset using the `Dataset.load` method. This demo dataset consists of harmonized cell line pharmacogenomic data retrieved from the **Genomics of Drug Sensitivity in Cancer** and the **Cell Model Passports** data resources.

In this dataset, cell lines are represented by log-transformed TPM gene expression values for 1771 cancer-relevant genes. Drugs are represented as 512-bit Morgan fingerprints. Drug responses correspond to the natural log of the half-maximal inhibitory concentration, ln(IC50).

In [67]:
# Load the saved demo dataset; the path is relative to the notebook's working
# directory.
D = Dataset.load("../../data/datasets/temp/demo.h5")
# Print the dataset's string summary.
print(D)

Dataset(name=CellModelPassportsGDSCv2, size=131_847, n_cells=792, n_drugs=184)


In [68]:
# TODO: add some data exploration here, e.g. report how many features we have
# for the gene expression and Morgan fingerprint encoders.

### Splitting the data

In precision oncology, we need to predict response in never-before-seen patients. To mimic this setting, we will use scikit-learn's `train_test_split` function to generate a train/test split in which all drug responses for a given cell line are held out for evaluation.

In [69]:
# Split at the cell-line level so that every response of a held-out cell line
# lands in the same partition.
cell_ids = D.cell_meta.index
cell_groups = D.cell_meta["cancer_type"]  # stratification labels
test_size = 0.1

# Both splits share the same held-out fraction and seed. Because the second
# split is taken from the remaining training cells, the validation set is
# `test_size` of those cells rather than of the full dataset.
split_kwargs = dict(test_size=test_size, random_state=SEED)

# Step 1: hold out the test cells.
train_cell_ids, test_cell_ids = train_test_split(
    cell_ids, stratify=cell_groups, **split_kwargs
)

# Step 2: hold out validation cells from what is left.
train_cell_ids, val_cell_ids = train_test_split(
    train_cell_ids, stratify=cell_groups.loc[train_cell_ids], **split_kwargs
)

# Materialize one named sub-dataset per partition (train, val, test order).
train_ds, val_ds, test_ds = (
    D.select_cells(ids, name=split_name)
    for split_name, ids in (
        ("train", train_cell_ids),
        ("val", val_cell_ids),
        ("test", test_cell_ids),
    )
)

### Data preprocessing

To avoid data leakage, several preprocessing steps should be completed *after* splitting the data into train/validation/test sets.

In this case, we will use scikit-learn's `StandardScaler` class to normalize gene expression values.

In [71]:
# Expression values held by the "exp" cell encoder.
ge_data = train_ds.cell_encoders["exp"].data

# Estimate the scaling statistics from the training cell lines only, so that
# nothing about the validation/test cells leaks into preprocessing.
ge_scaler = StandardScaler()
ge_scaler.fit(ge_data.loc[train_cell_ids])

# Overwrite every row of `ge_data` in place with its standardized values.
# NOTE(review): presumably val_ds and test_ds read from this same encoder data
# and are therefore scaled as well -- confirm against `Dataset.select_cells`.
ge_data[:] = ge_scaler.transform(ge_data)

We also need to normalize the raw drug response observations. `cdrpy` provides the `normalize_responses` helper function for this purpose. This function accepts an optional parameter, `norm_method`. When `norm_method` is set to `grouped`, normalization is performed per drug, which reduces bias in overall performance estimates by removing variability in drug-specific effective concentration ranges from the data.

In [None]:
# Normalize the response values of all three splits with the "grouped" method.
# The names are rebound to the function's outputs, so re-running this cell
# passes the previously returned datasets back in.
train_ds, val_ds, test_ds = normalize_responses(
    train_ds,
    val_ds,
    test_ds,
    norm_method="grouped",
)

## 2. Creating the model

As an example, we will construct a simple model using three multi-layer perceptrons (MLPs).

In [51]:
# Feature widths are read from the last axis of each encoder's shape.
ge_input_dim = train_ds.cell_encoders["exp"].shape[-1]
fp_input_dim = train_ds.drug_encoders["mol"].shape[-1]


def _dense_stack(tensor, layer_sizes):
    """Apply one ReLU-activated Dense layer per entry of `layer_sizes`, in order."""
    for units in layer_sizes:
        tensor = keras.layers.Dense(units, activation="relu")(tensor)
    return tensor


# Gene expression branch.
ge_input = keras.Input(shape=(ge_input_dim,))
x_ge = _dense_stack(ge_input, (128, 64))

# Drug fingerprint branch.
fp_input = keras.Input(shape=(fp_input_dim,))
x_fp = _dense_stack(fp_input, (128, 64))

# Joint branch: concatenate both embeddings and regress a single value with a
# linear output unit.
x = keras.layers.Concatenate()([x_ge, x_fp])
x = _dense_stack(x, (64, 32))
x_out = keras.layers.Dense(1, activation="linear")(x)

In [52]:
# Assemble the two-input functional model (expression first, fingerprint
# second) and print its layer summary.
model = keras.Model(inputs=[ge_input, fp_input], outputs=x_out)
model.summary()

Model: "model_4"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_19 (InputLayer)          [(None, 1771)]       0           []                               
                                                                                                  
 input_20 (InputLayer)          [(None, 512)]        0           []                               
                                                                                                  
 dense_25 (Dense)               (None, 128)          226816      ['input_19[0][0]']               
                                                                                                  
 dense_27 (Dense)               (None, 128)          65664       ['input_20[0][0]']               
                                                                                            

## 3. Training and evaluation

In [53]:
# Regression setup: minimize mean squared error with Adam and report the
# `pearson` metric from `tf_metrics` alongside the loss.
model.compile(
    loss="mean_squared_error",
    optimizer=keras.optimizers.Adam(learning_rate=1e-4),
    metrics=[tf_metrics.pearson],
)

Train the model.

In [47]:
# Number of samples per batch.
BATCH_SIZE = 128

# NOTE(review): the generator is built from the full dataset `D`, while the
# flows are later created from the train/val splits -- confirm that it sees
# the scaled expression features and normalized responses.
generator = BatchedResponseGenerator(D, batch_size=BATCH_SIZE)

In [48]:
# Training flow: shuffling enabled with a fixed seed.
train_gen = generator.flow_from_dataset(
    train_ds,
    shuffle=True,
    seed=SEED,
)

# Validation flow: `flow_from_dataset` defaults.
val_gen = generator.flow_from_dataset(val_ds)

In [49]:
# Stop training once the validation loss has gone `patience` epochs without
# improving, and roll the model back to its best weights. `patience` has to be
# smaller than the number of training epochs (50 in the `model.fit` call) for
# the callback to ever fire; the previous value of 50 made it a no-op.
early_stopping = keras.callbacks.EarlyStopping(
    monitor="val_loss", patience=10, restore_best_weights=True
)

In [54]:
# Train for at most 50 epochs, evaluating on the validation flow after each
# epoch; `verbose=2` logs one summary line per epoch.
history = model.fit(
    train_gen,
    validation_data=val_gen,
    epochs=50,
    callbacks=[early_stopping],
    verbose=2,
)

Epoch 1/50
834/834 - 6s - loss: 2.9188 - pearson: 0.7546 - val_loss: 1.9598 - val_pearson: 0.7117 - 6s/epoch - 8ms/step
Epoch 2/50
834/834 - 5s - loss: 1.6911 - pearson: 0.8759 - val_loss: 1.9248 - val_pearson: 0.7092 - 5s/epoch - 6ms/step
Epoch 3/50
834/834 - 5s - loss: 1.6049 - pearson: 0.8825 - val_loss: 1.8672 - val_pearson: 0.7121 - 5s/epoch - 5ms/step
Epoch 4/50
834/834 - 5s - loss: 1.5489 - pearson: 0.8865 - val_loss: 1.8537 - val_pearson: 0.7186 - 5s/epoch - 5ms/step
Epoch 5/50
834/834 - 5s - loss: 1.4951 - pearson: 0.8909 - val_loss: 1.8119 - val_pearson: 0.7183 - 5s/epoch - 6ms/step
Epoch 6/50
834/834 - 5s - loss: 1.4375 - pearson: 0.8955 - val_loss: 1.8577 - val_pearson: 0.7222 - 5s/epoch - 5ms/step
Epoch 7/50
834/834 - 5s - loss: 1.4015 - pearson: 0.8978 - val_loss: 1.8021 - val_pearson: 0.7257 - 5s/epoch - 5ms/step
Epoch 8/50
834/834 - 5s - loss: 1.3788 - pearson: 0.8993 - val_loss: 1.8927 - val_pearson: 0.7258 - 5s/epoch - 6ms/step
Epoch 9/50
834/834 - 5s - loss: 1.3587 -