# ScreenDL Modeling Sandbox

In [None]:
from __future__ import annotations

import functools

import numpy as np
import pandas as pd
import tensorflow as tf
import typing as t

from pathlib import Path
from tensorflow import keras

from cdrpy.models import screendl
from cdrpy.data.datasets import Dataset, get_predictions
from cdrpy.data.preprocess import normalize_responses
from cdrpy.splits import load_split
from cdrpy.mapper import BatchedResponseGenerator


In [None]:
# Input tables for the GDSCv2-DepMap dataset, all resolved against one root.
input_dir = Path("../../data/inputs/GDSCv2DepMap")
exp_path = input_dir.joinpath(
    "ScreenDL/FeatureCellToExpression1771MCGGenesTPMLogp1.csv"
)
mol_path = input_dir.joinpath("DrugToMorganFingerprint1024Bit.csv")
label_path = input_dir.joinpath("LabelsLogIC50.csv")
split_path = input_dir.joinpath("splits/mixed")

# Keep only the truthy entries returned by the cell-feature loader.
cell_enc = [enc for enc in screendl.load_cell_features(exp_path) if enc]
drug_enc = screendl.load_drug_features(mol_path)

# Labelled dataset wired to the cell encoder(s) and the single drug encoder.
dataset = Dataset.from_csv(
    label_path,
    cell_encoders=cell_enc,
    drug_encoders=[drug_enc],
    name="GDSCv2DepMap",
)

split = load_split(split_path, 1)

# Carve the dataset into the train / validation / test subsets of the split.
train_ds = dataset.select(split.train_ids, name="train")
val_ds = dataset.select(split.val_ids, name="val")
test_ds = dataset.select(split.test_ids, name="test")

# Small test subset for quick experiments.
mini_ds = test_ds.sample(1000)

In [None]:
# TODO: infer_shape and infer_tspec methods in encoders

In [None]:
# TODO:
# - [ ] Need to change to dict for encoders so that I can use cnv again
# - [ ] Check the dense_weights are increasing with larger values
# - [ ] Remove sample weighting for initial validations
# - [ ] Update MLP block to enable saving
# - [ ] Update metrics and evaluation (get_predictions) to include sample_weights
# - [ ] Update sample weighting to clip really large or small zscores
# - [ ] Update make_weights to apply the same weighting strategy to both train and val
# - [ ] Add model.predict with the new evaluation schema
# - [ ] Add hyperparameter optimization
# - [ ] Add seed option in config (int or null)
# - [ ] Convert cell_encoders and drug_encoders to an ordered dict of encoders
#   -> that way, we can iterate over the ordered dict to encode things but also can
#       extract key-value pairs
# - [ ] Add l1 or l2 regularization https://neptune.ai/blog/fighting-overfitting-with-l1-or-l2-regularization

In [None]:
# Build a batched generator over the training set (32 is presumably the batch
# size - confirm against BatchedResponseGenerator) and start a shuffled flow
# over its cell ids, drug ids and labels with a fixed seed.
generator = BatchedResponseGenerator(train_ds, 32)
train_gen = generator.flow(
    train_ds.cell_ids, train_ds.drug_ids, train_ds.labels, shuffle=True, seed=1771
)

In [None]:
keras.metrics.R2Score()

In [None]:
from scipy import stats

# Predictions written by the 2023-09-27_22-57-40 run.
path = "../../data/outputs/GDSCv2DepMap/ScreenDL/runs/2023-09-27_22-57-40/predictions.csv"
res = pd.read_csv(path)

# Pearson r between truth and prediction for every (split, drug) pair, then
# summary statistics of those per-drug correlations within each split.
corrs = (
    res.groupby(["split", "drug_id"])
    .apply(lambda frame: stats.pearsonr(frame["y_true"], frame["y_pred"])[0])
    .groupby(level="split")
    .describe()
)
corrs

In [None]:
# Predictions written by the 2023-09-27_21-40-26 run.
path = "../../data/outputs/GDSCv2DepMap/ScreenDL/runs/2023-09-27_21-40-26/predictions.csv"
res = pd.read_csv(path)

# Pearson r between truth and prediction for every (split, drug) pair, then
# summary statistics of those per-drug correlations within each split.
corrs = (
    res.groupby(["split", "drug_id"])
    .apply(lambda frame: stats.pearsonr(frame["y_true"], frame["y_pred"])[0])
    .groupby(level="split")
    .describe()
)
corrs

In [None]:
def make_weights(ds: Dataset) -> np.ndarray:
    """Create continuous sample weights from per-drug scaled labels.

    Within each ``drug_id`` group of ``ds.obs`` the labels are min-max scaled
    to ``[-1, 1]`` and the weight is ``1 + |scaled label|``: responses at
    either extreme of a drug's range get weight 2, responses at the midpoint
    get weight 1.

    Args:
        ds: Object exposing an ``obs`` DataFrame with ``drug_id`` and
            ``label`` columns.

    Returns:
        One weight per row of ``ds.obs``, in the same row order as ``ds.obs``
        so the array can be paired positionally with the labels.
    """
    labels = ds.obs["label"]
    per_drug = ds.obs.groupby("drug_id")["label"]

    # ``transform`` broadcasts each drug's min/max back onto its own rows,
    # which keeps the result aligned with ``ds.obs`` instead of regrouping
    # the rows by drug.
    lowest = per_drug.transform("min")
    span = per_drug.transform("max") - lowest

    # A drug whose labels are all identical has no range to scale by; put it
    # at the midpoint (weight 1) rather than producing 0/0 = NaN.
    y_std = ((labels - lowest) / span.where(span != 0)).mask(span == 0, 0.5)
    y_scaled = y_std * (1 - (-1)) + (-1)
    return (1 + y_scaled.abs()).to_numpy()

In [None]:
from scipy import stats


def make_dense_weights(
    ds: Dataset, alpha: float = 0.5, epsilon: float = 1e-4
) -> np.ndarray:
    """Create continuous, density-based sample weights.

    For each ``drug_id`` group of ``ds.obs`` a Gaussian KDE is fitted to the
    labels and evaluated at those same labels. The densities are min-max
    scaled to ``[0, 1]``, turned into ``1 - alpha * density`` (clipped below
    at ``epsilon``) and divided by the group mean, so rare label values get
    larger weights and every drug's weights average to 1.

    Args:
        ds: Object exposing an ``obs`` DataFrame with ``drug_id`` and
            ``label`` columns.
        alpha: Strength of the down-weighting applied to dense label regions.
        epsilon: Lower bound applied to the weights before normalisation.

    Returns:
        One weight per row of ``ds.obs``, in the same row order as ``ds.obs``.
        Groups on which no density can be estimated (a single observation,
        identical labels, or a flat density) receive a uniform weight of 1.
    """
    obs = ds.obs
    labels = obs["label"].to_numpy(dtype=float)
    # Rows that are not assigned to any group keep NaN.
    sample_weights = np.full(len(obs), np.nan)

    # ``indices`` maps each drug to the positions of its rows, so the weights
    # are written back in the row order of ``ds.obs``.
    for positions in obs.groupby("drug_id").indices.values():
        y = labels[positions]

        # gaussian_kde cannot be fitted to one point or to identical points.
        if y.size < 2 or np.ptp(y) == 0:
            sample_weights[positions] = 1.0
            continue

        z = stats.gaussian_kde(y)(y)
        z_range = z.max() - z.min()
        if z_range > np.finfo(float).eps * z.max():
            z_std = (z - z.min()) / z_range
        else:
            # Flat density: nothing to down-weight, avoid 0/0.
            z_std = np.zeros_like(z)

        weights = np.clip(1 - alpha * z_std, a_min=epsilon, a_max=None)
        sample_weights[positions] = weights / weights.mean()

    return sample_weights

In [None]:
make_weights(train_ds).max()

In [None]:
foo = make_dense_weights(train_ds, alpha=0.75)

In [None]:
print(foo.min())
print(foo.max())
print(foo.mean())

In [None]:
foo.mean()

In [None]:
temp = train_ds.obs.copy()
grouped = temp.groupby("drug_id")

# Scale the labels to [-1, 1] within each drug. ``transform`` broadcasts the
# per-drug min/max back onto the rows, so ``weights`` covers every training
# row instead of being overwritten once per drug inside a loop.
labels = temp["label"]
label_min = grouped["label"].transform("min")
label_max = grouped["label"].transform("max")
std_labels = (labels - label_min) / (label_max - label_min)
scaled_labels = std_labels * (1 - (-1)) + (-1)
weights = 1 + abs(
    scaled_labels
)  # NOTE: can multiply by some factor here if I want to increase weighting

weights.max()

In [None]:
norm_labels = train_ds.labels

In [None]:
# Predictions written by the 2023-09-27_16-20-18 run.
path = "../../data/outputs/GDSCv2DepMap/ScreenDL/runs/2023-09-27_16-20-18/predictions.csv"
res = pd.read_csv(path)

# Pearson r between truth and prediction for every (split, drug) pair, then
# summary statistics of those per-drug correlations within each split.
corrs = (
    res.groupby(["split", "drug_id"])
    .apply(lambda frame: stats.pearsonr(frame["y_true"], frame["y_pred"])[0])
    .groupby(level="split")
    .describe()
)
corrs

In [None]:
from cdrpy.metrics import tf_metrics

In [None]:
# Reload the model saved by the 2023-09-27_16-20-18 run. The loader lives in
# ``keras.models`` and ``custom_objects`` must be a mapping from the saved
# name to the Python object, not the bare function.
model = keras.models.load_model(
    "../../data/outputs/GDSCv2DepMap/ScreenDL/runs/2023-09-27_16-20-18/model",
    custom_objects={"pearson": tf_metrics.pearson},
)

In [None]:
# 1. rerun as is with new config (12-58-22)
# 2. pip install and rerun with adjusted decay rates (already adjusted) (13-38-20)
# 3. revert to the best of 1 and 2 and rerun  with hallmark genes

In [None]:
from collections import OrderedDict
from tensorflow.keras import layers

In [None]:

# break out the first hidden layer and move dropout to before

In [None]:
# keras.optimizers.AdamW(learning_rate=0.001, weight_decay=0.002)

In [None]:
from cdrpy.metrics import tf_metrics

In [None]:
# NOTE: init seeding has a large impact on convergence
#  -> it might be better to increase batch size or to use a balanced sampler
#   -> also try larger batch size

In [None]:
# Reload the model saved by the 2023-09-26_17-18-16 run, registering the
# custom ``pearson`` metric under the name it is looked up by.
model = keras.models.load_model(
    "../../data/outputs/GDSCv2DepMap/ScreenDL/runs/2023-09-26_17-18-16/model",
    custom_objects={"pearson": tf_metrics.pearson},
)

In [None]:
# Grab the first weight tensor of the "exp_dn_1" layer as a NumPy array
# (presumably the kernel of the first expression dense layer - confirm).
l = model.get_layer("exp_dn_1")
weights = l.weights[0].numpy()

In [None]:
# Label each row of the extracted layer weights with a gene name taken from
# the expression table's columns.
# NOTE(review): assumes the weight rows follow the CSV column order - confirm.
genes = list(pd.read_csv(exp_path, index_col=0).columns)
gene_weights = pd.DataFrame(weights, index=genes)

# NOTE: now cluster the genes and see if we recover any structure
#   -> can apply some sort of topic labeling to the resulting clusters
gene_weights

In [None]:
weights.shape

In [None]:
# For 10 simulated epochs, print the first entry of the second element of the
# first batch, then signal the end of the epoch (presumably to check that the
# batches are reshuffled between epochs - confirm).
# NOTE(review): the comprehension walks the whole generator just to read
# batch 0.
for _ in range(10):
    print([x[1] for x in train_gen][0][0])
    train_gen.on_epoch_end()

In [None]:
np.asanyarray([x for x in train_gen][0][0][0]).shape

In [None]:
test_ds.encode_tf_v2().shuffle(10000).batch(32)

In [None]:
# I will need to encode the tensor shapes here somehow


def get_tspec(e):
    """Build the ``tf.TensorSpec`` of an encoder from its shape, dtype and name."""
    return tf.TensorSpec(e.shape, tf.as_dtype(e.dtype), e.name)


# ``cell_enc`` is already a list of encoders, so splice its members in next to
# the single drug encoder instead of nesting the list itself.
encoders = [*cell_enc, drug_enc]

# Each element is ((one tensor per encoder, ...), scalar float32 label).
ds = tf.data.Dataset.from_generator(
    mini_ds.encode_generator,
    output_signature=(
        tuple(get_tspec(e) for e in encoders),
        tf.TensorSpec(shape=(), dtype=tf.float32, name="label"),
    ),
)

for item in ds.take(1):
    print(item)


In [None]:
def gen():
    """Yield one ``(features, label)`` pair per row of ``test_ds.obs``.

    ``features`` is a tuple holding the row's cell line encoded by every
    encoder in ``cell_enc``, followed by its drug encoded by ``drug_enc``.
    """
    obs = test_ds.obs
    rows = zip(obs["cell_id"], obs["drug_id"], obs["label"])
    for cell_id, drug_id, label in rows:
        # ``encode`` is called with a batch of ids: pass a batch of one and
        # unwrap the single result. ``cell_enc`` is itself the list of cell
        # encoders, so iterate it directly.
        cell_feat = [e.encode([cell_id])[0] for e in cell_enc]
        drug_feat = [drug_enc.encode([drug_id])[0]]
        yield (tuple(cell_feat + drug_feat), label)

In [None]:
# NOTE: add a .get method to the encoders for getting a single value

In [None]:


# Now, I just need to encode as a generator

# Wrap ``gen`` as a tf.data.Dataset. The signature declares every element as
# ((float32 vector of length 1771, int32 vector of length 1024), float32
# scalar label).
mini_ds_tf = tf.data.Dataset.from_generator(
    gen,
    output_signature=(
        (
            (
                tf.TensorSpec(shape=(1771,), dtype=tf.float32),
                tf.TensorSpec(shape=(1024,), dtype=tf.int32),
            ),
            tf.TensorSpec(shape=(), dtype=tf.float32),
        )
    ),
)
# Pull a single element and print it.
for item in mini_ds_tf.take(1):
    print(item)

In [None]:
# Materialise every encoded batch of the test set (32 is presumably the batch
# size - confirm) and inspect the shape of the first array in the first batch.
# NOTE(review): this rebinds ``gen``, shadowing the generator function of the
# same name defined in an earlier cell.
gen = list(test_ds.encode_batches(32, as_numpy=True))
gen[0][0][0].shape

In [None]:
mini_ds.obs

In [None]:
# Cut points every 32 rows (32, 64, ..., 992); display the first chunk of the
# observation table produced by splitting at those points.
split_inds = np.arange(32, 1000, 32)
np.array_split(mini_ds.obs, split_inds)[0]

In [None]:


# Read a CCLE expression feature table straight from the DeepCDR GitHub
# repository (develop branch), using the first column as the index, and
# preview its first rows.
temp_path = "https://raw.githubusercontent.com/JDACS4C-IMPROVE/DeepCDR/develop/data/CCLE/genomic_expression_561celllines_697genes_demap_features.csv"
temp = pd.read_csv(temp_path, index_col=0)
temp.head()

In [None]:
from sklearn.preprocessing import QuantileTransformer

# Overwrite the values of ``temp`` with their quantile-transformed
# (normal output distribution) counterparts; assigning through ``temp[:]``
# keeps the existing index and column labels.
temp[:] = QuantileTransformer(output_distribution="normal").fit_transform(temp)
temp