# ScreenDL Modeling Sandbox

In [None]:
from __future__ import annotations

import functools

import numpy as np
import pandas as pd
import tensorflow as tf
import typing as t

from pathlib import Path
from tensorflow import keras

from cdrpy.models import screendl
from cdrpy.data.datasets import Dataset, get_predictions
from cdrpy.data.preprocess import normalize_responses
from cdrpy.splits import load_split

In [None]:
# Locations of the GDSCv2DepMap input files used throughout this notebook.
input_dir = Path("../../data/inputs/GDSCv2DepMap")
exp_path = input_dir.joinpath(
    "ScreenDL/FeatureCellToExpression1771MCGGenesTPMLogp1.csv"
)
mol_path = input_dir.joinpath("DrugToMorganFingerprint1024Bit.csv")
label_path = input_dir.joinpath("LabelsLogIC50.csv")
split_path = input_dir.joinpath("splits/mixed")

# One encoder per modality: cell features from the expression file and drug
# features from the fingerprint file.
cell_enc = screendl.load_cell_features(exp_path)
drug_enc = screendl.load_drug_features(mol_path)

# The full dataset is built from the label file and carries both encoders.
dataset = Dataset.from_csv(
    label_path,
    name="GDSCv2DepMap",
    cell_encoders=[cell_enc],
    drug_encoders=[drug_enc],
)

# NOTE(review): the second argument is presumably the split/fold id - confirm
# against cdrpy.splits.load_split.
split = load_split(split_path, 1)

# Partition the dataset using the id lists stored on the split object.
train_ds = dataset.select(split.train_ids, name="train")
val_ds = dataset.select(split.val_ids, name="val")
test_ds = dataset.select(split.test_ids, name="test")

# Small subset of the test set for quick experiments in the cells below.
# NOTE(review): no seed is passed here - confirm whether sample() is
# reproducible between runs.
mini_ds = test_ds.sample(1000)

In [None]:
# Encode the test set, then shuffle (buffer of 10000) and batch (32) it.
# Left as the cell's final expression so the notebook displays the result.
# NOTE(review): presumably encode_tf_v2() returns a tf.data.Dataset - confirm.
(
    test_ds.encode_tf_v2()
    .shuffle(10000)
    .batch(32)
)

In [None]:
# Build a tf.data pipeline from the dataset's own generator. The element
# signature is derived from the encoders rather than hard-coded.

encoders = [cell_enc, drug_enc]


def get_tspec(e):
    """Return the ``tf.TensorSpec`` for the features produced by encoder *e*.

    The spec is built from the encoder's ``shape``, ``dtype`` and ``name``
    attributes; ``dtype`` is converted with ``tf.as_dtype``.
    """
    return tf.TensorSpec(e.shape, tf.as_dtype(e.dtype), e.name)


# NOTE(review): assumes encode_generator yields
# ((cell_features, drug_features), label) - confirm against cdrpy.
ds = tf.data.Dataset.from_generator(
    mini_ds.encode_generator,
    output_signature=(
        tuple(get_tspec(e) for e in encoders),
        tf.TensorSpec(shape=(), dtype=tf.float32, name="label"),
    ),
)

# Pull a single element to check that the signature matches the generator.
for item in ds.take(1):
    print(item)


In [None]:
def gen():
    """Yield ``((cell_features, drug_features), label)`` per row of ``test_ds.obs``.

    Reads the ``cell_id``, ``drug_id`` and ``label`` columns of
    ``test_ds.obs`` and encodes each id with the module-level ``cell_enc``
    and ``drug_enc`` encoders.

    The columns are iterated in lockstep instead of using
    ``DataFrame.iterrows()``, which would build a throwaway Series for
    every row.
    """
    obs = test_ds.obs
    rows = zip(obs["cell_id"], obs["drug_id"], obs["label"])
    for cell_id, drug_id, label in rows:
        # The encoders are called with a list of ids; [0] unwraps the
        # single result for this row.
        cell_feat = cell_enc.encode([cell_id])[0]
        drug_feat = drug_enc.encode([drug_id])[0]

        yield ((cell_feat, drug_feat), label)

In [None]:
# NOTE: add a .get method to the encoders for getting a single value

In [None]:


# Wrap the hand-written `gen` generator in a tf.data pipeline, spelling out
# the element signature by hand: ((cell, drug), label).
# NOTE(review): `gen` iterates `test_ds.obs`, not `mini_ds`, despite this
# variable's name - confirm which dataset is intended.

mini_ds_tf = tf.data.Dataset.from_generator(
    gen,
    output_signature=(
        (
            tf.TensorSpec(shape=(1771,), dtype=tf.float32),
            tf.TensorSpec(shape=(1024,), dtype=tf.int32),
        ),
        tf.TensorSpec(shape=(), dtype=tf.float32),
    ),
)

# Print one element to verify the generator output matches the signature.
for item in mini_ds_tf.take(1):
    print(item)

In [None]:
# Materialise all encoded batches as NumPy arrays and inspect one array's
# shape. The list is bound to `batches` rather than `gen` so it does not
# overwrite the `gen()` generator function that
# `tf.data.Dataset.from_generator` needs when its cell is re-run.
# NOTE(review): 32 is presumably the batch size - confirm against cdrpy.
batches = list(test_ds.encode_batches(32, as_numpy=True))
batches[0][0][0].shape

In [None]:
# Display the `obs` table of the 1000-sample subset drawn from the test set.
# NOTE(review): presumably a pandas DataFrame with cell_id / drug_id / label
# columns, as for test_ds.obs - confirm.
mini_ds.obs

In [None]:
# Split the observation table into consecutive chunks of 32 rows and show
# the first one.
#
# The rows are sliced with `.iloc` instead of calling `np.array_split` on
# the DataFrame: NumPy routes that call through `DataFrame.swapaxes`, which
# pandas has deprecated. The slice boundaries below are the same ones
# `np.array_split` would use (0, each split index, then the row count).
split_inds = np.arange(32, 1000, 32)
obs_bounds = [0, *split_inds, len(mini_ds.obs)]
obs_chunks = [
    mini_ds.obs.iloc[start:stop]
    for start, stop in zip(obs_bounds, obs_bounds[1:])
]
obs_chunks[0]

In [None]:


# Download the DeepCDR CCLE expression table directly from GitHub (requires
# network access); the first CSV column becomes the index.
temp_path = "https://raw.githubusercontent.com/JDACS4C-IMPROVE/DeepCDR/develop/data/CCLE/genomic_expression_561celllines_697genes_demap_features.csv"
temp = pd.read_csv(filepath_or_buffer=temp_path, index_col=0)

# Preview the first five rows.
temp.head(n=5)

In [None]:
from sklearn.preprocessing import QuantileTransformer

# Quantile-transform every column of `temp` to a normal output distribution,
# writing the result back in place so the index and column labels are kept.
#
# `n_quantiles` is capped at the number of rows: the scikit-learn default of
# 1000 makes the transformer emit a warning and clamp the value itself when
# the table has fewer samples than that.
temp[:] = QuantileTransformer(
    n_quantiles=min(1000, len(temp)),
    output_distribution="normal",
).fit_transform(temp)
temp