# ScreenDL Modeling Sandbox

In [1]:
from __future__ import annotations

import functools

import numpy as np
import pandas as pd
import tensorflow as tf
import typing as t

from pathlib import Path
from tensorflow import keras

from cdrpy.models import screendl
from cdrpy.data.datasets import Dataset, get_predictions
from cdrpy.data.preprocess import normalize_responses
from cdrpy.splits import load_split

2023-07-26 13:53:31.991080: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [3]:
# Input locations, all resolved relative to the GDSCv2DepMap input directory.
input_dir = Path("../../data/inputs/GDSCv2DepMap")
exp_path = (
    input_dir / "ScreenDL/FeatureCellToExpression1771MCGGenesTPMLogp1.csv"
)
mol_path = input_dir / "DrugToMorganFingerprint1024Bit.csv"
label_path = input_dir / "LabelsLogIC50.csv"
split_path = input_dir / "splits/mixed"

# One feature encoder per entity type: cells from the expression table,
# drugs from the fingerprint table.
cell_enc = screendl.load_cell_features(exp_path)
drug_enc = screendl.load_drug_features(mol_path)

# Labelled dataset wired to both encoders.
dataset = Dataset.from_csv(
    label_path,
    name="GDSCv2DepMap",
    cell_encoders=[cell_enc],
    drug_encoders=[drug_enc],
)

# NOTE(review): the second argument (1) is presumably the split/fold id
# under `split_path` -- confirm against `load_split`.
split = load_split(split_path, 1)

# Carve the full dataset into the train / validation / test subsets of the split.
train_ds = dataset.select(split.train_ids, name="train")
val_ds = dataset.select(split.val_ids, name="val")
test_ds = dataset.select(split.test_ids, name="test")

# Small subset of the test set for quick experiments in the cells below.
# NOTE(review): no seed is passed here; if `Dataset.sample` draws randomly,
# `mini_ds` will differ between kernel runs -- confirm whether it accepts a seed.
mini_ds = test_ds.sample(1000)

In [4]:
# Encode the sampled subset with the dataset's own tf.data encoder and display
# the resulting dataset; its element_spec is laid out as ((cell, drug), label).
mini_ds.encode_tf()

2023-07-26 13:54:15.260290: E tensorflow/compiler/xla/stream_executor/cuda/cuda_driver.cc:268] failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected


<_ZipDataset element_spec=((TensorSpec(shape=(1771,), dtype=tf.float32, name=None), TensorSpec(shape=(1024,), dtype=tf.int32, name=None)), TensorSpec(shape=(), dtype=tf.float32, name=None))>

In [9]:
# TODO: derive these tensor shapes from the encoders instead of hard-coding
# them (e.g. `cell_enc.shape[-1]` evaluates to 1771 in this notebook).

# Per-element layout expected from the generator: ((cell, drug), label).
cell_spec = tf.TensorSpec(shape=(1771,), dtype=tf.float32)
drug_spec = tf.TensorSpec(shape=(1024,), dtype=tf.int32)
label_spec = tf.TensorSpec(shape=(), dtype=tf.float32)

tf.data.Dataset.from_generator(
    mini_ds.encode_generator,
    output_signature=((cell_spec, drug_spec), label_spec),
)



<_FlatMapDataset element_spec=((TensorSpec(shape=(), dtype=tf.float32, name=None), TensorSpec(shape=(1024,), dtype=tf.int32, name=None)), TensorSpec(shape=(), dtype=tf.float32, name=None))>

In [7]:
# Size of the last axis of the cell encoder, i.e. the length of one encoded
# cell feature vector.
cell_enc.shape[-1]

1771

In [11]:
def gen():
    """Yield one ``((cell_features, drug_features), label)`` example per row.

    Rows are read from ``test_ds.obs`` (columns ``cell_id``, ``drug_id`` and
    ``label``). Each id is encoded on its own with the notebook-level
    ``cell_enc`` / ``drug_enc`` encoders, so examples are produced lazily, one
    at a time, in the order of the observation table.

    NOTE(review): this iterates ``test_ds`` although the tf.data pipeline built
    from it is called ``mini_ds_tf`` -- confirm which dataset is intended.

    Yields:
        tuple: ``((cell_feat, drug_feat), label)`` for a single observation.
    """
    obs = test_ds.obs

    # Walk the three columns directly rather than using ``DataFrame.iterrows``:
    # iterrows builds a Series per row and does not preserve column dtypes
    # (integer ids are upcast to float when every column is numeric).
    for cell_id, drug_id, label in zip(
        obs["cell_id"], obs["drug_id"], obs["label"]
    ):
        # The encoders take a list of ids; wrap the single id and unwrap the
        # single encoded row.
        cell_feat = cell_enc.encode([cell_id])[0]
        drug_feat = drug_enc.encode([drug_id])[0]

        yield (cell_feat, drug_feat), label

In [None]:
# NOTE: add a .get method to the encoders for getting a single value

In [14]:


# Now, I just need to encode as a generator

# Element layout yielded by `gen`: ((cell, drug), label).
feature_specs = (
    tf.TensorSpec(shape=(1771,), dtype=tf.float32),
    tf.TensorSpec(shape=(1024,), dtype=tf.int32),
)
label_spec = tf.TensorSpec(shape=(), dtype=tf.float32)

mini_ds_tf = tf.data.Dataset.from_generator(
    gen, output_signature=(feature_specs, label_spec)
)

# Pull a single element to check that the generator matches the signature.
for item in mini_ds_tf.take(1):
    print(item)

((<tf.Tensor: shape=(1771,), dtype=float32, numpy=
array([1.       , 0.5753123, 4.08151  , ..., 2.711495 , 0.9708536,
       6.743623 ], dtype=float32)>, <tf.Tensor: shape=(1024,), dtype=int32, numpy=array([0, 0, 0, ..., 0, 0, 0], dtype=int32)>), <tf.Tensor: shape=(), dtype=float32, numpy=3.10755>)


In [None]:
# Materialise the encoded test batches (batch size 32, NumPy output) under
# their own name. Binding this list to `gen` would shadow the generator
# function that the tf.data pipeline is built from and break that cell on
# re-run.
test_batches = list(test_ds.encode_batches(32, as_numpy=True))

# Shape of the first feature array of the first batch.
test_batches[0][0][0].shape

In [None]:
# Display the observation table backing the sampled subset.
mini_ds.obs

In [None]:
# Row offsets at which the observation table is cut into batches. The upper
# bound follows the table's actual length instead of repeating the sample
# size as a literal, so the offsets stay correct if the subset size changes.
batch_size = 32
split_inds = np.arange(batch_size, len(mini_ds.obs), batch_size)

# First batch of observations (rows before the first offset).
np.array_split(mini_ds.obs, split_inds)[0]

In [None]:


# CSV hosted in the JDACS4C-IMPROVE/DeepCDR repository (develop branch). It is
# read straight from GitHub, so this cell needs network access.
# NOTE(review): judging by the file name this is a 561 cell line x 697 gene
# expression table -- confirm.
temp_path = "https://raw.githubusercontent.com/JDACS4C-IMPROVE/DeepCDR/develop/data/CCLE/genomic_expression_561celllines_697genes_demap_features.csv"
temp = pd.read_csv(temp_path, index_col=0)  # first CSV column becomes the index
temp.head()

In [None]:
from sklearn.preprocessing import QuantileTransformer

# Map every column onto a normal distribution via a quantile transform.
# The result goes into a new frame rather than being assigned into `temp`
# in place: the raw values stay available, the `temp.head()` shown earlier
# stays accurate, and re-running this cell always starts from the raw data.
temp_qnorm = pd.DataFrame(
    QuantileTransformer(output_distribution="normal").fit_transform(temp),
    index=temp.index,
    columns=temp.columns,
)
temp_qnorm