# DualGCN Modeling Sandbox

## TODO

- [ ] Replace the preprocessing functions with TensorFlow ops
  - [ ] Use tf.RaggedTensor for the lists of variable length
  - [ ] Apply the ops to the tf.RaggedTensor
- [ ] Decide if I should just spin up an instance for these other models
- [ ] When parsing results, read the config and sources from the Hydra config output

In [1]:
from __future__ import annotations

import numpy as np
import pandas as pd
import tensorflow as tf

from pathlib import Path
from tensorflow import keras

from cdrpy.models import dualgcn as dgcn
from cdrpy.data.datasets import Dataset, get_predictions
from cdrpy.data.preprocess import normalize_responses
from cdrpy.feat.transformers import PandasGroupedStandardScaler
from cdrpy.splits import load_splits

2023-07-26 14:01:14.572269: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Input locations. All paths are relative, so they resolve against the
# working directory the notebook is run from.
data_folder = Path("../../data/inputs/GDSCv2DepMap")
dgcn_folder = data_folder.joinpath("DualGCN")
split_folder = data_folder.joinpath("splits/tumor_blind")

# Files stored directly under the data folder.
drug_path = data_folder.joinpath("DrugToConvMolFeatures.pickle")
label_path = data_folder.joinpath("LabelsLogIC50.csv")

# Files stored under the DualGCN subfolder.
ppi_path = dgcn_folder.joinpath("MetadataPPIEdgeList.csv")
cnv_path = dgcn_folder.joinpath(
    "FeatureCellToCopyNumber689DualGCNGenesCNRatioLogp1.csv"
)
exp_path = dgcn_folder.joinpath(
    "FeatureCellToExpression689DualGCNGenesTPMLogp1.csv"
)

In [3]:
# Each loader call is unpacked into two encoders (named here as a feature
# encoder and an adjacency encoder).
drug_feat_encoder, drug_adj_encoder = dgcn.load_drug_features(drug_path)
cell_feat_encoder, cell_adj_encoder = dgcn.load_cell_features(
    exp_path, cnv_path, ppi_path
)

# Keyword arguments later splatted into the dataset encode calls.
encoders = dict(
    cell_encoders=[cell_feat_encoder, cell_adj_encoder],
    drug_encoders=[drug_feat_encoder, drug_adj_encoder],
)

# Full response dataset read from the label CSV.
ds = Dataset.from_csv(label_path, name="gdsc_v2_depmap")

In [4]:
# Only the first split is used, so take it directly instead of building a
# list of every split just to index element 0.
split = next(iter(load_splits(split_folder)))

# Carve the full dataset into train / validation / test subsets by ID.
train_ds = ds.select(split.train_ids, name="train")
val_ds = ds.select(split.val_ids, name="val")
test_ds = ds.select(split.test_ids, name="test")

# Disabled per-drug label scaling, kept for reference:
# gss = PandasGroupedStandardScaler("label", "drug_id")

# train_ds.obs = gss.fit_transform(train_ds.obs)
# val_ds.obs = gss.transform(val_ds.obs)
# test_ds.obs = gss.transform(test_ds.obs)

In [7]:
# NOTE: this lookup fails. The recorded output shows the second cell encoder
# is a RepeatEncoder, which has no `.shape` attribute.
encoders["cell_encoders"][1].shape

AttributeError: 'RepeatEncoder' object has no attribute 'shape'

In [5]:
# Normalize the "label" column of all three splits using the "grouped"
# method with "drug_id" as the grouping column.
train_ds, val_ds, test_ds = normalize_responses(
    train_ds, val_ds, test_ds, "grouped",
    value_col="label", group_col="drug_id",
)

# Per-drug standard deviation of the normalized test labels.
test_label_std = test_ds.obs.groupby("drug_id")["label"].std()
test_label_std.round(2)

drug_id
B(C(CC(C)C)NC(=O)C(CC1=CC=CC=C1)NC(=O)C2=NC=CN=C2)(O)O                              1.00
C(C(C1C(=C(C(=O)O1)O)O)O)O                                                          0.77
C(CC(=O)NC(CS)C(=O)NCC(=O)O)C(C(=O)O)N                                              0.78
C1=C(C(=O)NC(=O)N1)F                                                                0.97
C1=CC(=C(C=C1I)F)NC2=C(C=CC(=C2F)F)C(=O)NOCC(CO)O                                   0.91
                                                                                    ... 
CS(=O)(=O)C1=CC(=C(C=C1)C(=O)NC2=CC(=C(C=C2)Cl)C3=CC=CC=N3)Cl                       0.99
CS(=O)(=O)C1=CC=C(C=C1)C2=CN=C(C(=N2)C(=O)NC3=CC=CC=C3)N                            1.02
CS(=O)(=O)CCNCC1=CC=C(O1)C2=CC3=C(C=C2)N=CN=C3NC4=CC(=C(C=C4)OCC5=CC(=CC=C5)F)Cl    0.84
CS(=O)(=O)N1CCN(CC1)CC2=CC3=C(S2)C(=NC(=N3)C4=C5C=NNC5=CC=C4)N6CCOCC6               1.16
N.N.Cl[Pt]Cl                                                                        0.93
Name: label, 

In [None]:
# Scratch toggle: the literal condition picks which value is used.
if True:
    save_dir = "."
else:
    save_dir = None
save_dir

In [None]:
# Mean of the training labels (display only).
train_ds.obs["label"].mean()

In [None]:
# Standardize the training labels globally (zero mean, unit variance).
# `StandardScaler` is never imported in this notebook, so the same
# transformation is computed directly with pandas. ddof=0 matches the
# population standard deviation that scikit-learn's scaler uses.
# NOTE(review): unlike StandardScaler, a zero-variance column is not guarded.
labels = train_ds.obs["label"]
train_ds.obs["label"] = (labels - labels.mean()) / labels.std(ddof=0)

# Sanity check. pandas' default .std() uses ddof=1, so the printed value is
# close to, but not exactly, 1.
print(train_ds.obs["label"].mean())
print(train_ds.obs["label"].std())

In [None]:
# Per-drug standard deviation of the training labels (display only).
train_ds.obs.groupby("drug_id")["label"].std().round(3)

In [None]:
# Normalization layer adapted on the encoded features of the training cell
# lines only, so validation/test statistics do not influence it.
cell_feat_norm = keras.layers.Normalization(axis=(1, 2))
cell_feat_norm.adapt(np.array(cell_feat_encoder.encode(train_ds.cell_ids)))

# The last axis of each feature encoder's shape is passed as the input dim.
cell_dim = cell_feat_encoder.shape[-1]
drug_dim = drug_feat_encoder.shape[-1]

model = dgcn.create_model(cell_dim, drug_dim, cell_feat_norm)
model.compile(
    # `decay=0.0` and `epsilon=None` are legacy-optimizer arguments: the
    # current Keras Adam raises on `decay`, and only the legacy class maps
    # `epsilon=None` to the backend default. The defaults (no decay,
    # epsilon=1e-7) give the same configuration.
    optimizer=keras.optimizers.Adam(learning_rate=0.001, amsgrad=False),
    loss="mean_squared_error",
    # metrics=["mse", tf_metrics.pearson],
)

In [None]:
# Draw 1000 random training IDs (unseeded, so the subset differs per run)
# and build a small dataset from them for quick experiments.
samples = train_ds.obs["id"].sample(n=1000)
sample_ds = train_ds.select(samples, name="sample")
sample_ds

In [None]:
# Fit for a single epoch on the sampled subset, shuffled and batched by 32.
# NOTE(review): presumably encode_tf returns a tf.data.Dataset, in which case
# 10000 is the shuffle buffer size. Confirm.
hx = model.fit(
    sample_ds.encode_tf(**encoders).shuffle(10000).batch(32), epochs=1
)

In [None]:
# Inspect the model's input tensors (display only).
model.inputs

In [None]:
gen = sample_ds.encode_batches(**encoders, batch_size=32, return_ids=True)
# Only the first batch is inspected, so pull just that one instead of
# encoding every batch into a list and indexing element 0.
x, y, cell_ids, drug_ids = next(iter(gen))

In [None]:
# Convert each feature of the batch to a NumPy array.
x_arr = list(map(np.array, x))

# Scratch check: unpacking a dict inside a dict literal.
a = {0: 1, 1: 2}
{"a": 1, **a}

In [None]:
# Run the model on the single converted batch (display only).
model.predict_on_batch(x_arr)

In [None]:
# exp_encoder = PandasEncoder.from_csv(exp_path, index_col=0, name="exp_encoder")
# ids = ["SIDM00046", "SIDM00078", "SIDM00079", "SIDM00080", "SIDM00081"]
# temp = exp_encoder.encode_tf(ids)
# temp

In [None]:
# Scratch check: do all values of the dict share one type?
foo = {0: [1, 2, 3], 1: [1, 2, 3]}
value_types = {type(v) for v in foo.values()}
len(value_types) == 1

In [None]:
# Reload the dataset and rebuild the (un-normalized) splits from scratch.
ds = Dataset.from_csv(label_path, name="gdsc_v2_depmap")
# Only the first split is used, so take it directly instead of building a
# list of every split just to index element 0.
split = next(iter(load_splits(split_folder)))

train_ds = ds.select(split.train_ids, name="train")
val_ds = ds.select(split.val_ids, name="val")
test_ds = ds.select(split.test_ids, name="test")

In [None]:
# Call the loaders through the `dgcn` module alias. The bare function names
# are never imported in this notebook, so calling them directly raises
# NameError.
drug_feat_encoder, drug_adj_encoder = dgcn.load_drug_features(drug_path)
cell_feat_encoder, cell_adj_encoder = dgcn.load_cell_features(
    exp_path, cnv_path, ppi_path
)

In [None]:
# Keyword arguments splatted into the dataset encode calls.
encoders = dict(
    cell_encoders=[cell_feat_encoder, cell_adj_encoder],
    drug_encoders=[drug_feat_encoder, drug_adj_encoder],
)

# Encode the validation split and display the result.
temp = val_ds.encode_tf(**encoders)
temp

In [None]:
temp_batch = temp.batch(32)
# Walk through every batch. After the loop the two names stay bound to the
# last batch, which is what gets displayed below. If there are no batches,
# the names are never bound and the final line raises NameError.
for batch_features, batch_labels in temp_batch:
    pass
batch_features