# DualGCN Modeling Sandbox

## TODO

- [ ] Replace the preprocessing functions with TensorFlow ops
  - [ ] Use `tf.RaggedTensor` for the variable-length lists
  - [ ] Apply the ops to the `tf.RaggedTensor`
- [ ] Decide whether to just spin up an instance for these other models
- [ ] When parsing results, read the config and sources from the Hydra config output

In [None]:
from __future__ import annotations

import typing as t

import numpy as np
import pandas as pd
import tensorflow as tf

from pathlib import Path
from tensorflow import keras

from cdrpy.models import dualgcn
from cdrpy.metrics import tf_metrics
from cdrpy.data.datasets import Dataset, EncodedDataset
from cdrpy.splits import load_splits

In [None]:
# Type aliases for the drug-side inputs, all spelled with builtin generics
# (PEP 585) so the three definitions are consistent with one another.
# NOTE(review): the names suggest a degree list and an adjacency list per
# drug graph -- confirm against the drug feature file before relying on it.
DegList = list[np.int32]
AdjList = list[list[int]]
DrugInput = tuple[np.ndarray, DegList, AdjList]

In [None]:
# Look up the first CPU through the stable `tf.config` namespace rather than
# the `tf.config.experimental` alias of the same function.
cpu_device = tf.config.list_physical_devices("CPU")[0]

# NOTE(review): `print` is plain Python, so the device scope has no visible
# effect on it; presumably kept as a quick check that "CPU:0" is usable.
with tf.device("CPU:0"):
    print(cpu_device.name)



In [None]:
# Root of the GDSCv2/DepMap inputs, given relative to the working directory.
data_folder = Path("../../data/inputs/GDSCv2DepMap")

# Entries located directly under the data root.
label_path = data_folder.joinpath("LabelsLogIC50.csv")
drug_path = data_folder.joinpath("DrugToConvMolFeatures.pickle")
split_folder = data_folder.joinpath("splits/tumor_blind")

# Entries located under the DualGCN sub-folder.
dgcn_folder = data_folder.joinpath("DualGCN")
ppi_path = dgcn_folder.joinpath("MetadataPPIEdgeList.csv")
exp_path = dgcn_folder.joinpath(
    "FeatureCellToExpression689DualGCNGenesTPMLogp1.csv"
)
cnv_path = dgcn_folder.joinpath(
    "FeatureCellToCopyNumber689DualGCNGenesCNRatioLogp1.csv"
)

In [None]:
# Load the drug-side and cell-side encoders; each loader returns a pair that
# is unpacked into a `*_feat_encoder` and a `*_adj_encoder`.
drug_feat_encoder, drug_adj_encoder = dualgcn.load_drug_features(drug_path)
cell_feat_encoder, cell_adj_encoder = dualgcn.load_cell_features(
    exp_path, cnv_path, ppi_path
)

# Keyword arguments passed as `**encoders` to the dataset encoding methods.
encoders = {
    "cell_encoders": [cell_feat_encoder, cell_adj_encoder],
    "drug_encoders": [drug_feat_encoder, drug_adj_encoder],
}

ds = Dataset.from_csv(label_path, name="gdsc_v2_depmap")

# Only the first split is used, so take it with next() instead of building a
# list of every split and indexing into it.
split = next(iter(load_splits(split_folder)))

# Partition the full dataset by the ids recorded on the split.
train_ds = ds.select(split.train_ids, name="train")
val_ds = ds.select(split.val_ids, name="val")
test_ds = ds.select(split.test_ids, name="test")

In [None]:
# Normalization layer whose statistics are adapted on the encoded features of
# the training cells only (validation/test cells are not used here).
# NOTE(review): axis=(1, 2) assumes the encoded array is rank 3 -- confirm.
cell_feat_norm = keras.layers.Normalization(axis=(1, 2))
cell_feat_norm.adapt(np.array(cell_feat_encoder.encode(train_ds.cell_ids)))

# Size of the last axis of each feature encoder.
cell_dim = cell_feat_encoder.shape[-1]
drug_dim = drug_feat_encoder.shape[-1]

model = dualgcn.create_model(cell_dim, drug_dim, cell_feat_norm)

# `epsilon=None`, `decay=0.0` and `amsgrad=False` only restated defaults, and
# newer Keras releases reject `decay` and may not accept `epsilon=None`, so
# just the learning rate is passed.
model.compile(
    optimizer=keras.optimizers.Adam(learning_rate=0.001),
    loss="mean_squared_error",
    metrics=["mse", tf_metrics.pearson],
)

In [None]:
# Draw 1000 ids from the training observations and build a smaller dataset
# from them for quick experiments.
# NOTE(review): no seed is passed to `sample`, so the subset presumably
# differs between runs -- confirm whether reproducibility matters here.
samples = train_ds.obs["id"].sample(n=1000)
sample_ds = train_ds.select(samples, name="sample")
sample_ds  # last expression of the cell, so the notebook displays it

In [None]:
# Encode the sampled subset, then shuffle (buffer of 10000) and batch by 32.
sample_batches = sample_ds.encode_tf(**encoders).shuffle(10000).batch(32)

# Fit for a single epoch and keep whatever `fit` returns in `hx`.
hx = model.fit(sample_batches, epochs=1)

In [None]:
# Display the model's `inputs` attribute as the cell output.
model.inputs

In [None]:
# Only the first batch is inspected, so pull it with next() instead of
# list(gen)[0], which would produce every batch before discarding the rest.
gen = sample_ds.encode_batches(**encoders, batch_size=32, return_ids=True)
x, y, cell_ids, drug_ids = next(iter(gen))

In [None]:
# Convert every component of the batch input `x` to a NumPy array.
x_arr = list(map(np.array, x))

# Scratch check of dict merging: a literal entry combined with a mapping.
a = {0: 1, 1: 2}
{"a": 1} | a

In [None]:
# Run the model on the one batch held in `x_arr` and display the result.
model.predict_on_batch(x_arr)

In [None]:
# exp_encoder = PandasEncoder.from_csv(exp_path, index_col=0, name="exp_encoder")
# ids = ["SIDM00046", "SIDM00078", "SIDM00079", "SIDM00080", "SIDM00081"]
# temp = exp_encoder.encode_tf(ids)
# temp

In [None]:
# Scratch check: do all values of a mapping share one type?
foo = {key: [1, 2, 3] for key in (0, 1)}
len({type(value) for value in foo.values()}) == 1

In [None]:
# Reload the labelled dataset and take the first split.
ds = Dataset.from_csv(label_path, name="gdsc_v2_depmap")

# Only the first split is used, so take it with next() instead of building a
# list of every split and indexing into it.
split = next(iter(load_splits(split_folder)))

# Partition the full dataset by the ids recorded on the split.
train_ds = ds.select(split.train_ids, name="train")
val_ds = ds.select(split.val_ids, name="val")
test_ds = ds.select(split.test_ids, name="test")

In [None]:
# Call the loaders through the imported `dualgcn` module: the bare names
# `load_drug_features` / `load_cell_features` are not imported by this
# notebook's import cell, so using them directly would raise NameError.
drug_feat_encoder, drug_adj_encoder = dualgcn.load_drug_features(drug_path)
cell_feat_encoder, cell_adj_encoder = dualgcn.load_cell_features(
    exp_path, cnv_path, ppi_path
)

In [None]:
# Bundle the encoders under the keyword names expected by `encode_tf`.
encoders = dict(
    cell_encoders=[cell_feat_encoder, cell_adj_encoder],
    drug_encoders=[drug_feat_encoder, drug_adj_encoder],
)

# Encode the validation set; the bare `temp` at the end displays it.
temp = val_ds.encode_tf(**encoders)
temp

In [None]:
# Batch the encoded dataset with `batch(32)` and iterate over all of it.
temp_batch = temp.batch(32)
# The loop body is intentionally empty: once the loop finishes,
# `batch_features` and `batch_labels` still hold the last pair yielded.
# NOTE(review): if `temp_batch` yields nothing, `batch_features` is never
# bound and the final line raises NameError.
for batch_features, batch_labels in temp_batch:
    pass
batch_features  # displayed as the cell output