In [1]:
import logging
import anndata as ad
import numpy as np

from scipy.sparse import csc_matrix

from sklearn.decomposition import TruncatedSVD
from sklearn.gaussian_process.kernels import RBF
from sklearn.kernel_ridge import KernelRidge
import os
from glob import glob
from tqdm import tqdm

In [2]:
# Root directory that holds one sub-folder per predict-modality dataset.
# It can be overridden with the PREDICT_MODALITY_DIR environment variable so
# the notebook is not tied to a single machine; the default is the original
# location.
path = os.environ.get(
    'PREDICT_MODALITY_DIR',
    '/raid/data/ml/nips/datasets/predict_modality',
)
# Show which dataset folders are available under the root.
os.listdir(path)

['openproblems_bmmc_cite_phase2_mod2',
 'openproblems_bmmc_multiome_phase2_mod2',
 'openproblems_bmmc_cite_phase2_rna',
 'openproblems_bmmc_multiome_phase2_rna']

In [3]:
# Supported prediction tasks, written as "<input modality>2<target modality>".
tasks = ["GEX2ADT", "ADT2GEX", "GEX2ATAC", "ATAC2GEX"]

# Maps every task to the name of its dataset folder under `path`.
# These are plain string literals: nothing is interpolated, so no f-prefix.
task2name = {
    "ADT2GEX": "openproblems_bmmc_cite_phase2_mod2",
    "GEX2ADT": "openproblems_bmmc_cite_phase2_rna",
    "ATAC2GEX": "openproblems_bmmc_multiome_phase2_mod2",
    "GEX2ATAC": "openproblems_bmmc_multiome_phase2_rna",
}

In [4]:
# Select the task to run and resolve the dataset folder that belongs to it.
task = "GEX2ADT"
name = task2name[task]
tag = "train_mod1"
# Sanity check: show the first .h5ad file in that folder whose name
# contains the tag.
glob(os.path.join(path, name, f'*{tag}*.h5ad'))[0]

'/raid/data/ml/nips/datasets/predict_modality/openproblems_bmmc_cite_phase2_rna/openproblems_bmmc_cite_phase2_rna.censor_dataset.output_train_mod1.h5ad'

In [5]:
# One input file per split/modality combination, stored under the key
# 'input_<tag>'; the first glob match is used for each tag.
tags = ['train_mod1', 'train_mod2', 'test_mod1', 'test_mod2']
par = dict(
    (f'input_{tag}', glob(f'{path}/{name}/*{tag}*.h5ad')[0])
    for tag in tags
)
# Name of the file the predictions are written to.
par['output'] = 'baseline'
par

{'input_train_mod1': '/raid/data/ml/nips/datasets/predict_modality/openproblems_bmmc_cite_phase2_rna/openproblems_bmmc_cite_phase2_rna.censor_dataset.output_train_mod1.h5ad',
 'input_train_mod2': '/raid/data/ml/nips/datasets/predict_modality/openproblems_bmmc_cite_phase2_rna/openproblems_bmmc_cite_phase2_rna.censor_dataset.output_train_mod2.h5ad',
 'input_test_mod1': '/raid/data/ml/nips/datasets/predict_modality/openproblems_bmmc_cite_phase2_rna/openproblems_bmmc_cite_phase2_rna.censor_dataset.output_test_mod1.h5ad',
 'input_test_mod2': '/raid/data/ml/nips/datasets/predict_modality/openproblems_bmmc_cite_phase2_rna/openproblems_bmmc_cite_phase2_rna.censor_dataset.output_test_mod2.h5ad',
 'output': 'baseline'}

In [6]:
def print_shape(*x):
    """Print the ``.shape`` of every argument on a single line.

    Each shape is followed by one space and the line is terminated with a
    newline; calling the function without arguments prints an empty line.
    """
    print(''.join(str(item.shape) + ' ' for item in x))

In [7]:
%%time
# Read the four AnnData inputs in a fixed order:
# train mod1, train mod2, test mod1, test mod2.
(input_train_mod1,
 input_train_mod2,
 input_test_mod1,
 input_test_mod2) = (
    ad.read_h5ad(par[f'input_{split}'])
    for split in ('train_mod1', 'train_mod2', 'test_mod1', 'test_mod2')
)

# Quick look at the dimensions of everything that was loaded.
print_shape(input_train_mod1, input_train_mod2, input_test_mod1, input_test_mod2)

(66175, 13953) (66175, 134) (1000, 13953) (1000, 134) 
CPU times: user 6.31 s, sys: 892 ms, total: 7.2 s
Wall time: 7.2 s


In [8]:
%%time

# Shape of the prediction matrix: one row per test observation and one
# column per mod2 variable.
pred_dimx, pred_dimy = input_test_mod1.shape[0], input_train_mod2.shape[1]

# Observation tables of the two training sets; they are kept because the
# AnnData objects themselves are deleted later to free memory.
feature_obs, gs_obs = input_train_mod1.obs, input_train_mod2.obs

# Distinct batch labels present in the training observations.
batches = feature_obs.batch.unique().tolist()
batch_len = len(batches)

# Metadata that is attached to the output AnnData object.
obs = input_test_mod1.obs
var = input_train_mod2.var
dataset_id = input_train_mod1.uns['dataset_id']

# Stack train and test mod1 row-wise so both can be embedded together.
# The 'group' column records where each row came from and the source label
# is appended to the observation names with '-' as separator.
input_train = ad.concat(
    {"train": input_train_mod1, "test": input_test_mod1},
    axis=0,
    join="outer",
    fill_value=0,
    label="group",
    index_unique="-",
)

CPU times: user 8.84 s, sys: 1.73 s, total: 10.6 s
Wall time: 10.6 s


In [9]:
print('Determine parameters by the modalities')
# The modality of each dataset is read from the `feature_types` entry of its
# first variable and upper-cased so it can be used as a lookup key.
# `.iloc[0]` makes the positional access explicit; a bare `[0]` depends on
# pandas' deprecated positional fallback whenever the index is not integer
# based (presumably the case here, since var is indexed by feature name --
# verify against the data).
mod1_type = input_train_mod1.var.feature_types.iloc[0].upper()
mod2_type = input_train_mod2.var.feature_types.iloc[0].upper()
mod1_type, mod2_type

Determine parameters by the modalities


('GEX', 'ADT')

In [10]:
# Hyper-parameters per (mod1, mod2) modality pair, in this order:
#   (SVD components for mod1, SVD components for mod2,
#    RBF length scale, kernel ridge alpha).
# A component count of None means that modality is used without reduction.
n_comp_dict = {
    ("GEX", "ADT"): (300, 70, 10, 0.2),
    ("ADT", "GEX"): (None, 50, 10, 0.2),
    ("GEX", "ATAC"): (1000, 50, 10, 0.1),
    ("ATAC", "GEX"): (100, 70, 10, 0.1),
}
logging.info(f"{mod1_type}, {mod2_type}")
n_mod1, n_mod2, scale, alpha = n_comp_dict[mod1_type, mod2_type]
logging.info(f"{n_mod1}, {n_mod2}, {scale}, {alpha}")

# The dimensionality reduction of the inputs is done with a truncated SVD.
logging.info('Models using the Truncated SVD to reduce the dimension')

In [11]:
# Echo the detected modality pair (same text that is passed to logging.info
# when the hyper-parameters are looked up).
print(f"{mod1_type}, {mod2_type}")

GEX, ADT


In [12]:
# Echo the selected hyper-parameters: SVD components for mod1 and mod2,
# RBF length scale and kernel ridge alpha.
print(f"{n_mod1}, {n_mod2}, {scale}, {alpha}")

300, 70, 10, 0.2


In [13]:
%%time
# TruncatedSVD uses a randomized solver by default; without an explicit seed
# the embeddings (and therefore every downstream prediction) change from run
# to run. Fix the seed so the notebook is reproducible.
SVD_SEED = 1000

# --- mod1 (model input) -----------------------------------------------------
# Reduce train and test rows together so both live in the same embedding,
# then split them again using the 'group' label added by ad.concat.
if n_mod1 is not None and n_mod1 < input_train.shape[1]:
    embedder_mod1 = TruncatedSVD(n_components=n_mod1, random_state=SVD_SEED)
    mod1_pca = embedder_mod1.fit_transform(input_train.X).astype(np.float32)
    group = input_train.obs['group'].to_numpy()
    train_matrix = mod1_pca[group == 'train']
    test_matrix = mod1_pca[group == 'test']
else:
    # No reduction requested (or not possible): use the raw features.
    # Bind the name anyway so a stale embedder from an earlier run cannot
    # be picked up by mistake.
    embedder_mod1 = None
    train_matrix = input_train_mod1.to_df().values.astype(np.float32)
    test_matrix = input_test_mod1.to_df().values.astype(np.float32)

# --- mod2 (regression target) -----------------------------------------------
if n_mod2 is not None and n_mod2 < input_train_mod2.shape[1]:
    embedder_mod2 = TruncatedSVD(n_components=n_mod2, random_state=SVD_SEED)
    train_gs = embedder_mod2.fit_transform(input_train_mod2.X).astype(np.float32)
else:
    # Targets are kept in their original feature space; there is no
    # embedder to project predictions back with.
    embedder_mod2 = None
    train_gs = input_train_mod2.to_df().values.astype(np.float32)

# The AnnData objects are no longer needed; drop them to free memory.
del input_train
del input_train_mod1
del input_train_mod2
del input_test_mod1

CPU times: user 7min 27s, sys: 3min 6s, total: 10min 34s
Wall time: 1min 54s


In [14]:
%%time

def _standardize_rows(matrix):
    """Standardize every row of a 2-D array to zero mean and unit std.

    Rows whose standard deviation is 0 are only centred (their divisor is
    replaced by 1 to avoid a division by zero). The result is returned as
    float32; the input array is not modified.
    """
    row_sd = np.std(matrix, axis=1).reshape(-1, 1)
    row_sd[row_sd == 0] = 1
    centered = matrix - np.mean(matrix, axis=1).reshape(-1, 1)
    return (centered / row_sd).astype(np.float32)


print('Running normalization ...')
# Train and test go through the same per-row standardization; the
# un-normalized matrices are deleted right away to free memory.
train_norm = _standardize_rows(train_matrix)
del train_matrix

test_norm = _standardize_rows(test_matrix)
del test_matrix

Running normalization ...
CPU times: user 87 ms, sys: 75.6 ms, total: 163 ms
Wall time: 160 ms


In [15]:
%%time

print('Running KRR model ...')
# y_pred accumulates the *sum* of the predictions of all fits
# (5 shuffles x 2 batch halves = 10 fits); it is averaged afterwards.
y_pred = np.zeros((pred_dimx, pred_dimy), dtype=np.float32)
# Seed the global NumPy RNG so the batch shuffles are reproducible.
np.random.seed(1000)

for _ in tqdm(range(5)):
    # Re-split the batches into two random halves on every repetition.
    np.random.shuffle(batches)
    for batch in [batches[:batch_len//2], batches[batch_len//2:]]:
        # With a single batch the first half is empty; fall back to that
        # one batch so the fit still has data (needed for passing the test).
        if not batch:
            batch = [batches[0]]

        logging.info(batch)
        kernel = RBF(length_scale=scale)
        krr = KernelRidge(alpha=alpha, kernel=kernel)
        logging.info('Fitting KRR ... ')
        # Fit on the rows that belong to the selected batches only.
        krr.fit(train_norm[feature_obs.batch.isin(batch)],
                train_gs[gs_obs.batch.isin(batch)])

        fold_pred = krr.predict(test_norm)
        # Project back to the original mod2 feature space only when the
        # targets were reduced with the SVD. If they were not, the
        # prediction already has pred_dimy columns and there is no fitted
        # embedder to multiply with.
        if fold_pred.shape[1] != pred_dimy:
            fold_pred = fold_pred @ embedder_mod2.components_
        y_pred += fold_pred

Running KRR model ...


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5/5 [1:09:01<00:00, 828.34s/it]

CPU times: user 7h 34min 13s, sys: 2h 6min 31s, total: 9h 40min 44s
Wall time: 1h 9min 1s





In [16]:
# Clip the accumulated predictions in place: negatives are never valid and
# ATAC targets are additionally capped at 1. One clip with a
# modality-dependent upper bound is equivalent to clipping at 0 first and
# to [0, 1] afterwards.
# NOTE(review): for ATAC the cap is applied to the *sum* of the fits, before
# the division below, so the final values cannot exceed 0.1 -- confirm that
# this is intended.
upper_bound = 1 if mod2_type == "ATAC" else None
np.clip(y_pred, a_min=0, a_max=upper_bound, out=y_pred)

# y_pred is the sum over the 10 KRR fits (5 shuffles x 2 batch halves);
# dividing by 10 turns it into their average.
y_pred /= 10

# Store as sparse matrix to be efficient. Note that this might require
# different classifiers/embedders before-hand. Not every class is able
# to support such data structures.
y_pred = csc_matrix(y_pred)

logging.info("Generate anndata object ...")
adata = ad.AnnData(
    X=y_pred,
    obs=obs,
    var=var,
    uns={
        'dataset_id': dataset_id,
        'method_id': 'baseline',
    },
)

logging.info('Storing annotated data...')
# Write to the configured output path instead of repeating the literal.
adata.write_h5ad(par['output'], compression="gzip")

In [17]:
# Dense copy of the sparse prediction matrix, for inspection.
yx = y_pred.toarray()

In [18]:
# Display the dense predictions (NumPy abbreviates the repr of large arrays).
yx

array([[0.35220775, 0.3766317 , 1.3321805 , ..., 0.6887847 , 0.8215664 ,
        1.060226  ],
       [0.11248662, 0.48883766, 1.3249539 , ..., 0.3920132 , 0.7256106 ,
        0.43536562],
       [0.07785395, 0.28895217, 1.3754098 , ..., 0.42136812, 0.8433135 ,
        0.29659823],
       ...,
       [0.15586874, 0.41980872, 1.0597835 , ..., 0.4836412 , 0.7209059 ,
        0.50454247],
       [0.15739742, 0.30455944, 1.1269591 , ..., 0.26266563, 0.7777873 ,
        0.39446548],
       [0.11088365, 0.53968465, 1.315345  , ..., 0.42454523, 0.79862225,
        0.6007253 ]], dtype=float32)