In [1]:
import cuml
import cudf
import cupy

import logging
import anndata as ad
import numpy as np

from scipy.sparse import csc_matrix

from sklearn.decomposition import TruncatedSVD
from sklearn.gaussian_process.kernels import RBF
from sklearn.kernel_ridge import KernelRidge
import os
from glob import glob
from tqdm import tqdm

import gc

In [2]:
# Root directory holding the predict_modality datasets.
# NOTE(review): this is a machine-specific absolute path; adjust it when running elsewhere.
path = '/raid/data/ml/nips/datasets/predict_modality'
# Show which dataset folders are available under that root.
os.listdir(path)

['openproblems_bmmc_cite_phase2_mod2',
 'openproblems_bmmc_multiome_phase2_mod2',
 'openproblems_bmmc_cite_phase2_rna',
 'openproblems_bmmc_multiome_phase2_rna']

In [3]:
# Supported prediction tasks, written as "<input modality>2<target modality>".
tasks = ["GEX2ADT", "ADT2GEX", "GEX2ATAC", "ATAC2GEX"]

# Dataset folder name (under `path`) for each task.
task2name = {
    "ADT2GEX": "openproblems_bmmc_cite_phase2_mod2",
    "GEX2ADT": "openproblems_bmmc_cite_phase2_rna",
    "ATAC2GEX": "openproblems_bmmc_multiome_phase2_mod2",
    "GEX2ATAC": "openproblems_bmmc_multiome_phase2_rna",
}

In [4]:
# Pick the task to run and one file tag to sanity-check the glob pattern.
task, tag = "GEX2ADT", "train_mod1"
name = task2name[task]

# Display the first .h5ad file in the task's folder whose name contains the tag.
glob(f'{path}/{name}/*{tag}*.h5ad')[0]

'/raid/data/ml/nips/datasets/predict_modality/openproblems_bmmc_cite_phase2_rna/openproblems_bmmc_cite_phase2_rna.censor_dataset.output_train_mod1.h5ad'

In [5]:
# The four files needed for a task: both modalities for the train and test splits.
tags = ['train_mod1', 'train_mod2', 'test_mod1', 'test_mod2']

# Map 'input_<tag>' to the first .h5ad file in the task folder matching that tag.
par = {
    f'input_{file_tag}': glob(f'{path}/{name}/*{file_tag}*.h5ad')[0]
    for file_tag in tags
}
par

{'input_train_mod1': '/raid/data/ml/nips/datasets/predict_modality/openproblems_bmmc_cite_phase2_rna/openproblems_bmmc_cite_phase2_rna.censor_dataset.output_train_mod1.h5ad',
 'input_train_mod2': '/raid/data/ml/nips/datasets/predict_modality/openproblems_bmmc_cite_phase2_rna/openproblems_bmmc_cite_phase2_rna.censor_dataset.output_train_mod2.h5ad',
 'input_test_mod1': '/raid/data/ml/nips/datasets/predict_modality/openproblems_bmmc_cite_phase2_rna/openproblems_bmmc_cite_phase2_rna.censor_dataset.output_test_mod1.h5ad',
 'input_test_mod2': '/raid/data/ml/nips/datasets/predict_modality/openproblems_bmmc_cite_phase2_rna/openproblems_bmmc_cite_phase2_rna.censor_dataset.output_test_mod2.h5ad'}

In [6]:
def print_shape(*x):
    """Print the ``.shape`` of every argument on a single line.

    Each shape is followed by one space; the line is terminated with a
    newline once all arguments have been printed.
    """
    for shaped in x:
        print(shaped.shape, end=' ')
    # Terminate the line (also emitted when no arguments were passed).
    print()

In [7]:
%%time
# Load the four AnnData files whose paths were collected in `par`.
input_train_mod1, input_train_mod2, input_test_mod1, input_test_mod2 = [
    ad.read_h5ad(par[key])
    for key in (
        'input_train_mod1',
        'input_train_mod2',
        'input_test_mod1',
        'input_test_mod2',
    )
]

print_shape(input_train_mod1, input_train_mod2, input_test_mod1, input_test_mod2)

(66175, 13953) (66175, 134) (1000, 13953) (1000, 134) 
CPU times: user 6.27 s, sys: 970 ms, total: 7.24 s
Wall time: 7.23 s


In [8]:
%%time

# Metadata to attach to the prediction once it is written out.
obs = input_test_mod1.obs
var = input_train_mod2.var
dataset_id = input_train_mod1.uns['dataset_id']

# Prediction matrix shape: rows follow test mod1, columns follow train mod2.
pred_dimx = input_test_mod1.shape[0]
pred_dimy = input_train_mod2.shape[1]

# Per-observation annotations of both training modalities; their `batch`
# column is used later to select training subsets.
feature_obs, gs_obs = input_train_mod1.obs, input_train_mod2.obs

batches = feature_obs.batch.unique().tolist()
batch_len = len(batches)

# Stack train and test mod1 into a single AnnData; the "group" column of
# .obs records which split each row came from.
input_train = ad.concat(
    {"train": input_train_mod1, "test": input_test_mod1},
    axis=0,
    join="outer",
    label="group",
    fill_value=0,
    index_unique="-",
)

CPU times: user 8.39 s, sys: 1.69 s, total: 10.1 s
Wall time: 10.1 s


In [9]:
print('Determine parameters by the modalities')
# Read the modality from the first feature of each dataset, upper-cased so it
# matches the keys of the parameter table.
# Use .iloc[0] for an explicit positional lookup: `.var` is presumably indexed
# by feature names, and plain `[0]` on a label-indexed Series relies on the
# deprecated positional fallback of Series.__getitem__.
mod1_type = input_train_mod1.var.feature_types.iloc[0].upper()
mod2_type = input_train_mod2.var.feature_types.iloc[0].upper()
mod1_type, mod2_type

Determine parameters by the modalities


('GEX', 'ADT')

In [10]:
# Per (mod1, mod2) pair: (n_mod1, n_mod2, scale, alpha).
#   n_mod1 / n_mod2 - number of TruncatedSVD components for each modality
#                     (None means: keep the raw features)
#   scale           - not used by any cell shown in this notebook
#   alpha           - `alpha` passed to KernelRidge
n_comp_dict = {
    ("GEX", "ADT"): (300, 70, 10, 0.2),
    ("ADT", "GEX"): (None, 50, 10, 0.2),
    ("GEX", "ATAC"): (1000, 50, 10, 0.1),
    ("ATAC", "GEX"): (100, 70, 10, 0.1),
}
print(f"{mod1_type}, {mod2_type}")

n_mod1, n_mod2, scale, alpha = n_comp_dict[mod1_type, mod2_type]
print(f"{n_mod1}, {n_mod2}, {scale}, {alpha}")

# Dimensionality reduction of the input data follows (truncated SVD).
print('Models using the Truncated SVD to reduce the dimension')

GEX, ADT
300, 70, 10, 0.2
Models using the Truncated SVD to reduce the dimension


In [11]:
%%time

# Densify the matrices and hand them to CuPy in column-major ('F') order.
# X comes from the concatenated train+test mod1 data, y from train mod2 only.
X = cupy.asarray(input_train.X.toarray(), order='F')
y = cupy.asarray(input_train_mod2.X.toarray(), order='F')

CPU times: user 11.8 s, sys: 5.06 s, total: 16.9 s
Wall time: 16.9 s


In [12]:
%%time

def get_mask(ds, val):
    """Return the raw ``.values`` of the element-wise comparison ``ds == val``."""
    return (ds == val).values

# mod1 features: keep the raw values when no (smaller) component count is
# configured; otherwise embed train+test jointly with truncated SVD and split
# the embedding back by the "group" label.
if n_mod1 is None or n_mod1 >= input_train.shape[1]:
    train_matrix = input_train_mod1.to_df().values.astype(np.float32)
    test_matrix = input_test_mod1.to_df().values.astype(np.float32)
else:
    embedder_mod1 = cuml.decomposition.TruncatedSVD(n_components=n_mod1)
    mod1_pca = embedder_mod1.fit_transform(X).astype(np.float32)
    train_matrix = mod1_pca[get_mask(input_train.obs['group'], 'train')]
    test_matrix = mod1_pca[get_mask(input_train.obs['group'], 'test')]

# mod2 targets: same rule, fitted on the training targets only.
if n_mod2 is None or n_mod2 >= input_train_mod2.shape[1]:
    # NOTE(review): embedder_mod2 is not created on this path, yet the KRR
    # cell reads embedder_mod2.components_ - confirm this path is never taken.
    train_gs = input_train_mod2.to_df().values.astype(np.float32)
else:
    embedder_mod2 = cuml.decomposition.TruncatedSVD(n_components=n_mod2)
    train_gs = embedder_mod2.fit_transform(y).astype(np.float32)

# Drop the large inputs that are no longer referenced.
del input_train, input_train_mod1, input_train_mod2, input_test_mod1
del X, y

CPU times: user 5.85 s, sys: 2.68 s, total: 8.54 s
Wall time: 8.53 s


In [13]:
%%time

print('Running normalization ...')

# Standardize every row of the train matrix to zero mean / unit std.
# Rows with zero std are divided by 1 instead, so they become all zeros.
train_sd = np.std(train_matrix, axis=1, keepdims=True)
train_sd[train_sd == 0] = 1
train_norm = (
    (train_matrix - np.mean(train_matrix, axis=1, keepdims=True)) / train_sd
).astype(np.float32)
del train_matrix

# Same row-wise standardization for the test matrix, using its own statistics.
test_sd = np.std(test_matrix, axis=1, keepdims=True)
test_sd[test_sd == 0] = 1
test_norm = (
    (test_matrix - np.mean(test_matrix, axis=1, keepdims=True)) / test_sd
).astype(np.float32)
del test_matrix

Running normalization ...
CPU times: user 8.83 ms, sys: 7.64 ms, total: 16.5 ms
Wall time: 17 ms


In [14]:
# Display the kernel names (and their functions) registered in cuML's pairwise-kernel table.
cuml.metrics.PAIRWISE_KERNEL_FUNCTIONS

{'linear': <function cuml.metrics.pairwise_kernels.linear_kernel(X, Y)>,
 'additive_chi2': <function cuml.metrics.pairwise_kernels.additive_chi2_kernel(X, Y)>,
 'chi2': <function cuml.metrics.pairwise_kernels.chi2_kernel(X, Y, gamma=1.0)>,
 'cosine': <function cuml.metrics.pairwise_kernels.cosine_similarity(X, Y)>,
 'laplacian': <function cuml.metrics.pairwise_kernels.laplacian_kernel(X, Y, gamma=None)>,
 'polynomial': <function cuml.metrics.pairwise_kernels.polynomial_kernel(X, Y, degree=3, gamma=None, coef0=1)>,
 'poly': <function cuml.metrics.pairwise_kernels.polynomial_kernel(X, Y, degree=3, gamma=None, coef0=1)>,
 'rbf': <function cuml.metrics.pairwise_kernels.rbf_kernel(X, Y, gamma=None)>,
 'sigmoid': <function cuml.metrics.pairwise_kernels.sigmoid_kernel(X, Y, gamma=None, coef0=1)>}

In [None]:
import cupy
from cuml.metrics import pairwise_kernels

# Scratch check: build the RBF kernel matrix of a random (33571, 300) array
# against itself ten times, printing the iteration index after each pass.
x = cupy.random.rand(33571, 300)
params = dict(gamma=None, degree=3, coef0=1)
for i in range(10):
    # filter_params=True is passed alongside kernel arguments that 'rbf' does not take.
    K = pairwise_kernels(x, x, metric='rbf', filter_params=True, **params)
    print(i)

In [15]:
%%time

print('Running KRR model ...')
# Accumulator for the summed predictions of every fit below.
y_pred = cupy.zeros((pred_dimx, pred_dimy), dtype=np.float32)

# Seed both generators. The batch split below uses np.random.shuffle, which
# draws from NumPy's global RNG; cupy.random.seed alone does not affect it,
# so without the NumPy seed the splits differ from run to run.
np.random.seed(1000)
cupy.random.seed(1000)

# Loop-invariant: SVD components used to map predictions from the reduced
# target space back to the full mod2 feature space.
mod2_components = embedder_mod2.components_.to_output('cupy')

for _ in tqdm(range(5)):
    # Random split of the batches into two halves; one model set per half.
    np.random.shuffle(batches)
    for batch in [batches[:batch_len//2], batches[batch_len//2:]]:
        # for passing the test
        if not batch:
            batch = [batches[0]]

        logging.info(batch)

        print('Fitting KRR ... ')
        # Training rows belonging to the selected batches.
        x = train_norm[feature_obs.batch.isin(batch).values]
        y = train_gs[gs_obs.batch.isin(batch).values]
        yp = []
        # One kernel ridge model per reduced target dimension.
        for i in range(y.shape[1]):
            krr = cuml.kernel_ridge.KernelRidge(alpha=alpha, kernel='rbf')
            krr.fit(x, y[:,i].ravel())
            yp.append(krr.predict(test_norm))
            gc.collect()
        # Stack the per-target predictions into (n_test, n_targets).
        yp = cupy.array(yp).T
        yp = yp @ mod2_components
        y_pred += yp

Running KRR model ...


  0%|                                                                                                                                                                    | 0/5 [00:00<?, ?it/s]

Fitting KRR ... 
Fitting KRR ... 


 20%|███████████████████████████████                                                                                                                            | 1/5 [09:51<39:24, 591.18s/it]

Fitting KRR ... 
Fitting KRR ... 


 40%|██████████████████████████████████████████████████████████████                                                                                             | 2/5 [19:29<29:11, 583.71s/it]

Fitting KRR ... 
Fitting KRR ... 


 60%|█████████████████████████████████████████████████████████████████████████████████████████████                                                              | 3/5 [29:36<19:48, 594.19s/it]

Fitting KRR ... 
Fitting KRR ... 


 80%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                               | 4/5 [39:15<09:48, 588.15s/it]

Fitting KRR ... 
Fitting KRR ... 


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5/5 [48:54<00:00, 586.80s/it]

CPU times: user 31min 5s, sys: 15min 40s, total: 46min 45s
Wall time: 48min 54s





In [16]:
# Leftover debugging cell: inside the KRR loop yp has already been multiplied
# by embedder_mod2.components_, so repeating the product here no longer has
# matching inner dimensions (hence the ValueError shown below).
yp @ embedder_mod2.components_.to_output('cupy')

ValueError: Dimension k with different lengths in arrays

In [None]:
# Debug: shape of the yp left over from the last KRR loop iteration.
yp.shape

In [None]:
# Debug: refit a single target column using the krr, x, y and i left over in the kernel.
krr.fit(x, y[:,i].ravel())

In [None]:
# Debug: predict on the normalized test features with the current krr model.
krr.predict(test_norm)

In [None]:
# Debug: take one target column of y and inspect its type and shape.
tmp = y[:,i]
type(tmp), tmp.shape

In [None]:
# Debug: compare the column slice's memory strides with the shape of y.
tmp.strides, y.shape

In [None]:
# Debug: rebuild y for the current `batch` selection and check its type and shape.
y = train_gs[gs_obs.batch.isin(batch).values]
type(y), y.shape

In [None]:
# Debug: strides of a single-column slice of y, before ravel().
tmp = y[:,i]
tmp.strides

In [None]:
# Debug: strides of the same column after flattening it with ravel().
tmp = tmp.ravel()
tmp.strides

In [None]:














# Post-process the accumulated predictions: negative values are clipped to 0.
np.clip(y_pred, a_min=0, a_max=None, out=y_pred)
if mod2_type == "ATAC":
    # NOTE(review): this [0, 1] clip is applied to the summed predictions,
    # before the division below - confirm that ordering is intended.
    np.clip(y_pred, a_min=0, a_max=1, out=y_pred)

# Average over the 10 accumulated fits (5 shuffles x 2 batch halves).
y_pred /= 10

# Store as sparse matrix to be efficient. Note that this might require
# different classifiers/embedders before-hand. Not every class is able
# to support such data structures.
# y_pred is a CuPy array at this point; SciPy cannot consume it directly
# (CuPy refuses implicit conversion to NumPy), so copy it to host memory first.
y_pred = csc_matrix(cupy.asnumpy(y_pred))

logging.info("Generate anndata object ...")
# NOTE(review): `meta` is not defined in any cell shown in this notebook -
# it must be provided before this cell can run.
adata = ad.AnnData(
    X=y_pred,
    obs=obs,
    var=var,
    uns={
        'dataset_id': dataset_id,
        'method_id': meta['functionality_name'],
    },
)

logging.info('Storing annotated data...')
# NOTE(review): `par` as built above only holds 'input_*' keys; an 'output'
# entry has to be added before this write can succeed.
adata.write_h5ad(par['output'], compression = "gzip")