In [1]:
import os
# Restrict this process to a single GPU. This is done in the first cell,
# ahead of the cuml/cupy imports in the next cell.
GPU_id = 7
os.environ['CUDA_VISIBLE_DEVICES'] = f'{GPU_id}'

In [2]:
import cuml
import cudf
import cupy

import logging
import anndata as ad
import numpy as np

from scipy.sparse import csc_matrix

from sklearn.decomposition import TruncatedSVD
from sklearn.gaussian_process.kernels import RBF
from sklearn.kernel_ridge import KernelRidge
import os
from glob import glob
from tqdm import tqdm

import gc

In [3]:
# Root folder of the predict_modality datasets; the listing below shows one
# sub-folder per dataset variant.
path = '/raid/data/ml/nips/datasets/predict_modality'
os.listdir(path)

['openproblems_bmmc_cite_phase2_mod2',
 'openproblems_bmmc_multiome_phase2_mod2',
 'openproblems_bmmc_cite_phase2_rna',
 'openproblems_bmmc_multiome_phase2_rna']

In [4]:
# Task identifiers in "<input modality>2<target modality>" form.
tasks = ["GEX2ADT", "ADT2GEX", "GEX2ATAC", "ATAC2GEX"]

# Dataset sub-folder for each task. These are plain string literals: the
# values contain no placeholders, so an f-prefix would be misleading.
task2name = {
    "ADT2GEX": "openproblems_bmmc_cite_phase2_mod2",
    "GEX2ADT": "openproblems_bmmc_cite_phase2_rna",
    "ATAC2GEX": "openproblems_bmmc_multiome_phase2_mod2",
    "GEX2ATAC": "openproblems_bmmc_multiome_phase2_rna",
}

In [5]:
# Select the task to run; `name` is the dataset sub-folder for that task.
task = "GEX2ADT"
name = task2name[task]
# Sanity check: show the first file whose name contains one split tag.
tag = "train_mod1"
glob(f'{path}/{name}/*{tag}*.h5ad')[0]

'/raid/data/ml/nips/datasets/predict_modality/openproblems_bmmc_cite_phase2_rna/openproblems_bmmc_cite_phase2_rna.censor_dataset.output_train_mod1.h5ad'

In [6]:
# Resolve the .h5ad file of every split; keys follow the 'input_<tag>' scheme.
# For each tag the first glob match is taken.
tags = ['train_mod1', 'train_mod2', 'test_mod1', 'test_mod2']
par = dict(
    (f'input_{split}', glob(f'{path}/{name}/*{split}*.h5ad')[0])
    for split in tags
)
par

{'input_train_mod1': '/raid/data/ml/nips/datasets/predict_modality/openproblems_bmmc_cite_phase2_rna/openproblems_bmmc_cite_phase2_rna.censor_dataset.output_train_mod1.h5ad',
 'input_train_mod2': '/raid/data/ml/nips/datasets/predict_modality/openproblems_bmmc_cite_phase2_rna/openproblems_bmmc_cite_phase2_rna.censor_dataset.output_train_mod2.h5ad',
 'input_test_mod1': '/raid/data/ml/nips/datasets/predict_modality/openproblems_bmmc_cite_phase2_rna/openproblems_bmmc_cite_phase2_rna.censor_dataset.output_test_mod1.h5ad',
 'input_test_mod2': '/raid/data/ml/nips/datasets/predict_modality/openproblems_bmmc_cite_phase2_rna/openproblems_bmmc_cite_phase2_rna.censor_dataset.output_test_mod2.h5ad'}

In [7]:
def print_shape(*x):
    """Print the ``.shape`` of every argument on a single line.

    Each shape is followed by one space; the line ends with a newline.
    """
    print(*(f'{item.shape} ' for item in x), sep='')

In [8]:
%%time
# Read the four AnnData files in this order:
# train mod1, train mod2, test mod1, test mod2.
input_train_mod1, input_train_mod2, input_test_mod1, input_test_mod2 = (
    ad.read_h5ad(par[key])
    for key in ('input_train_mod1', 'input_train_mod2',
                'input_test_mod1', 'input_test_mod2')
)

print_shape(input_train_mod1, input_train_mod2, input_test_mod1, input_test_mod2)

(66175, 13953) (66175, 134) (1000, 13953) (1000, 134) 
CPU times: user 6.44 s, sys: 844 ms, total: 7.28 s
Wall time: 7.28 s


In [9]:
%%time

# Shape of the prediction matrix: rows of the mod1 test set x columns of mod2.
pred_dimx, pred_dimy = input_test_mod1.shape[0], input_train_mod2.shape[1]

# Per-row metadata of both training modalities (used later to select rows by batch).
feature_obs, gs_obs = input_train_mod1.obs, input_train_mod2.obs

# Distinct batch labels of the training set.
batches = input_train_mod1.obs.batch.unique().tolist()
batch_len = len(batches)

# Metadata reused when the prediction AnnData is assembled.
obs, var = input_test_mod1.obs, input_train_mod2.var
dataset_id = input_train_mod1.uns['dataset_id']

# Stack train and test mod1; the 'group' column records each row's origin.
input_train = ad.concat(
    dict(train=input_train_mod1, test=input_test_mod1),
    axis=0, join="outer", label="group", fill_value=0, index_unique="-",
)

CPU times: user 8.63 s, sys: 1.71 s, total: 10.3 s
Wall time: 10.3 s


In [10]:
print('Determine parameters by the modalities')
# Read the first feature type by position. `.iloc[0]` is explicit about that;
# plain `[0]` on a Series whose index is not integer-based (AnnData `var` is
# indexed by feature name) relies on pandas' deprecated positional fallback.
mod1_type = input_train_mod1.var.feature_types.iloc[0].upper()
mod2_type = input_train_mod2.var.feature_types.iloc[0].upper()
mod1_type, mod2_type

Determine parameters by the modalities


('GEX', 'ADT')

In [11]:
# Hyper-parameters keyed by (input modality, target modality):
#   (SVD components for mod1, SVD components for mod2, scale, KRR alpha).
# A component count of None means the SVD is skipped for that modality.
n_comp_dict = {
    ("GEX", "ADT"): (300, 70, 10, 0.2),
    ("ADT", "GEX"): (None, 50, 10, 0.2),
    ("GEX", "ATAC"): (1000, 50, 10, 0.1),
    ("ATAC", "GEX"): (100, 70, 10, 0.1),
}
print(mod1_type, mod2_type, sep=", ")
n_mod1, n_mod2, scale, alpha = n_comp_dict[mod1_type, mod2_type]
print(n_mod1, n_mod2, scale, alpha, sep=", ")

# Do PCA on the input data
print('Models using the Truncated SVD to reduce the dimension')

GEX, ADT
300, 70, 10, 0.2
Models using the Truncated SVD to reduce the dimension


In [12]:
%%time

# Densify both matrices and hand them to CuPy in column-major ('F') order.
# X holds the concatenated train + test rows of modality 1; y holds the
# training targets of modality 2.
X = cupy.asarray(input_train.X.toarray(), order='F')
y = cupy.asarray(input_train_mod2.X.toarray(), order='F')

CPU times: user 12.1 s, sys: 5.13 s, total: 17.2 s
Wall time: 17.2 s


In [13]:
%%time

def get_mask(ds, val):
    """Return a boolean array flagging the entries of ``ds`` equal to ``val``."""
    return (ds == val).values

# Reduce modality 1 with a truncated SVD fitted on X (train and test rows
# together), then split the embedding back into train/test via the 'group' column.
if n_mod1 is not None and n_mod1 < input_train.shape[1]:
    embedder_mod1 = cuml.decomposition.TruncatedSVD(n_components=n_mod1)
    mod1_pca = embedder_mod1.fit_transform(X).astype(np.float32)
    train_matrix = mod1_pca[get_mask(input_train.obs['group'],'train')]
    test_matrix = mod1_pca[get_mask(input_train.obs['group'],'test')]
else:
    # No reduction requested, or fewer features than components: use raw values.
    train_matrix = input_train_mod1.to_df().values.astype(np.float32)
    test_matrix = input_test_mod1.to_df().values.astype(np.float32)

# Reduce the training targets (modality 2) the same way.
if n_mod2 is not None and n_mod2 < input_train_mod2.shape[1]:
    embedder_mod2 = cuml.decomposition.TruncatedSVD(n_components=n_mod2)
    train_gs = embedder_mod2.fit_transform(y).astype(np.float32)
else:
    # NOTE(review): `embedder_mod2` is never assigned on this path, yet the KRR
    # cell reads `embedder_mod2.components_` — confirm this branch is unreachable
    # for the supported tasks, or handle it there.
    train_gs = input_train_mod2.to_df().values.astype(np.float32)

# Drop the large inputs and dense GPU copies to free memory.
del input_train
del input_train_mod1
del input_train_mod2
del input_test_mod1
del X, y
gc.collect()

CPU times: user 5.95 s, sys: 2.73 s, total: 8.68 s
Wall time: 8.66 s


285

In [14]:
%%time

print('Running normalization ...')

# Standardise every row of the training matrix: subtract the row mean and
# divide by the row standard deviation (rows with zero deviation divide by 1).
train_sd = np.std(train_matrix, axis=1, keepdims=True)
train_sd[train_sd == 0] = 1
train_norm = (
    (train_matrix - np.mean(train_matrix, axis=1, keepdims=True)) / train_sd
).astype(np.float32)
del train_matrix

# Same per-row standardisation for the test matrix, using its own statistics.
test_sd = np.std(test_matrix, axis=1, keepdims=True)
test_sd[test_sd == 0] = 1
test_norm = (
    (test_matrix - np.mean(test_matrix, axis=1, keepdims=True)) / test_sd
).astype(np.float32)
del test_matrix

Running normalization ...
CPU times: user 14.7 ms, sys: 4.81 ms, total: 19.5 ms
Wall time: 22.6 ms


In [15]:
%%time

print('Running KRR model ...')
# Accumulator for the summed predictions of all fitted models.
y_pred = cupy.zeros((pred_dimx, pred_dimy), dtype=np.float32)
# Seed both generators. The batch shuffle below draws from NumPy's global RNG,
# so seeding CuPy alone left the batch splits different on every run.
np.random.seed(1000)
cupy.random.seed(1000)
# Components of the mod2 SVD; multiplying by them maps predictions made in the
# reduced space back to the original mod2 features.
emb = embedder_mod2.components_.to_output('cupy')
for _ in tqdm(range(5)):
    # Each round reshuffles the batches and fits one model per half.
    np.random.shuffle(batches)
    for batch in [batches[:batch_len//2], batches[batch_len//2:]]:
        # for passing the test
        if not batch:
            batch = [batches[0]]

        logging.info(batch)
        print('Fitting KRR ... ')
        krr = cuml.kernel_ridge.KernelRidge(alpha=alpha, kernel='rbf')
        krr.fit(train_norm[feature_obs.batch.isin(batch).values],
                train_gs[gs_obs.batch.isin(batch).values])
        gc.collect()
        y_pred += (krr.predict(test_norm) @ emb)
        gc.collect()

Running KRR model ...


  0%|                                                                                                                                                                    | 0/5 [00:00<?, ?it/s]

Fitting KRR ... 
Fitting KRR ... 


 20%|███████████████████████████████▏                                                                                                                            | 1/5 [00:08<00:32,  8.13s/it]

Fitting KRR ... 
Fitting KRR ... 


 40%|██████████████████████████████████████████████████████████████▍                                                                                             | 2/5 [00:16<00:24,  8.01s/it]

Fitting KRR ... 
Fitting KRR ... 


 60%|█████████████████████████████████████████████████████████████████████████████████████████████▌                                                              | 3/5 [00:23<00:15,  7.85s/it]

Fitting KRR ... 
Fitting KRR ... 


 80%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                               | 4/5 [00:31<00:07,  7.90s/it]

Fitting KRR ... 
Fitting KRR ... 


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:39<00:00,  7.99s/it]

CPU times: user 26.7 s, sys: 13.1 s, total: 39.8 s
Wall time: 40 s





In [16]:
# Copy the accumulated predictions from the CuPy array into a NumPy array.
# Note: this makes the cell non-rerunnable, since the NumPy result has no .get().
y_pred = y_pred.get()
# Clip negative values to zero, in place.
np.clip(y_pred, a_min=0, a_max=None, out=y_pred)
if mod2_type == "ATAC":
    # NOTE(review): this caps the *summed* predictions at 1 before the division
    # below, so ATAC outputs end up <= 0.1 — confirm whether the cap was meant
    # to be applied after averaging.
    np.clip(y_pred, a_min=0, a_max=1, out=y_pred)

# Turn the sum into an average. NOTE(review): 10 presumably equals the number
# of models fitted in the KRR loop (5 rounds x 2 halves); it is hard-coded
# here, so keep the two in sync.
y_pred /= 10

# Store as sparse matrix to be efficient. Note that this might require
# different classifiers/embedders before-hand. Not every class is able
# to support such data structures.
y_pred = csc_matrix(y_pred)

logging.info("Generate anndata object ...")
# Predictions carry the test-set `obs` and the mod2 `var` captured earlier.
adata = ad.AnnData(
    X=y_pred,
    obs=obs,
    var=var,
    uns={
        'dataset_id': dataset_id,
        'method_id': 'gpu',
    },
)

logging.info('Storing annotated data...')
# Written to the current working directory.
adata.write_h5ad('gpu.h5ad', compression = "gzip")

### Evaluation

In [22]:
# Root-mean-square error between the dense predictions and the mod2 test values.
yx = y_pred.toarray()
yte = input_test_mod2.X.toarray()
score = np.mean((yx - yte) ** 2) ** 0.5
print(f'RMSE {score:.4f}')

RMSE 0.3867


The score is slightly worse than the baseline, but the gap is small. This score still ranks first in the `GEX2ADT` task, as shown on the [leaderboard](https://eval.ai/web/challenges/challenge-page/1111/leaderboard/2860).