In [1]:
# Based on last year's winning submission:
# https://github.com/openproblems-bio/neurips2021_multimodal_topmethods/tree/main/src/predict_modality/methods/Guanlab-dengkw/run

import logging
import utils
import anndata as ad
import pandas as pd
import numpy as np
from scipy.sparse import csc_matrix

from sklearn.decomposition import TruncatedSVD
from sklearn.gaussian_process.kernels import RBF
from sklearn.kernel_ridge import KernelRidge

# Configure the root logger at INFO level so the `logging.info(...)` progress
# messages emitted by the cells below are shown in the notebook output.
logging.basicConfig(level=logging.INFO)

- The whole notebook runs in about 1 min for 10k rows.
- With 100k rows it failed with the error: `Canceled future for execute_request message before replies were done`

In [17]:
# TODO: look up viash:  https://viash.io/
# Paths of the HDF5 files holding the training inputs, training targets and
# test inputs (relative to the notebook's working directory).
par = {
    'input_train': '../data/original/train_cite_inputs.h5',
    'output_train': '../data/original/train_cite_targets.h5',
    'input_test': '../data/original/test_cite_inputs.h5',
}

# Maximum number of rows read from each file (passed as `stop` to read_hdf).
LIMIT = 100000

# The inputs are plain `.h5` files read through pandas, not AnnData `h5ad`.
logging.info('Reading `h5` files...')
input_train = pd.read_hdf(par['input_train'], start=0, stop=LIMIT)
output_train = pd.read_hdf(par['output_train'], start=0, stop=LIMIT)
input_test = pd.read_hdf(par['input_test'], start=0, stop=LIMIT)

# Fail early if inputs and targets do not pair up row for row; otherwise the
# mismatch would only surface much later, when the regressor is fitted.
assert len(input_train) == len(output_train), (
    f'train inputs ({len(input_train)} rows) and targets '
    f'({len(output_train)} rows) must have the same number of rows'
)

# Shape of the final prediction matrix: one row per test row, one column per
# target column.
pred_dim_x = input_test.shape[0]
pred_dim_y = output_train.shape[1]


INFO:root:Reading `h5ad` files...


In [18]:
# Stack train and test rows so that one shared SVD basis is fitted on all
# available input data.
input_pca_train = pd.concat(
    [input_train, input_test],
    axis=0,  # type: ignore
)

# Do PCA-style dimensionality reduction on all input data
logging.info('Models using the Truncated SVD to reduce the dimension')

PCA_COMPONENTS_INPUT = 128
# TruncatedSVD uses a randomized solver by default. Pin its seed so the
# projection is the same on every run; 1000 is the value this notebook
# passes to np.random.seed in the KRR cell.
pca_input = TruncatedSVD(n_components=PCA_COMPONENTS_INPUT, random_state=1000)
pca_features = pca_input.fit_transform(input_pca_train).astype(np.float32)
pca_train_input = pca_features[:len(input_train)]  # First len(input_train) rows are input_train
pca_test_input = pca_features[len(input_train):]  # Last len(input_test) rows are input_test
# Check each split against its source frame rather than against each other.
assert len(pca_train_input) == len(input_train)
assert len(pca_test_input) == len(input_test)

# Release the raw frames; only the reduced features are used from here on.
# NOTE: this makes the cell non-idempotent - re-run the loading cell first.
del input_train
del input_test

INFO:root:Models using the Truncated SVD to reduce the dimension


In [19]:
PCA_COMPONENTS_OUTPUT = 128
# TruncatedSVD uses a randomized solver by default. Pin its seed so the target
# embedding - and `pca_output.components_`, which maps predictions back to the
# original target columns - is the same on every run. 1000 is the value this
# notebook passes to np.random.seed in the KRR cell.
pca_output = TruncatedSVD(n_components=PCA_COMPONENTS_OUTPUT, random_state=1000)
pca_train_output = pca_output.fit_transform(output_train).astype(np.float32)

# Release the raw targets; only their reduced representation is fitted on.
del output_train

In [20]:
def row_wise_std_scaler(M):
    """Standardize every row of ``M`` to zero mean and unit standard deviation.

    Rows whose standard deviation is exactly zero are divided by 1 instead,
    so a constant row comes out as all zeros rather than NaN/inf.
    """
    row_means = np.mean(M, axis=1).reshape(-1, 1)
    row_stds = np.std(M, axis=1).reshape(-1, 1)
    # Swap zero spreads for 1 so the division below is well defined.
    zero_spread = row_stds == 0
    row_stds[zero_spread] = 1
    centered = M - row_means
    return centered / row_stds

logging.info('Running row-wise normalization...')
# Standardize each row (one sample) across its SVD components: the arrays
# passed in here are the TruncatedSVD projections, not the raw input columns.
# Possibly useful since we only care about correlation and not magnitude
# TODO: do we really want to normalize PCA input?
train_norm = row_wise_std_scaler(pca_train_input).astype(np.float32)
# Each un-normalized array is dropped as soon as its normalized copy exists,
# so both versions of the same split are not kept alive longer than needed.
del pca_train_input 
test_norm = row_wise_std_scaler(pca_test_input).astype(np.float32)
del pca_test_input 

INFO:root:Running row-wise normalization...


In [21]:
logging.info('Running KRR model ...')
# Seeds NumPy's global RNG for anything that runs after this point.
# NOTE(review): the kernel ridge fit below is presumably deterministic, so
# this seed likely has no effect on this cell's result - confirm.
np.random.seed(1000)

# TODO: research these more
SCALE = 10   # RBF kernel length scale
ALPHA = .2   # ridge regularization strength

kernel = RBF(length_scale=SCALE)
krr = KernelRidge(alpha=ALPHA, kernel=kernel)  # type: ignore
logging.info('Fitting KRR ... ')
# Regress the reduced targets on the normalized reduced inputs.
krr.fit(train_norm, pca_train_output)
# TODO: review pca de-reduction
# Predict in the reduced target space, then map back to the original target
# columns by multiplying with the output SVD components. Assigned directly:
# with a single model there is no need for a zero-filled accumulator.
y_pred = (krr.predict(test_norm) @ pca_output.components_).astype(
    np.float32, copy=False
)
assert y_pred.shape == (pred_dim_x, pred_dim_y), (
    f'unexpected prediction shape {y_pred.shape}, '
    f'expected {(pred_dim_x, pred_dim_y)}'
)

# Convert to a SciPy CSC matrix, the container this notebook hands on.
# NOTE(review): the predictions are a dense float matrix, so the sparse format
# probably does not save memory here - confirm it is required downstream.
# Note that this might require different classifiers/embedders before-hand.
# Not every class is able to support such data structures.
y_pred = csc_matrix(y_pred)

INFO:root:Running KRR model ...
INFO:root:Fitting KRR ... 


: 

: 