In [1]:
# Based on last year's winning submission:
# https://github.com/openproblems-bio/neurips2021_multimodal_topmethods/tree/main/src/predict_modality/methods/Guanlab-dengkw/run

import logging
import utils
import anndata as ad
import pandas as pd
import numpy as np
from scipy.sparse import csc_matrix

from sklearn.decomposition import TruncatedSVD
from sklearn.gaussian_process.kernels import RBF
from sklearn.kernel_ridge import KernelRidge
from sklearn.model_selection import KFold

# Show INFO-level records so the progress messages logged by later cells are visible.
logging.basicConfig(level=logging.INFO)

- The whole notebook runs in about 1 min for 10k rows.
- A run with 100k rows failed with the error: `Canceled future for execute_request message before replies were done`

In [2]:
class ExperimentParameters:
    """Tunable settings for this notebook, read by the cells below as class attributes."""
    # Upper bound (`stop`) on the rows read from the training inputs/targets;
    # also applied to the test inputs when OUTPUT_SUBMISSION is False.
    MAX_ROWS_TRAIN = 10_000
    # When True, the test inputs are read in full instead of being capped.
    OUTPUT_SUBMISSION = True
    # Supplies the data paths (train_inputs_path, train_targets_path, test_inputs_path).
    TECHNOLOGY = utils.cite
    # Same object as TECHNOLOGY; not referenced in the cells shown here.
    REPO: utils.ExperimentRepository = utils.cite
    # n_components of the TruncatedSVD fitted on the inputs.
    INPUT_PCA_DIMS = 128
    # n_components of the TruncatedSVD fitted on the targets.
    OUTPUT_PCA_DIMS = 128
    # Number of KFold splits used for cross-validation.
    K_FOLDS = 3
    # Seed passed to np.random.seed.
    NP_RANDOM_SEED = 1000

# Seed NumPy's global random state once, up front, so anything that draws from it
# is repeatable when the cells are run in order.
np.random.seed(ExperimentParameters.NP_RANDOM_SEED)

In [3]:
logging.info('Reading `h5ad` files...')
# TODO: extract to method with params  (sparse/dense, technology)
# Training inputs and targets are both limited to the same leading row window.
train_row_window = dict(start=0, stop=ExperimentParameters.MAX_ROWS_TRAIN)
input_train = pd.read_hdf(
    ExperimentParameters.TECHNOLOGY.train_inputs_path, **train_row_window
)
output_train = pd.read_hdf(
    ExperimentParameters.TECHNOLOGY.train_targets_path, **train_row_window
)

# When producing a submission the test inputs are read in full; otherwise they are
# capped with the same row window as the training data.
test_row_window = {} if ExperimentParameters.OUTPUT_SUBMISSION else train_row_window
input_test = pd.read_hdf(
    ExperimentParameters.TECHNOLOGY.test_inputs_path, **test_row_window
)

# Shape of the prediction matrix: one row per test row, one column per target column.
pred_dim_x = input_test.shape[0]
pred_dim_y = output_train.shape[1]


INFO:root:Reading `h5ad` files...


In [4]:
# Do PCA on all input data
# Train and test inputs are stacked so both are projected by one shared fit.
n_train_rows = len(input_train)
input_pca_train = pd.concat(
    [input_train, input_test],
    axis=0,  # type: ignore
)

logging.info('Models using the Truncated SVD to reduce the dimension')

pca_input = TruncatedSVD(n_components=ExperimentParameters.INPUT_PCA_DIMS)
# TODO: float16 might be better, saw something in the forum
pca_features = pca_input.fit_transform(input_pca_train).astype(np.float32)

# Rows keep their concat order: the training rows come first, the test rows after.
pca_train_input, pca_test_input = (
    pca_features[:n_train_rows],
    pca_features[n_train_rows:],
)
assert len(pca_train_input) + len(pca_test_input) == len(pca_features)

# The raw input frames are not needed past this point.
del input_train, input_test

INFO:root:Models using the Truncated SVD to reduce the dimension


In [5]:
# Also do PCA on output (needs to be de-reduced later)
# A separate TruncatedSVD is fitted on the targets; the fitted `pca_output` is kept
# because its `components_` are used later to map predictions back to target space.
pca_output = TruncatedSVD(n_components=ExperimentParameters.OUTPUT_PCA_DIMS)
pca_train_output = pca_output.fit_transform(output_train).astype(np.float32)

# The raw targets are dropped here, so later cells only have the reduced targets.
del output_train 

In [6]:
# Last year they found row-wise normalization was helpful, though they had
# to deal with more batch effects
def row_wise_std_scaler(M):
    """
    Standardize every row of ``M`` to zero mean and unit standard deviation.

    Sklearn's StandardScaler has no row-wise option, hence this helper.
    Rows whose standard deviation is zero are divided by 1 instead, so they
    come out as all zeros rather than NaN/inf.
    """
    row_mean = np.mean(M, axis=1).reshape(-1, 1)
    row_std = np.std(M, axis=1).reshape(-1, 1)
    # Make any zero std 1 to avoid numerical problems
    row_std[row_std == 0] = 1
    centered = M - row_mean
    return centered / row_std

logging.info('Running row-wise normalization...')
# Each row's reduced (SVD) feature vector is standardized on its own.
# Possibly useful since we only care about correlation and not magnitude
# TODO: do we really want to normalize PCA input?
train_norm = row_wise_std_scaler(pca_train_input).astype(np.float32)
test_norm = row_wise_std_scaler(pca_test_input).astype(np.float32)
# The un-normalized projections are not needed once both splits are scaled.
del pca_train_input, pca_test_input

INFO:root:Running row-wise normalization...


In [7]:
from dataclasses import dataclass
from typing import List

@dataclass
class Score:
    """Result of evaluating one cross-validation fold."""
    # Value returned by utils.correlation_score for the fold's held-out rows.
    score: float

@dataclass
class ScoreSummary:
    """Container for the per-fold scores of one cross-validation run."""
    # One Score per fold, in the order the folds were evaluated.
    scores: List[Score]
   

In [8]:
logging.info('Running KRR model ...')
# TODO: research these more
# Passed as `length_scale` to the RBF kernel below.
SCALE = 10
# Passed as `alpha` to KernelRidge below.
ALPHA = .2
logging.info('Starting k-fold loop...')

# Splitter used for cross-validation over the rows of `train_norm`.
kf = KFold(n_splits=ExperimentParameters.K_FOLDS)

def report_score_summary(score):
    """Log the given score (or score summary) at INFO level."""
    message = f"Score: {score}"
    logging.info(message)
    # TODO: add results writing
    
# One shared estimator: an RBF kernel (length scale SCALE) inside a kernel ridge
# regressor (alpha ALPHA). The same `krr` object is re-fitted for every fold and
# again for the final model.
kernel = RBF(length_scale = SCALE)
krr = KernelRidge(alpha=ALPHA, kernel=kernel)  # type: ignore

# TODO: require clean history (git), write outputs to file
def fit_and_score(*,
    fold_index: int,
    train_indices: np.ndarray,
    test_indices: np.ndarray,
    pca_output,
    ) -> Score:
    """
    Fit the shared KRR model on one fold's training rows and score the held-out rows.

    fold_index: position of the fold; only used in the log message.
    train_indices: row positions of `train_norm` / `pca_train_output` to fit on.
    test_indices: row positions held out for scoring.
    pca_output: fitted reducer whose `components_` map reduced targets back to
        the full target space.

    Returns a Score wrapping `utils.correlation_score(reference, prediction)`.
    """
    logging.info(f'Fitting KRR fold {fold_index} of {ExperimentParameters.K_FOLDS}...')
    fold_inputs = train_norm[train_indices]
    fold_targets = pca_train_output[train_indices]
    krr.fit(fold_inputs, fold_targets)

    # TODO: review pca de-reduction
    components = pca_output.components_
    held_out_prediction = krr.predict(train_norm[test_indices]) @ components
    # NOTE(review): the reference is rebuilt from the *reduced* targets, not the raw
    # targets, so any information lost in the output reduction does not affect this
    # score. Confirm that this is intended.
    held_out_reference = pca_train_output[test_indices] @ components
    return Score(score=utils.correlation_score(held_out_reference, held_out_prediction))

# Evaluate every fold in order and collect one Score per fold.
scores = [
    fit_and_score(
        fold_index=fold_number,
        train_indices=fold_train_rows,
        test_indices=fold_test_rows,
        pca_output=pca_output,
    )
    for fold_number, (fold_train_rows, fold_test_rows) in enumerate(kf.split(train_norm))
]

# Last expression of the cell: displayed as the cell output.
ScoreSummary(scores=scores)


INFO:root:Running KRR model ...
INFO:root:Starting k-fold loop...
INFO:root:Fitting KRR fold 0 of 3...
INFO:root:Fitting KRR fold 1 of 3...
INFO:root:Fitting KRR fold 2 of 3...


ScoreSummary(scores=[Score(score=0.8919446084606477), Score(score=0.8924710202317526), Score(score=0.8887646403539322)])

In [9]:
# Re-fit on every training row, then predict the test rows in reduced target space.
krr.fit(train_norm, pca_train_output)
test_prediction_reduced = krr.predict(test_norm)
# Project the reduced predictions back to the full target space.
Y_hat = test_prediction_reduced @ pca_output.components_
submission_cite = utils.format_submission(Y_hat, utils.cite)

INFO:root:Loading indices...
INFO:root:Loading evaluation ids...
INFO:root:Final step: fill empty submission df...


In [11]:
# Load old submission which includes multiome tech predictions
# The first CSV column becomes the index. The `target` column of this frame is used
# later to fill the rows that the CITE submission leaves as NaN.
multi_submission = pd.read_csv(
    utils.OUTPUT_DIR / "full_64_reduced_linreg.csv",
    index_col=0
)

In [13]:
# drop multi-index to align with other submission
# `reset_index(drop=True)` discards the existing index in favour of a default integer
# index, which is what the later fill step aligns on.
# NOTE(review): assumes both submissions list their rows in the same order — confirm.
reindexed_submission_cite = pd.DataFrame(submission_cite.reset_index(drop=True))

In [43]:
def test_valid_submission(submission: pd.DataFrame):
    assert submission.index.name == 'row_id'
    assert submission.columns == ['target']
    assert len(submission) == 65744180
    assert submission['target'].isna().sum() == 0

In [63]:
# put into dataframe with proper column names
# CITE predictions take precedence; rows they leave as NaN are filled from the older
# submission. `fillna` with a Series aligns on the index and only touches NaN entries,
# so no explicit NaN mask (and no filtered copy of `multi_submission`) is needed.
merged = reindexed_submission_cite['target'].fillna(multi_submission['target'])
# `rename_axis` returns a new frame instead of mutating an Index object that other
# frames may share.
formatted_submission = merged.to_frame(name='target').rename_axis("row_id")
test_valid_submission(formatted_submission)

In [64]:
# write full predictions to csv
# The index is written as the first column of the file (to_csv default).
formatted_submission.to_csv(utils.OUTPUT_DIR / "cite_rbf_with_multi_linear.csv")