In [14]:
# Based on last year's winning submission:
# https://github.com/openproblems-bio/neurips2021_multimodal_topmethods/tree/main/src/predict_modality/methods/Guanlab-dengkw/run

import logging
import utils
import pandas as pd
import numpy as np
import scipy as sp
from dataclasses import dataclass
from scipy.sparse import csc_matrix
from prefect import flow

from sklearn.decomposition import TruncatedSVD
from sklearn.gaussian_process.kernels import RBF
from sklearn.kernel_ridge import KernelRidge
from sklearn.model_selection import KFold

# Show INFO-level records so the progress messages logged by later cells appear.
logging.basicConfig(level=logging.INFO)


@dataclass
class LastYearRBFExperiment(
    utils.ExperimentParameters,
    utils.PCAInputs,
    utils.PCATargets,
    utils.KFold,
):
    """Parameter bundle for the RBF kernel-ridge experiment in this notebook.

    The class declares no fields of its own; everything comes from the
    ``utils`` bases listed above. The notebook constructs it with
    ``MAX_ROWS_TRAIN``, ``OUTPUT_SUBMISSION``, ``TECHNOLOGY``,
    ``INPUTS_PCA_DIMS``, ``TARGETS_PCA_DIMS`` and ``K_FOLDS``.
    """

In [5]:
# Full-size configuration: selected when IS_TEST is false; OUTPUT_SUBMISSION
# enables the submission-writing cell at the end of the notebook.
submit = LastYearRBFExperiment(
    MAX_ROWS_TRAIN=25_000,
    OUTPUT_SUBMISSION=True,
    TECHNOLOGY=utils.multi,
    INPUTS_PCA_DIMS=128,
    TARGETS_PCA_DIMS=128,
    K_FOLDS=3,
)

# Small smoke-test configuration: selected when IS_TEST is true; no submission
# file is written.
# NOTE(review): this name shadows the standard-library `unittest` module for
# the rest of the notebook; it is kept because a later cell reads it.
unittest = LastYearRBFExperiment(
    MAX_ROWS_TRAIN=1000,
    OUTPUT_SUBMISSION=False,
    TECHNOLOGY=utils.cite,
    INPUTS_PCA_DIMS=10,
    TARGETS_PCA_DIMS=10,
    K_FOLDS=3,
)

# Seed NumPy's global random state with the project-wide seed.
np.random.seed(utils.ExperimentParameters.NP_RANDOM_SEED)

In [6]:
# Parameter cell for papermill, do not merge or delete.
# IS_TEST selects the experiment configuration used by the rest of the
# notebook: True -> the small `unittest` settings, False -> the full `submit`
# settings.
IS_TEST = False 

In [7]:
# Choose the active configuration: the small smoke-test settings when IS_TEST
# is set, otherwise the full submission settings.
EXPERIMENT = unittest if IS_TEST else submit

# Load the data for the chosen experiment and expose the three pieces used
# below under their own names.
datasets = utils.load_sparse_values_data(EXPERIMENT)
inputs_train = datasets.inputs_train
targets_train = datasets.targets_train
inputs_test = datasets.inputs_test

INFO:root:Reading `.sparse.npz` files...
INFO:root:Reading `hd5 targets` files...


In [8]:
# Reduce the dimensionality of all input data (train and test rows together)
# with a truncated SVD.
#
# Build a cache filename from the experiment parameters.
# NOTE(review): `cache_name` is only computed here; nothing in this cell reads
# or writes that path, so the reduction is recomputed on every run.
cache_name =  utils.REDUCED_DIR / "-".join([
    EXPERIMENT.TECHNOLOGY.name, 
    str(EXPERIMENT.MAX_ROWS_TRAIN), "rows", 
    "submit" if EXPERIMENT.OUTPUT_SUBMISSION else "",
    "-input_pca-" + str(EXPERIMENT.INPUTS_PCA_DIMS) + ".pkl"
])

# Stack train rows on top of test rows so one decomposition is fitted on both.
input_pca_train = sp.sparse.vstack([inputs_train, inputs_test])
logging.info('Models using the Truncated SVD to reduce the dimensionality')

pca_input = TruncatedSVD(n_components=EXPERIMENT.INPUTS_PCA_DIMS)
# TODO: float16 might be better, saw something in the forum
pca_features = pca_input.fit_transform(input_pca_train).astype(np.float32)

# Split the reduced matrix back apart: the first `n_train_rows` rows belong to
# the training inputs, the remainder to the test inputs.
n_train_rows = inputs_train.shape[0]
pca_train_input = pca_features[:n_train_rows]
pca_test_input = pca_features[n_train_rows:]

# Check each slice against the matrix it came from. (Comparing the two slice
# lengths with len(pca_features) can never fail, because the slices are
# complementary by construction.)
assert pca_train_input.shape[0] == inputs_train.shape[0]
assert pca_test_input.shape[0] == inputs_test.shape[0]

# Release the large sparse matrices, including the stacked copy, now that only
# the reduced features are needed.
# NOTE(review): `datasets` from the loading cell may still reference the train
# and test matrices; if so, these `del`s drop only the local names — confirm.
del input_pca_train
del inputs_train
del inputs_test

In [10]:
# Reduce the targets as well. Predictions are made in this reduced space and
# have to be mapped back ("de-reduced") through `pca_targets.components_`
# later on.
pca_targets = TruncatedSVD(
    n_components=EXPERIMENT.TARGETS_PCA_DIMS,
)
pca_train_targets = (
    pca_targets
    .fit_transform(targets_train)
    .astype(np.float32)
)

# The full-width targets are no longer referenced under this name.
del targets_train

In [11]:
# Row-wise normalization helped last year's entry, although that competition
# had to cope with more batch effects.
# Each row (one cell) is scaled on its own; possibly useful because only
# correlation matters here, not magnitude.
# TODO: do we really want to normalize PCA input?
logging.info('Running row-wise normalization...')

train_norm = utils.row_wise_std_scaler(pca_train_input).astype(np.float32)
test_norm = utils.row_wise_std_scaler(pca_test_input).astype(np.float32)

# The un-normalized slices are not used again below.
del pca_train_input, pca_test_input

INFO:root:Running row-wise normalization...


In [12]:
# Hyper-parameters of the kernel ridge regression.
# TODO: research these more
SCALE = 10   # length scale handed to the RBF kernel
ALPHA = 0.2  # `alpha` handed to KernelRidge

logging.info('Running KRR model ...')
logging.info('Starting k-fold loop...')

# One estimator object is shared by every fold and by the final fit; each call
# to `fit` replaces the previous fit.
kernel = RBF(length_scale=SCALE)
krr = KernelRidge(alpha=ALPHA, kernel=kernel)  # type: ignore

# Only the number of splits is configured; every other KFold option is left at
# its default.
kf = KFold(n_splits=EXPERIMENT.K_FOLDS)

def fit_and_score(*,
    fold_index: int,
    train_indices: np.ndarray,
    test_indices: np.ndarray,
    pca_targets,
    ) -> utils.Score:
    """
    Fit the shared KRR model on one fold and score it on the held-out rows.

    Reads the module-level ``krr``, ``train_norm``, ``pca_train_targets`` and
    ``EXPERIMENT``; refits ``krr`` in place.

    Args:
        fold_index: zero-based fold number (logged one-based).
        train_indices: row indices of ``train_norm`` / ``pca_train_targets``
            used for fitting.
        test_indices: row indices held out for scoring.
        pca_targets: fitted target decomposition; its ``components_`` map
            reduced targets back to the original target space.

    Returns:
        ``utils.Score`` wrapping the value of ``utils.correlation_score``.
    """
    logging.info(
        f'Fitting KRR fold {fold_index + 1} of {EXPERIMENT.K_FOLDS}...'
    )
    fold_inputs = train_norm[train_indices]
    fold_targets = pca_train_targets[train_indices]
    krr.fit(fold_inputs, fold_targets)

    # TODO: review pca de-reduction
    # Both prediction and reference are projected back through the same
    # components before they are compared.
    # NOTE(review): the reference is rebuilt from the *reduced* targets, not
    # the original ones, so the score does not include the error introduced
    # by the target reduction itself — confirm this is intended.
    components = pca_targets.components_
    predicted = krr.predict(train_norm[test_indices]) @ components
    reference = pca_train_targets[test_indices] @ components

    score = utils.correlation_score(reference, predicted)
    logging.info(f"Score: {score}")
    return utils.Score(score=score)

# Cross-validate: one fit-and-score pass per split, collected in fold order.
scores = [
    fit_and_score(
        fold_index=fold_number,
        train_indices=fit_rows,
        test_indices=held_out_rows,
        pca_targets=pca_targets,
    )
    for fold_number, (fit_rows, held_out_rows)
    in enumerate(kf.split(train_norm))
]

# TODO: require clean history (git), write outputs to file
# Last expression of the cell, so the summary is displayed as its output.
utils.ScoreSummary(scores=scores)


INFO:root:Running KRR model ...
INFO:root:Starting k-fold loop...
INFO:root:Fitting KRR fold 1 of 3...
INFO:root:Score: 0.9644400591933789
INFO:root:Fitting KRR fold 2 of 3...
INFO:root:Score: 0.9691158634315664
INFO:root:Fitting KRR fold 3 of 3...
INFO:root:Score: 0.9700660943874819


ScoreSummary(scores=[Score(score=0.9644400591933789), Score(score=0.9691158634315664), Score(score=0.9700660943874819)])

In [13]:
# TODO: extract to utils method
if not EXPERIMENT.OUTPUT_SUBMISSION:
    logging.info("======Test Passed!======")
else:
    # Existing submission that supplies the predictions for the other
    # technology.
    OTHER_FILENAME = 'cite_rbf_with_multi_linear'
    OTHER_SUBMISSION_PATH = utils.OUTPUT_DIR / f'{OTHER_FILENAME}.csv'

    # Refit on all training rows that were loaded (no fold held out), then
    # predict the submission inputs and map the predictions back to the
    # original target space.
    krr.fit(train_norm, pca_train_targets)
    Y_hat = krr.predict(test_norm) @ pca_targets.components_

    # Lay this experiment's predictions out in submission format.
    this_submission = utils.format_submission(
        Y_hat, EXPERIMENT.TECHNOLOGY
    )
    # Submission holding the predictions for the alternate technology.
    other_submission = pd.read_csv(OTHER_SUBMISSION_PATH, index_col=0)

    # Replace the multi-index with a plain positional one so rows line up
    # with the other submission.
    reindexed_submission_this = pd.DataFrame(
        this_submission.reset_index(drop=True)
    )

    # Rows this experiment left empty are filled from the other submission.
    own_target = reindexed_submission_this['target']
    rows_missing = reindexed_submission_this['target'].isna()
    filler = other_submission[rows_missing]['target']
    merged = own_target.fillna(filler)

    # Wrap in a dataframe with the expected column and index names.
    formatted_submission = pd.DataFrame(merged, columns=['target'])
    formatted_submission.index.name = "row_id"
    utils.test_valid_submission(formatted_submission)

    # Write the combined predictions to csv.
    formatted_submission.to_csv(
        utils.OUTPUT_DIR / f"{EXPERIMENT.TECHNOLOGY.name}_rbf_with_{OTHER_FILENAME}.csv")

INFO:root:Loading indices...
INFO:root:Loading evaluation ids...
INFO:root:Final step: fill empty submission df...


In [None]:
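# Manual step (run in a shell, not in this notebook): upload the CSV written by the cell above.
# `submission.csv` and the -m text are placeholders: use the path of the written file and a description of EXPERIMENT.
# kaggle competitions submit -c open-problems-multimodal -f submission.csv -m "EXPERIMENT.__str__()"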