In [1]:
import utils
from utils import (
    fit_and_score_pca_targets,
    k_fold_validation,
    truncated_pca,
    pca_inputs,
    load_all_data,
)
import numpy as np
import pandas as pd
from utils import Datasets
from prefect import flow, get_run_logger

from sklearn.gaussian_process.kernels import RBF  # type: ignore
from sklearn.kernel_ridge import KernelRidge  # type: ignore

In [2]:
# Parameter cell for papermill, do not merge or delete this cell
# IS_TEST is passed to test_or_run() below: True selects the self-check
# branch (asserts on the mean k-fold score), False selects the regular run.
IS_TEST = False 

In [3]:
# By default, Prefect makes a best effort to compute a
# stable hash of the .py file in which the flow is defined to
# automatically detect when your code changes.
@flow(
    name="RBF with Input and Target PCA",
    description="Based on last year's winner of RNA->Prot",
)
def last_year_rbf_flow(
    max_rows_train=1000,
    submit_to_kaggle=False,
    technology=utils.cite,
    inputs_pca_dims=4,
    targets_pca_dims=4,
    k_folds=2,
    scale=10,  # RBF length scale. Larger values give a smoother kernel.
    alpha=0.2,  # Regularization param. More is more regularization.
):
    """Fit and validate an RBF kernel ridge model on PCA-reduced inputs/targets.

    Steps: load the data, reduce inputs and targets with PCA, normalise the
    reduced training inputs row-wise, run k-fold validation, and (optionally)
    fit on the training data, predict the test inputs and write a submission
    CSV merged with a previously written submission for the other technology.

    Args:
        max_rows_train: Forwarded to ``load_all_data``.
        submit_to_kaggle: When True, also fit, predict and write the merged
            submission CSV under ``utils.OUTPUT_DIR``.
        technology: Technology descriptor (e.g. ``utils.cite``/``utils.multi``);
            forwarded to ``load_all_data`` and ``utils.format_submission``.
        inputs_pca_dims: Number of PCA dimensions for the inputs.
        targets_pca_dims: Number of PCA dimensions for the targets.
        k_folds: Number of folds passed to ``k_fold_validation``.
        scale: ``length_scale`` of the sklearn ``RBF`` kernel.
        alpha: ``alpha`` (regularisation strength) of ``KernelRidge``.

    Returns:
        The scores produced by ``k_fold_validation``. They are returned on
        both paths; previously the submission path implicitly returned None.
    """
    logger = get_run_logger()
    # The loading call is the same for every technology; the former
    # `if technology == utils.multi` / `else` branches were identical.
    data: Datasets = load_all_data(
        technology=technology,
        max_rows_train=max_rows_train,
        submit_to_kaggle=submit_to_kaggle,
        sparse=True,
    )
    inputs_train, targets_train, inputs_test = (
        data.inputs_train,
        data.targets_train,
        data.inputs_test,
    )
    pca_inputs_train, pca_inputs_test, _ = pca_inputs(
        inputs_train, inputs_test, inputs_pca_dims
    )
    pca_targets_train, pca_model_targets = truncated_pca(
        targets_train, targets_pca_dims, return_model=True
    )
    train_norm = utils.row_wise_std_scaler(pca_inputs_train).astype(np.float32)
    del pca_inputs_train  # release the un-normalised copy
    kernel = RBF(length_scale=scale)
    krr = KernelRidge(alpha=alpha, kernel=kernel)  # type: ignore
    scores = k_fold_validation(
        model=krr,
        train_inputs=train_norm,
        train_targets=pca_targets_train,
        fit_and_score_func=fit_and_score_pca_targets,
        k=k_folds,
        pca_model_targets=pca_model_targets,
    )
    logger.info(f"K-Fold complete. Scores: {scores}")

    if submit_to_kaggle:
        # TODO: extract to utils method
        OTHER_FILENAME = "cite_rbf_with_multi_linear"
        OTHER_SUBMISSION_PATH = utils.OUTPUT_DIR / f"{OTHER_FILENAME}.csv"
        output_path = (
            utils.OUTPUT_DIR / f"{technology.name}_rbf_with_{OTHER_FILENAME}.csv"
        )
        # The test inputs are only needed for the submission, so normalise
        # them here instead of on every run.
        test_norm = utils.row_wise_std_scaler(pca_inputs_test).astype(np.float32)
        del pca_inputs_test
        # fit model on downsampled data
        krr.fit(train_norm, pca_targets_train)
        # predict on full submission inputs and map the predictions back from
        # PCA space through the fitted components
        Y_hat = krr.predict(test_norm) @ pca_model_targets.components_  # type: ignore
        # Format this experiment for submission
        this_submission = utils.format_submission(Y_hat, technology)
        # Load other submission which includes predictions
        # for alternate tech
        other_submission = pd.read_csv(OTHER_SUBMISSION_PATH, index_col=0)
        # drop multi-index to align with other submission
        reindexed_submission_this = pd.DataFrame(this_submission.reset_index(drop=True))
        # Merge with separate predictions for other technology: rows that are
        # NaN here are filled from the same rows of the other submission.
        missing_here = reindexed_submission_this["target"].isna()
        merged = reindexed_submission_this["target"].fillna(
            other_submission.loc[missing_here, "target"]
        )
        # put into dataframe with proper column names
        formatted_submission = merged.to_frame(name="target")
        formatted_submission.index.name = "row_id"
        utils.test_valid_submission(formatted_submission)
        # write full predictions to csv
        logger.info(output_path)
        formatted_submission.to_csv(output_path)
    return scores

In [4]:
@flow
def test_or_run(is_test):
    """Run ``last_year_rbf_flow`` either as a quick self-check or as a real run.

    Args:
        is_test: When truthy, run the flow with its defaults on 1,000 training
            rows and require the mean fold score to exceed 0.9. Otherwise run
            the flow with the explicit configuration below.

    Raises:
        AssertionError: In test mode, if no fold scores are returned or the
            mean score is not above 0.9. Raised explicitly so the check is
            not stripped when Python runs with ``-O``.
    """
    logger = get_run_logger()
    if is_test:
        scores = last_year_rbf_flow(
            max_rows_train=1_000,
        )
        # Guard the division below: an empty result is a test failure, not a
        # ZeroDivisionError.
        if len(scores) == 0:
            raise AssertionError("last_year_rbf_flow returned no fold scores")
        mean_score = sum(s.score for s in scores) / len(scores)
        if not (mean_score > 0.9):
            raise AssertionError(
                f"mean k-fold score {mean_score} is not above 0.9"
            )
        logger.info("=============TEST PASSED=============")
    else:
        last_year_rbf_flow(
            max_rows_train=1_000,
            submit_to_kaggle=False,
            technology=utils.multi,
            inputs_pca_dims=5,
            targets_pca_dims=4,
            k_folds=3,
            scale=10,  # RBF length scale. Larger values give a smoother kernel.
            alpha=0.2,  # Regularization param. More is more regularization.
        )

In [6]:
# Entry point: IS_TEST (set in the papermill parameter cell) selects the
# self-check branch or the regular run inside test_or_run.
test_or_run(IS_TEST)

22:00:23.636 | INFO    | prefect.engine - Created flow run 'icy-bumblebee' for flow 'test-or-run'
22:00:23.869 | INFO    | Flow run 'icy-bumblebee' - Created subflow run 'large-caterpillar' for flow 'RBF with Input and Target PCA'
22:00:23.994 | INFO    | Flow run 'large-caterpillar' - Created subflow run 'convivial-falcon' for flow 'load-all-data'
22:00:24.060 | INFO    | Flow run 'convivial-falcon' - Created task run 'load_inputs_test-9e70d44b-0' for task 'load_inputs_test'
22:00:24.061 | INFO    | Flow run 'convivial-falcon' - Executing 'load_inputs_test-9e70d44b-0' immediately...
22:00:37.510 | INFO    | Task run 'load_inputs_test-9e70d44b-0' - Finished in state Completed()
22:00:37.535 | INFO    | Flow run 'convivial-falcon' - Created task run 'load_targets_train-3eedfce5-0' for task 'load_targets_train'
22:00:37.536 | INFO    | Flow run 'convivial-falcon' - Executing 'load_targets_train-3eedfce5-0' immediately...
22:00:39.122 | INFO    | Task run 'load_targets_train-3eedfce5-0' -

[Completed(message='All states completed.', type=COMPLETED, result=[Completed(message=None, type=COMPLETED, result=(array([[221.41353   ,  -4.8420906 , -12.080435  ,  12.7254925 ],
        [208.2218    , -35.21668   ,  29.500877  ,  34.624687  ],
        [206.0232    , -16.991434  ,  -3.1208575 ,   1.1522756 ],
        ...,
        [247.39156   ,  -5.690018  ,   0.33937997,  20.217266  ],
        [277.66058   ,  21.561852  ,  13.541169  ,  17.91311   ],
        [207.39906   , -36.90172   ,  34.65962   ,  41.566624  ]],
       dtype=float32), TruncatedSVD(n_components=4))), Completed(message=None, type=COMPLETED, result=Datasets(inputs_train=<1000x228942 sparse matrix of type '<class 'numpy.float32'>'
 	with 4933386 stored elements in Compressed Sparse Row format>, targets_train=gene_id       ENSG00000121410  ENSG00000268895  ENSG00000175899  \
 cell_id                                                           
 56390cf1b95e              0.0         0.000000              0.0   
 fc0c601