In [1]:
import utils
from utils import fit_and_score_pca_targets, k_fold_validation, truncated_pca, pca_inputs, load_all_data
import logging
import scipy as sp
import numpy as np
import pandas as pd
from utils import Datasets, TechnologyRepository

from prefect import flow

from sklearn.gaussian_process.kernels import RBF
from sklearn.kernel_ridge import KernelRidge


In [2]:
# By default, Prefect makes a best effort to compute a
# stable hash of the .py file in which the flow is defined to
# automatically detect when your code changes.
@flow(name="RBF with Input and Target PCA",
      description="Based on last year's winner of RNA->Prot")
def last_year_rbf_flow(
    max_rows_test = 1000,
    submit_to_kaggle = True,
    technology = utils.cite,
    inputs_pca_dims = 4,
    targets_pca_dims = 4,
    k_folds = 2,
    scale = 10,
    alpha = .2
    ):
    """Cross-validate an RBF kernel ridge regressor on PCA-reduced inputs and targets.

    Steps performed:
      1. Load the datasets for ``technology`` via ``load_all_data`` (sparse).
      2. Reduce train/test inputs with ``pca_inputs`` and train targets with
         ``truncated_pca`` (keeping the fitted target model).
      3. Apply ``utils.row_wise_std_scaler`` to the reduced inputs and cast
         them to ``float32``.
      4. Run ``k_fold_validation`` on a ``KernelRidge`` model with an ``RBF``
         kernel, scoring through ``fit_and_score_pca_targets``.

    Args:
        max_rows_test: Forwarded to ``load_all_data``.
        submit_to_kaggle: Forwarded to ``load_all_data``.
        technology: Technology selector forwarded to ``load_all_data``
            (defaults to ``utils.cite``).
        inputs_pca_dims: Number of components passed to ``pca_inputs``.
        targets_pca_dims: Number of components passed to ``truncated_pca``.
        k_folds: Number of folds passed to ``k_fold_validation`` as ``k``.
        scale: ``length_scale`` of the RBF kernel. A larger length scale makes
            the kernel vary more slowly, i.e. a smoother, less flexible fit.
        alpha: Ridge regularization strength of ``KernelRidge``. Larger values
            mean stronger regularization.

    Returns:
        None. The cross-validation ``scores`` and the scaled test inputs are
        computed but not returned or used for prediction yet (see TODOs).
    """
    # The original code branched on `technology == utils.multi`, but both
    # branches issued exactly this call, so a single call is equivalent.
    data: Datasets = load_all_data(
        technology=technology,
        max_rows_test=max_rows_test,
        submit_to_kaggle=submit_to_kaggle,
        sparse=True,
    )
    inputs_train = data.inputs_train
    targets_train = data.targets_train
    inputs_test = data.inputs_test

    # Third return value of pca_inputs is not needed here.
    pca_inputs_train, pca_inputs_test, _ = pca_inputs(
        inputs_train,
        inputs_test,
        inputs_pca_dims,
    )
    # Keep the fitted target model: it is handed to the scoring function below.
    pca_targets_train, pca_model_targets = truncated_pca(
        targets_train,
        targets_pca_dims,
        return_model=True,
    )

    train_norm = utils.row_wise_std_scaler(pca_inputs_train).astype(np.float32)
    del pca_inputs_train  # drop the unscaled copy once the scaled one exists

    kernel = RBF(length_scale=scale)
    krr = KernelRidge(alpha=alpha, kernel=kernel)  # type: ignore
    # TODO: `scores` is currently discarded; return or log it once the
    # flow's output contract is decided.
    scores = k_fold_validation(
        model=krr,
        train_inputs=train_norm,
        train_targets=pca_targets_train,
        fit_and_score_func=fit_and_score_pca_targets,
        k=k_folds,
        pca_model_targets=pca_model_targets,
    )

    # TODO: `test_norm` is prepared for a prediction step that does not exist yet.
    test_norm = utils.row_wise_std_scaler(pca_inputs_test).astype(np.float32)
    del pca_inputs_test

In [3]:
# Run the flow with all of its default parameters.
last_year_rbf_flow()

15:11:16.292 | INFO    | prefect.engine - Created flow run 'fortunate-albatross' for flow 'RBF with Input and Target PCA'
15:11:16.501 | INFO    | Flow run 'fortunate-albatross' - Created subflow run 'benign-bobcat' for flow 'load-all-data'
15:11:16.568 | INFO    | Flow run 'benign-bobcat' - Created task run 'load_inputs_test-9e70d44b-0' for task 'load_inputs_test'
15:11:16.569 | INFO    | Flow run 'benign-bobcat' - Executing 'load_inputs_test-9e70d44b-0' immediately...
15:11:23.164 | INFO    | Task run 'load_inputs_test-9e70d44b-0' - Finished in state Completed()
15:11:23.185 | INFO    | Flow run 'benign-bobcat' - Created task run 'load_targets_train-3eedfce5-0' for task 'load_targets_train'
15:11:23.186 | INFO    | Flow run 'benign-bobcat' - Executing 'load_targets_train-3eedfce5-0' immediately...
15:11:23.498 | INFO    | Task run 'load_targets_train-3eedfce5-0' - Finished in state Completed()
15:11:23.521 | INFO    | Flow run 'benign-bobcat' - Created task run 'load_inputs_test-9e70

[Completed(message=None, type=COMPLETED, result=(array([[25.26903  , -3.5555148,  1.6674383,  0.1065831],
        [35.43327  , -6.593282 ,  0.9918304,  1.9251753],
        [25.114338 , 22.898996 ,  3.738587 ,  1.0916662],
        ...,
        [23.90535  , -3.2159214,  3.902786 , -2.1204877],
        [39.45138  , -4.9356546, -8.141688 ,  2.5049949],
        [37.69809  , -6.9884186, -3.4335222,  1.998573 ]], dtype=float32), TruncatedSVD(n_components=4))),
 Completed(message=None, type=COMPLETED, result=Datasets(inputs_train=<1000x22050 sparse matrix of type '<class 'numpy.float32'>'
 	with 5140414 stored elements in Compressed Sparse Row format>, targets_train=gene_id           CD86     CD274     CD270     CD155     CD112      CD47  \
 cell_id                                                                    
 45006fe3e4c8  1.167804  0.622530  0.106959  0.324989  3.331674  6.426002   
 d02759a80ba2  0.818970  0.506009  1.078682  6.848758  3.524885  5.279456   
 c016c6b0efa5 -0.356703 -0