This notebook contains the script for reproducing the experimental results shown in Figs. 3, 4, and S2.

In [2]:
import os
import sys
import time
from importlib import reload
module_path = os.path.abspath(os.path.join('../'))
if module_path not in sys.path:
    sys.path.append(module_path)
    
import numpy as np
    
import assay
import calibrate as cal

In [3]:
# Re-import the project modules so that edits made to them on disk are picked up
# without restarting the kernel.
reload(cal)
reload(assay)

# ---- experiment configuration ----
alpha = 0.1                           # miscoverage
n_trains = [96, 192, 384]             # number of training points
ntrain2reg = {96: 10, 192: 1, 384: 1} # ridge regularization strength (gamma in code and paper)
n_seed = 2000                         # number of random trials
lmbdas = [0, 2, 4, 6]                 # lambda, inverse temperature
y_increment = 0.02                    # grid spacing between candidate label values, \Delta in paper
ys = np.arange(0, 2.21, y_increment)  # candidate label values, \mathcal{Y} in paper
order = 2                             # complexity of features. 1 encodes the AA at each site,
                                      # 2 the AAs at each pair of sites,
                                      # 3 the AAs at each set of 3 sites, etc.

# Output file template; the fields are fitness_str, n_train, lmbda, alpha, reg (in that order).
out_template = '../fluorescence/{}_n{}_lambda{}_alpha{}_gamma{}.npz'


def ptrain_fn(x):
    """Likelihood under the training input distribution, p_X in paper (uniform distribution).

    Returns a 1-D array with one entry per row of ``x`` (``x.shape[0]`` entries),
    each equal to the constant 1 / 2^13.
    """
    return (1.0 / np.power(2, 13)) * np.ones([x.shape[0]])


def _as_saveable_array(sets):
    """Turn a list of per-trial confidence sets into an array that np.savez can store.

    The plain conversion (what np.savez applies to its arguments) is tried first so the
    saved result is unchanged whenever that conversion works. NumPy >= 1.24 raises
    ValueError for inhomogeneous ("ragged") input instead of building an object array;
    in that case a 1-D object array holding one set per trial is built explicitly.
    NOTE(review): presumably the sets differ in length across trials -- confirm against
    cal.*.get_confidence_set.
    """
    try:
        return np.asanyarray(sets)
    except ValueError:
        ragged = np.empty(len(sets), dtype=object)
        for i, s in enumerate(sets):
            ragged[i] = s
        return ragged


# Create the output directory before any computation starts, so that a missing directory
# cannot cause np.savez to fail after a full sweep of n_seed trials has already been run.
os.makedirs(os.path.dirname(out_template), exist_ok=True)

for fitness_str in ['red']:

    # featurize all sequences in combinatorially complete dataset
    data = assay.PoelwijkData(fitness_str, order=order)

    for n_train in n_trains:

        # the regularization strength is tied to the training-set size
        reg = ntrain2reg[n_train]
        fcs = cal.ConformalRidgeFeedbackCovariateShift(ptrain_fn, ys, data.X_nxp, reg)
        scs = cal.ConformalRidgeStandardCovariateShift(ptrain_fn, ys, data.X_nxp, reg)

        for lmbda in lmbdas:

            # per-trial results: confidence sets (lists) and coverage indicators / labels (arrays)
            fset_s, sset_s = [], []
            fcov_s, scov_s = np.zeros([n_seed]), np.zeros([n_seed])
            ytest_s, predtest_s = np.zeros([n_seed]), np.zeros([n_seed])
            t0 = time.time()

            for seed in range(n_seed):

                # sample training and designed data
                Xtrain_nxp, ytrain_n, Xtest_1xp, ytest_1, pred_1 = assay.get_training_and_designed_data(
                    data, n_train, reg, lmbda, seed=seed)
                ytest_s[seed] = ytest_1[0]
                predtest_s[seed] = pred_1[0]

                # construct confidence set under feedback covariate shift
                # (the second return value is not used here)
                fset, _ = fcs.get_confidence_set(Xtrain_nxp, ytrain_n, Xtest_1xp, lmbda, alpha=alpha)
                fset_s.append(fset)
                fcov_s[seed] = cal.is_covered(ytest_s[seed], fset, y_increment)

                # construct confidence set under standard covariate shift
                sset, _ = scs.get_confidence_set(Xtrain_nxp, ytrain_n, Xtest_1xp, lmbda, alpha=alpha)
                sset_s.append(sset)
                scov_s[seed] = cal.is_covered(ytest_s[seed], sset, y_increment)

                # progress report every 100 trials: running mean coverage and elapsed time
                if (seed + 1) % 100 == 0:
                    print("{}, {}, {}. {} trials. SCS, FCS coverage: {:.4f}, {:.4f}. {:.1f} s".format(
                        fitness_str, n_train, lmbda, seed + 1,
                        np.mean(scov_s[: seed + 1]), np.mean(fcov_s[: seed + 1]), time.time() - t0))

            # one results file per (fitness, n_train, lambda) setting
            np.savez(out_template.format(fitness_str, n_train, lmbda, alpha, reg),
                     ytest_s=ytest_s, predtest_s=predtest_s,
                     fset_s=_as_saveable_array(fset_s), fcov_s=fcov_s,
                     sset_s=_as_saveable_array(sset_s), scov_s=scov_s,
                    )

Using 92 order-2 features
Loading estimated measurement noise SD computed using order 7 and significance level 0.01
red, 384, 6. 100 trials. SCS, FCS coverage: 0.8800, 0.8600. 752.7 s
red, 384, 6. 200 trials. SCS, FCS coverage: 0.9050, 0.8950. 1523.3 s
red, 384, 6. 300 trials. SCS, FCS coverage: 0.9100, 0.9000. 2321.2 s
red, 384, 6. 400 trials. SCS, FCS coverage: 0.9050, 0.8975. 3115.3 s
red, 384, 6. 500 trials. SCS, FCS coverage: 0.9040, 0.8960. 3886.5 s
red, 384, 6. 600 trials. SCS, FCS coverage: 0.9083, 0.8967. 4682.9 s
red, 384, 6. 700 trials. SCS, FCS coverage: 0.9129, 0.9014. 5508.1 s
red, 384, 6. 800 trials. SCS, FCS coverage: 0.9100, 0.9012. 6339.8 s
red, 384, 6. 900 trials. SCS, FCS coverage: 0.9156, 0.9056. 7130.1 s
red, 384, 6. 1000 trials. SCS, FCS coverage: 0.9170, 0.9060. 7918.1 s
red, 384, 6. 1100 trials. SCS, FCS coverage: 0.9182, 0.9082. 8694.7 s
red, 384, 6. 1200 trials. SCS, FCS coverage: 0.9183, 0.9092. 9461.5 s
red, 384, 6. 1300 trials. SCS, FCS coverage: 0.9192, 0