In [1]:
# --- Standard library ---
import os
import sys
import time
import csv
from importlib import reload
# Put the parent directory on sys.path (once) before the local imports below.
# Presumably assay / util / calibrate live there — confirm against the repo layout.
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)
    
# --- Local modules (not resolvable from this notebook alone) ---
import assay
import util
import calibrate as cal

# --- Third-party ---
from sklearn.linear_model import Ridge, RidgeCV, LinearRegression
    
import numpy as np
import scipy as sc
import pandas as pd 

import matplotlib.pyplot as plt
import matplotlib as mpl
# Global matplotlib default: base font size for every figure drawn afterwards.
plt.rcParams["font.size"] = 16

import seaborn as sns
# Global seaborn theme applied to subsequent plots.
sns.set_style('darkgrid')

In [2]:
def get_train_and_test(data, n_train, seed, lmbda, model=None):
    """Fit the model on a random training subset and sample one test index.

    The training subset is drawn uniformly without replacement from the rows
    of ``data``. The test index is drawn from a distribution over all rows
    whose probabilities are proportional to ``exp(lmbda * prediction)``, where
    the predictions come from the model just fitted on the training subset.

    Parameters
    ----------
    data : object
        Must expose ``n`` (number of rows), ``X_nxp`` (feature matrix whose
        column 0 is skipped as the intercept feature) and ``y_n`` (labels),
        both indexable by an integer index array.
    n_train : int
        Number of training points to draw.
    seed : int
        Seed passed to ``np.random.seed``; NOTE this resets NumPy's *global*
        random state as a side effect.
    lmbda : float
        Multiplier applied to the predictions before exponentiation.
    model : estimator with ``fit(X, y)`` / ``predict(X)``, optional
        Model to fit in place. When omitted, the module-level variable named
        ``model`` is used, which preserves the original calling convention.

    Returns
    -------
    Xtrain_nxp, ytrain_n, test_idx
        Training features (all columns, including column 0), training labels,
        and a length-1 integer array holding the sampled test row index.

    Raises
    ------
    NameError
        If ``model`` is not passed and no module-level ``model`` exists.
    """
    if model is None:
        # Backward-compatible fallback to the notebook-global model. Passing
        # the model explicitly is preferred: it removes the hidden dependency
        # on a variable defined in a later cell.
        try:
            model = globals()['model']
        except KeyError:
            raise NameError(
                "name 'model' is not defined; pass model=... or define a "
                "module-level `model` before calling get_train_and_test"
            ) from None

    # get random training data
    np.random.seed(seed)
    train_idx = np.random.choice(data.n, n_train, replace=False)
    Xtrain_nxp, ytrain_n = data.X_nxp[train_idx], data.y_n[train_idx]

    # train model (exclude intercept feature)
    model.fit(Xtrain_nxp[:, 1:], ytrain_n)

    # construct test covariate distribution
    predall_n = model.predict(data.X_nxp[:, 1:])
    logits_n = lmbda * predall_n
    # Shift by the max logit before exponentiating: the normalized
    # probabilities are mathematically unchanged, but exp() can no longer
    # overflow to inf (which would produce NaN probabilities below).
    punnorm_n = np.exp(logits_n - np.max(logits_n))
    Z = np.sum(punnorm_n)

    # draw test covariate
    test_idx = np.random.choice(data.n, 1, p=punnorm_n / Z)
    return Xtrain_nxp, ytrain_n, test_idx

In [None]:
# covariate intervention
#
# For every (fitness, n_train, lmbda) setting, run n_seed trials. Each trial
# draws training data and one test point via get_train_and_test, builds
# confidence sets with both calibration objects (covariate intervention and
# covariate shift), records whether the test label is covered, and finally
# saves the per-trial results to one .npz file per setting.
reload(cal)
reload(assay)
order = 2
alpha = 0.1
n_trains = [24, 48, 96, 384]
# ridge regularization strength used for each training-set size
ntrain2reg = {24: 10, 48: 10, 96: 10, 384: 1}
n_seed = 1000
lmbdas = [0.1, 2, 4, 6, 8]
y_increment = 0.01
# grid of candidate label values handed to the conformal objects
ys = np.arange(0, 1.7, y_increment)

# Create the output directory up front so a missing folder is caught
# immediately rather than after a full sweep of trials at the first np.savez.
save_path_fmt = '../results/112221/{}_n{}_lambda{}_alpha{}_gamma{}.npz'
os.makedirs(os.path.dirname(save_path_fmt), exist_ok=True)

for fitness_str in ['blue', 'red']:
    
    data = assay.PoelwijkData(fitness_str, order=order)
    # uniform training density over the data.n rows
    # (the lambda reads the current global `data` at call time)
    ptrain_fn = lambda x: (1.0 / data.n) * np.ones([x.shape[0]])
    
    for t, n_train in enumerate(n_trains):

        reg = ntrain2reg[n_train]
        # NOTE: get_train_and_test fits this module-level `model` in place.
        model = Ridge(alpha=reg, fit_intercept=True)

        covint = cal.ConformalRidgeCovariateIntervention(ptrain_fn, ys, data.X_nxp, reg)
        covshift = cal.ConformalRidgeCovariateShift(ptrain_fn, ys, data.X_nxp, reg)

        for l, lmbda in enumerate(lmbdas):

            # per-trial accumulators: cs* = covariate shift, ci* = covariate intervention
            cslooset_s, csisset_s = [], []
            cilooset_s, ciisset_s = [], []
            ytest_s, predtest_s = np.zeros([n_seed]), np.zeros([n_seed])
            csloocov_s, csiscov_s = np.zeros([n_seed]), np.zeros([n_seed])
            ciloocov_s, ciiscov_s = np.zeros([n_seed]), np.zeros([n_seed])
            t0 = time.time()

            for seed in range(n_seed):
                
                Xtrain_nxp, ytrain_n, test_idx = get_train_and_test(data, n_train, seed, lmbda)
                # test_idx is a length-1 index array, so Xtest_p keeps a leading axis of size 1
                Xtest_p = data.X_nxp[test_idx]
                # .item() extracts the single value explicitly; assigning a size-1,
                # ndim>0 array into a scalar slot is deprecated since NumPy 1.25.
                ytest_s[seed] = data.y_n[test_idx].item()
                predtest_s[seed] = model.predict(Xtest_p[:, 1 :]).item()

                # get confidence set using covariate intervention
                cilooset, ciisset = covint.get_confidence_set(Xtrain_nxp, ytrain_n, Xtest_p, lmbda, alpha=alpha)
                ciisset_s.append(ciisset)
                ciiscov_s[seed] = cal.is_covered(ytest_s[seed], ciisset, y_increment)
                cilooset_s.append(cilooset)
                ciloocov_s[seed] = cal.is_covered(ytest_s[seed], cilooset, y_increment)

                # get confidence set using covariate shift
                cslooset, csisset = covshift.get_confidence_set(Xtrain_nxp, ytrain_n, Xtest_p, lmbda, alpha=alpha)
                csisset_s.append(csisset)
                csiscov_s[seed] = cal.is_covered(ytest_s[seed], csisset, y_increment)
                cslooset_s.append(cslooset)
                csloocov_s[seed] = cal.is_covered(ytest_s[seed], cslooset, y_increment)

                # progress report with running coverage every 200 trials
                if (seed + 1) % 200 == 0:
                    print("{}, {}, {}. {} trials. CS IS, LOO coverage: {:.4f}, {:.4f}. CI IS, LOO coverage: {:.4f}, {:.4f}. {:.1f} s".format(
                        fitness_str, n_train, lmbda, seed + 1,
                        np.mean(csiscov_s[: seed + 1]), np.mean(csloocov_s[: seed + 1]),
                        np.mean(ciiscov_s[: seed + 1]), np.mean(ciloocov_s[: seed + 1]), time.time() - t0))

            # NOTE(review): the *set_s lists are passed to np.savez as-is. If the
            # confidence sets differ in length across trials (not visible from
            # here), recent NumPy versions refuse to build a ragged array
            # implicitly — confirm and convert to dtype=object if needed.
            np.savez(save_path_fmt.format(
                fitness_str, n_train, lmbda, alpha, reg),
                     ytest_s=ytest_s, predtest_s=predtest_s,
                     ciisset_s=ciisset_s, ciiscov_s=ciiscov_s, cilooset_s=cilooset_s, ciloocov_s=ciloocov_s,
                     csisset_s=csisset_s, csiscov_s=csiscov_s, cslooset_s=cslooset_s, csloocov_s=csloocov_s, 
                    )


Feature normalization won't work w/ WT-centered distribution!
92 features
Loading estimated SE precomputed with order 7 and significance level 0.01
blue, 24, 0.1. 200 trials. CS IS, LOO coverage: 0.8750, 0.8650. CI IS, LOO coverage: 0.8750, 0.8650. 180.5 s
blue, 24, 0.1. 400 trials. CS IS, LOO coverage: 0.9050, 0.8950. CI IS, LOO coverage: 0.9050, 0.8950. 358.8 s
