Running this notebook is computationally intensive. You'll need to run it inside the GPU Docker container. Additionally, ensure you have plenty of disk space and, ideally, multiple CPUs available. Intermediate results have been stored and are accessible on S3. See the Supp_Fig_2.ipynb notebook for accessing and plotting this data.

In [1]:
import os
import sys
import warnings
import multiprocessing as mp
import random

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scipy


sys.path.append('../common')
import data_io_utils
import paths
import utils
import constants

import A003_common
import policy_evaluation
import acquisition_policies
import models

%reload_ext autoreload
%autoreload 2

In [2]:
# Sync paths.DATASETS_DIR from S3 to the local filesystem (direction per the
# helper's name) so the cells below can read it from disk.
data_io_utils.sync_s3_path_to_local(paths.DATASETS_DIR)

In [3]:
# Sync paths.POLICY_EVAL_DIR from S3 to the local filesystem. This is the same
# path the final cell of this notebook pushes back to S3.
data_io_utils.sync_s3_path_to_local(paths.POLICY_EVAL_DIR)

In [4]:
# Sync paths.EVOTUNING_CKPT_DIR from S3 to the local filesystem.
# NOTE(review): presumably holds the checkpoints needed by the
# EvotunedUniRep_* models configured below -- confirm.
data_io_utils.sync_s3_path_to_local(paths.EVOTUNING_CKPT_DIR)

## Config

In [5]:
# Seed both global RNGs (stdlib `random` and NumPy's global generator) up
# front so a fresh Restart & Run All starts from the same random state.
random.seed(8329)
np.random.seed(4158)

# Forwarded as `force=` to every evaluation call in the run cell.
FORCE = True

# Number of replicates evaluated per (model, acquisition policy, training set).
N_REPLICATES = 20
# Split 0 used for training, 1 for prospective design, 2 for final figure.
SPLIT = 2

training_sets = ['sarkisyan']
acq_policies = ['random']
# The run cell writes this over inputs['n_training_points_schedule'].
n_training_points_schedule = np.array([8, 24, 96])

# Every model name is "<representation prefix><regressor suffix>". The empty
# prefix yields the bare regressor names ('LassoLars', 'Ridge', ...).
_REPRESENTATION_PREFIXES = [
    '',
    'Doc2Vec',
    'UniRep',
    'EvotunedUniRep_Random_Init_1_',
    'EvotunedUniRep_Global_Init_1_',
    'EvotunedUniRep_Global_Init_2_',
]
_REGRESSOR_SUFFIXES = [
    'LassoLars',
    'Ridge',
    'RidgeSparseRefit',
    'EnsembledRidgeSparseRefit',
]

# Prefix-major order: all four regressors for one representation, then the
# next representation (24 names in total).
# NOTE(review): this rebinds `models`, which the import cell also binds to the
# `models` module; after this cell runs, the module is no longer reachable
# under that name. The run cell iterates over this list by this name.
models = [
    prefix + suffix
    for prefix in _REPRESENTATION_PREFIXES
    for suffix in _REGRESSOR_SUFFIXES
]

## Run

In [None]:
# Keys of `inputs` forwarded, in exactly this order, as the leading positional
# arguments of policy_evaluation.evaluate_model_and_acquisition_policy. The
# per-replicate output directory is appended after them.
eval_input_keys = (
    'training_set_df',
    'acquisition_policy_obj',
    'n_training_points_schedule',
    'acquisition_policy_params',
    'model_obj',
    'generalization_set_dfs',
    'generalization_set_names',
    'generalization_set_sub_category_columns',
    'generalization_set_calc_params',
)

# Evaluate every (model, acquisition policy, training set) combination,
# N_REPLICATES times each.
for model in models:
    for acq_policy in acq_policies:
        for training_set in training_sets:

            ## Load inputs
            inputs = policy_evaluation.load_data_eff_inputs(
                split=SPLIT,
                training_set_name=training_set,
                acq_policy=acq_policy,
                model=model)

            # Replace the loaded schedule with the one from the config cell.
            print('CHANGED N_TRAINING_POINTS_SCHEDULE')
            inputs['n_training_points_schedule'] = n_training_points_schedule

            ## Sync any previous progress from S3
            root_output_dir = inputs['root_output_dir']
            print(root_output_dir)
            if not data_io_utils.path_exists_on_s3(root_output_dir):
                print('No previous data found on S3.')
            else:
                print('Found previous data on S3.')
                data_io_utils.sync_s3_path_to_local(root_output_dir)

            ## RUN
            # NOTE(review): FORCE is passed as `force=`; the logged message
            # "Found no existing progress or being forced." suggests that a
            # truthy value ignores the progress synced above -- confirm.
            for i in range(N_REPLICATES):
                # subdir for replicate
                replicate_dir = os.path.join(root_output_dir, 'rep_' + str(i))
                eval_args = [inputs[key] for key in eval_input_keys]
                eval_args.append(replicate_dir)
                results = policy_evaluation.evaluate_model_and_acquisition_policy(
                    *eval_args,
                    force=FORCE,
                    verbose=True,
                    save_models=True
                )

FP homolog parents contained in gen set: 3
CHANGED N_TRAINING_POINTS_SCHEDULE
/notebooks/analysis/common/../../data/s3/policy_evaluation/split_2/train_sarkisyan/LassoLarsModel/RandomAcquisition
No previous data found on S3.
Found no existing progress or being forced.
n_train: 8
	Acquiring training points
	Training/tuning model
	Evaluating generalization
	Checkpointing
n_train: 24
	Acquiring training points
	Training/tuning model
	Evaluating generalization
	Checkpointing
n_train: 96
	Acquiring training points
	Training/tuning model
	Evaluating generalization
	Checkpointing
Found no existing progress or being forced.
n_train: 8
	Acquiring training points
	Training/tuning model
	Evaluating generalization


## Sync to S3

In [None]:
# Sync the local paths.POLICY_EVAL_DIR up to S3 (direction per the helper's name).
# NOTE(review): the run cell prints output directories under
# .../policy_evaluation/..., presumably inside this path -- confirm.
data_io_utils.sync_local_path_to_s3(paths.POLICY_EVAL_DIR)