In [9]:
# Imports
import os
import argparse
from ir_utils import load_queries


# Models:

from ir_lmart import *

# HPO

from hpo import *

In [3]:
def get_train_budget_data_file(budget, query_list, train_data_file):
    """Write (or reuse) a copy of a training feature file limited to a budget of queries.

    The budget is a percentage of the training queries: the first
    ``round(len(query_list) * budget / 100)`` ids of ``query_list`` are kept, and
    every line of ``train_data_file`` whose query id is among them is copied to
    ``train_data_file + '_budget' + str(budget)``.

    Args:
        budget: Percentage of queries to keep; ``int(budget)`` must lie in [10, 100].
        query_list: Ordered query ids, compared with the id parsed from each line.
        train_data_file: Path of the feature file. The second whitespace-separated
            token of every non-blank line must have the form ``<name>:<query id>``.

    Returns:
        The path of the budgeted file (freshly written or already present), or
        ``None`` when the budget is outside the limits.
    """
    # Both bounds have to hold at once; joining them with `or` accepted every value.
    if not 10 <= int(budget) <= 100:
        print('Budget is outside the limits (10% < b < 100%): ', budget)
        return None

    len_queries = len(query_list)
    budgeted_queries = round(len_queries * (budget / 100))
    print('total budget:', len_queries)
    print('allocated budget:', budgeted_queries)

    train_budget_queries_file = train_data_file + '_budget' + str(budget)
    if os.path.exists(train_budget_queries_file):
        print("File already exists")
        return train_budget_queries_file

    # Build the lookup once instead of re-slicing and scanning the list per line.
    selected_qids = set(query_list[:budgeted_queries])
    with open(train_data_file, 'rt') as f_in, \
            open(train_budget_queries_file, 'wt') as budget_file_out:
        for query_feature in f_in:
            if not query_feature.strip():
                continue  # a blank line carries no query id
            qid = query_feature.split()[1].split(':')[1]
            if qid in selected_qids:
                budget_file_out.write(query_feature)

    # The path is returned for a freshly written file as well, not only for a reused one.
    return train_budget_queries_file

In [None]:
def eval_hpo(params_list, hpo_params):
    """Train the model with one hyper-parameter setting and evaluate its run file.

    Args:
        params_list: Sequence whose first five items are, in order, the training
            data file, the validation data file, the fold directory, the model
            object and the validation qrels file.
        hpo_params: Mapping with the keys 'n_leaves', 'learning_rate' and 'n_trees'.

    Returns:
        The object returned by ``eval(...)``, updated with ``lmart_model.hpo_config``
        and with the model itself stored under the key 'lmart_model'.

    Relies on the module-level names ``dataset_fold`` and ``trec_eval_command``.
    NOTE(review): ``eval`` is called with three positional arguments and its
    result is used like a dict, so it is presumably a project helper brought in
    by a star import rather than the builtin -- confirm.
    """
    (train_data_file, val_data_file, fold_dir,
     lmart_model, qrels_val_file) = (params_list[idx] for idx in range(5))

    # Tag that identifies this configuration in the generated file names.
    suffix_parts = (
        'nl', hpo_params['n_leaves'],
        'lr', hpo_params['learning_rate'],
        'nt', hpo_params['n_trees'],
    )
    hpo_params_suffix = ''.join(str(part) for part in suffix_parts)
    model_tag = dataset_fold + '_lmart_' + hpo_params_suffix

    save_model_file = fold_dir + model_tag + '_model'
    lmart_model.train(train_data_file, save_model_file, hpo_params)

    # The run file is generated from the validation data file.
    run_file = fold_dir + 'run_' + model_tag
    lmart_model.gen_run_file(val_data_file, run_file)

    eval_results = eval(trec_eval_command, qrels_val_file, run_file)
    eval_results.update(lmart_model.hpo_config)
    eval_results['lmart_model'] = lmart_model
    return eval_results

In [None]:
def start_process():
    """Print the name of the current process (passed as the pool initializer)."""
    current = multiprocessing.current_process()
    print('Starting', current.name)


In [None]:
def eval_multi_hpo(params_list, hpo_params_list, pool_size):
    """Evaluate several hyper-parameter settings in a pool of worker processes.

    Args:
        params_list: Settings bound as the first argument of ``eval_hpo`` for
            every task.
        hpo_params_list: Iterable of hyper-parameter settings, one task each.
        pool_size: Number of worker processes in the pool.

    Returns:
        The list produced by the asynchronous map, i.e. one ``eval_hpo`` result
        per entry of ``hpo_params_list``.
    """
    task = partial(eval_hpo, params_list)

    workers = multiprocessing.Pool(processes=pool_size, initializer=start_process)
    pending = workers.map_async(task, hpo_params_list)
    workers.close()  # the task queue is complete
    workers.join()   # block until every worker has finished

    results = pending.get()
    print('Total parameters: ' + str(len(results)))
    return results

In [4]:
# Classes

In [5]:
class fakeParser:
    """Hard-coded stand-in for the parsed command-line arguments.

    The main cell assigns an instance of this class to ``args`` instead of
    calling ``parser.parse_args()``, so these attributes play the role of the
    command-line options.
    """

    def __init__(self):
        # Dataset name; the alternative kept by the author is 'robust'.
        self.dataset = 'bioasq'
        # Data split; the alternatives kept by the author are 'train' and 'dev'.
        self.data_split = 'all'
        # Index-building switch; the alternative kept by the author is True.
        self.build_index = None
        # Fold selector.
        self.fold = 'all'
        # Feature-generation switch; the alternative kept by the author is None.
        self.gen_features = True
        



In [6]:
# Main
if __name__ == "__main__":

    # A single parser carries both the data-selection options and the optimizer
    # options. Creating a second ArgumentParser here would rebind `parser` and
    # silently drop --dataset, --data_split and --fold.
    parser = argparse.ArgumentParser(description='Example 1 - sequential and local execution.')
    parser.add_argument('--dataset',   type=str, help='')
    parser.add_argument('--data_split',   type=str, help='')
    parser.add_argument('--fold', type=str,   help='')
    parser.add_argument('--min_budget',   type=float, help='Minimum budget used during the optimization.',    default=2)
    parser.add_argument('--max_budget',   type=float, help='Maximum budget used during the optimization.',    default=4)
    parser.add_argument('--n_iterations', type=int,   help='Number of iterations performed by the optimizer', default=500)
    parser.add_argument('--n_workers', type=int,   help='Number of workers to run in parallel.', default=5)

    # Command-line parsing is switched off; the hard-coded fakeParser values
    # are used as `args` instead.
    # args = parser.parse_args()
    args = fakeParser()

In [None]:
    # Search strategy. Only 'rs' and 'gs' build a candidate list in this cell.
    # hpo_method = 'rs'
    hpo_method = 'bohb'

    random_iterations = 20 # these are outside parameters

    # Candidate values per hyper-parameter: integers 1..50 for leaves and trees,
    # and steps of 0.1 from 0.1 up to (but excluding) 1 for the learning rate.
    nleaves_range = np.arange(1,51,1)
    lrate_range = np.arange(0.1,1,0.1)
    ntrees_range = np.arange(1,51,1)

    h_param_ranges = [nleaves_range, lrate_range, ntrees_range]

    if hpo_method == 'rs':
        h_params = get_random_params(h_param_ranges, random_iterations)
    elif hpo_method == 'gs':
        h_params = get_grid_search_params(h_param_ranges)
    else:
        # Without this branch `h_params` stays undefined (NameError below) or,
        # worse, a stale value from an earlier kernel run is reused silently.
        raise ValueError(
            "No candidate hyper-parameter list is built for hpo_method="
            + repr(hpo_method) + "; expected 'rs' or 'gs'."
        )

    # Each candidate is indexed as (n_leaves, learning_rate, n_trees), matching
    # the order of h_param_ranges.
    hpo_params_list = [{'n_leaves': x[0], 'learning_rate': x[1], 'n_trees': x[2]} for x in h_params]

    print(len(hpo_params_list))

In [7]:
    # Percentage of the training queries handed to get_train_budget_data_file.
    budget = 10
    dataset = args.dataset
    workdir = './' + dataset + '_dir/'

    ranklib_location = '../../../ranklib/'

    # Folds to visit: a single unnamed fold for bioasq or when no fold is given,
    # all five folds for 'all', otherwise just the requested fold.
    if args.fold and args.dataset != 'bioasq':
        folds = ['1','2','3','4','5'] if args.fold == 'all' else [args.fold]
    else:
        folds = ['']

    # Build the budgeted training feature file for every selected fold.
    for fold in folds:

        print(fold)

        if args.dataset == 'bioasq':
            # bioasq keeps its files directly in the work directory.
            fold_dir = workdir
            train_queries_file = '../../bioasq_data/bioasq.train.json'

            query_list = load_queries(train_queries_file)
            qid_list = [query['id'] for query in query_list]
            train_features_file = fold_dir + dataset + '_train_features'
            budget_train_data_file = get_train_budget_data_file(budget, qid_list, train_features_file)

        else:
            # Every other dataset uses one sub-directory per fold.
            fold_dir = workdir + 's' + fold + '/'

            if args.dataset == 'robust':
                train_queries_file = '../../robust04_data/split_' + fold + '/rob04.train.s' + fold + '.json'

                query_list = load_queries(train_queries_file)
                qid_list = [query['id'] for query in query_list]
                print(len(qid_list))

                train_features_fold_file = fold_dir + dataset + '_train_s' + fold + '_features'
                budget_train_data_file = get_train_budget_data_file(budget, qid_list, train_features_fold_file)
            



total budget: 1751
allocated budget: 175
File already exists
