In [1]:
# Stronger baseline: Listwise L2R - LambdaMART
# Hyperparameter optimziation HPonsteroids requires Python 3!

In [2]:
# Imports
import os
import subprocess
import sys

# REMOVE!!
from ir_baseline import *

# HPO


import ConfigSpace as CS
import ConfigSpace.hyperparameters as CSH

from hpbandster.core.worker import Worker

import logging
logging.basicConfig(level=logging.DEBUG)

In [3]:
# Functions
def generate_run_file(pre_run_file, run_file):
    
    with open(pre_run_file, 'rt') as input_f:
        pre_run = input_f.readlines()
        print(type(pre_run))
    with open(run_file, 'wt') as out_f:
        for line in pre_run:
            out_f.write(line.replace('docid=','').replace('indri', 'lambdaMART'))
        

In [4]:
# Classes
class L2Ranker:
    def __init__(self, ranklib_location, params, normalization=[]):
        self.ranklib_location = ranklib_location
        # Works with Oracle JSE
        # java version "1.8.0_211"
        # Java(TM) SE Runtime Environment (build 1.8.0_211-b12)
        # Java HotSpot(TM) 64-Bit Server VM (build 25.211-b12, mixed mode)
        self.params = params
        self.log_file = self.params[-1:][0] + '.log'
        self.ranker_command = ['java', '-jar', ranklib_location + 'RankLib-2.12.jar']
        self.normalization = normalization
        self.save_model_file = ''
        
#     def build(self, ir_tool_params):
    def train(self, train_data_file, save_model_file, hpo_config):
        self.save_model_file = save_model_file
        toolkit_parameters = [
                                *self.ranker_command, # * to unpack list elements
                                '-train',
                                train_data_file,
                                *self.normalization,
                                *self.params,
                                '-leaf', 
                                str(hpo_config['n_leaves']),
                                '-shrinkage',
                                str(hpo_config['learning_rate']),
                                '-tree', # Fix: is this necessary according to original paper?
                                str(hpo_config['n_trees']),
                                '-save',
                                self.save_model_file   
                            ] 
        
        print(toolkit_parameters)
        with open(self.log_file, 'wt') as rf:
            proc = subprocess.Popen(toolkit_parameters,stdin=subprocess.PIPE, stdout=rf, stderr=subprocess.STDOUT, shell=False)
#         proc = subprocess.Popen(toolkit_parameters,stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, shell=False)
            
        (out, err)= proc.communicate()
#         print(out.decode('utf-8').splitlines())
#         print(out)
#         print(err)
        print('Model saved: ', self.save_model_file)
            
  

    def gen_run_file(self, test_data_file, run_file):
        pre_run_file = run_file.replace('run_', 'pre_run_', 1)
        toolkit_parameters = [
                                *self.ranker_command, # * to unpack list elements
                                '-load',
                                self.save_model_file,
                                *self.normalization,
                                '-rank',
                                test_data_file,
                                '-indri',
                                pre_run_file     
                            ] 
        
        print(toolkit_parameters)
        with open(self.log_file, 'at') as rf:
            proc = subprocess.Popen(toolkit_parameters,stdin=subprocess.PIPE, stdout=rf, stderr=subprocess.STDOUT, shell=False)
#         proc = subprocess.Popen(toolkit_parameters,stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, shell=False)
            
        (out, err)= proc.communicate()
#         print(out.decode('utf-8').splitlines())
#         print(out)
#         print(err)

        
        generate_run_file(pre_run_file, run_file)
        
        print('Run model saved: ', run_file)

In [5]:
# try:
#     import keras
#     from keras.datasets import mnist
#     from keras.models import Sequential
#     from keras.layers import Dense, Dropout, Flatten
#     from keras.layers import Conv2D, MaxPooling2D
#     from keras import backend as K
# except:
#     raise ImportError("For this example you need to install keras.")

# try:
#     import torchvision
#     import torchvision.transforms as transforms
# except:
#     raise ImportError("For this example you need to install pytorch-vision.")





class HpoWorker(Worker):
    def __init__(self, **kwargs):
            super().__init__(**kwargs)
            self.run_val_file = ''
            self.save_model_file = ''

            
    def compute(self, hpo_config, working_directory, *args, **kwargs):
            """
            Simple example for a compute function using a feed forward network.
            It is trained on the MNIST dataset.
            The input parameter "config" (dictionary) contains the sampled configurations passed by the bohb optimizer
            """
            
            # Train model with config parameters
            
            
                
            #     pre_run_file = workdir + 'pre_run_' + dataset + l2r_model
            
            n_l = hpo_config['n_leaves']
            l_r = hpo_config['learning_rate']
            n_t = hpo_config['n_trees']
            
            config_suffix = '_leaves' + str(n_l) + '_lr' + str(l_r) + '_n' + str(n_t)
            self.run_val_file = workdir + 'run_' + dataset + l2r_model + config_suffix
            self.save_model_file = workdir + dataset + l2r_model + config_suffix
            
#             lmart_model = L2Ranker(ranklib_location, l2r_params)
            lmart_model = L2Ranker(ranklib_location, l2r_params, norm_params)
            lmart_model.train(train_data_file, self.save_model_file, hpo_config)
            lmart_model.gen_run_file(val_data_file, self.run_val_file)
            
            # Evaluate Model
            
            val_results = eval(trec_eval_command, qrels_val_file, self.run_val_file)
            val_results = val_results.splitlines()
            val_map = float(val_results[0].split()[-1:][0])
            print(val_map)

            #import IPython; IPython.embed()
            return ({
                    'loss': 1 - val_map, # remember: HpBandSter always minimizes!
                    'model': lmart_model
            })


    @staticmethod
    def get_configspace():
            """
            It builds the configuration space with the needed hyperparameters.
            It is easily possible to implement different types of hyperparameters.
            Beside float-hyperparameters on a log scale, it is also able to handle categorical input parameter.
            :return: ConfigurationsSpace-Object
            """
            cs = CS.ConfigurationSpace()
            
            n_leaves = CSH.UniformIntegerHyperparameter('n_leaves', lower=1, upper=20, default_value=10, q=10, log=False)
            learning_rate = CSH.UniformFloatHyperparameter('learning_rate', lower=0.1, upper=0.9, default_value=0.1, q=10, log=False)
            n_trees = CSH.UniformIntegerHyperparameter('n_trees', lower=1, upper=1000, default_value=1000, q=10, log=False)
            
            cs.add_hyperparameters([n_leaves, learning_rate, n_trees])

            return cs




In [6]:
# Main
if __name__ == "__main__":
    
    # Options and variables
#     dataset = sys.argv[1] # 'bioasq'
#     workdir = './' + dataset + '_dir/'
#     data_split = sys.argv[2] # 'test'

    dataset = 'bioasq'
    workdir = './' + dataset + '_dir/'
    data_split =  'train'
    k_fold = 's1' 
    ranklib_location = '../../../ranklib/'
    
#     train_data_file = './bioasq_dir/bioasq.trai_features_reduced'
#     val_data_file = './bioasq_dir/bioasq.dev_features_reduced'
#     test_data_file = './bioasq_dir/bioasq.test_features_reduced'
    
    train_data_file = './bioasq_dir/bioasq.trai_features'
    val_data_file = './bioasq_dir/bioasq.dev_features'
    test_data_file = './bioasq_dir/bioasq.test_features'
    
    l2r_model = '_lmart_'
    


    
    enabled_features_file = workdir + dataset + l2r_model + 'enabled_features'
    
    print(enabled_features_file)
    # Train L2R model: LambdaMART
    # Parameters 
    
#     n_leaves = '10'
#     learning_rate = '0.1'
#     n_trees = '1000'
#     hpo_params = [n_leaves, learning_rate, n_trees]
    
    
    
    metric2t = 'MAP' # 'MAP, NDCG@k, DCG@k, P@k, RR@k, ERR@k (default=ERR@10)'
    
    ranker_type = '6' # LambdaMART
    
    # normalization: Feature Engineering?
    norm_params = ['-norm', 'zscore'] # 'sum', 'zscore', 'linear'
    
    l2r_params = [
        '-validate',
        val_data_file,
        '-ranker',
        ranker_type,
        '-metric2t',
        metric2t,
        '-feature',
        enabled_features_file
    ]
    
    # Run train
    
#     lmart_model = L2Ranker(ranklib_location, l2r_params)
#     lmart_model = L2Ranker(ranklib_location, l2r_params, norm_params)
    

./bioasq_dir/bioasq_lmart_enabled_features


In [7]:
#     lmart_model.train(train_data_file, hpo_params)

In [8]:
#     lmart_model.gen_run_file(test_data_file, run_file)

In [9]:
    trec_eval_command = '../../eval/trec_eval'
    qrels_val_file = './bioasq_dir/bioasq.dev_qrels'
#     eval(trec_eval_command, qrels_file, './run_l2linear')

In [10]:
    # HPO 
    working_directory = './hpo_workdir/'
    
    
    worker = HpoWorker(run_id='0')
    cs = worker.get_configspace()

    config = cs.sample_configuration().get_dictionary()
    
        
#     pre_run_file = workdir + 'pre_run_' + dataset + l2r_model
    
    run_file = workdir + 'run_' + dataset + l2r_model
    
    print(config)
    res = worker.compute(hpo_config=config, working_directory='.')
    print(res['loss'])

{'learning_rate': 0.1, 'n_leaves': 10, 'n_trees': 380}
['java', '-jar', '../../../ranklib/RankLib-2.12.jar', '-train', './bioasq_dir/bioasq.trai_features', '-norm', 'zscore', '-validate', './bioasq_dir/bioasq.dev_features', '-ranker', '6', '-metric2t', 'MAP', '-feature', './bioasq_dir/bioasq_lmart_enabled_features', '-leaf', '10', '-shrinkage', '0.1', '-tree', '380', '-save', './bioasq_dir/bioasq_lmart__leaves10_lr0.1_n380']
Model saved:  ./bioasq_dir/bioasq_lmart__leaves10_lr0.1_n380
['java', '-jar', '../../../ranklib/RankLib-2.12.jar', '-load', './bioasq_dir/bioasq_lmart__leaves10_lr0.1_n380', '-norm', 'zscore', '-rank', './bioasq_dir/bioasq.dev_features', '-indri', './bioasq_dir/pre_run_bioasq_lmart__leaves10_lr0.1_n380']
<class 'list'>
Run model saved:  ./bioasq_dir/run_bioasq_lmart__leaves10_lr0.1_n380
['../../eval/trec_eval', '-m', 'map', '-m', 'P.20', '-m', 'ndcg_cut.20', './bioasq_dir/bioasq.dev_qrels', './bioasq_dir/run_bioasq_lmart__leaves10_lr0.1_n380']
map                  

In [11]:
    qrels_test_file = './bioasq_dir/bioasq.test_qrels'
    run_val_file = './this.file'
    lmart_model = res['model']
    lmart_model.gen_run_file(test_data_file, run_val_file)
    eval(trec_eval_command, qrels_test_file, run_val_file)

['java', '-jar', '../../../ranklib/RankLib-2.12.jar', '-load', './bioasq_dir/bioasq_lmart__leaves10_lr0.1_n380', '-norm', 'zscore', '-rank', './bioasq_dir/bioasq.test_features', '-indri', './this.file']
<class 'list'>
Run model saved:  ./this.file
['../../eval/trec_eval', '-m', 'map', '-m', 'P.20', '-m', 'ndcg_cut.20', './bioasq_dir/bioasq.test_qrels', './this.file']
map                   	all	0.4721
P_20                  	all	0.2687
ndcg_cut_20           	all	0.5649

Run error:  None
No errors


'map                   \tall\t0.4721\nP_20                  \tall\t0.2687\nndcg_cut_20           \tall\t0.5649\n'