In [1]:
# Stronger baseline: Listwise L2R - LambdaMART
# Hyperparameter optimization with HPonsteroids requires Python 3!

In [2]:
# Imports
import os
import subprocess
import sys

# REMOVE!!
from ir_baseline import *

In [3]:
# Functions
def generate_run_file(pre_run_file, run_file):
    """Rewrite a preliminary ranking file into the final run file.

    Every line of ``pre_run_file`` is copied to ``run_file`` with each
    ``docid=`` substring removed and each ``indri`` substring replaced by
    ``lambdaMART``.

    The input is read completely before the output is opened, so the two
    paths may safely refer to the same file.

    Args:
        pre_run_file: Path of the text file to read.
        run_file: Path of the text file to (over)write.
    """
    with open(pre_run_file, 'rt') as input_f:
        pre_run = input_f.readlines()

    with open(run_file, 'wt') as out_f:
        out_f.writelines(
            line.replace('docid=', '').replace('indri', 'lambdaMART')
            for line in pre_run
        )

In [4]:
# Classes
class L2Ranker:
    """Wrapper that drives the RankLib command-line jar for training and ranking.

    The last element of ``params`` is used as the model file path (the
    argument of ``-save``); RankLib's console output is written to
    ``<model file>.log``.
    """

    def __init__(self, ranklib_location, params, normalization=None):
        """Store the RankLib location and the command-line options.

        Args:
            ranklib_location: Directory that contains ``RankLib-2.12.jar``.
            params: List of RankLib command-line tokens; its last element
                must be the model file path.
            normalization: Optional list of normalization tokens (for
                example ``['-norm', 'zscore']``). Defaults to no tokens.
        """
        self.ranklib_location = ranklib_location
        # Works with Oracle JSE
        # java version "1.8.0_211"
        # Java(TM) SE Runtime Environment (build 1.8.0_211-b12)
        # Java HotSpot(TM) 64-Bit Server VM (build 25.211-b12, mixed mode)
        self.params = params
        self.log_file = self.params[-1] + '.log'
        # os.path.join keeps a trailing slash as-is and adds one when missing.
        self.ranker_command = ['java', '-jar', os.path.join(ranklib_location, 'RankLib-2.12.jar')]
        # A fresh list per instance: a mutable default would be shared.
        self.normalization = list(normalization) if normalization is not None else []

    @staticmethod
    def _pre_run_path(run_file):
        """Return the path of the intermediate file for ``run_file``.

        Only the file name is rewritten (first ``run_`` becomes ``pre_run_``),
        so a directory whose name contains ``run_`` is left untouched.
        """
        head, tail = os.path.split(run_file)
        return os.path.join(head, tail.replace('run_', 'pre_run_', 1))

    def _run_ranklib(self, toolkit_parameters, log_mode):
        """Run one RankLib command, sending stdout and stderr to the log file.

        Raises:
            subprocess.CalledProcessError: If the process exits with a
                non-zero status.
        """
        print(toolkit_parameters)
        with open(self.log_file, log_mode) as rf:
            subprocess.run(
                toolkit_parameters,
                stdin=subprocess.DEVNULL,
                stdout=rf,
                stderr=subprocess.STDOUT,
                shell=False,
                check=True,  # do not report success when RankLib failed
            )

    def train(self, train_data_file):
        """Train a model on ``train_data_file`` using the stored options.

        The log file is truncated before the run.

        Raises:
            subprocess.CalledProcessError: If RankLib exits with a non-zero
                status.
        """
        toolkit_parameters = [
            *self.ranker_command,  # * to unpack list elements
            '-train',
            train_data_file,
            *self.normalization,
            *self.params,
        ]

        self._run_ranklib(toolkit_parameters, 'wt')
        print('Model saved: ', self.params[-1])

    def gen_run_file(self, test_data_file, run_file):
        """Rank ``test_data_file`` with the saved model and write ``run_file``.

        RankLib writes an intermediate file (see ``_pre_run_path``), which is
        then post-processed by ``generate_run_file``. Output is appended to
        the log file.

        Raises:
            subprocess.CalledProcessError: If RankLib exits with a non-zero
                status.
        """
        pre_run_file = self._pre_run_path(run_file)
        toolkit_parameters = [
            *self.ranker_command,  # * to unpack list elements
            '-load',
            self.params[-1],
            *self.normalization,
            '-rank',
            test_data_file,
            '-indri',
            pre_run_file,
        ]

        self._run_ranklib(toolkit_parameters, 'at')

        generate_run_file(pre_run_file, run_file)

        print('Run model saved: ', run_file)

In [5]:
# Main
if __name__ == "__main__":

    # --- Dataset and locations -------------------------------------------
    # dataset and data_split were previously taken from sys.argv[1] and
    # sys.argv[2]; they are fixed here for the notebook run.
    dataset = 'bioasq'
    workdir = './' + dataset + '_dir/'
    data_split = 'train'
    k_fold = 's1'
    ranklib_location = '../../../ranklib/'

    # --- Feature files ----------------------------------------------------
    # The '_reduced' variants of these paths were used as alternatives.
    train_data_file = './bioasq_dir/bioasq.trai_features'
    val_data_file = './bioasq_dir/bioasq.dev_features'
    test_data_file = './bioasq_dir/bioasq.test_features'

    # --- Output files -----------------------------------------------------
    l2r_model = '_lmart_'
    save_model_file = workdir + dataset + l2r_model
    pre_run_file = workdir + 'pre_run_' + dataset + l2r_model
    run_file = workdir + 'run_' + dataset + l2r_model
    enabled_features_file = save_model_file + 'enabled_features'

    print(enabled_features_file)

    # --- Train L2R model: LambdaMART -------------------------------------
    # Hyperparameters
    n_leaves = '10'
    learning_rate = '0.1'
    n_trees = '1000'
    hpo_params = [n_leaves, learning_rate, n_trees]

    # 'MAP, NDCG@k, DCG@k, P@k, RR@k, ERR@k (default=ERR@10)'
    metric2t = 'MAP'

    # LambdaMART
    ranker_type = '6'

    # normalization: Feature Engineering? Options: 'sum', 'zscore', 'linear'
    norm_params = ['-norm', 'zscore']

    # One option and its value per line; the model path must stay last.
    l2r_params = [
        '-validate', val_data_file,
        '-ranker', ranker_type,
        '-metric2t', metric2t,
        '-leaf', n_leaves,
        '-shrinkage', learning_rate,
        # TODO: is '-tree' necessary according to the original paper?
        '-tree', n_trees,
        '-feature', enabled_features_file,
        '-save', save_model_file,
    ]

    # --- Build the ranker -------------------------------------------------
    # To enable feature normalization, pass norm_params as a third argument:
    #     L2Ranker(ranklib_location, l2r_params, norm_params)
    lmart_model = L2Ranker(ranklib_location, l2r_params)
    

./bioasq_dir/bioasq_lmart_enabled_features


In [6]:
    # Train the model on the training features; the command line is printed below.
    lmart_model.train(train_data_file)

['java', '-jar', '../../../ranklib/RankLib-2.12.jar', '-train', './bioasq_dir/bioasq.trai_features', '-validate', './bioasq_dir/bioasq.dev_features', '-ranker', '6', '-metric2t', 'MAP', '-leaf', '10', '-shrinkage', '0.1', '-tree', '1000', '-feature', './bioasq_dir/bioasq_lmart_enabled_features', '-save', './bioasq_dir/bioasq_lmart_']
Model saved:  ./bioasq_dir/bioasq_lmart_


In [7]:
    # Rank the test features with the trained model and write the run file.
    lmart_model.gen_run_file(test_data_file, run_file)

['java', '-jar', '../../../ranklib/RankLib-2.12.jar', '-load', './bioasq_dir/bioasq_lmart_', '-rank', './bioasq_dir/bioasq.test_features', '-indri', './bioasq_dir/pre_run_bioasq_lmart_']
<class 'list'>
Run model saved:  ./bioasq_dir/run_bioasq_lmart_


In [8]:
    # Evaluate the run file against the test qrels.
    # NOTE(review): `eval` is called with three arguments, so it is presumably
    # a helper star-imported from ir_baseline that shadows the built-in
    # eval() - confirm, and consider importing it under an explicit name.
    trec_eval_command = '../../eval/trec_eval'
    qrels_file = './bioasq_dir/bioasq.test_qrels'
    eval(trec_eval_command, qrels_file, run_file)

['../../eval/trec_eval', '-m', 'map', '-m', 'P.20', '-m', 'ndcg_cut.20', './bioasq_dir/bioasq.test_qrels', './bioasq_dir/run_bioasq_lmart_']
map                   	all	0.3043
P_20                  	all	0.1646
ndcg_cut_20           	all	0.3609

Run error:  None


'Ok'