In [1]:
import os
from os.path import join
import numpy as np
import pandas as pd

# ---------------------------------------------------------------------------
# Notebook configuration: where the project, the course data and the Terrier
# installation live, plus every derived path used by the later cells.
#
# The three base locations keep their original hard-coded defaults but can be
# overridden with environment variables (AE2_PROJECT_FOLDER, AE2_HW_FILES,
# AE2_TERRIER_PATH) so the notebook can run on another machine without
# editing this cell.
# ---------------------------------------------------------------------------

# Script extension of the Terrier launchers: "sh" by default, "bat" on Windows.
sh_or_bat = "sh"
project_folder = r"/Users/fferegrino/Documents/GitHub/information-retrieval/ae2/"
hw_files = r"/Users/fferegrino/Documents/ae1/"
if os.name == 'nt':
    project_folder = r"C:\Users\anton\Documents\GitHub\information-retrieval\ae2"
    # Raw string: the doubled backslashes are kept literally in the value.
    hw_files = r"C:\\terrier_data\\"
    sh_or_bat = "bat"

# Optional per-machine overrides; when a variable is unset the value above is kept.
project_folder = os.environ.get("AE2_PROJECT_FOLDER", project_folder)
hw_files = os.environ.get("AE2_HW_FILES", hw_files)

# NOTE(review): there is no Windows default for the Terrier installation, so on
# Windows AE2_TERRIER_PATH has to be set for the commands below to resolve.
terrier_path = os.environ.get(
    "AE2_TERRIER_PATH", r"/Users/fferegrino/Documents/terrier-core-4.2"
)

# Files and folders that belong to this project.
logs_folder = join(project_folder, "logs")
results_folder = join(project_folder, "results")
features_file = join(project_folder, "features.txt")
jforest_file = join(project_folder, "jforests.properties")

# Index, topics and qrels located under the data folder.
terrier_index = join(hw_files, r"indices/blocks_fields_stemming")
training_topics = join(hw_files, r"topics/training/topics")
validation_topics = join(hw_files, r"topics/validation/topics")
training_qrels = join(hw_files, r"topics/training/qrels")
validation_qrels = join(hw_files, r"topics/validation/qrels")
hp04_topics = join(hw_files, r"topics/HP04/topics")
hp04_qrels = join(hw_files, r"topics/HP04/qrels")

# Terrier launcher scripts (extension chosen above).
terrier = join(terrier_path, "bin", "trec_terrier.%s" % sh_or_bat)
teval = join(terrier_path, "bin", "trec_eval.%s" % sh_or_bat)
anyclass = join(terrier_path, "bin", "anyclass.%s" % sh_or_bat)

# Outputs produced in the results folder by the later cells.
letor_tr_file = join(results_folder, "tr.letor")
letor_va_file = join(results_folder, "va.letor")
ensemble_file = join(results_folder, "ensemble.txt")

In [2]:
def write_log(file, content, command=None):
    """Write captured command output to a log file inside ``logs_folder``.

    Parameters
    ----------
    file : str
        Name of the log file; it is joined onto the module-level
        ``logs_folder``.
    content : iterable of str
        Output lines. Each one is written followed by a newline.
    command : str, optional
        When given, it is written first, followed by an empty line, so the
        log records which command produced the output.

    Notes
    -----
    The target directory is created when it does not exist yet; previously a
    missing logs folder made ``open`` fail with ``FileNotFoundError``.
    An existing log file of the same name is overwritten.
    """
    log_path = join(logs_folder, file)

    # Create the logs directory on demand (e.g. first run on a fresh checkout).
    log_dir = os.path.dirname(log_path)
    if log_dir and not os.path.isdir(log_dir):
        os.makedirs(log_dir)

    with open(log_path, "w") as w:
        if command is not None:
            w.write(command + "\n\n")
        for line in content:
            w.write(line + "\n")

Next, we want to retrieve results for the training topics. In this step we calculate results with multiple features, as listed in the features file (`features.txt` in the project folder; `etc/features.list` in the Terrier documentation), so we use a series of Matching classes: FatFull to make a FatResultSet (i.e. a ResultSet with extra posting information), and FatFeaturedScoringMatching to add the additional features and return a FeaturedResultSet. We then add the document label from the qrels using LabelDecorator, and write the results in a LETOR-compatible results file using Normalised2LETOROutputFormat:

In [3]:
results_file = join(results_folder,"pl2_tr_ltr.res")
command="%s -r -Dtrec.topics=%s " % (terrier, training_topics) + \
    "-Dtrec.results.file=%s -Dtrec.model=PL2 -Dterrier.index.path=%s " % (results_file,  terrier_index) + \
    "-Dtrec.matching=FatFeaturedScoringMatching,org.terrier.matching.daat.FatFull " + \
    "-Dtrec.querying.outputformat=Normalised2LETOROutputFormat " + \
    "-Dquerying.postprocesses.order=QueryExpansion,org.terrier.learning.LabelDecorator " + \
    "-Dquerying.postprocesses.controls=labels:org.terrier.learning.LabelDecorator,qe:QueryExpansion " + \
    "-Dquerying.default.controls=labels:on " + \
    "-Dlearning.labels.file=%s " % training_qrels + \
    "-Dtrec.results.file=%s -Dproximity.dependency.type=SD " % letor_tr_file + \
    "-Dfat.featured.scoring.matching.features=FILE -Dfat.featured.scoring.matching.features.file=%s " % features_file 
    
run_results = !$command
write_log("pl2_tr_ltr_querying.log", run_results, command=command)

#eval_file = join(results_folder, "pl2_tr_ltr.eval")
#command = "%s %s %s -q > %s" % (teval, training_qrels, results_file, eval_file)    

#run_results = !$command
#write_log("pl2_tr_ltr_eval.log", run_results, command=command)

Let's have a look at what was output into tr.letor:
(maybe not, it is too big)

The header reports the names of the features. "score" means the model used to generate the sample, i.e. the first-pass retrieval, in our case PL2. After the header, there is a single line in the output for each retrieved document for each query. The label obtained from the qrels file is the first entry on each row.

We repeat the retrieval step for the validation queries (this time from the 2003 TREC task):

Reference: [Learning to Rank with Terrier (Terrier 4.2 documentation)](http://terrier.org/docs/v4.2/learning.html)

In [4]:
results_file = join(results_folder,"pl2_va_ltr.res")
command="%s -r -Dtrec.topics=%s " % (terrier, validation_topics) + \
    "-Dtrec.results.file=%s -Dtrec.model=PL2 -Dterrier.index.path=%s " % (results_file,  terrier_index) + \
    "-Dtrec.matching=FatFeaturedScoringMatching,org.terrier.matching.daat.FatFull " + \
    "-Dtrec.querying.outputformat=Normalised2LETOROutputFormat " + \
    "-Dquerying.postprocesses.order=QueryExpansion,org.terrier.learning.LabelDecorator " + \
    "-Dquerying.postprocesses.controls=labels:org.terrier.learning.LabelDecorator,qe:QueryExpansion " + \
    "-Dquerying.default.controls=labels:on " + \
    "-Dlearning.labels.file=%s " % validation_qrels + \
    "-Dtrec.results.file=%s -Dproximity.dependency.type=SD " % letor_va_file + \
    "-Dfat.featured.scoring.matching.features=FILE -Dfat.featured.scoring.matching.features.file=%s " % features_file 
    
run_results = !$command
write_log("pl2_va_ltr_querying.log", run_results, command=command)

#eval_file = join(results_folder, "pl2_va_ltr.eval")
#command = "%s %s %s -q > %s" % (teval, validation_qrels, results_file, eval_file)    

#run_results = !$command
#write_log("pl2_va_ltr_eval.log", run_results, command=command)

In [5]:
command="%s edu.uci.jforests.applications.Runner " % anyclass + \
    "--config-file %s " % jforest_file + \
    "--cmd=generate-bin --ranking --folder %s " % results_folder + \
    "--file tr.letor --file va.letor"
    
run_results = !$command
write_log("anyclass_run_1.log", run_results, command=command)

In [10]:
tr_bin = join(results_folder, "tr.bin")
va_bin = join(results_folder, "va.bin")
command="%s edu.uci.jforests.applications.Runner " % anyclass + \
    "--config-file %s " % jforest_file + \
    "--cmd=train --ranking --folder %s " % results_folder + \
    "--train-file %s --validation-file %s " % (tr_bin, va_bin) + \
    "--output-model %s" % ensemble_file
    
run_results = !$command
write_log("anyclass_run_2.log", run_results, command=command)

Once the learned model (from Jforests, this is an XML file which takes the form of a gradient boosted regression tree) is obtained in ensemble.txt, we can use this to apply the learned model. The configuration for Terrier is similar to retrieval for the training topics, but we additionally use JforestsModelMatching for application of the learned model, and output the final results using the default, trec_eval compatible TRECDocnoOutputFormat:

In [14]:
ltr_test_results_file = join(results_folder,"pl2_te_ltr.res")
jforest_statistics = join(results_folder, "jforests-feature-stats.txt")
command="%s -r -Dtrec.model=PL2 -Dtrec.topics=%s " % (terrier, hp04_topics) + \
    "-Dtrec.matching=JforestsModelMatching,FatFeaturedScoringMatching,org.terrier.matching.daat.FatFull " + \
    "-Dfat.featured.scoring.matching.features=FILE -Dfat.featured.scoring.matching.features.file=%s " % features_file + \
    "-Dtrec.results.file=%s  -Dterrier.index.path=%s " % (ltr_test_results_file,  terrier_index) + \
    "-Dfat.matching.learned.jforest.model=%s " % ensemble_file + \
    "-Dfat.matching.learned.jforest.statistics=%s " % jforest_statistics + \
    "-Dproximity.dependency.type=SD"
    
    
run_results = !$command
write_log("pl2_te_ltr_querying.log", run_results, command=command)

In [15]:
test_results_file = join(results_folder,"pl2_te.res")
command="%s -r -Dtrec.model=PL2 -Dtrec.topics=%s " % (terrier, hp04_topics) + \
    "-Dtrec.results.file=%s  -Dterrier.index.path=%s " % (test_results_file,  terrier_index) 
    
run_results = !$command
write_log("pl2_te_querying.log", run_results, command=command)

In [16]:
eval_file = join(results_folder, "pl2_ltr.eval")
command = "%s %s %s -q > %s" % (teval, hp04_qrels, ltr_test_results_file, eval_file)
run_results = !$command
write_log("pl2_ltr_eval.log", run_results, command=command)

eval_file = join(results_folder, "pl2.eval")
command = "%s %s %s -q > %s" % (teval, hp04_qrels, test_results_file, eval_file)
run_results = !$command
write_log("pl2_eval.log", run_results, command=command)