In [1]:
import os
from os.path import join
import numpy as np
import pandas as pd

# --- Machine-specific locations ---------------------------------------------
# NOTE: these are absolute paths on the author's machines; adjust them before
# running the notebook anywhere else.
sh_or_bat = "sh"
project_folder = r"/Users/fferegrino/Documents/GitHub/information-retrieval/ae2/q2/"
base_folder = r"/Users/fferegrino/Documents/GitHub/information-retrieval/ae2/"
hw_files = r"/Users/fferegrino/Documents/ae1/"
terrier_path =      r"/Users/fferegrino/Documents/terrier-core-4.2"
if os.name == 'nt':
    # Windows overrides: different locations and the .bat launcher scripts.
    terrier_path =      r"C:\Users\anton\Terrier"
    project_folder = r"C:\Users\anton\Documents\GitHub\information-retrieval\ae2\q2"
    base_folder = r"C:\Users\anton\Documents\GitHub\information-retrieval\ae2"
    hw_files = r"C:\terrier_data"
    sh_or_bat = "bat"

# --- Paths derived from the locations above ---------------------------------
logs_folder =       join(project_folder,"logs")
results_folder =    join(project_folder,"results")
features_file =     join(project_folder,"features.txt")
jforest_file =      join(project_folder,"jforests.properties")
terrier_index =     join(hw_files,r"indices/blocks_fields_stemming")
training_topics =   join(hw_files,r"topics/training/topics")
validation_topics = join(hw_files,r"topics/validation/topics")
training_qrels =    join(hw_files,r"topics/training/qrels")
validation_qrels =  join(hw_files,r"topics/validation/qrels")
hp04_topics =       join(hw_files,r"topics/HP04/topics")
hp04_qrels =        join(hw_files,r"topics/HP04/qrels")
feature_files =     join(hw_files,r"features/pagerank.oos.gz")
jar_file =          join(base_folder, r"search-features/target/ircourse-1.0-SNAPSHOT.jar")

# Create every output directory up front.  Log files are opened under
# logs_folder and run outputs are written under results_folder, so both must
# exist on a fresh checkout, not just project_folder.  exist_ok=True keeps the
# cell safe to re-run.
for _folder in (project_folder, logs_folder, results_folder):
    os.makedirs(_folder, exist_ok=True)

# --- Terrier launcher scripts (.sh or .bat depending on the platform) --------
terrier = join(terrier_path, "bin", "trec_terrier.%s" % sh_or_bat)
teval = join(terrier_path, "bin", "trec_eval.%s" % sh_or_bat)
anyclass = join(terrier_path, "bin", "anyclass.%s" % sh_or_bat)

# --- Default output file names inside results_folder -------------------------
letor_tr_file= join(results_folder, "tr.letor")
letor_va_file= join(results_folder, "va.letor")
ensemble_file= join(results_folder, "ensemble.txt")

def write_log(file, content, command=None):
    """Write captured output to a log file inside ``logs_folder``.

    Parameters
    ----------
    file : str
        Name of the log file, relative to ``logs_folder``. An existing file
        with the same name is overwritten.
    content : iterable of str
        Lines to write; a newline is appended to each one.
    command : str, optional
        When given, it is written first, followed by a blank line, so the log
        records what produced the output.
    """
    log_path = join(logs_folder, file)
    with open(log_path, "w") as handle:
        if command is not None:
            handle.write(command + "\n\n")
        handle.writelines(entry + "\n" for entry in content)

## Q2:  
Now implement two additional proximity search features. A proximity search feature boosts documents in which the query terms occur close together. You must
implement two of the functions numbered 1-5 in the following paper: 
 > Ronan Cummins and Colm O'Riordan. 2009. Learning in a pairwise term-term proximity framework for information retrieval. In Proceedings of ACM SIGIR 2009. http://ir.dcs.gla.ac.uk/~ronanc/papers/cumminsSIGIR09.pdf
 
NB: You should calculate your feature by aggregating the function score (mean or min or max, as appropriate) over all pairs of query terms.  

You will implement your new features as two DocumentScoreModifier (DSM) classes, using the example DSM code provided in the GitHub repository (https://github.com/cmacdonald/IRcourseHM). 

You can add a DSM as an additional feature by appending its full name to the feature file, e.g.:  

```
DSM:org.myclass.MyProx1DSM  
```

### Q2a:  
Name the two proximity features you have chosen to implement and provide a brief rationale for your choice of these two particular features, especially in terms of how they might affect the performance of the deployed LTR baseline approach of Q1.

#### Features

| Feature       | Class                              |
| ------------- |:----------------------------------:|
| avg_dist      | `uk.ac.gla.dcs.dsms.AvgDistDsm`    |
| diff_avg_pos  | `uk.ac.gla.dcs.dsms.DiffAvgPosDsm` |


In [2]:
# Package the `search-features` Maven project (`clean package`).
# The `!` prefix runs the command in a shell; assigning the result keeps the
# output lines in `build_log` instead of displaying them in the notebook.
# NOTE(review): the pom path is relative to the notebook's working directory,
# while `jar_file` is resolved against `base_folder` - confirm both refer to
# the same checkout.
# NOTE(review): `build_log` is not inspected in the cells visible here, so a
# failed build would go unnoticed - consider checking it before continuing.
build_log = !mvn -f search-features/pom.xml clean package 

In [3]:
# Feature-set name -> extra DSM feature lines appended to the shared template.
# Each key becomes one `features_<name>.txt` file in project_folder.
features = {
    'none': [],
#    'min_dist': ['DSM:uk.ac.gla.dcs.dsms.MinDistDsm'],
#    'avg_min_dist': ['DSM:uk.ac.gla.dcs.dsms.AvgMinDistDsm'],
    'avg_dist': ['DSM:uk.ac.gla.dcs.dsms.AvgDistDsm'],
    'diff_avg_pos': ['DSM:uk.ac.gla.dcs.dsms.DiffAvgPosDsm'],
    'both': [
        'DSM:uk.ac.gla.dcs.dsms.DiffAvgPosDsm',
        'DSM:uk.ac.gla.dcs.dsms.AvgDistDsm',
    ],
#    'all': ['DSM:uk.ac.gla.dcs.dsms.MinDistDsm', 'DSM:uk.ac.gla.dcs.dsms.AvgMinDistDsm', 'DSM:uk.ac.gla.dcs.dsms.AvgDistDsm', 'DSM:uk.ac.gla.dcs.dsms.DiffAvgPosDsm']
}

# Lines shared by every feature file: the BM25 weighting model and the static
# feature read from `feature_files`.
template = "WMODEL:SingleFieldModel(BM25,0)\nQI:StaticFeature(OIS,%s)" % feature_files

# Write one feature file per feature set: the template followed by that set's
# DSM lines, one per line, with no trailing newline.
for model, dsm_lines in features.items():
    final = "\n".join([template] + dsm_lines)
    target_path = join(project_folder, "features_%s.txt" % model)
    with open(target_path, "w") as w:
        w.write(final)

In [4]:
evals = []
for model in features:
    jforest_statistics = join(results_folder, "jforests-feature-stats.txt")
    if os.path.exists(jforest_statistics):
        os.remove(jforest_statistics)
    
    features_file = join(project_folder,"features_%s.txt" % model)

    tr_letor = "tr_%s.letor" % model
    features_file = join(project_folder,"features_%s.txt" % model)

    letor_tr_file= join(results_folder, tr_letor)
    results_file = join(results_folder,"pl2_tr_ltr_%s.res" % model)
    command="CLASSPATH=%s %s -r -Dtrec.topics=%s " % (jar_file, terrier, training_topics) + \
        "-Dtrec.results.file=%s -Dtrec.model=PL2 -Dterrier.index.path=%s " % (results_file,  terrier_index) + \
        "-Dtrec.matching=FatFeaturedScoringMatching,org.terrier.matching.daat.FatFull " + \
        "-Dtrec.querying.outputformat=Normalised2LETOROutputFormat " + \
        "-Dquerying.postprocesses.order=QueryExpansion,org.terrier.learning.LabelDecorator " + \
        "-Dquerying.postprocesses.controls=labels:org.terrier.learning.LabelDecorator,qe:QueryExpansion " + \
        "-Dquerying.default.controls=labels:on " + \
        "-Dlearning.labels.file=%s " % training_qrels + \
        "-Dtrec.results.file=%s -Dproximity.dependency.type=SD " % letor_tr_file + \
        "-Dfat.featured.scoring.matching.features=FILE -Dfat.featured.scoring.matching.features.file=%s " % features_file 

    run_results = !$command
    write_log("pl2_tr_ltr_%s_querying.log" % model, run_results, command=command)
    print("pl2_tr_ltr_%s_querying.log" % model)

    ###

    va_letor = "va_%s.letor" % model
    letor_va_file= join(results_folder, va_letor)
    results_file = join(results_folder,"pl2_va_ltr_%s.res" % model)
    command="CLASSPATH=%s %s -r -Dtrec.topics=%s " % (jar_file, terrier, validation_topics) + \
        "-Dtrec.results.file=%s -Dtrec.model=PL2 -Dterrier.index.path=%s " % (results_file,  terrier_index) + \
        "-Dtrec.matching=FatFeaturedScoringMatching,org.terrier.matching.daat.FatFull " + \
        "-Dtrec.querying.outputformat=Normalised2LETOROutputFormat " + \
        "-Dquerying.postprocesses.order=QueryExpansion,org.terrier.learning.LabelDecorator " + \
        "-Dquerying.postprocesses.controls=labels:org.terrier.learning.LabelDecorator,qe:QueryExpansion " + \
        "-Dquerying.default.controls=labels:on " + \
        "-Dlearning.labels.file=%s " % validation_qrels + \
        "-Dtrec.results.file=%s -Dproximity.dependency.type=SD " % letor_va_file + \
        "-Dfat.featured.scoring.matching.features=FILE -Dfat.featured.scoring.matching.features.file=%s " % features_file 

    run_results = !$command
    write_log("pl2_va_ltr_%s_querying.log" % model, run_results, command=command)
    print("pl2_va_ltr_%s_querying.log" % model)

    ###

    command="%s edu.uci.jforests.applications.Runner " % anyclass + \
        "--config-file %s " % jforest_file + \
        "--cmd=generate-bin --ranking --folder %s " % results_folder + \
        "--file %s --file %s" %(tr_letor, va_letor)

    run_results = !$command
    write_log("anyclass_run_%s_1.log" % model, run_results, command=command)
    print("anyclass_run_%s_1.log" % model)

    ###

    tr_bin = join(results_folder, "tr_%s.bin" % model)
    va_bin = join(results_folder, "va_%s.bin" % model)
    ensemble_file= join(results_folder, "ensemble_%s.txt" % model)
    command="%s edu.uci.jforests.applications.Runner " % anyclass + \
        "--config-file %s " % jforest_file + \
        "--cmd=train --ranking --folder %s " % results_folder + \
        "--train-file %s --validation-file %s " % (tr_bin, va_bin) + \
        "--output-model %s" % ensemble_file

    run_results = !$command
    write_log("anyclass_run_%s_2.log" % model, run_results, command=command)
    print("anyclass_run_%s_2.log" % model)

    ###

    ltr_test_results_file = join(results_folder,"pl2_te_ltr_%s.res" % model)
    command="CLASSPATH=%s %s -r -Dtrec.model=PL2 -Dtrec.topics=%s " % (jar_file, terrier, hp04_topics) + \
        "-Dtrec.matching=JforestsModelMatching,FatFeaturedScoringMatching,org.terrier.matching.daat.FatFull " + \
        "-Dfat.featured.scoring.matching.features=FILE -Dfat.featured.scoring.matching.features.file=%s " % features_file + \
        "-Dtrec.results.file=%s  -Dterrier.index.path=%s " % (ltr_test_results_file,  terrier_index) + \
        "-Dfat.matching.learned.jforest.model=%s " % ensemble_file + \
        "-Dfat.matching.learned.jforest.statistics=%s " % jforest_statistics + \
        "-Dproximity.dependency.type=SD"

    run_results = !$command
    write_log("pl2_te_ltr_%s_querying.log" % model, run_results, command=command)
    print("pl2_te_ltr_%s_querying.log" % model)

    ###

    eval_file = join(results_folder, "pl2_ltr_%s.eval" % model)
    command = "%s %s %s -q > %s" % (teval, hp04_qrels, ltr_test_results_file, eval_file)
    run_results = !$command
    write_log("pl2_ltr_%s_eval.log" % model, run_results, command=command)
    print("pl2_ltr_%s_eval.log" % model)
    
    evals.append(eval_file)

pl2_tr_ltr_none_querying.log
pl2_va_ltr_none_querying.log
anyclass_run_none_1.log
anyclass_run_none_2.log
pl2_te_ltr_none_querying.log
pl2_ltr_none_eval.log
pl2_tr_ltr_avg_dist_querying.log
pl2_va_ltr_avg_dist_querying.log
anyclass_run_avg_dist_1.log
anyclass_run_avg_dist_2.log
pl2_te_ltr_avg_dist_querying.log
pl2_ltr_avg_dist_eval.log
pl2_tr_ltr_diff_avg_pos_querying.log
pl2_va_ltr_diff_avg_pos_querying.log
anyclass_run_diff_avg_pos_1.log
anyclass_run_diff_avg_pos_2.log
pl2_te_ltr_diff_avg_pos_querying.log
pl2_ltr_diff_avg_pos_eval.log
pl2_tr_ltr_both_querying.log
pl2_va_ltr_both_querying.log
anyclass_run_both_1.log
anyclass_run_both_2.log
pl2_te_ltr_both_querying.log
pl2_ltr_both_eval.log


In [5]:
from utils.read_eval import read_eval

# For every evaluation file produced above, print its path followed by the
# first 'map' and 'P_5' values of the totals returned by read_eval.
for eval_path in evals:
    total, pq = read_eval(eval_path)
    map_score = total['map'][0]
    p_at_5 = total['P_5'][0]
    print(eval_path)
    print(map_score, p_at_5)

/Users/fferegrino/Documents/GitHub/information-retrieval/ae2/q2/results/pl2_ltr_none.eval
0.4918 0.1413
/Users/fferegrino/Documents/GitHub/information-retrieval/ae2/q2/results/pl2_ltr_avg_dist.eval
0.4746 0.1387
/Users/fferegrino/Documents/GitHub/information-retrieval/ae2/q2/results/pl2_ltr_diff_avg_pos.eval
0.4738 0.1387
/Users/fferegrino/Documents/GitHub/information-retrieval/ae2/q2/results/pl2_ltr_both.eval
0.5029 0.1333
