#### Before you start  
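Make sure you run the following cell first: it imports the required libraries and defines the Terrier paths that the rest of the notebook relies on.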

In [10]:
import os
import itertools
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from os.path import join

from slugify import slugify
from glob import glob
from utils.parse_terrier_output import parse_stats, parse_run_results, parse_evaluation_results
from utils.read_eval import read_eval, get_recalls
from utils.logging import write_log

# Terrier installation and the folders this notebook reads from / writes to.
terrier_path = r"/Users/fferegrino/Documents/terrier-core-4.2"
logs_folder = r"/Users/fferegrino/Documents/GitHub/ir-ae1/logs"
results_folder = r"/Users/fferegrino/Documents/terrier-core-4.2/var/results"

# Relative folder where the parsed evaluation DataFrames are stored as CSV.
dataframes_path = "results"

# Launcher-script extension: "bat" when os.name is 'nt' (Windows), "sh" otherwise.
sh_or_bat = "bat" if os.name == 'nt' else "sh"

# Full paths to the three Terrier launcher scripts inside <terrier_path>/bin.
setup = join(terrier_path, "bin", "trec_setup.%s" % sh_or_bat)
terrier = join(terrier_path, "bin", "trec_terrier.%s" % sh_or_bat)
teval = join(terrier_path, "bin", "trec_eval.%s" % sh_or_bat)

## Setup topics:  

(This is for the assessed exercise: the cell below defines the topic sets, weighting models and query-expansion settings to evaluate.)

In [11]:
# Experiment grid: topic sets x weighting models x query-expansion flag.
tp = ['HP04', 'NP04', 'TD04']
models = ['LnTfIdfModel', 'Log10TfIdfModel', 'LogTfIdfModel', 'TF_IDF', 'BM25', 'PL2']
query_exp = ['', '-q']  # '' = no extra flag, '-q' = flag appended to the Terrier command
combinations = list(itertools.product(tp, models, query_exp))

# Custom models must be referenced by their fully qualified Java class name;
# models missing from this map are passed to Terrier by their short name.
java_models = {
    name: 'com.thatcsharpguy.models.' + name
    for name in ('LnTfIdfModel', 'Log10TfIdfModel', 'LogTfIdfModel')
}
classpath = r"/Users/fferegrino/Documents/GitHub/ir-ae1/custom-model/target/ircourse-1.0-SNAPSHOT.jar"

# Per-topic-set locations of the topics file and the relevance judgements (qrels).
topics = {topic: r"/Users/fferegrino/Documents/ae1/TopicsQrels/%s/topics" % topic for topic in tp}
qrels = {topic: r"/Users/fferegrino/Documents/ae1/TopicsQrels/%s/qrels" % topic for topic in tp}

In [7]:
# Compile models: run Maven on the project in ../custom-model/ through the shell.
# `clean` runs first so that `package` rebuilds from scratch.
# NOTE(review): presumably this produces the jar that `classpath` points to — confirm.
!mvn clean -f ../custom-model/
!mvn package -f ../custom-model/

[[1;34mINFO[m] Scanning for projects...
[[1;34mINFO[m] 
[[1;34mINFO[m] [1m------------------------------------------------------------------------[m
[[1;34mINFO[m] [1mBuilding ircourse 1.0-SNAPSHOT[m
[[1;34mINFO[m] [1m------------------------------------------------------------------------[m
[[1;34mINFO[m] 
[[1;34mINFO[m] [1m--- [0;32mmaven-clean-plugin:2.5:clean[m [1m(default-clean)[m @ [36mircourse[0;1m ---[m
[[1;34mINFO[m] [1m------------------------------------------------------------------------[m
[[1;34mINFO[m] [1;32mBUILD SUCCESS[m
[[1;34mINFO[m] [1m------------------------------------------------------------------------[m
[[1;34mINFO[m] Total time: 0.511 s
[[1;34mINFO[m] Finished at: 2018-02-01T22:57:13Z
[[1;34mINFO[m] Final Memory: 7M/123M
[[1;34mINFO[m] [1m------------------------------------------------------------------------[m
[[1;34mINFO[m] Scanning for projects...
[[1;34mINFO[m] 
[[1;34mINFO[m] [1m-----------------

In [12]:
# http://terrier.org/docs/v4.2/configure_retrieval.html

for comb in combinations:
    topic = comb[0]
    model = comb[1]
    qexp = comb[2]
    topics_file = topics[topic]
    results_file = join(results_folder,  topic + "-" + model + qexp + ".res")
    
    model_name = java_models.get(model, model)
    
    command="CLASSPATH=%s %s -r %s -Dtrec.topics=%s -Dtrec.results.file=%s -Dtrec.model=%s" % (classpath,
                                                                                            terrier, 
                                                                                               qexp,
                                                                                            topics_file, 
                                                                                            results_file,
                                                                                            model_name)
    run_results = !$command
    write_log(join(logs_folder,"run-%s-%s%s.txt" % (topic, model, qexp)), run_results, command)
print("Done!")

Done!


In [13]:
# http://terrier.org/docs/v4.2/evaluation.html
for comb in combinations:
    topic = comb[0]
    model = comb[1]
    qexp = comb[2]
    qrels_file = qrels[topic]
    results_file = join(results_folder,  topic + "-" + model  + qexp + ".res")
    eval_file = join(results_folder,  topic + "-" + model + qexp  + ".eval")
    command = "%s %s %s -q > %s" % (teval, qrels_file, results_file, eval_file)
    evaluation_results = !$command
    write_log(join(logs_folder,"eval-%s-%s%s.txt" % (topic, model,qexp)), evaluation_results, command)

In [14]:
def read_eval(eval_file, skip_first=True):
    """Load a tab-separated evaluation file into two pivoted DataFrames.

    The file is read as three columns: ``measure``, ``query`` and ``value``.
    Rows containing missing values are discarded and surrounding whitespace
    is stripped from the measure names.

    Parameters
    ----------
    eval_file : path or file-like accepted by ``pd.read_table``.
    skip_first : bool, default True
        When true, the first line of the file is ignored.

    Returns
    -------
    (totals, per_query)
        ``totals`` holds the rows whose query is ``'all'``; ``per_query`` holds
        the remaining rows, indexed by the query id converted to ``int`` and
        sorted ascending. Both have one column per measure.
    """
    rows_to_skip = [0] if skip_first else None
    records = pd.read_table(
        eval_file,
        sep='\t',
        header=None,
        skiprows=rows_to_skip,
        names=['measure', 'query', 'value'],
    ).dropna()
    records = records.assign(measure=records['measure'].str.strip())

    # Aggregate rows are tagged with the literal query id 'all'.
    is_total = records['query'] == 'all'
    pivot_args = dict(index='query', columns='measure', values='value')

    totals = records[is_total].pivot(**pivot_args)

    per_query = records[~is_total].pivot(**pivot_args)
    per_query.index = per_query.index.astype(int)
    per_query = per_query.sort_index()

    return totals, per_query

# Parse every .eval file, save the resulting frames as CSV and keep them in
# the `dataframes` dict for use later in the notebook.

# to_csv does not create missing folders, so make sure the target exists.
os.makedirs(dataframes_path, exist_ok=True)

dataframes = {}
for comb in combinations:
    topic, model, qexp = comb
    ev_file = join(results_folder, topic + "-" + model + qexp + ".eval")
    total, per_query = read_eval(ev_file, True)

    # Report (but do not abort on) evaluation files that produced no rows.
    if total is None or len(total) == 0:
        print(comb, "has no total")
    if per_query is None or len(per_query) == 0:
        print(comb, "has no per query")

    total.to_csv(os.path.join(dataframes_path, "total_%s_%s%s.csv" % (topic, model, qexp)))
    per_query.to_csv(os.path.join(dataframes_path, "per_query_%s_%s%s.csv" % (topic, model, qexp)))

    # The key carries the query-expansion suffix, matching the CSV names.
    # Without it the '-q' run replaced the plain run of the same topic/model,
    # so only half of the combinations survived in the dict. Plain runs keep
    # the key "<topic>_<model>"; expanded runs are stored as "<topic>_<model>-q".
    dataframes[topic + "_" + model + qexp] = {
        'total': total,
        'per_query': per_query
    }