## BM25 + RM3 with Anserini

In [None]:
import pickle
import json
import gzip
import os
import subprocess
import numpy as np
import multiprocessing
import re 
import shutil
from itertools import islice
import random

import os
import sys
import uuid
import datetime

In [None]:
## Options

# Run the random hyper-parameter search? Expected values: 'yes' or 'no'.
# NOTE(review): the __main__ cell at the end of the notebook re-assigns
# grid_search = 'yes', so the value chosen here is overridden - confirm intent.
# grid_search = 'yes' 
grid_search = 'no' 

# Build the Anserini index before searching? Expected values: 'yes' or 'no'.
build_index_flag = 'yes'
# build_index_flag = 'no'

# Number of worker processes for the multiprocessing pool.
# NOTE(review): the __main__ cell re-assigns pool_size = 2 before creating the pool.
pool_size = 20

# Number of hits per query (the retrieval helpers also default to hits=100).
hits = 100

In [None]:
# Define paths (all of them are joined by plain string concatenation, so
# directory values must keep their trailing '/').

# Root folder that is walked for the docset pickles and the query JSON files.
dataloc = '../../bioasq_data/'
# dataloc = '../../robust04_data/split_2/'
# Output folder for the index, query files, run files and prediction files.
baseline_files ='./baseline_files/'
# Output folder for the gzipped TREC-text corpus; also used as the index input.
corpus_files ='./corpus_files/'
# NOTE(review): galago_loc is not referenced by any code visible in this file.
galago_loc='./galago-3.10-bin/bin/'
# Root of the Anserini checkout; the launcher scripts are looked up under
# target/appassembler/bin/ inside it.
anserini_loc = '../../../anserini/'

## TREC storage
# Second location that receives a copy of the corpus and topic files.
trec_storage = '/ssd/francisco/trec_datasets/deep-relevance-ranking/'

In [None]:
# Select data split to work with. The value is matched as a substring of the
# docset pickle file names and is part of every generated file name.
# split = "test"
split = "dev"
# split = "train"

In [None]:
def remove_sc(text):
    """Replace every special character in *text* with a single space.

    A "special character" is anything that is neither a word character nor
    whitespace. The text is not stripped, lower-cased or otherwise
    normalised, so consecutive special characters yield consecutive spaces.
    """
    not_word_or_space = r'[^\w\s]'
    cleaned = re.sub(not_word_or_space, ' ', text)
    return cleaned

In [None]:
def get_pickle_docs(pickle_filename):
    """Load a pickled docset and render every entry as a TREC ``<DOC>`` string.

    The pickle is read as a mapping whose values are dicts holding at least
    'title' and 'abstractText'. The document id is the entry's 'pmid' when
    present, otherwise the mapping key. Title and abstract are passed
    through ``remove_sc`` before being embedded.

    As a side effect the ``baseline_files`` and ``corpus_files`` directories
    are created when they do not exist yet.

    Returns:
        dict mapping document id -> TREC-text document string.
    """
    with open(pickle_filename, 'rb') as f_in:
        # NOTE: unpickling can execute arbitrary code - only load trusted files.
        records = pickle.load(f_in)

        for folder in (baseline_files, corpus_files):
            if not os.path.exists(folder):
                os.makedirs(folder)

        rendered = {}
        for record_key, fields in records.items():
            doc_code = fields.pop('pmid') if 'pmid' in fields else record_key
            pieces = [
                '<DOC>\n',
                '<DOCNO>', doc_code, '</DOCNO>\n',
                '<TITLE>', remove_sc(fields.pop('title')), '</TITLE>\n',
                '<TEXT>', remove_sc(fields.pop('abstractText')), '</TEXT>\n',
                '</DOC>\n',
            ]
            rendered[doc_code] = ''.join(pieces)
        return rendered

In [None]:
def to_trecfile(docs, filename, compression = 'yes'):
    """Write all values of *docs*, in iteration order, to a UTF-8 text file.

    Args:
        docs: mapping whose values are already-formatted strings (TREC
            documents or TREC topics); the keys are ignored.
        filename: path of the file to create or overwrite.
        compression: 'yes' writes a gzip-compressed file; any other value
            writes plain text.
    """
    # Both branches differ only in how the file is opened.
    opener = gzip.open if compression == 'yes' else open
    with opener(filename, 'wt', encoding='utf-8') as f_out:
        f_out.writelines(docs.values())

In [None]:
# Build corpus index with Anserini
def build_index(index_input, index_loc, log_file):
    """Run Anserini's IndexCollection launcher over *index_input*.

    Args:
        index_input: directory passed to the indexer as ``-input``.
        index_loc: directory passed as ``-index``; created if missing.
        log_file: path that receives the indexer's combined stdout/stderr.
            When empty/None the output is discarded.

    The launcher script is resolved relative to the module-level
    ``anserini_loc``. A non-zero exit status is reported with a printed
    message instead of being silently ignored; nothing is returned.
    """
    os.makedirs(index_loc, exist_ok=True)

    anserini_index = anserini_loc + 'target/appassembler/bin/IndexCollection'
    anserini_parameters = [
        'sh',
        anserini_index,
        '-collection', 'TrecCollection',
        '-generator', 'JsoupGenerator',
        '-threads', '16',
        '-input', index_input,
        '-index', index_loc,
        '-storePositions',
        '-keepStopwords',
        '-storeDocvectors',
        '-storeRawDocs',
    ]

    if log_file:
        log_dir = os.path.dirname(log_file)
        if log_dir:
            os.makedirs(log_dir, exist_ok=True)
        # Stream the output straight to the log file instead of buffering it
        # in a pipe that was never read.
        with open(log_file, 'wb') as log_out:
            index_proc = subprocess.run(anserini_parameters, stdout=log_out,
                                        stderr=subprocess.STDOUT, shell=False)
    else:
        index_proc = subprocess.run(anserini_parameters,
                                    stdout=subprocess.DEVNULL,
                                    stderr=subprocess.DEVNULL, shell=False)

    if index_proc.returncode != 0:
        print('Anserini indexing exited with status ' + str(index_proc.returncode)
              + ' (log: ' + str(log_file) + ')')

In [None]:
def generate_queries_file(queries, filename):
    """Dump the queries as JSON and build the lookup tables used for retrieval.

    Each query (a dict with 'id' and 'body') is assigned a sequential number
    starting at 0. The JSON written to *filename* has the form
    ``{"queries": [{"id_new", "number", "text"}, ...]}`` where 'text' wraps
    the ``remove_sc``-cleaned body in ``#stopword(...)``.

    Returns:
        A three-element list ``[q_dict, q_trec, ids_dict]``:
        q_dict maps original id -> original body, q_trec maps original id ->
        TREC ``<top>`` topic string (numbered with the zero-padded sequential
        id, titled with the original body), and ids_dict maps the
        un-padded sequential number (as a string) -> original id.
    """
    q_dict = {}
    q_trec = {}
    ids_dict = {}
    json_entries = []

    for position, item in enumerate(queries):
        short_id = str(position)
        padded_id = short_id.rjust(15, '0')
        cleaned_body = remove_sc(item['body'])

        q_dict[item['id']] = item['body']
        json_entries.append({
            'id_new': padded_id,
            'number': item['id'],
            'text': '#stopword(' + cleaned_body + ')',
        })
        q_trec[item['id']] = ''.join([
            '<top>\n\n',
            '<num> Number: ', padded_id, '\n',
            '<title> ', item['body'], '\n\n',
            '<desc> Description:', '\n\n',
            '<narr> Narrative:', '\n\n',
            '</top>\n\n',
        ])
        ids_dict[short_id] = item['id']

    with open(filename, 'wt', encoding='utf-8') as q_file:
        json.dump({'queries': json_entries}, q_file, indent = 4)

    return [q_dict, q_trec, ids_dict]

In [None]:
def retrieve_docs(q_topics_file, retrieved_docs_file, index_loc, b_val=0.2, k_val=0.8, n_docs=10, n_terms=10, w_ori_q=0.5, hits=100):
    """Run Anserini's SearchCollection launcher with BM25 + RM3 options.

    Args:
        q_topics_file: topics file passed as ``-topics`` (read with the
            'Trec' topic reader).
        retrieved_docs_file: run file passed as ``-output``.
        index_loc: index directory passed as ``-index``.
        b_val, k_val: values passed as ``-b`` and ``-k1``.
        n_docs, n_terms, w_ori_q: values passed as ``-rm3.fbDocs``,
            ``-rm3.fbTerms`` and ``-rm3.originalQueryWeight``.
        hits: value passed as ``-hits``.

    The launcher's stdout is captured and discarded; nothing is returned.
    """
    search_script = anserini_loc + 'target/appassembler/bin/SearchCollection'

    io_args = ['-topicreader', 'Trec',
               '-index', index_loc,
               '-topics', q_topics_file,
               '-output', retrieved_docs_file]
    bm25_args = ['-bm25', '-b', str(b_val), '-k1', str(k_val)]
    rm3_args = ['-rm3',
                '-rm3.fbDocs', str(n_docs),
                '-rm3.fbTerms', str(n_terms),
                '-rm3.originalQueryWeight', str(w_ori_q)]
    argv = ['sh', search_script] + io_args + bm25_args + rm3_args + ['-hits', str(hits)]

    search_proc = subprocess.Popen(argv, stdout=subprocess.PIPE, shell=False, encoding='utf-8')
    search_proc.communicate()

In [None]:
# Return top 100 bm25 scored docs, given query and corpus indexed by anserini

def generate_preds_file(retrieved_docs_file, q_dict, ids_dict, hits=100):
    """Turn a whitespace-separated run file into a list of question dicts.

    Each non-empty line is read as ``<topic id> <ignored> <doc id> ...``.
    Consecutive lines sharing a topic id form one question. Grouping on the
    id (instead of slicing fixed chunks of *hits* lines) keeps topics
    aligned when a topic has fewer than *hits* results.

    Args:
        retrieved_docs_file: path of the run file to read.
        q_dict: mapping original query id -> query body.
        ids_dict: mapping topic id as written in the run file -> original
            query id.
        hits: maximum number of documents kept per topic.

    Returns:
        List of ``{'id', 'body', 'documents'}`` dicts in file order. For a
        "bioasq" ``dataset_name`` every document id is prefixed with the
        PubMed URL; for "rob04" ids are kept as-is; for any other value the
        'documents' key is not set. Topics without any line are not
        represented.
    """
    from itertools import groupby, islice

    bm25_docs = []
    with open(retrieved_docs_file, 'rt') as f_in:
        rows = (line.split() for line in f_in if line.strip())
        for topic_id, topic_rows in groupby(rows, key=lambda cols: cols[0]):
            # Rows beyond `hits` for the same topic are skipped by groupby.
            documents = [cols[2] for cols in islice(topic_rows, hits)]
            current_key = ids_dict[topic_id]
            question = {'id': current_key, 'body': q_dict[current_key]}

            if "bioasq" in dataset_name:
                question['documents'] = [
                    'http://www.ncbi.nlm.nih.gov/pubmed/' + doc for doc in documents]
            elif "rob04" in dataset_name:
                question['documents'] = documents
            bm25_docs.append(question)

    return bm25_docs

In [None]:
# docus = generate_preds_file(retrieved_docs_file, q_dict, ids_dict)

In [None]:
# len(docus)

In [None]:
# Every file below dataloc whose name contains 'docset', the selected split
# and '.pkl' (plain substring checks, evaluated in that order).
pkl_files = [
    os.path.join(folder, fname)
    for folder, _subdirs, fnames in os.walk(dataloc)
    for fname in fnames
    if 'docset' in fname and split in fname and '.pkl' in fname
]

# pkl_files = [ x for x in os.listdir(dataloc) if all(y in x for y in ['docset', '.pkl'])]

In [None]:
# Display the docset pickle paths that were found.
pkl_files

In [None]:
# Convert every docset pickle to gzipped TREC text. Each corpus file is
# written twice: into corpus_files (used as the index input) and into
# trec_storage.
doc_list = []       # ids of all converted documents, in conversion order
output_files = []   # corpus_files paths of the generated .gz files
all_docs = []       # one {doc id: TREC document} dict per pickle
for pkl_file in pkl_files:
    docs = get_pickle_docs(pkl_file)
    doc_list.extend(docs.keys())
    all_docs.append(docs)
    # Base file name without the '.pkl' marker (raw string: '\.' is not a
    # valid escape in a normal string literal).
    out_name = os.path.basename(pkl_file)
    out_name = re.sub(r'\.pkl', '', out_name)
    output_file = corpus_files + out_name + '.gz'
    trec_doc_file = trec_storage + out_name + '.gz'
    output_files.append(output_file)
    to_trecfile(docs, output_file)
    to_trecfile(docs, trec_doc_file)

In [None]:
# Random grid search sampling

def get_random_params(hyper_params, num_iter):
    """Draw up to *num_iter* distinct random hyper-parameter combinations.

    Args:
        hyper_params: sequence of value ranges (lists, NumPy arrays, ...),
            one per hyper-parameter.
        num_iter: number of distinct combinations requested.

    Returns:
        List of combinations; each combination is a list holding one value
        per range, rounded to 3 decimals. When fewer than *num_iter*
        distinct combinations exist, all of them are returned (the previous
        implementation looped forever in that case).

    Raises:
        ValueError: if a range is empty while combinations are requested.
    """
    candidate_lists = [list(values) for values in hyper_params]
    if num_iter > 0 and any(not values for values in candidate_lists):
        raise ValueError('Every hyper-parameter range must contain at least one value')

    # Upper bound on distinct combinations after rounding; caps the loop.
    max_unique = 1
    for values in candidate_lists:
        max_unique *= len({round(value, 3) for value in values})
    target = min(num_iter, max_unique)
    if target < num_iter:
        print('Only ' + str(max_unique) + ' distinct combinations available')

    random_h_params_list = []
    seen = set()  # tuples of already accepted combinations, for O(1) lookup
    while len(random_h_params_list) < target:
        candidate = [round(random.choice(values), 3) for values in candidate_lists]
        fingerprint = tuple(candidate)
        if fingerprint in seen:
            print('repeated')
            continue
        seen.add(fingerprint)
        random_h_params_list.append(candidate)
    return random_h_params_list

In [None]:
# Display the last pickle path handled by the conversion loop.
pkl_file

In [None]:
data_split = split
print(data_split)

# Derive the dataset name (and, for rob04, the "sN" suffix taken from the
# pickle file name) from the generated corpus file names.
if not output_files:
    raise ValueError('No docset pickle was converted; cannot detect the dataset')

if "rob04" in output_files[0]:
    # Raw string with an escaped dot so that only a literal ".pkl" matches.
    s = re.findall(r"(s[0-5])\.pkl$", pkl_file)
    if not s:
        raise ValueError('Cannot find an s0-s5 suffix in ' + pkl_file)
    dataset_name = "rob04"
    dataset_name_ext = dataset_name + '_' + s[0]
    gold_file = '../../robust04_data/rob04.' + split + '.json'
    print(dataset_name_ext)
elif "bioasq" in output_file:
    print("bioasq")
    dataset_name = "bioasq"
    dataset_name_ext = dataset_name
else:
    # Fail here with a clear message instead of a NameError in a later cell.
    raise ValueError('Unknown dataset (expected "rob04" or "bioasq"): ' + output_file)

### Build Index

In [None]:
# Index directory and index log path, both named after dataset and split;
# the whole corpus_files folder is used as the indexing input.
index_loc = f'{baseline_files}anserini_index_{dataset_name_ext}_{data_split}'
index_input = corpus_files
log_file = f'{baseline_files}log_index_{dataset_name_ext}_{data_split}'

if build_index_flag == 'yes':
    build_index(index_input, index_loc, log_file)
    
#     build_index(index_input, index_loc)

In [None]:
# Display the last corpus file path produced by the conversion loop.
output_file

In [None]:
# Entries directly inside dataloc whose name contains both
# '<dataset_name>.<data_split>' and '.json'.
q_filename = [
    entry
    for entry in os.listdir(dataloc)
    if (dataset_name + '.' + data_split) in entry and '.json' in entry
]

In [None]:
# Display the matching query file names.
q_filename

In [None]:
# queries_file = dataloc + q_filename[0]

def load_queries(queries_file):
    """Parse the JSON file *queries_file* and return its 'questions' entry."""
    with open(queries_file, 'rb') as stream:
        payload = json.load(stream)
    return payload['questions']

In [None]:
# Every file below dataloc whose name contains both
# '<dataset_name>.<data_split>' and '.json'.
query_files = [
    os.path.join(folder, fname)
    for folder, _subdirs, fnames in os.walk(dataloc)
    for fname in fnames
    if (dataset_name + '.' + data_split) in fname and '.json' in fname
]

In [None]:
# Concatenate the questions of all query files, in file order, and keep
# them both as a flat list and under the 'questions' key of query_data.
queries = []
query_data = {}
for file in query_files:
    queries.extend(load_queries(file))
query_data['questions'] = queries

In [None]:
def save_preds(file, preds):
    """Write *preds* as 4-space-indented JSON to *file* and print a notice."""
    with open(file, 'wt') as destination:
        json.dump(preds, destination, indent=4)
    notice = ''.join(['Predictions file: ', file, ', done!'])
    print(notice)

In [None]:
# Scratch expression on the first query file path.
# NOTE(review): str.strip('split_1') removes any of the characters
# 's', 'p', 'l', 'i', 't', '_', '1' from both ends of the string; it does not
# remove the literal substring 'split_1' - confirm this is what was intended.
query_files[0].strip('split_1')

In [None]:
def start_process():
    """Print 'Starting' followed by the name of the current process.

    Used as the ``initializer`` of the multiprocessing pool.
    """
    worker = multiprocessing.current_process()
    print('Starting', worker.name)

In [None]:
def extract_question(query):
    """Build the prediction entry (body, id, documents) for one query.

    The documents come from ``get_bm25_docs(query['body'], index_loc)``.
    NOTE(review): ``get_bm25_docs`` is not defined in the visible part of
    this file - confirm it still exists before calling this helper.

    For a "bioasq" ``dataset_name`` every document id is prefixed with the
    PubMed URL; for "rob04" the ids are stored unchanged; for any other
    value no 'documents' key is set.
    """
    entry = {'body': query['body'], 'id': query['id']}
    retrieved = get_bm25_docs(query['body'], index_loc)

    if "bioasq" in dataset_name:
        entry['documents'] = ['http://www.ncbi.nlm.nih.gov/pubmed/' + doc for doc in retrieved]
    elif "rob04" in dataset_name:
        entry['documents'] = retrieved
    return entry

In [None]:
# get_bm25_docs(query_data['questions'][0]['body'], index_loc)
# Display the index directory path.
index_loc

In [None]:
# Write the JSON queries file and collect the lookup tables.
bm25_queries_file = f'{baseline_files}bm25_queries_{dataset_name_ext}_{data_split}.json'
q_dict, q_trec, ids_dict = generate_queries_file(queries, bm25_queries_file)

# The TREC topics file is written uncompressed to both baseline_files and
# trec_storage under the same file name.
q_topic_filename = f'{dataset_name_ext}_query_topics_{data_split}.txt'
q_topics_file = baseline_files + q_topic_filename
trec_q_topics_file = trec_storage + q_topic_filename

to_trecfile(q_trec, q_topics_file, compression = 'no')
to_trecfile(q_trec, trec_q_topics_file, compression = 'no')

In [None]:
# b = 0.2
# k = 0.8
# retrieved_docs_file = baseline_files + 'bm25_preds_' + dataset_name_ext + '_' + data_split + '_' + 'b' + str(b) + 'k' + str(k) + '.txt'
# retrieve_docs(q_topics_file, retrieved_docs_file, q_dict, index_loc, b_val=0.2, k_val=0.8)

In [None]:
def format_bioasq2treceval_qrels(bioasq_data, filename):
    """Write one ``<question id> 0 <document> 1`` line per listed document.

    *bioasq_data* must provide a 'questions' list whose items have 'id' and
    'documents'; *filename* is created or overwritten.
    """
    with open(filename, 'wt') as out:
        for question in bioasq_data['questions']:
            out.writelines(
                '{0} 0 {1} 1\n'.format(question['id'], doc)
                for doc in question['documents']
            )

def format_bioasq2treceval_qret(bioasq_data, system_name, filename):
    """Write the ranked documents of every question as run-file lines.

    Each line is ``<question id> 0 <document> <rank> <sim> <system_name>``
    with ranks starting at 1 and ``sim = (n + 1 - rank) / n`` for a question
    with n documents, so the score decreases linearly from 1.0 to 1/n.
    """
    with open(filename, 'wt') as out:
        for question in bioasq_data['questions']:
            for position, doc in enumerate(question['documents'], start=1):
                total = len(question['documents'])
                score = (total + 1 - position) / float(total)
                row = '{0} {1} {2} {3} {4} {5}'.format(
                    question['id'], 0, doc, position, score, system_name)
                out.write(row)
                out.write('\n')

def trec_evaluate(qrels_file, qret_file):
    """Run ``./trec_eval -m all_trec`` on the two files and print its output.

    The binary is resolved relative to the current working directory.
    stdout is printed once, decoded as UTF-8; stderr is now captured as
    well and printed only when it is not empty (before, it was not piped,
    so the printed ``err`` was always None).
    """
    trec_eval_proc = subprocess.Popen(
        ['./trec_eval', '-m', 'all_trec', qrels_file, qret_file],
        stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=False)

    (out, err) = trec_eval_proc.communicate()
    print(out.decode("utf-8"))
    if err:
        print(err.decode("utf-8", errors="replace"))

In [None]:
def evaluate(golden_file, predictions_file):
    """Evaluate a predictions JSON file against a golden JSON file.

    Both files are loaded as JSON, converted to a qrels file and a run file
    inside a fresh temporary directory, and handed to ``trec_evaluate``.
    The predictions file path doubles as the system name in the run file.

    The temporary directory is managed by ``tempfile.TemporaryDirectory``,
    so it is always removed and a failure while writing the files is no
    longer masked by a FileNotFoundError raised during cleanup.
    """
    import tempfile

    system_name = predictions_file

    with open(golden_file, 'r') as f:
        golden_data = json.load(f)

    with open(predictions_file, 'r') as f:
        predictions_data = json.load(f)

    with tempfile.TemporaryDirectory() as temp_dir:
        qrels_temp_file = os.path.join(temp_dir, 'qrels.txt')
        qret_temp_file = os.path.join(temp_dir, 'qret.txt')

        format_bioasq2treceval_qrels(golden_data, qrels_temp_file)
        format_bioasq2treceval_qret(predictions_data, system_name, qret_temp_file)

        trec_evaluate(qrels_temp_file, qret_temp_file)

In [None]:
def bm25_computing(params):
    """Run retrieval, save predictions and evaluate one parameter set.

    Args:
        params: sequence ``[b, k, n_doc, n_term, w_ori_q]`` forwarded to
            ``retrieve_docs``.

    The predictions JSON and the run file are written to ``baseline_files``
    with the parameter values embedded in their names. If the predictions
    file already exists the whole step is skipped. Evaluation only runs
    when the golden file ``<dataloc><dataset_name_ext>.<data_split>.json``
    exists. The module-level ``hits`` option is forwarded to both retrieval
    and run-file parsing so that the two always agree.
    """
    b, k, n_doc, n_term, w_ori_q = params[:5]
    params_suffix = 'b' + str(b) + 'k' + str(k) + 'n_doc' + str(n_doc) + 'n_term' + str(n_term) + 'w_ori_q' + str(w_ori_q)

    bm25_preds_file = baseline_files + 'bm25_rm3_preds_' + dataset_name_ext + '_' + data_split + '_' + params_suffix + '.json'

    if os.path.isfile(bm25_preds_file):
        print(bm25_preds_file + " Already exists!!")
        return

    retrieved_docs_file = baseline_files + 'run_bm25_rm3_preds_' + dataset_name_ext + '_' + data_split + '_' + params_suffix + '.txt'
    retrieve_docs(q_topics_file, retrieved_docs_file, index_loc, b, k, n_doc, n_term, w_ori_q, hits=hits)

    bm25_preds = {
        'questions': generate_preds_file(retrieved_docs_file, q_dict, ids_dict, hits=hits),
    }
    save_preds(bm25_preds_file, bm25_preds)

    golden_file = dataloc + dataset_name_ext + '.' + data_split + '.json'

    if os.path.exists(golden_file):
        print('yes, we can evaluate!')
        print(golden_file)
        evaluate(golden_file, bm25_preds_file)
    else:
        print('no, we cannot evaluate  :( !')

In [96]:
if __name__ == '__main__':
    # NOTE(review): this overrides the grid_search option set in the Options
    # cell, so the else-branch below is currently never taken - confirm.
    grid_search = 'yes'

    if grid_search == 'yes':
        ## Heavy grid search
        brange = np.arange(0.1,1,0.05)
        krange = np.arange(0.1,4,0.1)
        N_range = np.arange(5,500,1) # num of docs
        M_range = np.arange(5,500,1) # num of terms
        lamb_range = np.arange(0,1,0.1) # weights of original query

        ## Light grid search
#         brange = [0.2]
#         krange = [0.8]
#         N_range = np.arange(1,50,2)
#         M_range = np.arange(1,50,2)
#         lamb_range = np.arange(0,1,0.2)

        # Random search: n_iters distinct combinations sampled from the ranges.
        h_param_ranges = [brange, krange, N_range, M_range, lamb_range]
        n_iters = 2
        params = get_random_params(h_param_ranges, n_iters)

    else:
        brange = [0.2]
        krange = [0.8]
        N_range = [10]
        M_range = [10]
        lamb_range = [0.5]

        # Full cartesian product of the ranges. The loop variables now match
        # the names used in the element expression (they used to be
        # N / M / Lambda, which raised NameError).
        params = [[round(b,3), round(k,3), round(n_doc,3), round(n_term,3), round(w_ori_q,3)]
                  for b in brange
                  for k in krange
                  for n_doc in N_range
                  for n_term in M_range
                  for w_ori_q in lamb_range]

    # NOTE(review): overrides the pool_size option set in the Options cell.
    pool_size = 2
    print(len(params))
    pool = multiprocessing.Pool(processes=pool_size,
                                initializer=start_process,
                                )
    pool_outputs = pool.map(bm25_computing, params)
    pool.close() # no more tasks
    pool.join()  # wrap up current tasks
    
    

    
#     shutil.rmtree(corpus_files)

2
Starting ForkPoolWorker-127
Starting ForkPoolWorker-128
Predictions file: ./baseline_files/bm25_rm3_preds_bioasq_dev_b0.7k0.9n_doc232n_term36w_ori_q0.5.json, done!
yes, we can evaluate!
../../bioasq_data/bioasq.dev.json
6a1f146d48fb436382715aabeb21921d
runid                 	all	./baseline_files/bm25_rm3_preds_bioasq_dev_b0.7k0.9n_doc232n_term36w_ori_q0.5.json
num_q                 	all	100
num_ret               	all	10000
num_rel               	all	1415
num_rel_ret           	all	1093
map                   	all	0.4178
gm_map                	all	0.2103
Rprec                 	all	0.3921
bpref                 	all	0.8339
recip_rank            	all	0.6571
iprec_at_recall_0.00  	all	0.6887
iprec_at_recall_0.10  	all	0.5884
iprec_at_recall_0.20  	all	0.5430
iprec_at_recall_0.30  	all	0.5082
iprec_at_recall_0.40  	all	0.4712
iprec_at_recall_0.50  	all	0.4461
iprec_at_recall_0.60  	all	0.3862
iprec_at_recall_0.70  	all	0.3479
iprec_at_recall_0.80  	all	0.3116
iprec_at_recall_0.90  	all	0.26

In [None]:
# Display the hyper-parameter combinations that were evaluated.
params