## Investigating query performance

In [None]:
import pickle
import json
import gzip
import os
import subprocess
import numpy as np
import multiprocessing
import re 

In [None]:
# Define paths
# dataloc = '../../bioasq_data/'
# Directory scanned below for the docset pickle and the query JSON file.
dataloc = '../../robust04_data/'
# Output directory for the converted corpus, the Galago index and the predictions file.
baseline_files ='./baseline_files/'
# Directory that contains the `galago` executable.
galago_loc='./galago-3.10-bin/bin/'

In [None]:
# Select split to work with
# The value ends up in the index directory name and the predictions file name,
# and is used to pick the matching query JSON file.
split = "test"

In [None]:
def pickle_to_json(pickle_filename):
    """Convert a pickled document collection into a gzipped TREC-text file.

    Despite the name, the output is TREC-text (one ``<DOC>`` record per
    document), not JSON.

    Args:
        pickle_filename: Name of a pickle file inside ``dataloc``. It must
            unpickle to a mapping whose values are dicts with ``'title'`` and
            ``'abstractText'`` string entries and, optionally, a ``'pmid'``
            entry. The last four characters of the name (the ``.pkl``
            extension) are replaced by ``.gz`` to name the output.

    Returns:
        A two-item list ``[out_file, doc_list]``: the path of the archive
        written under ``baseline_files`` and the document ids in the order
        they were written.
    """
    # NOTE(security): pickle.load can execute arbitrary code while loading.
    # Only point this at pickle files from a trusted source.
    with open(dataloc + pickle_filename, 'rb') as f_in:
        data = pickle.load(f_in)

    os.makedirs(baseline_files, exist_ok=True)
    out_file = baseline_files + pickle_filename[:-4] + '.gz'

    doc_list = []
    with gzip.open(out_file, 'wt', encoding='utf-8') as f_out:
        for key, value in data.items():
            # Prefer the record's own 'pmid' as document id; fall back to the
            # mapping key when the record has none.
            doc_code = value['pmid'] if 'pmid' in value else key
            f_out.write('<DOC>\n' +
                        '<DOCNO>' + doc_code + '</DOCNO>\n' +
                        '<TITLE>' + value['title'] + '</TITLE>\n' +
                        '<TEXT>' + value['abstractText'] + '</TEXT>\n' +
                        '</DOC>\n')
            doc_list.append(doc_code)
    return [out_file, doc_list]

In [None]:
# Build corpus index 
def build_index(index_input, index_loc):
    """Run ``galago build`` (Krovetz stemmer) to index a corpus file.

    Both the standard output and the standard error of the Galago process are
    captured and printed once it has finished.

    Args:
        index_input: Path of the corpus file passed as ``--inputPath``.
        index_loc: Directory passed as ``--indexPath``; it is created when it
            does not exist yet.
    """
    index_input_param = '--inputPath+' + index_input
    index_loc_param = '--indexPath=' + index_loc
    print(index_input_param)
    print(index_loc_param)
    os.makedirs(index_loc, exist_ok=True)
    index_proc = subprocess.Popen(
        [galago_loc + 'galago', 'build', '--stemmer+krovetz',
         index_input_param, index_loc_param],
        # stderr must be piped too, otherwise communicate() returns None for it.
        stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=False)
    out, err = index_proc.communicate()
    print(out.decode("utf-8"))
    print(err.decode("utf-8"))
    if index_proc.returncode != 0:
        # Report the failure without raising, so the caller's flow is unchanged.
        print('galago build exited with status ' + str(index_proc.returncode))

In [None]:
# Return top 100 bm25 scored docs, given query and corpus indexed by galago
def get_bm25_docs(query, index_loc, b_val=0.75, k_val=1.2):
    """Run a BM25 ``galago batch-search`` and return one field per result line.

    The query is lower-cased and every character that is neither a word
    character nor whitespace is replaced by a space before it is wrapped in
    Galago's ``#stopword(...)`` operator. Up to 100 results are requested.

    Args:
        query: Free-text query.
        index_loc: Path of the Galago index to search.
        b_val: Value passed as ``--b``.
        k_val: Value passed as ``--k``.

    Returns:
        A list with the third space-separated field of every output line
        (presumably the document id of each hit; confirm against Galago's
        batch-search output format).
    """
    query = re.sub(r'[^\w\s]', ' ', query).lower()

    # Arguments are passed as a list, so no shell is involved: nothing in the
    # query or in index_loc needs quoting or can be interpreted by a shell.
    command = [
        galago_loc + 'galago', 'batch-search',
        '--verbose=false',
        '--requested=100',
        '--index=' + index_loc,
        '--scorer=bm25',
        '--b=' + str(b_val),
        '--k=' + str(k_val),
        '--stemmer+krovetz',
        '--query=#stopword(' + query + ')',
    ]
    galago_bm25_exec = subprocess.Popen(command, stdout=subprocess.PIPE, shell=False)
    out, _ = galago_bm25_exec.communicate()

    bm25_documents = []
    for line in out.decode("utf-8").splitlines():
        fields = line.split(' ')
        if len(fields) == 1:
            # Same as `cut -d" " -f3`: a line without the delimiter is kept whole.
            bm25_documents.append(line)
        else:
            # Lines with fewer than three fields yield an empty string, as cut does.
            bm25_documents.append(fields[2] if len(fields) > 2 else '')
    return bm25_documents

In [None]:
# Collect the docset pickles available in the data directory.
pkl_files = [
    entry for entry in os.listdir(dataloc)
    if 'docset' in entry and '.pkl' in entry
]

In [None]:
# Show the matched file names as the cell output.
pkl_files

In [None]:
# # Convert pickle to trectext file format to be processed with galago


# pkl_file = [s for s in pkl_files if split in s]
# [output_file, doc_list ]= pickle_to_json(pkl_file[0])

In [None]:
# NOTE(review): this cell held an unfinished statement that could not be parsed
# (a `with open(...)` without colon or body). It is kept here, commented out,
# until the intended read of the archive is actually written:
# with open('./baseline_files/rob04_bm25_docset_top1000.test.gz')

In [None]:
# The pickle -> TREC-text conversion call above is commented out, so a
# previously written archive is reused instead of being regenerated.
# NOTE(review): the original assignment to doc_list had no right-hand side.
# None marks the document-id list as unavailable; pickle_to_json() returns the
# real list if it is ever needed.
doc_list = None
output_file = './baseline_files/rob04_bm25_docset_top1000.test.gz'

In [None]:
data_split = split
print(data_split)

# Derive the dataset name from the corpus file name. dataset_name_ext is the
# label used in the index directory and predictions file names; it currently
# equals dataset_name for both datasets.
if "rob04" in output_file:
    dataset_name = "rob04"
    dataset_name_ext = dataset_name
    print(dataset_name_ext)
elif "bioasq" in output_file:
    print("bioasq")
    dataset_name = "bioasq"
    dataset_name_ext = dataset_name
else:
    # Fail here with a clear message instead of a NameError in a later cell.
    raise ValueError("Cannot infer the dataset name from: " + output_file)

In [None]:
# The corpus archive is the indexer input; the index directory is named
# "index_<dataset>_<split>" inside the baseline output directory.
index_input = output_file
index_loc = baseline_files + '_'.join(['index', dataset_name_ext, data_split])
build_index(index_input, index_loc)

In [None]:
# Query files carry "<dataset>.<split>" and ".json" in their name.
q_filename = [
    entry for entry in os.listdir(dataloc)
    if (dataset_name + '.' + data_split) in entry and '.json' in entry
]

In [None]:
# Show the matched query file names as the cell output.
q_filename

In [None]:
# Parse the first matching query file into query_data.
queries_file = dataloc + q_filename[0]
with open(queries_file, 'rb') as input_file:
    query_data = json.loads(input_file.read())

In [None]:
def save_preds(file, preds):
    """Serialize ``preds`` to ``file`` as JSON indented by four spaces.

    A confirmation line naming the file is printed once it has been written.
    """
    with open(file, 'wt') as handle:
        json.dump(preds, handle, indent=4)
    print(''.join(['Predictions file: ', file, ', done!']))

In [None]:
# Show the index directory that the searches below run against.
print(index_loc)

In [None]:
# def bm25_computing(b_k):
#     b = b_k[0]
#     k = b_k[1]
#     bm25_preds_file = baseline_files + 'bm25_preds_' + dataset_name_ext + '_'+ data_split + '_' + 'b' + str(b) + 'k' + str(k) + '.json'
# #     print(bm25_preds_file)
#     if os.path.isfile(bm25_preds_file):
#         print(bm25_preds_file + "Already exists!!")
#         return
#     bm25_preds = {}
#     questions = []
#     question = {}
#     for query in query_data['questions']:
#         question['body'] = query['body']
#         question['id'] = query['id']
#     #     print(query['body'].rstrip('.'))
#     #     documents = get_bm25_docs(query['body'].rstrip('.'), index_loc)
#         documents = get_bm25_docs(query['body'], index_loc, b, k)
#         if "bioasq" in dataset_name: 
#             documents_url = ['http://www.ncbi.nlm.nih.gov/pubmed/' + doc for doc in documents]
#             question['documents'] = documents_url
#         elif "rob04" in dataset_name:
#             question['documents'] = documents
#         questions.append(dict(question))
    
#     bm25_preds['questions'] = questions
#     save_preds(bm25_preds_file, bm25_preds)  

In [None]:
def start_process():
    """Pool initializer: print the name of the process it runs in."""
    worker = multiprocessing.current_process()
    print('Starting ' + worker.name)

In [None]:
def extract_question(query):
    """Build the prediction entry for one query.

    The entry copies the query's ``'body'`` and ``'id'`` and, for the two known
    datasets, adds the BM25 hits under ``'documents'``: as PubMed URLs for
    bioasq, as returned by the search for rob04. For any other dataset name
    the entry has no ``'documents'`` key.
    """
    body = query['body']
    entry = {'body': body, 'id': query['id']}
    hits = get_bm25_docs(body, index_loc)
    if "bioasq" in dataset_name:
        entry['documents'] = ['http://www.ncbi.nlm.nih.gov/pubmed/' + hit for hit in hits]
        return entry
    if "rob04" in dataset_name:
        entry['documents'] = hits
    return entry

In [None]:
# BM25 parameters. Here they only label the predictions file:
# extract_question() calls get_bm25_docs() with its defaults (b=0.75, k=1.2),
# so the two must be kept in sync by hand.
b = '0.75'
k = '1.2'
bm25_preds_file = baseline_files + 'bm25_preds_' + dataset_name_ext + '_' + data_split + '_' + 'b' + str(b) + 'k' + str(k) + '.json'
if os.path.isfile(bm25_preds_file):
    # The run is not skipped: predictions are recomputed and the file rewritten.
    print(bm25_preds_file + " already exists and will be overwritten")

bm25_preds = {}
pool_size = 50
# Every query spawns a Galago process, so never start more workers than there
# are queries (and always at least one). The `with` block also stops the
# workers if map() raises.
with multiprocessing.Pool(processes=max(1, min(pool_size, len(query_data['questions']))),
                          initializer=start_process,
                          ) as pool:
    questions = pool.map(extract_question, query_data['questions'])
    pool.close()  # no more tasks
    pool.join()   # wrap up current tasks

bm25_preds['questions'] = questions
save_preds(bm25_preds_file, bm25_preds)

In [None]:
# if __name__ == '__main__':
#     grid_search = 'no'
#     if grid_search == 'yes':
#         brange = np.arange(0.2,1,0.1)
#         krange = np.arange(0.5,2,0.1)
#     else:
#         brange = [0.2]
#         krange = [0.8]

#     b_k = [(round(b,2), round(k,2)) for b in brange for k in krange]
#     pool_size = 8
#     pool = multiprocessing.Pool(processes=pool_size,
#                                 initializer=start_process,
#                                 )
#     pool_outputs = pool.map(bm25_computing, b_k)
#     pool.close() # no more tasks
#     pool.join()  # wrap up current tasks