## Investigating query performance

In [2]:
import pickle
import json
import gzip
import os
import subprocess
import numpy as np
import multiprocessing
import re 

In [3]:
# Define paths
dataloc = '../../bioasq_data/'
# dataloc = '../../robust04_data/split_5/'
baseline_files ='./baseline_files/'
galago_loc='./galago-3.10-bin/bin/'

In [4]:
# Select data split to work with
split = "test"
# split = "dev"

In [5]:
def get_pickle_docs(pickle_filename):
    # Pickle to Trectext converter
    with open(pickle_filename, 'rb') as f_in:
        data = pickle.load(f_in)
        if not os.path.exists(baseline_files):
            os.makedirs(baseline_files)
        docs = {}
        for key, value in data.items():
            if "pmid" in value.keys():
                doc_code = value.pop('pmid')
            else:
                doc_code = key
            doc = '<DOC>\n' + \
                  '<DOCNO>' + doc_code + '</DOCNO>\n' + \
                  '<TITLE>' + value.pop('title') + '</TITLE>\n' + \
                  '<TEXT>' + value.pop('abstractText') + '</TEXT>\n' + \
                  '</DOC>\n'
            docs[doc_code] = doc
        return docs

In [6]:
def doc_to_jsonfile(docs, filename):
    # Pickle to Trectext converter
    doc_list = []
    with gzip.open(filename,'wt', encoding='utf-8') as f_out:
        docus = {}
        for key, value in docs.items():
            f_out.write(value)

In [7]:
# Build corpus index 
def build_index(index_input, index_loc):
    if not os.path.exists(index_loc):
            os.makedirs(index_loc) 
    index_loc_param = '--indexPath=' + index_loc
    galago_parameters = [galago_loc + 'galago', 'build', '--stemmer+krovetz']
    [galago_parameters.append('--inputPath+' + idx) for idx in index_input]
    galago_parameters.append(index_loc_param)
    print(galago_parameters)

    index_proc = subprocess.Popen(galago_parameters,
            stdout=subprocess.PIPE, shell=False)
    (out, err) = index_proc.communicate()
    print(out.decode("utf-8"))
    print(err)

In [None]:
# # Return top 100 bm25 scored docs, given query and corpus indexed by galago
# def get_bm25_docs(query, index_loc, b_val, k_val):
# #     query = query.lower()
#     query = query.rstrip('.?')
#     index_loc_param = '--index=' + index_loc  
#     b=' --b=' + str(b_val)
#     k=' --k=' + str(k_val)
#     if "'" in query:
#         query_param = '--query="#stopword(' + query + ')"' 
#     else:
#         query_param = '--query=\'#stopword(' + query + ')\'' 

#     command = galago_loc + 'galago batch-search --verbose=false --casefold=true --requested=100 ' + \
#          index_loc_param + ' --scorer=bm25' + \
#          b + \
#          k + \
#          ' --stemmer+krovetz ' + \
#          query_param + ' | cut -d" " -f3'
# #     print(command)
#     galago_bm25_exec = subprocess.Popen(command, stdout=subprocess.PIPE, shell=True)
#     (out, err) = galago_bm25_exec.communicate()
#     bm25_documents = out.decode("utf-8")
#     return bm25_documents.splitlines()

In [1]:
# Return top 100 bm25 scored docs, given query and corpus indexed by galago
def get_bm25_docs(query, index_loc, b_val=0.2, k_val=0.8):
#     query = re.sub(r'[^\w\s]',' ',query)
#     query = query.lower()
#     query = query.rstrip('.?')
#     query = 'List the classical triad of symptoms of the Melkersson–Rosenthal syndrome.'
    print(query)
    index_loc_param = '--index=' + index_loc  
    b=' --b=' + str(b_val)
    k=' --k=' + str(k_val)
    if "'" in query:
        query_param = '--query="#stopword(' + query + ')"' 
    else:
        query_param = '--query=\'#stopword(' + query + ')\'' 

    command = galago_loc + 'galago batch-search --verbose=true --casefold=true --requested=100 ' + \
         index_loc_param + ' --scorer=bm25' + \
         b + \
         k + \
         ' --stemmer+krovetz ' + \
         query_param + ' | cut -d" " -f3'
#     print(command)
    galago_bm25_exec = subprocess.Popen(command, stdout=subprocess.PIPE, shell=True)
    (out, err) = galago_bm25_exec.communicate()
    bm25_documents = out.decode("utf-8")
    return bm25_documents.splitlines()

In [16]:
query = 'List the classical triad of symptoms of the Melkersson–Rosenthal syndrome.'
get_bm25_docs(query, index_loc)

List the classical triad of symptoms of the Melkersson–Rosenthal syndrome.


UnicodeEncodeError: 'latin-1' codec can't encode character '\u2013' in position 253: ordinal not in range(256)

In [9]:
pkl_files = [os.path.join(root, name)
             for root, dirs, files in os.walk(dataloc)
             for name in files
             if all(y in name for y in ['docset', split, '.pkl'])]

# pkl_files = [ x for x in os.listdir(dataloc) if all(y in x for y in ['docset', '.pkl'])]

In [None]:
pkl_files

In [12]:
# Convert pickle to trectext file format to be processed with galago
# pkl_file = [s for s in pkl_files if split in s]
# [output_file, doc_list ]= pickle_to_json(pkl_file[0])
doc_list = []
output_files = []
all_docs = []
for pkl_file in pkl_files:
#     print(pkl_file)
    docs = get_pickle_docs(pkl_file)
    doc_list = doc_list + list(docs.keys())
    all_docs.append(docs)
    out_name = pkl_file.split('/')[-1:][0]
    out_name = re.sub('\.pkl', '', out_name)
    output_file = baseline_files + out_name + '.gz'
    output_files.append(output_file)
    # print(out_name)
    doc_to_jsonfile(docs, output_file)

In [None]:
# sets = [set(doc.keys()) for doc in all_docs]

In [None]:
pkl_file

In [14]:
data_split = split
print(data_split)

if "rob04" in output_files[0]:
    s = re.findall("(s[0-5]).pkl$", pkl_file)
    dataset_name = "rob04"
    dataset_name_ext = dataset_name + '_'+ s[0]
#     dataset_name_ext = dataset_name 
    gold_file = '../../robust04_data/rob04.' + split +'.json'
#     with open(gold_file, 'w') as outfile:
#         json.dump(query_data, outfile, indent = 4)
    print(dataset_name_ext)
elif "bioasq" in output_file:
    print("bioasq")
    dataset_name = "bioasq"
    dataset_name_ext = dataset_name

test
bioasq


In [15]:
index_loc = baseline_files + 'index' + '_' + dataset_name_ext + '_' + data_split
index_input = output_files
# build_index(index_input, index_loc)

In [None]:
output_file

In [None]:
q_filename = [ x for x in os.listdir(dataloc) if all(y in x for y in [dataset_name +'.'+ data_split, '.json'])]

In [None]:
q_filename

In [None]:
# queries_file = dataloc + q_filename[0]

def load_queries(queries_file):
    with open(queries_file, 'rb') as input_file:
        query_data = json.load(input_file)
        return query_data['questions']

In [None]:
query_files = [os.path.join(root, name)
             for root, dirs, files in os.walk(dataloc)
             for name in files
             if all(y in name for y in [dataset_name +'.'+ data_split, '.json'])]

# pkl_files = [ x for x in os.listdir(dataloc) if all(y in x for y in ['docset', '.pkl'])]

In [None]:
queries = []
query_data = {}
for file in query_files:
    queries = queries + load_queries(file)
query_data['questions'] = queries

In [None]:
def save_preds(file, preds):
    with open(file, 'wt') as f_out:
        json.dump(preds, f_out, indent=4)
    print('Predictions file: ' + file + ', done!')

In [None]:
query_files[0].strip('split_1')

In [None]:
print(index_loc)

## Allows BM25 b and k grid search

In [None]:
# def bm25_computing(b_k):
#     b = b_k[0]
#     k = b_k[1]
#     bm25_preds_file = baseline_files + 'bm25_preds_' + dataset_name_ext + '_'+ data_split + '_' + 'b' + str(b) + 'k' + str(k) + '.json'
# #     print(bm25_preds_file)
#     if os.path.isfile(bm25_preds_file):
#         print(bm25_preds_file + "Already exists!!")
#         return
#     bm25_preds = {}
#     questions = []
#     question = {}
#     for query in query_data['questions']:
#         question['body'] = query['body']
#         question['id'] = query['id']
#     #     print(query['body'].rstrip('.'))
#     #     documents = get_bm25_docs(query['body'].rstrip('.'), index_loc)
#         documents = get_bm25_docs(query['body'], index_loc, b, k)
#         if "bioasq" in dataset_name: 
#             documents_url = ['http://www.ncbi.nlm.nih.gov/pubmed/' + doc for doc in documents]
#             question['documents'] = documents_url
#         elif "rob04" in dataset_name:
#             question['documents'] = documents
#         questions.append(dict(question))
    
#     bm25_preds['questions'] = questions
#     save_preds(bm25_preds_file, bm25_preds)  

In [None]:
def start_process():
    print( 'Starting', multiprocessing.current_process().name)

In [None]:
def extract_question(query):
    question = {}
    question['body'] = query['body']
    question['id'] = query['id']
#     print(query['body'].rstrip('.'))
#     documents = get_bm25_docs(query['body'].rstrip('.'), index_loc)
    documents = get_bm25_docs(query['body'], index_loc)
    if "bioasq" in dataset_name: 
        documents_url = ['http://www.ncbi.nlm.nih.gov/pubmed/' + doc for doc in documents]
        question['documents'] = documents_url
    elif "rob04" in dataset_name:
        question['documents'] = documents
    return dict(question)

In [None]:
# get_bm25_docs(query_data['questions'][0]['body'], index_loc)
index_loc

In [None]:
# def bm25_computing(b_k):
#     b = b_k[0]
#     k = b_k[1]
b = 0.2
k = 0.8
bm25_preds_file = baseline_files + 'bm25_preds_' + dataset_name_ext + '_' + data_split + '_' + 'b' + str(b) + 'k' + str(k) + '.json'
#     print(bm25_preds_file)
if os.path.isfile(bm25_preds_file):
    print(bm25_preds_file + "Already exists!!")
#     return
bm25_preds = {}
questions = []
pool_size = 1
pool = multiprocessing.Pool(processes=pool_size,
                            initializer=start_process,
                            )
questions = pool.map(extract_question, query_data['questions'])
pool.close() # no more tasks
pool.join()  # wrap up current tasks
    
bm25_preds['questions'] = questions
save_preds(bm25_preds_file, bm25_preds)  

In [None]:
# if __name__ == '__main__':
#     grid_search = 'no'
#     if grid_search == 'yes':
#         brange = np.arange(0.2,1,0.1)
#         krange = np.arange(0.5,2,0.1)
#     else:
#         brange = [0.2]
#         krange = [0.8]

#     b_k = [(round(b,2), round(k,2)) for b in brange for k in krange]
#     pool_size = 8
#     pool = multiprocessing.Pool(processes=pool_size,
#                                 initializer=start_process,
#                                 )
#     pool_outputs = pool.map(bm25_computing, b_k)
#     pool.close() # no more tasks
#     pool.join()  # wrap up current tasks