## Investigating query performance

In [1]:
import pickle
import json
import gzip
import os
import subprocess
import numpy as np
import multiprocessing
import re 

In [2]:
## Options

# Run a grid search over the BM25 b and k parameters ('yes'), or score once
# with the fixed b/k values set in the final cell ('no')?
# grid_search = 'yes' 
grid_search = 'no' 

# Build the galago index before searching ('yes'), or skip the build (any other value)?
build_index_flag = 'yes'
# build_index_flag = 'no'

# Number of worker processes for the multiprocessing pool used by the grid search.
pool_size = 20

In [3]:
# Define paths
# dataloc: directory searched for the docset pickles and the query JSON files.
# dataloc = '../../bioasq_data/'
dataloc = '../../robust04_data/split_1/'
# baseline_files: output directory. File names are appended to it by plain string
# concatenation, so the trailing '/' is required.
baseline_files ='./baseline_files/'
# galago_loc: directory holding the `galago` executable (also concatenated, keep the '/').
galago_loc='./galago-3.10-bin/bin/'

In [4]:
# Select data split to work with
# The value is matched as a substring of the docset/query file names and is
# embedded in the names of the generated index, query and prediction files.
split = "test"
# split = "dev"
# split = "train"

In [5]:
def remove_sc(text):
    """Replace special characters in ``text`` with spaces.

    Every character that is neither a word character (``\\w``, which includes
    the underscore) nor whitespace is replaced by a single space. Nothing is
    collapsed or stripped, so the result has the same length as the input.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string.
    """
    special_chars = re.compile(r'[^\w\s]')
    return special_chars.sub(' ', text)

In [6]:
def get_pickle_docs(pickle_filename):
    """Load a pickled document collection and render each entry as a TREC-text record.

    The pickle is expected to hold a dict mapping a document key to a dict with
    at least the entries 'title' and 'abstractText' (and optionally 'pmid').

    Args:
        pickle_filename: Path of the pickle file to read.

    Returns:
        dict mapping each document code (the entry's 'pmid' when present,
        otherwise its key in the pickle) to a ``<DOC>...</DOC>`` string whose
        title and text have been passed through ``remove_sc``.

    Side effects:
        Creates the ``baseline_files`` directory if it does not exist yet.
    """
    # NOTE(security): pickle.load can execute arbitrary code. Only use this on
    # pickle files from a trusted source.
    with open(pickle_filename, 'rb') as f_in:
        data = pickle.load(f_in)

    # exist_ok avoids the check-then-create race of a separate os.path.exists test.
    os.makedirs(baseline_files, exist_ok=True)

    docs = {}
    for key, value in data.items():
        doc_code = value['pmid'] if 'pmid' in value else key
        # To index the raw text instead, drop the remove_sc() calls below.
        docs[doc_code] = ''.join([
            '<DOC>\n',
            '<DOCNO>', doc_code, '</DOCNO>\n',
            '<TITLE>', remove_sc(value['title']), '</TITLE>\n',
            '<TEXT>', remove_sc(value['abstractText']), '</TEXT>\n',
            '</DOC>\n',
        ])
    return docs

In [7]:
def doc_to_jsonfile(docs, filename):
    """Write TREC-text documents to a gzip-compressed UTF-8 text file.

    Despite the function name, no JSON is produced: every value of ``docs`` is
    written verbatim, in the dict's iteration order.

    Args:
        docs: dict mapping a document code to its TREC-text string; only the
            values are written.
        filename: Path of the gzip file to create (overwritten if it exists).
    """
    with gzip.open(filename, 'wt', encoding='utf-8') as f_out:
        f_out.writelines(docs.values())

In [8]:
# Build corpus index
def build_index(index_input, index_loc):
    """Run ``galago build`` over the given input files.

    Args:
        index_input: Iterable of input file paths; each is passed to galago as
            an ``--inputPath+<path>`` argument.
        index_loc: Directory of the index (created if missing), passed as
            ``--indexPath=<index_loc>``.

    The argument list and galago's stdout and stderr are printed. A non-zero
    exit status is reported but does not raise.
    """
    os.makedirs(index_loc, exist_ok=True)

    galago_parameters = [galago_loc + 'galago', 'build', '--stemmer+krovetz']
    galago_parameters.extend('--inputPath+' + idx for idx in index_input)
    galago_parameters.append('--indexPath=' + index_loc)
    print(galago_parameters)

    # stderr has to be piped as well; otherwise communicate() returns None for
    # it and any error output from galago never reaches the notebook.
    index_proc = subprocess.Popen(galago_parameters,
                                  stdout=subprocess.PIPE,
                                  stderr=subprocess.PIPE,
                                  shell=False)
    (out, err) = index_proc.communicate()
    print(out.decode("utf-8", errors="replace"))
    print(err.decode("utf-8", errors="replace"))
    if index_proc.returncode != 0:
        print('galago build exited with status ' + str(index_proc.returncode))

In [9]:
def generate_queries_file(queries, filename):
    """Write a galago batch-query JSON file and return an id -> body mapping.

    Args:
        queries: Iterable of dicts, each with an 'id' and a 'body' entry.
        filename: Path of the JSON file to write (UTF-8, indent 4).

    Returns:
        dict mapping each query id to its original, uncleaned body.

    The file has the form ``{"queries": [{"number": id, "text": "#stopword(...)"}]}``,
    where the text is the body after ``remove_sc``.
    """
    id_to_body = {}
    galago_queries = []
    for item in queries:
        cleaned = remove_sc(item['body'])
        id_to_body[item['id']] = item['body']
        galago_queries.append({
            'number': item['id'],
            'text': '#stopword(' + cleaned + ')',
        })

    payload = {'queries': galago_queries}
    with open(filename, 'wt', encoding='utf-8') as q_file:
        json.dump(payload, q_file, indent = 4)
    return id_to_body

In [10]:
# Return top 100 bm25 scored docs, given query and corpus indexed by galago
def get_bm25_docs(queries_file, q_dict, index_loc, b_val=0.2, k_val=0.8):
    """Run a galago BM25 batch search and collect the retrieved documents per query.

    Args:
        queries_file: Path of the galago query JSON file.
        q_dict: dict mapping query id to query body; one result entry is
            produced per item, in this dict's order.
        index_loc: Path of the galago index.
        b_val: Value passed as ``--b``.
        k_val: Value passed as ``--k``.

    Returns:
        list of dicts with keys 'body' and 'id', plus 'documents' when the
        global ``dataset_name`` contains "bioasq" (PubMed URLs) or "rob04"
        (raw document ids). Documents keep the order of galago's output.

    NOTE: the command is run through the shell (it pipes into ``cut``), so
    ``galago_loc``, ``index_loc`` and ``queries_file`` must be trusted and free
    of shell metacharacters.
    """
    command = ' '.join([
        galago_loc + 'galago', 'threaded-batch-search',
        '--threadCount=50', '--verbose=true',
        '--casefold=true', '--requested=100',
        '--index=' + index_loc,
        '--scorer=bm25',
        '--b=' + str(b_val),
        '--k=' + str(k_val),
        queries_file,
        '| cut -d" " -f1,3',
    ])
    galago_bm25_exec = subprocess.Popen(command, stdout=subprocess.PIPE, shell=True, encoding='utf-8')
    out, _ = galago_bm25_exec.communicate()

    # After `cut`, each result line is "<query id> <doc id>". Group once by the
    # exact query id: a substring test would let id "30" also pick up the lines
    # of query "130", and would rescan the whole output for every query.
    docs_by_query = {}
    for line in out.splitlines():
        fields = line.split(' ')
        if len(fields) < 2:
            continue  # not a result line (e.g. verbose output without a space)
        docs_by_query.setdefault(fields[0], []).append(fields[1])

    bm25_docs = []
    for key, value in q_dict.items():
        question = {'body': value, 'id': key}
        documents = docs_by_query.get(str(key), [])
        if "bioasq" in dataset_name:
            question['documents'] = ['http://www.ncbi.nlm.nih.gov/pubmed/' + doc for doc in documents]
        elif "rob04" in dataset_name:
            question['documents'] = documents
        bm25_docs.append(question)
    return bm25_docs

In [11]:
# Recursively collect every file under dataloc whose name contains 'docset',
# the selected split name and '.pkl' (substring matches, not an extension check).
pkl_files = [os.path.join(root, name)
             for root, dirs, files in os.walk(dataloc)
             for name in files
             if all(y in name for y in ['docset', split, '.pkl'])]

# pkl_files = [ x for x in os.listdir(dataloc) if all(y in x for y in ['docset', '.pkl'])]

In [12]:
# Show the docset pickle files found for the selected split.
pkl_files

['../../robust04_data/split_1/rob04_bm25_docset_top1000.test.s1.pkl']

In [13]:
# Convert each docset pickle to a gzipped trectext file that galago can index.
# Results: doc_list (all document codes), all_docs (one dict per pickle) and
# output_files (paths of the generated archives).
doc_list = []
output_files = []
all_docs = []
for pkl_file in pkl_files:
    docs = get_pickle_docs(pkl_file)
    doc_list.extend(docs.keys())
    all_docs.append(docs)
    # Name the archive after the pickle: drop the directory part and '.pkl'.
    out_name = os.path.basename(pkl_file)
    out_name = re.sub(r'\.pkl', '', out_name)  # raw string: '\.' is an invalid escape otherwise
    output_file = baseline_files + out_name + '.gz'
    output_files.append(output_file)
    doc_to_jsonfile(docs, output_file)

In [14]:
# NOTE(review): `pkl_file` is the loop variable left over from the conversion
# loop, i.e. the last pickle that was processed.
pkl_file

'../../robust04_data/split_1/rob04_bm25_docset_top1000.test.s1.pkl'

In [15]:
data_split = split
print(data_split)

# Infer the dataset from the first generated archive and its source pickle,
# so that both branches and the regex look at the same file.
first_output = output_files[0]
if "rob04" in first_output:
    dataset_name = "rob04"
    # Robust04 pickles are expected to end in '.s<N>.pkl'; the 's<N>' part
    # becomes the suffix of the extended dataset name.
    s = re.findall(r"(s[0-5])\.pkl$", pkl_files[0])
    if not s:
        raise ValueError("Cannot find a split tag 's0'-'s5' before '.pkl' in " + pkl_files[0])
    dataset_name_ext = dataset_name + '_' + s[0]
    gold_file = '../../robust04_data/rob04.' + split + '.json'
    print(dataset_name_ext)
elif "bioasq" in first_output:
    print("bioasq")
    dataset_name = "bioasq"
    dataset_name_ext = dataset_name
else:
    # Fail here rather than with a NameError on dataset_name in a later cell.
    raise ValueError("Unknown dataset (expected 'rob04' or 'bioasq' in " + first_output + ")")

test
rob04_s1


In [16]:
# Index directory for this dataset/split, and the trectext archives to be indexed.
index_loc = baseline_files + 'index' + '_' + dataset_name_ext + '_' + data_split
index_input = output_files

# Only build the index when this is enabled in the options cell.
if build_index_flag == 'yes':
    build_index(index_input, index_loc)

['./galago-3.10-bin/bin/galago', 'build', '--stemmer+krovetz', '--inputPath+./baseline_files/rob04_bm25_docset_top1000.test.s1.gz', '--indexPath=./baseline_files/index_rob04_s1_test']
Running without server!
Use --server=true to enable web-based status page.
/home/francisco/msc_project/not-a-punching-bag/reproduction/deep-relevance-ranking/models/baselines/./baseline_files/rob04_bm25_docset_top1000.test.s1.gz detected as trectext
Done Indexing.
  - 0.10 Hours
  - 5.92 Minutes
  - 355.39 Seconds
Documents Indexed: 44682.

None


In [17]:
# NOTE(review): `output_file` is left over from the conversion loop, i.e. the
# last archive that was written.
output_file

'./baseline_files/rob04_bm25_docset_top1000.test.s1.gz'

In [18]:
# Names of the files directly inside dataloc (non-recursive) that contain both
# '<dataset_name>.<data_split>' and '.json'.
# NOTE(review): apart from being displayed, q_filename does not appear to be used by
# the later cells visible here, which rely on query_files instead - confirm before removing.
q_filename = [ x for x in os.listdir(dataloc) if all(y in x for y in [dataset_name +'.'+ data_split, '.json'])]

In [19]:
# Show the matching query file names.
q_filename

['rob04.test.s1.json']

In [20]:
# queries_file = dataloc + q_filename[0]

def load_queries(queries_file):
    """Read a query JSON file and return the value stored under 'questions'.

    Args:
        queries_file: Path of a JSON file whose top-level object has a
            'questions' entry.

    Returns:
        The 'questions' value of the parsed JSON document.
    """
    with open(queries_file, 'rb') as handle:
        parsed = json.load(handle)
    return parsed['questions']

In [21]:
# Recursively collect every file under dataloc whose name contains both
# '<dataset_name>.<data_split>' and '.json' (substring matches).
query_files = [os.path.join(root, name)
             for root, dirs, files in os.walk(dataloc)
             for name in files
             if all(y in name for y in [dataset_name +'.'+ data_split, '.json'])]

In [22]:
# Concatenate the 'questions' lists of all query files into one list, and keep
# the same list under query_data['questions'].
queries = []
query_data = {}
for file in query_files:
    queries = queries + load_queries(file)
query_data['questions'] = queries

In [23]:
def save_preds(file, preds):
    """Dump ``preds`` as indented JSON to ``file`` and print a confirmation.

    Args:
        file: Path (string) of the JSON file to write; overwritten if present.
        preds: JSON-serialisable object to store.
    """
    with open(file, 'wt') as handle:
        json.dump(preds, handle, indent=4)
    message = 'Predictions file: ' + file + ', done!'
    print(message)

In [24]:
# NOTE(review): str.strip('split_1') removes any of the characters
# 's', 'p', 'l', 'i', 't', '_', '1' from both ends of the string; it does not
# remove the substring 'split_1'.
query_files[0].strip('split_1')

'../../robust04_data/split_1/rob04.test.s1.json'

In [25]:
def start_process():
    """Pool initializer: print the name of the process it runs in."""
    worker_name = multiprocessing.current_process().name
    print('Starting', worker_name)

In [26]:
def extract_question(query):
    """Retrieve the BM25 documents for a single query.

    Args:
        query: dict with an 'id' and a 'body' entry.

    Returns:
        dict with 'body', 'id' and, for the known datasets, 'documents', as
        produced by ``get_bm25_docs`` for this one query.

    ``get_bm25_docs`` works on a galago query file plus an id -> body mapping,
    so the query is written to a temporary one-query file first. The previous
    call ``get_bm25_docs(query['body'], index_loc)`` no longer matched that
    signature and raised a TypeError.
    """
    import tempfile

    tmp = tempfile.NamedTemporaryFile(suffix='.json', delete=False)
    tmp.close()  # only the path is needed; generate_queries_file reopens it
    try:
        single_q_dict = generate_queries_file([query], tmp.name)
        results = get_bm25_docs(tmp.name, single_q_dict, index_loc)
    finally:
        os.remove(tmp.name)
    # get_bm25_docs already applies the dataset-specific document formatting.
    return dict(results[0])

In [27]:
# get_bm25_docs(query_data['questions'][0]['body'], index_loc)
# Show the index directory used for the searches.
index_loc

'./baseline_files/index_rob04_s1_test'

In [28]:
# Write the galago batch-query file for this dataset/split; q_dict maps each
# query id to its original body.
bm25_queries_file = baseline_files + 'bm25_queries_' + dataset_name_ext + '_' + data_split + '.json'
q_dict = generate_queries_file(queries,bm25_queries_file)


In [29]:
def bm25_computing(b_k, overwrite=True):
    """Run the BM25 search for one (b, k) pair and save the predictions.

    Args:
        b_k: Sequence whose first two items are the b and k values.
        overwrite: When True (the default, matching the previous behaviour) an
            existing predictions file is recomputed and replaced. When False,
            an existing file is left untouched and the search is skipped.

    The predictions are written to
    ``<baseline_files>bm25_preds_<dataset>_<split>_b<b>k<k>.json``.
    """
    b, k = b_k[0], b_k[1]
    bm25_preds_file = (baseline_files + 'bm25_preds_' + dataset_name_ext + '_' + data_split
                       + '_' + 'b' + str(b) + 'k' + str(k) + '.json')
    if os.path.isfile(bm25_preds_file):
        if not overwrite:
            print(bm25_preds_file + ' already exists, skipping.')
            return
        print(bm25_preds_file + ' already exists, overwriting.')

    bm25_preds = {'questions': get_bm25_docs(bm25_queries_file, q_dict, index_loc, b, k)}
    save_preds(bm25_preds_file, bm25_preds)

In [30]:
if __name__ == '__main__':

    # Choose the (b, k) grid and the number of workers. The worker count is
    # kept in a local name so the configured pool_size is not overwritten;
    # otherwise a later re-run with grid_search = 'yes' would use one worker.
    if grid_search == 'yes':
        brange = np.arange(0,1,0.05)
        krange = np.arange(0.4,2,0.05)
        n_workers = pool_size
    else:
        brange = [0.2]
        krange = [0.8]
        n_workers = 1

    # Rounding removes floating-point noise from np.arange so the values give
    # clean file names.
    b_k = [(round(b,3), round(k,3)) for b in brange for k in krange]
    pool = multiprocessing.Pool(processes=n_workers,
                                initializer=start_process,
                                )
    try:
        pool_outputs = pool.map(bm25_computing, b_k)
    finally:
        # Release the workers even if a task raised.
        pool.close() # no more tasks
        pool.join()  # wrap up current tasks

Starting ForkPoolWorker-1
Predictions file: ./baseline_files/bm25_preds_rob04_s1_test_b0.3k0.7.json, done!
