# Searching Documents

In [1]:
import elasticsearch
from elasticsearch import Elasticsearch
es = Elasticsearch(hosts=['localhost:9200'])

**Variables for query**

In [2]:
data_year = '2016'
index_name = 'pubmed_baseline_'+ data_year

**Basic Query: for testing connection**

In [3]:
query = {
    "query": {
        "multi_match" : {
              "query":    "evidence tomato juice lowers cholesterol levels", 
              "fields": [ "abstract", "title" ] 
        }
    }
}

res = es.search(index=index_name, body=query) 
for doc in res['hits']['hits']:
    #print("%s) %s" % (doc['_id'], doc['_source']))
    print(doc['_id'],doc['_source']['title'])

(u'17130968', u'[Short-term Lycopersicum esculentum consumption may increase plasma high density lipoproteins and decrease oxidative stress].')
(u'17629300', u'Contribution of tomato phenolics to antioxidation and down-regulation of blood lipids.')
(u'21755327', u'Effect of consumption of tomato juice enriched with n-3 polyunsaturated fatty acids on the lipid profile, antioxidant biomarker status, and cardiovascular disease risk in healthy women.')
(u'24392102', u'The inhibitory effects of bioactive compounds of tomato juice binding to hepatic HMGCR: in vivo study and molecular modelling.')
(u'15199897', u'Designer orange juice lowers cholesterol.')
(u'17617941', u'Tomato juice decreases LDL cholesterol levels and increases LDL resistance to oxidation.')
(u'17640421', u'Influence of lycopene and vitamin C from tomato juice on biomarkers of oxidative stress and inflammation.')
(u'18342167', u'Citrate levels in fresh tomato juice: a possible dietary alternative to traditional citrate sup

## To see
elasticsearch is performing poorly, we use our training data set to see what is wrong.

In [4]:
# Training 6b document path
doc_path = '/data1/bioasq/raw_dataset/BioASQ-trainingDataset6b.json'

In [5]:
import json

def return_question_document_pairs(doc_path):
    """
    Returns: a dictionary with keys: question_body and values
    equal to the array of documents
    """
    with open(doc_path) as json_data:
        training_data = json.load(json_data)
        pair_que_doc = {}
        for question in training_data["questions"]:
            pair_que_doc[ question["body"] ] = question["documents"]
        return pair_que_doc
    return

**Function to search the questions from the training set on elasticsearch**

In [6]:
def search_question(question_to_search):
    """
    Returns: A list with de documents that returns
    elasticsearch for a given question
    """
    documents_found = set()
    query = {
         "query": {
             "multi_match" : {
                "query":    question_to_search, 
                "fields": [ "abstract", "title" ] 
             }
         }
     }
    res = es.search(index=index_name, body=query)
    for doc in res['hits']['hits']:
        documents_found.add(doc['_id'])
    return documents_found

**Measure of recall and presicion**

In [7]:
def print_measures(question_to_start, number_of_questions_to_visit):

    questions_training = return_question_document_pairs(doc_path)
    questions_body = questions_training.keys()
    last_url_section = lambda x: x.split("/")[-1] # Select last part of url to identify id
    
    recall_total = 0
    precision_total = 0
    
    for i in range(question_to_start, number_of_questions_to_visit):
        question = questions_body[i]
        relevant_documents = set(map(last_url_section, questions_training[question]))    
        retrieved_documents = search_question(question)
        
        
        recall_total += len(relevant_documents.intersection(retrieved_documents))/len(relevant_documents)
        precision_total += len(relevant_documents.intersection(retrieved_documents))/len(retrieved_documents)


    print "Recall:",float(recall_total)/number_of_questions_to_visit
    print "Precision:",float(precision_total)/number_of_questions_to_visit

In [8]:
print_measures(0, 100)

Recall: 0.07
Precision: 0.0


**Changing baseline**

In [9]:
data_year = '2018'
index_name = 'pubmed_baseline_'+ data_year

In [None]:
print_measures(0, 100)