In [None]:
! pip install elasticsearch=7.6.0

In [1]:
# Import packages 
try:
    import pandas as pd
    import numpy as np
    import elasticsearch
    from elasticsearch import Elasticsearch
    from elasticsearch import helpers
    import os
    
except Exception as e:
    print(e)

In [2]:
# Connect to your Elasticsearch instance (see Appendix A for the server setup).
# The client talks plain HTTP to localhost:9200 and is created with timeout=40.
es = Elasticsearch(
    hosts=[{"host": "localhost", "port": 9200, "scheme": "http"}],
    timeout=40,
)

In [3]:
# Test your connection: the cell displays the value returned by ping()
# (True in the recorded output below).
es.ping()

True

In [8]:
# Configure Elasticsearch: index-level settings, a custom analyzer, and the field mappings
Settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0,
        "analysis": {
            # custom_analyzer: whitespace tokenizer, then the lowercase and kstem filters
            "analyzer": {
                "custom_analyzer": {
                    "type": "custom",
                    "tokenizer": "whitespace",
                    "filter": ["lowercase", "kstem"],
                },
            },
            # filter named "kstem" declared with type "kstem"
            "filter": {"kstem": {"type": "kstem"}},
        },
    },
    "mappings": {
        "properties": {
            # both text fields are analyzed with custom_analyzer
            "ArticleTitle": {"type": "text", "analyzer": "custom_analyzer"},
            "AbstractText": {"type": "text", "analyzer": "custom_analyzer"},
            # PMID is declared as a keyword with "index" set to the string "false"
            "PMID": {"type": "keyword", "index": "false"},
        },
    },
}

In [9]:
# Create the index from the Settings body.
# ignore=[400, 404] is passed so those HTTP statuses do not raise in the client;
# NOTE(review): presumably this keeps a re-run on an existing index from failing, but it
# would also hide a rejected settings body - check the dict displayed below.
es.indices.create(
    index="genomicsindex",
    body=Settings,
    ignore=[400, 404],
)

{'acknowledged': True, 'shards_acknowledged': True, 'index': 'genomicsindex'}

In [12]:
# Check the index: fetch the alias listing for every index ("*") and print each
# top-level key of the response, one per line.
indices = es.indices.get_alias(index="*")
for index in indices:
    print(index)

.kibana_task_manager_1
genomicsindex
.kibana_1
.apm-agent-configuration


In [4]:
# OR query generator
def generate_or_query(user_query, size=30, field="AbstractText"):
    """Build a ``query_string`` search body for a free-text query.

    No ``default_operator`` is set in the body, so the server-side default of
    the ``query_string`` query applies to the terms of ``user_query``.

    Args:
        user_query: Text placed unchanged in the ``query`` slot. It is not
            escaped here, so characters that are reserved in query-string
            syntax are passed through as written.
        size: Value of the ``size`` key (number of hits requested).
            Defaults to 30.
        field: Value of ``default_field``, i.e. the field searched.
            Defaults to ``"AbstractText"``.

    Returns:
        dict: A request body with the keys ``size`` and ``query``.
    """
    return {
        "size": size,
        "query": {
            "query_string": {
                "default_field": field,
                "query": user_query,
            }
        },
    }


In [5]:
# Construct MWE query
def construct_mwe_query(query, mwe, size=30, field="AbstractText"):
    """Build a ``bool``/``should`` body that treats multi-word expressions as phrases.

    Every non-blank entry of ``mwe`` becomes a ``match_phrase`` clause. The
    entries are also cut out of ``query``; each whitespace-separated term that
    remains becomes a ``match`` clause.

    Args:
        query: The full query text.
        mwe: Iterable of multi-word expression strings. Blank entries are
            skipped. An entry yields a phrase clause whether or not it occurs
            in ``query``.
        size: Value of the ``size`` key. Defaults to 30.
        field: Field name used in every clause. Defaults to ``"AbstractText"``.

    Returns:
        dict: ``{"size": size, "query": {"bool": {"should": [...]}}}`` with the
        ``match`` clauses first, followed by the ``match_phrase`` clauses in
        the order the expressions were given.
    """
    import re  # function-scope import keeps this cell self-contained

    phrases = [multi_word for multi_word in mwe if multi_word.strip()]

    remaining = query
    # Longest first, so a short expression cannot break a longer one that contains it.
    for multi_word in sorted(phrases, key=len, reverse=True):
        # Lookarounds restrict removal to whole-word occurrences: "cell line"
        # must not be cut out of "cell lines" and leave a stray "s" term.
        pattern = r"(?<!\w)" + re.escape(multi_word) + r"(?!\w)"
        # Replace with a space so the neighbouring words are never glued together.
        remaining = re.sub(pattern, " ", remaining)

    # Split the remaining query into terms
    terms = remaining.split()

    match_clauses = [{"match": {field: term}} for term in terms]
    match_phrase_clauses = [
        {"match_phrase": {field: multi_word}} for multi_word in phrases
    ]

    bool_query = {"bool": {"should": match_clauses + match_phrase_clauses}}

    return {"size": size, "query": bool_query}




In [6]:
# Summarise an Elasticsearch search response as a list of per-hit records
def pretty_response(response):
    """Reduce a search response to one small dict per hit.

    Prints either a "no results" notice or the number of hits.

    Args:
        response: Mapping shaped like an Elasticsearch search response; the
            hits are read from ``response["hits"]["hits"]`` and each hit must
            provide ``_id``, ``_score`` and ``_source["PMID"]``.

    Returns:
        list[dict]: One ``{"PMID": ..., "score": ..., "id": ...}`` entry per
        hit, in response order. An empty list when there are no hits, so
        callers can always iterate the result.

    Raises:
        KeyError: If the response or a hit lacks one of the keys read above.
    """
    hits = response["hits"]["hits"]
    if not hits:
        print("Your search returned no results.")
        # Return an empty list rather than falling through to an implicit None.
        return []

    print("Length: ", len(hits))
    return [
        {"PMID": hit["_source"]["PMID"], "score": hit["_score"], "id": hit["_id"]}
        for hit in hits
    ]

In [7]:
# Enter query and the MWE present in it
query = "generate transgenic mice"
mwe = ["transgenic mice"]  # expressions that construct_mwe_query turns into match_phrase clauses
# Build one request body per query style from the same query text
mwe_query = construct_mwe_query(query, mwe)
regular_query = generate_or_query(query)
# Print both bodies to inspect what will be sent to the index
print(mwe_query)
print(regular_query)

{'size': 30, 'query': {'bool': {'should': [{'match': {'AbstractText': 'generate'}}, {'match_phrase': {'AbstractText': 'transgenic mice'}}]}}}
{'size': 30, 'query': {'query_string': {'default_field': 'AbstractText', 'query': 'generate transgenic mice'}}}


In [12]:
# Search the index with both query bodies, keeping each response under its own name
# (assigning both to `response` would discard the MWE result).
mwe_response = es.search(index="genomicsindex", body=mwe_query)
regular_response = es.search(index="genomicsindex", body=regular_query)
# `response` still ends up holding the regular-query result, as it did before.
response = regular_response

In [7]:
# Get the results
def getResult(response):
    """Index the hits of a search response by PMID.

    Args:
        response: Search response passed straight to ``pretty_response``.

    Returns:
        dict: Maps each hit's PMID to ``{"id": ..., "score": ...}``. When
        several hits share a PMID the last one wins. Empty when the search
        produced no hits.
    """
    # `or []` keeps this safe when pretty_response yields None for an empty search.
    result = pretty_response(response) or []
    return {
        doc["PMID"]: {"id": doc["id"], "score": doc["score"]}
        for doc in result
    }