# **Milestone 3:**
Semantic Search with ML and BERT


In [1]:
!pip install transformers
!pip install -U sentence-transformers
!pip install elasticsearch



### **Importing the require modules**

In [2]:
# import libraries
import json
import torch
import numpy as np
from elasticsearch import Elasticsearch, helpers
from transformers import AutoModel, AutoTokenizer, pipeline
from sentence_transformers import SentenceTransformer, util
from pprint import pprint

### **Getting the data**

In [3]:
DATA_DIR = '/mnt/c/Users/Daniel/elasticsearch/'

In [4]:
# load the json file
with open(DATA_DIR + '03_data.json', 'r') as outfile:
    data = json.load(outfile)

In [5]:
# print sample sentences
g = (d for d in data)
[next(g) for i in range(2)]

[{'section_title': 'Summary',
  'text': 'Pandemic,Summary\nA pandemic (from Greek πᾶν, pan, "all" and δῆμος, demos, "people") is an epidemic of an infectious disease that has spread across a large region, for instance multiple continents or worldwide, affecting a substantial number of people. A widespread endemic disease with a stable number of infected people is not a pandemic. Widespread endemic diseases with a stable number of infected people such as recurrences of seasonal influenza are generally excluded as they occur simultaneously in large regions of the globe rather than being spread worldwide.\nThroughout human history, there have been a number of pandemics of diseases such as smallpox and tuberculosis. The most fatal pandemic in recorded history was the Black Death (also known as The Plague), which killed an estimated 75–200 million people in the 14th century. The term was not used yet but was for later pandemics including the 1918 influenza pandemic (Spanish flu). Current pa

In [6]:
print(len(data))

401


### **Create Elasticsearch index**

In [7]:
# check if the elasticsearch container is running
!curl http://localhost:9200/

{
  "name" : "e3c8630687bc",
  "cluster_name" : "docker-cluster",
  "cluster_uuid" : "XRHTwItGQZG8bife8EVvSQ",
  "version" : {
    "number" : "7.16.3",
    "build_flavor" : "default",
    "build_type" : "docker",
    "build_hash" : "4e6e4eab2297e949ec994e688dad46290d018022",
    "build_date" : "2022-01-06T23:43:02.825887787Z",
    "build_snapshot" : false,
    "lucene_version" : "8.10.1",
    "minimum_wire_compatibility_version" : "6.8.0",
    "minimum_index_compatibility_version" : "6.0.0-beta1"
  },
  "tagline" : "You Know, for Search"
}


In [8]:
# instantiate elasticsearch client
client = Elasticsearch()

In [9]:
# use helpers' bulk API to index list of elasticsearch docs
response = helpers.bulk(client, data, index='pandemics')



In [10]:
# count documents in all indices
client.cat.count("_all", params={"format": "json"})

  client.cat.count("_all", params={"format": "json"})


[{'epoch': '1643030738', 'timestamp': '13:25:38', 'count': '1203'}]

In [11]:
# check currently available indices
client.indices.get_alias("_all")

  client.indices.get_alias("_all")


{'pandemics': {'aliases': {}}}

In [12]:
# count documents in specific index
client.cat.count(['pandemics'], params={"format": "json"})

  client.cat.count(['pandemics'], params={"format": "json"})


[{'epoch': '1643030738', 'timestamp': '13:25:38', 'count': '1203'}]

### **Different queries**

#### **Match query**

In [13]:
# create a query body and search a string (fuzzy query)
question = "spanish flu"
match_query_body = {"query": {"match": {"text": question}}}

In [14]:
# submit the search query to elasticsearch
docs = client.search(body = match_query_body, index="pandemics", size=15)

print(type(docs))
docs

<class 'dict'>


  docs = client.search(body = match_query_body, index="pandemics", size=15)


{'took': 33,
 'timed_out': False,
 '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0},
 'hits': {'total': {'value': 135, 'relation': 'eq'},
  'max_score': 9.225321,
  'hits': [{'_index': 'pandemics',
    '_type': '_doc',
    '_id': 'vjGwi34Bc6e2DGlkoCBX',
    '_score': 9.225321,
    '_ignored': ['text.keyword'],
    '_source': {'section_title': 'Influenza',
     'text': 'Pandemic,Notable outbreaks,Major outbreaks in countries,Influenza\nThe Greek physician Hippocrates, the "Father of Medicine", first described influenza in 412 BC.\nThe first influenza pandemic to be pathologically described occurred in 1510. Since the pandemic of 1580, influenza pandemics have occurred every 10 to 30 years.\nThe 1889–1890 flu pandemic, also known as Russian Flu or Asiatic Flu, was first reported in May 1889 in Bukhara, Uzbekistan. By October, it had reached Tomsk and the Caucasus. It rapidly spread west and hit North America in December 1889, South America in February–April 1890, India

In [15]:
# print the three top results
def top_three_results(docs, responses):
    for doc in docs["hits"]["hits"][:3]:
        article_title = doc['_source']['article_title']
        section_title = doc['_source']['section_title']
        score = doc['_score']
        idx = doc['_index']
        text = doc['_source']['text']
        url = doc['_source']['source_url']

        responses.append({'article_title': article_title,
                         'section_title': section_title,
                         'text': text,
                          'url': url,
                          'score': score,
                         })
    return responses

In [16]:
responses = []
pprint(top_three_results(docs=docs, responses=responses))

[{'article_title': 'Pandemic',
  'score': 9.225321,
  'section_title': 'Influenza',
  'text': 'Pandemic,Notable outbreaks,Major outbreaks in countries,Influenza\n'
          'The Greek physician Hippocrates, the "Father of Medicine", first '
          'described influenza in 412 BC.\n'
          'The first influenza pandemic to be pathologically described '
          'occurred in 1510. Since the pandemic of 1580, influenza pandemics '
          'have occurred every 10 to 30 years.\n'
          'The 1889–1890 flu pandemic, also known as Russian Flu or Asiatic '
          'Flu, was first reported in May 1889 in Bukhara, Uzbekistan. By '
          'October, it had reached Tomsk and the Caucasus. It rapidly spread '
          'west and hit North America in December 1889, South America in '
          'February–April 1890, India in February–March 1890, and Australia in '
          'March–April 1890. The H3N8 and H2N2 subtypes of the Influenza A '
          'virus have each been identified as

#### **Term query**

In [17]:
# create a query body and search for a perfect match
## note that you need to use the keyword version of the field if your search query contains more that one word.
term_question = "Zoonotic viruses"
term_query_body = {"query": {"term": {"section_title.keyword": term_question}}}

In [18]:
# submit the search query to elasticsearch
term_docs = client.search(body = term_query_body, index="pandemics", size=5)

print(type(term_docs))
term_docs

<class 'dict'>


  term_docs = client.search(body = term_query_body, index="pandemics", size=5)


{'took': 13,
 'timed_out': False,
 '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0},
 'hits': {'total': {'value': 3, 'relation': 'eq'},
  'max_score': 5.8406415,
  'hits': [{'_index': 'pandemics',
    '_type': '_doc',
    '_id': 'XjGwi34Bc6e2DGlkoCFb',
    '_score': 5.8406415,
    '_ignored': ['text.keyword'],
    '_source': {'section_title': 'Zoonotic viruses',
     'text': 'Disease X,Candidates,Zoonotic viruses\nOn the addition of Disease X in 2018, the WHO said it could come from many sources citing haemorrhagic fevers and the more recent non-polio enterovirus. However, Røttingen speculated that Disease X would be more likely come from zoonotic transmission (an animal virus that jumps to humans), saying: "It\'s a natural process and it is vital that we are aware and prepare. It is probably the greatest risk". WHO special advisor Professor Marion Koopmans, also noted that the rate at which zoonotic diseases were appearing was accelerating, saying: "The intensity of

In [19]:
# print the three top results
term_responses = []
pprint(top_three_results(term_docs, responses=term_responses))

[{'article_title': 'Disease X',
  'score': 5.8406415,
  'section_title': 'Zoonotic viruses',
  'text': 'Disease X,Candidates,Zoonotic viruses\n'
          'On the addition of Disease X in 2018, the WHO said it could come '
          'from many sources citing haemorrhagic fevers and the more recent '
          'non-polio enterovirus. However, Røttingen speculated that Disease X '
          'would be more likely come from zoonotic transmission (an animal '
          'virus that jumps to humans), saying: "It\'s a natural process and '
          'it is vital that we are aware and prepare. It is probably the '
          'greatest risk". WHO special advisor Professor Marion Koopmans, also '
          'noted that the rate at which zoonotic diseases were appearing was '
          'accelerating, saying: "The intensity of animal and human contact is '
          'becoming much greater as the world develops. This makes it more '
          'likely new diseases will emerge but also modern travel and

#### **Boolean query**

In [20]:
# exclude sections from the search results
bool_question = "world health organization"
exclude_sections = "External links"
bool_query_body = {
  "query": {
    "bool": {
        "should": 
        { "match": {"text": bool_question}},
        "must_not": {
            "term" : { "section_title.keyword" : exclude_sections }
      },
    }
  }
}

In [21]:
# submit the search query to elasticsearch
bool_docs = client.search(body = bool_query_body, index="pandemics", size=5)

print(type(bool_docs))
bool_docs

<class 'dict'>


  bool_docs = client.search(body = bool_query_body, index="pandemics", size=5)


{'took': 14,
 'timed_out': False,
 '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0},
 'hits': {'total': {'value': 474, 'relation': 'eq'},
  'max_score': 7.7904243,
  'hits': [{'_index': 'pandemics',
    '_type': '_doc',
    '_id': 'ZjGwi34Bc6e2DGlkoCFb',
    '_score': 7.7904243,
    '_ignored': ['text.keyword'],
    '_source': {'section_title': 'Summary',
     'text': 'Johns Hopkins Center for Health Security,Summary\nThe Johns Hopkins Center for Health Security (abbreviated CHS; previously the UPMC Center for Health Security, the Center for Biosecurity of UPMC, and the Johns Hopkins Center for Civilian Biodefense Strategies) is an independent, nonprofit organization of the Johns Hopkins Bloomberg School of Public Health, and part of the Environmental Health and Engineering department. It is concerned with the areas of health consequences from epidemics and disasters as well as averting biological weapons development, and implications of biosecurity for the bioeconom

In [22]:
# print the three top results
bool_responses = []
pprint(top_three_results(bool_docs, responses=bool_responses))

[{'article_title': 'Johns Hopkins Center for Health Security',
  'score': 7.7904243,
  'section_title': 'Summary',
  'text': 'Johns Hopkins Center for Health Security,Summary\n'
          'The Johns Hopkins Center for Health Security (abbreviated CHS; '
          'previously the UPMC Center for Health Security, the Center for '
          'Biosecurity of UPMC, and the Johns Hopkins Center for Civilian '
          'Biodefense Strategies) is an independent, nonprofit organization of '
          'the Johns Hopkins Bloomberg School of Public Health, and part of '
          'the Environmental Health and Engineering department. It is '
          'concerned with the areas of health consequences from epidemics and '
          'disasters as well as averting biological weapons development, and '
          'implications of biosecurity for the bioeconomy. It is a think tank '
          'that does policy research and gives policy recommendations to the '
          'United States government as well a

#### **Field boosting**

In [23]:
# boost title fields by a factor of 3
boost_question = "covid-19 pandemic"
boost_query_body = {
    "query": {
        "multi_match": {
            "query": boost_question,
            "fields": [ "article_title^3", "section_title^3" ] 
        }
    }
}

In [24]:
# submit the search query to elasticsearch
boost_docs = client.search(body = boost_query_body, index="pandemics", size=5)

print(type(boost_docs))
boost_docs

<class 'dict'>


  boost_docs = client.search(body = boost_query_body, index="pandemics", size=5)


{'took': 28,
 'timed_out': False,
 '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0},
 'hits': {'total': {'value': 393, 'relation': 'eq'},
  'max_score': 28.001842,
  'hits': [{'_index': 'pandemics',
    '_type': '_doc',
    '_id': 'uzGwi34Bc6e2DGlkoCBX',
    '_score': 28.001842,
    '_ignored': ['text.keyword'],
    '_source': {'section_title': 'COVID-19',
     'text': 'Pandemic,Current pandemics,COVID-19\nA new strain of coronavirus was first identified in the city of Wuhan, Hubei province, China, in late December 2019. It has caused a cluster of cases of an acute respiratory disease, which is referred to as coronavirus disease 2019 (COVID-19). According to media reports, more than 200 countries and territories have been affected by COVID-19, with major outbreaks occurring in Brazil, Russia, India, Mexico, Peru, South Africa, Western Europe and the United States. On 11 March 2020, the World Health Organization characterized the spread of COVID-19 as a pandemic. As o

In [25]:
# print the three top results
boost_responses = []
pprint(top_three_results(boost_docs, responses=boost_responses))

[{'article_title': 'Pandemic',
  'score': 28.001842,
  'section_title': 'COVID-19',
  'text': 'Pandemic,Current pandemics,COVID-19\n'
          'A new strain of coronavirus was first identified in the city of '
          'Wuhan, Hubei province, China, in late December 2019. It has caused '
          'a cluster of cases of an acute respiratory disease, which is '
          'referred to as coronavirus disease 2019 (COVID-19). According to '
          'media reports, more than 200 countries and territories have been '
          'affected by COVID-19, with major outbreaks occurring in Brazil, '
          'Russia, India, Mexico, Peru, South Africa, Western Europe and the '
          'United States. On 11 March 2020, the World Health Organization '
          'characterized the spread of COVID-19 as a pandemic. As of 16 '
          'November 2020, the number of people infected with COVID-19 has '
          'reached 54,978,057 worldwide, of whom 38,243,617 have recovered. '
          'The deat

#### **Highlights**

In [26]:
# create a query body and a search string (...)
hl_question = "genetic mutations"
hl_query_body = {
    "query": {
        "match": { "text": hl_question }
    },
    "highlight": {
        "fields": {"text": {"number_of_fragments" : 1, 'fragment_size':256}
                  }
    }
}

In [27]:
# submit the search query to elasticsearch
hl_docs = client.search(body = hl_query_body, index="pandemics", size=5)

print(type(hl_docs))
hl_docs

  hl_docs = client.search(body = hl_query_body, index="pandemics", size=5)


<class 'dict'>


{'took': 55,
 'timed_out': False,
 '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0},
 'hits': {'total': {'value': 84, 'relation': 'eq'},
  'max_score': 8.6498,
  'hits': [{'_index': 'pandemics',
    '_type': '_doc',
    '_id': 'KDGwi34Bc6e2DGlkoCJe',
    '_score': 8.6498,
    '_ignored': ['text.keyword'],
    '_source': {'section_title': 'Genetic mutation',
     'text': 'Virus,Microbiology,Genetic mutation\nViruses undergo genetic change by several mechanisms. These include a process called antigenic drift where individual bases in the DNA or RNA mutate to other bases. Most of these point mutations are "silent"—they do not change the protein that the gene encodes—but others can confer evolutionary advantages such as resistance to antiviral drugs. Antigenic shift occurs when there is a major change in the genome of the virus. This can be a result of recombination or reassortment. When this happens with influenza viruses, pandemics might result. RNA viruses often exist

In [28]:
# print the three top results
hl_responses = []
pprint(top_three_results(hl_docs, responses=hl_responses))

[{'article_title': 'Virus',
  'score': 8.6498,
  'section_title': 'Genetic mutation',
  'text': 'Virus,Microbiology,Genetic mutation\n'
          'Viruses undergo genetic change by several mechanisms. These include '
          'a process called antigenic drift where individual bases in the DNA '
          'or RNA mutate to other bases. Most of these point mutations are '
          '"silent"—they do not change the protein that the gene encodes—but '
          'others can confer evolutionary advantages such as resistance to '
          'antiviral drugs. Antigenic shift occurs when there is a major '
          'change in the genome of the virus. This can be a result of '
          'recombination or reassortment. When this happens with influenza '
          'viruses, pandemics might result. RNA viruses often exist as '
          'quasispecies or swarms of viruses of the same species but with '
          'slightly different genome nucleoside sequences. Such quasispecies '
          'are a p

### **Reranking with BERT**

**Boolean query**

In [29]:
# using a boolean query to exclude irrelevant sections
def boolean_query(exclude_sections, query, index=""):

    bool_query_body = {
      "query": {
        "bool": {
            "should": 
            { "match": {"text": query}},
            "must_not": {
                "terms": {"section_title.keyword": exclude_sections}
          },
        }
      }
    }

    # submit a search query to ElasticSearch
    bool_docs = client.search(body = bool_query_body, index=index, size=50)
    
    # reshape search results to prepare them for sentence embeddings
    texts = []
    section_titles = []
    article_titles = []
    score = []
    
    for doc in docs["hits"]["hits"]:
        texts.append(doc["_source"]["text"])
        section_titles.append(doc["_source"]["section_title"])
        article_titles.append(doc["_source"]["article_title"])
        score.append(doc["_score"])
    
    return texts, article_titles, section_titles, score

In [30]:
query = "what is desease X?"
exclude_sections = ["See also", "Further reading", "Data and graphs", "Medical journals", "External links"]

results = boolean_query(exclude_sections=exclude_sections,
                       index="pandemics",
                       query=query)

  bool_docs = client.search(body = bool_query_body, index=index, size=50)


In [31]:
# print the titles (in descending order of elasticsearch's tfidf_scores )
print(results[1])

['Pandemic', 'Pandemic', 'Pandemic', 'Disease X', 'Disease X', 'Disease X', 'Swine influenza', 'Swine influenza', 'Swine influenza', 'Pandemic', 'Pandemic', 'Pandemic', 'Pandemic', 'Pandemic', 'Pandemic']


**Sentence embeddings**

In [32]:
def compute_embeddings(query, results, model, top_k=10):
    texts = results[0]
    article_titles = results[1]
    section_titles = results[2]
    
    emb = SentenceTransformer(model)
    corpus_emb = emb.encode(texts, convert_to_tensor=True)
    query_emb = emb.encode(query, convert_to_tensor=True)
    reranked_results = util.semantic_search(query_emb, corpus_emb, top_k=top_k)[0]
    
    rr_results_list = []
    for item in reranked_results:
        idx = item['corpus_id']
        rr_results_dict = {
            'bert_score': item['score'],
            'article_title': article_titles[idx],
            'section_title': section_titles[idx],
            'text': texts[idx]
        }
        rr_results_list.append(rr_results_dict)
    
    return rr_results_list

In [33]:
# using the distilled BERT model
reranked = compute_embeddings(query, results, model='distilbert-base-nli-stsb-mean-tokens')
pprint(reranked)

[{'article_title': 'Disease X',
  'bert_score': 0.21330812573432922,
  'section_title': 'In popular culture',
  'text': 'Disease X,In popular culture,Candidates\n'
          'In 2018, the Museum of London ran an exhibition titled "Disease X: '
          'London\'s next epidemic?", hosted for the centenary of the Spanish '
          'flu epidemic from 1918.The term features in the title of several '
          'fiction books that involve global pandemic diseases, such as '
          'Disease  (2020), and Disease X: The Outbreak (2019).'},
 {'article_title': 'Disease X',
  'bert_score': 0.21330812573432922,
  'section_title': 'In popular culture',
  'text': 'Disease X,In popular culture,Candidates\n'
          'In 2018, the Museum of London ran an exhibition titled "Disease X: '
          'London\'s next epidemic?", hosted for the centenary of the Spanish '
          'flu epidemic from 1918.The term features in the title of several '
          'fiction books that involve global pandemic d