# **Milestone 3:**
Semantic Search with ML and BERT


In [1]:
!pip install transformers
!pip install -U sentence-transformers
!pip install elasticsearch
!pip install wikipedia-api



### **Importing the require modules**

In [2]:
# import libraries
import json
import torch
import numpy as np
import pandas as pd
from elasticsearch import Elasticsearch, helpers
from transformers import AutoModel, AutoTokenizer, pipeline
from sentence_transformers import SentenceTransformer, util
from pprint import pprint
import wikipediaapi

## **Intro to ElasticSearch**

### **Getting the data**

In [3]:
DATA_DIR = '/mnt/c/Users/Daniel/elasticsearch/'

In [4]:
# load the json file
with open(DATA_DIR + '03_data.json', 'r') as outfile:
    data = json.load(outfile)

In [5]:
# print sample sentences
g = (d for d in data)
[next(g) for i in range(2)]

[{'section_title': 'Summary',
  'text': 'Pandemic,Summary\nA pandemic (from Greek πᾶν, pan, "all" and δῆμος, demos, "people") is an epidemic of an infectious disease that has spread across a large region, for instance multiple continents or worldwide, affecting a substantial number of people. A widespread endemic disease with a stable number of infected people is not a pandemic. Widespread endemic diseases with a stable number of infected people such as recurrences of seasonal influenza are generally excluded as they occur simultaneously in large regions of the globe rather than being spread worldwide.\nThroughout human history, there have been a number of pandemics of diseases such as smallpox and tuberculosis. The most fatal pandemic in recorded history was the Black Death (also known as The Plague), which killed an estimated 75–200 million people in the 14th century. The term was not used yet but was for later pandemics including the 1918 influenza pandemic (Spanish flu). Current pa

In [6]:
print(len(data))

401


### **Create Elasticsearch index**

In [7]:
# check if the elasticsearch container is running
!curl http://localhost:9200/

{
  "name" : "3653a1693ee0",
  "cluster_name" : "docker-cluster",
  "cluster_uuid" : "HNElKsF8Rv-AHXKtyJm7aw",
  "version" : {
    "number" : "7.17.0",
    "build_flavor" : "default",
    "build_type" : "docker",
    "build_hash" : "bee86328705acaa9a6daede7140defd4d9ec56bd",
    "build_date" : "2022-01-28T08:36:04.875279988Z",
    "build_snapshot" : false,
    "lucene_version" : "8.11.1",
    "minimum_wire_compatibility_version" : "6.8.0",
    "minimum_index_compatibility_version" : "6.0.0-beta1"
  },
  "tagline" : "You Know, for Search"
}


In [8]:
# instantiate elasticsearch client
client = Elasticsearch()

In [9]:
# use helpers' bulk API to index list of elasticsearch docs
response = helpers.bulk(client, data, index='pandemics')



In [10]:
# count documents in all indices
client.cat.count("_all", params={"format": "json"})

  client.cat.count("_all", params={"format": "json"})


[{'epoch': '1644842545', 'timestamp': '12:42:25', 'count': '802'}]

In [11]:
# check currently available indices
client.indices.get_alias("_all")

  client.indices.get_alias("_all")


{'pandemics': {'aliases': {}}}

In [12]:
# count documents in specific index
client.cat.count(['pandemics'], params={"format": "json"})

  client.cat.count(['pandemics'], params={"format": "json"})


[{'epoch': '1644842545', 'timestamp': '12:42:25', 'count': '802'}]

### **Different queries**

#### **Match query**

In [13]:
# create a query body and search a string (fuzzy query)
question = "spanish flu"
match_query_body = {"query": {"match": {"text": question}}}

In [14]:
# submit the search query to elasticsearch
docs = client.search(body = match_query_body, index="pandemics", size=15)

print(type(docs))
docs

<class 'dict'>


  docs = client.search(body = match_query_body, index="pandemics", size=15)


{'took': 4,
 'timed_out': False,
 '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0},
 'hits': {'total': {'value': 90, 'relation': 'eq'},
  'max_score': 9.211748,
  'hits': [{'_index': 'pandemics',
    '_type': '_doc',
    '_id': 'hiA7-H4BmN6FcmA2myTF',
    '_score': 9.211748,
    '_ignored': ['text.keyword'],
    '_source': {'section_title': 'Influenza',
     'text': 'Pandemic,Notable outbreaks,Major outbreaks in countries,Influenza\nThe Greek physician Hippocrates, the "Father of Medicine", first described influenza in 412 BC.\nThe first influenza pandemic to be pathologically described occurred in 1510. Since the pandemic of 1580, influenza pandemics have occurred every 10 to 30 years.\nThe 1889–1890 flu pandemic, also known as Russian Flu or Asiatic Flu, was first reported in May 1889 in Bukhara, Uzbekistan. By October, it had reached Tomsk and the Caucasus. It rapidly spread west and hit North America in December 1889, South America in February–April 1890, India i

In [15]:
# print the three top results
def top_three_results(docs, responses):
    for doc in docs["hits"]["hits"][:3]:
        article_title = doc['_source']['article_title']
        section_title = doc['_source']['section_title']
        score = doc['_score']
        idx = doc['_index']
        text = doc['_source']['text']
        url = doc['_source']['source_url']

        responses.append({'article_title': article_title,
                         'section_title': section_title,
                         'text': text,
                          'url': url,
                          'score': score,
                         })
    return responses

In [16]:
responses = []
pprint(top_three_results(docs=docs, responses=responses))

[{'article_title': 'Pandemic',
  'score': 9.211748,
  'section_title': 'Influenza',
  'text': 'Pandemic,Notable outbreaks,Major outbreaks in countries,Influenza\n'
          'The Greek physician Hippocrates, the "Father of Medicine", first '
          'described influenza in 412 BC.\n'
          'The first influenza pandemic to be pathologically described '
          'occurred in 1510. Since the pandemic of 1580, influenza pandemics '
          'have occurred every 10 to 30 years.\n'
          'The 1889–1890 flu pandemic, also known as Russian Flu or Asiatic '
          'Flu, was first reported in May 1889 in Bukhara, Uzbekistan. By '
          'October, it had reached Tomsk and the Caucasus. It rapidly spread '
          'west and hit North America in December 1889, South America in '
          'February–April 1890, India in February–March 1890, and Australia in '
          'March–April 1890. The H3N8 and H2N2 subtypes of the Influenza A '
          'virus have each been identified as

#### **Term query**

In [17]:
# create a query body and search for a perfect match
## note that you need to use the keyword version of the field if your search query contains more that one word.
term_question = "Zoonotic viruses"
term_query_body = {"query": {"term": {"section_title.keyword": term_question}}}

In [18]:
# submit the search query to elasticsearch
term_docs = client.search(body = term_query_body, index="pandemics", size=5)

print(type(term_docs))
term_docs

<class 'dict'>


  term_docs = client.search(body = term_query_body, index="pandemics", size=5)


{'took': 2,
 'timed_out': False,
 '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0},
 'hits': {'total': {'value': 2, 'relation': 'eq'},
  'max_score': 5.772064,
  'hits': [{'_index': 'pandemics',
    '_type': '_doc',
    '_id': 'JiA7-H4BmN6FcmA2myXI',
    '_score': 5.772064,
    '_ignored': ['text.keyword'],
    '_source': {'section_title': 'Zoonotic viruses',
     'text': 'Disease X,Candidates,Zoonotic viruses\nOn the addition of Disease X in 2018, the WHO said it could come from many sources citing haemorrhagic fevers and the more recent non-polio enterovirus. However, Røttingen speculated that Disease X would be more likely come from zoonotic transmission (an animal virus that jumps to humans), saying: "It\'s a natural process and it is vital that we are aware and prepare. It is probably the greatest risk". WHO special advisor Professor Marion Koopmans, also noted that the rate at which zoonotic diseases were appearing was accelerating, saying: "The intensity of an

In [19]:
# print the three top results
term_responses = []
pprint(top_three_results(term_docs, responses=term_responses))

[{'article_title': 'Disease X',
  'score': 5.772064,
  'section_title': 'Zoonotic viruses',
  'text': 'Disease X,Candidates,Zoonotic viruses\n'
          'On the addition of Disease X in 2018, the WHO said it could come '
          'from many sources citing haemorrhagic fevers and the more recent '
          'non-polio enterovirus. However, Røttingen speculated that Disease X '
          'would be more likely come from zoonotic transmission (an animal '
          'virus that jumps to humans), saying: "It\'s a natural process and '
          'it is vital that we are aware and prepare. It is probably the '
          'greatest risk". WHO special advisor Professor Marion Koopmans, also '
          'noted that the rate at which zoonotic diseases were appearing was '
          'accelerating, saying: "The intensity of animal and human contact is '
          'becoming much greater as the world develops. This makes it more '
          'likely new diseases will emerge but also modern travel and 

#### **Boolean query**

In [20]:
# exclude sections from the search results
bool_question = "world health organization"
exclude_sections = "External links"
bool_query_body = {
  "query": {
    "bool": {
        "should": 
        { "match": {"text": bool_question}},
        "must_not": {
            "term" : { "section_title.keyword" : exclude_sections }
      },
    }
  }
}

In [21]:
# submit the search query to elasticsearch
bool_docs = client.search(body = bool_query_body, index="pandemics", size=5)

print(type(bool_docs))
bool_docs

<class 'dict'>


  bool_docs = client.search(body = bool_query_body, index="pandemics", size=5)


{'took': 3,
 'timed_out': False,
 '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0},
 'hits': {'total': {'value': 316, 'relation': 'eq'},
  'max_score': 7.7866364,
  'hits': [{'_index': 'pandemics',
    '_type': '_doc',
    '_id': 'LiA7-H4BmN6FcmA2myXJ',
    '_score': 7.7866364,
    '_ignored': ['text.keyword'],
    '_source': {'section_title': 'Summary',
     'text': 'Johns Hopkins Center for Health Security,Summary\nThe Johns Hopkins Center for Health Security (abbreviated CHS; previously the UPMC Center for Health Security, the Center for Biosecurity of UPMC, and the Johns Hopkins Center for Civilian Biodefense Strategies) is an independent, nonprofit organization of the Johns Hopkins Bloomberg School of Public Health, and part of the Environmental Health and Engineering department. It is concerned with the areas of health consequences from epidemics and disasters as well as averting biological weapons development, and implications of biosecurity for the bioeconomy

In [22]:
# print the three top results
bool_responses = []
pprint(top_three_results(bool_docs, responses=bool_responses))

[{'article_title': 'Johns Hopkins Center for Health Security',
  'score': 7.7866364,
  'section_title': 'Summary',
  'text': 'Johns Hopkins Center for Health Security,Summary\n'
          'The Johns Hopkins Center for Health Security (abbreviated CHS; '
          'previously the UPMC Center for Health Security, the Center for '
          'Biosecurity of UPMC, and the Johns Hopkins Center for Civilian '
          'Biodefense Strategies) is an independent, nonprofit organization of '
          'the Johns Hopkins Bloomberg School of Public Health, and part of '
          'the Environmental Health and Engineering department. It is '
          'concerned with the areas of health consequences from epidemics and '
          'disasters as well as averting biological weapons development, and '
          'implications of biosecurity for the bioeconomy. It is a think tank '
          'that does policy research and gives policy recommendations to the '
          'United States government as well a

#### **Field boosting**

In [23]:
# boost title fields by a factor of 3
boost_question = "covid-19 pandemic"
boost_query_body = {
    "query": {
        "multi_match": {
            "query": boost_question,
            "fields": [ "article_title^3", "section_title^3" ] 
        }
    }
}

In [24]:
# submit the search query to elasticsearch
boost_docs = client.search(body = boost_query_body, index="pandemics", size=5)

print(type(boost_docs))
boost_docs

<class 'dict'>


  boost_docs = client.search(body = boost_query_body, index="pandemics", size=5)


{'took': 5,
 'timed_out': False,
 '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0},
 'hits': {'total': {'value': 262, 'relation': 'eq'},
  'max_score': 27.882988,
  'hits': [{'_index': 'pandemics',
    '_type': '_doc',
    '_id': 'gyA7-H4BmN6FcmA2myTF',
    '_score': 27.882988,
    '_ignored': ['text.keyword'],
    '_source': {'section_title': 'COVID-19',
     'text': 'Pandemic,Current pandemics,COVID-19\nA new strain of coronavirus was first identified in the city of Wuhan, Hubei province, China, in late December 2019. It has caused a cluster of cases of an acute respiratory disease, which is referred to as coronavirus disease 2019 (COVID-19). According to media reports, more than 200 countries and territories have been affected by COVID-19, with major outbreaks occurring in Brazil, Russia, India, Mexico, Peru, South Africa, Western Europe and the United States. On 11 March 2020, the World Health Organization characterized the spread of COVID-19 as a pandemic. As of

In [25]:
# print the three top results
boost_responses = []
pprint(top_three_results(boost_docs, responses=boost_responses))

[{'article_title': 'Pandemic',
  'score': 27.882988,
  'section_title': 'COVID-19',
  'text': 'Pandemic,Current pandemics,COVID-19\n'
          'A new strain of coronavirus was first identified in the city of '
          'Wuhan, Hubei province, China, in late December 2019. It has caused '
          'a cluster of cases of an acute respiratory disease, which is '
          'referred to as coronavirus disease 2019 (COVID-19). According to '
          'media reports, more than 200 countries and territories have been '
          'affected by COVID-19, with major outbreaks occurring in Brazil, '
          'Russia, India, Mexico, Peru, South Africa, Western Europe and the '
          'United States. On 11 March 2020, the World Health Organization '
          'characterized the spread of COVID-19 as a pandemic. As of 16 '
          'November 2020, the number of people infected with COVID-19 has '
          'reached 54,978,057 worldwide, of whom 38,243,617 have recovered. '
          'The deat

#### **Highlights**

In [26]:
# create a query body and a search string (...)
hl_question = "genetic mutations"
hl_query_body = {
    "query": {
        "match": { "text": hl_question }
    },
    "highlight": {
        "fields": {"text": {"number_of_fragments" : 1, 'fragment_size':256}
                  }
    }
}

In [27]:
# submit the search query to elasticsearch
hl_docs = client.search(body = hl_query_body, index="pandemics", size=5)

print(type(hl_docs))
hl_docs

  hl_docs = client.search(body = hl_query_body, index="pandemics", size=5)


<class 'dict'>


{'took': 7,
 'timed_out': False,
 '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0},
 'hits': {'total': {'value': 56, 'relation': 'eq'},
  'max_score': 8.6320095,
  'hits': [{'_index': 'pandemics',
    '_type': '_doc',
    '_id': '8CA7-H4BmN6FcmA2myXL',
    '_score': 8.6320095,
    '_ignored': ['text.keyword'],
    '_source': {'section_title': 'Genetic mutation',
     'text': 'Virus,Microbiology,Genetic mutation\nViruses undergo genetic change by several mechanisms. These include a process called antigenic drift where individual bases in the DNA or RNA mutate to other bases. Most of these point mutations are "silent"—they do not change the protein that the gene encodes—but others can confer evolutionary advantages such as resistance to antiviral drugs. Antigenic shift occurs when there is a major change in the genome of the virus. This can be a result of recombination or reassortment. When this happens with influenza viruses, pandemics might result. RNA viruses often 

In [28]:
# print the three top results
hl_responses = []
pprint(top_three_results(hl_docs, responses=hl_responses))

[{'article_title': 'Virus',
  'score': 8.6320095,
  'section_title': 'Genetic mutation',
  'text': 'Virus,Microbiology,Genetic mutation\n'
          'Viruses undergo genetic change by several mechanisms. These include '
          'a process called antigenic drift where individual bases in the DNA '
          'or RNA mutate to other bases. Most of these point mutations are '
          '"silent"—they do not change the protein that the gene encodes—but '
          'others can confer evolutionary advantages such as resistance to '
          'antiviral drugs. Antigenic shift occurs when there is a major '
          'change in the genome of the virus. This can be a result of '
          'recombination or reassortment. When this happens with influenza '
          'viruses, pandemics might result. RNA viruses often exist as '
          'quasispecies or swarms of viruses of the same species but with '
          'slightly different genome nucleoside sequences. Such quasispecies '
          'are 

### **Reranking with BERT**

**Boolean query**

In [29]:
# using a boolean query to exclude irrelevant sections
def boolean_query(exclude_sections, query, index=""):

    bool_query_body = {
      "query": {
        "bool": {
            "should": 
            { "match": {"text": query}},
            "must_not": {
                "terms": {"section_title.keyword": exclude_sections}
          },
        }
      }
    }

    # submit a search query to ElasticSearch
    bool_docs = client.search(body = bool_query_body, index=index, size=50)
    
    # reshape search results to prepare them for sentence embeddings
    texts = []
    section_titles = []
    article_titles = []
    score = []
    
    for doc in bool_docs["hits"]["hits"]:
        texts.append(doc["_source"]["text"])
        section_titles.append(doc["_source"]["section_title"])
        article_titles.append(doc["_source"]["article_title"])
        score.append(doc["_score"])
    
    return texts, article_titles, section_titles, score

In [30]:
query = "what is desease X?"
exclude_sections = ["See also", "Further reading", "Data and graphs", "Medical journals", "External links"]

results = boolean_query(exclude_sections=exclude_sections,
                       index="pandemics",
                       query=query)

  bool_docs = client.search(body = bool_query_body, index=index, size=50)


In [31]:
# print the titles (in descending order of elasticsearch's tfidf_scores )
print(results[1])

['Disease X', 'Disease X', 'Disease X', 'Disease X', 'Disease X', 'Disease X', 'Disease X', 'Disease X', 'Disease X', 'Disease X', 'Disease X', 'Disease X', 'Johns Hopkins Center for Health Security', 'Johns Hopkins Center for Health Security', 'Disease X', 'Disease X', 'Disease X', 'Disease X', 'Disease X', 'Disease X', 'Swine influenza', 'Swine influenza', 'Pandemic severity index', 'Pandemic severity index', 'Virus', 'Virus', 'Targeted immunization strategies', 'Targeted immunization strategies', 'Cholera', 'Cholera', 'Plague of Cyprian', 'Plague of Cyprian', 'Pandemic severity index', 'Pandemic severity index', 'Cholera', 'Cholera', 'Cholera', 'Cholera', 'Johns Hopkins Center for Health Security', 'Johns Hopkins Center for Health Security', 'Crimson Contagion', 'Crimson Contagion', 'HIV/AIDS', 'HIV/AIDS', 'HIV/AIDS', 'HIV/AIDS', 'Superspreading event', 'Superspreading event', 'Pandemic', 'Pandemic']


**Sentence embeddings**

In [32]:
'''
elasticsearch uses tfidf_scores to rank the search results. This causes
documents to be prioritized by the frequency of keywords in the documents 
normalized by the total number of terms. Hence, the results tend to be shorter 
documents rich in terms often receiving a higher relevance score compared to 
more descriptive, longer, and more information-rich documents. To improve the 
quality sentence embeddings for the results returned by elasticsearch
sentence-transformers BERT models finetuned for semantic text search will be
computed
'''

def compute_embeddings(query, results, model, top_k=10):
    texts = results[0]
    article_titles = results[1]
    section_titles = results[2]
    
    emb = SentenceTransformer(model)
    corpus_emb = emb.encode(texts, convert_to_tensor=True)
    query_emb = emb.encode(query, convert_to_tensor=True)
    # returns a list of dictionaries with the keys 'corpus_id' and 'score',
    # sorted by decreasing cosine similarity scores
    reranked_results = util.semantic_search(query_emb, corpus_emb, top_k=top_k)[0]
    
    rr_results_list = []
    for item in reranked_results:
        idx = item['corpus_id']
        rr_results_dict = {
            'bert_score': item['score'],
            'article_title': article_titles[idx],
            'section_title': section_titles[idx],
            'text': texts[idx]
        }
        rr_results_list.append(rr_results_dict)
    
    return rr_results_list

In [33]:
# using the distilled BERT model
reranked = compute_embeddings(query, results, model='distilbert-base-nli-stsb-mean-tokens')
pprint(reranked)

[{'article_title': 'Disease X',
  'bert_score': 0.29652318358421326,
  'section_title': 'Summary',
  'text': 'Disease X,Summary\n'
          'Disease X is a placeholder name that was adopted by the World '
          'Health Organization (WHO) in February 2018 on their shortlist of '
          'blueprint priority diseases to represent a hypothetical, unknown '
          'pathogen that could cause a future epidemic. The WHO adopted the '
          'placeholder name to ensure that their planning was sufficiently '
          'flexible to adapt to an unknown pathogen (e.g. broader vaccines and '
          'manufacturing facilities).  Director of the US National Institute '
          'of Allergy and Infectious Diseases Anthony Fauci stated that the '
          'concept of Disease X would encourage WHO projects to focus their '
          'research efforts on entire classes of viruses (e.g. flaviviruses), '
          'instead of just individual strains (e.g. zika virus), thus '
          'impr

## **Custom Search API using ElasticSearch**

Collect articles from Wikipedia on your topic of choice, create a new Elasticsearch index and add the articles to it.

### **Getting the data**

In [34]:
# initialize Wikipedia object
wiki_data = wikipediaapi.Wikipedia('en')

In [35]:
# to get all pages from a given category (here: artificial intelligence), use property categorymembers
# returns all members of given category

articles = []
def print_categorymembers(categorymembers, level=0, max_level=0):
        for c in categorymembers.values():
            if "Category" not in c.title:
            # print title and namespace
              print("%s: %s (ns: %d)" % ("*" * (level + 1), c.title, c.ns))
              articles.append(c.title)
            if c.ns == wikipediaapi.Namespace.CATEGORY and level < max_level:
              print_categorymembers(c.categorymembers, level=level + 1, max_level=max_level)


cat = wiki_data.page("Category:Artificial intelligence")
print("Category members: Category:Artificial Intelligence")
print_categorymembers(cat.categorymembers)


Category members: Category:Artificial Intelligence
*: Artificial intelligence (ns: 0)
*: Outline of artificial intelligence (ns: 0)
*: Template:Artificial intelligence (ns: 10)
*: List of artificial intelligence projects (ns: 0)
*: List of programming languages for artificial intelligence (ns: 0)
*: 0music (ns: 0)
*: 3D reconstruction from multiple images (ns: 0)
*: 20Q (ns: 0)
*: Abacus.AI (ns: 0)
*: Ablation (artificial intelligence) (ns: 0)
*: ACROSS Project (ns: 0)
*: Action selection (ns: 0)
*: Adaptive autonomy (ns: 0)
*: Admissible heuristic (ns: 0)
*: ADS-AC (ns: 0)
*: Agent systems reference model (ns: 0)
*: AgentSheets (ns: 0)
*: AGI (computer science) (ns: 0)
*: A.I. Artificial Intelligence (ns: 0)
*: Artificial Intelligence Cold War (ns: 0)
*: AI Companies of India (ns: 0)
*: AI Dungeon (ns: 0)
*: AI Song Contest (ns: 0)
*: AI Song Contest 2020 (ns: 0)
*: AI Song Contest 2021 (ns: 0)
*: AI Superpowers (ns: 0)
*: AI-complete (ns: 0)
*: AirSim (ns: 0)
*: Alesis Artificial Int

In [59]:
# deconstruct articles into sections
def deconstruct_article(page):
  section_list = []
  section_list = [{'level': 0,
                   'section_title': 'Summary',
                   'text': page.summary}]
  
  def get_sections(sections, level=0):
    for s in sections:
      section_dict = {'level': level,
                      'section_title': s.title,
                      'text': s.text}
      section_list.append(section_dict)
      get_sections(s.sections, level + 1)
  
  get_sections(page.sections)
  return section_list

In [60]:
# build documents to populate ElasticSearch
def build_documents(page, section_list):
    
    # transform list of dictionaries to dataframe
    df = pd.DataFrame(section_list)

    # create column "main_section"
    df['main_section'] = np.nan
    df.loc[df['level']==0, 'main_section'] = df['section_title']
    df['main_section'].fillna(method='ffill', inplace=True)
    
    # create column "subsection"
    df['subsection'] = np.nan
    df.loc[df['text']=='', 'subsection'] = df['section_title']
    df['subsection'].fillna(method='ffill', inplace=True)

    # add wikipedia article title, source url and page id
    df1 = df.replace(np.nan, '', regex=True)
    df1['article_title'] = page.title
    df1['source_url'] = page.fullurl
    df1['page_id'] = page.pageid
    
    # create a list of section tags
    df1['tags']=df1.apply(lambda row: [row['article_title'],
                                       row['main_section'], 
                                       row['subsection'], 
                                       row['section_title']],
                          axis=1)
    df1['tags']=df1['tags'].apply(lambda cell: [s for s in cell if s!=""])
    df1['tags']=df1['tags'].apply(lambda cell: list(dict.fromkeys(cell)))
    
    # drop rows with NaN values (empty sections)
    df2 = df1.replace('', np.nan, regex=True)
    df2 = df2.drop(['level', 'subsection'], axis=1).dropna()
    
    # transform a list of tags to a comma separated string
    df2['tags'] = df2.apply(lambda row: ', '.join(row['tags']), axis=1)

    # add number of section withing the article
    df2['section_number'] = df2.index
    
    return df2

In [61]:
article_df_list = []

for article in articles:
    try:
        page = wiki_data.page(article)
        sections = deconstruct_article(page)
        document_df = build_documents(page, sections)
        article_df_list.append(document_df)
    
    except Exception as error:
        print(f"Something went wrong when loading {page.pageid} article ->", error)
        pass

Something went wrong when loading 2526563 article -> Columns must be same length as key


In [62]:
df = pd.concat(article_df_list)
df.shape

(3259, 8)

In [63]:
df.head()

Unnamed: 0,section_title,text,main_section,article_title,source_url,page_id,tags,section_number
0,Summary,Artificial intelligence (AI) is intelligence d...,Summary,Artificial intelligence,https://en.wikipedia.org/wiki/Artificial_intel...,1164,"Artificial intelligence, Summary",0
1,History,Artificial beings with intelligence appeared a...,History,Artificial intelligence,https://en.wikipedia.org/wiki/Artificial_intel...,1164,"Artificial intelligence, History",1
2,Goals,The general problem of simulating (or creating...,Goals,Artificial intelligence,https://en.wikipedia.org/wiki/Artificial_intel...,1164,"Artificial intelligence, Goals",2
3,"Reasoning, problem solving",Early researchers developed algorithms that im...,Goals,Artificial intelligence,https://en.wikipedia.org/wiki/Artificial_intel...,1164,"Artificial intelligence, Goals, Reasoning, pro...",3
4,Knowledge representation,Knowledge representation and knowledge enginee...,Goals,Artificial intelligence,https://en.wikipedia.org/wiki/Artificial_intel...,1164,"Artificial intelligence, Goals, Knowledge repr...",4


In [64]:
data = df.to_dict(orient="records")
data[0]

{'section_title': 'Summary',
 'text': 'Artificial intelligence (AI) is intelligence demonstrated by machines, as opposed to natural intelligence displayed by animals including humans.\nLeading AI textbooks define the field as the study of "intelligent agents": any system that perceives its environment and takes actions that maximize its chance of achieving its goals.\nSome popular accounts use the term "artificial intelligence" to describe machines that mimic "cognitive" functions that humans associate with the human mind, such as "learning" and "problem solving", however, this definition is rejected by major AI researchers.AI applications include advanced web search engines (e.g., Google), recommendation systems (used by YouTube, Amazon and Netflix), understanding human speech (such as Siri and Alexa), self-driving cars (e.g., Tesla), automated decision-making and competing at the highest level in strategic game systems (such as chess and Go).\nAs machines become increasingly capable,

In [65]:
# save the list of dictionaries in JSON
with open(DATA_DIR + "data.json", "w") as outfile:
    json.dump(data, outfile)

In [66]:
# load the json file
with open(DATA_DIR + "data.json", "r") as outfile:
    data = json.load(outfile)

In [67]:
# print sample sentences
g = (d for d in data)
[next(g) for i in range(2)]

[{'section_title': 'Summary',
  'text': 'Artificial intelligence (AI) is intelligence demonstrated by machines, as opposed to natural intelligence displayed by animals including humans.\nLeading AI textbooks define the field as the study of "intelligent agents": any system that perceives its environment and takes actions that maximize its chance of achieving its goals.\nSome popular accounts use the term "artificial intelligence" to describe machines that mimic "cognitive" functions that humans associate with the human mind, such as "learning" and "problem solving", however, this definition is rejected by major AI researchers.AI applications include advanced web search engines (e.g., Google), recommendation systems (used by YouTube, Amazon and Netflix), understanding human speech (such as Siri and Alexa), self-driving cars (e.g., Tesla), automated decision-making and competing at the highest level in strategic game systems (such as chess and Go).\nAs machines become increasingly capabl

In [68]:
print(len(data))

3259


#### **Create Elasticsearch index**

In [69]:
# check if the elasticsearch container is running
!curl http://localhost:9200/

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
{
  "name" : "3653a1693ee0",
  "cluster_name" : "docker-cluster",
  "cluster_uuid" : "HNElKsF8Rv-AHXKtyJm7aw",
  "version" : {
    "number" : "7.17.0",
    "build_flavor" : "default",
    "build_type" : "docker",
    "build_hash" : "bee86328705acaa9a6daede7140defd4d9ec56bd",
    "build_date" : "2022-01-28T08:36:04.875279988Z",
    "build_snapshot" : false,
    "lucene_version" : "8.11.1",
    "minimum_wire_compatibility_version" : "6.8.0",
    "minimum_index_compatibility_version" : "6.0.0-beta1"
  },
  "tagline" : "You Know, for Search"
}


In [70]:
# instantiate elasticsearch client
new_client = Elasticsearch()

In [71]:
# use helpers' bulk API to index list of elasticsearch docs
new_response = helpers.bulk(new_client, data, index='ai')



In [72]:
# count documents in all indices
new_client.cat.count("_all", params={"format": "json"})

  new_client.cat.count("_all", params={"format": "json"})


[{'epoch': '1644843135', 'timestamp': '12:52:15', 'count': '4462'}]

### **Reranking with BERT**

**Match query**

In [73]:
# create a query body and search a string (fuzzy query)
query = "What is the BERT model?"
match_query_body = {"query": {"match": {"text": query}}}

In [74]:
# submit the search query to elasticsearch
docs = new_client.search(body = match_query_body, index="ai", size=50)

print(type(docs))
docs

<class 'dict'>


  docs = new_client.search(body = match_query_body, index="ai", size=50)


{'took': 13,
 'timed_out': False,
 '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0},
 'hits': {'total': {'value': 2681, 'relation': 'eq'},
  'max_score': 14.324772,
  'hits': [{'_index': 'ai',
    '_type': '_doc',
    '_id': 'PSBE-H4BmN6FcmA2viwe',
    '_score': 14.324772,
    '_ignored': ['text.keyword'],
    '_source': {'section_title': 'Architecture',
     'text': 'BERT is at its core a transformer language model with a variable number of encoder layers and self-attention heads. The architecture is "almost identical" to the original transformer implementation in Vaswani et al. (2017).BERT was pretrained on two tasks: language modelling (15% of tokens were masked and BERT was trained to predict them from context) and next sentence prediction (BERT was trained to predict if a chosen next sentence was probable or not given the first sentence). As a result of the training process, BERT learns contextual embeddings for words. After pretraining, which is computationally

In [75]:
# reshape search results to prepare them for sentence embeddings
texts = []
section_titles = []
article_titles = []
score = []

for doc in docs["hits"]["hits"]:
    texts.append(doc["_source"]["text"])
    section_titles.append(doc["_source"]["section_title"])
    article_titles.append(doc["_source"]["article_title"])
    score.append(doc["_score"])

**Sentence embeddings**

In [76]:
'''
elasticsearch uses tfidf_scores to rank the search results. This causes
documents to be prioritized by the frequency of keywords in the documents 
normalized by the total number of terms. Hence, the results tend to be shorter 
documents rich in terms often receiving a higher relevance score compared to 
more descriptive, longer, and more information-rich documents. To improve the 
quality sentence embeddings for the results returned by elasticsearch
sentence-transformers BERT models finetuned for semantic text search will be
computed
'''

def compute_embeddings(query, texts, article_titles, section_titles, model, top_k=10):
    
    emb = SentenceTransformer(model)
    corpus_emb = emb.encode(texts, convert_to_tensor=True)
    query_emb = emb.encode(query, convert_to_tensor=True)
    # returns a list of dictionaries with the keys 'corpus_id' and 'score',
    # sorted by decreasing cosine similarity scores
    reranked_results = util.semantic_search(query_emb, corpus_emb, top_k=top_k)[0]
    
    rr_results_list = []
    for item in reranked_results:
        idx = item['corpus_id']
        rr_results_dict = {
            'bert_score': item['score'],
            'article_title': article_titles[idx],
            'section_title': section_titles[idx],
            'text': texts[idx]
        }
        rr_results_list.append(rr_results_dict)
    
    return rr_results_list

In [77]:
# using the distilled BERT model
reranked = compute_embeddings(query, texts, article_titles, section_titles, model='distilbert-base-nli-stsb-mean-tokens')
pprint(reranked)

[{'article_title': 'BERT (language model)',
  'bert_score': 0.6596279144287109,
  'section_title': 'Further reading',
  'text': 'Rogers, Anna; Kovaleva, Olga; Rumshisky, Anna (2020). "A Primer in '
          'BERTology: What we know about how BERT works". arXiv:2002.12327 '
          '[cs.CL].'},
 {'article_title': 'BERT (language model)',
  'bert_score': 0.49651145935058594,
  'section_title': 'Summary',
  'text': 'Bidirectional Encoder Representations from Transformers (BERT) is a '
          'transformer-based machine learning technique for natural language '
          'processing (NLP) pre-training developed by Google. BERT was created '
          'and published in 2018 by Jacob Devlin and his colleagues from '
          'Google. In 2019, Google announced that it had begun leveraging BERT '
          'in its search engine, and by late 2020 it was using BERT in almost '
          'every English-language query.  A 2020 literature survey concluded '
          'that "in a little over a