# **Milestone 2:**
Semantic Search with ML and BERT


In [1]:
!pip install faiss-cpu
!pip install transformers
!pip install -U sentence-transformers

Collecting faiss-cpu
  Downloading faiss_cpu-1.7.1.post3-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (8.5 MB)
[K     |████████████████████████████████| 8.5 MB 5.6 MB/s 
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.7.1.post3
Collecting transformers
  Downloading transformers-4.15.0-py3-none-any.whl (3.4 MB)
[K     |████████████████████████████████| 3.4 MB 4.6 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 42.4 MB/s 
[?25hCollecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 44.4 MB/s 
[?25hCollecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.2.1-py3-none-any.whl (61 kB)
[K     |█████████

### **Setting up the environment**

In [1]:
# Mount Google Drive into the Colab runtime (always re-mounting) so that the
# data folder under MyDrive can be read by the cells below.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT, force_remount=True)

Mounted at /content/drive


### **Importing the required modules**

In [2]:
# import libraries
import json
import torch
import numpy as np
import faiss
from transformers import AutoModel, AutoTokenizer, pipeline
from sentence_transformers import SentenceTransformer, util
from pprint import pprint

### **BERT**

#### **Getting the data**

In [4]:
# Folder on the mounted Drive that holds the JSON datasets.
# The trailing slash is required: file names are appended to it with plain `+`.
DATA_DIR = '/content/drive/MyDrive/SearchToolwNLP/02_Implement Semantic Search with ML and BERT/data/'

In [5]:
# load the json file
# The encoding is explicit because the text contains non-ASCII characters and
# the platform default encoding is not guaranteed to be UTF-8.
with open(DATA_DIR + 'data_01.json', 'r', encoding='utf-8') as infile:
    sentences = json.load(infile)

In [6]:
# print sample sentences: the first two entries (fewer if the list is shorter)
sentences[:2]

['A pandemic is an epidemic of an infectious disease that has spread across a large region, for instance multiple continents or worldwide, affecting a substantial number of people.',
 'The most fatal pandemic in recorded history was the Black Death (also known as The Plague), which killed an estimated 75–200 million people in the 14th century.']

In [7]:
# report how many sentences were loaded
sentence_count = len(sentences)
print(sentence_count)

11


#### **Vectorizing the dataset**

In [8]:
# load the BERT tokenizer and model (both from the same pretrained checkpoint)
BERT_CHECKPOINT = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(BERT_CHECKPOINT)
model = AutoModel.from_pretrained(BERT_CHECKPOINT)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_transform.bias', 'vocab_projector.weight', 'vocab_projector.bias', 'vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [9]:
# function that vectorizes the text
def encode(doc):
  """Embed one text as the mean of its token vectors.

  Args:
    doc: text to embed; it is passed to the module-level `tokenizer`.

  Returns:
    A 1-D torch.Tensor: the first output of `model`, averaged over the
    token axis. The tensor carries no gradient information.
  """
  # truncation=True clips over-long inputs to the tokenizer's maximum length
  # instead of letting the model fail on them.
  tokens = tokenizer(doc, return_tensors='pt', truncation=True)
  # Inference only: do not build an autograd graph for the forward pass.
  with torch.no_grad():
    hidden = model(**tokens)[0]
  # Drop only the batch axis. A bare squeeze() would also drop the token axis
  # for a one-token input, and the mean below would then collapse to a scalar.
  return torch.mean(hidden.detach().squeeze(0), dim=0)

In [10]:
# vectorize the documents
%%capture
b_emb = [encode(d) for d in sentences]

In [11]:
# peek at the first 50 components of the first sentence embedding
first_sentence_vector = b_emb[0]
print(first_sentence_vector[:50])

tensor([ 0.0486,  0.0974, -0.0493, -0.2006,  0.2463, -0.2616,  0.2512,  0.9330,
        -0.1771, -0.0981, -0.3161, -0.8144, -0.4257,  0.9705,  0.0080,  0.5728,
         0.3163,  0.1127, -0.5386,  0.5193, -0.1393, -0.1869, -0.0203,  0.6890,
         0.0840, -0.3651, -0.0831,  0.1566, -0.0449, -0.3126,  0.6875,  0.0942,
        -0.1704, -0.1667, -0.0444,  0.2498, -0.0677, -0.2164, -0.2832,  0.0593,
        -0.7061, -0.4712, -0.0615,  0.2580,  0.0127, -0.2020,  0.1321,  0.4225,
        -0.0172,  0.2383])


#### **Building a faiss index**

In [12]:
# create a flat faiss index for the BERT model
# NOTE: IndexFlatIP ranks by raw inner product and the vectors are not
# normalised here, so the returned scores are not cosine similarities.
b_matrix = np.stack([e.numpy() for e in b_emb]).astype('float32')  # one float32 row per sentence
b_ids = np.arange(len(sentences), dtype='int64')  # IDs 0..len(sentences)-1, int64 on every platform
index_b = faiss.IndexIDMap(faiss.IndexFlatIP(b_matrix.shape[1]))  # dimension taken from the data
# add the vectors into the index
index_b.add_with_ids(b_matrix, b_ids)

#### **Searching the index**

In [13]:
# function to search faiss
def search_b(query, k=5):
  """Return the sentences closest to `query` according to `index_b`.

  Args:
    query: text to search for; it is embedded with `encode`.
    k: maximum number of hits to return.

  Returns:
    A list of (sentence, score) tuples in the order returned by the index.
    Fewer than `k` tuples come back when the index returns padding entries.
  """
  query_encoded = encode(query).unsqueeze(dim=0).numpy()
  scores, ids = index_b.search(query_encoded, k)
  # faiss pads missing hits with id -1; skip them, otherwise sentences[-1]
  # would silently be reported as a result.
  return [(sentences[_id], score) for _id, score in zip(ids[0], scores[0]) if _id >= 0]

In [14]:
# test a query
# run a sample question through the BERT index and pretty-print the top 5 hits
bert_test_query = "how many people died from black death?"
pprint(search_b(bert_test_query, k=5))

[('As of 2018, approximately 37.9 million people are infected with HIV '
  'globally.',
  53.436695),
 ('A pandemic is an epidemic of an infectious disease that has spread across a '
  'large region, for instance multiple continents or worldwide, affecting a '
  'substantial number of people.',
  51.632618),
 ('The death toll of Spanish Flu is estimated to have been somewhere between '
  '17 million and 50 million, and possibly as high as 100 million, making it '
  'one of the deadliest pandemics in human history.',
  51.374493),
 ('The most fatal pandemic in recorded history was the Black Death (also known '
  'as The Plague), which killed an estimated 75–200 million people in the 14th '
  'century.',
  50.724808),
 ('The Spanish flu, also known as the 1918 flu pandemic, was an unusually '
  'deadly influenza pandemic caused by the H1N1 influenza A virus.',
  48.253498)]


### **Sentence BERT**

#### **Getting the data**

In [15]:
# Data folder for the Sentence-BERT section (same value as the DATA_DIR assigned in the BERT section).
# The trailing slash is required: file names are appended to it with plain `+`.
DATA_DIR = '/content/drive/MyDrive/SearchToolwNLP/02_Implement Semantic Search with ML and BERT/data/'

In [16]:
# load the json file
# The encoding is explicit because the text contains non-ASCII characters and
# the platform default encoding is not guaranteed to be UTF-8.
with open(DATA_DIR + 'data_02.json', 'r', encoding='utf-8') as infile:
    documents = json.load(infile)

In [17]:
# print sample documents: the first two entries (fewer if the list is shorter)
documents[:2]

[{'text': 'A pandemic (from Greek πᾶν, pan, "all" and δῆμος, demos, "people") is an epidemic of an infectious disease that has spread across a large region, for instance multiple continents or worldwide, affecting a substantial number of people. A widespread endemic disease with a stable number of infected people is not a pandemic. Widespread endemic diseases with a stable number of infected people such as recurrences of seasonal influenza are generally excluded as they occur simultaneously in large regions of the globe rather than being spread worldwide.\nThroughout human history, there have been a number of pandemics of diseases such as smallpox and tuberculosis. The most fatal pandemic in recorded history was the Black Death (also known as The Plague), which killed an estimated 75–200 million people in the 14th century. The term was not used yet but was for later pandemics including the 1918 influenza pandemic (Spanish flu). Current pandemics include COVID-19 (SARS-CoV-2) and HIV/AI

In [18]:
# report how many documents were loaded
document_count = len(documents)
print(document_count)

26


#### **Vectorizing the dataset**

In [19]:
# load the SBERT model
# 'distilbert-base-nli-stsb-mean-tokens' does well on semantic textual similarity tasks and,
# being considerably smaller than BERT, is also noticeably faster
SBERT_MODEL_NAME = "distilbert-base-nli-stsb-mean-tokens"
sb_model = SentenceTransformer(SBERT_MODEL_NAME)

In [20]:
# vectorize the documents
%%capture
sb_emb = sb_model.encode([d['text'] for d in documents], convert_to_tensor=True)

In [21]:
# peek at the first 50 components of the first document embedding
first_document_vector = sb_emb[0]
print(first_document_vector[:50])

tensor([ 0.2995, -0.7021, -0.4336, -0.0191, -0.2439,  0.5708,  0.0871, -0.3716,
        -0.1410, -1.0819,  0.1625, -0.3774, -1.0932,  0.3509,  0.5480,  0.0126,
         0.7217,  0.2254, -0.4995,  0.1995, -0.4766,  0.1005,  0.2127,  0.2675,
        -1.4378, -0.0797,  0.2499, -0.0476,  1.0231, -0.5252,  0.4197,  0.3438,
        -0.1554, -0.9829, -0.4632, -0.1334, -0.1556,  0.7530,  0.1263,  0.1310,
         0.0292, -0.6407, -0.0671, -0.3750,  0.1365,  0.4260, -0.4883,  0.6365,
        -0.2857,  0.2323])


#### **Building a faiss index**

In [22]:
# create a flat faiss index for the SBERT model
# Move the embeddings to host memory first: with convert_to_tensor=True they
# may live on a GPU, and .numpy() raises on CUDA tensors.
sb_matrix = sb_emb.detach().cpu().numpy().astype('float32')  # one float32 row per document
sb_ids = np.arange(len(documents), dtype='int64')  # IDs 0..len(documents)-1, int64 on every platform
index_sb = faiss.IndexIDMap(faiss.IndexFlatIP(sb_matrix.shape[1]))  # dimension taken from the data
# add the vectors into the index
index_sb.add_with_ids(sb_matrix, sb_ids)

In [23]:
# persist the SBERT index inside the data folder
index_path = DATA_DIR + "faiss.index"
faiss.write_index(index_sb, index_path)

#### **Searching the index**

In [24]:
# function to search faiss
def search_sb(query, k=5):
  """Return the documents closest to `query` according to `index_sb`.

  Args:
    query: text to search for; it is embedded with `sb_model`.
    k: maximum number of hits to return.

  Returns:
    A list of (document, score) tuples in the order returned by the index.
    Fewer than `k` tuples come back when the index returns padding entries.
  """
  query_encoded = sb_model.encode([query])
  scores, ids = index_sb.search(query_encoded, k)
  # faiss pads missing hits with id -1; skip them, otherwise documents[-1]
  # would silently be reported as a result.
  return [(documents[_id], score) for _id, score in zip(ids[0], scores[0]) if _id >= 0]

In [25]:
# test a query
# run a sample question through the SBERT index and pretty-print the top 5 hits
sbert_test_query = "how many people died from black death?"
pprint(search_sb(sbert_test_query, k=5))

[({'text': 'A pandemic (from Greek πᾶν, pan, "all" and δῆμος, demos, "people") '
           'is an epidemic of an infectious disease that has spread across a '
           'large region, for instance multiple continents or worldwide, '
           'affecting a substantial number of people. A widespread endemic '
           'disease with a stable number of infected people is not a pandemic. '
           'Widespread endemic diseases with a stable number of infected '
           'people such as recurrences of seasonal influenza are generally '
           'excluded as they occur simultaneously in large regions of the '
           'globe rather than being spread worldwide.\n'
           'Throughout human history, there have been a number of pandemics of '
           'diseases such as smallpox and tuberculosis. The most fatal '
           'pandemic in recorded history was the Black Death (also known as '
           'The Plague), which killed an estimated 75–200 million people in '
           '

### **BERT for Q&A**

#### **Loading the index and the models**

In [26]:
# read the previously saved faiss index back from the data folder
saved_index_path = DATA_DIR + "faiss.index"
index = faiss.read_index(saved_index_path)

In [27]:
# load the SBERT model (used here to embed the questions)
# 'distilbert-base-nli-stsb-mean-tokens' does well on semantic textual similarity tasks and,
# being considerably smaller than BERT, is also noticeably faster
SBERT_MODEL_NAME = "distilbert-base-nli-stsb-mean-tokens"
sb_model = SentenceTransformer(SBERT_MODEL_NAME)

In [28]:
# load a question answering pipeline from HuggingFace
# The checkpoint is named explicitly (it is the one the library reported falling
# back to when none was given) so the answers do not change if the library's
# default model changes.
nlp = pipeline('question-answering', model='distilbert-base-cased-distilled-squad')

No model was supplied, defaulted to distilbert-base-cased-distilled-squad (https://huggingface.co/distilbert-base-cased-distilled-squad)


#### **Searching the index**

In [29]:
# function to search faiss with q&a functionality included
def find_answer(query, k=5):
  """Retrieve documents for `query` and extract an answer from each of them.

  Args:
    query: question text; it is embedded with `sb_model` for retrieval and
      passed to the `nlp` pipeline as the question.
    k: maximum number of documents to retrieve from `index`.

  Returns:
    A list of [answer, score] pairs sorted by descending pipeline score,
    one pair per retrieved document.
  """
  query_encoded = sb_model.encode([query])
  _, ids = index.search(query_encoded, k)
  # faiss pads missing hits with id -1; skip them so that documents[-1] is not
  # queried by mistake.
  contexts = [documents[_id]['text'] for _id in ids[0] if _id >= 0]
  answers = [nlp(question=query, context=context) for context in contexts]
  answers.sort(key=lambda a: a['score'], reverse=True)
  return [[a['answer'], a['score']] for a in answers]

In [30]:
# test a query
# ask a sample question and pretty-print the ranked answers
qa_test_query = "How to prevent the spread of viral infections?"
pprint(find_answer(qa_test_query, k=5))

  return array(a, dtype, copy=False, order=order)


[['improved sanitation and access to clean water', 0.5752676725387573],
 ['by giving both the mother and child antiretroviral medication',
  0.41623830795288086],
 ['measures to reduce causes of new infectious diseases', 0.27838316559791565],
 ['Tracking viral load is used to monitor therapy', 0.15053565800189972],
 ['administration of vaccines', 0.11565064638853073]]
