In [1]:
#!pip install faiss-cpu
#!pip install transformers
#!pip install sentence-transformers

import json
import torch
import numpy as np
import faiss
import sentence_transformers
from sentence_transformers import SentenceTransformer
import pandas as pd

In [2]:
# Load the article records into a DataFrame (the rendered output shows
# title / text / url columns).
# JSON files are UTF-8 by specification; pass the encoding explicitly so the
# non-ASCII article text decodes identically regardless of the platform's
# default locale encoding.
with open('/content/data.json', encoding='utf-8') as f:
  data = pd.DataFrame(json.load(f))
# Pretrained sentence-embedding model, loaded by name.
model = SentenceTransformer('multi-qa-MiniLM-L6-cos-v1')
data

Unnamed: 0,title,text,url
0,Pandemic,"A pandemic (from Greek πᾶν, pan, ""all"" and δῆμ...",https://en.wikipedia.org/wiki/Pandemic
1,Epidemiology of HIV/AIDS,"HIV/AIDS, or Human Immunodeficiency Virus, is ...",https://en.wikipedia.org/wiki/Epidemiology_of_...
2,Antonine Plague,"The Antonine Plague of 165 to 180 AD, also kno...",https://en.wikipedia.org/wiki/Antonine_Plague
3,Basic reproduction number,"In epidemiology, the basic reproduction number...",https://en.wikipedia.org/wiki/Basic_reproducti...
4,Bills of mortality,Bills of mortality were the weekly mortality s...,https://en.wikipedia.org/wiki/Bills_of_mortality
5,Cholera,Cholera is an infection of the small intestine...,https://en.wikipedia.org/wiki/Cholera
6,COVID-19 pandemic,"The COVID-19 pandemic, also known as the coron...",https://en.wikipedia.org/wiki/COVID-19_pandemic
7,Crimson Contagion,Crimson Contagion was a joint exercise conduct...,https://en.wikipedia.org/wiki/Crimson_Contagion
8,Disease X,Disease X is a placeholder name that was adopt...,https://en.wikipedia.org/wiki/Disease_X
9,Event 201,The Johns Hopkins Center for Health Security (...,https://en.wikipedia.org/wiki/Johns_Hopkins_Ce...


In [3]:
# Embed every article body in one call; the result holds one vector per row
# of `data`, in row order.
embeddings = model.encode(
  data["text"].tolist(),
  show_progress_bar=True,
)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

In [4]:
# Faiss works on C-contiguous float32 matrices. Convert in a single step
# instead of rebuilding the matrix row by row through a Python list.
embeddings = np.ascontiguousarray(embeddings, dtype="float32")
# Exact (brute-force) L2 index, wrapped in an ID map so every vector is stored
# under the index label of its row in `data` rather than an implicit position.
index = faiss.IndexIDMap(faiss.IndexFlatL2(embeddings.shape[1]))
# add_with_ids requires 64-bit integer ids; request that dtype explicitly.
index.add_with_ids(embeddings, data.index.to_numpy(dtype="int64"))
print(f"Number of vectors in the Faiss index: {index.ntotal}")

Number of vectors in the Faiss index: 26


In [5]:
# Sanity check: query the index with a vector that is already stored in it
# (row 12 of the embedding matrix). The slice keeps the 2-D (1, dim) shape
# that index.search expects.
D, I = index.search(embeddings[12:13], k=5)
print(f'L2 distance: {D.flatten().tolist()}\n\nMAG paper IDs: {I.flatten().tolist()}')
# The ids stored in the index are the index *labels* of `data`, so rows must
# be fetched by label (.loc); positional lookup (.iloc) is only correct while
# `data` keeps a default 0..n-1 index.
for i in I:
  print(data.loc[i])

L2 distance: [0.0, 0.7909256815910339, 1.016036868095398, 1.0702069997787476, 1.1270612478256226]

MAG paper IDs: [12, 0, 22, 13, 6]
                                     title  \
12                     Pandemic prevention   
0                                 Pandemic   
22        Targeted immunization strategies   
13  Pandemic Severity Assessment Framework   
6                        COVID-19 pandemic   

                                                 text  \
12  Pandemic prevention is the organization and ma...   
0   A pandemic (from Greek πᾶν, pan, "all" and δῆμ...   
22  Targeted immunization strategies are approache...   
13  The Pandemic Severity Assessment Framework (PS...   
6   The COVID-19 pandemic, also known as the coron...   

                                                  url  
12  https://en.wikipedia.org/wiki/Pandemic_prevention  
0              https://en.wikipedia.org/wiki/Pandemic  
22  https://en.wikipedia.org/wiki/Targeted_immuniz...  
13  https://en.wikipedi

In [6]:
def sentence_search(search_string, k=5):
  """Print the `k` indexed articles closest to a free-text query.

  The query is embedded with the module-level `model`, searched against the
  module-level Faiss `index`, and the matching rows of the module-level
  `data` frame are printed together with the raw distances and ids.

  Args:
    search_string: Query text to embed and search for.
    k: Number of nearest neighbours to request (defaults to 5, the value
      that was previously hard-coded).

  Returns:
    None. Results are written to stdout.
  """
  query = np.asarray(model.encode([search_string]), dtype="float32")
  D, I = index.search(query, k=k)
  # NOTE(review): the label says "MAG paper IDs", but the values printed are
  # the ids stored in the index (index labels of `data`) - consider renaming.
  print(f'L2 distance: {D.flatten().tolist()}\n\nMAG paper IDs: {I.flatten().tolist()}')
  for ids in I:
    # Faiss pads the id array with -1 when fewer than k neighbours exist;
    # drop the padding so it is not mistaken for a real row. The remaining
    # ids are index labels, hence .loc rather than positional .iloc.
    print(data.loc[ids[ids != -1]])

In [7]:
# Load the list of sample query strings.
# JSON files are UTF-8 by specification; pass the encoding explicitly so
# decoding does not depend on the platform's default locale encoding.
with open('/content/questions.json', encoding='utf-8') as f:
  questions = json.load(f)
questions

['How many people have died during Black Death?',
 'Which diseases can be transmitted by animals?',
 'Connection between climate change and a likelihood of a pandemic',
 'What is an example of a latent virus',
 'Viruses in nanotechnology',
 'Giant viruses classification',
 'What are the notable pandemic prevention organizations?',
 'How many leprosy outbreaks are known to happen?',
 'What are the geographic areas with the highest transmission of malaria?',
 'How to prevent the spread of viral infections?']

In [8]:
# Run each sample query through the search helper, echoing the query first
# and closing each result with a separator line.
for query_text in questions:
  print(query_text)
  sentence_search(search_string=query_text)
  print('-------------------------------------------')

How many people have died during Black Death?
L2 distance: [1.1505247354507446, 1.1768505573272705, 1.3450472354888916, 1.4118623733520508, 1.4184632301330566]

MAG paper IDs: [0, 2, 1, 19, 4]
                       title  \
0                   Pandemic   
2            Antonine Plague   
1   Epidemiology of HIV/AIDS   
19               Spanish flu   
4         Bills of mortality   

                                                 text  \
0   A pandemic (from Greek πᾶν, pan, "all" and δῆμ...   
2   The Antonine Plague of 165 to 180 AD, also kno...   
1   HIV/AIDS, or Human Immunodeficiency Virus, is ...   
19  The Spanish flu, also known as the 1918 flu pa...   
4   Bills of mortality were the weekly mortality s...   

                                                  url  
0              https://en.wikipedia.org/wiki/Pandemic  
2       https://en.wikipedia.org/wiki/Antonine_Plague  
1   https://en.wikipedia.org/wiki/Epidemiology_of_...  
19          https://en.wikipedia.org/wiki/Spani

In [9]:
# Serialize the Faiss index object to a file on disk.
faiss.write_index(index, '/content/Bert_Search_Engine_2-2.index')