# Load Dataset

In [None]:
!pip install datasets


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
from datasets import load_dataset
# Load the "train" split of the nreimers/trec-covid dataset.
dataset = load_dataset("nreimers/trec-covid", split="train")



In [None]:
# Show the dataset repr (feature names and row count) as the cell output.
dataset

Dataset({
    features: ['_id', 'title', 'text', 'metadata'],
    num_rows: 171332
})

In [None]:
# Maximum number of documents to collect for this demo.
fetch_total_docs = 1000
# Substrings (matched case-insensitively) that mark a paper as COVID-related.
covid_terms = ("covid", "corona", "sars-cov-2")

corpus = []
for row in dataset:
    # Skip rows whose title or body is too short to be a useful passage.
    if len(str(row["title"])) <= 20 or len(str(row["text"])) <= 100:
        continue

    text = row["title"] + " " + row["text"]
    lowered = text.lower()

    # The dataset also contains many papers on other diseases. To keep the
    # training in this demo efficient, only COVID-related papers are kept.
    if any(term in lowered for term in covid_terms):
        corpus.append(text)

    # Stop scanning as soon as enough documents have been gathered.
    if len(corpus) >= fetch_total_docs:
        break

In [None]:
!pip install sentence_transformers
from sentence_transformers import SentenceTransformer, util
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from datasets import load_dataset


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
!pip install tqdm

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


# Query Generation

In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

import torch

# Seq2seq checkpoint used to generate synthetic queries from passages.
model_name = 'doc2query/msmarco-t5-base-v1'

# Use the GPU when one is available, otherwise fall back to the CPU instead
# of crashing on a hard-coded `.cuda()` call.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name).to(device)

In [None]:
from tqdm.auto import tqdm  # this is our progress bar

batch_size = 5
num_queries = 3  # number of queries to generate for each passage
count = 0  # total number of (query, passage) lines produced
lines = []
passage_batch = []

# reinitialize passage generator
passages = corpus


def _clean_tsv_field(text):
    """Replace tab and line-break characters with spaces.

    The pairs are stored one per line with a tab separator, so a stray tab,
    '\\n' or '\\r' inside a field would corrupt the file when it is read back
    (text-mode reading also treats '\\r' as a line break).
    """
    return text.replace('\t', ' ').replace('\n', ' ').replace('\r', ' ')


# Walk the corpus in slices of `batch_size`; the final slice may be shorter,
# so trailing passages are no longer dropped when the corpus size is not a
# multiple of `batch_size`.
for start in tqdm(range(0, len(passages), batch_size)):
    passage_batch = [
        _clean_tsv_field(passage)
        for passage in passages[start:start + batch_size]
    ]

    # tokenize the passages of this batch
    inputs = tokenizer(
        passage_batch,
        truncation=True,
        padding=True,
        max_length=256,
        return_tensors='pt'
    )

    # generate `num_queries` sampled queries per passage; tensors are moved
    # to whatever device the model currently lives on
    outputs = model.generate(
        input_ids=inputs['input_ids'].to(model.device),
        attention_mask=inputs['attention_mask'].to(model.device),
        max_length=64,
        do_sample=True,
        top_p=0.95,
        num_return_sequences=num_queries
    )

    # decode queries to human readable text
    decoded_output = tokenizer.batch_decode(
        outputs,
        skip_special_tokens=True
    )

    # Outputs are grouped per input passage: the first `num_queries` rows
    # belong to passage 0, the next `num_queries` to passage 1, and so on.
    for i, query in enumerate(decoded_output):
        query = _clean_tsv_field(query)
        passage_idx = i // num_queries  # index of the passage this query came from
        lines.append(query + '\t' + passage_batch[passage_idx])
        count += 1

# leave the batch buffer empty, as the original loop did
passage_batch = []



In [None]:
# write (Q, P+) pairs to file
# Persist the generated pairs: one "query<TAB>passage" record per line,
# with no trailing newline after the last record.
pairs_tsv = '\n'.join(lines)
with open('pairs.tsv', mode='w', encoding='utf-8') as fp:
    fp.write(pairs_tsv)

In [None]:
from google.colab import files
# Hand pairs.tsv to Colab's download helper so it can be saved locally.
files.download('pairs.tsv') 

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

# Generate negative samples (negative mining)

In [None]:
from sentence_transformers import SentenceTransformer

# Rebind `model` to the sentence-embedding model used for negative mining
# (this replaces the query-generation model bound to the same name earlier).
model = SentenceTransformer('msmarco-distilbert-base-tas-b')
# Cap the input length at 256 tokens.
model.max_seq_length = 256

In [None]:
from tqdm.auto import tqdm
# `pairs[i]` is a (query, passage) tuple and `all_passages[i]` is the passage
# of that same pair, so both lists stay index-aligned.
all_passages = []
pairs = []
with open('pairs.tsv', 'r', encoding='utf-8') as fp:
    lines = fp.read().split('\n')
for line in tqdm(lines):
    # An empty file or a trailing newline yields blank entries; skip them
    # instead of failing on the tuple unpacking below.
    if not line.strip():
        continue
    # Split on the first tab only: the query is the first field and the
    # remainder of the line is the passage.
    query, passage = line.split('\t', 1)
    pairs.append((query, passage))
    all_passages.append(passage)
          

  0%|          | 0/3000 [00:00<?, ?it/s]

In [None]:
#pair_gen = get_text()

# pairs = []
# to_upsert = []
# passage_batch = []
# id_batch = []
# batch_size = 5

# for i, (query, passage) in enumerate(pair_gen):
#     pairs.append((query, passage))
#     # we do this to avoid passage duplication in the vector DB
#     if passage not in passage_batch: 
#         passage_batch.append(passage)
#         id_batch.append(str(i))
#     # on reaching batch_size, we encode and upsert
#     if len(passage_batch) == batch_size:
#         embeds = model.encode(passage_batch).tolist()
#         # upload to index
#         #index.upsert(vectors=list(zip(id_batch, embeds)))
#         # refresh batches
#         passage_batch = []
#         id_batch = []
        
# # check number of vectors in the index
# #index.describe_index_stats()

In [None]:
# Embed every passage in `all_passages` as a tensor. Because `all_passages`
# is built in lockstep with `pairs`, embedding row i belongs to pairs[i].
corpus_embeddings = model.encode(all_passages, convert_to_tensor=True)

In [None]:
import random

batch_size = 1  # number of queries embedded and searched per step
triplets = []
TOP_K = 10  # number of nearest passages considered as negative candidates

for start in tqdm(range(0, len(pairs), batch_size)):
    batch = pairs[start:start + batch_size]
    batch_queries = [pair[0] for pair in batch]

    # create query embeddings for the whole batch (one row per query)
    query_embs = model.encode(
        batch_queries, convert_to_tensor=True, show_progress_bar=False
    )

    # search the passage embeddings; one hit list is returned per query
    batch_hits = util.semantic_search(query_embs, corpus_embeddings, top_k=TOP_K)

    for (current_query, pos_passage), hits in zip(batch, batch_hits):
        # `corpus_id` indexes `corpus_embeddings`, which is aligned with `pairs`
        candidate_ids = [hit['corpus_id'] for hit in hits]
        # shuffle so the chosen negative is a random one among the top hits
        random.shuffle(candidate_ids)
        for corpus_id in candidate_ids:
            neg_passage = pairs[corpus_id][1]
            # check that we're not just returning the positive passage
            if neg_passage != pos_passage:
                # Build the (Q, P+, P-) triplet from the query of THIS pair.
                # The previous code used a stale `query` variable left over
                # from the file-reading loop, so every triplet carried the
                # last query in pairs.tsv.
                triplets.append(
                    current_query + '\t' + pos_passage + '\t' + neg_passage
                )
                break



  0%|          | 0/3000 [00:00<?, ?it/s]

In [None]:
# Sanity check: inspect the first tab-separated (query, positive, negative) triplet.
triplets[0]

"what class is coronavirus\tInsertion/Deletion Polymorphism of Angiotensin Converting Enzyme Gene in Kawasaki Disease Polymorphism of angiotensin converting enzyme (ACE) gene is reported to be associated with ischemic heart disease, hypertrophic cardiomyopathy, and idiopathic dilated cardiomyopathy. In this study, we investigated the relationship between Kawasaki disease and insertion/deletion polymorphism of ACE gene. Fifty five Kawasaki disease patients and 43 healthy children were enrolled. ACE genotype was evaluated from each of the subjects' DNA fragments through polymerase chain reaction (PCR). Frequencies of ACE genotypes (DD, ID, II) were 12.7%, 60.0%, 27.3% in Kawasaki group, and 41.9%, 30.2%, 27.9% in control group respectively, indicating low rate of DD and high rate of ID genotype among Kawasaki patients (p<0.01). Comparing allelic (I, D) frequencies, I allele was more prevalent in Kawasaki group than in control group (57.3% vs. 43.0%, p<0.05). In Kawasaki group, both genot

In [None]:
# Persist the mined triplets: one "query<TAB>positive<TAB>negative" record
# per line, with no trailing newline after the last record.
triplets_tsv = '\n'.join(triplets)
with open('triplets.tsv', mode='w', encoding='utf-8') as fp:
    fp.write(triplets_tsv)

In [None]:
from google.colab import files
# Hand triplets.tsv to Colab's download helper so it can be saved locally.
files.download('triplets.tsv') 

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

# Cross-Encoder Scoring

In [None]:
from sentence_transformers import CrossEncoder

# Rebind `model` to a cross-encoder; it is used below to score
# (query, passage) pairs via `model.predict`.
model = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')
# Display the model object as the cell output.
model

Downloading (…)lve/main/config.json:   0%|          | 0.00/794 [00:00<?, ?B/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/316 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

<sentence_transformers.cross_encoder.CrossEncoder.CrossEncoder at 0x7f4958537040>

In [None]:
from tqdm.auto import tqdm

def get_lines():
    """Yield (query, positive, negative) tuples read from 'triplets.tsv'.

    The file is expected to hold one tab-separated triplet per line. Blank
    lines (for example from an empty file or a trailing newline) are skipped
    rather than raising on the tuple unpacking.

    Yields:
        tuple[str, str, str]: the query, positive passage and negative passage.

    Raises:
        ValueError: if a non-blank line does not contain exactly three
            tab-separated fields.
    """
    with open('triplets.tsv', 'r', encoding='utf-8') as fp:
        lines = [line for line in fp.read().split('\n') if line.strip()]
    # wrap in tqdm so the consumer gets a progress bar while iterating
    for line in tqdm(lines):
        q, p, n = line.split('\t')
        # return the query, positive, negative
        yield q, p, n

In [None]:
lines = get_lines()
label_lines = []

for q, p, n in lines:
    # Score the positive and the negative pair in a single predict call
    # instead of two separate single-pair calls per triplet.
    p_score, n_score = model.predict([(q, p), (q, n)])
    # The margin is how much higher the cross-encoder scores the positive
    # passage than the negative one for this query.
    margin = p_score - n_score
    # append the triplet plus its margin as one tab-separated record
    label_lines.append(
        q + '\t' + p + '\t' + n + '\t' + str(margin)
    )

with open("triplets_margin.tsv", 'w', encoding='utf-8') as fp:
    fp.write('\n'.join(label_lines))

  0%|          | 0/3000 [00:00<?, ?it/s]

In [None]:
from google.colab import files
# Hand triplets_margin.tsv to Colab's download helper so it can be saved locally.
files.download('triplets_margin.tsv') 

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

# Fine-tuning

In [None]:
from sentence_transformers import SentenceTransformer

# Rebind `model` to a fresh copy of the base embedding model; this is the
# model that gets fine-tuned below (it replaces the cross-encoder bound to
# the same name in the scoring section).
model = SentenceTransformer('msmarco-distilbert-base-tas-b')
# Cap the input length at 256 tokens.
model.max_seq_length = 256
# Display the model architecture as the cell output.
model

SentenceTransformer(
  (0): Transformer({'max_seq_length': 256, 'do_lower_case': False}) with Transformer model: DistilBertModel 
  (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': True, 'pooling_mode_mean_tokens': False, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})
)

In [None]:
from tqdm.auto import tqdm
from sentence_transformers import InputExample

training_data = []

with open('triplets_margin.tsv', 'r', encoding='utf-8') as fp:
    lines = fp.read().split('\n')
# turn each "query<TAB>positive<TAB>negative<TAB>margin" record into an InputExample
for line in tqdm(lines):
    # An empty file or a trailing newline yields blank entries; skip them
    # instead of failing on the tuple unpacking below.
    if not line.strip():
        continue
    q, p, n, margin = line.split('\t')
    training_data.append(InputExample(
        texts=[q, p, n],
        label=float(margin)  # the stored margin becomes the regression target
    ))

# show how many training examples were loaded
len(training_data)

  0%|          | 0/3000 [00:00<?, ?it/s]

3000

In [None]:
import torch

# Release cached GPU memory left over from the earlier model runs.
torch.cuda.empty_cache()

# Number of training examples per optimisation step.
batch_size = 32

# Shuffled loader over the InputExample list built above.
loader = torch.utils.data.DataLoader(
    dataset=training_data,
    shuffle=True,
    batch_size=batch_size,
)

In [None]:
from sentence_transformers import losses

# Training objective built around the bi-encoder `model`; each training
# example carries texts [query, positive, negative] and a float margin label.
loss = losses.MarginMSELoss(model)

In [None]:
epochs = 1
# Warm up the learning rate over the first 10% of all training steps.
total_steps = len(loader) * epochs
warmup_steps = int(total_steps * 0.1)

# Fine-tune the bi-encoder and save the result under `output_path`.
fit_kwargs = dict(
    train_objectives=[(loader, loss)],
    epochs=epochs,
    warmup_steps=warmup_steps,
    output_path='msmarco-distilbert-base-tas-b-covid',
    show_progress_bar=True,
)
model.fit(**fit_kwargs)

Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/94 [00:00<?, ?it/s]

In [None]:
from google.colab import files
import shutil

# `model.fit` saved the fine-tuned model as a directory, and the download
# helper takes a single file. Pack the directory into a zip archive
# ('msmarco-distilbert-base-tas-b-covid.zip') and download that instead.
model_archive = shutil.make_archive(
    'msmarco-distilbert-base-tas-b-covid',  # archive name without extension
    'zip',
    root_dir='msmarco-distilbert-base-tas-b-covid',
)
files.download(model_archive)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

# search

In [None]:
# Inspect the first document of the corpus that will be searched.
corpus[0]

"Insertion/Deletion Polymorphism of Angiotensin Converting Enzyme Gene in Kawasaki Disease Polymorphism of angiotensin converting enzyme (ACE) gene is reported to be associated with ischemic heart disease, hypertrophic cardiomyopathy, and idiopathic dilated cardiomyopathy. In this study, we investigated the relationship between Kawasaki disease and insertion/deletion polymorphism of ACE gene. Fifty five Kawasaki disease patients and 43 healthy children were enrolled. ACE genotype was evaluated from each of the subjects' DNA fragments through polymerase chain reaction (PCR). Frequencies of ACE genotypes (DD, ID, II) were 12.7%, 60.0%, 27.3% in Kawasaki group, and 41.9%, 30.2%, 27.9% in control group respectively, indicating low rate of DD and high rate of ID genotype among Kawasaki patients (p<0.01). Comparing allelic (I, D) frequencies, I allele was more prevalent in Kawasaki group than in control group (57.3% vs. 43.0%, p<0.05). In Kawasaki group, both genotype and allelic frequencies

In [None]:
# Load the fine-tuned model written by `model.fit` (its `output_path`), so the
# search below uses the adapted weights. Loading the base checkpoint
# 'msmarco-distilbert-base-tas-b' here would discard the fine-tuning.
model = SentenceTransformer('msmarco-distilbert-base-tas-b-covid')


In [None]:
# Embed every corpus document with the current `model`, showing a progress
# bar; the result is requested as a NumPy array (convert_to_numpy=True).
new_corpus_embeddings = model.encode(corpus,show_progress_bar=True, convert_to_numpy=True)

Batches:   0%|          | 0/32 [00:00<?, ?it/s]

In [None]:
query = "how is the covid-19 virus transferred from one person to another"
# embed the query and retrieve its 10 most similar corpus documents
query_emb = model.encode(query, convert_to_tensor=True)
hits = util.semantic_search(query_emb, new_corpus_embeddings, top_k=10)[0]
# Keep the ids in a list so the ranking returned by the search (best match
# first) is preserved; a set would scramble the order of the results.
all_hits_ids = [hit['corpus_id'] for hit in hits]

In [None]:

# Print the full text of every retrieved document, one per line.
for corpus_id in all_hits_ids:
    print(corpus[corpus_id])

Effects of Covid-19 outbreak on environment and renewable energy sector Many articles have been written in the medical field related to the Covid-19 outbreak that has surrounded the World and killed many people. However, its environmental and energy impacts have not been sufficiently studied. Some sources argue that Covid-19 outbreak reduces pollution environmentally, while others say that environmentally significant damages await us. On the other hand, it is wondered how the global flexible renewable energy sector will react to Covid-19 outbreak. In this study, the effects of Covid-19 outbreak in terms of the environment and renewable energy sector in the literature were examined in detail and the findings obtained were discussed. The main aim of this study is to shed light on the future studies of environmental and renewable energy researchers.
The effect of COVID-19 on global population and its fatality rate: Retrospective study by online database OBJECTIVE: Coronavirus disease 2019