In [1]:
#!pip install beir
#!pip install -U pyarrow
#!conda install -y pytorch torchvision -c pytorch-nightly
#!conda install -y -c conda-forge sentence-transformers
#!conda install -y datasets
#!pip install jsonlines

In [2]:
#import torch

In [1]:
#print(torch.backends.mps.is_available())
#print(torch.backends.mps.is_built())

In [3]:
import csv
import json
import os
from beir.datasets.data_loader import GenericDataLoader
from beir.retrieval.search.lexical import BM25Search as BM25
from beir.retrieval.evaluation import EvaluateRetrieval
from beir import util

  from tqdm.autonotebook import tqdm


In [4]:
# Root folder that holds the BioASQ / BEIR files used throughout the notebook.
# Defaults to the shared Google Drive location; set the BEIR_DATA_DIR
# environment variable to run against a different folder without editing code.
data_dir = os.path.expanduser(
    os.environ.get("BEIR_DATA_DIR") or "~/Google Drive/Shared drives/Data"
)

In [4]:
def iter_docs(mesh_file=None):
    """Stream ``(pmid, document)`` pairs from the BioASQ allMeSH JSON dump.

    The file is read line by line: the first line is skipped (it carries no
    record) and every following line that starts with ``{`` is parsed as one
    JSON object. Whatever trails the object on the line (a ``,`` separator,
    the closing ``]}`` of the enclosing array, line endings) is ignored.

    Args:
        mesh_file: Path of the dump. ``None`` (the default) resolves to
            ``{data_dir}/BioASQ/allMeSH_2020.json`` at call time.

    Yields:
        Tuples ``(pmid, doc)`` where ``doc`` has the keys ``pmid``, ``title``,
        ``text``, ``mesh_terms`` and ``journal``.

    Raises:
        json.JSONDecodeError: If a line starting with ``{`` is not valid JSON.
        KeyError: If a record lacks one of the expected fields.
    """
    if mesh_file is None:
        # Resolved lazily so a later change of ``data_dir`` is picked up.
        mesh_file = f"{data_dir}/BioASQ/allMeSH_2020.json"
    decoder = json.JSONDecoder()
    with open(mesh_file, "r", encoding="utf8", errors="ignore") as f:
        for i, line in enumerate(f):
            # first line does not contain data
            if i == 0:
                continue
            record = line.lstrip()
            # Blank lines or a closing bracket on its own line hold no record.
            if not record.startswith("{"):
                continue
            # raw_decode parses the leading JSON value and tolerates any
            # trailing text, unlike slicing off a fixed two characters.
            obj, _ = decoder.raw_decode(record)
            doc = {
                'pmid': obj['pmid'],
                'title': obj['title'],
                'text': obj['abstractText'],
                'mesh_terms': obj['meshMajor'],
                'journal': obj['journal']
            }
            yield obj['pmid'], doc

def iter_fixes(fixes_file=None):
    """Stream ``(pmid, document)`` pairs from the manual-fixes CSV.

    Add manual fixes provided by BEIR authors. Each non-empty row is read as
    ``pmid, title, text`` (columns 0, 1 and 2).

    Args:
        fixes_file: Path of the CSV. ``None`` (the default) resolves to
            ``{data_dir}/BioASQ/Manual-fixes - BioASQ-Task8b.csv`` at call time.

    Yields:
        Tuples ``(pmid, doc)`` where ``doc`` has the keys ``pmid``, ``text``
        and ``title``.

    Raises:
        IndexError: If a non-empty row has fewer than three columns.
    """
    if fixes_file is None:
        # Resolved lazily so a later change of ``data_dir`` is picked up.
        fixes_file = f"{data_dir}/BioASQ/Manual-fixes - BioASQ-Task8b.csv"
    # newline="" is what the csv module expects: it keeps line breaks inside
    # quoted fields intact. The encoding is pinned instead of platform-default.
    with open(fixes_file, "r", encoding="utf8", newline="") as f:
        for row in csv.reader(f):
            # csv.reader yields [] for blank lines; they carry no fix.
            if not row:
                continue
            yield row[0], {"pmid": row[0], "text": row[2], "title": row[1]}


In [5]:
# Index settings passed to the store when the index is created:
#   - five primary shards and no replicas,
#   - a "custom_analyzer" (standard tokenizer, lowercase, then the stemmer below),
#   - a "custom_stemmer" token filter using the "minimal_english" stemmer.
settings = dict(
    index=dict(number_of_shards=5, number_of_replicas=0),
    analysis=dict(
        analyzer=dict(
            custom_analyzer=dict(
                type="custom",
                tokenizer="standard",
                filter=["lowercase", "custom_stemmer"],
            ),
        ),
        filter=dict(
            custom_stemmer=dict(type="stemmer", language="minimal_english"),
        ),
    ),
)

# Field mappings for the documents produced by iter_docs / iter_fixes.
# Identifiers and categorical values are exact-match keywords; title and text
# are analysed full-text fields.
mappings = {
    "properties": {
        "pmid": {
            "type": "keyword"
        },
        "title": {
            "type": "text"
        },
        "text": {
            "type": "text"
        },
        "journal": {
            "type": "keyword"
        },
        # iter_docs stores the MeSH headings under "mesh_terms", so that is the
        # field that needs the keyword mapping.
        "mesh_terms": {
            "type": "keyword"
        },
        # Kept for backward compatibility: the name used in the raw dump.
        "meshMajor": {
            "type": "keyword"
        }
    }
}

In [6]:
from elasticstore import Store
from elasticsearch import AsyncElasticsearch
# Async client for the local Elasticsearch node, created with a long timeout
# and retry-on-timeout enabled for the bulk-indexing cells.
es = AsyncElasticsearch(
    ['http://localhost:9200'],
    timeout=300,
    retry_on_timeout=True,
    verify_certs=False,
)
# Store over the 'bioasq' index, created with the settings and mappings
# defined in the previous cell.
store = Store(
    es,
    'bioasq',
    settings=settings,
    mappings=mappings,
)

In [8]:
from itertools import islice

In [None]:
from tqdm import tqdm
from aitertools import aiter

async def aiter_docs(start):
    for i in tqdm(islice(iter_docs(), start, None), total=14_9139_39-start):
        yield i

await store.bulk_update(aiter_docs(11370000))

In [12]:
async def aiter_fixes():
    for i in tqdm(iter_fixes()):
        yield i

await store.bulk_update(aiter_fixes())


0it [00:00, ?it/s][A
128it [00:00, 239.12it/s][A
776it [00:00, 1003.10it/s][A


(776, [])

In [32]:
### Test dataset with 500 queries as used in BEIR
# Converts the golden files under Task8BGoldenEnriched into the two files the
# BEIR loader reads: queries.jsonl (one {"_id", "text"} object per line) and
# qrels/test.tsv (query-id, corpus-id, score).
test_data_path = f"{data_dir}/BioASQ/Task8BGoldenEnriched"
beir_dir = f"{data_dir}/BEIR/bioasq"
# The qrels/ sub-folder has to exist before the files can be opened for writing.
os.makedirs(f"{beir_dir}/qrels", exist_ok=True)

# JSON Lines is written with the standard json module: `jsonlines` was used
# here without ever being imported, which fails with NameError on a fresh kernel.
with open(f"{beir_dir}/queries.jsonl", "w", encoding="utf8") as queries_out, \
        open(f"{beir_dir}/qrels/test.tsv", "w", encoding="utf8") as record_file:
    # Write header
    record_file.write("query-id\tcorpus-id\tscore\n")
    # Sorted so the query order (and any "first N queries" subset taken from it)
    # is reproducible; os.listdir returns entries in arbitrary order.
    for test_json in sorted(os.listdir(test_data_path)):
        # Skip anything that is not a golden JSON file (e.g. .DS_Store).
        if not test_json.lower().endswith(".json"):
            continue
        with open(os.path.join(test_data_path, test_json), "r", encoding="utf8") as content:
            queries_answers = json.load(content)

        for query in queries_answers["questions"]:
            query_line = {"_id": query["id"], "text": query["body"]}
            queries_out.write(json.dumps(query_line, ensure_ascii=False) + "\n")

            for doc in query["documents"]:
                # Documents are referenced by URL; the last path segment is the id.
                doc_id = doc.split("/")[-1]
                relevance = 1

                record_file.write(f"{query['id']}\t{doc_id}\t{relevance}\n")

In [22]:
# The corpus already lives in Elasticsearch, so the corpus file only needs to
# exist: write a single empty JSON object as a stub.
corpus_stub_path = f'{data_dir}/BEIR/bioasq/corpus.jsonl'
with open(corpus_stub_path, mode='w') as stub:
    stub.write('{}')

In [5]:
### TEST DATASET RESULTS ###

import json

from beir.datasets.data_loader import GenericDataLoader
from beir.retrieval.search.lexical import BM25Search as BM25
from beir.retrieval.evaluation import EvaluateRetrieval

# Load the queries and relevance judgements of the test split; the corpus
# returned by the loader is discarded.
dataset = "bioasq"
data_path = "{}/BEIR/{}".format(data_dir, dataset)
loader = GenericDataLoader(data_path)
_, queries, qrels = loader.load(split="test")

  0%|          | 0/1 [00:00<?, ?it/s]

In [6]:
# Only use first 100
# Restricts queries and qrels to the first 100 query ids in loader order.
query_keys = list(queries.keys())
# Build the id set once: slicing the key list inside each comprehension
# re-created a 100-element list and scanned it linearly for every entry.
first_100_ids = set(query_keys[:100])
queries_ = {k: v for k, v in queries.items() if k in first_100_ids}
qrels_ = {k: v for k, v in qrels.items() if k in first_100_ids}

## BM25

In [8]:
from vectorspace.store import Store as ESDict
from elasticsearch import Elasticsearch
# Synchronous client for the local node; the corpus is a store over the
# 'bioasq' index rather than an in-memory dict.
es_sync = Elasticsearch(['http://localhost:9200'])
corpus = ESDict(es_sync, 'bioasq')
# Last expression of the cell: shows the size of the store.
len(corpus)

14914604

In [9]:
# BM25 search over the Elasticsearch index named after the dataset, created
# with initialize=False; the index "title" and "text" fields are mapped to
# BEIR's "title" and "body" keys.
model = BM25(
    index_name=dataset,
    hostname='localhost',
    initialize=False,
    keys={"title": "title", "body": "text"},
)
retriever = EvaluateRetrieval(model)
#### Retrieve BM25 (lexical) results (format of results is identical to qrels)
bm25_results = retriever.retrieve(corpus, queries_)
# Persist the per-query document scores so later cells can reload them
# without querying Elasticsearch again.
bm25_results_path = f"{data_dir}/BEIR/results_{dataset}_bm25_100.json"
with open(bm25_results_path, 'w') as fp:
    json.dump(bm25_results, fp)
# Score the run against the subset qrels at the retriever's default cut-offs.
ndcg, _map, recall, precision = retriever.evaluate(qrels_, bm25_results, retriever.k_values)

que: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [01:04<00:00, 64.24s/it]


In [10]:
# Display the nDCG values from the most recent evaluate call.
ndcg

{'NDCG@1': 0.44,
 'NDCG@3': 0.442,
 'NDCG@5': 0.434,
 'NDCG@10': 0.42897,
 'NDCG@100': 0.50329,
 'NDCG@1000': 0.53361}

In [34]:
from vectorspace.utils import json_load
# Reload the BM25 run from the JSON file written by the retrieval cell, so the
# search does not have to be repeated.
bm25_results = json_load(f"{data_dir}/BEIR/results_{dataset}_bm25_100.json")

In [43]:
from typing import Dict, List

class Rerank:
    """Re-score first-stage retrieval results with a cross-encoder.

    The wrapped ``model`` only needs a ``predict(sentence_pairs, batch_size=...)``
    method returning one score per ``[query, document]`` pair.
    """

    def __init__(self, model, batch_size: int = 64, **kwargs):
        """Store the cross-encoder and batch size.

        Args:
            model: Object exposing ``predict(sentence_pairs, batch_size=...)``.
            batch_size: Batch size forwarded to ``model.predict``.
            **kwargs: Accepted and ignored.
        """
        self.cross_encoder = model
        self.batch_size = batch_size
        # Result of the most recent ``rerank`` call.
        self.rerank_results = {}

    @staticmethod
    def _doc_text(doc: Dict[str, str]) -> str:
        """Join a document's title and text; missing or ``None`` fields count as empty."""
        title = doc.get("title") or ""
        body = doc.get("text") or ""
        return (title + " " + body).strip()

    @staticmethod
    def _candidate_ids(scores: Dict[str, float], top_k: int) -> List[str]:
        """Return the ids to rerank: the ``top_k`` best by score, or all if fewer."""
        if len(scores) > top_k:
            ranked = sorted(scores.items(), key=lambda item: item[1], reverse=True)
            return [doc_id for doc_id, _ in ranked[:top_k]]
        return list(scores)

    def rerank(self,
               corpus: Dict[str, Dict[str, str]],
               queries: Dict[str, str],
               results: Dict[str, Dict[str, float]],
               top_k: int) -> Dict[str, Dict[str, float]]:
        """Rerank the ``top_k`` documents of every query in ``results``.

        Args:
            corpus: Mapping ``doc_id -> {"title": ..., "text": ...}``.
            queries: Mapping ``query_id -> query text``.
            results: First-stage scores, ``query_id -> {doc_id: score}``.
            top_k: Maximum number of documents per query to rerank.

        Returns:
            ``query_id -> {doc_id: cross-encoder score}``, with an entry
            (possibly empty) for every query in ``results``. The same dict is
            stored in ``self.rerank_results``.

        Raises:
            KeyError: If a query id or document id from ``results`` is missing
                from ``queries`` or ``corpus``.
        """
        sentence_pairs, pair_ids = [], []

        for query_id, scores in results.items():
            for doc_id in self._candidate_ids(scores, top_k):
                pair_ids.append([query_id, doc_id])
                sentence_pairs.append([queries[query_id], self._doc_text(corpus[doc_id])])

        #### Starting to Rerank using cross-attention
        print("Starting To Rerank Top-{}....".format(top_k))
        if sentence_pairs:
            predictions = self.cross_encoder.predict(sentence_pairs, batch_size=self.batch_size)
            rerank_scores = [float(score) for score in predictions]
        else:
            # Nothing to score: do not call the model with an empty batch.
            rerank_scores = []

        #### Reranking results
        self.rerank_results = {query_id: {} for query_id in results}
        for (query_id, doc_id), score in zip(pair_ids, rerank_scores):
            self.rerank_results[query_id][doc_id] = score

        return self.rerank_results

## MiniLM

In [3]:
import torch
# Run on Apple's Metal backend (MPS) when PyTorch reports it as available,
# otherwise fall back to the CPU.
if torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")

In [4]:
from beir.reranking.models import CrossEncoder
from beir.reranking import Rerank

# MS MARCO MiniLM cross-encoder, wrapped in a reranker that scores pairs in
# batches of 128.
cross_encoder_model = CrossEncoder(
    'cross-encoder/ms-marco-MiniLM-L-6-v2'
)
reranker = Rerank(
    cross_encoder_model,
    batch_size=128,
)

In [5]:
# Point the wrapped model's private `_target_device` attribute at the device
# selected earlier (MPS when available, otherwise CPU).
# NOTE(review): this relies on a private attribute; presumably it makes
# predict() run on that device. Confirm against the installed library version.
cross_encoder_model.model._target_device = device

In [13]:
from vectorspace.utils import json_load
# Reload the saved BM25 run for the bioasq dataset (the first-stage results
# that the reranker re-scores).
bm25_results = json_load(f"{data_dir}/BEIR/results_bioasq_bm25_100.json")

In [20]:
# Re-score, per query, the 100 best BM25 hits with the cross-encoder.
rerank_results = reranker.rerank(
    corpus=corpus,
    queries=queries_,
    results=bm25_results,
    top_k=100,
)

Batches:   0%|          | 0/79 [00:00<?, ?it/s]

In [21]:
# Persist the reranked scores so the evaluation cells can reload them without
# running the cross-encoder again.
rerank_results_path = f'{data_dir}/BEIR/results_bioasq_bm25_minilm_100.json'
with open(rerank_results_path, mode='w') as fp:
    json.dump(rerank_results, fp)

In [14]:
# Reload the reranked scores from the JSON file written by the previous cell.
rerank_results = json_load(f'{data_dir}/BEIR/results_bioasq_bm25_minilm_100.json')

In [17]:
# Score the reranked run against the subset qrels at the listed cut-offs.
rerank_metrics = EvaluateRetrieval.evaluate(
    qrels_,
    rerank_results,
    [1, 3, 5, 10, 100, 1000],
)
ndcg, _map, recall, precision = rerank_metrics

In [18]:
# Display the nDCG values from the most recent evaluate call.
ndcg

{'NDCG@1': 0.55,
 'NDCG@3': 0.48511,
 'NDCG@5': 0.49446,
 'NDCG@10': 0.5055,
 'NDCG@100': 0.55799,
 'NDCG@1000': 0.55799}

In [26]:
# Display the recall values from the most recent evaluate call.
recall

{'Recall@1': 0.27609,
 'Recall@3': 0.35597,
 'Recall@5': 0.43591,
 'Recall@10': 0.52172,
 'Recall@100': 0.70038}

In [27]:
# Score the plain BM25 run at the same cut-offs, for comparison with the
# reranked run. This overwrites ndcg / _map / recall / precision.
bm25_metrics = EvaluateRetrieval.evaluate(
    qrels_,
    bm25_results,
    [1, 3, 5, 10, 100, 1000],
)
ndcg, _map, recall, precision = bm25_metrics

In [30]:
# Display the recall values from the most recent evaluate call.
recall

{'Recall@1': 0.21,
 'Recall@3': 0.32956,
 'Recall@5': 0.3762,
 'Recall@10': 0.43371,
 'Recall@100': 0.70038,
 'Recall@1000': 0.85916}