In [5]:
#!conda install -y -c conda-forge elasticsearch=7.17.0

In [1]:
#!conda install -y -c conda-forge accelerate sacremoses transformers

In [1]:
import os
data_dir = os.path.expanduser("~/Google Drive/Shared drives/Data")
dataset = "bioasq"
data_path = f'{data_dir}/BEIR/{dataset}'

In [2]:
prompt = 'Documents are searched to find matches with the same content.\nThe document "{}" is a good search result for "'


In [33]:
from vectorspace.utils import json_load
from beir.datasets.data_loader import GenericDataLoader
_, queries, qrels = GenericDataLoader(data_path).load(split="test")
bm25_results = json_load(f"{data_dir}/BEIR/results_{dataset}_bm25.json")

  0%|          | 0/1 [00:00<?, ?it/s]

In [34]:
# Only use first 100
n = 100
query_keys = list(queries.keys())
queries_ = {k: v for k, v in queries.items() if k in query_keys[:n]}
qrels_ = {k: v for k, v in qrels.items() if k in query_keys[:n]}
results = {k: v for k, v in bm25_results.items() if k in query_keys[:n]}

In [35]:
from vectorspace.store import Store as ESDict
from elasticsearch import Elasticsearch
corpus = ESDict(Elasticsearch(['http://localhost:9200']), dataset)

In [6]:
#top_k = 100
#sentence_pairs, pair_ids = [], []
#for query_id in results:
#    for (doc_id, _) in sorted(results[query_id].items(), key=lambda item: item[1], reverse=True)[:top_k]:
#        pair_ids.append([query_id, doc_id])
#        corpus_text = (corpus[doc_id].get("title", "") + " " + corpus[doc_id].get("text", "")).strip()
#        sentence_pairs.append([queries_[query_id], corpus_text])

In [7]:
#from vectorspace.utils import jsonl_dump
#jsonl_dump([{'query_id': p[0], 'doc_id': p[1]} for p in pair_ids], f'{data_path}/pair_ids_100.jsonl')
#jsonl_dump([{'query': p[0], 'text': p[1]} for p in sentence_pairs], f'{data_path}/sentence_pairs_100.jsonl')

In [8]:
from vectorspace.utils import jsonl_load
sentence_pairs = [[p['query'], p['text']] for p in jsonl_load(f'{data_path}/sentence_pairs_100.jsonl')]
pair_ids = [[p['query_id'], p['doc_id']] for p in jsonl_load(f'{data_path}/pair_ids_100.jsonl')]

## GPT

In [60]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from scipy.spatial.distance import cosine

# Get models - The package will take care of downloading the models automatically
# For best performance: EleutherAI/gpt-j-6B
tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-neo-125M")
device = 'mps'
model = AutoModelForCausalLM.from_pretrained("EleutherAI/gpt-neo-125M").to(device).eval()

In [52]:
from tqdm import tqdm

def compute_gpt_scores(tokenizer, model, sentence_pairs, model_name='gpt'):
    scores = []
    prompt = '''Documents are searched to find matches with the same content.\n
    The document "{}" is a good search result for "'''
    for query, doc in tqdm(sentence_pairs, total=len(sentence_pairs)):
        context = prompt.format(doc)
        context_enc = tokenizer.encode(context, add_special_tokens=False)
        continuation_enc = tokenizer.encode(query, add_special_tokens=False)
        # Slice off the last token, as we take its probability from the one before
        model_input = torch.tensor(context_enc+continuation_enc[:-1]).to(device)
        continuation_len = len(continuation_enc)
        # [seq_len] -> [seq_len, vocab]
        # probability of each word of following the previous words
        if model_name=='gpt':
            output = model(model_input)[0]
        else:
            output = model(model_input.unsqueeze(dim=0))[0][0]
        logprobs = torch.nn.functional.log_softmax(output, dim=-1).cpu()
        # vocab indices of continuation tokens
        continuation_index = torch.tensor(continuation_enc).unsqueeze(-1)
        # Gather the log probabilities of the continuation tokens -> [continuation_len]
        logprobs_continuation = torch.gather(logprobs[-continuation_len:], 1, continuation_index).squeeze(-1)
        # The query probability
        score = torch.sum(logprobs_continuation)
        scores.append(float(score.detach()))
    return scores

In [32]:
scores = compute_gpt_scores(tokenizer, model, sentence_pairs)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████| 10000/10000 [3:57:40<00:00,  1.43s/it]


In [39]:
results_rerank = {query_id: {} for query_id in results}
for pair, score in zip(pair_ids, scores):
    query_id, doc_id = pair[0], pair[1]
    results_rerank[query_id][doc_id] = score

In [42]:
import json
from beir.retrieval.evaluation import EvaluateRetrieval
# Save rerank results
with open(f'{data_dir}/BEIR/results_bioasq_bm25_sgpt_100.json', 'w') as fp:
    json.dump(results_rerank, fp)

ndcg, _map, recall, precision = EvaluateRetrieval.evaluate(qrels, results_rerank, k_values=[1, 3, 5, 10, 100])

In [43]:
ndcg

{'NDCG@1': 0.49,
 'NDCG@3': 0.49791,
 'NDCG@5': 0.49257,
 'NDCG@10': 0.49016,
 'NDCG@100': 0.55045}

In [71]:
recall

{'Recall@1': 0.25412,
 'Recall@3': 0.39397,
 'Recall@5': 0.4514,
 'Recall@10': 0.49981,
 'Recall@100': 0.70038}

In [72]:
_map

{'MAP@1': 0.25412,
 'MAP@3': 0.34575,
 'MAP@5': 0.37491,
 'MAP@10': 0.39969,
 'MAP@100': 0.43331}

## BioGPT

In [45]:
from transformers import BioGptTokenizer, BioGptForCausalLM
cache_dir = f"{data_dir}/huggingface/"
model_name = "microsoft/biogpt"
device = 'mps'
model = BioGptForCausalLM.from_pretrained(model_name, cache_dir=cache_dir, low_cpu_mem_usage=True).to(device).eval()
tokenizer = BioGptTokenizer.from_pretrained(model_name, cache_dir=cache_dir)

In [53]:
scores_biogpt = compute_gpt_scores(tokenizer, model, sentence_pairs, model_name='biogpt')

  0%|                                                                                                        | 7/10000 [00:54<21:45:58,  7.84s/it]


In [61]:
context_enc = tokenizer.encode(context, add_special_tokens=False)
continuation_enc = tokenizer.encode(query, add_special_tokens=False)
# Slice off the last token, as we take its probability from the one before
model_input = torch.tensor(context_enc+continuation_enc[:-1]).to(device)
continuation_len = len(continuation_enc)
input_len, = model_input.shape

In [70]:
%%time
# [seq_len] -> [seq_len, vocab]
# probability of each word of following the previous words
#output = model(model_input.unsqueeze(dim=0))[0][0]
output = model(model_input)[0]
logprobs = torch.nn.functional.log_softmax(output, dim=-1).cpu()
# vocab indices of continuation tokens
continuation_index = torch.tensor(continuation_enc).unsqueeze(-1)
# Gather the log probabilities of the continuation tokens -> [continuation_len]
logprobs_continuation = torch.gather(logprobs[-continuation_len:], 1, continuation_index).squeeze(-1)
# The query probability
score = torch.sum(logprobs_continuation)

CPU times: user 121 ms, sys: 38.4 ms, total: 160 ms
Wall time: 135 ms


In [20]:
import numpy as np
scores_np = np.array([-n.detach().numpy() for n in scores])
scores_sorted = np.argsort(scores_np)

In [21]:
[sentence_pairs[s] for s in scores_sorted]

[['Are gut microbiota profiles altered by irradiation?',
  'Evaluating gut microbiota profiles from archived fecal samples. BACKGROUND: Associations between colorectal cancer and microbiota have been identified. Archived fecal samples might be valuable sample sources for investigating causality in carcinogenesis and biomarkers discovery due to the potential of performing longitudinal studies. However, the quality, quantity and stability of the gut microbiota in these fecal samples must be assessed prior to such studies. We evaluated i) cross-contamination during analysis for fecal blood and ii) evaporation in stored perforated fecal immunochemical tests (iFOBT) samples, iii) temperature stability as well as iv) comparison of the gut microbiota diversity and composition in archived, iFOBT and fresh fecal samples in order to assess feasibility of large scale microbiota studies.METHODS: The microbiota profiles were obtained by sequencing the V3-V4 region of 16S rDNA gene.RESULTS: The iFOB

In [48]:
model.to(device)
model_input = torch.tensor(context_enc+continuation_enc[:-1]).to(device)