In [None]:
!pip install pycolbertdb

In [None]:
!pip install llama-index

In [None]:
!pip install llama-index-readers-web

In [None]:
!pip install datasets

In [None]:
!pip install pinecone-client

In [133]:
import os
from dotenv import load_dotenv
from pycolbertdb.client import Colbertdb
from datasets import load_dataset
from llama_index.core import PromptTemplate
from llama_index.llms.openai import OpenAI
from llama_index.embeddings.openai import OpenAIEmbedding
import random
from pinecone import Pinecone

load_dotenv()

True

In [None]:
dataset = load_dataset("bigbio/pubmed_qa", split="validation")

In [111]:

sample = random.sample(list(dataset), 5000)

In [115]:
passages = []
for item in sample:
    passages.extend(item["CONTEXTS"])

15522

In [178]:
url = os.getenv("COLBERTDB_URL")
api_key = os.getenv("COLBERTDB_API_KEY")
store_name = os.getenv("COLBERTDB_STORE_NAME")
openai_key = os.getenv("OPENAI_API_KEY")
client = Colbertdb(url, api_key, store_name)
openai_client = OpenAI(api_key=openai_key, model="gpt-4o")
openai_embedding = OpenAIEmbedding(api_key=openai_key, model="text-embedding-3-large", embed_batch_size=100)
pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))
index = pc.Index("pubmed")

In [None]:
import uuid
documents = []

for passage in passages:
    documents.append({"content": passage, "metadata": {}})
    embedding = openai_embedding.get_text_embedding(passage)
    item = {"id": str(uuid.uuid4()), "values": embedding, "metadata": {"text": passage}}
    index.upsert([item])


In [None]:
client.create_collection(name="pubmed", documents=documents, options={"force_create": True})
collection = client.load_collection("pubmed")

In [175]:

qa_prompt_tmpl_str = """\
---------------------
Passages: {passages}
---------------------

---------------------
Question: {question}
---------------------

---------------------
Answer: {answer}
---------------------

Score the group of passages (as a whole) on its relevance to answering the question given the provided answer.

Output: Score from 1 - 10. Only the score. no explanation needed. one score not multiple scores.
"""
import re

cbd_scores = []
pc_scores = []
for entry in random.sample(sample, 50):
    cbd_content = ''
    pc_content = ''
    question = entry['QUESTION']
    answer = entry['LONG_ANSWER'][0]
    cbd_response = collection.search(question, k=3)
    pc_response = index.query(vector=openai_embedding.get_text_embedding(question), top_k=3, include_metadata=True)
    for document in cbd_response.documents:
        cbd_content += document.content + '\n\n'
    for _document in pc_response['matches']:
        pc_content += _document['metadata']['text'] + '\n\n'
    prompt_template= PromptTemplate(qa_prompt_tmpl_str)
    cbd_prompt = prompt_template.format(passages=cbd_content, question=question, answer=answer)
    pc_prompt = prompt_template.format(passages=pc_content, question=question, answer=answer)
    cbd_score = openai_client.complete(cbd_prompt)
    pc_score = openai_client.complete(pc_prompt)
    cbd_result = re.sub('[^0-9]','', cbd_score.text)
    pc_result = re.sub('[^0-9]','', pc_score.text)
    cbd_scores.append(cbd_result)
    pc_scores.append(pc_result)


In [176]:
print("Average score for ColbertDB: ", sum([int(score) for score in cbd_scores])/len(cbd_scores))
print("Average score for Pinecone: ", sum([int(score) for score in pc_scores])/len(pc_scores))

Average score for ColbertDB:  8.54
Average score for Pinecone:  9.28
