In [None]:
from pathlib import Path

In [None]:
from datasets import load_from_disk

In [None]:
dataset_path = Path.cwd().joinpath("datasets", "embeddings_pubmed_qa")

In [None]:
context_dataset_with_embeddings = load_from_disk(dataset_path)

In [None]:
context_dataset_with_embeddings.info

#### Connecting to PostgreSQL

In [None]:
from os import getenv
from dotenv import load_dotenv, find_dotenv

In [None]:
from urllib.parse import quote

In [None]:
# Read the PostgreSQL connection settings from environment variables;
# load_dotenv() runs first so values from a .env file can be picked up.
load_dotenv()
(
    database_user,
    database_password,
    database_host,
    database_port,
    database_name,
) = map(
    getenv,
    (
        'POSTGRES_USER',
        'POSTGRES_PASSWORD',
        'POSTGRES_HOST',
        'POSTGRES_PORT',
        'POSTGRES_DB',
    ),
)


In [None]:
from psycopg2 import connect
from pgvector.psycopg2 import register_vector

In [None]:
# Open the PostgreSQL connection used by the ingestion cells, built from the
# settings read from the environment.
connection_settings = {
    "user": database_user,
    "password": database_password,
    "host": database_host,
    "port": database_port,
    "database": database_name,
}
database_connection = connect(**connection_settings)

In [None]:
database_connection.set_session(autocommit=True)

In [None]:
# Cursor reused by the following cells for the DDL statements and the inserts.
cursor = database_connection.cursor()
# Enable the pgvector extension (IF NOT EXISTS makes re-running this cell safe)
# so the VECTOR column type used by the table below is available.
cursor.execute("CREATE EXTENSION IF NOT EXISTS vector")

In [None]:
register_vector(database_connection)

In [None]:
cursor.execute("DROP TABLE IF EXISTS pubmed_qa")

In [None]:
# DDL for the table that stores each context paragraph next to its embedding.
# NOTE(review): VECTOR(1024) fixes the embedding width; presumably this matches
# the output size of the model that produced the stored embeddings — confirm.
database_creation_string ="""
    CREATE TABLE pubmed_qa (id bigserial PRIMARY KEY,context TEXT, context_vector VECTOR(1024)
    );"""

In [None]:
database_creation_string

In [None]:
cursor.execute(database_creation_string)
    

With our database connection and our dataset with embeddings ready, let us now insert the embeddings and their text into the database.

In [None]:
context_dataset_with_embeddings

In [None]:
from psycopg2.extras import execute_values      

In [None]:
context_dataset_with_embeddings

In [None]:
def save_batch_to_database(batch):
    """Insert one batch of contexts and embeddings into ``pubmed_qa``.

    Args:
        batch: Mapping with parallel ``"embedding"`` and ``"context"``
            sequences (presumably the batched dict that ``Dataset.map``
            passes when ``batched=True`` — confirm against the caller).

    Returns:
        None. Rows are written through the module-level ``cursor``.
    """
    vectors, texts = batch["embedding"], batch["context"]
    insert_statement = "INSERT INTO pubmed_qa (context, context_vector) VALUES %s"
    # One (context, context_vector) tuple per row, in column order.
    rows = zip(texts, vectors)
    execute_values(cur=cursor, sql=insert_statement, argslist=rows)
    

In [None]:
# Dataset.map is used here for its side effect: batches of up to 1000 rows are
# passed to save_batch_to_database, which writes them to Postgres and returns nothing.
context_dataset_with_embeddings.map(save_batch_to_database, batched=True, batch_size=1000)

At this point the embeddings are saved in Postgres; the next step will be to build the retrieval on top of them.

cursor.close()

In [None]:
# Ingestion is finished: release the cursor first, then the connection that owns it.
cursor.close()
database_connection.close()

### Retrieval Part

With our vectors and contexts saved in the database, we move to the next step of our RAG application: text retrieval.
We will embed the question, query the Postgres database to rank the context embeddings by cosine similarity to it, and then return the top 5 contexts related to the question.

In [None]:
import numpy as np

In [None]:
from sentence_transformers import SentenceTransformer

In [None]:
from typing import List, Dict

In [None]:
def semantic_search(query: str, embedding_model: SentenceTransformer, k: int) -> List[Dict]:
    """
    Perform semantic search on the database.

    Encode the query with ``embedding_model`` and return the ``k`` rows of
    ``pubmed_qa`` whose ``context_vector`` is closest to it under pgvector's
    ``<=>`` (cosine distance) operator.

    Args:
        query (str): Free-text question to search for.
        embedding_model (SentenceTransformer): Model used to embed the query.
            NOTE(review): its output size must match the ``context_vector``
            column; confirm it is the model that produced the stored embeddings.
        k (int): Maximum number of rows to return.

    Returns:
        List[Dict]: One dict per hit, closest first, with keys ``"text"``
        (the stored context) and ``"source"`` (the first 10 components of
        the stored vector).
    """
    embedding = np.asarray(embedding_model.encode(query))
    # psycopg2's ``with connection`` only scopes a transaction and does not
    # close the connection, so close it explicitly; otherwise every search
    # leaks one open connection.
    conn = connect(
        user=database_user,
        password=database_password,
        host=database_host,
        port=database_port,
        database=database_name,
    )
    try:
        # Needed so the numpy query vector and the returned VECTOR values are adapted.
        register_vector(conn)
        with conn.cursor() as cur:
            cur.execute(
                "SELECT context, context_vector FROM pubmed_qa "
                "ORDER BY context_vector <=> %s LIMIT %s",
                (embedding, k),
            )
            rows = cur.fetchall()
    finally:
        conn.close()
    return [{"text": row[0], "source": row[1][:10]} for row in rows]

Let us now download the question dataset and perform the query again.

In [None]:
from datasets import load_dataset

In [None]:
dataset_id = "pubmed_qa"

In [None]:
unlabeled_dataset = load_dataset(dataset_id,  "pqa_unlabeled")
labeled_dataset = load_dataset(dataset_id,  "pqa_labeled")

In [None]:
unlabeled_dataset

In [None]:
test_question = labeled_dataset["train"][0]["question"]

In [None]:
embedding_model_name = 'michiyasunaga/BioLinkBERT-large'

In [None]:
embedding_model = SentenceTransformer(embedding_model_name)

In [None]:
test_question

In [None]:
labeled_dataset["train"][0]["context"]

In [None]:
contexts = semantic_search(test_question, embedding_model, 5)

At first glance, with our sample question we are able to find the answer context among the top 5 contexts; an evaluation still needs to be performed on the whole dataset to check how well our retrieval system is working.

Nevertheless, let us continue with our work and perform the response generation.

In [None]:
random_index = np.random.randint(0, len(unlabeled_dataset["train"]))

In [None]:
random_index

In [None]:
random_question = unlabeled_dataset["train"][random_index]["question"]

In [None]:
random_question

In [None]:
random_retrieved_contexts = semantic_search(random_question, embedding_model, 5)

In [None]:
from pprint import pprint

In [None]:
random_answer = unlabeled_dataset["train"][random_index]["long_answer"]
pprint(random_answer)

In [None]:
random_gold_contexts = unlabeled_dataset["train"][random_index]["context"]

In [None]:
random_gold_contexts["contexts"]

In [None]:
for context in random_retrieved_contexts:
    pprint(context.get("text"))
    print("**" * 10)

### The response generation part

For the question-answering system, we will use a language model that has been trained on PubMedQA. The model is called BiomedGPT (note: the code cell below actually loads `michiyasunaga/BioLinkBERT-large`).

In [None]:
from transformers import AutoTokenizer, AutoModelForQuestionAnswering

# Tokenizer and extractive question-answering model, both loaded from the
# BioLinkBERT-large checkpoint.
# NOTE(review): confirm this checkpoint ships a fine-tuned question-answering
# head; if it does not, the start/end logits used below come from untrained weights.
tokenizer = AutoTokenizer.from_pretrained("michiyasunaga/BioLinkBERT-large")
model = AutoModelForQuestionAnswering.from_pretrained("michiyasunaga/BioLinkBERT-large")

In [None]:
concatenated_context = " ".join([context["text"]
                                for context in random_retrieved_contexts])

In [None]:
concatenated_context

In [None]:
# Truncate only the context (second segment) so the question is kept intact
# and the pair never exceeds the model's position limit; without truncation
# the five concatenated contexts can overflow it and the forward pass fails.
# `model` is the question-answering model loaded in the cell above.
input_ids = tokenizer(
    random_question,
    concatenated_context,
    return_tensors="pt",
    truncation="only_second",
    max_length=model.config.max_position_embeddings,
)

In [None]:
import torch

In [None]:
with torch.no_grad():
    outputs = model(**input_ids)

In [None]:
answers_start_index = outputs.start_logits.argmax()
answers_end_index = outputs.end_logits.argmax()

In [None]:
predicted_answer_tokens = input_ids["input_ids"][0, answers_start_index:answers_end_index+1]

In [None]:
tokenizer.decode(predicted_answer_tokens)

In [None]:
test_question

In [None]:
labeled_dataset["train"][0]["long_answer"]

Not sure whether the model is working; I will come back here to check.

In [None]:
context_dataset_with_embeddings

### Trial with BioGPT

Let us now try to generate the answer with BioGPT, a generative model, for question answering.

In [None]:
# Load model directly
from transformers import AutoTokenizer, AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained("microsoft/biogpt")
model = AutoModelForCausalLM.from_pretrained("microsoft/biogpt")

In [None]:
# Number of tokens the model may add after the prompt.
max_new_tokens = 200
# NOTE: `input` shadows the builtin; the name is kept because later cells read it.
input = f"question: {random_question} context: {concatenated_context}"
# Truncate the prompt so that prompt + generated tokens stay within the model's
# position limit; truncating to the full limit would leave no room to generate.
encoded_input = tokenizer([input],
                          return_tensors='pt',
                          max_length=model.config.max_position_embeddings - max_new_tokens,
                          truncation=True)
output = model.generate(input_ids=encoded_input.input_ids,
                        attention_mask=encoded_input.attention_mask,
                        max_new_tokens=max_new_tokens)

In [None]:
output.shape

In [None]:
from pprint import pprint

In [None]:
# Decode only the continuation: for this decoder-only model `generate` returns
# the prompt ids followed by the new tokens, so slicing at the prompt length
# removes the prompt reliably. String replacement can miss it when the decoded
# prompt differs from the raw text (tokenizer normalisation, truncation).
prompt_length = encoded_input.input_ids.shape[1]
generated_text = tokenizer.decode(output[0][prompt_length:], skip_special_tokens=True)

In [None]:
input

In [None]:
generated_text = generated_text.replace(input, "")

In [None]:
pprint(generated_text)

In [None]:
pprint(random_answer)

- A few conclusions for the first part: we can see that the quality of the answers depends heavily on the quality of the retrieved paragraphs. As it stands now, we will deploy the project and expose it to users, and we will come back later to the evaluation and its improvement.

We can also conclude that splitting the text into shorter paragraphs has impacted the quality of the retrieved answers; we will need to find a better way to deal with that.

A few resources to consider:
- https://www.reddit.com/r/LocalLLaMA/comments/15mq1ri/what_are_the_text_chunkingsplitting_and_embedding/
- https://www.reddit.com/r/LangChain/comments/16m73j4/how_to_optimize_text_chunking_for_improved/
- https://towardsdatascience.com/how-to-chunk-text-data-a-comparative-analysis-3858c4a0997a



At this point, we can deploy the model in production but we will get back to it to improve the quality of embeddings. 

And we will use the gpt model to generate the embeddings.