In [1]:
from pathlib import Path

In [2]:
from datasets import load_from_disk

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
dataset_path = Path.cwd().joinpath("datasets", "embeddings_pubmed_qa")

In [4]:
context_dataset_with_embeddings = load_from_disk(dataset_path)

In [5]:
context_dataset_with_embeddings.info

DatasetInfo(description='', citation='', homepage='', license='', features={'context': Value(dtype='string', id=None), 'embedding': Sequence(feature=Value(dtype='float32', id=None), length=-1, id=None)}, post_processed=None, supervised_keys=None, task_templates=None, builder_name=None, dataset_name=None, config_name=None, version=None, splits=None, download_checksums=None, download_size=None, post_processing_size=None, dataset_size=None, size_in_bytes=None)

#### Connecting to PostgreSQL

In [2]:
from os import getenv
from dotenv import load_dotenv, find_dotenv

In [3]:
from urllib.parse import quote

In [4]:
load_dotenv()
database_user = getenv('POSTGRES_USER')
database_password = getenv('POSTGRES_PASSWORD')
database_host = getenv('POSTGRES_HOST')
database_port = getenv('POSTGRES_PORT')
database_name = getenv('POSTGRES_DB')


In [5]:
from psycopg2 import connect
from pgvector.psycopg2 import register_vector

In [10]:
database_connection = connect(
    user=database_user,
    password=database_password,
    host=database_host,
    port=database_port,
    database=database_name
)

In [11]:
database_connection.set_session(autocommit=True)

In [12]:
cursor = database_connection.cursor()
cursor.execute("CREATE EXTENSION IF NOT EXISTS vector")

In [13]:
register_vector(database_connection)

In [14]:
cursor.execute("DROP TABLE IF EXISTS pubmed_qa")

In [15]:
database_creation_string ="""
    CREATE TABLE pubmed_qa (id bigserial PRIMARY KEY,context TEXT, context_vector VECTOR(1024)
    );"""

In [16]:
database_creation_string

'\n    CREATE TABLE pubmed_qa (id bigserial PRIMARY KEY,context TEXT, context_vector VECTOR(1024)\n    );'

In [17]:
cursor.execute(database_creation_string)
    

With our database connection, our dataset with embedding, let now insert the embedding and then text into the database.

In [18]:
context_dataset_with_embeddings

Dataset({
    features: ['context', 'embedding'],
    num_rows: 206613
})

In [19]:
from psycopg2.extras import execute_values      

In [20]:
context_dataset_with_embeddings

Dataset({
    features: ['context', 'embedding'],
    num_rows: 206613
})

In [21]:
def save_batch_to_database(batch):
    """insert batch into database

    Args:
        batch (_type_): _description_
    """
    embeddings = batch["embedding"]
    contexts = batch["context"]
    execute_values(
    cur=cursor, sql="INSERT INTO pubmed_qa (context, context_vector) VALUES %s",
    argslist=zip(contexts, embeddings)
)
    

In [22]:
context_dataset_with_embeddings.map(save_batch_to_database, batched=True, batch_size=1000)



Map: 100%|██████████| 206613/206613 [07:05<00:00, 485.73 examples/s]


Dataset({
    features: ['context', 'embedding'],
    num_rows: 206613
})

At this point we have the embedding saved in the postgres, the next step will be to build the embeddings

cursor.close()

In [23]:
database_connection.close()
cursor.close()

### Retrieval Part

With our vector and context saved in the database we will move to the next step of our RAG application, the text retrieval.
We will use the questions embeddings, and query the postgres database to find the cosine similarity with the context embeddings, and then return the top 5 context related to the question. 

In [1]:
import numpy as np

In [7]:
from sentence_transformers import SentenceTransformer

  from .autonotebook import tqdm as notebook_tqdm


In [8]:
from typing import List, Dict

In [9]:
def semantic_search(query: str, embedding_model: SentenceTransformer, k: int) -> List[Dict]:
    """
    Preform semantic search on the database.
    Given the query, return the top k relevant documents from the database.

    Args:
        query (str): _description_
        embedding_model (SentenceTransformer): _description_
        k (int): _description_

    Returns:
        _type_: _description_
    """
    embedding = np.array(embedding_model.encode(query))
    with connect(
        user=database_user,
        password=database_password,
        host=database_host,
        port=database_port,
        database=database_name
    ) as conn:
        register_vector(conn)
        with conn.cursor() as cur:
            cur.execute(
                f"SELECT context, context_vector FROM pubmed_qa ORDER BY context_vector <=> %s LIMIT %s", (embedding, k),)
            rows = cur.fetchall()
            semantic_context = [
                {"text": row[0], "source": row[1][:10]} for row in rows]
    return semantic_context

Let us now download the question dataset and perform the query again.

In [11]:
from datasets import load_dataset

In [12]:
dataset_id = "pubmed_qa"

In [13]:
unlabeled_dataset = load_dataset(dataset_id,  "pqa_unlabeled")
labeled_dataset = load_dataset(dataset_id,  "pqa_labeled")

In [30]:
unlabeled_dataset

DatasetDict({
    train: Dataset({
        features: ['pubid', 'question', 'context', 'long_answer'],
        num_rows: 61249
    })
})

In [14]:
test_question = labeled_dataset["train"][0]["question"]

In [15]:
embedding_model_name = 'michiyasunaga/BioLinkBERT-large'

In [16]:
embedding_model = SentenceTransformer(embedding_model_name)

No sentence-transformers model found with name /Users/esp.py/.cache/torch/sentence_transformers/michiyasunaga_BioLinkBERT-large. Creating a new one with MEAN pooling.


In [17]:
test_question

'Do mitochondria play a role in remodelling lace plant leaves during programmed cell death?'

In [18]:
labeled_dataset["train"][0]["context"]

{'contexts': ['Programmed cell death (PCD) is the regulated death of cells within an organism. The lace plant (Aponogeton madagascariensis) produces perforations in its leaves through PCD. The leaves of the plant consist of a latticework of longitudinal and transverse veins enclosing areoles. PCD occurs in the cells at the center of these areoles and progresses outwards, stopping approximately five cells from the vasculature. The role of mitochondria during PCD has been recognized in animals; however, it has been less studied during PCD in plants.',
  'The following paper elucidates the role of mitochondrial dynamics during developmentally regulated PCD in vivo in A. madagascariensis. A single areole within a window stage leaf (PCD is occurring) was divided into three areas based on the progression of PCD; cells that will not undergo PCD (NPCD), cells in early stages of PCD (EPCD), and cells in late stages of PCD (LPCD). Window stage leaves were stained with the mitochondrial dye MitoT

In [19]:
contexts = semantic_search(test_question, embedding_model, 5)

At first glance we can see that, with our sample question we are able to find the answer context in the top 5 context, we a evaluation need to be performed on the whole dataset to check how our retrieval system is working. 

Nevertheless, let continue with our work and perform the response generation.

In [20]:
random_index = np.random.randint(0, len(unlabeled_dataset["train"]))

In [21]:
random_index

37027

In [22]:
random_question = unlabeled_dataset["train"][random_index]["question"]

In [23]:
random_question

'Does a novel penalized likelihood reconstruction of 18F-FDG PET-CT improve signal-to-background in colorectal liver metastases?'

In [24]:
random_retrieved_contexts = semantic_search(random_question, embedding_model, 5)

In [25]:
from pprint import pprint

In [26]:
random_answer = unlabeled_dataset["train"][random_index]["long_answer"]
pprint(random_answer)

('This BPL reconstruction algorithm improved SNR and SBR for colorectal liver '
 'metastases detected by 18F-FDG-PET/CT, increasing the lesion SUVmax without '
 'increasing background liver SUV or image noise. This may improve the '
 'detection of FDG-avid focal liver lesions and the diagnostic performance of '
 'clinical 18F-FDG-PET/CT in this setting, with the largest impact for small '
 'foci.')


In [27]:
random_gold_contexts = unlabeled_dataset["train"][random_index]["context"]

In [28]:
random_gold_contexts["contexts"]

['Iterative reconstruction algorithms are widely used to reconstruct positron emission tomography computerised tomography (PET/CT) data. Lesion detection in the liver by 18F-fluorodeoxyglucose PET/CT (18F-FDG-PET/CT) is hindered by 18F-FDG uptake in background liver parenchyma. The aim of this study was to compare semi-quantitative parameters of histologically-proven colorectal liver metastases detected by 18F-FDG-PET/CT using data based on a Bayesian penalised likelihood (BPL) reconstruction, with data based on a conventional time-of-flight (ToF) ordered subsets expectation maximisation (OSEM) reconstruction.',
 "A BPL reconstruction algorithm was used to retrospectively reconstruct sinogram PET data. This data was compared with OSEM reconstructions. A volume of interest was placed within normal background liver parenchyma. Lesions were segmented using automated thresholding. Lesion maximum standardised uptake value (SUVmax), standard deviation of background liver parenchyma SUV, sign

In [29]:
for context in random_retrieved_contexts:
    pprint(context.get("text"))
    print("**" * 10)

('Iterative reconstruction algorithms are widely used to reconstruct positron '
 'emission tomography computerised tomography (PET/CT) data. Lesion detection '
 'in the liver by 18F-fluorodeoxyglucose PET/CT (18F-FDG-PET/CT) is hindered '
 'by 18F-FDG uptake in background liver parenchyma. The aim of this study was '
 'to compare semi-quantitative parameters of histologically-proven colorectal '
 'liver metastases detected by 18F-FDG-PET/CT using data based on a Bayesian '
 'penalised likelihood (BPL) reconstruction, with data based on a conventional '
 'time-of-flight (ToF) ordered subsets expectation maximisation (OSEM) '
 'reconstruction.')
********************
('The aim of this study is to evaluate the quality of I-124 PET images with '
 'and without prompt gamma compensation (PGC) by comparing the recovery '
 'coefficients (RC), the signal to noise ratios (SNR) and the contrast to F-18 '
 'and Ga-68. Furthermore, the influence of the PGC on the quantification and '
 'image quality

### The response generation part

The question answering system, we will use the language  model that have been trained on the pub med qa! The model is called BiomedGPT.

In [30]:
from transformers import AutoTokenizer, AutoModelForQuestionAnswering

tokenizer = AutoTokenizer.from_pretrained("michiyasunaga/BioLinkBERT-large")
model = AutoModelForQuestionAnswering.from_pretrained("michiyasunaga/BioLinkBERT-large")

Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at michiyasunaga/BioLinkBERT-large and are newly initialized: ['qa_outputs.weight', 'qa_outputs.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [31]:
concatenated_context = " ".join([context["text"]
                                for context in random_retrieved_contexts])

In [32]:
concatenated_context

'Iterative reconstruction algorithms are widely used to reconstruct positron emission tomography computerised tomography (PET/CT) data. Lesion detection in the liver by 18F-fluorodeoxyglucose PET/CT (18F-FDG-PET/CT) is hindered by 18F-FDG uptake in background liver parenchyma. The aim of this study was to compare semi-quantitative parameters of histologically-proven colorectal liver metastases detected by 18F-FDG-PET/CT using data based on a Bayesian penalised likelihood (BPL) reconstruction, with data based on a conventional time-of-flight (ToF) ordered subsets expectation maximisation (OSEM) reconstruction. The aim of this study is to evaluate the quality of I-124 PET images with and without prompt gamma compensation (PGC) by comparing the recovery coefficients (RC), the signal to noise ratios (SNR) and the contrast to F-18 and Ga-68. Furthermore, the influence of the PGC on the quantification and image quality is evaluated. The current perception of using contrast-enhanced CT (CECT)

In [33]:
input_ids = tokenizer(random_question, concatenated_context,  return_tensors="pt")

In [34]:
import torch

In [35]:
with torch.no_grad():
    outputs = model(**input_ids)

In [36]:
answers_start_index = outputs.start_logits.argmax()
answers_end_index = outputs.end_logits.argmax()

In [37]:
predicted_answer_tokens = input_ids["input_ids"][0, answers_start_index:answers_end_index+1]

In [38]:
tokenizer.decode(predicted_answer_tokens)

'f - 18 and ga -'

In [39]:
test_question

'Do mitochondria play a role in remodelling lace plant leaves during programmed cell death?'

In [40]:
labeled_dataset["train"][0]["long_answer"]

'Results depicted mitochondrial dynamics in vivo as PCD progresses within the lace plant, and highlight the correlation of this organelle with other organelles during developmental PCD. To the best of our knowledge, this is the first report of mitochondria and chloroplasts moving on transvacuolar strands to form a ring structure surrounding the nucleus during developmental PCD. Also, for the first time, we have shown the feasibility for the use of CsA in a whole plant system. Overall, our findings implicate the mitochondria as playing a critical and early role in developmentally regulated PCD in the lace plant.'

Not sure if the model is working, but I will come back here to check if the model was working.. 

In [99]:
context_dataset_with_embeddings

Dataset({
    features: ['context', 'embedding'],
    num_rows: 206613
})

### Trial with BioGPT

Let us now try to generate the answer with the GPT model which is a generative model for question answering

In [41]:
# Load model directly
from transformers import AutoTokenizer, AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained("microsoft/biogpt")
model = AutoModelForCausalLM.from_pretrained("microsoft/biogpt")

In [50]:
input = f"question: {random_question} context: {concatenated_context}"
encoded_input = tokenizer([input],
                          return_tensors='pt',
                          max_length=1024,
                          truncation=True)
output = model.generate(input_ids=encoded_input.input_ids,
                        attention_mask=encoded_input.attention_mask, 
                        max_new_tokens=200)

In [51]:
output.shape

torch.Size([1, 627])

In [52]:
from pprint import pprint

In [53]:
generated_text = tokenizer.decode(output[0], skip_special_tokens=True)

In [48]:
input

'question: Does a novel penalized likelihood reconstruction of 18F-FDG PET-CT improve signal-to-background in colorectal liver metastases? context: Iterative reconstruction algorithms are widely used to reconstruct positron emission tomography computerised tomography (PET/CT) data. Lesion detection in the liver by 18F-fluorodeoxyglucose PET/CT (18F-FDG-PET/CT) is hindered by 18F-FDG uptake in background liver parenchyma. The aim of this study was to compare semi-quantitative parameters of histologically-proven colorectal liver metastases detected by 18F-FDG-PET/CT using data based on a Bayesian penalised likelihood (BPL) reconstruction, with data based on a conventional time-of-flight (ToF) ordered subsets expectation maximisation (OSEM) reconstruction. The aim of this study is to evaluate the quality of I-124 PET images with and without prompt gamma compensation (PGC) by comparing the recovery coefficients (RC), the signal to noise ratios (SNR) and the contrast to F-18 and Ga-68. Furt

In [54]:
generated_text = generated_text.replace(input, "")

In [55]:
pprint(generated_text)

('question: Does a novel penalized likelihood reconstruction of 18F-FDG PET-CT '
 'improve signal-to-background in colorectal liver metastases? context: '
 'Iterative reconstruction algorithms are widely used to reconstruct positron '
 'emission tomography computerised tomography (PET / CT) data. Lesion '
 'detection in the liver by 18F-fluorodeoxyglucose PET / CT (18F-FDG-PET / CT) '
 'is hindered by 18F-FDG uptake in background liver parenchyma. The aim of '
 'this study was to compare semi-quantitative parameters of '
 'histologically-proven colorectal liver metastases detected by 18F-FDG-PET / '
 'CT using data based on a Bayesian penalised likelihood (BPL) reconstruction, '
 'with data based on a conventional time-of-flight (ToF) ordered subsets '
 'expectation maximisation (OSEM) reconstruction. The aim of this study is to '
 'evaluate the quality of I-124 PET images with and without prompt gamma '
 'compensation (PGC) by comparing the recovery coefficients (RC), the signal '
 't

In [49]:
pprint(random_answer)

('This BPL reconstruction algorithm improved SNR and SBR for colorectal liver '
 'metastases detected by 18F-FDG-PET/CT, increasing the lesion SUVmax without '
 'increasing background liver SUV or image noise. This may improve the '
 'detection of FDG-avid focal liver lesions and the diagnostic performance of '
 'clinical 18F-FDG-PET/CT in this setting, with the largest impact for small '
 'foci.')


- Few conclusion for the first part, we can see that the quality of the answers depend widelly on  the quality of the retrieved paragraphs. As it stand now, we will deploy the project and expose it to users and we will later come back on the evaluation and it's improvements.

We can also conclude that splitting the text in shorter paragraphs have impacted the quality of the retrieved answers, we will need to find  a better way to deal with that.

Few resources to consider: 
- https://www.reddit.com/r/LocalLLaMA/comments/15mq1ri/what_are_the_text_chunkingsplitting_and_embedding/
- https://www.reddit.com/r/LangChain/comments/16m73j4/how_to_optimize_text_chunking_for_improved/
- https://towardsdatascience.com/how-to-chunk-text-data-a-comparative-analysis-3858c4a0997a



At this point, we can deploy the model in production but we will get back to it to improve the quality of embeddings. 

And we will use the gpt model to generate the embeddings.