# Retrieval Augmented Generation for Clinical Trials

An implementation of retrieval-augmented generation (RAG) over a sample clinical-trial dataset, using OpenAI's GPT-3.5 chat model.

## Installation

In [1]:
!pip install langchain_community langchain_openai 'openai==1.55.3' nltk 'chromadb==0.4.14'



In [2]:
import os
from dotenv import load_dotenv
from langchain_community.document_loaders import DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import Chroma
import openai
import shutil

from langchain.evaluation import load_evaluator
from langchain_openai import ChatOpenAI
from langchain.prompts import ChatPromptTemplate

from chromadb.config import Settings
import chromadb

# Download the NLTK "punkt" tokenizer data.
# NOTE(review): presumably needed by the document loader's markdown parsing — TODO confirm.
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

## Data processing

In [3]:
# Read the OpenAI key from the environment (or a .env file picked up by
# load_dotenv) instead of hardcoding it in the notebook, where it would be
# saved alongside the code and would overwrite any key already configured.
load_dotenv()
if not os.environ.get("OPENAI_API_KEY"):
    raise RuntimeError(
        "OPENAI_API_KEY is not set. Define it in the environment or in a .env "
        "file before running this notebook."
    )
openai.api_key = os.environ["OPENAI_API_KEY"]

# Folder containing the source markdown files to index.
DATA_PATH = "/content/drive/MyDrive/dataset(1)"

# Folder where the Chroma index is stored.
CHROMA_PATH = "/content/drive/MyDrive/chromafoldder"
# Pure-Python equivalent of `mkdir -p`; no shell interpolation involved.
os.makedirs(CHROMA_PATH, exist_ok=True)

In [4]:
def generate_data_store():
  """Build the vector store end to end: load the documents, chunk them, and save the chunks to Chroma."""
  save_to_chroma(split_text(load_documents()))

def load_documents():
  """Load the files matching `*.md` under DATA_PATH and return the loaded documents."""
  return DirectoryLoader(DATA_PATH, glob="*.md").load()

In [5]:
def split_text(documents: list[Document]):
  """Split the loaded documents into overlapping chunks and print the counts.

  The splitter is configured with a chunk size of 300 and an overlap of 100,
  both measured with `len`, and with `add_start_index=True`.

  Returns the list of chunks produced by the splitter.
  """
  splitter_options = dict(
      chunk_size=300,
      chunk_overlap=100,
      length_function=len,
      add_start_index=True,
  )
  pieces = RecursiveCharacterTextSplitter(**splitter_options).split_documents(documents)
  print(f"Split {len(documents)} documents into {len(pieces)} chunks.")
  return pieces

def save_to_chroma(chunks: list[Document]):
  """Embed `chunks` with OpenAI embeddings and store them in a fresh Chroma index.

  Any existing directory at CHROMA_PATH is deleted first, so every call
  rebuilds the index from scratch rather than appending to an old one.
  """
  if os.path.exists(CHROMA_PATH):
      shutil.rmtree(CHROMA_PATH)
  # Passing persist_directory is sufficient: with the chromadb 0.4.x release
  # pinned in this notebook, data is written to disk automatically, and the
  # explicit persist() call is deprecated (it only emitted a warning).
  Chroma.from_documents(
      chunks, OpenAIEmbeddings(), persist_directory=CHROMA_PATH
  )
  print(f"Saved {len(chunks)} chunks to {CHROMA_PATH}.")

In [6]:
# Build the index: load the markdown files, split them into chunks, and save the chunks to Chroma.
generate_data_store()

Split 1 documents into 17 chunks.
Saved 17 chunks to /content/drive/MyDrive/chromafoldder.


  db.persist()


## Prompt

In [7]:
# Prompt sent to the chat model. The `{context}` and `{question}` placeholders
# are filled in with the retrieved chunk text and the user's query.
PROMPT_TEMPLATE = """
Answer the question based only on the following context:

{context}

---

Answer the question based on the above context: {question}
"""

In [8]:
# Question to answer from the indexed trial data.
query_text = "Whats NCT-002? "

## Retrieving by embedding similarity and generating the answer

In [9]:
# Retrieve the most relevant chunk(s) for `query_text` from the Chroma index
# and ask the chat model to answer using only that context.

# Number of chunks to retrieve.
# NOTE(review): with k=1 the recorded run retrieved the NCT-003 chunk for an
# NCT-002 question; consider raising this so more context reaches the model.
TOP_K = 1
# Minimum relevance score the best hit must reach to be used as context.
MIN_RELEVANCE = 0.1

embedding_function = OpenAIEmbeddings()
db = Chroma(persist_directory=CHROMA_PATH, embedding_function=embedding_function)

results = db.similarity_search_with_relevance_scores(query_text, k=TOP_K)
if not results or results[0][1] < MIN_RELEVANCE:
    # Stop here: without usable context there is nothing to ground an answer
    # on, so do not build a prompt or spend a model call.
    print("Unable to find matching results.")
else:
    context_text = "\n\n---\n\n".join(doc.page_content for doc, _score in results)
    prompt_template = ChatPromptTemplate.from_template(PROMPT_TEMPLATE)
    prompt = prompt_template.format(context=context_text, question=query_text)
    print(prompt)

    model = ChatOpenAI()
    # `invoke` replaces the deprecated `predict`; it returns a message object
    # whose `.content` holds the reply text.
    response_text = model.invoke(prompt).content

    sources = [doc.metadata.get("source", None) for doc, _score in results]
    formatted_response = f"Response: {response_text}\nSources: {sources}"
    print(formatted_response)

Human: 
Answer the question based only on the following context:

Trial ID: NCT-003
Title: Mindfulness Therapy for Anxiety Disorders
Phase: IV
Status: Active
Conditions: Generalized Anxiety Disorder
Interventions: Behavioral Therapy
Sponsor: University of Toronto
Start Date: 03/2021
End Date: 03/2024
Enrollment: 300

---

Answer the question based on the above context: Whats NCT-002? 



  response_text = model.predict(prompt)


Response: Sorry, I cannot provide information on NCT-002 as it is not included in the context provided.
Sources: ['/content/drive/MyDrive/dataset(1)/Trial ID: NCT-001.md']
