# PubMed RAG Version 0.01

This notebook implements a Retrieval-Augmented Generation (RAG) pipeline for PubMed data.

In [None]:
# Code implementation
! pip install langchain_community tiktoken langchain-openai langchainhub chromadb langchain

## 1. Importing Necessary Libraries

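In this section, we import the required Python libraries, enable LangChain tracing, and load the LangChain and OpenAI API keys from Colab secrets into environment variables.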

In [None]:
# Import necessary libraries
import os
from google.colab import userdata
import bs4
from langchain import hub
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import WebBaseLoader
from langchain_community.vectorstores import Chroma
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_openai import ChatOpenAI, OpenAIEmbeddings


# Enable LangChain tracing and set the tracing endpoint.
os.environ.update({
    'LANGCHAIN_TRACING_V2': 'true',
    'LANGCHAIN_ENDPOINT': 'https://api.smith.langchain.com',
})

# Look up both API keys through Colab's `userdata` helper first, then export
# them as environment variables (LangChain key first, OpenAI key second).
secret_langchain_key_value = userdata.get('LANGCHAIN_API_KEY')
secret_openai_key_value = userdata.get('OPENAI_API_KEY')
os.environ.update(
    LANGCHAIN_API_KEY=secret_langchain_key_value,
    OPENAI_API_KEY=secret_openai_key_value,
)



## 2. Loading and Preprocessing Data

This section streams the MedRAG/pubmed dataset and takes a 100-record sample (records 501–600 of the training split) for further analysis.

In [None]:
# Import necessary libraries
!pip install datasets
from datasets import load_dataset

# Enable streaming to avoid full dataset download
dataset = load_dataset("MedRAG/pubmed", split="train", streaming=True)

# Take only the first 100 samples (or any number)
subset = dataset.skip(500).take(100)  # Skip first 500, then take 100

# Convert to list for easy access (optional)
subset_list = list(subset)

# Print first sample
print(subset_list[0])

## 3. Embedding Generation

We save the sampled records to a CSV file and download it. The embeddings used for similarity search are generated in Section 5, when the vector store is built.

In [None]:
import pandas as pd

# Tabulate the sampled records and write them to a CSV file without the row index.
csv_name = 'pubmed_subset.csv'
df = pd.DataFrame(subset_list)
df.to_csv(csv_name, index=False)

# Trigger a download of the CSV through Colab's `files` helper.
from google.colab import files
files.download(csv_name)

## 4. Retrieval Mechanism

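Here, we prepare the data for retrieval by splitting each document into overlapping, token-based chunks that carry their source metadata. The chunks are indexed in a Chroma vector store in the next section.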

In [None]:
# Copy the fields of interest from each sampled record into a plain dict.
documents = [
    {
        "id": item["id"],
        "title": item["title"],
        "content": item["content"],
        "contents": item["contents"],
        "PMID": item["PMID"]
    }
    for item in subset_list
]

from langchain.text_splitter import RecursiveCharacterTextSplitter

# Initialize the text splitter via its tiktoken-encoder constructor.
text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
    chunk_size=300,
    chunk_overlap=50
)

# Build one entry per chunk of each document's "content" field. Every entry gets
# its own metadata dict identifying the document the chunk came from.
chroma_docs = [
    {
        "text": chunk,
        "metadata": {"id": doc["id"], "title": doc["title"], "PMID": doc["PMID"]},
    }
    for doc in documents
    for chunk in text_splitter.split_text(doc["content"])
]

# Verify results. The sample print is guarded because indexing an empty list
# would raise IndexError when no chunks were produced.
print(f"Total Chunks: {len(chroma_docs)}")
if chroma_docs:
    print(f"Sample Chunk: {chroma_docs[0]}")
else:
    print("Sample Chunk: <none - no chunks were produced>")

## 5. RAG Model Implementation

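This section embeds the chunks with OpenAI embeddings, stores them in a Chroma vector store, and creates a retriever over that store. Generating responses with a language model is not part of this cell.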

In [None]:
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import Chroma

# Separate the chunk entries into parallel lists of texts and metadata dicts.
chunk_texts = [entry["text"] for entry in chroma_docs]
chunk_metadatas = [entry["metadata"] for entry in chroma_docs]

# Build the Chroma store from the chunk texts using OpenAI embeddings.
embedding_model = OpenAIEmbeddings()
vectorstore = Chroma.from_texts(
    texts=chunk_texts,
    metadatas=chunk_metadatas,
    embedding=embedding_model,
)

# Retriever interface over the store, used by the query cell.
retriever = vectorstore.as_retriever()

## 6. Evaluation and Testing

We test the retrieval step by running a set of sample queries and printing the title, PMID, and text of each retrieved chunk.

In [None]:
# Sample queries used to spot-check what the retriever returns.
queries = [
    "Lysosomal hydrolases of the epidermis",
    "Micellar solubilization of fatty acids",
    "Influence of phospholipolysis on solubility",
    "Purification and characterization of folate binding proteins",
    "Effects of anaerobic bacteria in wound healing"
]

for query in queries:
    print(f"Query: {query}\n")
    # Use `invoke`, the supported retriever entry point; `get_relevant_documents`
    # is deprecated in langchain-core and the install above is unpinned.
    results = retriever.invoke(query)

    # Show the source title, PMID, and at most the first 500 characters of each hit.
    for r in results:
        print(f"Title: {r.metadata['title']}\nPMID: {r.metadata['PMID']}\nText: {r.page_content[:500]}\n---")
    print("\n" + "="*80 + "\n")