In [1]:
from pathlib import Path

data_dir = Path("../data")
file = data_dir / "The-Gale-Encyclopedia-of-Medicine-3rd-Edition.pdf"

In [2]:
from langchain.document_loaders import PyPDFLoader, DirectoryLoader

def extract_text_from_pdf(path):
    loader = DirectoryLoader(
        path=path,
        glob="*.pdf",
        loader_cls=PyPDFLoader 
    )

    document = loader.load()

    return document

In [3]:
extracted_data = extract_text_from_pdf(data_dir)

In [4]:
extracted_data[1]

Document(metadata={'producer': 'PDFlib+PDI 6.0.3 (SunOS)', 'creator': 'Adobe Acrobat 6.0', 'creationdate': '2006-10-16T20:19:33+02:00', 'moddate': '2006-10-16T22:03:45+02:00', 'source': '../data/The-Gale-Encyclopedia-of-Medicine-3rd-Edition.pdf', 'total_pages': 4505, 'page': 1, 'page_label': 'ii'}, page_content='The GALE\nENCYCLOPEDIA of\nMEDICINE\nTHIRD EDITION')

In [5]:
# only needs source & page_content

from langchain.schema import Document


def filter_source_and_page_content(docs):
    filtered_docs: list[Document] = []

    for doc in docs:
        src = doc.metadata.get("source")
        filtered_docs.append(
            Document(
                metadata={"source": src},
                page_content=doc.page_content
            )
        )
    
    return filtered_docs

In [6]:
filtered_docs = filter_source_and_page_content(extracted_data)
filtered_docs[1]

Document(metadata={'source': '../data/The-Gale-Encyclopedia-of-Medicine-3rd-Edition.pdf'}, page_content='The GALE\nENCYCLOPEDIA of\nMEDICINE\nTHIRD EDITION')

In [14]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

def chunk_content(doc):
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=512,
        chunk_overlap=64,
        length_function=len
    )

    text_chunks = text_splitter.split_documents(doc)
    return text_chunks

In [15]:
text_chunks = chunk_content(filtered_docs)
len(text_chunks)

42695

In [16]:
import torch

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

In [22]:
from langchain_huggingface import HuggingFaceEmbeddings

def download_embedding():
    model_name = "sentence-transformers/all-MiniLM-L6-v2"
    embeddings = HuggingFaceEmbeddings(
        model_name=model_name,
        model_kwargs={"device": DEVICE}
    )

    return embeddings

In [23]:
embedding = download_embedding()

In [24]:
vector = embedding.embed_query("hello world")
len(vector)

384

In [27]:
from dotenv import load_dotenv
import os

load_dotenv()

True

In [28]:
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

os.environ["PINECONE_API_KEY"] = PINECONE_API_KEY
os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY

In [29]:
from pinecone import Pinecone

pc = Pinecone(api_key=PINECONE_API_KEY)

In [None]:
from pinecone import ServerlessSpec, CloudProvider, AwsRegion

index_name = "medbot"

if not pc.has_index(index_name):
    pc.create_index(name=index_name,
                    dimension=384,
                    metric="cosine",
                    spec=ServerlessSpec(
                        cloud=CloudProvider.AWS, 
                        region=AwsRegion.US_EAST_1 # pinecone free tier only have this region available
                    ) 
    )

index = pc.Index(index_name)

In [None]:
# add embeddings to index

# from langchain_pinecone import PineconeVectorStore

# doc_search = PineconeVectorStore.from_documents(
#     documents=text_chunks,
#     embedding=embedding,
#     index_name=index_name
# )

In [35]:
# load existing index

from langchain_pinecone import PineconeVectorStore

doc_search = PineconeVectorStore.from_existing_index(
    index_name=index_name,
    embedding=embedding
)

In [None]:
# add more data to existing index

lorem = Document(
    page_content="Lorem Ipsum is simply dummy text of the printing and typesetting industry. Lorem Ipsum has been the industry's standard dummy text ever since the 1500s, when an unknown printer took a galley of type and scrambled it to make a type specimen book. It has survived not only five centuries, but also the leap into electronic typesetting, remaining essentially unchanged. It was popularised in the 1960s with the release of Letraset sheets containing Lorem Ipsum passages, and more recently with desktop publishing software like Aldus PageMaker including versions of Lorem Ipsum.",
    metadata={"source": "Lipsum"}
)

# doc_search.add_documents(documents=[lorem,])

In [None]:
retriever = doc_search.as_retriever(search_type="similarity", search_kwargs={"k": 3}) # top 3

In [37]:
retrieved_docs = retriever.invoke("What is acne?")
retrieved_docs

[Document(id='f3ad989f-2264-4c87-a3ef-0322b15a8f6e', metadata={'source': '../data/The-Gale-Encyclopedia-of-Medicine-3rd-Edition.pdf'}, page_content='pimples on the face, chest, and back. It occurs when\nthe pores of the skin become clogged with oil, dead\nskin cells, and bacteria.\nDescription\nAcne vulgaris, the medical term for common acne,\nis the most common skin disease. It affects nearly 17\nmillion people in the United States. While acne can\narise at any age, it usually begins atpuberty and wor-\nsens during adolescence. Nearly 85% of people\ndevelop acne at some time between the ages of 12-25\nyears. Up to 20% of women develop mild acne. It is'),
 Document(id='52392759-da58-4e47-b9cf-24b6fbb97f57', metadata={'source': '../data/The-Gale-Encyclopedia-of-Medicine-3rd-Edition.pdf'}, page_content='turnover (death and replacement) of skin cells.\nAcne vulgaris affecting a woman’s face. Acne is the general\nname given to a skin disorder in which the sebaceous glands\nbecome inflamed.

In [44]:
from langchain_openai import ChatOpenAI

chat_model = ChatOpenAI(model="gpt-oss-20b") # gpt-4o

In [45]:
from langchain_core.prompts import ChatPromptTemplate

system_prompt = (
    "You are a medical assistant for question-answering tasks."
    "Use the following pieces of retrieved context to answer "
    "the question. If you don't know the answer, say that you "
    "don't know. Make you answer concise by having maximum of "
    "three sentences."
    "\n\n"
    "{context}"
)

prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system_prompt),
        ("human", "{input}"),
    ]
)

In [46]:
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.chains import create_retrieval_chain

question_answer_chain = create_stuff_documents_chain(chat_model, prompt)
rag_chain = create_retrieval_chain(retriever, question_answer_chain)

In [47]:
response = rag_chain.invoke({"input": "Tips on preventing acne."})
print(response["answer"])

NotFoundError: Error code: 404 - {'error': {'message': 'The model `gpt-oss-20b` does not exist or you do not have access to it.', 'type': 'invalid_request_error', 'param': None, 'code': 'model_not_found'}}