In [None]:
#print("Test")

In [None]:
# import os
# os.chdir("OneDrive - Carleton University\Medical-Chatbot")
# %pwd

In [None]:
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [None]:
# extract text from pdfs
def load_pdf_files(data):
    """Load the PDF files found under ``data`` and return the loaded documents.

    Args:
        data: Path of the directory handed to ``DirectoryLoader``; only files
            matching the ``*.pdf`` glob are picked up.

    Returns:
        Whatever ``DirectoryLoader.load()`` yields when each matched file is
        read with ``PyPDFLoader``.
    """
    pdf_loader = DirectoryLoader(data, glob="*.pdf", loader_cls=PyPDFLoader)
    return pdf_loader.load()

In [None]:
# Load every file matching *.pdf from the relative "data" directory.
extracted_data = load_pdf_files("data")

In [None]:
# Preview only the first few documents; echoing the whole list floods the notebook output.
extracted_data[:3]

In [None]:
# Number of loaded documents (presumably one per PDF page with PyPDFLoader — confirm).
len(extracted_data)

In [None]:
# filter function
from typing import List
from langchain.schema import Document

def filter_to_minimal_docs(docs: List[Document]) -> List[Document]:
    """Return copies of ``docs`` that keep only the text and the ``source`` metadata.

    Args:
        docs: Documents whose ``page_content`` and ``metadata`` are read.

    Returns:
        A new list, in the same order, of ``Document`` objects whose metadata
        contains a single ``"source"`` key (``None`` when the original
        document had no such entry).
    """
    return [
        Document(
            page_content=original.page_content,
            # .get() leaves the value as None when "source" is absent.
            metadata={"source": original.metadata.get("source")},
        )
        for original in docs
    ]

In [None]:
# Strip each loaded document down to its text and source path.
minimal_docs = filter_to_minimal_docs(extracted_data)
# Preview only the first few entries; echoing the whole list floods the notebook output.
minimal_docs[:3]

In [None]:
# chunking the documents
def chunk_docs(minimal_docs, chunk_size=500, chunk_overlap=20):
    """Split documents into overlapping chunks for embedding.

    Args:
        minimal_docs: Documents to split.
        chunk_size: Maximum chunk length in characters (default 500, the value
            that was previously hard-coded).
        chunk_overlap: Number of characters shared between consecutive chunks
            (default 20, the value that was previously hard-coded).

    Returns:
        The chunks produced by ``RecursiveCharacterTextSplitter.split_documents``.
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return splitter.split_documents(minimal_docs)

In [None]:
# Split the trimmed documents and report how many chunks were produced.
chunks = chunk_docs(minimal_docs)
print("Num of chunks:", len(chunks))

In [None]:
# Preview only the first few chunks; echoing the whole list floods the notebook output.
chunks[:3]

In [5]:
# embedding model
from langchain.embeddings import HuggingFaceEmbeddings

def create_embedding_model():
    """Build the HuggingFace embedding wrapper used throughout the notebook.

    Returns:
        A ``HuggingFaceEmbeddings`` instance configured for the
        ``sentence-transformers/all-MiniLM-L6-v2`` model.
    """
    return HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-MiniLM-L6-v2",
    )


# Single shared embedding model reused by the cells below.
embedding = create_embedding_model()

  embeddings = HuggingFaceEmbeddings(
  from .autonotebook import tqdm as notebook_tqdm


In [6]:
# Show the model's repr to confirm which model and settings were loaded.
embedding

HuggingFaceEmbeddings(client=SentenceTransformer(
  (0): Transformer({'max_seq_length': 256, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Normalize()
), model_name='sentence-transformers/all-MiniLM-L6-v2', cache_folder=None, model_kwargs={}, encode_kwargs={}, multi_process=False, show_progress=False)

In [7]:
# test the embedding model
# Embed one sentence as a smoke test; the result is a plain list of floats.
vector = embedding.embed_query("This is a test sentence.")
vector

[0.08429646492004395,
 0.05795375257730484,
 0.004493334796279669,
 0.10582110285758972,
 0.007083463482558727,
 -0.017844678834080696,
 -0.016888096928596497,
 -0.015228349715471268,
 0.040473055094480515,
 0.03342251107096672,
 0.10432768613100052,
 -0.047035831958055496,
 0.006884740665555,
 0.04101800173521042,
 0.01871195249259472,
 -0.04149232804775238,
 0.023647429421544075,
 -0.05650181323289871,
 -0.033696215599775314,
 0.05099101364612579,
 0.06930329650640488,
 0.054784249514341354,
 -0.009788368828594685,
 0.023697199299931526,
 0.019996512681245804,
 0.009717293083667755,
 -0.05889919772744179,
 0.007307454943656921,
 0.04702645167708397,
 -0.004510104190558195,
 -0.055799711495637894,
 -0.004159437958151102,
 0.06475706398487091,
 0.04807630926370621,
 0.017020801082253456,
 -0.0031833855900913477,
 0.05740238353610039,
 0.03523186966776848,
 -0.0058838683180511,
 0.014832884073257446,
 0.011576333083212376,
 -0.10748078674077988,
 0.01910419389605522,
 0.0220857169479131

In [8]:
# Dimensionality of the embedding; the vector index used with this model should be
# created with the same dimension.
print("Length of embedding vector:", len(vector))

Length of embedding vector: 384


In [9]:
from dotenv import load_dotenv
import os
load_dotenv()  # read variables from a local .env file (if present) into os.environ

# Fail fast with a readable message when a key is absent. Previously a missing
# key made os.getenv return None, and assigning None into os.environ raised an
# opaque "TypeError: str expected, not NoneType".
_missing_keys = [
    name for name in ("PINECONE_API_KEY", "OPENAI_API_KEY") if not os.getenv(name)
]
if _missing_keys:
    raise EnvironmentError(
        "Missing required environment variable(s): "
        + ", ".join(_missing_keys)
        + ". Define them in the environment or in a .env file."
    )

# The values already live in os.environ (that is where os.getenv reads them),
# so writing them back is unnecessary; just expose them as module-level names.
PINECONE_API_KEY = os.environ["PINECONE_API_KEY"]
OPENAI_API_KEY = os.environ["OPENAI_API_KEY"]

In [10]:
from pinecone import Pinecone

# Build the Pinecone client from the API key read from the environment earlier in the notebook.
pinecone_api_key = PINECONE_API_KEY
pc = Pinecone(api_key=pinecone_api_key) # initialize pinecone client

In [11]:
# Display the client object to confirm it was constructed.
pc

<pinecone.pinecone.Pinecone at 0x1e5ffcf7c40>

In [12]:
# Name of the Pinecone index used by the rest of the notebook.
index_name = "medical-chatbot"
# Handle to that index. NOTE(review): presumably the index must already exist,
# since the create-index cell is commented out — confirm.
index = pc.Index(index_name)

In [13]:
# # create pinecone index
# from pinecone import ServerlessSpec

# index_name = "medical-chatbot"

# if not pc.has_index(index_name):
#     pc.create_index(
#         name=index_name,
#         dimension=384, # must match the embedding model's output size (384 for all-MiniLM-L6-v2)
#         metric="cosine",
#         spec=ServerlessSpec(cloud="aws", region="us-east-1")
#     )

# index = pc.Index(index_name)

In [14]:
# use langchain pinecone to create vector store
# from langchain_pinecone import PineconeVectorStore

# docsearch = PineconeVectorStore.from_documents(
#     documents=chunks,
#     embedding=embedding,
#     index_name=index_name
# )

In [15]:
# load existing pinecone index
from langchain_pinecone import PineconeVectorStore

# Connect to the index by name rather than re-uploading the chunks
# (the from_documents cell is commented out).
docsearch = PineconeVectorStore.from_existing_index(
    embedding=embedding,
    index_name=index_name
)

In [16]:
# Similarity-search retriever over the vector store; k=3 asks for the top 3
# most similar/relevant entries from the knowledge base per query.
retriever = docsearch.as_retriever(search_type="similarity", search_kwargs={"k":3})

In [17]:
# test the retriever
# Smoke test: run one sample question and inspect the documents that come back.
retrieved_docs = retriever.invoke("What is Acne?")
retrieved_docs

[Document(id='46b1b2e0-d0e8-4446-bebe-db5da8f40862', metadata={'source': 'data\\Medical_book.pdf'}, page_content='GALE ENCYCLOPEDIA OF MEDICINE 226\nAcne\nGEM - 0001 to 0432 - A  10/22/03 1:41 PM  Page 26'),
 Document(id='dbc9c6d0-0078-415b-b309-0205adc2a73f', metadata={'source': 'data\\Medical_book.pdf'}, page_content='GALE ENCYCLOPEDIA OF MEDICINE 2 25\nAcne\nAcne vulgaris affecting a womanâ€™s face. Acne is the general\nname given to a skin disorder in which the sebaceous\nglands become inflamed. (Photograph by Biophoto Associ-\nates, Photo Researchers, Inc. Reproduced by permission.)\nGEM - 0001 to 0432 - A  10/22/03 1:41 PM  Page 25'),
 Document(id='d90783e6-9670-4452-9424-74d9d158a090', metadata={'source': 'data\\Medical_book.pdf'}, page_content='Acidosis see Respiratory acidosis; Renal\ntubular acidosis; Metabolic acidosis\nAcne\nDefinition\nAcne is a common skin disease characterized by\npimples on the face, chest, and back. It occurs when the\npores of the skin become clogged 

In [18]:
from langchain_openai import ChatOpenAI

# Chat model used to generate the final answers.
# NOTE(review): presumably authenticates via the OPENAI_API_KEY environment
# variable set earlier in the notebook — confirm.
chatModel = ChatOpenAI(model="gpt-4o")

In [19]:
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate

In [20]:
# define custom prompt template
# Adjacent string literals are concatenated with no separator, so every sentence
# carries its own trailing space; without it the sentences run together
# ("...tasks.Use the following...").
system_prompts = (
    "You are a medical assistant for question-answering tasks. "
    "Use the following pieces of retrieved context to answer the question. "
    "If you don't know the answer, just say that you don't know, don't try to make up an answer. "
    "Use three sentences maximum to answer the question and keep it concise."
    "\n\n"
    "{context}"  # placeholder filled with the retrieved documents
)

# create prompt template
# The system message carries the instructions plus the {context} placeholder;
# the human message carries the user's question through the {input} placeholder.
prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system_prompts),
        ("human", "{input}"),
    ]
)

In [21]:
# Chain that stuffs all retrieved documents into the prompt and passes it to the chat model.
question_answering_chain = create_stuff_documents_chain(chatModel, prompt)
# RAG chain built from the retriever and the question-answering chain above.
rag_chain = create_retrieval_chain(retriever, question_answering_chain)