In [1]:
import os
from dotenv import load_dotenv

from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings

from pinecone import Pinecone, ServerlessSpec
from langchain_pinecone import PineconeVectorStore

load_dotenv()

True

In [2]:
# Cache of embeddings instances keyed by model name, so each model is built only once
_embeddings_cache: dict[str, HuggingFaceEmbeddings] = {}

def get_hf_embeddings(
    model_name: str = "sentence-transformers/all-MiniLM-L6-v2"
) -> HuggingFaceEmbeddings:
    """
    Return a cached HuggingFaceEmbeddings instance for `model_name`.

    The first call for a given `model_name` builds the instance (downloading the
    model into the HF cache dir if it is not already present); later calls with
    the same name reuse that object. Different model names get separate
    instances, so asking for a second model never silently returns the first.

    Args:
        model_name: Identifier of the sentence-embedding model to load.

    Returns:
        The HuggingFaceEmbeddings instance associated with `model_name`.
    """
    cached = _embeddings_cache.get(model_name)
    if cached is None:
        # Build once per model name and remember it for subsequent calls
        cached = HuggingFaceEmbeddings(model_name=model_name)
        _embeddings_cache[model_name] = cached
    return cached


# Embeddings model shared by the rest of the notebook (default model)
model = get_hf_embeddings()

  _embeddings_instance = HuggingFaceEmbeddings(model_name=model_name)
  from .autonotebook import tqdm as notebook_tqdm


In [5]:
# Extract data from PDF files
def load_pdf(data):
    """Load the PDF files matching ``*.pdf`` in the directory `data`.

    Args:
        data: Path of the directory handed to DirectoryLoader.

    Returns:
        The result of ``DirectoryLoader.load()``; each file is read with
        PyPDFLoader.
    """
    pdf_loader = DirectoryLoader(
        data,                    # directory to scan
        glob="*.pdf",            # only pick up PDF files
        loader_cls=PyPDFLoader,  # loader class used for each matched file
    )
    return pdf_loader.load()


# Function for chunking documents
def text_splitter(extracted_data, chunk_size=500, chunk_overlap=20):
    """Split loaded documents into smaller chunks.

    Args:
        extracted_data: Documents to split (e.g. the result of `load_pdf`).
        chunk_size: Value passed as `chunk_size` to RecursiveCharacterTextSplitter.
            Defaults to 500, the value previously hard-coded here.
        chunk_overlap: Value passed as `chunk_overlap` to
            RecursiveCharacterTextSplitter. Defaults to 20, the value
            previously hard-coded here.

    Returns:
        The chunks produced by `RecursiveCharacterTextSplitter.split_documents`.
    """
    # Local name deliberately differs from the function name to avoid shadowing it
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return splitter.split_documents(extracted_data)

In [None]:
# Load the PDF files from the data directory (this cell has no recorded execution count).
# NOTE(review): the stored retrieval output lists sources under '../Data/' (capital D);
# on a case-sensitive filesystem confirm this path matches the directory's real casing.
document = load_pdf('../data/')

In [7]:
# Sanity check: show the container type returned by load_pdf
type(document)

list

In [8]:
# Split the loaded documents into chunks and show how many chunks were produced
text_chunk = text_splitter(document)
len(text_chunk)

5860

In [9]:
# Sanity check: show the container type returned by text_splitter
type(text_chunk)

list

In [3]:
# Read the Pinecone API key from the environment (load_dotenv() is called in the first cell).
# Do NOT end this cell with the bare variable: Jupyter renders the last expression,
# and the secret would then be stored in the notebook's saved output.
# NOTE: a key was previously displayed in this cell's output; treat that key as
# compromised and rotate it.
PINECONE_API_KEY = os.environ.get('PINECONE_API_KEY')

# Show only whether a key was found, never its value
bool(PINECONE_API_KEY)

In [4]:
# Pinecone client authenticated with the API key read from the environment
pc = Pinecone(api_key=PINECONE_API_KEY)

# Name of the index that holds the document embeddings. It matches the name used
# by the index-creation cell, so running the cells in a different order cannot
# leave `index_name` pointing at another index.
index_name = 'medical-index'

In [5]:
from pinecone import Pinecone, ServerlessSpec

# Pinecone client authenticated with the API key read from the environment
pc = Pinecone(api_key=PINECONE_API_KEY)

index_name = "medical-index"

# Create the index only if it does not exist yet
existing_index_names = [entry["name"] for entry in pc.list_indexes()]
if index_name not in existing_index_names:
    serverless_spec = ServerlessSpec(cloud="aws", region="us-east-1")
    pc.create_index(
        name=index_name,
        dimension=384,  # adjust to match the embedding model's output size
        metric="cosine",
        spec=serverless_spec,
    )


In [12]:
from pinecone import Pinecone
from langchain_pinecone import PineconeVectorStore

pc = Pinecone(api_key=PINECONE_API_KEY)

# One-time ingestion switch. Leave it False when the index already contains the
# document embeddings, so re-running the notebook does not embed and upload
# every chunk again. Set it to True (after running the PDF loading and chunking
# cells that define `text_chunk`) to embed the chunks and store them in the index.
INGEST_DOCUMENTS = False

if INGEST_DOCUMENTS:
    docsearch = PineconeVectorStore.from_documents(
        documents=text_chunk,
        embedding=model,
        index_name=index_name,
    )


In [13]:
# Connect to the existing index rather than ingesting the documents again
docsearch = PineconeVectorStore.from_existing_index(
    embedding=model,
    index_name=index_name,
)

In [14]:
# Set up the retriever: similarity search with k=3
# NOTE: the variable name `rertriever` (sic) is referenced by later cells, so it is kept as-is.
search_kwargs = {'k': 3}
rertriever = docsearch.as_retriever(search_type='similarity', search_kwargs=search_kwargs)

In [15]:
# Smoke test: display the documents retrieved for a sample question
sample_query = "What is acne ?"
rertriever.invoke(sample_query)

[Document(metadata={'page': 39, 'source': '../Data/Medical_book.pdf'}, page_content='GALE ENCYCLOPEDIA OF MEDICINE 226\nAcne\nGEM - 0001 to 0432 - A  10/22/03 1:41 PM  Page 26'),
 Document(metadata={'page': 38, 'source': '../Data/Medical_book.pdf'}, page_content='GALE ENCYCLOPEDIA OF MEDICINE 2 25\nAcne\nAcne vulgaris affecting a womanâ€™s face. Acne is the general\nname given to a skin disorder in which the sebaceous\nglands become inflamed. (Photograph by Biophoto Associ-\nates, Photo Researchers, Inc. Reproduced by permission.)\nGEM - 0001 to 0432 - A  10/22/03 1:41 PM  Page 25'),
 Document(metadata={'page': 37, 'source': '../Data/Medical_book.pdf'}, page_content='Acidosis see Respiratory acidosis; Renal\ntubular acidosis; Metabolic acidosis\nAcne\nDefinition\nAcne is a common skin disease characterized by\npimples on the face, chest, and back. It occurs when the\npores of the skin become clogged with oil, dead skin\ncells, and bacteria.\nDescription\nAcne vulgaris, the medical term

In [6]:
from langchain_groq import ChatGroq
# Groq API key read from the environment
GROQ_API_KEY = os.getenv('GROQ_API_KEY')

# Groq-hosted chat model used to generate answers.
# An alternative backend was previously sketched here:
# langchain_google_genai.ChatGoogleGenerativeAI (reads GOOGLE_API_KEY).
llm = ChatGroq(
    model_name="llama-3.3-70b-versatile",
    temperature=0.5,
    groq_api_key=GROQ_API_KEY,
)

# Quick connectivity check: the reply is displayed as the cell output
llm.invoke("Hello, world!")

AIMessage(content="Hello! It's nice to meet you. Is there something I can help you with or would you like to chat?", response_metadata={'token_usage': {'completion_tokens': 25, 'prompt_tokens': 39, 'total_tokens': 64, 'completion_time': 0.040050444, 'completion_tokens_details': None, 'prompt_time': 0.004725686, 'prompt_tokens_details': None, 'queue_time': 0.092963407, 'total_time': 0.04477613}, 'model_name': 'llama-3.3-70b-versatile', 'system_fingerprint': 'fp_dae98b5ecb', 'finish_reason': 'stop', 'logprobs': None}, id='run-62106366-de02-41f1-8ec0-fdbaad862470-0', usage_metadata={'input_tokens': 39, 'output_tokens': 25, 'total_tokens': 64})

In [7]:
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate

In [8]:
# System instructions for the answering model. The `{context}` placeholder is
# left unformatted here; it is a template variable of the chat prompt built
# from this string.
system_prompt = (
    "You are an assistant for question-answering tasks. "
    "Use the following pieces of retrieved context to answer "
    "the question. If you don't know the answer, say that you "
    "don't know. Use three sentences maximum and keep the answer concise."
    "\n\n"
    "{context}"
)

# Chat prompt: the system instructions followed by the user's question
chat_messages = [
    ("system", system_prompt),
    ("human", "{input}"),
]
prompt = ChatPromptTemplate.from_messages(chat_messages)

In [16]:
# Chain that passes the retrieved documents to the LLM through `prompt`
question_answer_chain = create_stuff_documents_chain(llm=llm, prompt=prompt)

# Full RAG pipeline: retrieve documents for the input, then answer from them
rag_chain = create_retrieval_chain(
    retriever=rertriever,
    combine_docs_chain=question_answer_chain,
)

In [17]:
# Ask a question and print only the generated answer
question = "What is definition for Doppler?"
response = rag_chain.invoke({"input": question})

print(response['answer'])

The Doppler effect refers to the apparent change in frequency of sound wave echoes returning to a stationary source from a moving target. This change in frequency can be used to compute the object's speed, whether it's a car or blood in an artery. The Doppler effect holds true for all types of radiation, not just sound.


In [18]:
# Ask a question unrelated to the indexed medical book and print the generated answer
question = "What is machine learning?"
response = rag_chain.invoke({"input": question})

print(response['answer'])

I don't know what machine learning is based on the provided context, as it doesn't mention machine learning. The context appears to be related to human memory, neurodegenerative diseases, and cancer treatments.
