In [1]:
# ! pip install -r requirements.txt
# ! pip install python-dotenv pypdf
# ! pip install ctransformers
# ! ollama run llama3

Obtaining file:///home/eloi/Documents/AAP_Generative_AI/slides/GenerativeAiCourse/Day12%2613/MedicalChatbotLlama2 (from -r requirements.txt (line 12))
  Preparing metadata (setup.py) ... [?25ldone
Installing collected packages: Medical-Chatbot
  Attempting uninstall: Medical-Chatbot
    Found existing installation: Medical-Chatbot 0.0.0
    Uninstalling Medical-Chatbot-0.0.0:
      Successfully uninstalled Medical-Chatbot-0.0.0
  Running setup.py develop for Medical-Chatbot
Successfully installed Medical-Chatbot-0.0.0


In [1]:
from langchain.prompts import PromptTemplate
from langchain.chains import RetrievalQA
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Pinecone as LangchainPinecone
import pinecone
from pinecone import Pinecone, ServerlessSpec
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.llms import CTransformers 
import os
from dotenv import load_dotenv

  from tqdm.autonotebook import tqdm


In [2]:
# Llama 3 accessed through the Ollama wrapper. The RetrievalQA cell further
# down passes `llm_model3` as its `llm`, so this is the model that actually
# answers questions in this notebook.
# NOTE(review): presumably needs a local Ollama server with the "llama3"
# model available (see the commented `ollama run llama3` hint in the first
# cell) -- confirm before running on a fresh machine.
from langchain_community.llms import Ollama
llm_model3 = Ollama(model="llama3")


In [3]:
# Load variables from a .env file into the process environment so the API key
# does not have to be written into the notebook.
load_dotenv()

# os.getenv returns None when the variable is unset; this cell does not
# validate the value, so a missing key is not detected here.
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")

In [4]:
# File names of local quantized LLM weights.
# NOTE(review): neither name is referenced by the visible cells below; the
# CTransformers cell spells out "model/llama-2-7b-chat.ggmlv3.q4_0.bin"
# directly instead of reusing `llm_model1`.
llm_model1 = "llama-2-7b-chat.ggmlv3.q4_0.bin"
llm_model2 = "mixtral-8x7b-instruct-v0.1-limarp-zloss-dare-ties.Q4_0.gguf"

# Sentence-embedding model name passed to download_hugging_face_model() below.
embedding_model1 = "sentence-transformers/all-MiniLM-L6-v2"

In [5]:
# Extract text from PDFs
def load_pdf(data):
    """Load the PDF files found in a directory.

    Args:
        data: Directory path handed to ``DirectoryLoader``; files are
            selected with the glob pattern ``*.pdf`` and parsed with
            ``PyPDFLoader``.

    Returns:
        The documents returned by ``DirectoryLoader.load()``.
    """
    pdf_loader = DirectoryLoader(data, glob="*.pdf", loader_cls=PyPDFLoader)
    return pdf_loader.load()

In [6]:
# Load every PDF from the relative "data" directory.
extracted_data = load_pdf("data")

In [7]:
# Sanity check: number of documents the loader produced.
len(extracted_data)

4005

In [8]:
def text_split(extracted_data, chunk_size=1000, chunk_overlap=100):
    """Split loaded documents into overlapping chunks ready for embedding.

    Args:
        extracted_data: Documents to split, e.g. the result of ``load_pdf``.
        chunk_size: Value forwarded to ``RecursiveCharacterTextSplitter`` as
            ``chunk_size``. Defaults to 1000, the value that used to be
            hard-coded here.
        chunk_overlap: Value forwarded to ``RecursiveCharacterTextSplitter``
            as ``chunk_overlap``. Defaults to 100, the value that used to be
            hard-coded here.

    Returns:
        The chunks returned by ``split_documents`` for ``extracted_data``.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return text_splitter.split_documents(extracted_data)

In [9]:
# Chunk all loaded documents; the upload cell further down reads the
# `page_content` of each chunk from this list.
text_chunks = text_split(extracted_data)
print("nomber of chunks:",len(text_chunks))

nomber of chunks: 21526


In [10]:
def download_hugging_face_model(embedding_model):
    """Create the embedding wrapper for a HuggingFace model.

    Args:
        embedding_model: Model identifier passed to ``HuggingFaceEmbeddings``
            as ``model_name``.

    Returns:
        The constructed ``HuggingFaceEmbeddings`` instance.
    """
    return HuggingFaceEmbeddings(model_name=embedding_model)

In [11]:
# Embedding model shared by the test query, the upload and the retriever below.
embeddings = download_hugging_face_model(embedding_model1)

  warn_deprecated(


In [12]:
# Display the embeddings object to inspect its configuration.
embeddings

HuggingFaceEmbeddings(client=SentenceTransformer(
  (0): Transformer({'max_seq_length': 256, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Normalize()
), model_name='sentence-transformers/all-MiniLM-L6-v2', cache_folder=None, model_kwargs={}, encode_kwargs={}, multi_process=False, show_progress=False)

In [13]:
# Embed a throwaway query to learn the embedding vector length; that length
# is reused as the `dimension` when the Pinecone index is created below.
query_result = embeddings.embed_query("What is the capital of France?")
print("length:",len(query_result))

length: 384


In [14]:
# query_result

In [14]:
# Pinecone client; used below to list, create and open indexes.
pc = Pinecone(
    api_key=PINECONE_API_KEY
)

In [15]:
# Name of the Pinecone index used by every Pinecone/LangChain call below.
index_name = "medical-chatbot"

In [16]:
# Create the index only when no index with this name exists yet, so
# re-running the cell does not attempt a second creation.
if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        # Index dimension is taken from the test embedding computed earlier.
        dimension=len(query_result),
        metric="cosine",
        spec=ServerlessSpec(
            cloud='aws', 
            region='us-east-1'
        ) 
    ) 

In [16]:
# Handle to the index obtained from the Pinecone client.
index = pc.Index(index_name)

In [19]:
# Upload the chunk texts to Pinecone, but only while the index is still empty.
# `from_texts` embeds and upserts every chunk each time it is called, so
# re-running this cell unguarded would re-embed the whole corpus and store it
# again. When the index already holds vectors, attach to it instead so that
# `docsearch` is defined either way.
index_stats = pc.Index(index_name).describe_index_stats()
if index_stats.total_vector_count == 0:
    docsearch = LangchainPinecone.from_texts(
        texts=[t.page_content for t in text_chunks],
        embedding=embeddings,
        index_name=index_name,
    )
else:
    docsearch = LangchainPinecone.from_existing_index(
        index_name=index_name,
        embedding=embeddings,
    )

In [17]:
# Attach to the existing index by name; this (re)binds `docsearch` without
# uploading any texts.
docsearch = LangchainPinecone.from_existing_index(index_name=index_name, embedding=embeddings)

query = "What are Allergies?"

# Retrieval smoke test: ask for the 3 most similar stored chunks.
docs = docsearch.similarity_search(query, k=3)

print("Results:",docs)

Results: [Document(page_content='Purpose\nAllergy is a reaction of the immune system. Nor-\nmally, the immune system responds to foreign microor-ganisms and particles, like pollen or dust, by producingspecific proteins called antibodies that are capable ofbinding to identifying molecules, or antigens, on theforeign organisms. This reaction between antibody andantigen sets off a series of reactions designed to protectthe body from infection. Sometimes, this same series ofreactions is triggered by harmless, everyday substances.This is the condition known as allergy, and the offend-ing substance is called an allergen. Common inhaledallergens include pollen, dust, and insect parts from tinyhouse mites. Common food allergens include nuts, fish,and milk.\nAllergic reactions involve a special set of cells in'), Document(page_content='Description\nAllergies are among the most common of medical\ndisorders. It is estimated that 60 million Americans, ormore than one in every five people, suffer f

In [18]:
# Prompt text for the QA chain. `{context}` and `{question}` are the two
# placeholders declared as input variables when the PromptTemplate is built
# in the next cell.
prompt_template="""
Use the following pieces of information to answer the user's question.
If you don't know the answer, just say that you don't know, don't try to make up an answer.

Context: {context}
Question: {question}

Only return the helpful answer below and nothing else.
Helpful answer:
"""

In [19]:
# Wrap the raw template string and declare its two placeholders.
PROMPT=PromptTemplate(template=prompt_template, 
                      input_variables=["context", "question"])
# Passed to RetrievalQA.from_chain_type as `chain_type_kwargs` below.
chain_type_kwargs={"prompt": PROMPT}

In [None]:
# import transformers
# import torch

# model_id = "model/meta-llama/Meta-Llama-3-8B-Instruct"

# pipeline = transformers.pipeline(
#   "text-generation",
#   model="meta-llama/Meta-Llama-3-8B-Instruct",
#   model_kwargs={"torch_dtype": torch.bfloat16},
#   device="cuda",
# )

Downloading shards:  75%|███████▌  | 3/4 [1:37:14<32:21, 1941.41s/it]  

In [20]:
# Local Llama 2 chat model loaded through CTransformers from the relative
# path "model/llama-2-7b-chat.ggmlv3.q4_0.bin".
# NOTE(review): the RetrievalQA cell below passes `llm_model3`, not `llm`, so
# within the visible cells this object is constructed but never used.
llm=CTransformers(model="model/llama-2-7b-chat.ggmlv3.q4_0.bin",
                  model_type="llama",
                  config={'max_new_tokens':512,
                          'temperature':0.4})

In [22]:
# Retrieval QA chain: answers come from `llm_model3` (Llama 3 via Ollama),
# the retriever is built from `docsearch` with k=2, and the custom PROMPT is
# supplied through `chain_type_kwargs`.
# `return_source_documents=True` is requested, although the interactive loop
# below only reads result["result"].
qa=RetrievalQA.from_chain_type(
    llm=llm_model3, 
    chain_type="stuff", 
    retriever=docsearch.as_retriever(search_kwargs={'k': 2}),
    return_source_documents=True, 
    chain_type_kwargs=chain_type_kwargs)

In [23]:
# Interactive question/answer loop.
# The loop ends when the user submits an empty line or one of EXIT_COMMANDS,
# or when input is interrupted (kernel interrupt / end of input). Without an
# exit path the cell could only be stopped by interrupting the kernel.
EXIT_COMMANDS = {"exit", "quit", "q"}

while True:
    try:
        user_input = input("Input Prompt:")
    except (EOFError, KeyboardInterrupt):
        # Treat an interrupted prompt as a request to stop chatting.
        break

    # Only the normalized copy is used for the exit check; the chain still
    # receives the text exactly as typed.
    command = user_input.strip().lower()
    if not command or command in EXIT_COMMANDS:
        break

    # `invoke` replaces calling the chain object directly, which emits a
    # deprecation warning.
    result = qa.invoke({"query": user_input})
    print("Response : ", result["result"])

  warn_deprecated(


Response :  Acne is a common skin disease characterized by pimples on the face, chest, and back. It occurs when the pores of the skin become clogged with oil, dead skin cells, and bacteria.
Response :  Acne is a common skin disease characterized by pimples on the face, chest, and back. It occurs when the pores of the skin become clogged with oil, dead skin cells, and bacteria.
Response :  A substance that provokes an allergic response.
Response :  A contraceptive implant is inserted under the skin of a woman's arm.
Response :  An allergy occurs when a person's immune system overreacts to a specific substance, known as an allergen. This overreaction can cause a range of symptoms, including inflammation of the mucous membranes in the nose and eyes (allergic rhinitis), increased sensitivity that can lead to serious reactions such as anaphylaxis, or severe non-inflammatory swelling of the skin, organs, and brain (angioedema). The immune system produces specific proteins called antibodies i