In [1]:
pip install -qqq langchain langchain_groq langchain_pinecone pinecone-client pypdf langchain groq google-generativeai duckduckgo-search lxml huggingface_hub datasets beautifulsoup4 requests selenium python-dotenv aiohttp

Note: you may need to restart the kernel to use updated packages.


In [2]:
import os
from langchain_groq import ChatGroq
import google.generativeai as genai
from langchain.document_loaders import UnstructuredPDFLoader, PyPDFLoader
from langchain_pinecone import PineconeVectorStore
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate
from pinecone import Pinecone
from langchain.schema import Document
from langchain_core.embeddings import Embeddings
from typing import List
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
from datasets import load_dataset
# Fetch the 'lavita/ChatDoctor-HealthCareMagic-100k' dataset via `datasets.load_dataset`;
# the next cell reads its 'train' split.
data = load_dataset('lavita/ChatDoctor-HealthCareMagic-100k')

In [4]:
import pandas as pd
# Shuffle the train split, keep the first 5000 rows, and retain only the
# 'input' / 'output' columns as a DataFrame.
# NOTE(review): 42 is passed positionally to `shuffle` — presumably the seed; confirm.
df = pd.DataFrame(data['train'].shuffle(42)[:5000])[['input' , 'output']]

In [5]:
# Rename the columns to 'question' / 'answer'; the Document-building cell reads these names.
# Rebinding `df` means re-running this cell alone is a no-op, but the previous cell must run first.
df = df.rename(columns={'input':'question' , 'output':'answer'})

In [None]:
import os
from getpass import getpass

# Non-secret settings: only filled in when the variable is not already set, so values
# exported in the shell (or set by an earlier cell) are respected.
os.environ.setdefault('LANGCHAIN_TRACING_V2', 'true')
os.environ.setdefault('LANGCHAIN_ENDPOINT', 'https://api.smith.langchain.com')
# Despite its name, PINECONE_ENV is used below as the Pinecone *index name*.
os.environ.setdefault('PINECONE_ENV', 'gemini-rag')

# Secrets are never written into the notebook. If a key is missing (or still holds the old
# placeholder left by a previous run of this cell) the user is prompted without echo.
_PLACEHOLDER = '<YOUR API KEY>'
for _secret_name in ('LANGCHAIN_API_KEY', 'GROQ_API_KEY', 'PINECONE_API_KEY', 'GEMINI_API_KEY'):
    if os.environ.get(_secret_name, _PLACEHOLDER) in ('', _PLACEHOLDER):
        os.environ[_secret_name] = getpass(f'{_secret_name}: ')

In [7]:
class GeminiEmbeddings(Embeddings):
    """LangChain ``Embeddings`` adapter that delegates to ``genai.embed_content``.

    Each returned vector is passed through a float32 NumPy array and converted
    back to a plain Python list.
    """

    def __init__(self, api_key):
        """Configure the ``genai`` client with ``api_key`` and fix the model name."""
        genai.configure(api_key=api_key)
        self.model_name = "models/embedding-001"

    def embed_documents(self, texts):
        """Embed each text with task type ``retrieval_document`` (one API call per text)."""
        vectors = []
        for text in texts:
            response = genai.embed_content(
                model=self.model_name,
                content=text,
                task_type="retrieval_document",
            )
            vectors.append(self._convert_to_float32(response["embedding"]))
        return vectors

    def embed_query(self, text):
        """Embed a single query string with task type ``retrieval_query``."""
        raw_vector = genai.embed_content(
            model=self.model_name,
            content=text,
            task_type="retrieval_query",
        )["embedding"]
        return self._convert_to_float32(raw_vector)

    @staticmethod
    def _convert_to_float32(embedding):
        """Round-trip ``embedding`` through a float32 array and return it as a list."""
        as_float32 = np.array(embedding, dtype=np.float32)
        return as_float32.tolist()

In [8]:
# Configure the genai client, then build the embedding adapter.
# GeminiEmbeddings.__init__ calls genai.configure with the same key again, so the
# explicit call here is redundant but harmless.
genai.configure(api_key=os.environ["GEMINI_API_KEY"])
embeddings = GeminiEmbeddings(api_key=os.environ["GEMINI_API_KEY"])

In [9]:
# Pinecone client built from PINECONE_API_KEY.
# NOTE(review): `pc` is not referenced again in the visible cells — the vector-store helpers
# go through PineconeVectorStore instead; confirm whether this cell is still needed.
pc = Pinecone(os.environ['PINECONE_API_KEY'])

In [10]:
# One Document per DataFrame row: the text is the "Q: ... / A: ..." pair and the
# metadata stores the row's index label plus one under 'Page index'.
docs = [
    Document(
        page_content=f"Q: {qa_row['question']}\nA: {qa_row['answer']}",
        metadata={'Page index': row_label + 1},
    )
    for row_label, qa_row in df.iterrows()
]

In [11]:
import time
from tqdm import tqdm
def upsert(docs , embeddings , split_size = 100, sleep_seconds = 30):
    """Embed and upload ``docs`` to the Pinecone index in batches.

    Args:
        docs: Sequence of documents to upload.
        embeddings: Embedding object handed to ``PineconeVectorStore.from_documents``.
        split_size: Number of documents per batch; must be positive.
        sleep_seconds: Pause between consecutive batches (presumably to stay under
            API rate limits — confirm). No pause follows the final batch.

    Returns:
        The vector store returned for the last uploaded batch, or a handle to the
        existing index when ``docs`` is empty.

    Raises:
        ValueError: If ``split_size`` is not positive.
    """
    if split_size <= 0:
        raise ValueError("split_size must be a positive integer")

    # The index name is read from PINECONE_ENV (the variable name is historical).
    index_name = os.environ['PINECONE_ENV']

    # Nothing to upload: previously this fell through to an unbound `vectorstore`.
    if not docs:
        return PineconeVectorStore.from_existing_index(
            embedding=embeddings,
            index_name=index_name,
        )

    total = len(docs)
    vectorstore = None
    for start in tqdm(range(0, total, split_size)):
        batch = docs[start : start + split_size]
        vectorstore = PineconeVectorStore.from_documents(
            batch,
            embeddings,
            index_name=index_name,
        )
        # Only wait when another batch is still to come.
        if start + split_size < total:
            time.sleep(sleep_seconds)
    return vectorstore
    
def load_existing(embeddings):
    """Return a ``PineconeVectorStore`` bound to the already-populated index.

    The index name is read from the ``PINECONE_ENV`` environment variable and
    ``embeddings`` is passed through as the store's embedding object.
    """
    index_name = os.environ["PINECONE_ENV"]
    return PineconeVectorStore.from_existing_index(
        embedding=embeddings,
        index_name=index_name,
    )

In [12]:
# Upload every document to the index. The recorded run of this cell took about 1h12m for
# 50 batches; once the index is populated, `load_existing(embeddings)` returns a store
# without re-uploading.
vectorstore = upsert(docs , embeddings)

100%|█████████████████████████████████████████| 50/50 [1:12:25<00:00, 86.92s/it]


In [26]:
from langchain import LLMChain
from langchain_core.prompts import ChatPromptTemplate

# Prompt for answering a single question from retrieved context only.
# Placeholders: {context} and {question}.
# The fixed fallback sentence in the prompt is what self_clarifying_qa looks for to stop early.
answer_template = ChatPromptTemplate.from_template(
    """
You are a helpful assistant.

Context:
{context}

User question:
{question}

Task: Provide a concise, accurate answer based solely on the context.
If the context does NOT contain the information needed to answer the question, reply exactly:
"No suitable answer found in database."
"""
)

# Prompt asking the model whether a follow-up question is needed.
# Placeholders: {answer} and {question}.
# The model replies either with the token ENOUGH (shown in backticks in the prompt) or with
# one follow-up question, which self_clarifying_qa uses as its next retrieval query.
clarify_template = ChatPromptTemplate.from_template(
    """
You answered:
{answer}

The user originally asked:
{question}

Task: Determine if more information is needed to answer correctly.
- If you CANNOT answer because the database lacked relevant information, reply exactly `ENOUGH`.
- If you NEED more info, ask a single, specific follow-up question.
- If you have enough context to be confident, reply exactly `ENOUGH`.
"""
)

# Prompt that condenses the accumulated Q/A pairs into a final summary.
# Placeholder: {history}.
# Note the fallback sentence here differs from the one in answer_template.
summary_template = ChatPromptTemplate.from_template(
    """
Here are the question/answer pairs that occurred:
{history}

Task: Provide a concise final summary of the information above.
If all answers were 'No suitable answer found in database.', reply:
"No information available to summarize."
"""
)

# Prompt that summarises content obtained from the web.
# Placeholder: {history} (the web content, despite the name).
web_summary_template = ChatPromptTemplate.from_template(
    """
Here is the webcontext we have scraped it from internet:
{history}

Task: Provide a concise final summary of the information above.
If all answers are not relevent simply reply 'No relevent information found on the web'.
If relevent content is found, summarise the content properly and properly cite it as web search content.
"""
)

# Prompt that merges the web summary with the corpus (vector-store) summary.
# Placeholders: {web} and {text}; the instructions give priority to the corpus text.
final_summary_template = ChatPromptTemplate.from_template(
    """
Here is the webcontext we have scraped it from internet:
{web}
Below is the context we have found from the corpus text:
{text}

Task: Provide a concise and detailed final summary of the information above.
If all answers are not relevent simply reply 'No relevent information Is available please contact support'.
If relevent content is found, summarise the content properly and properly cite content, give more priority to corpus text.
If corpus text is less relevent then give priority to web content.
Properly cite web content in the final summary. Make it sure the user knows the additional answer is from the web.
"""
)

In [14]:
# Chain that fills answer_template and sends it to the ChatGroq "llama3-70b-8192" model.
# NOTE(review): the recorded output shows a warning pointing at this constructor —
# presumably LangChain's LLMChain deprecation notice; confirm before upgrading langchain.
answer_chain = LLMChain(
    llm=ChatGroq(model="llama3-70b-8192"), 
    prompt=answer_template,
)

  answer_chain = LLMChain(


In [15]:
# Chain that fills clarify_template; unlike the other chains it sets temperature=0.2 explicitly.
clarify_chain = LLMChain(
    llm=ChatGroq(model="llama3-70b-8192", temperature=0.2),
    prompt=clarify_template,
)

In [27]:
# Summarises the Q/A history produced by self_clarifying_qa (input key: history).
summary_chain = LLMChain(
    llm=ChatGroq(model="llama3-70b-8192"),
    prompt=summary_template,
)

# Summarises web content (input key: history).
web_summary_chain = LLMChain(
    llm=ChatGroq(model="llama3-70b-8192"),
    prompt=web_summary_template,
)

# Merges the web summary and the corpus summary (input keys: web, text).
final_summary_chain = LLMChain(
    llm=ChatGroq(model="llama3-70b-8192"),
    prompt=final_summary_template,
)

In [17]:
def _normalize_reply(text: str) -> str:
    """Lower-case an LLM reply after stripping whitespace, wrapping quotes/backticks and a trailing period."""
    return text.strip().strip("\"'`").strip().rstrip(".").strip().lower()


def self_clarifying_qa(user_question: str,
                       vectorstore: "PineconeVectorStore",
                       max_queries: int = 3,
                       k_retrieval: int = 3):
    """Answer a question from the vector store, letting the model refine the query.

    Each round retrieves ``k_retrieval`` documents for the current query, asks
    ``answer_chain`` for an answer, then asks ``clarify_chain`` whether a
    follow-up is needed. A follow-up question becomes the next retrieval query.
    The loop stops after ``max_queries`` rounds, when the answer is the
    "no suitable answer" sentinel, or when the clarifier replies ENOUGH.

    Args:
        user_question: The initial query.
        vectorstore: Object exposing ``similarity_search(query, k=...)``.
        max_queries: Maximum number of retrieval/answer rounds.
        k_retrieval: Number of documents retrieved per round.

    Returns:
        dict with ``"history"`` (list of ``(question, answer)`` tuples) and
        ``"summary"`` (the text returned by ``summary_chain``).
    """
    history = []
    q = user_question

    for _ in range(max_queries):
        docs = vectorstore.similarity_search(q, k=k_retrieval)
        ctx = "\n\n".join(d.page_content for d in docs)

        a = answer_chain.run(question=q, context=ctx)
        history.append((q, a))

        # The prompt shows the sentinel wrapped in double quotes, so the model may
        # echo the quotes back; compare on a normalized form rather than exactly.
        if _normalize_reply(a) == "no suitable answer found in database":
            break

        follow = clarify_chain.run(question=q, answer=a).strip()
        # The prompt shows ENOUGH in backticks; accept it with or without them.
        # An empty reply is treated as "nothing more to ask" instead of an empty query.
        if not follow or _normalize_reply(follow) == "enough":
            break

        q = follow

    hist_text = "\n".join(f"Q: {x}\nA: {y}" for x, y in history)
    summary = summary_chain.run(history=hist_text)

    return {"history": history, "summary": summary}

In [18]:
# Example run of the self-clarifying loop against the populated vector store.
q = 'I am having high fever and nausea since this morning.'
result = self_clarifying_qa(q, vectorstore)

  a = answer_chain.run(question=q, context=ctx)


In [19]:
# Display the final summary text produced by summary_chain for the run above.
result['summary']

'Here is a concise final summary of the information:\n\nA person experiencing high fever and nausea since morning may have a viral illness or stomach infection. They should take paracetamol for fever relief, consider antibiotics if they persist, and stay hydrated with water and a light diet. If symptoms worsen or they experience severe abdominal pain or vomiting, they should seek further guidance from a doctor.'

# Web Scraping part

In [20]:
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.action_chains import ActionChains
from langchain_community.tools import DuckDuckGoSearchRun
from lxml import html
import time
import random
from tqdm import tqdm
from typing import List

In [None]:
    "def get_gemini_model(model_name = \"gemini-2.0-flash-lite\", temperature = 0.4):\n",
    return genai.GenerativeModel(model_name)

def get_generation_config(temperature = 0.4):
    """Build the generation-config dict passed to ``generate_content``.

    Only ``temperature`` is configurable; ``top_p``, ``top_k`` and
    ``max_output_tokens`` are fixed at 1, 1 and 2048.
    """
    config = {"temperature": temperature}
    config["top_p"] = 1
    config["top_k"] = 1
    config["max_output_tokens"] = 2048
    return config

def get_safety_settings():
    """Return one ``{"category", "threshold"}`` dict per harm category.

    Every category is given the threshold ``"BLOCK_NONE"``.
    """
    harm_categories = (
        "HARM_CATEGORY_HARASSMENT",
        "HARM_CATEGORY_HATE_SPEECH",
        "HARM_CATEGORY_SEXUALLY_EXPLICIT",
        "HARM_CATEGORY_DANGEROUS_CONTENT",
    )
    settings = []
    for name in harm_categories:
        settings.append({"category": name, "threshold": "BLOCK_NONE"})
    return settings
    
def generate_gemini_response(model, prompt):
    """Send ``prompt`` to ``model`` and return the first candidate's text.

    The call uses ``get_generation_config()`` and ``get_safety_settings()``.
    The text of all parts of the first candidate is concatenated.

    Returns:
        The generated text, or ``''`` when the response has no candidates or
        the first candidate carries no parts (previously this case raised
        ``IndexError``).
    """
    response = model.generate_content(
        prompt,
        generation_config=get_generation_config(),
        safety_settings=get_safety_settings()
    )
    candidates = response.candidates
    if not candidates:
        return ''

    # A candidate may come back with no content parts; treat that as "no text".
    content = getattr(candidates[0], 'content', None)
    parts = getattr(content, 'parts', None) or []
    return ''.join(getattr(part, 'text', '') or '' for part in parts)

def create_search_prompt(query, context = ""):
    """Build the prompt asking the model whether a web search is needed.

    The result is the fixed instruction text, then an optional ``Context:``
    section (only when ``context`` is truthy), then the ``Query:`` section,
    separated by blank lines.
    """
    instructions = """You are a smart assistant designed to determine whether a query needs data from a web search or can be answered using a document database. 
    Consider the provided context if available. 
    If the query requires external information, No context is provided, Irrelevent context is present or latest information is required, then output the special token <SEARCH> 
    followed by relevant keywords extracted from the query to optimize for search engine results. 
    Ensure the keywords are concise and relevant. If document data is sufficient, simply return blank."""

    sections = [instructions]
    if context:
        sections.append(f"Context: {context}")
    sections.append(f"Query: {query}")
    return "\n\n".join(sections)

def create_summary_prompt(content):
    """Wrap ``content`` in the fixed summarisation instructions.

    Layout: instruction line, blank line, ``Content to summarize:``, the
    content itself, blank line, ``Summary:``.
    """
    instruction = "Please provide a comprehensive yet concise summary of the following content, highlighting the most important points and maintaining factual accuracy. Organize the information in a clear and coherent manner:"
    lines = [instruction, "", "Content to summarize:", f"{content}", "", "Summary:"]
    return "\n".join(lines)

def init_selenium_driver():
    """Create a Chrome WebDriver configured with the headless/sandbox flags below."""
    opts = Options()
    for flag in (
        "--headless",
        "--disable-gpu",
        "--no-sandbox",
        "--disable-dev-shm-usage",
    ):
        opts.add_argument(flag)

    return webdriver.Chrome(options=opts)

def extract_static_page(url):
    """Fetch ``url`` with ``requests`` and return up to 5000 characters of its text.

    The page is parsed with BeautifulSoup's ``lxml`` parser and flattened with
    single-space separators. On any ``requests`` error the error is printed and
    ``None`` is returned.
    """
    try:
        page = requests.get(url, timeout=5)
        page.raise_for_status()
        visible_text = BeautifulSoup(page.text, 'lxml').get_text(separator=" ", strip=True)
    except requests.exceptions.RequestException as e:
        print(f"Error fetching page: {e}")
        return None

    return visible_text[:5000]
        
def extract_dynamic_page(url, driver):
    """Load ``url`` in ``driver`` and return up to 1000 characters of body text.

    After loading, the function pauses for a random 2-5 seconds, moves the
    pointer to the ``<body>`` element, pauses again, then joins every text node
    under ``<body>`` with single spaces. Any exception is printed and results
    in ``None``.
    """
    try:
        driver.get(url)
        time.sleep(random.uniform(2, 5))

        body_element = driver.find_element(By.TAG_NAME, "body")
        ActionChains(driver).move_to_element(body_element).perform()
        time.sleep(random.uniform(2, 5))

        document = html.fromstring(driver.page_source)
        fragments = document.xpath('//body//text()')
        return ' '.join(fragments).strip()[:1000]

    except Exception as e:
        print(f"Error fetching dynamic page: {e}")
        return None

def scrape_page(url):
    """Return the text of ``url``, choosing between Selenium and plain HTTP.

    URLs containing the substring ``"javascript"`` or ``"dynamic"`` are loaded
    through a fresh Selenium driver; everything else goes through
    ``extract_static_page``. Returns whatever the chosen extractor returns
    (text, or ``None`` on failure).
    """
    if "javascript" in url or "dynamic" in url:
        driver = init_selenium_driver()
        try:
            return extract_dynamic_page(url, driver)
        finally:
            # Always release the browser, even if something escapes the extractor
            # (e.g. KeyboardInterrupt, which its `except Exception` does not catch).
            driver.quit()

    return extract_static_page(url)

def scrape_web(urls, max_urls = 5):
    """Scrape at most ``max_urls`` of ``urls`` and return the texts that were retrieved.

    URLs whose scrape yields nothing are reported with a printed message and
    left out of the result.
    """
    collected = []
    selected = urls[:max_urls]

    for link in tqdm(selected, desc="Scraping websites"):
        page_text = scrape_page(link)
        if not page_text:
            print(f"Failed to retrieve content from {link}")
            continue
        collected.append(page_text)

    return collected

def check_search_needed(model, query , context):
    """Ask the model whether ``query`` needs a web search.

    Returns:
        ``(True, search_terms)`` when the model's reply contains ``<SEARCH>``;
        ``search_terms`` is the text following the first marker, or ``query``
        itself when the model emitted the marker without any keywords.
        ``(False, None)`` otherwise.
    """
    prompt = create_search_prompt(query , context)
    response = generate_gemini_response(model, prompt)

    if "<SEARCH>" not in response:
        return False, None

    search_terms = response.split("<SEARCH>")[1].strip()
    # An empty keyword string would be handed straight to the search tool;
    # fall back to the user's own wording instead.
    return True, search_terms or query

def summarize_content(model, content):
    """Return the model's summary of ``content`` using the summary prompt."""
    return generate_gemini_response(model, create_summary_prompt(content))

def process_query(query , context = ''):
    """Decide whether ``query`` needs a web search and, if so, run and summarise it.

    Returns a dict with the keys ``original_query``, ``needs_search``,
    ``search_terms``, ``web_content``, ``summary`` and ``raw_response``.
    ``web_content`` and ``summary`` stay ``None`` when no search is needed;
    ``raw_response`` is always ``None``.
    """
    model = get_gemini_model()
    search_tool = DuckDuckGoSearchRun()

    needs_search, search_terms = check_search_needed(model, query , context)

    web_content = None
    summary = None
    if needs_search:
        # web_content is whatever the search tool returns for the keywords.
        web_content = search_tool.run(search_terms)
        summary = summarize_content(model, web_content)

    return {
        "original_query": query,
        "needs_search": needs_search,
        "search_terms": search_terms,
        "web_content": web_content,
        "summary": summary,
        "raw_response": None
    }

In [22]:
def run(question , context = '' , debug = False):
    """Run ``process_query`` for ``question`` and optionally print a short report.

    Args:
        question: The user query.
        context: Context forwarded to ``process_query`` (previously it was
            accepted but silently replaced by an empty string).
        debug: When true, print whether a search was needed and, if so, the
            search terms and summary.

    Returns:
        The dict returned by ``process_query``.
    """
    result = process_query(question , context = context)
    if debug:
        print("\nQuery Results:")
        print(f"Search needed: {result['needs_search']}")

        if result['needs_search']:
            print(f"\nSearch terms used: {result['search_terms']}")
            print("\nSummary of findings:")
            print(result['summary'])
    return result

In [23]:
# Example: run the web-search gate on its own; the returned dict is shown as the cell output.
run('I am having high fever and nausea since this morning.')

{'original_query': 'I am having high fever and nausea since this morning.',
 'needs_search': True,
 'search_terms': 'high fever nausea treatment',
 'web_content': "Call your healthcare professional if the fever doesn't respond to the medicine, stays at 103 F (39.4 C) or higher or lasts longer than three days. There is a problem with information submitted for this request. Evans SS, Repasky EA, Fisher DT. Fever and the thermal regulation of immunity: the immune system feels the heat. Nat Rev Immunol. 2015;15(6):335-49. doi:10.1038/nri3843 Baran G, Turan E. Investigation of the effect of the training on fever and febrile convulsion management given to pediatric nurses on their knowledge level. Int J Caring Sci. 2018;11(1):478-87. The appropriate course of treatment is needed to eliminate the symptoms of fever. The most common treatment options for acute febrile illness include antibiotics, multivitamins, antiviral medications, or antimalarial drugs. These medicines help avoid lethargy, d

In [28]:
def QA_chain_with_websearch(user_question, vectorstore, max_queries = 3, k_retrieval = 3 , web_mode = True):
    """Answer from the vector store and, optionally, enrich the answer with web content.

    Args:
        user_question: The user's query.
        vectorstore: Store passed to ``self_clarifying_qa``.
        max_queries: Forwarded to ``self_clarifying_qa``.
        k_retrieval: Forwarded to ``self_clarifying_qa``.
        web_mode: When false, skip the web entirely and return the corpus summary.

    Returns:
        str: the corpus summary when web search is off, yields no content, or
        the final merge fails; the web summary when the corpus had nothing to
        offer; otherwise the merged final summary.
    """
    result = self_clarifying_qa(user_question , vectorstore , max_queries , k_retrieval)
    corpus_summary = result['summary']

    if not web_mode:
        return corpus_summary

    print('Searching The web!')
    scrape_summary = run(user_question)
    web_content = scrape_summary.get('web_content') if scrape_summary else None
    # No search was needed (or it returned nothing): there is nothing to merge.
    if not web_content:
        return corpus_summary

    web_summary = web_summary_chain.run(history=web_content)

    # "Nothing found" can arrive as either prompt's fallback sentence, possibly
    # quoted, so compare on a normalized form against both.
    normalized = corpus_summary.strip().strip("\"'`").strip().rstrip(".").lower()
    if normalized in (
        "no suitable answer found in database",
        "no information available to summarize",
    ):
        return web_summary

    try:
        return final_summary_chain.run(web = web_summary , text = corpus_summary)
    except Exception as exc:
        # Best effort: keep the corpus answer if the final merge call fails.
        print(f"Final summary failed, returning corpus summary: {exc}")
        return corpus_summary

In [29]:
# Example end-to-end run: corpus answer plus web enrichment (web_mode defaults to True).
question = 'I am having high fever and nausea since this morning.'
QA_chain_with_websearch(question , vectorstore)

Searching The web!


"Here is a concise and detailed final summary of the information:\n\n**Fever and Nausea Management**\n\nBased on the corpus text, if you are experiencing high fever and nausea, it is recommended to:\n\n* Take paracetamol for fever relief\n* Stay hydrated\n* Avoid outside or junk food\n\nIf your symptoms worsen or other severe symptoms appear, consider visiting the nearest Emergency Room.\n\n**Additional Guidelines from Web Content**\n\n* For adults, seek medical attention if fever is over 104°F (40°C) or 105°F (40.5°C) for emergency care. (Source: Web search content)\n* For newborns (0-3 months), call healthcare provider or go to emergency room if fever is 100.4°F (38°C). (Source: Web search content)\n* Viral gastroenteritis (stomach flu) can cause fever, diarrhea, nausea, and vomiting, and is often spread through contact with an infected person or contaminated food/water. (Source: Web search content)\n\nNote: The additional guidelines from web content are provided for reference and ma