In [1]:
import os

# Switch to the parent directory so relative paths used later (e.g. "DATA/")
# resolve from there. The starting directory is remembered the first time the
# cell runs, so re-executing the cell does not climb one level higher each time.
if "NOTEBOOK_START_DIR" not in globals():
    NOTEBOOK_START_DIR = os.getcwd()
os.chdir(os.path.join(NOTEBOOK_START_DIR, os.pardir))

In [2]:
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [3]:
# Load every PDF file in a directory
def load_pdf(data):
    """Load the PDF files found in a directory.

    Args:
        data: Path of the directory scanned with the ``*.pdf`` glob.

    Returns:
        Whatever ``DirectoryLoader.load()`` produces when each matched file
        is read with ``PyPDFLoader``.
    """
    return DirectoryLoader(data, glob="*.pdf", loader_cls=PyPDFLoader).load()

In [4]:
# Load the PDFs from the DATA/ directory (relative to the current working directory).
data_loaded = load_pdf("DATA/")

In [None]:
#data_loaded

In [5]:
#Split the data into chunks 

def text_split(data_loaded):
    """Split loaded documents into overlapping text chunks.

    Args:
        data_loaded: The documents to split, as returned by ``load_pdf``.

    Returns:
        The chunks produced by a ``RecursiveCharacterTextSplitter`` configured
        with ``chunk_size=500`` and ``chunk_overlap=50``.
    """
    splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
    return splitter.split_documents(data_loaded)

In [6]:
# Split the loaded documents into the chunks that are embedded further down.
text_chunks = text_split(data_loaded=data_loaded)



In [7]:
from langchain.embeddings import HuggingFaceEmbeddings

In [8]:
# Load the sentence-embedding model from Hugging Face
def download_hugging_face_embed():
    """Create the Hugging Face embedding model used by this notebook.

    Returns:
        A ``HuggingFaceEmbeddings`` instance for the
        ``sentence-transformers/all-MiniLM-L6-v2`` model.
    """
    model_name = 'sentence-transformers/all-MiniLM-L6-v2'
    return HuggingFaceEmbeddings(model_name=model_name)

In [9]:
# Build the embedding model once; the same object is reused for the sanity
# check, the Pinecone upsert and the index lookup in later cells.
embeddings = download_hugging_face_embed()

  embeddings= HuggingFaceEmbeddings(model_name = 'sentence-transformers/all-MiniLM-L6-v2')
  from .autonotebook import tqdm as notebook_tqdm
  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(


In [10]:
# Sanity check: embed a short string and report the vector length, which is
# the dimension the vector index has to be created with.
query_result = embeddings.embed_query("Hello World")
print(f"Length {len(query_result)}")

Length 384


In [11]:
from dotenv import load_dotenv
# Presumably loads variables from a local .env file into os.environ so the
# API keys read in the next cell are available — confirm a .env file exists.
# The call's return value is what the cell displays.
load_dotenv()

True

In [None]:
# Read the service credentials from the process environment; either value is
# None when the corresponding variable is not set.
PINECONE_API_KEY = os.getenv('PINECONE_API_KEY')
GROQ_API_KEY = os.getenv('GROQ_API_KEY')

In [17]:
from pinecone import Pinecone, ServerlessSpec
import os

# Connect to Pinecone with the key read from the environment.
pc = Pinecone(api_key = PINECONE_API_KEY)
index_name = "medicalbot"

# Create the index only when it does not exist yet, so that re-running this
# cell does not fail on the already-created index.
if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        dimension=384,  # must equal the embedding length printed by the sanity check above
        metric="cosine",
        spec=ServerlessSpec(
            cloud="aws",
            region="us-east-1"
        )
    )

# Show the index description as the cell output, whether it was just created
# or already existed.
pc.describe_index(index_name)

{
    "name": "medicalbot",
    "metric": "cosine",
    "host": "medicalbot-gw3p5a1.svc.aped-4627-b74a.pinecone.io",
    "spec": {
        "serverless": {
            "cloud": "aws",
            "region": "us-east-1"
        }
    },
    "status": {
        "ready": true,
        "state": "Ready"
    },
    "vector_type": "dense",
    "dimension": 384,
    "deletion_protection": "disabled",
    "tags": null
}

In [18]:
import os

# Export the key through the process environment (presumably where the
# LangChain Pinecone integration looks for it — confirm against its docs).
# Fail with a readable message instead of the "str expected, not NoneType"
# TypeError that assigning None to os.environ would raise.
if PINECONE_API_KEY is None:
    raise RuntimeError(
        "PINECONE_API_KEY is not set; add it to your .env file or environment."
    )
os.environ["PINECONE_API_KEY"] = PINECONE_API_KEY


In [19]:
#Embed chunk and upsert embedding into Pinecone index
from langchain_pinecone import PineconeVectorStore

# NOTE(review): every run of this cell sends all chunks to the index again;
# presumably that stores duplicates on re-run — confirm before re-executing.
docsearch = PineconeVectorStore.from_documents(
    text_chunks,
    embeddings,
    index_name=index_name,
)

In [20]:
#Load index

# Attach to the already-populated index; this rebinds `docsearch`, replacing
# the store object returned by the upsert cell.
docsearch = PineconeVectorStore.from_existing_index(index_name, embedding=embeddings)


In [21]:
# Expose the vector store as a retriever that performs a similarity search
# and returns the top 3 matches per query.
retriever = docsearch.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 3},
)

In [22]:
# Smoke-test the retriever with a sample question.
retrieved_docs = retriever.invoke ("What is Acne?")

In [23]:
# Display the documents returned for the sample question.
retrieved_docs

[Document(id='c59cd73e-41cc-41aa-92e8-4c582b7fb817', metadata={'creationdate': '2006-10-16T20:19:33+02:00', 'creator': 'Adobe Acrobat 6.0', 'moddate': '2006-10-16T22:03:45+02:00', 'page': 54.0, 'page_label': '25', 'producer': 'PDFlib+PDI 6.0.3 (SunOS)', 'source': 'DATA\\medical_book.pdf', 'total_pages': 4505.0}, page_content='Acne vulgaris, the medical term for common acne,\nis the most common skin disease. It affects nearly 17\nmillion people in the United States. While acne can\narise at any age, it usually begins atpuberty and wor-\nsens during adolescence. Nearly 85% of people\ndevelop acne at some time between the ages of 12-25\nyears. Up to 20% of women develop mild acne. It is\nalso found in some newborns.\nThe sebaceous glands lie just beneath the skin’s\nsurface. They produce an oil called sebum, the skin’s'),
 Document(id='5766d21f-b1f0-4ea9-8562-11d6972d0ddd', metadata={'creationdate': '2006-10-16T20:19:33+02:00', 'creator': 'Adobe Acrobat 6.0', 'moddate': '2006-10-16T22:03:

In [24]:
import os
from typing import List, Optional

# 1) Install needed libraries:
#    pip install groq requests langchain pydantic

from groq import Groq
from pydantic import BaseModel, Field
from langchain.llms.base import LLM

# NOTE(review): LangChain's LLM base is presumably already a pydantic model, so
# the extra BaseModel base may be redundant — confirm before removing it.
class OfficialGroqLLM(LLM, BaseModel):
    """
    A LangChain-compatible LLM wrapper that calls Groq's official Python client.

    Each call sends one chat-completion request made of a fixed system message
    plus the prompt as a single user message, and returns the text of the
    first choice.

    Fields:
        api_key: Groq API key used to build the client.
        model: Name of the Groq model to query.
        temperature: Sampling temperature forwarded to the API.
        max_tokens: Maximum number of tokens to generate, forwarded to the API.
    """

    api_key: str = Field(..., description="Your Groq API key")
    model: str = Field("llama-3.3-70b-versatile", description="Groq model name")
    temperature: float = Field(0.4, description="Sampling temperature")
    max_tokens: int = Field(500, description="Maximum tokens to generate")

    @property
    def _llm_type(self) -> str:
        """Return the identifier string for this LLM implementation."""
        return "groq_official"

    def _call(
        self,
        prompt: str,
        stop: Optional[List[str]] = None,
        run_manager: Optional[object] = None,
        **kwargs: object,
    ) -> str:
        """
        Send the prompt to Groq and return the generated text.

        The request contains a fixed "You are a helpful assistant." system
        message followed by ``prompt`` as the user message. Customize the
        ``messages`` list below if a different system message is needed.

        Args:
            prompt: The fully formatted prompt text.
            stop: Optional stop sequences; forwarded to the API when given.
            run_manager: Accepted so LangChain can pass its callback manager;
                not used here.
            **kwargs: Extra call-time arguments from LangChain; accepted so
                the call does not fail, but not forwarded to Groq.

        Returns:
            The content of the first choice, or an empty string when the API
            returns no content.
        """
        # Initialize the Groq client
        client = Groq(api_key=self.api_key)

        # Convert the LangChain prompt into Groq's chat format
        messages = [
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": prompt},
        ]

        request = {
            "messages": messages,
            "model": self.model,
            "temperature": self.temperature,
            "max_tokens": self.max_tokens,
        }
        # Only send `stop` when the caller supplied sequences, so the request
        # is unchanged in the default case.
        if stop:
            request["stop"] = stop

        # Call the official Groq API
        chat_completion = client.chat.completions.create(**request)

        # Return the text from the first choice; guard against a missing
        # content field so the declared `str` return type always holds.
        content = chat_completion.choices[0].message.content
        return content if content is not None else ""

    async def _acall(
        self,
        prompt: str,
        stop: Optional[List[str]] = None,
        run_manager: Optional[object] = None,
        **kwargs: object,
    ) -> str:
        """Async variant of ``_call``; not supported.

        Raises:
            NotImplementedError: Always.
        """
        raise NotImplementedError("Async method not implemented for OfficialGroqLLM.")



In [25]:


# Read the key from the environment and stop right away with a clear message
# when it is missing, instead of passing a placeholder string that would
# presumably only be rejected once the first query is sent.
groq_api_key = os.environ.get("GROQ_API_KEY")
if not groq_api_key:
    raise RuntimeError(
        "GROQ_API_KEY is not set; add it to your .env file or environment."
    )

groq_llm = OfficialGroqLLM(
    api_key=groq_api_key,
    model="llama-3.3-70b-versatile",
    temperature=0.4,
    max_tokens=500
)


In [26]:
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate

# Instructions for the model; {context} is a template placeholder,
# presumably filled with the retrieved documents by the documents chain.
system_prompt = (
    "You are an assistant for question-answering tasks. "
    "Use the following pieces of retrieved context to answer "
    "the question. If you don't know the answer, say that you "
    "don't know. Use five sentences maximum and keep the "
    "answer concise."
    "\n\n"
    "{context}"
)

# Two-message chat template: the system instructions followed by the user's
# question, supplied through the {input} placeholder.
prompt = ChatPromptTemplate.from_messages([("system", system_prompt), ("human", "{input}")])


In [27]:
# Build the answer-generation chain from the LLM and prompt, then combine it
# with the retriever to form the full RAG chain.
question_answer_chain = create_stuff_documents_chain(llm=groq_llm, prompt=prompt)
rag_chain = create_retrieval_chain(
    retriever=retriever,
    combine_docs_chain=question_answer_chain,
)


In [28]:
# Ask the RAG chain a question and print only the generated answer text.
response = rag_chain.invoke({"input": "What is stats?"})
respone = response  # old misspelled name kept in case a later cell still uses it
print(response["answer"])

I don't know what "stats" refers to in this context. The provided information discusses psychological testing and terms related to it, but it does not define "stats." If you meant to ask about statistics, I can tell you that it involves the study of data collection and analysis, but this is not explicitly mentioned in the given context. Without more information, I cannot provide a specific answer.
