In [131]:
from langchain import PromptTemplate
from langchain.prompts import PromptTemplate
from langchain.chains import RetrievalQA
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Pinecone
from pinecone import Pinecone
from langchain.document_loaders import PyMuPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.llms import CTransformers
from langchain_pinecone import PineconeVectorStore
from langchain_community.embeddings import HuggingFaceInstructEmbeddings


In [132]:
def load_pdf(Data):
    loader=DirectoryLoader(Data,
                           glob="*.pdf",
                           loader_cls=PyMuPDFLoader)
    documents=loader.load()
    return documents

In [133]:
extracted_data=load_pdf("Data/")

In [134]:
#creating text chunks

def text_split(extracted_data):
    text_splitter=RecursiveCharacterTextSplitter(chunk_size=500,chunk_overlap=20)
    text_chunks  =text_splitter.split_documents(extracted_data)
    
    return text_chunks

In [135]:
text_chunks=text_split(extracted_data)
print("length of chunks:",len(text_chunks))

length of chunks: 6975


In [136]:
#download embedding model
def download_hugging_face_embedding():
    embeddings=HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
    return embeddings

In [137]:
embeddings=download_hugging_face_embedding()

In [138]:
query_result=embeddings.embed_query("Hello World")
print("length",len(query_result))

length 384


In [140]:
import os
import pinecone
os.environ["PINECONE_API_KEY"] = "93eeab5f-2929-47af-9bc7-6359838839f2"

index_name =pinecone.Index("medicalchatbot",host="https://medicalchatbot-2whmgb2.svc.aped-4627-b74a.pinecone.io")


In [142]:
from sentence_transformers import SentenceTransformer
import numpy as np
import pinecone
# Create vector embeddings for your text chunks in batches
def batch_encode(texts, model, batch_size=32):
    embeddings = []
    for i in range(0, len(texts), batch_size):
        batch = texts[i:i+batch_size]
        batch_embeddings = model.encode(batch)
        embeddings.extend(batch_embeddings)
    return embeddings

# Initialize the embedding model
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

# Get the text contents
text_contents = [t.page_content for t in text_chunks]

# Generate embeddings in batches
embeddings = batch_encode(text_contents, model)
embeddings = np.array(embeddings).tolist()

# Prepare data for upsert with metadata
data_to_upsert = [(str(i), embedding, {"text": text_contents[i]}) for i, embedding in enumerate(embeddings)]

# Define a function to upsert data in batches
def upsert_in_batches(index, data, batch_size=100):
    for i in range(0, len(data), batch_size):
        batch = data[i:i + batch_size]
        index.upsert(vectors=batch)

# Upsert data into Pinecone in smaller batches
upsert_in_batches(index, data_to_upsert)

print("Vector embeddings have been stored in Pinecone.")

Vector embeddings have been stored in Pinecone.


In [145]:
embeddings=download_hugging_face_embedding()
query="what causes legs to swell?"
query_embedding = model.encode([query])[0] 
print(query_embedding) 


[-1.89489760e-02  9.86824394e-04  5.44166230e-02  4.78827208e-02
  1.87477507e-02 -3.38590406e-02  6.16860762e-02  1.34538069e-01
 -2.23285053e-02  3.50672193e-02 -6.04711771e-02  2.66611390e-03
  9.16530006e-03  4.48654778e-02 -2.36200504e-02  3.32889855e-02
 -8.24605823e-02  1.73038431e-03 -3.18099484e-02 -3.31121720e-02
  3.34683359e-02  5.20865135e-02 -1.01919688e-01 -8.30984861e-03
  2.69129165e-02  8.68218485e-03 -7.20772520e-03  1.43263554e-02
  4.80915681e-02 -4.13518883e-02 -4.81268987e-02  8.62404034e-02
 -1.00215197e-01 -1.80150289e-02  3.40068862e-02 -2.16871891e-02
  1.76383164e-02 -2.32825857e-02 -4.74188067e-02 -4.02738387e-03
  3.32778580e-02 -6.36156276e-02  1.83737837e-03 -2.07873452e-02
  3.44032981e-02  3.59809808e-02  3.00427377e-02  7.08342046e-02
  2.30521541e-02  9.15039405e-02  2.16997564e-02 -1.16451746e-02
  3.73596996e-02  1.84948463e-02 -5.48047386e-02 -1.07157283e-01
 -8.56020674e-02 -2.00351495e-02  2.85031218e-02 -8.82086381e-02
 -5.76434247e-02  8.87116

In [148]:
docsearch = index_name.query(
    namespace="ns1",
    vector=query_embedding.tolist(),  
    top_k=5,  # Adjust the number of results you want
    include_values=True,
    include_metadata=True,
    filter={"genre": {"$eq": "medical"}}  # Adjust the filter as needed
)

print("Search results:", docsearch)

ForbiddenException: (403)
Reason: Forbidden
HTTP response headers: HTTPHeaderDict({'Date': 'Sat, 27 Jul 2024 23:21:58 GMT', 'Content-Type': 'text/plain', 'Content-Length': '9', 'Connection': 'keep-alive', 'x-pinecone-auth-rejected-reason': 'Wrong API key', 'www-authenticate': 'Wrong API key', 'server': 'envoy'})
HTTP response body: Forbidden


In [110]:
prompt_template="""
Use the following pieces of information to answer the user's question.
If you don't know the answer.just say that you don't know,don't try to make up an answer.

Context:{context}
Question:{question}

only return the helpful answer below and nothing else.
Helpful answer:
"""

In [111]:
PROMPT=PromptTemplate(template=prompt_template,input_variables=["context","question"])
chain_type_kwargs={"prompt":PROMPT}

In [112]:
llm=CTransformers(model="Model\llama-2-7b-chat.ggmlv3.q4_0.bin",
                  model_type="llama",
                  config={'max_new_tokens':512,'temperature':0.8})


In [116]:
qa=RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriver=docsearch.as_retriever(search_kwargs={'k':2}),
    return_source_documents=True,
    chain_type_kwargs=chain_type_kwargs
)

Unexpected exception formatting exception. Falling back to standard exception


Traceback (most recent call last):
  File "c:\Users\venga\anaconda3\envs\mchatbot\lib\site-packages\IPython\core\interactiveshell.py", line 3508, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "C:\Users\venga\AppData\Local\Temp\ipykernel_13384\2246711962.py", line 1, in <module>
    qa=RetrievalQA.from_chain_type(
  File "c:\Users\venga\anaconda3\envs\mchatbot\lib\site-packages\langchain\chains\retrieval_qa\base.py", line 95, in from_chain_type
    )
  File "c:\Users\venga\anaconda3\envs\mchatbot\lib\site-packages\langchain\load\serializable.py", line 74, in __init__
  File "pydantic\main.py", line 341, in pydantic.main.BaseModel.__init__
pydantic.error_wrappers.ValidationError: 2 validation errors for RetrievalQA
retriever
  field required (type=value_error.missing)
retriver
  extra fields not permitted (type=value_error.extra)

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "c:\Users\venga\anaco

In [114]:
while True:
    user_input=input(f"Input Prompt:")
    result=qa({"query":user_input})
    print("response :",result["result"])

NameError: name 'qa' is not defined