In [1]:
import os 
# Step up one directory so that relative paths used by later cells (e.g. "data/")
# resolve from the parent folder.
# NOTE(review): the target is relative to the current directory, so running this
# cell a second time in the same kernel moves up one more level.
os.chdir("../")
# Echo the working directory that is now in effect.
%pwd

'f:\\Generative AI course\\Projects\\Chatbot\\Pdf_Query_Pinecone'

In [31]:
from langchain.document_loaders import DirectoryLoader, PyPDFLoader

# Gather the files under data/ that match *.pdf and parse each one with PyPDFLoader.
loader = DirectoryLoader(
    "data/",
    glob="*.pdf",
    loader_cls=PyPDFLoader,
)
documents = loader.load()

In [32]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Break the loaded documents into small, slightly overlapping chunks for indexing.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=20,
)
text_chunks = text_splitter.split_documents(documents)

In [33]:
import os

from dotenv import load_dotenv

# Load variables from a .env file (if one is found) into the process environment.
load_dotenv()

# Pinecone connection settings; each is None when the variable is not set.
PINECONE_API_KEY = os.getenv('PINECONE_API_KEY')
PINECONE_API_ENV = os.getenv('PINECONE_API_ENV')
PINECONE_HOST_URL = os.getenv('PINECONE_HOST_URL')
PINECONE_PORT = os.getenv('PINECONE_PORT')

In [34]:
from langchain.embeddings import HuggingFaceEmbeddings

# Embedding model shared by the indexing and querying cells below.
embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2",
)

In [35]:
# Display the embeddings object to inspect the loaded model's configuration
# (the captured output reports a word embedding dimension of 384).
embeddings

HuggingFaceEmbeddings(client=SentenceTransformer(
  (0): Transformer({'max_seq_length': 256, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})
  (2): Normalize()
), model_name='sentence-transformers/all-MiniLM-L6-v2', cache_folder=None, model_kwargs={}, encode_kwargs={})

In [36]:
import pinecone

# Authenticate the client with the API key only. The client constructor's `host`
# argument is the control-plane host, so the index host URL must not be passed there.
pc = pinecone.Pinecone(api_key=PINECONE_API_KEY)

# Handle to the "medicalbot" index, addressed through its own host URL.
index = pc.Index("medicalbot", host=PINECONE_HOST_URL)

# Save the vectors and index them in the database

In [37]:
from langchain_pinecone import Pinecone

index_name = "medicalbot"

# Build the vector store from the chunks, using `embeddings` to vectorise them
# and the Pinecone index named above as the backing store.
# NOTE(review): re-running this cell presumably inserts the same chunks again — confirm.
docsearch = Pinecone.from_documents(
    documents=text_chunks,
    embedding=embeddings,
    index_name=index_name,
)

# To add more texts to the existing index later, use the snippet below:
# vectorstore = Pinecone(index_name=index_name, embedding=embeddings)
# vectorstore.add_texts(["More text!"])

In [42]:

# Fetch four hits for the question and print the text of the fourth one.
query = "what human required to progress?"
docs = docsearch.similarity_search(query=query, k=4)
fourth_hit = docs[3]
print(fourth_hit.page_content)

powers, and capacity to scale with human creativity, human 
judgment, and human guidance.


In [None]:
# Besides plain similarity search, the vector store can be wrapped in a retriever
# that uses MMR (search_type="mmr") to select the returned documents.
query = "what human required to progress?"
retriever = docsearch.as_retriever(search_type="mmr")
# `invoke` is the supported retriever entry point; `get_relevant_documents` is deprecated.
matched_docs = retriever.invoke(query)
for i, d in enumerate(matched_docs):
    print(f"\n## Document {i}\n")
    print(d.page_content)


# or use directly below function
# found_docs = docsearch.max_marginal_relevance_search(query, k=2, fetch_k=10)
# for i, doc in enumerate(found_docs):
#     print(f"{i + 1}.", doc.page_content, "\n")

In [43]:
# Step 3: Load a transformer question-answering pipeline backed by the checkpoint named below.
from transformers import pipeline

qa_pipeline = pipeline(
    task="question-answering",
    model="distilbert-base-cased-distilled-squad",
)

In [15]:
def query_embeddings(text: str, num_results: int):
    """Embed ``text`` and query the Pinecone index with the resulting vector.

    Relies on the notebook-level ``embeddings`` and ``index`` objects.

    Args:
        text: The query string to embed.
        num_results: Number of matches to request from the index (``top_k``).

    Returns:
        The response of ``index.query``, requested with both vector values and
        metadata included for every match.
    """
    # Use the query-side embedding API rather than embedding a one-item document
    # batch and unwrapping it.
    query_vector = embeddings.embed_query(text)
    return index.query(
        vector=query_vector,
        top_k=num_results,
        include_values=True,
        include_metadata=True,
    )

In [None]:
# Request five matches: the inspection cells that follow index results.matches[0]
# through results.matches[4], so asking for only two would raise IndexError there.
text = "what human required to progress?"
results = query_embeddings(text, num_results=5)


In [None]:
# Text stored under the "text" metadata key of the first match.
results.matches[0].metadata["text"]

In [None]:
# Score reported for the first match.
results.matches[0].score

In [None]:
# Value stored under the "page" metadata key of the first match.
results.matches[0].metadata["page"]

In [None]:
# Check the Python type of the second match's stored text.
type(results.matches[1].metadata["text"])

In [None]:
# "page" metadata of the second match.
results.matches[1].metadata["page"]

In [None]:
# Text of the third match.
# NOTE(review): indices 2-4 below only exist if the query requested (and the
# index returned) at least five matches; otherwise these cells raise IndexError.
results.matches[2].metadata["text"]

In [None]:
# "page" metadata of the third match.
results.matches[2].metadata["page"]

In [None]:
# "page" metadata of the fourth match.
results.matches[3].metadata["page"]

In [None]:
# "page" metadata of the fifth match.
results.matches[4].metadata["page"]

In [None]:
# Number of matches actually returned.
len(results.matches)

In [None]:
# Vector values of the first match; requested via include_values=True in query_embeddings.
results.matches[0].values