In [1]:
from pinecone import Pinecone
from sentence_transformers import SentenceTransformer
from PyPDF2 import PdfReader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
import os

# Initialize Pinecone client
pinecone_api_key = os.getenv("PINECONE_API_KEY")
pinecone_environment = os.getenv("PINECONE_API_ENV")
pc = Pinecone(api_key=pinecone_api_key, environment=pinecone_environment)

# Specify the index name
index_name = "medical-bot"
index = pc.Index(index_name)

In [2]:
def load_data(data):
    loader = DirectoryLoader(data, glob="*.pdf", loader_cls=PyPDFLoader)
    documents = loader.load()
    return documents

In [3]:
extracted_pdf = load_data("E:\Girish Documents\Study\Data Science\DataScience_GenAI_Study\GenAI_Project_Medical-Chatbot-Using-LLAMA2\data")

In [4]:
# Function to create embeddings from text using Sentence Transformers
def create_embedding(text):
    model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
    embeddings = model.encode([text], show_progress_bar=False)
    return embeddings[0]

In [5]:
# Function to split text into chunks
def text_split(extracted_data):
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap = 20)
    text_chunks = text_splitter.split_documents(extracted_data)
    return text_chunks

In [6]:
extracted_data = text_split(extracted_pdf)

In [7]:
extracted_data

[Document(page_content='TheGALE\nENCYCLOPEDIA\nofMEDICINE\nSECOND EDITION', metadata={'source': 'E:\\Girish Documents\\Study\\Data Science\\DataScience_GenAI_Study\\GenAI_Project_Medical-Chatbot-Using-LLAMA2\\data\\Medical_book.pdf', 'page': 1}),
 Document(page_content='TheGALE\nENCYCLOPEDIA\nofMEDICINE\nSECOND EDITION\nJACQUELINE L. LONGE, EDITOR\nDEIRDRE S. BLANCHFIELD, ASSOCIATE EDITOR\nVOLUME\nA-B1', metadata={'source': 'E:\\Girish Documents\\Study\\Data Science\\DataScience_GenAI_Study\\GenAI_Project_Medical-Chatbot-Using-LLAMA2\\data\\Medical_book.pdf', 'page': 2}),
 Document(page_content='STAFF\nJacqueline L. Longe, Project Editor\nDeirdre S. Blanchfield, Associate Editor\nChristine B. Jeryan, Managing Editor\nDonna Olendorf, Senior Editor\nStacey Blachford, Associate Editor\nKate Kretschmann, Melissa C. McDade, Ryan\nThomason, Assistant Editors\nMark Springer, Technical Specialist\nAndrea Lopeman, Programmer/Analyst\nBarbara J. Yarrow, Manager, Imaging and Multimedia\nContent\n

In [10]:
# Assuming documents is a list of Document objects
for i, document in enumerate(extracted_data):
    # Access the page content and metadata attributes correctly
    if hasattr(document, 'page_content') and hasattr(document, 'metadata'):
        page_content = document.page_content
        metadata_source = document.metadata.get('source', '')
        metadata_page = document.metadata.get('page', '')
        
        print(f"Document {i + 1}:\nPage Content: {page_content}\nMetadata - Source: {metadata_source}\nMetadata - Page: {metadata_page}\n")
        
        # Create embeddings for the document
        embedding = create_embedding(page_content)

        # Define metadata for each document
        max_metadata_size = 40500  # Maximum allowed metadata size in Pinecone
        chunk_metadata = {"page_content_chunk": page_content[:min(500, len(page_content))]}  # Example metadata

        # Ensure that metadata size does not exceed the limit
        truncated_metadata = chunk_metadata if len(str(chunk_metadata)) <= max_metadata_size else None

        # Create Pinecone data
        pinecone_data = {"id": f"your_id_{i}", "values": embedding.tolist(), "metadata": chunk_metadata}

        # Index the data in Pinecone
        index.upsert([pinecone_data])
    else:
        # If page_content or metadata attributes are not present, assume Document is a string
        print(f"Document {i + 1} (Assuming Document is a string):\n{document}\n")


Document 1:
Page Content: TheGALE
ENCYCLOPEDIA
ofMEDICINE
SECOND EDITION
Metadata - Source: E:\Girish Documents\Study\Data Science\DataScience_GenAI_Study\GenAI_Project_Medical-Chatbot-Using-LLAMA2\data\Medical_book.pdf
Metadata - Page: 1

Document 2:
Page Content: TheGALE
ENCYCLOPEDIA
ofMEDICINE
SECOND EDITION
JACQUELINE L. LONGE, EDITOR
DEIRDRE S. BLANCHFIELD, ASSOCIATE EDITOR
VOLUME
A-B1
Metadata - Source: E:\Girish Documents\Study\Data Science\DataScience_GenAI_Study\GenAI_Project_Medical-Chatbot-Using-LLAMA2\data\Medical_book.pdf
Metadata - Page: 2

Document 3:
Page Content: STAFF
Jacqueline L. Longe, Project Editor
Deirdre S. Blanchfield, Associate Editor
Christine B. Jeryan, Managing Editor
Donna Olendorf, Senior Editor
Stacey Blachford, Associate Editor
Kate Kretschmann, Melissa C. McDade, Ryan
Thomason, Assistant Editors
Mark Springer, Technical Specialist
Andrea Lopeman, Programmer/Analyst
Barbara J. Yarrow, Manager, Imaging and Multimedia
Content
Robyn V . Young, Project Man

In [11]:
prompt_template="""
Use the following pieces of information to answer the user's question.
If you don't know the answer, just say that you don't know, don't try to make up an answer.

Context: {context}
Question: {question}

Only return the helpful answer below and nothing else.
Helpful answer:
"""

In [19]:
from sentence_transformers import SentenceTransformer
import numpy as np

def embed_text(question):
    # Load a pre-trained model (you can replace 'bert-base-nli-mean-tokens' with other available models)
    model = SentenceTransformer('bert-base-nli-mean-tokens')

    # Convert the question into a numerical vector
    question_embedding = model.encode([question])[0]

    return question_embedding.tolist()  # Convert NumPy array to a Python list

# Example question
question = "What is the information about allergy?"

# Embed the question
question_embedding = embed_text(question)

print("Question Embedding:", question_embedding)

Question Embedding: [0.5090919733047485, 0.46127447485923767, 0.7511740922927856, -0.1276034563779831, 0.36084991693496704, -0.13014070689678192, 0.02417243830859661, 0.19066718220710754, 0.6438664197921753, 0.20932559669017792, -0.506260097026825, 0.6784447431564331, -0.0448787622153759, 0.9384334683418274, 0.3237760365009308, 0.32427161931991577, -0.4839401841163635, 0.3733147978782654, 0.40260785818099976, -0.12346889078617096, -0.0573415532708168, 0.26762163639068604, -0.4738863408565521, -0.46266070008277893, 0.5659618377685547, -0.05392877385020256, 0.01528379786759615, -0.30587926506996155, -0.43391257524490356, -0.5557271242141724, 0.2791118323802948, 0.9488075375556946, -0.42270296812057495, -0.26407164335250854, -0.8306460380554199, 0.8476095199584961, -0.07939176261425018, 0.37859758734703064, -0.37516671419143677, 0.20264932513237, -0.696886420249939, 0.2200450599193573, 0.06680841743946075, 0.4754409193992615, -0.6031928062438965, -0.548994779586792, 0.40602049231529236, -

In [None]:
from pinecone import Pinecone

# Initialize Pinecone client
pinecone_api_key = os.getenv("PINECONE_API_KEY")
pinecone_environment = os.getenv("PINECONE_API_ENV")
pc = Pinecone(api_key=pinecone_api_key, environment=pinecone_environment)

# Specify the index name
index_name = "medical-bot"
index = pc.Index(index_name)

# Search for similar vectors
results = index.query(queries=[question_embedding], top_k=3)

# Retrieve document information for the top result
# Search for similar vectors
try:
    results = index.query(queries=[question_embedding], top_k=3)
    
    if results.count > 0:
        top_result_id = results.ids[0]
        retrieved_document = index.retrieve(ids=[top_result_id])

        # Access the information from the retrieved document
        document_information = retrieved_document.data[0]
        print("Document Information:", document_information)
    else:
        print("No matching documents found.")
except Exception as e:
    print(f"Error: {e}")

In [34]:
from sentence_transformers import SentenceTransformer
from pinecone import Pinecone
from langchain.embeddings import HuggingFaceEmbeddings

# Initialize Pinecone client
pinecone_api_key = os.getenv("PINECONE_API_KEY")
pinecone_environment = os.getenv("PINECONE_API_ENV")
pc = Pinecone(api_key=pinecone_api_key, environment=pinecone_environment)

# Specify the index name
index_name = "medical-bot"
index = pc.Index(index_name)

# Example question
question = "how many types of disease?"

# Download Hugging Face embeddings
def download_hugging_face_embeddings():
    embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
    return embeddings

embeddings = download_hugging_face_embeddings()

# Embed the question
question_embedding = embeddings.embed_documents([question])[0]  # Convert to Python list

print(question_embedding)

# Search for similar vectors
try:
    results = index.query(vector=[question_embedding], top_k=3)
    
    if results and results.count and results.count > 0:
        top_result_id = results.ids[0]
        retrieved_document = index.retrieve(ids=[top_result_id])

        # Access the information from the retrieved document
        document_information = retrieved_document.data[0]
        print("Document Information:", document_information)
    else:
        print("No matching documents found.")
except Exception as e:
    print(f"Error: {e}")


[0.08352620154619217, -0.04414195939898491, 0.02134587988257408, 0.033023279160261154, -0.08061781525611877, 0.02993840165436268, 0.04730486497282982, 0.0779271125793457, -0.010661964304745197, 0.05830582231283188, 0.01779111474752426, -0.006209086161106825, -0.039664916694164276, 0.032966356724500656, -0.06736266613006592, -0.021798064932227135, -0.1204691082239151, -0.058986686170101166, 0.05263691768050194, 0.069086953997612, -0.037173911929130554, 0.05823799967765808, 0.04499741643667221, -0.03436688706278801, -0.10213454067707062, -0.05586773902177811, 0.007713689468801022, -0.044681448489427567, -0.007628698833286762, -0.03667287901043892, -0.08169661462306976, 0.08009258657693863, 0.0066360472701489925, 0.05284389853477478, -0.027964819222688675, -0.06969517469406128, -0.0227035079151392, 0.03622368723154068, -0.019291523844003677, 0.06015699729323387, 0.0034167200792580843, -0.04931028559803963, 0.07794719189405441, 0.006849334575235844, 0.04026544839143753, -0.0671956166625022