In [1]:
import os
os.chdir("../")

In [2]:
%pwd

'c:\\Users\\dccha\\Desktop\\CODING\\Projects\\AI-powered-Health-Assistant'

In [3]:
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings

In [None]:
# Extract data from the PDF file
def load_pdf_file(data):
    loader = DirectoryLoader(data,
                             glob="*pdf",
                             loader_cls=PyPDFLoader)
    documents = loader.load()
    return documents

In [5]:
extracted_data = load_pdf_file(data='Data/')

In [6]:
# Split the Data into Text Chunks

def text_split(extracted_data):
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=500,
                                                   chunk_overlap = 20)
    text_chunks = text_splitter.split_documents(extracted_data)
    return text_chunks

In [7]:
text_chunk = text_split(extracted_data)
print("Length of Text Chunk : ", len(text_chunk))

Length of Text Chunk :  6970


In [8]:
# Download the Embeddings from Hugging Face 

def download_hugging_face_embedding():
    embeddings = HuggingFaceEmbeddings(model_name='sentence-transformers/all-MiniLM-L6-v2')
    return embeddings    

In [9]:
embeddings = download_hugging_face_embedding()

  embeddings = HuggingFaceEmbeddings(model_name='sentence-transformers/all-MiniLM-L6-v2')
  from .autonotebook import tqdm as notebook_tqdm


In [10]:
# Testing the embedding
query_result = embeddings.embed_query("Acrogemaly")
print("Length", len(query_result))
print("Query result : ", query_result)

Length 384
Query result :  [-0.06402626633644104, 0.013144535943865776, -0.07630716264247894, 0.04439324513077736, -0.035104747861623764, -0.04568645730614662, 0.13939104974269867, 0.056085795164108276, -0.012278692796826363, -0.0015933518297970295, 0.07787694036960602, -0.0348651297390461, -0.0010845958022400737, -0.0024508845526725054, -0.06255487352609634, 0.04585983604192734, -0.02959422394633293, 0.014354272745549679, -0.13443471491336823, -0.05806855857372284, -0.037634968757629395, 0.01632891409099102, -0.04902853071689606, -0.02839430421590805, 0.002721433062106371, 0.012373951263725758, -0.05997093766927719, -0.007044442929327488, -0.01936623454093933, -0.07877066731452942, 0.04980788007378578, 0.03340892493724823, 0.042316410690546036, 0.007830302231013775, 0.03347261995077133, -0.027929510921239853, -0.025156429037451744, -0.04247191920876503, 0.026612475514411926, 0.04046086221933365, -0.07097961753606796, -0.06501173973083496, -0.01745101436972618, 0.031125977635383606, -0

In [11]:
from dotenv import load_dotenv
load_dotenv()

PINECONE_API_KEY = os.environ.get('PINECONE_API_KEY')

In [45]:
from pinecone import Pinecone, ServerlessSpec


pc = Pinecone(api_key=PINECONE_API_KEY)

index_name = "medichat"

pc.create_index(
    name=index_name,
    dimension=384, # Replace with your model dimensions
    metric="cosine", # Replace with your model metric
    spec=ServerlessSpec(
        cloud="aws",
        region="us-east-1"
    ) 
)

In [46]:
# Embed each chunk and upsert the embeddings into your Pinecone index
from langchain.vectorstores import Pinecone

docsearch = Pinecone.from_documents(
    documents=text_chunk,
    index_name=index_name,
    embedding=embeddings
)

In [47]:
# Load Existing index
from langchain_pinecone import Pinecone
docsearch = Pinecone.from_existing_index(
    index_name=index_name,
    embedding=embeddings
)

In [14]:
docsearch

<langchain_community.vectorstores.pinecone.Pinecone at 0x298012c3a90>

In [15]:
retriever = docsearch.as_retriever(search_type="similarity",
                                   search_kwargs={"k":3})

In [16]:
retrieved_docs = retriever.invoke("What is Acne?")
retrieved_docs

[Document(metadata={'author': '', 'creationdate': '2017-05-01T10:37:35-07:00', 'creator': '', 'keywords': '', 'moddate': '2017-05-01T10:37:35-07:00', 'page': 425.0, 'page_label': '426', 'producer': 'GPL Ghostscript 9.10', 'source': 'Data\\The_GALE_ENCYCLOPEDIA_of_MEDICINE_SECOND.pdf', 'subject': '', 'title': '', 'total_pages': 759.0}, page_content='Corticosteriod—A group of synthetic hormones\nthat are used to prevent or reduce inflammation.\nToxic effects may result from rapid withdrawal after\nprolonged use or from continued use of large doses.\nPatch test—A skin test that is done to identify aller-\ngens. A suspected substance is applied to the skin.\nAfter 24–48 hours, if the area is red and swollen,\nthe test is positive for that substance. If no reaction\noccurs, another substance is applied. This is con-'),
 Document(metadata={'author': '', 'creationdate': '2017-05-01T10:37:35-07:00', 'creator': '', 'keywords': '', 'moddate': '2017-05-01T10:37:35-07:00', 'page': 298.0, 'page_lab

In [17]:
os.environ["PINECONE_API_KEY"] = PINECONE_API_KEY

In [39]:
from langchain_openai import ChatOpenAI

# Use Groq Cloud Llama 3 API
llm = ChatOpenAI(
    model="mixtral-8x7b-32768",  
    openai_api_base="https://api.groq.com/openai/v1",
    openai_api_key="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",
    temperature=0.4,
    max_tokens=500
)

In [None]:
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate

system_prompt = (
    "You are an assistant for question-answering tasks."
    "Use the following pieces of retrieved context to answer"
    "the question. If You don't know the answer, say that you don't know."
    "Use three sentences maximum and keep the answer concise."
    "\n\n"
    "{context}"
)

prompt = ChatPromptTemplate.from_messages(
    [
        ("system",system_prompt),
        ("human","{input}"),
    ]
)

In [41]:
question_answer_chain = create_stuff_documents_chain(llm,prompt)
rag_chain = create_retrieval_chain(retriever, question_answer_chain)

In [43]:
response = rag_chain.invoke({"input":"What is acne?"})
print(response['answer'])

Corticosteroids are a group of synthetic hormones used to prevent or reduce inflammation, but toxic effects can result from rapid withdrawal after prolonged use or from continued use of large doses. A patch test is a skin test used to identify allergens, where a suspected substance is applied to the skin and if the area is red and swollen after 24-48 hours, the test is positive for that substance. This is relevant to your question as corticosteroids and patch tests are both related to skin conditions and immune responses.
