In [9]:
from langchain.document_loaders import DirectoryLoader, PyPDFLoader
def extract_pdf_data(directory):
    loader = DirectoryLoader(
        path=directory,
        glob="*.pdf",
        loader_cls=PyPDFLoader
    )
    
    documents = loader.load()

    return documents

## Function: `extract_pdf_data`

This function extracts and loads PDF documents from a specified directory using the `langchain` library.

### Parameters:
- **`directory`** (str): The path to the directory containing PDF files.

### Returns:
- **`documents`** (list): A list of documents extracted from the PDFs.

### Example:
```python
extracted_data = extract_pdf_data("data/")


In [10]:
data = extract_pdf_data("data/")

In [3]:
data

[Document(metadata={'source': 'data/Medical_book_compressed-15-20-1.pdf', 'page': 0}, page_content='Abdominal aorta ultrasound seeAbdominal\nultrasound\nAbdominal aortic aneurysm seeAortic\naneurysm\nAbdominal hernia seeHernia\nAbdominal thrust seeHeimlich maneuver\nAbdominal ultrasound\nDefinition\nUltrasound technology allows doctors to “see”\ninside a patient without resorting to surgery. A transmit-\nter sends high frequency sound waves into the body,where they bounce off the different tissues and organs toproduce a distinctive pattern of echoes. A receiver“hears” the returning echo pattern and forwards it to acomputer, which translates the data into an image on atelevision screen. Because ultrasound can distinguishsubtle variations between soft, fluid-filled tissues, it isparticularly useful in providing diagnostic images of theabdomen. Ultrasound can also be used in treatment.\nPurpose\nThe potential medical applications of ultrasound\nwere first recognized in the 1940s as an out

In [11]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

def split_text_into_chunks(documents):
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
    text_chunks = text_splitter.split_documents(documents)

    return text_chunks


## Function: `split_text_into_chunks`

This function splits the text from documents into smaller, manageable chunks. It's useful for processing large texts in batches.

### Parameters:
- **`documents`** (list): A list of documents containing text data.

### Returns:
- **`text_chunks`** (list): A list of text chunks, each with a specified size and overlap.

### Example:
```python
text_chunks = split_text_into_chunks(extracted_data)


In [12]:
chunks = split_text_into_chunks(data)
print("length of my chunks is :", len(chunks))
chunks

length of my chunks is : 10


[Document(metadata={'source': 'data/Medical_book_compressed-15-20-1.pdf', 'page': 0}, page_content='Abdominal aorta ultrasound seeAbdominal\nultrasound\nAbdominal aortic aneurysm seeAortic\naneurysm\nAbdominal hernia seeHernia\nAbdominal thrust seeHeimlich maneuver\nAbdominal ultrasound\nDefinition\nUltrasound technology allows doctors to “see”\ninside a patient without resorting to surgery. A transmit-'),
 Document(metadata={'source': 'data/Medical_book_compressed-15-20-1.pdf', 'page': 0}, page_content='ter sends high frequency sound waves into the body,where they bounce off the different tissues and organs toproduce a distinctive pattern of echoes. A receiver“hears” the returning echo pattern and forwards it to acomputer, which translates the data into an image on atelevision screen. Because ultrasound can distinguishsubtle variations between soft, fluid-filled tissues, it isparticularly useful in providing diagnostic images of theabdomen. Ultrasound can also be used in treatment.\nP

In [13]:
from langchain.embeddings import HuggingFaceEmbeddings

def fetch_hugging_face_embeddings():
    embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2",model_kwargs={'device': 'cpu'})
    return embeddings

## Function: `fetch_hugging_face_embeddings`

This function downloads and returns the Hugging Face embeddings model for use in NLP tasks.

### Returns:
- **`HuggingFaceEmbeddings`**: The embeddings model initialized with 'sentence-transformers/all-MiniLM-L6-v2'.

### Example:
```python
embeddings = fetch_hugging_face_embeddings()


In [14]:
embeddings = fetch_hugging_face_embeddings()

  warn_deprecated(


In [8]:
embeddings

HuggingFaceEmbeddings(client=SentenceTransformer(
  (0): Transformer({'max_seq_length': 256, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Normalize()
), model_name='sentence-transformers/all-MiniLM-L6-v2', cache_folder=None, model_kwargs={'device': 'cpu'}, encode_kwargs={}, multi_process=False, show_progress=False)

In [9]:
emb_dimention = embeddings.embed_query("This is LifeLine")
len(emb_dimention)

384

In [15]:
from langchain_community.vectorstores import FAISS

DB_FAISS_PATH = 'vectstore/db'
db = FAISS.from_documents(chunks, embeddings)
db.save_local(DB_FAISS_PATH)

In [16]:
prompt_template = """
You are provided with some context and a question. Your task is to use the context to answer the user's question accurately and concisely.

Follow these instructions:

1. Carefully read the context and identify the key information relevant to the question.
2. Summarize the relevant parts of the context in your mind before formulating your answer.
3. Ensure your answer is factually accurate based on the provided context.
4. If you are unsure about any part of the answer, it's better to say "I don't know" than to provide an incorrect answer.
5. Keep your answer concise and to the point.

Context: {context}
Question: {question}

Helpful answer:
"""


In [17]:
from langchain import PromptTemplate
PROMPT=PromptTemplate(template=prompt_template, input_variables=["context", "question"])
chain_type_kwargs={"prompt": PROMPT}


In [2]:
from langchain.llms import CTransformers
model =CTransformers(model="/home/mg/lifeCare/model/llama-2-7b-chat.ggmlv3.q2_K.bin",
                  model_type="llama",
                  config={'max_new_tokens':512,
                          'temperature':0.8})

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
from langchain.chains import RetrievalQA

qa = RetrievalQA.from_chain_type(llm=model,
                                        chain_type='stuff',
                                       retriever=db.as_retriever(search_kwargs={'k': 1}),
                                       return_source_documents=True,
                                       chain_type_kwargs=chain_type_kwargs
                                       )


NameError: name 'db' is not defined

In [20]:
while True:
    user_input=input(f"Input Prompt:")
    result=qa({"query": user_input})
    print("Response : ", result["result"])


  warn_deprecated(


KeyboardInterrupt: 