## Importing Libraries

In [95]:
from dotenv import load_dotenv
import os
from groq import Groq
from langchain.prompts import PromptTemplate
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.document_loaders import DirectoryLoader, PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
import chromadb
from chromadb.config import Settings
import faiss
import numpy as np
from langchain.vectorstores import FAISS

## checking llama from groq

In [6]:
load_dotenv()
GROQ_API = os.environ.get("GROQ_ENV")


In [11]:

client = Groq(
    api_key=  GROQ_API,
)

chat_completion = client.chat.completions.create(
    messages=[
        {
            "role": "system",
            "content": "you are a helpful assistant."
        },
        {
            "role": "user",
            "content": "Where is mount everest located",
        }
    ],
    model="llama3-8b-8192",
    temperature=0.5,
    top_p=,
    stop=None,
    max_tokens=200



)

print(chat_completion.choices[0].message.content)

Mount Everest, also known as Chomolungma or Sagarmatha, is located in the Himalayas mountain range in Asia. It is situated on the border between Nepal and Tibet, China.

To be more specific, the summit of Mount Everest is located at:

* Latitude: 27.9881° N
* Longitude: 86.9253° E
* Elevation: 8,848 meters (29,029 feet) above sea level

The mountain is part of the Mahalangur Himal sub-range of the Himalayas, and it is considered one of the most iconic and challenging mountains to climb in the world.


In [13]:
for chunk in chat_completion:
    print(chunk)

('id', 'chatcmpl-36a58436-0734-47cc-a1ef-bcdb5f8841ff')
('choices', [Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content='Mount Everest, also known as Chomolungma or Sagarmatha, is located in the Himalayas mountain range in Asia. It is situated on the border between Nepal and Tibet, China.\n\nTo be more specific, the summit of Mount Everest is located at:\n\n* Latitude: 27.9881° N\n* Longitude: 86.9253° E\n* Elevation: 8,848 meters (29,029 feet) above sea level\n\nThe mountain is part of the Mahalangur Himal sub-range of the Himalayas, and it is considered one of the most iconic and challenging mountains to climb in the world.', role='assistant', function_call=None, tool_calls=None))])
('created', 1726410746)
('model', 'llama3-8b-8192')
('object', 'chat.completion')
('system_fingerprint', 'fp_6a6771ae9c')
('usage', CompletionUsage(completion_tokens=132, prompt_tokens=27, total_tokens=159, completion_time=0.11, prompt_time=0.004860752, queue_time=0

## Using langchain

In [16]:
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain
user_query = "where is mount everest located"
template = """You are a chatbot that answers user queries based on the provided documents or context. Help users in an engaging, conversational manner.
User: {user_query}
Chatbot:"""

prompt = PromptTemplate(input_variables=["user_query"], template=template)
# Format the prompt with the user query
formatted_prompt = prompt.format(user_query=user_query)

# Output the formatted prompt
print(formatted_prompt)

You are a chatbot that answers user queries based on the provided documents or context. Help users in an engaging, conversational manner.
User: where is mount everest located
Chatbot:


In [21]:
#extracting data from the pdf
def load_pdf(data):
    loader = DirectoryLoader(data, glob="*.pdf", loader_cls=PyPDFLoader)
    documents = loader.load()
    return documents

In [23]:
extracted_data = load_pdf("../data/pdfs")

In [24]:
extracted_data

[Document(metadata={'source': '..\\data\\pdfs\\mountainpeaks.pdf', 'page': 0}, page_content='See discussions, st ats, and author pr ofiles f or this public ation at : https://www .researchgate.ne t/public ation/353620262\nMou ntain Peaks of Nepal Himala ya\nArticle \xa0\xa0 in\xa0\xa0Journal of T ourism and Himalay an Adv entures · June 2021\nDOI: 10.3126/ jtha. v3i1.39118\nCITATIONS\n0READS\n2,204\n1 author:\nSher Bahadur Gurung\nTribhuv an Univ ersity\n59 PUBLICA TIONS \xa0\xa0\xa033 CITATIONS \xa0\xa0\xa0\nSEE PROFILE\nAll c ontent f ollo wing this p age was uplo aded b y Sher Bahadur Gurung  on 28 Oct ober 2021.\nThe user has r equest ed enhanc ement of the do wnlo aded file.'),
 Document(metadata={'source': '..\\data\\pdfs\\mountainpeaks.pdf', 'page': 1}, page_content='Journal of Tourism and \nHimalayan Adventures\nAn International Research Journal\nJune 2021, V ol. 3, ISSN: 2717-5030 (Print) 2738-9642 (Online)Nepal Mountain Academy\nAbstract\nNepal is a mountainous country with n

In [28]:
def text_split(extracted_data):
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=20)
    text_chunks = text_splitter.split_documents(extracted_data)

    return text_chunks

In [29]:
text_chunks = text_split(extracted_data)


In [32]:
len(extracted_data)

12

In [31]:
len(text_chunks)

103

In [34]:
# download embedding model
def download_hugging_face_embeddings():
    embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2") #sentence transformers
    return embeddings

In [37]:
embeddings = download_hugging_face_embeddings()

  from tqdm.autonotebook import tqdm, trange


In [38]:
query_result = embeddings.embed_query("Hello world!")
print("length", len(query_result))

length 384


In [112]:
from langchain.docstore.document import Document

def store_embeddings_in_faiss_langchain(text_chunks, embeddings_model):
    # Initialize the embedding model (sentence-transformers/all-MiniLM-L6-v2)

    # Prepare the texts and corresponding embeddings
    texts = [chunk.page_content for chunk in text_chunks]  # Extract text content from chunks
    
    # Use LangChain's FAISS to generate embeddings and store them
    faiss_store = FAISS.from_texts(texts, embeddings_model)

    return faiss_store  # Returns the FAISS vector store with indexed documen

In [55]:
# Create a FAISS index and store embeddings
def store_embeddings_in_faiss(text_chunks, embeddings_model):
    embedding_dim = 384  # All-MiniLM-L6-v2 produces 384-dimensional embeddings

    index = faiss.IndexFlatL2(embedding_dim)

    # Create a list to store the documents and their embeddings
    texts = []
    embeddings = []

    # Generate embeddings for each chunk and store them in the FAISS index
    for chunk in text_chunks:
        text = chunk.page_content
        texts.append(text)
        
        embedding = embeddings_model.embed_documents([text])[0]  
        embeddings.append(embedding)

    # Convert the list of embeddings into a numpy array and add to FAISS index
    np_embeddings = np.array(embeddings, dtype="float32")
    index.add(np_embeddings)

    return index, texts

In [122]:
# Function to query FAISS for the nearest neighbors
def query_faiss(index, embeddings_model, query_text, texts, k=3):
    # Generate the embedding for the query using `embed_query`
    query_embedding = embeddings_model.embed_query(query_text)  # Pass string directly, not a list
    
    # Perform similarity search (find k nearest neighbors)
    distances, indices = index.search(np.array([query_embedding], dtype="float32"), k)
    
    # Retrieve the corresponding texts for the nearest neighbors
    results = [texts[i] for i in indices[0]]
    
    return results, distances

In [116]:
# Function to query the FAISS vector store


def query_faiss_langchain(faiss_store, query_text):

    # Initialize the embedding model again for query embedding generation
    embeddings_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
    
    # Perform the similarity search on the FAISS index
    results = faiss_store.similarity_search(query_text, k=3)  # Retrieve top-k most similar documents

    return results

In [113]:
    # Store embeddings in FAISS
texts = store_embeddings_in_faiss_langchain(text_chunks, embeddings)

In [None]:
index, texts = store_embeddings_in_faiss(text_chunks, embeddings)

In [119]:
query_text = "Where is Mount Everest?"
results = query_faiss_langchain(texts, query_text)

print("Query Results:")
for result in results:
    print(results[0].page_content)
    print("=======================================")



Query Results:
USGS. (1999). The Himalayas: Two continents collide . United State Geological Society, USGS. 5 May 
1999. Retrieved 3 January 2015.
Viviroli, D., Du¨rr, H. H., Messerli, B., Meybeck, M., & Weingartner, R. (2007). Mountains of the 
world, water towers for humanity: Typology, mapping, and global signi ﬁcance, Water Resour . 
Res., 43, W07447, doi:10.1029/2006WR005653
Ward, M. P. (1994). Mapping Everest. The Cartographic Journal, 31: 1, 33-44, DOI: 
10.1179/000870494787073637
USGS. (1999). The Himalayas: Two continents collide . United State Geological Society, USGS. 5 May 
1999. Retrieved 3 January 2015.
Viviroli, D., Du¨rr, H. H., Messerli, B., Meybeck, M., & Weingartner, R. (2007). Mountains of the 
world, water towers for humanity: Typology, mapping, and global signi ﬁcance, Water Resour . 
Res., 43, W07447, doi:10.1029/2006WR005653
Ward, M. P. (1994). Mapping Everest. The Cartographic Journal, 31: 1, 33-44, DOI: 
10.1179/000870494787073637
USGS. (1999). The Himalayas: 

In [64]:
# Query the FAISS index
query_text = "Where is Mount Everest?"
results, distances = query_faiss(index, embeddings, query_text, texts, k=3)

print("Query Results:")
for result, distance in zip(results, distances[0]):
    print(f"Text: {result}, Distance: {distance}")
    print("=======================================")

Query Results:
Text: USGS. (1999). The Himalayas: Two continents collide . United State Geological Society, USGS. 5 May 
1999. Retrieved 3 January 2015.
Viviroli, D., Du¨rr, H. H., Messerli, B., Meybeck, M., & Weingartner, R. (2007). Mountains of the 
world, water towers for humanity: Typology, mapping, and global signi ﬁcance, Water Resour . 
Res., 43, W07447, doi:10.1029/2006WR005653
Ward, M. P. (1994). Mapping Everest. The Cartographic Journal, 31: 1, 33-44, DOI: 
10.1179/000870494787073637, Distance: 0.7722309827804565
Text: Himalayan range based on systematic exploration (Britanica, 2021). In the mid 19th century, Nepal and Indian mountains were measured by the systematic trigonometric survey, and during this time, the highest peak of the world was named Sagarmatha (Mt. Everest) after Sir George Everest in 1865. In Nepal, the ﬁrst efforts were carried out to mapping peaks in the, Distance: 0.8387680053710938
Text: Nepal is a mountainous country with numerous peaks and pinnacles. I

In [120]:
def generate_llama_response(client, query_text, retrieved_texts):
    # Create a prompt with the query and the retrieved documents
    template = """You are a helpful assistant. Use the following retrieved documents to answer the question in a helpful way.

    Retrieved Documents:
    {retrieved_texts}
    
    Question: {query_text}
    
    Answer:"""
    
    # Define the PromptTemplate
    prompt = PromptTemplate(
        input_variables=["retrieved_texts", "query_text"],
        template=template,
    )
    
    # Format the prompt with the retrieved documents and query
    formatted_prompt = prompt.format(
        retrieved_texts="\n".join(retrieved_texts),
        query_text=query_text,
    )
    
    # Call the Groq LLaMA3 model to generate an answer
    # chat_completion = client.chat.completions.create(
    #     messages=[
    #         {
    #             "role": "system",
    #             "content": "You are a helpful assistant."
    #         },
    #         {
    #             "role": "user",
    #             "content": formatted_prompt,  # Pass the formatted prompt as the user input
    #         }
    #     ],
    #     model="llama3-8b-8192",  # Adjust model as necessary
    #     temperature=0.5,          # Adjust the temperature for randomness
    #     top_p=0.9,                # Use top-p sampling for diversity
    #     stop=None,                # Option to define stopping tokens
    #     max_tokens=200            # Limit the number of tokens in the response
    # )

    response = client.generate(formatted_prompt)

    # Extract and return the generated answer from the response
    # return chat_completion.choices[0].message.content
    return response
    


In [123]:
query_text = "Where is Mount Everest located?"

    # Step 1: Retrieve relevant documents from FAISS
retrieved_texts = query_faiss(index, embeddings, query_text, texts, k=1)

retrieved_texts = "\n".join(retrieved_texts[0])


    # Step 2: Use LLaMA to generate an answer based on the retrieved documents
answer = generate_llama_response(client, query_text, retrieved_texts[0])

    # Print the generated answer
print("Generated Answer:", answer)

TypeError: 'FAISS' object is not subscriptable

In [88]:
answer.choices[0].message.content

'Based on the retrieved documents, I can answer your question!\n\nMount Everest is located in the Himalayas mountain range on the border between Nepal and Tibet, China.'