In [1]:
from llama_index import VectorStoreIndex, SimpleDirectoryReader, ServiceContext
from llama_index.llms import HuggingFaceLLM
from llama_index.prompts.prompts import SimpleInputPrompt

In [9]:
# Directory containing the source documents to index.
# Written as a raw string: the previous non-raw literal contained "\G", "\S",
# "\D" and "\d", which are invalid escape sequences (a SyntaxWarning on recent
# Python versions) and only worked because none of them happened to be a valid
# escape such as "\t" or "\n". The resulting path value is unchanged.
# NOTE(review): this is an absolute, machine-specific path; consider a path
# relative to the notebook or a configurable setting before sharing.
DATA_DIR = r'E:\Girish Documents\Study\Data Science\DataScience_GenAI_Study\GenAI_Project_RAG-LLM-App-Using-LLAMA2-LAAMAIndex\data'

# Read every file found in DATA_DIR into llama_index document objects.
documents = SimpleDirectoryReader(DATA_DIR).load_data()

In [3]:
# System instruction for the LLM; passed to HuggingFaceLLM as `system_prompt`.
system_prompt = """
You are a Q&A assistant. Your goal is to answer questions as accurately as possible based on the instructions and context provided.
"""
# Template wrapped around every user query; {query_str} is replaced by the question.
# Fix: the closing tag was "<|ASSISTANT>" (missing "|"), which did not match the
# "<|USER|>" tag style used at the start of the template.
# NOTE(review): Llama-2 chat checkpoints are documented to use the
# "[INST] ... [/INST]" prompt template rather than <|USER|>/<|ASSISTANT|> tags;
# consider switching after confirming against the model card.
query_wrapper_prompt=SimpleInputPrompt("<|USER|>{query_str}<|ASSISTANT|>")


In [5]:
# Before running the next cell, log in to Hugging Face by running
# "huggingface-cli login" in a terminal or conda prompt.
# NOTE: use the base environment; this notebook needs PyTorch, which is configured only there.

In [4]:
import torch

# Hugging Face repo id shared by the model weights and the tokenizer.
LLAMA2_CHAT_MODEL = "meta-llama/Llama-2-7b-chat-hf"

# Llama-2 chat model wrapped as a llama_index LLM.
llm = HuggingFaceLLM(
    # Which checkpoint / tokenizer to load.
    model_name=LLAMA2_CHAT_MODEL,
    tokenizer_name=LLAMA2_CHAT_MODEL,
    # Prompting.
    system_prompt=system_prompt,
    query_wrapper_prompt=query_wrapper_prompt,
    # Token budget: total context size and cap on generated tokens.
    context_window=4096,
    max_new_tokens=256,
    # Sampling is switched off.
    # NOTE(review): temperature is presumably unused when do_sample=False; confirm.
    generate_kwargs={"temperature": 0.0, "do_sample": False},
    # Placement and precision: float16 weights to reduce memory usage.
    device_map="auto",
    model_kwargs={"torch_dtype": torch.float16},
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]



In [5]:
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
from llama_index import ServiceContext
from llama_index.embeddings import LangchainEmbedding

# Sentence-transformers embedding model, created through LangChain...
hf_embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-mpnet-base-v2"
)
# ...then wrapped so llama_index can use it as its embedding model.
embed_model = LangchainEmbedding(hf_embeddings)

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

  return self.fget.__get__(instance, owner)()


tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [6]:
# Bundle the LLM, the embedding model and the chunk size into one
# ServiceContext that is handed to the index builder.
service_context = ServiceContext.from_defaults(
    llm=llm,
    embed_model=embed_model,
    chunk_size=1024,
)

In [7]:
# Show the ServiceContext repr to inspect the settings it was built with.
service_context

ServiceContext(llm_predictor=LLMPredictor(system_prompt=None, query_wrapper_prompt=None, pydantic_program_mode=<PydanticProgramMode.DEFAULT: 'default'>), prompt_helper=PromptHelper(context_window=4096, num_output=256, chunk_overlap_ratio=0.1, chunk_size_limit=None, separator=' '), embed_model=LangchainEmbedding(model_name='sentence-transformers/all-mpnet-base-v2', embed_batch_size=10, callback_manager=<llama_index.callbacks.base.CallbackManager object at 0x000001B2C7396750>), transformations=[SentenceSplitter(include_metadata=True, include_prev_next_rel=True, callback_manager=<llama_index.callbacks.base.CallbackManager object at 0x000001B2C7396750>, id_func=<function default_id_func at 0x000001B281EA8860>, chunk_size=1024, chunk_overlap=200, separator=' ', paragraph_separator='\n\n\n', secondary_chunking_regex='[^,.;。？！]+[,.;。？！]?')], llama_logger=<llama_index.logger.base.LlamaLogger object at 0x000001B4007F1410>, callback_manager=<llama_index.callbacks.base.CallbackManager object at 0

In [10]:
# Build the vector store index from the loaded documents, using the
# service context configured above for the LLM and embedding model.
index = VectorStoreIndex.from_documents(
    documents,
    service_context=service_context,
)

In [11]:
# Show the index object's repr to confirm it was created.
index

<llama_index.indices.vector_store.base.VectorStoreIndex at 0x1b400db4690>

In [13]:
# Query engine over the index, created with its default settings.
query_engine = index.as_query_engine()

In [16]:
# Question to ask against the indexed documents.
question = "What is attention is all you need?"
response = query_engine.query(question)

In [17]:
# Print the query response as text.
print(response)

Attention is all you need is a phrase used in the context of natural language processing and machine learning, particularly in the field of transformer models. It suggests that attention mechanisms, which allow the model to selectively focus on certain parts of the input when processing it, are sufficient for achieving good performance in various NLP tasks.

In the context of the provided PDF, the phrase is used to highlight the importance of attention mechanisms in the Transformer model, which is a type of transformer architecture that uses self-attention mechanisms to process input sequences. The author of the PDF is suggesting that attention is the key to understanding how the Transformer model works and that it is sufficient to focus on attention mechanisms when trying to understand the model.

It is worth noting that the phrase "attention is all you need" is an oversimplification, as other components such as feedforward networks and layer normalization are also important in transf