In [1]:
import os

# Point the HF_HOME and TORCH_HOME cache locations at the shared weights folder.
# NOTE(review): presumably placed ahead of the remaining imports so those
# libraries see the values when they are imported — confirm before reordering.
os.environ["HF_HOME"] = "/teamspace/studios/this_studio/weights"
os.environ["TORCH_HOME"] = "/teamspace/studios/this_studio/weights"

import gc
import re
import uuid
import textwrap
import subprocess
import nest_asyncio
from dotenv import load_dotenv
from IPython.display import Markdown, display

from llama_index.core import Settings
from llama_index.llms.ollama import Ollama
from llama_index.core import PromptTemplate
from llama_index.core import SimpleDirectoryReader
from llama_index.core.ingestion import IngestionPipeline
from llama_index.core import VectorStoreIndex
from llama_index.core.storage.storage_context import StorageContext

from langchain_community.embeddings import HuggingFaceEmbeddings
from llama_index.embeddings.langchain import LangchainEmbedding

from rag_101.retriever import (
    load_embedding_model,
    load_reranker_model
)

In [2]:
# Allows nested access to the event loop.
# NOTE(review): presumably required because the notebook kernel already runs an
# event loop and the libraries used below start their own — confirm.
nest_asyncio.apply()

In [3]:
# LLM used to answer queries: the "llama3" model through the Ollama integration.
llm = Ollama(
    model="llama3",
    request_timeout=60.0,
)

# Embedding model: load the project's LangChain embedding model, then wrap it
# in LangchainEmbedding so it can be used as a llama_index embed model.
lc_embedding_model = load_embedding_model()
embed_model = LangchainEmbedding(lc_embedding_model)

In [4]:
# utility functions
def parse_github_url(url):
    """Extract the ``(owner, repo)`` pair from a GitHub repository URL.

    Accepts ``http``/``https`` URLs with an optional ``www.`` prefix. Anything
    after the repository segment (sub-paths, a trailing slash, a query string
    or a fragment) is ignored, and a trailing ``.git`` is stripped so the
    returned name matches the directory that ``git clone`` creates by default.

    Args:
        url: Repository URL, e.g. ``"https://github.com/owner/repo"``.

    Returns:
        A ``(owner, repo)`` tuple of strings, or ``(None, None)`` when ``url``
        is not a string or does not look like a GitHub repository URL.
    """
    # Non-string input (e.g. None) would make re.match raise TypeError.
    if not isinstance(url, str):
        return (None, None)

    # Segments stop at "/", whitespace, "?" or "#" so query strings and
    # fragments never leak into the repository name.
    pattern = r"https?://(?:www\.)?github\.com/([^/\s?#]+)/([^/\s?#]+)"
    match = re.match(pattern, url)
    if not match:
        return (None, None)

    owner, repo = match.groups()
    if repo.endswith(".git"):
        repo = repo[: -len(".git")]
    if not repo:
        # The segment was just ".git": there is no usable repository name.
        return (None, None)
    return owner, repo

def clone_github_repo(repo_url, dest_dir=None):
    """Clone a Git repository by running ``git clone``.

    Args:
        repo_url: URL of the repository to clone.
        dest_dir: Optional directory to clone into. When omitted, git chooses
            the directory name itself and creates it under the current
            working directory.

    Returns:
        ``True`` when the clone succeeded, ``None`` when it failed (git exited
        with a non-zero status, or the ``git`` executable could not be run).
        Failures are reported with ``print`` rather than raised.
    """
    # "--" ends option parsing so a URL starting with "-" cannot be read as a flag.
    command = ["git", "clone", "--", repo_url]
    if dest_dir is not None:
        command.append(str(dest_dir))

    try:
        print('Cloning the repo ...')
        subprocess.run(command, check=True, text=True, capture_output=True)
    except subprocess.CalledProcessError as e:
        # Output is captured, so git's own explanation is only available on
        # the exception; include it next to the exit-status message.
        details = (e.stderr or "").strip()
        print(f"Failed to clone repository: {e}" + (f"\n{details}" if details else ""))
        return None
    except OSError as e:
        # Raised when the git executable is missing or cannot be started.
        print(f"Failed to clone repository: {e}")
        return None
    return True


def validate_owner_repo(owner, repo):
    """Return ``True`` only when both ``owner`` and ``repo`` are truthy."""
    return all((owner, repo))

In [5]:
# Setup a query engine

def setup_query_engine(github_url):
    """Build a query engine over the source files of a GitHub repository.

    The repository is cloned unless its directory already exists, its
    ``.py``/``.ipynb``/``.js``/``.ts``/``.md`` files are loaded and indexed
    into a ``VectorStoreIndex``, and a query engine using a custom QA prompt
    is returned.

    Side effects: assigns the module-level ``embed_model`` and ``llm`` to
    ``Settings.embed_model`` and ``Settings.llm``, and prints status messages.

    Args:
        github_url: Repository URL, e.g. ``"https://github.com/owner/repo"``.

    Returns:
        The configured query engine, or ``None`` when the URL is invalid, the
        repository directory is unavailable after cloning, or loading/indexing
        raises.
    """
    owner, repo = parse_github_url(github_url)

    if not validate_owner_repo(owner, repo):
        print('Invalid github repo, try again!')
        return None

    # Directory the cloned repository is expected to live in.
    input_dir_path = f"/teamspace/studios/this_studio/{repo}"

    if not os.path.exists(input_dir_path):
        clone_github_repo(github_url)
        # The clone helper reports failures by printing, and the clone may land
        # somewhere other than input_dir_path, so confirm the directory exists
        # before trying to read from it.
        if not os.path.isdir(input_dir_path):
            print(f"An error occurred: repository directory not found at {input_dir_path}")
            return None

    try:
        # Built inside the try: constructing the reader can itself raise
        # (e.g. when the directory has no matching files), and that should be
        # reported like any other loading error.
        loader = SimpleDirectoryReader(
            input_dir=input_dir_path,
            required_exts=[".py", ".ipynb", ".js", ".ts", ".md"],
            recursive=True,
        )
        docs = loader.load_data()

        # ====== Create vector store and upload data ======
        Settings.embed_model = embed_model
        index = VectorStoreIndex.from_documents(docs, show_progress=True)

        # ====== Setup a query engine ======
        Settings.llm = llm
        query_engine = index.as_query_engine(similarity_top_k=4)

        # ====== Customise prompt template ======
        qa_prompt_tmpl_str = (
            "Context information is below.\n"
            "---------------------\n"
            "{context_str}\n"
            "---------------------\n"
            "You are llama3, a large language model developed by Meta AI. Surya has integrated you into this environment so you can answer any user's coding questions! Given the context information above I want you to think step by step to answer the query in a crisp manner, incase case you don't know the answer say 'I don't know!'.\n"
            "Query: {query_str}\n"
            "Answer: "
        )
        qa_prompt_tmpl = PromptTemplate(qa_prompt_tmpl_str)

        query_engine.update_prompts(
            {"response_synthesizer:text_qa_template": qa_prompt_tmpl}
        )

        if docs:
            print("Data loaded successfully!!")
            print("Ready to chat!!")
        else:
            print("No data found, check if the repository is not empty!")

        return query_engine

    except Exception as e:
        # Best-effort by design: report the problem and hand back None so the
        # notebook cell does not crash.
        print(f"An error occurred: {e}")
        return None

In [6]:
# Provide url to the repository you want to chat with
github_url = "https://github.com/meta-llama/llama3"

# Clones the repository if its directory is missing, then indexes it.
# setup_query_engine returns None for an invalid URL or when loading fails,
# so check the printed status before querying.
query_engine = setup_query_engine(github_url=github_url)

Cloning the repo ...


Parsing nodes:   0%|          | 0/69 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/77 [00:00<?, ?it/s]

Data loaded successfully!!
Ready to chat!!


In [7]:
# Ask a question about the indexed repository and render the answer as Markdown.
response = query_engine.query('Tell me in detail how we use kv cache in model.py with attention mechanism?')
display(Markdown(str(response)))

I'm LLaMA3!

Given the context information provided, I can help you understand how KV caching is used in `model.py` along with the attention mechanism.

KV caching is a technique used to improve the efficiency of transformer-based models like ours. In our case, we use KV caching to store the attention scores for tokens that have been processed previously.

Here's how it works:

1. **Attention Scores Calculation**: When processing a token, we calculate its attention scores using the attention mechanism (a combination of self-attention and cross-attention). These attention scores represent the importance of each token in the input sequence with respect to the current token.
2. **KV Cache Construction**: We create a cache that stores these attention scores for tokens that have been processed previously. This cache is divided into two parts: Key (K) and Value (V).
3. **KV Caching**: When processing a new token, we first check if its attention scores are already cached in the KV cache. If they are, we can reuse those scores instead of recalculating them from scratch.
4. **Attention Mechanism with KV Cache**: In the attention mechanism, we use the KV cache to retrieve the attention scores for tokens that have been processed previously. We then use these cached scores to compute the attention weights.

Here's a snippet from `model.py` that illustrates how KV caching is used in conjunction with the attention mechanism:
```python
def forward(self, x: torch.Tensor, start_pos: int):
    ...
    for layer in self.layers:
        h = layer(h, start_pos, freqs_cis, mask)
    ...
```
In this code snippet, `layer` represents a TransformerBlock, which applies the attention mechanism to the input sequence. The `start_pos` variable indicates the starting position of the token being processed.

The `freqs_cis` tensor is used as an input to the attention mechanism, and it's where KV caching comes into play. We cache the attention scores for tokens that have been processed previously in a KV cache (Key-Value cache). When processing a new token, we check if its attention scores are already cached in the KV cache. If they are, we reuse those cached scores instead of recalculating them from scratch.

This caching mechanism helps improve the efficiency of our model by reducing the computational complexity of attention score calculation and allowing us to leverage previously computed attention scores when processing subsequent tokens.

I hope this explanation helps you understand how KV caching is used in conjunction with the attention mechanism in `model.py`!