In [None]:
# Fully local RAG demo: CPU-only indexing and inference, designed to fit within 16 GB of RAM

In [None]:
from llama_index import (
    VectorStoreIndex, #a basic in-memory vector store provided by llama index
    ServiceContext, # used to configure llama index e.g. what LLM to use
    set_global_tokenizer # used to configure llama index
)
from llama_index.llms import LlamaCPP # wrapper for llama.cpp to run the LLM locally
from llama_index.llms.llama_utils import (
    messages_to_prompt, completion_to_prompt # prompts to use when the LLM generates responses
)

from llama_index.embeddings import HuggingFaceEmbedding # embedding used to index our documents in the vector db
from llama_index.schema import Document # format we use for each individual item in our vector db
import json
from transformers import AutoTokenizer # for tokenizing documents

In [2]:
# Register the tokenizer llama-index should use globally: the encode method of the
# Hugging Face tokenizer for Llama-2-7b-chat (the same model family as the LLM loaded below).
llama_tokenizer = AutoTokenizer.from_pretrained("NousResearch/Llama-2-7b-chat-hf")
set_global_tokenizer(llama_tokenizer.encode)

In [3]:
# Embedding model used to index the contextual documents so that the relevant ones
# can be retrieved again during RAG. It is handed to llama-index through the
# ServiceContext created further down.
embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5")

In [4]:
# Load the LLM weights via llama.cpp. Note: the model path may not be the same on your machine.
#
# For CPU use with low-ish RAM, the two settings in model_kwargs are the key ones:
#   * n_gpu_layers = 0 means do not use the GPU at all. If you're on a Mac and text
#     generation immediately crashes, it's likely a Metal error and setting this to 0
#     may solve it.
#   * n_batch: if you can generate text from a short prompt (under ~30 tokens) but not
#     from a longer one, dropping n_batch dramatically to ~10 can solve it. By trial and
#     error, n_batch = 10 was found to be stable and performant enough - it can take a
#     while to process the prompt initially, especially with a lot of context.
llm = LlamaCPP(
    model_path="./models/llama-2-7b/llama-2-7b-chat.Q4_K_M.gguf",
    model_kwargs={
        "n_gpu_layers": 0,
        "n_batch": 10,
    },
    verbose=False,
    # built-in prompt formatters imported earlier - may not be optimal for your model
    messages_to_prompt=messages_to_prompt,
    completion_to_prompt=completion_to_prompt,
)

llama_model_loader: loaded meta data with 19 key-value pairs and 291 tensors from ./models/llama-2-7b/llama-2-7b-chat.Q4_K_M.gguf (version GGUF V2)
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.name str              = LLaMA v2
llama_model_loader: - kv   2:                       llama.context_length u32              = 4096
llama_model_loader: - kv   3:                     llama.embedding_length u32              = 4096
llama_model_loader: - kv   4:                          llama.block_count u32              = 32
llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 11008
llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
llama_model_loader: - kv   7:                 llama.attention.head_count

In [5]:
# Bundle the LLM and the embedding model into a service context so that
# llama-index knows which ones to use.
service_context = ServiceContext.from_defaults(embed_model=embed_model, llm=llm)

In [6]:
# Load the demo dataset and wrap each record's text in a llama-index Document.
# The file is read as JSON and iterated; each record must provide a 'text' key.
# The encoding is given explicitly because open() otherwise falls back to the
# platform's locale encoding, which may not be UTF-8 (e.g. on Windows) and would
# then mis-decode or fail on non-ASCII characters in the dataset.
with open('./data/dataset.json', encoding='utf-8') as f:
    dataset = json.load(f)

documents = [Document(text=record['text']) for record in dataset]

In [7]:
# Build the vector store index from the documents. This step can be slow once the
# number of documents grows; for a real use case you would save the embeddings
# instead of recomputing them on every run.
index = VectorStoreIndex.from_documents(documents, service_context=service_context)

In [8]:
# Set up the query engine from the index - this is what puts the RAG steps together
# (finding relevant context in the index and having the LLM answer with it).
query_engine = index.as_query_engine()

In [9]:
# Ask a question and see what the LLM responds when given the most relevant
# context (if any) from the documents.
# (The context that gets provided is inspected in the cells further below.)
response = query_engine.query("What cell types are significant in angiosarcoma?")
# print() is used so that the answer text is shown
print(response)

  Based on the given context information, the significant cell types in angiosarcoma are:
1. Cells that line the walls of blood vessels or lymphatic vessels.


In [10]:
# What context does the vector index provide to the LLM?
# To inspect it, get the vector store index as a "retriever" rather than a query engine.
retriever = index.as_retriever()

In [12]:
# Retrieve the entries the index considers relevant to the same question and print
# each one (the printed form shows the node ID, its text and its score).
relevant = retriever.retrieve("What cell types are significant in angiosarcoma?")
for hit in relevant:
    print(hit)

Node ID: f40d598e-3c68-439f-b66a-d6b92c868892
Text: angiosarcoma is A vascular cancer that derives from the cells
that line the walls of blood vessels or lymphatic vessels.
Score:  0.794

Node ID: 23f524e6-e422-478c-af8c-132fc64bf2ae
Text: pterygium is A corneal disease that is characterized by a
triangular tissue growth located in cornea of the eye that is the
result of collagen degeneration and fibrovascular proliferation.
Score:  0.593

