In [3]:
!wget --user-agent "Mozilla" "https://arxiv.org/pdf/2312.04511.pdf" -O "llm_compiler.pdf"
!wget --user-agent "Mozilla" "https://arxiv.org/pdf/2312.06648.pdf" -O "dense_x_retrieval.pdf"

--2024-04-03 17:37:09--  https://arxiv.org/pdf/2312.04511.pdf
Resolving arxiv.org (arxiv.org)... 151.101.195.42, 151.101.131.42, 151.101.67.42, ...
Connecting to arxiv.org (arxiv.org)|151.101.195.42|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 885090 (864K) [application/pdf]
Saving to: 'llm_compiler.pdf'

     0K .......... .......... .......... .......... ..........  5% 1.81M 0s
    50K .......... .......... .......... .......... .......... 11% 4.95M 0s
   100K .......... .......... .......... .......... .......... 17% 3.65M 0s
   150K .......... .......... .......... .......... .......... 23% 4.47M 0s
   200K .......... .......... .......... .......... .......... 28% 7.75M 0s
   250K .......... .......... .......... .......... .......... 34% 11.2M 0s
   300K .......... .......... .......... .......... .......... 40% 20.2M 0s
   350K .......... .......... .......... .......... .......... 46% 5.43M 0s
   400K .......... .......... .......... .......... ....

In [31]:
import os
# import
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader
from llama_index.vector_stores.chroma import ChromaVectorStore
from llama_index.core import StorageContext
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
import chromadb
from chromadb.config import Settings

from llama_index.core import StorageContext
from llama_index.llms.llama_cpp import LlamaCPP
from llama_index.llms.llama_cpp.llama_utils import messages_to_prompt, completion_to_prompt
from llama_index.core import get_response_synthesizer
from llama_index.core.query_engine import RetrieverQueryEngine
from llama_index.core.retrievers import BaseRetriever, VectorIndexRetriever, RecursiveRetriever
from llama_index.core.ingestion import IngestionPipeline, IngestionCache
from llama_index.core.vector_stores import MetadataFilters, ExactMatchFilter

In [3]:
reader = SimpleDirectoryReader(input_files=['dense_x_retrieval.pdf'])
documents_jerry = reader.load_data()

reader = SimpleDirectoryReader(input_files=['llm_compiler.pdf'])
documents_ravi = reader.load_data()

In [23]:
len(documents_jerry) # 18 page pdf file loaded as 18 documents 

18

# Create An Empty Index

In [5]:
# define embedding function
embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-base-en-v1.5")

config.json:   0%|          | 0.00/777 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/366 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

In [6]:
from llama_index.core.node_parser import SentenceSplitter, SemanticSplitterNodeParser
splitter = SemanticSplitterNodeParser(buffer_size=1, breakpoint_percentile_threshold=95, embed_model=embed_model) #slow

In [7]:
semant_nodes = splitter.get_nodes_from_documents(documents_jerry)

In [8]:
len(semant_nodes)

57

In [15]:
llm = LlamaCPP(
    # You can pass in the URL to a GGML model to download it automatically
    model_path='../../llama2/llama-2-7b-chat.Q8_0.gguf',
    temperature=0.01,
    context_window=3900,  
    # kwargs to pass to __call__()
    model_kwargs={"n_gpu_layers": 3, "offload_kqv": True}, #onyl 1 layer in GPU, others in CPU, if you do not how many layers, set to -1
    # transform inputs into Llama2 format
    verbose=False,
    )

# Create An Empty Index

In [16]:
index = VectorStoreIndex.from_documents(documents=[], embed_model = embed_model)

# Ingestion Pipeline

In [19]:
pipeline = IngestionPipeline(
    transformations=[
        SentenceSplitter(chunk_size=512, chunk_overlap=20),
    ]
)

# Update metadata and insert document

In [24]:
# For user Jerry
for document in documents_jerry:
    document.metadata['user'] = 'Jerry'

nodes = pipeline.run(documents=documents_jerry)
# Insert nodes into the index
index.insert_nodes(nodes)

In [25]:
# For user Ravi
for document in documents_ravi:
    document.metadata['user'] = 'Ravi'

nodes = pipeline.run(documents=documents_ravi)
# Insert nodes into the index
index.insert_nodes(nodes)

# Define Query Engines:

In [42]:
# For Jerry
jerry_query_engine = index.as_query_engine(llm = llm, streaming=True,
    filters=MetadataFilters(
        filters=[
            ExactMatchFilter(
                key="user",
                value="Jerry",
            )
        ]
    ),
    similarity_top_k=3
)


In [43]:
# For Ravi
ravi_query_engine = index.as_query_engine(llm = llm, streaming=True,
    filters=MetadataFilters(
        filters=[
            ExactMatchFilter(
                key="user",
                value="Ravi",
            )
        ]
    ),
    similarity_top_k=3
)

# Query

In [44]:
jerry_query_engine.query("what are propositions mentioned in the paper?").print_response_stream()


Propositions are not explicitly mentioned in the paper, but the authors propose using them as a retrieval unit for dense retrieval models. They explain that each proposition should correspond to a distinct piece of meaning in text, and should be minimal, contextualized, and self-contained. They also demonstrate the concept of proposition and how a passage can be split into its set of propositions using an example on the left side of Figure 2. They expect each proposition to describe exactly one contextualized atomic fact, and so their intuition is that propositions would suitably work as a retrieval unit for information-seeking questions. They empirically compare the use of 100-word passages, sentences, and propositions as retrieval units on Wikipedia, a commonly-used retrieval source for knowledge-intensive NLP tasks, and find that propositions work best. They also train a model called the Propositionizer to generate propositions from a given passage, and use it to finetune a Flan-T5

In [45]:
# Ravi has LLMCompiler paper
ravi_query_engine.query("what are steps involved in LLMCompiler?").print_response_stream()


The steps involved in LLMCompiler are as follows:
Step 1: User Provides Tool Definitions and In-Context Examples for the Planner (Sec. A.3.2)
Step 2: The Planner Generates a Sequence of Tasks and Their Dependencies Using the Provided Tool Definitions and In-Context Examples (Sec. A.3.1)
Step 3: The Task Fetching Unit Fetches Tasks to the Executor Based on a Greedy Policy (Sec. A.3.1)
Step 4: The Executor Executes the Fetched Tasks in Parallel (Sec. A.4)
Note that the steps are not mutually exclusive, and the Planner may use the provided tool definitions and in-context examples to generate tasks and their dependencies, and then the Task Fetching Unit may fetch tasks to the Executor based on a greedy policy, and the Executor may execute the fetched tasks in parallel.