In [4]:
import os
# Never hardcode API keys in the notebook: reuse whatever is already exported
# in the environment and only prompt (without echo) for keys that are missing.
# Assigning a placeholder here would overwrite a valid key set outside the notebook.
import getpass

for _key_name in ("OPENAI_API_KEY", "COHERE_API_KEY"):
    if not os.environ.get(_key_name):
        os.environ[_key_name] = getpass.getpass(f"{_key_name}: ")

In [6]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import WebBaseLoader
from langchain_community.vectorstores import Chroma
from langchain_huggingface.embeddings import HuggingFaceEmbeddings

from langchain_openai import OpenAIEmbeddings

# Embedding model used later when the chunks are indexed.
embedding = OpenAIEmbeddings(model="text-embedding-3-large")

# Blog posts that make up the corpus.
urls = [
    "https://lilianweng.github.io/posts/2023-06-23-agent/",
    "https://lilianweng.github.io/posts/2023-03-15-prompt-engineering/",
    "https://lilianweng.github.io/posts/2023-10-25-adv-attack-llm/",
]

# Load: `docs` keeps one loader result (a list) per URL; `docs_list` flattens them.
docs = [WebBaseLoader(page_url).load() for page_url in urls]
docs_list = [page for loaded_pages in docs for page in loaded_pages]

# Split: chunk_size is measured with the tiktoken encoder, with no overlap between chunks.
text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
    chunk_size=512,
    chunk_overlap=0,
)
doc_splits = text_splitter.split_documents(docs_list)



USER_AGENT environment variable not set, consider setting it to identify your requests.


In [7]:
# Show the string form of the first chunk to sanity-check the split output.
str(doc_splits[0])

'page_content=\'LLM Powered Autonomous Agents | Lil\'Log\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nLil\'Log\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nPosts\n\n\n\n\nArchive\n\n\n\n\nSearch\n\n\n\n\nTags\n\n\n\n\nFAQ\n\n\n\n\nemojisearch.app\n\n\n\n\n\n\n\n\n\n      LLM Powered Autonomous Agents\n    \nDate: June 23, 2023  |  Estimated Reading Time: 31 min  |  Author: Lilian Weng\n\n\n \n\n\nTable of Contents\n\n\n\nAgent System Overview\n\nComponent One: Planning\n\nTask Decomposition\n\nSelf-Reflection\n\n\nComponent Two: Memory\n\nTypes of Memory\n\nMaximum Inner Product Search (MIPS)\n\n\nComponent Three: Tool Use\n\nCase Studies\n\nScientific Discovery Agent\n\nGenerative Agents Simulation\n\nProof-of-Concept Examples\n\n\nChallenges\n\nCitation\n\nReferences\n\n\n\n\n\nBuilding agents with LLM (large language model) as its core controller is a cool concept. Several proof-of-concepts demos, such as AutoGPT, GPT-Engineer and BabyAGI, serve as 

In [8]:

def instructions(context_variables):
    """Render the agent's system prompt from the run's context variables.

    Reads the ``whole_document`` and ``chunk`` entries of ``context_variables``
    (a missing entry is rendered as ``None``) and wraps them in
    ``<document>`` / ``<chunk>`` tags.

    Returns:
        The prompt text as a single string ending with a newline.
    """
    document_text = context_variables.get("whole_document")
    chunk_text = context_variables.get("chunk")
    # Trailing spaces inside the literals are part of the prompt text.
    prompt_lines = [
        "<document> ",
        f"{document_text}",
        "</document> ",
        "Here is the chunk we want to situate within the whole document ",
        "<chunk> ",
        f"{chunk_text} ",
        "</chunk> ",
        "",
    ]
    return "\n".join(prompt_lines)

from swarm import Swarm, Agent

client = Swarm()

# `instructions` is a callable: it renders the <document>/<chunk> prompt from
# the context variables passed to client.run().
chunk_agent = Agent(
    name="Chunk Agent",
    instructions=instructions,
    model = 'gpt-4o-mini'
)

# Situate the chunk within the page it was split from. Passing `docs` (a list
# of per-URL lists of Document objects) would put the Python repr of every
# loaded page into the prompt instead of the text of one document.
# Assumes the loader stored each page's URL under metadata["source"]; if no
# page matches, fall back to the concatenated text of all pages.
sample_chunk = doc_splits[10]
sample_source = sample_chunk.metadata.get("source")
source_pages = [
    page.page_content
    for page in docs_list
    if sample_source is not None and page.metadata.get("source") == sample_source
]
whole_document = "\n\n".join(
    source_pages or [page.page_content for page in docs_list]
)

context_variables = {
    'whole_document': whole_document,
    'chunk': sample_chunk.page_content
}

messages = [{"role": "user", "content": '''Please give a short succinct context to situate this chunk within the overall document for the purposes of
improving search retrieval of the chunk. Answer only with the succinct context and nothing else. '''}]
response = client.run(agent=chunk_agent, messages=messages, context_variables=context_variables,)

print(response.messages[-1]["content"])


The chunk is part of the "Maximum Inner Product Search (MIPS)" section, which discusses various algorithms for efficient memory retrieval in large language model architectures. It specifically highlights ScaNN as an innovative method for anisotropic vector quantization, focusing on optimizing the similarity of inner products during the search process.


In [10]:
import concurrent.futures
import time

def process_chunk(i, doc, docs, client, chunk_agent):
    """Prepend an LLM-generated situating context to one chunk.

    Args:
        i: Position of the chunk in the batch; printed as a progress marker.
        doc: Chunk to enrich. Must expose a writable ``page_content`` string.
        docs: Value forwarded unchanged to the agent as the ``whole_document``
            context variable.
        client: Object whose ``run(agent=..., messages=..., context_variables=...)``
            returns a response exposing a ``messages`` list of dicts.
        chunk_agent: Agent handed to ``client.run``.

    Returns:
        The same ``doc`` object. Its ``page_content`` is updated in place to
        the generated context, a newline, then the original text. If the
        model returned no text the chunk is left unchanged.
    """
    print(i)
    context_variables = {
        'whole_document': docs,
        # Send the chunk's text, not the Document object: the object's string
        # form is a "page_content='...'" repr, which would pollute the prompt.
        'chunk': doc.page_content
    }

    messages = [{"role": "user", "content": '''Please give a short succinct context to situate this chunk within the overall document for the purposes of
    improving search retrieval of the chunk. Answer only with the succinct context and nothing else. '''}]

    response = client.run(agent=chunk_agent, messages=messages, context_variables=context_variables,)
    context = response.messages[-1]["content"]
    if not context:
        # No usable reply (None or empty): keep the chunk as-is rather than
        # raising TypeError on concatenation and aborting the whole batch.
        return doc
    # In-place update: later cells index the same Document objects.
    doc.page_content = context + '\n' + doc.page_content
    return doc

# Text of each loaded page keyed by its source URL, so every chunk is situated
# within the page it was split from rather than the repr of all pages at once.
# Assumes the loader stored the URL under metadata["source"]; chunks whose
# source cannot be matched fall back to the concatenated text of all pages.
page_text_by_source = {
    page.metadata.get("source"): page.page_content
    for page in docs_list
    if page.metadata.get("source")
}
all_pages_text = "\n\n".join(page.page_content for page in docs_list)

with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
    futures = [
        executor.submit(
            process_chunk,
            i,
            doc,
            page_text_by_source.get(doc.metadata.get("source"), all_pages_text),
            client,
            chunk_agent,
        )
        for i, doc in enumerate(doc_splits)
    ]

# Collect in submission order so processed_docs lines up with doc_splits
# (as_completed yields futures in whatever order the API calls finish).
# The executor has already joined here, so result() does not block.
processed_docs = [future.result() for future in futures]


0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86


In [22]:
# process_chunk updated the chunks in place, so this preview starts with the generated context.
doc_splits[10].page_content[:800]

"The chunk discusses ScaNN (Scalable Nearest Neighbors), a component of memory retrieval in the context of LLM-powered autonomous agents. It highlights ScaNN's innovation in anisotropic vector quantization, which enhances the efficiency and accuracy of information retrieval within the agent system. This information is part of a broader examination of various memory types and retrieval algorithms used to augment the capabilities of language models in autonomous agents.\nScaNN (Scalable Nearest Neighbors): The main innovation in ScaNN is anisotropic vector quantization. It quantizes a data point $x_i$ to $\\tilde{x}_i$ such that the inner product $\\langle q, x_i \\rangle$ is as similar to the original distance of $\\angle q, \\tilde{x}_i$ as possible, instead of picking the closet quantization cen"

In [12]:
# Embed the chunks and index them in a Chroma collection.
vectorstore = Chroma.from_documents(doc_splits, embedding=embedding)

# Retriever over that collection; later cells refer to it by this exact name.
vectorstore_retreiver = vectorstore.as_retriever()

In [13]:
from langchain_community.retrievers import BM25Retriever
from langchain_core.documents import Document

# Keyword (BM25) retriever built over the same chunks as the vector store.
keyword_retriever = BM25Retriever.from_documents(doc_splits)

from langchain.retrievers import BM25Retriever, EnsembleRetriever

# Hybrid retriever; weights are listed in the same order as the retrievers
# (0.3 for the vector-store retriever, 0.7 for BM25).
ensemble_retriever = EnsembleRetriever(
    retrievers=[vectorstore_retreiver, keyword_retriever],
    weights=[0.3, 0.7],
)

In [16]:
from langchain.retrievers.contextual_compression import ContextualCompressionRetriever
from langchain_cohere import CohereRerank


# Cohere reranker used as the compressor on top of the hybrid retriever.
compressor = CohereRerank(model="rerank-english-v3.0")
compression_retriever = ContextualCompressionRetriever(
    base_retriever=ensemble_retriever,
    base_compressor=compressor,
)



In [17]:
def retrieve(q: str):
    """Search the indexed documents and return the passages most relevant to the query.

    This function is registered as an agent tool, so this docstring and the
    type hint also serve as the tool's description and parameter type.

    Args:
        q: Search query text.

    Returns:
        Whatever ``compression_retriever.invoke(q)`` returns (a sized
        collection of retrieved documents).
    """
    # Progress/debug output: the query the agent chose and the hit count.
    print("retrieve", q)
    ranked_docs = compression_retriever.invoke(q)
    print(len(ranked_docs))
    return ranked_docs

In [18]:
from swarm import Swarm, Agent

client = Swarm()

# Agent instructed to answer only from what the retrieve tool returns; the
# tool is registered directly in the constructor.
rag_agent = Agent(
    name="RAG Agent",
    instructions="Answers the user's question based on data from the retrieve only",
    model='gpt-4o-mini',
    functions=[retrieve],
)

messages = [dict(role="user", content="What are the types of agent memory?")]
response = client.run(messages=messages, agent=rag_agent)

# The last message in the run holds the agent's final answer.
print(response.messages[-1]["content"])

retrieve types of agent memory
3
The types of agent memory in LLM-powered autonomous agents include:

1. **Sensory Memory**: This is the first stage of memory that retains impressions of sensory information (such as visual and auditory) for a short period, typically lasting only a few seconds. Subcategories include:
   - Iconic Memory (visual)
   - Echoic Memory (auditory)
   - Haptic Memory (touch)

2. **Short-Term Memory (STM)**: Also known as working memory, this stores information that individuals are currently aware of and need for complex cognitive tasks such as learning and reasoning. It is believed to have a capacity of about 7 items and lasts for approximately 20-30 seconds.

3. **Long-Term Memory (LTM)**: This type of memory can store information for a long time, ranging from days to decades, with essentially unlimited capacity. LTM is further divided into:
   - **Explicit/Declarative Memory**: Memory of facts and events that can be consciously recalled, including:
     - Epi

In [19]:

# Follow-up question answered by the same RAG agent.
messages = [dict(role="user", content="What is Scann?")]
response = client.run(messages=messages, agent=rag_agent)

print(response.messages[-1]["content"])

retrieve Scann
3
ScaNN (Scalable Nearest Neighbors) is a component related to memory retrieval in large language model (LLM)-powered autonomous agents. It features an innovation in anisotropic vector quantization, which enhances the efficiency and accuracy of information retrieval within such systems. The primary purpose of ScaNN is to quantize data points in a way that makes the inner product between a query and a point as similar as possible to the original distance, rather than merely selecting the closest quantization centroid points.

This technology is part of a broader exploration of various memory types and algorithms used to improve the capabilities of language models in autonomous agents.


In [20]:


# Bare keyword query sent to the same RAG agent.
messages = [dict(role="user", content='Maximum Inner Product Search')]
response = client.run(messages=messages, agent=rag_agent)

print(response.messages[-1]["content"])

retrieve Maximum Inner Product Search
3
Maximum Inner Product Search (MIPS) is a technique used in the context of optimizing retrieval from external memory sources, particularly in systems powered by large language models (LLMs). MIPS aids in quickly finding embedding representations within a vector store, which serves as long-term memory for the LLM. 

In MIPS, a standard practice involves storing the embedding representations of relevant information in a vector store database that supports fast retrieval. For speeding up the retrieval process, approximate nearest neighbors (ANN) algorithms are commonly employed, allowing the system to return approximate top-k nearest neighbors while sacrificing some accuracy for significant speed improvements.

Here are a few common choices of ANN algorithms utilized for efficient MIPS:

1. **Locality-Sensitive Hashing (LSH)**: It hashes similar items to the same buckets with high likelihood, facilitating fast retrieval.
  
2. **ANNOY (Approximate Ne