In [None]:
pip install --quiet datasets=='2.18.0' anthropic=='0.21.3' voyageai=='0.2.1' qdrant-client=='1.7.1'


# Building a RAG Agent Utilizing Claude 3 Opus

In our project, we aim to create an efficient RAG (Retrieval-Augmented Generation) agent by integrating several components:

- **LangChain**: To manage the workflow.
- **Voyage AI**: For knowledge embeddings.
- **Qdrant Database**: Serving as our primary data storage.
- **Claude 3 Opus**: As our Language Model.
- **Hugging Face Dataset**: For our dataset needs.

### Quick Intro
1. We'll be using the AI ArXiv dataset from Hugging Face, specifically the prechunked version: [Link](https://huggingface.co/datasets/jamescalam/ai-arxiv2-chunks)
    - The full version is available [Here](https://huggingface.co/datasets/jamescalam/ai-arxiv2)  

&nbsp;
2. Voyage Setup

&nbsp;
3. Qdrant DB Setup

&nbsp;
4. Anthropic Setup


### 1- Dataset

We will be utilising version 2 of the AI ArXiv dataset from Hugging Face. Our chosen dataset is already pre-chunked for convenience. However, there is also an option to work with the raw/plain version of the dataset. (Refer to the links mentioned above).

In [None]:
from datasets import load_dataset

# Fetch only the first 10,000 rows of the pre-chunked train split.
dataset = load_dataset(
    path="jamescalam/ai-arxiv2-chunks",
    split="train[:10000]",
)
dataset

In [None]:
# Display the first record of the loaded split to see which fields each chunk carries.
dataset[0]

### 2- Voyage Setup

Regarding embeddings, we'll use VoyageEmbeddings, leveraging the embedding models provided by Voyage AI. An API key is necessary for this process. 

First, we'll establish a connection with Voyage AI and then create an `embed` object specifically for handling embeddings.

In [None]:
import os
from dotenv import load_dotenv
from langchain_community.embeddings import VoyageEmbeddings

# Copy the variables defined in the local .env file into the process environment.
load_dotenv()

# Voyage AI credential; None if the variable is not set.
voyage_key = os.environ.get("VOYAGE_API_KEY")

# Embedding client backed by the "voyage-2" model.
embed = VoyageEmbeddings(model="voyage-2", voyage_api_key=voyage_key)

### 3- Qdrant DB Setup

We will use a vector database to store and query our embeddings. For this purpose, Qdrant is our chosen platform, which also necessitates obtaining a free API key.

In [None]:
from qdrant_client.http.models import Distance, VectorParams
from qdrant_client.http.models import PointStruct
from qdrant_client import QdrantClient
from qdrant_client.http import models

# Connection settings are read from the environment.
qdrant_url = os.environ.get("QDRANT_URL")
qdrant_api_key = os.environ.get("QDRANT_API_KEY")

qdrant_client = QdrantClient(url=qdrant_url, api_key=qdrant_api_key)

# Name of the collection that will hold the chunk embeddings.
collection = "rag_with_claude"

# (Re)create the collection with 1024-dimensional vectors compared by dot product.
qdrant_client.recreate_collection(
    collection_name=collection,
    vectors_config=VectorParams(size=1024, distance=Distance.DOT),
)

This code processes the dataset in batches, generates embeddings for each batch, and then stores these embeddings along with their metadata in the Qdrant database.

In [None]:
from tqdm.auto import tqdm
import uuid

# tqdm shows a progress bar over the batches; uuid generates a unique id for every point.

data = dataset.to_pandas()

batch_size = 100

for batch_start in tqdm(range(0, len(data), batch_size)):
    batch_end = min(len(data), batch_start + batch_size)
    # Rows belonging to the current batch.
    batch = data.iloc[batch_start:batch_end]

    # Text of each chunk, in row order, and its embedding.
    texts = batch['chunk'].tolist()
    embeds = embed.embed_documents(texts)

    # Payload stored next to each vector: the chunk text plus its source and title.
    metadata = (
        batch[['chunk', 'source', 'title']]
        .rename(columns={'chunk': 'text'})
        .to_dict('records')
    )

    # One fresh random id per chunk.
    ids = [str(uuid.uuid4()) for _ in range(len(batch))]

    # Loop names are chosen so they do not shadow the builtin `id`
    # or the module-level `embed` embeddings client.
    points = [
        PointStruct(id=point_id, vector=vector, payload=payload)
        for point_id, vector, payload in zip(ids, embeds, metadata)
    ]

    operation_info = qdrant_client.upsert(
        collection_name=collection,
        points=points,
    )


In [None]:
from qdrant_client import QdrantClient


def arxiv_search(query: str, limit: int = 5) -> list:
    """Retrieve the stored arXiv chunks most similar to ``query``.

    For inquiries about artificial intelligence, machine learning, data science,
    or other technical fields where arXiv papers might offer relevant
    information and answers, consider using this tool.

    Args:
        query: Natural-language question or search phrase.
        limit: Maximum number of hits to return (defaults to 5).

    Returns:
        The list of hits returned by ``qdrant_client.search``, each carrying
        the payload stored with its vector.
    """
    # Embed the query with the same client used for the documents.
    query_vector = embed.embed_query(query)

    # `with_payload` replaces the deprecated `append_payload` alias.
    return qdrant_client.search(
        collection_name=collection,
        query_vector=query_vector,
        limit=limit,
        with_payload=True,
    )



In [None]:
# Try the retriever on a sample question; the hits are kept in `results`.
results = arxiv_search("Do you know what claude ai is?")

In [None]:
# Render every hit as a labelled block (rank, source, title, text), ranks starting at 1.
formatted_results = [
    f"Result-{rank}\nSource:\n{hit.payload['source']}\nTitle:\n{hit.payload['title']}\nText:\n{hit.payload['text']}"
    for rank, hit in enumerate(results, start=1)
]

# Separate the blocks with a blank line and preview only the first 500 characters.
full_text = '\n\n'.join(formatted_results)
print(full_text[:500])


### 4- Anthropic - Claude AI
- Next, we initialize our connection to Anthropic. For this, we need an Anthropic API key.

In [None]:
import anthropic

# Build the Anthropic API client from the key stored in the environment.
anthropic_api_key = os.environ.get("ANTHROPIC_API_KEY")
client = anthropic.Anthropic(api_key=anthropic_api_key)


- #### RAG Function

In [None]:
def _format_search_results(results) -> str:
    """Render search hits as numbered Source/Title/Text blocks separated by blank lines."""
    return '\n\n'.join(
        f"Result-{idx}\nSource:\n{hit.payload['source']}\nTitle:\n{hit.payload['title']}\nText:\n{hit.payload['text']}"
        for idx, hit in enumerate(results, start=1)
    )


def handle_user_query(query, model="claude-3-opus-20240229", max_tokens=1024):
    """Answer ``query`` with Claude, grounded in chunks retrieved by ``arxiv_search``.

    Args:
        query: The user's question.
        model: Anthropic model identifier to call.
        max_tokens: Upper bound on the number of tokens in the generated answer.

    Returns:
        A ``(answer_text, search_result)`` tuple: the text of the first content
        block of the model's reply, and the formatted retrieval context that
        was sent along with the question.
    """
    search_result = _format_search_results(arxiv_search(query))

    # Spelled out with explicit escapes so the prompt's exact whitespace is visible.
    system_prompt = (
        "\n    Your task is to provide informed and accurate answers\n"
        "    to technical inquiries in artificial intelligence, machine learning and data science, \n"
        "    trained extensively on arXiv papers."
    )

    response = client.messages.create(
        model=model,
        max_tokens=max_tokens,
        system=system_prompt,
        messages=[
            {
                "role": "user",
                "content": "Answer this user query: " + query + " with the following context: \n " + search_result,
            }
        ],
    )

    return response.content[0].text, search_result


In [None]:
# Run the retrieve-then-answer pipeline on a sample question.
query = "Is llama better than GPT?"
response, search_result = handle_user_query(query)

# Print the answer, a divider, then the retrieved context that was sent to the model.
print(
    f"Response:\n {response}",
    "---" * 30 + "\n" * 2,
    f"Search Result:\n {search_result}",
    sep="\n",
)