<a href="https://colab.research.google.com/github/edyoda/AI-Agent-Development-and-GenAI/blob/main/Day_10_Autogen_with_RAG.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
!pip install chromadb
!pip install autogen_agentchat

Collecting chromadb
  Downloading chromadb-1.0.5-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.9 kB)
Collecting build>=1.0.3 (from chromadb)
  Downloading build-1.2.2.post1-py3-none-any.whl.metadata (6.5 kB)
Collecting chroma-hnswlib==0.7.6 (from chromadb)
  Downloading chroma_hnswlib-0.7.6-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (252 bytes)
Collecting fastapi==0.115.9 (from chromadb)
  Downloading fastapi-0.115.9-py3-none-any.whl.metadata (27 kB)
Collecting uvicorn>=0.18.3 (from uvicorn[standard]>=0.18.3->chromadb)
  Downloading uvicorn-0.34.2-py3-none-any.whl.metadata (6.5 kB)
Collecting posthog>=2.4.0 (from chromadb)
  Downloading posthog-3.25.0-py2.py3-none-any.whl.metadata (3.0 kB)
Collecting onnxruntime>=1.14.1 (from chromadb)
  Downloading onnxruntime-1.21.1-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (4.5 kB)
Collecting opentelemetry-exporter-otlp-proto-grpc>=1.2.0 (from chromadb)
  Downloading opentele

In [5]:
!pip install autogen_ext

Collecting autogen_ext
  Downloading autogen_ext-0.5.3-py3-none-any.whl.metadata (6.5 kB)
Downloading autogen_ext-0.5.3-py3-none-any.whl (262 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m262.5/262.5 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: autogen_ext
Successfully installed autogen_ext-0.5.3


In [8]:
!pip install aiofiles

Collecting aiofiles
  Downloading aiofiles-24.1.0-py3-none-any.whl.metadata (10 kB)
Downloading aiofiles-24.1.0-py3-none-any.whl (15 kB)
Installing collected packages: aiofiles
Successfully installed aiofiles-24.1.0


In [1]:
!pip install autogen

Collecting autogen
  Downloading autogen-0.8.7-py3-none-any.whl.metadata (24 kB)
Collecting pyautogen==0.8.7 (from autogen)
  Downloading pyautogen-0.8.7-py3-none-any.whl.metadata (35 kB)
Collecting asyncer==0.0.8 (from pyautogen==0.8.7->autogen)
  Downloading asyncer-0.0.8-py3-none-any.whl.metadata (6.7 kB)
Collecting diskcache (from pyautogen==0.8.7->autogen)
  Downloading diskcache-5.6.3-py3-none-any.whl.metadata (20 kB)
Collecting docker (from pyautogen==0.8.7->autogen)
  Downloading docker-7.1.0-py3-none-any.whl.metadata (3.8 kB)
Collecting python-dotenv (from pyautogen==0.8.7->autogen)
  Downloading python_dotenv-1.1.0-py3-none-any.whl.metadata (24 kB)
Collecting tiktoken (from pyautogen==0.8.7->autogen)
  Downloading tiktoken-0.9.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Downloading autogen-0.8.7-py3-none-any.whl (13 kB)
Downloading pyautogen-0.8.7-py3-none-any.whl (739 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m739.3/7

In [9]:
import re
from typing import List

import aiofiles
import aiohttp
from autogen_core.memory import Memory, MemoryContent, MemoryMimeType


class SimpleDocumentIndexer:
    """Basic document indexer for AutoGen Memory."""

    def __init__(self, memory: Memory, chunk_size: int = 1500) -> None:
        self.memory = memory
        self.chunk_size = chunk_size

    async def _fetch_content(self, source: str) -> str:
        """Fetch content from URL or file."""
        if source.startswith(("http://", "https://")):
            async with aiohttp.ClientSession() as session:
                async with session.get(source) as response:
                    return await response.text()
        else:
            async with aiofiles.open(source, "r", encoding="utf-8") as f:
                return await f.read()

    def _strip_html(self, text: str) -> str:
        """Remove HTML tags and normalize whitespace."""
        text = re.sub(r"<[^>]*>", " ", text)
        text = re.sub(r"\s+", " ", text)
        return text.strip()

    def _split_text(self, text: str) -> List[str]:
        """Split text into fixed-size chunks."""
        chunks: list[str] = []
        # Just split text into fixed-size chunks
        for i in range(0, len(text), self.chunk_size):
            chunk = text[i : i + self.chunk_size]
            chunks.append(chunk.strip())
        return chunks

    async def index_documents(self, sources: List[str]) -> int:
        """Index documents into memory."""
        total_chunks = 0

        for source in sources:
            try:
                content = await self._fetch_content(source)

                # Strip HTML if content appears to be HTML
                if "<" in content and ">" in content:
                    content = self._strip_html(content)

                chunks = self._split_text(content)

                for i, chunk in enumerate(chunks):
                    await self.memory.add(
                        MemoryContent(
                            content=chunk, mime_type=MemoryMimeType.TEXT, metadata={"source": source, "chunk_index": i}
                        )
                    )

                total_chunks += len(chunks)

            except Exception as e:
                print(f"Error indexing {source}: {str(e)}")

        return total_chunks


In [14]:
import os
from pathlib import Path

from autogen_agentchat.agents import AssistantAgent
from autogen_agentchat.ui import Console
from autogen_ext.memory.chromadb import ChromaDBVectorMemory, PersistentChromaDBVectorMemoryConfig
from autogen_ext.models.openai import OpenAIChatCompletionClient

# Initialize vector memory

rag_memory = ChromaDBVectorMemory(
    config=PersistentChromaDBVectorMemoryConfig(
        collection_name="autogen_docs",
        persistence_path=os.path.join(str(Path.home()), ".chromadb_autogen"),
        k=3,  # Return top 3 results
        score_threshold=0.4,  # Minimum similarity score
    )
)

await rag_memory.clear()  # Clear existing memory


# Index AutoGen documentation
async def index_autogen_docs() -> None:
    indexer = SimpleDocumentIndexer(memory=rag_memory)
    sources = [
        "https://www.edyoda.com/faq",
    ]
    chunks: int = await indexer.index_documents(sources)
    print(f"Indexed {chunks} chunks from {len(sources)} AutoGen documents")


await index_autogen_docs()


Indexed 169 chunks from 1 AutoGen documents


In [12]:
import os
from google.colab import userdata
os.environ['OPENAI_API_KEY'] = userdata.get('OPENAI_API_KEY')

In [15]:
# Create our RAG assistant agent
rag_assistant = AssistantAgent(
    name="rag_assistant", model_client=OpenAIChatCompletionClient(model="gpt-4o"), memory=[rag_memory]
)

# Ask questions about AutoGen
stream = rag_assistant.run_stream(task="Are LIVE session recordings available?")
await Console(stream)

# Remember to close the memory when done
await rag_memory.close()

---------- TextMessage (user) ----------
Are LIVE session recordings available?
---------- MemoryQueryEvent (rag_assistant) ----------
[MemoryContent(content='is scheduled for today, you should see - &ldquo;Join LIVE Session&rdquo; 3. If your LIVE class isn&rsquo;t today. This is where your start date will be visible. 4. To know your next LIVE session date or another way to join &nbsp; Question: How do I access study materials? If you want to access study materials before scheduling the batch. Select the micro degree &amp; click on Study Materials - &ldquo; View Content &rdquo;. Make sure that you select the module named Study Material. Click on Pre-watch videos tab &nbsp; &nbsp; If you want to access study materials after scheduling the batch. Choose the micro degree &amp; click on view course 4. Inside view course, choose study materials Question: How do I access LIVE classes recordings? 1. Choose the micro-degree for which you wish to access session recordings. Don&#39;t click on St

In [18]:
# Ask questions about AutoGen
stream = rag_assistant.run_stream(task="Are study matrials available?")
await Console(stream)

# Remember to close the memory when done
await rag_memory.close()

---------- TextMessage (user) ----------
Are study matrials available?
---------- TextMessage (rag_assistant) ----------
Yes, study materials are available. You can access them by selecting the micro-degree you are enrolled in. If you want to access study materials before scheduling your batch, you can click on "Study Materials" and then "View Content." Ensure that you select the module named "Study Material." Additionally, you can click on the "Pre-watch videos" tab for more resources. If you have already scheduled your batch, you can choose the micro-degree and click on "view course," then select "study materials" inside the course view.

TERMINATE
