In [1]:
# Install required packages
!pip install nemo_toolkit[all]






In [2]:
!pip install huggingface-hub==0.23.2
!pip install transformers==4.40.0

Collecting huggingface-hub==0.23.2
  Downloading huggingface_hub-0.23.2-py3-none-any.whl.metadata (12 kB)
Downloading huggingface_hub-0.23.2-py3-none-any.whl (401 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m401.7/401.7 kB[0m [31m8.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: huggingface-hub
  Attempting uninstall: huggingface-hub
    Found existing installation: huggingface-hub 0.24.7
    Uninstalling huggingface-hub-0.24.7:
      Successfully uninstalled huggingface-hub-0.24.7
Successfully installed huggingface-hub-0.23.2
Collecting transformers==4.40.0
  Downloading transformers-4.40.0-py3-none-any.whl.metadata (137 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m137.6/137.6 kB[0m [31m6.1 MB/s[0m eta [36m0:00:00[0m
Downloading transformers-4.40.0-py3-none-any.whl (9.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.0/9.0 MB[0m [31m92.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collecte

In [3]:
!pip install faiss-gpu

Collecting faiss-gpu
  Downloading faiss_gpu-1.7.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (1.4 kB)
Downloading faiss_gpu-1.7.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (85.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.5/85.5 MB[0m [31m8.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-gpu
Successfully installed faiss-gpu-1.7.2


# Main imports

In [19]:
import torch
import cudf
import cupy as cp
import numpy as np
from typing import List
# Import the Megatron-LM model specifically
from nemo.collections.nlp.models.language_modeling.megatron_gpt_model import MegatronGPTModel
from transformers import AutoTokenizer, AutoModel, AutoModelForCausalLM
from sentence_transformers import SentenceTransformer
import faiss
from typing import List, Dict, Tuple


# Main Class

In [44]:
class NvidiaRAGSystem:
    def __init__(self, device='cuda' if torch.cuda.is_available() else 'cpu'):
        self.device = device
        self.embedding_model = None
        self.llm = None
        self.index = None
        self.document_store = None
        self.tokenizer = None
        print(f"Initializing RAG system on {self.device}")
        self.setup_models()

    def setup_models(self):
        """Initialize embedding model and LLM"""
        print("Loading models...")
        try:
            # Load embedding model (Sentence Transformer)
            self.embedding_model = SentenceTransformer('all-MiniLM-L6-v2').to(self.device)

            # Load language model (GPT-2 instead of Megatron GPT)
            self.llm = AutoModelForCausalLM.from_pretrained("gpt2").to(self.device)

            # Use AutoTokenizer to load the tokenizer associated with the model
            self.tokenizer = AutoTokenizer.from_pretrained("gpt2")
            print("Models loaded successfully")
        except Exception as e:
            print(f"Error loading models: {str(e)}")

    def process_documents(self, documents: List[str], chunk_size: int = 512):
        """Process documents using RAPIDS for efficient data handling"""
        try:
            # Convert to pandas DataFrame for text processing
            df = cudf.DataFrame({'text': documents}).to_pandas()

            # Tokenize and chunk documents
            chunks = []
            for doc in df['text']:
                # Simple chunking by words
                words = doc.split()
                for i in range(0, len(words), chunk_size):
                    chunk = ' '.join(words[i:i + chunk_size])
                    chunks.append(chunk)

            return chunks
        except Exception as e:
            print(f"Error processing documents: {str(e)}")
            return documents

    def build_index(self, document_chunks):
        """Build FAISS index for document search (CPU version)."""
        embeddings = self.embedding_model.encode(document_chunks, convert_to_numpy=True)
        # Build FAISS index on CPU
        index = faiss.IndexFlatL2(embeddings.shape[1])  # L2 distance index
        index.add(embeddings)  # Add embeddings to index
        self.index = index
        self.document_store = document_chunks  # Store for chunk retrieval
        print("Index built successfully on CPU")

    def retrieve_relevant_chunks(self, query: str, k: int = 3) -> List[str]:
        """Retrieve relevant chunks using FAISS index"""
        try:
            # Generate query embedding
            query_embedding = self.embedding_model.encode([query])[0].astype(np.float32)
            query_embedding = query_embedding.reshape(1, -1)

            # Search index
            distances, indices = self.index.search(query_embedding, k)

            # Retrieve chunks
            relevant_chunks = [self.document_store[i] for i in indices[0]]
            return relevant_chunks
        except Exception as e:
            print(f"Error retrieving chunks: {str(e)}")
            return []

    def generate_response(self, query: str, context: List[str]) -> str:
        """Generate response using retrieved context"""
        # Combine query and context
        prompt = f"""Context: {' '.join(context)}

        Question: {query}

        Answer: """

        # Generate response using GPT-2
        tokens = self.tokenizer.encode(prompt, return_tensors="pt").to(self.device)
        response = self.llm.generate(tokens, max_length=200)

        # Decode response
        generated_text = self.tokenizer.decode(response[0], skip_special_tokens=True)
        return generated_text


In [45]:
def demonstrate_rag():
    """Demonstrate the RAG system with example documents"""
    documents = [
        "NVIDIA Corporation is a technology company founded in 1993.",
        "NVIDIA's GPU Technology Conference (GTC) is a global conference series.",
        "The NVIDIA CUDA Toolkit provides a development environment for GPU-accelerated applications."
    ]

    # Initialize RAG system
    print("Initializing RAG system...")
    rag_system = NvidiaRAGSystem()

    # Process documents
    print("\nProcessing documents...")
    chunks = rag_system.process_documents(documents)

    # Build index
    print("\nBuilding search index...")
    rag_system.build_index(chunks)

    # Example queries
    queries = ["What is NVIDIA's main product?", "What is GTC?", "How does NVIDIA support developers?"]

    # Demonstrate RAG pipeline
    print("\nDemonstrating RAG pipeline...")
    for query in queries:
        print(f"\nQuery: {query}")

        # Retrieve relevant chunks
        relevant_chunks = rag_system.retrieve_relevant_chunks(query)
        print("\nRetrieved relevant contexts:")
        for i, chunk in enumerate(relevant_chunks, 1):
            print(f"{i}. {chunk[:100]}...")

        # Generate response
        response = rag_system.generate_response(query, relevant_chunks)
        print(f"\nGenerated Response: {response}")


# Execute demo

In [46]:
demonstrate_rag()

Initializing RAG system...
Initializing RAG system on cuda
Loading models...


    
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Models loaded successfully

Processing documents...

Building search index...
Index built successfully on CPU

Demonstrating RAG pipeline...

Query: What is NVIDIA's main product?

Retrieved relevant contexts:
1. NVIDIA Corporation is a technology company founded in 1993....
2. The NVIDIA CUDA Toolkit provides a development environment for GPU-accelerated applications....
3. NVIDIA's GPU Technology Conference (GTC) is a global conference series....


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.



Generated Response: Context: NVIDIA Corporation is a technology company founded in 1993. The NVIDIA CUDA Toolkit provides a development environment for GPU-accelerated applications. NVIDIA's GPU Technology Conference (GTC) is a global conference series.

        Question: What is NVIDIA's main product?

        Answer:                                                                                                                                

Query: What is GTC?

Retrieved relevant contexts:
1. NVIDIA's GPU Technology Conference (GTC) is a global conference series....
2. The NVIDIA CUDA Toolkit provides a development environment for GPU-accelerated applications....
3. NVIDIA Corporation is a technology company founded in 1993....


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.



Generated Response: Context: NVIDIA's GPU Technology Conference (GTC) is a global conference series. The NVIDIA CUDA Toolkit provides a development environment for GPU-accelerated applications. NVIDIA Corporation is a technology company founded in 1993.

        Question: What is GTC?

        Answer:                                                                                                                                  

Query: How does NVIDIA support developers?

Retrieved relevant contexts:
1. The NVIDIA CUDA Toolkit provides a development environment for GPU-accelerated applications....
2. NVIDIA Corporation is a technology company founded in 1993....
3. NVIDIA's GPU Technology Conference (GTC) is a global conference series....

Generated Response: Context: The NVIDIA CUDA Toolkit provides a development environment for GPU-accelerated applications. NVIDIA Corporation is a technology company founded in 1993. NVIDIA's GPU Technology Conference (GTC) is a global conference se