In [1]:
import os
import random
import time

import wikipediaapi
from dotenv import load_dotenv
from langchain_google_genai import ChatGoogleGenerativeAI
from sentence_transformers import SentenceTransformer

import faiss

# Load variables from a local .env file into the process environment.
# The Gemini client created later in this notebook reads GOOGLE_API_KEY
# through os.getenv, so this must run before that cell.
load_dotenv()

  from .autonotebook import tqdm as notebook_tqdm


True

In [2]:
# Fetch one Wikipedia article and break it into sentence-sized chunks.
# Produces: `wiki`, `article_title`, `page`, `text_content`, `sentences`.

# Initialize the Wikipedia API client (English wiki, WIKI extract format)
wiki = wikipediaapi.Wikipedia(
    language='en',
    extract_format=wikipediaapi.ExtractFormat.WIKI,
    user_agent='RAG Knowledge Graph Demo/1.0'
)

# Get Wikipedia article (e.g., "Artificial Intelligence")
article_title = "Artificial Intelligence"
page = wiki.page(article_title)

# Raise instead of calling exit(): inside a notebook exit() tears down the
# kernel (losing all state) rather than reporting a readable error, and
# later cells would otherwise run without `sentences` being defined.
if not page.exists():
    raise LookupError(f"Article '{article_title}' not found")

text_content = page.text
print(f"Successfully retrieved Wikipedia article: {article_title}")

# Split on blank lines first, then on ". " within each chunk; whitespace is
# trimmed and empty fragments are dropped. Note that splitting on ". "
# removes the period from every sentence except the last one in a chunk.
sentences = [
    fragment
    for paragraph in text_content.split("\n\n")
    for fragment in (piece.strip() for piece in paragraph.split(". "))
    if fragment
]

print(f"Extracted {len(sentences)} sentences from Wikipedia article")

Successfully retrieved Wikipedia article: Artificial Intelligence
Extracted 476 sentences from Wikipedia article


In [None]:
# Load the sentence-embedding model by name and encode every entry of
# `sentences` into a NumPy array (one row per sentence).
model_name = "all-MiniLM-L6-v2"
embedder = SentenceTransformer(model_name)

sentence_embeddings = embedder.encode(
    sentences,
    convert_to_numpy=True,
)

# Report (rows, columns) so the reader can confirm the embedding size.
embeddings_shape = sentence_embeddings.shape
print(f"Embeddings shape: {embeddings_shape}")

Embeddings shape: (476, 384)


In [None]:
# Build an IVF + product-quantization FAISS index over the sentence embeddings.
# Produces: `index` (trained and populated) plus the tuning constants below.
dimension = sentence_embeddings.shape[1]  # Embedding dimension
num_vectors = sentence_embeddings.shape[0]  # Number of sentences
nlist = 25  # Number of Voronoi cells (inverted lists)
M = 8  # Number of sub-quantizers for PQ
nbits = 4  # Bits per sub-quantizer
nprobe = 5  # Number of inverted lists scanned per query

# Product quantization cuts each vector into M equal slices, so the
# embedding dimension has to be a multiple of M.
if dimension % M != 0:
    raise ValueError(
        f"Embedding dimension {dimension} is not divisible by M={M} sub-quantizers"
    )

# Create the index factory string for IVFPQ
index_factory_string = f"IVF{nlist},PQ{M}x{nbits}"

# Instantiate the FAISS index
index = faiss.index_factory(dimension, index_factory_string)

print("Training index...")
index.train(sentence_embeddings)  # Train on the embeddings

print("Adding vectors to index...")
index.add(sentence_embeddings)  # Add the embeddings to the index

# IVF indexes scan only one inverted list per query by default, which misses
# neighbours that were assigned to adjacent cells and can return fewer than
# k results. Probe several cells to trade a little speed for recall.
index.nprobe = nprobe

# Every embedding row should now be stored in the index.
assert index.ntotal == num_vectors, (
    f"expected {num_vectors} vectors in the index, found {index.ntotal}"
)

print(f"Index is trained: {index.is_trained}")
print(f"Number of vectors in index: {index.ntotal}")

In [None]:
# Ask Gemini to write one question per sampled sentence.
# Produces: `llm`, `selected_sentences`, `question_prompt`, `query_texts`.
# `random` and `time` are imported in the notebook's top import cell.

# GOOGLE_API_KEY is read from the environment (e.g. populated from .env);
# never paste the key into the notebook itself.
llm = ChatGoogleGenerativeAI(model="gemini-2.0-flash", api_key=os.getenv("GOOGLE_API_KEY"))

# Select 4 random sentences. A private, seeded generator keeps the sample
# identical across "Restart & Run All"; set the seed to None for a fresh
# sample on every run. The count is capped so short articles do not make
# random.sample raise ValueError.
question_seed = 42
num_questions = min(4, len(sentences))
selected_sentences = random.Random(question_seed).sample(sentences, num_questions)

# Generate questions using Gemini
question_prompt = """
For the following sentence, create one specific question that can be answered using only the information in the sentence. 
Make the question clear and focused.

Sentence: {sentence}

Generate only the question, without any additional text or explanation.
"""

query_texts = []
print("Selected sentences and generated questions:\n")

for i, source_sentence in enumerate(selected_sentences, 1):
    response = llm.invoke(question_prompt.format(sentence=source_sentence))

    # `.text` is a property on langchain-core 1.x but a method on 0.3.x;
    # accept both so a plain string (not a bound method) is stored.
    question = response.text
    if not isinstance(question, str):
        question = question()
    # Drop the surrounding whitespace/newline the model tends to emit.
    question = question.strip()
    query_texts.append(question)

    print(f"Sentence {i}: {source_sentence}")
    print(f"Question {i}: {question}\n")

    time.sleep(2) # Sleep for 2 seconds to avoid rate limiting


Searching index for query: 'What was Nvidia's revenue in the last fiscal year?'...

Query: What was Nvidia's revenue in the last fiscal year?

Retrieved sentences:
Rank 1: 333-
74905, 333-51520, 333-74868, 333-100010, 333-106191, 333-114375, 333-123933, 333-132493, 333-140021, 333-143953, 333-181625, 333-
185036, 333-188775, 333-196259, 333-211615, 333-229774, 333-237833, 333-239164, 333-249570, 333-259044, and 333-267207) of NVIDIA
Corporation of our report dated February 21, 2024 relating to the financial statements, financial statement schedule and the effectiveness of
internal control over financial reporting, which appears in this Form 10-K.
/s/ PricewaterhouseCoopers LLP
San Jose, California
February 21, 2024  EXHIBIT 31.1
CERTIFICATION
 
I, Jen-Hsun Huang, certify that:
 
1 (Distance: 0.4633)
Response: content="The provided context is an excerpt from Nvidia's 10-K report, specifically the part containing the auditor's report and the CEO's certification. It does **not** contain 

Retrying langchain_google_genai.chat_models._chat_with_retry.<locals>._chat_with_retry in 2.0 seconds as it raised ResourceExhausted: 429 Resource has been exhausted (e.g. check quota)..


Response: content="The provided context does *not* discuss Nvidia's research and development expenses. It only contains a general disclaimer about the uncertainty of forward-looking statements. Therefore, I cannot provide a discussion of Nvidia's R&D expenses based on this excerpt.\n\nTo discuss Nvidia's research and development expenses, I would need information from the 10-K report that specifically addresses that topic, such as:\n\n*   **The amount of R&D expenses for the fiscal year.**\n*   **A comparison of R&D expenses to previous years.**\n*   **Discussion of the types of projects or areas where R&D is focused.**\n*   **Management's discussion of the importance of R&D to the company's strategy.**\n\nWithout that information, I can only say that Nvidia, like most technology companies, likely invests significantly in research and development." additional_kwargs={} response_metadata={'prompt_feedback': {'block_reason': 0, 'safety_ratings': []}, 'finish_reason': 'STOP', 'safety_rati

ResourceExhausted: 429 Resource has been exhausted (e.g. check quota).

In [None]:
# For each generated question: embed it, retrieve the k nearest sentences
# from the FAISS index, and ask the LLM to answer from that context only.
prompt = """Consider the following context from Wikipedia: {sentences}
Now answer the following question using only the context provided: {query}?"""

k = 5  # number of nearest neighbors to retrieve

for query_text in query_texts:
    # Embed the query text
    query_embedding = embedder.encode(query_text, convert_to_numpy=True).reshape(
        1, -1
    )  # reshape to 2D array

    print(f"\nSearching index for query: '{query_text}'...")
    distances, indices = index.search(query_embedding, k)  # perform the search

    # FAISS pads the result with index -1 when fewer than k neighbours are
    # found; sentences[-1] would silently return the last sentence, so
    # those slots are dropped here.
    hits = [
        (int(idx), float(dist))
        for idx, dist in zip(indices[0], distances[0])
        if idx >= 0
    ]

    print("\nQuery:", query_text)
    print("\nRetrieved sentences:")
    for rank, (idx, dist) in enumerate(hits, 1):
        print(f"\nRank {rank}: {sentences[idx]} (Distance: {dist:.4f})")

    if hits:
        # One LLM call per query with all retrieved sentences as context,
        # instead of one un-throttled call per sentence (which answered from
        # a single sentence at a time and multiplied the request rate by k).
        context = "\n".join(sentences[idx] for idx, _ in hits)
        # The template already ends with "?", so strip the query's own
        # trailing "?" to avoid sending "??".
        question = query_text.strip().rstrip("?")
        response = llm.invoke(prompt.format(sentences=context, query=question))

        # `.text` is a property on langchain-core 1.x but a method on 0.3.x.
        answer = response.text
        if not isinstance(answer, str):
            answer = answer()
        print(f"\nAnswer: {answer}")
    else:
        print("\nNo sentences retrieved for this query; skipping the LLM call.")

    # separator between queries for better readability
    print("\n" + "="*80 + "\n")
    time.sleep(2)  # 2 seconds wait between call to avoid rate limiting