In [None]:
# To install all dependencies
%pip install -q --upgrade \
    pinecone-client \
    langchain-pinecone \
    langchain-text-splitters \
    langchain \
    google-genai \
    firecrawl-py

print("Packages installed successfully")


In [None]:
# Standard library
import os
import time

# Hosted service clients: Gemini (answer generation) and Firecrawl (web scraping)
from google import genai
from firecrawl import FirecrawlApp

# LangChain document/splitting helpers and the Pinecone SDK (vector storage)
from langchain_core.documents import Document
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_pinecone import PineconeVectorStore, PineconeEmbeddings
from pinecone import Pinecone, ServerlessSpec

# Visible marker that every import above resolved without an ImportError.
print("Imports completed")

In [None]:
# Load API keys for Google Gemini, Pinecone, and Firecrawl from the environment.
# Keys are deliberately not hardcoded in the notebook: export them in the shell
# (or inject them via a secrets manager) before starting the kernel.
REQUIRED_ENV_VARS = ("GOOGLE_API_KEY", "PINECONE_API_KEY", "FIRECRAWL_API_KEY")

# os.getenv returns None for an unset variable, and writing None back into
# os.environ raises an opaque "str expected, not NoneType" TypeError. Check up
# front and report exactly which keys are missing (empty values count as missing).
missing_env_vars = [name for name in REQUIRED_ENV_VARS if not os.getenv(name)]
if missing_env_vars:
    raise RuntimeError(
        "Missing required environment variable(s): " + ", ".join(missing_env_vars)
    )

# The values already live in os.environ, so later cells that read
# os.environ[...] keep working; these names are kept for convenience.
GOOGLE_API_KEY = os.environ["GOOGLE_API_KEY"]
PINECONE_API_KEY = os.environ["PINECONE_API_KEY"]
FIRECRAWL_API_KEY = os.environ["FIRECRAWL_API_KEY"]

In [None]:
# Scraping course content with Firecrawl

COURSE_PAGES = [
    "https://fa25.datastructur.es/",
    "https://fa25.datastructur.es/policies/exams/",
    "https://fa25.datastructur.es/policies/extensions/",
    "https://fa25.datastructur.es/policies/grading/",
    "https://cs61b-2.gitbook.io/cs61b-textbook/4.-sllists",
    "https://cs61b-2.gitbook.io/cs61b-textbook/5.-dllists",
]

# A page is kept only if its markdown is longer than this many characters;
# shorter results are treated as empty pages and skipped.
MIN_PAGE_CHARS = 100

# Initialize Firecrawl
app = FirecrawlApp(api_key=os.environ["FIRECRAWL_API_KEY"])

# Accumulates (url, markdown_text) tuples for every page that scraped cleanly.
scraped_data = []

# Scrape course pages
for page_url in COURSE_PAGES:
    print(f"Scraping: {page_url}")
    try:
        result = app.scrape(page_url)
    except Exception as e:
        # Best-effort crawl: one failing page must not abort the remaining pages.
        print(f"Error: {e}")
        continue

    # The markdown attribute may be absent or None; normalise to "" so the
    # length check below cannot raise and the skip is reported explicitly.
    text = getattr(result, "markdown", None) or ""
    if len(text) > MIN_PAGE_CHARS:
        scraped_data.append((page_url, text))
    else:
        print(f"Skipped (only {len(text)} characters of markdown): {page_url}")

print(f"Successfully scraped {len(scraped_data)} pages")
if scraped_data:
    print(f"Total characters: {sum(len(text) for _, text in scraped_data):,}")


In [None]:
# Wrap every scraped page in a LangChain Document, keeping its URL as metadata
documents = []

# scraped_data holds (url, text) tuples produced by the scraping cell
for page_url, page_text in scraped_data:
    print(f"Converting: {page_url}")
    page_document = Document(metadata={"source": page_url}, page_content=page_text)
    documents.append(page_document)

print(f"Created {len(documents)} LangChain documents")
if len(documents) > 0:
    first_page_text = documents[0].page_content
    print(f"Sample preview: {first_page_text[:200]}...")


In [None]:
# Break each document into overlapping chunks so retrieval returns focused passages

splitter = RecursiveCharacterTextSplitter(chunk_overlap=200, chunk_size=1000)
all_chunks = splitter.split_documents(documents)

chunk_count, document_count = len(all_chunks), len(documents)
print(f"Created {chunk_count} chunks from {document_count} documents")
if chunk_count > 0:
    first_chunk_text = all_chunks[0].page_content
    print(f"Sample preview: {first_chunk_text[:200]}...")


In [None]:
# Initialize Pinecone (for vector storage and retrieval)

INDEX_NAME = "berkeley-course-rag"

pinecone = Pinecone(api_key=os.environ["PINECONE_API_KEY"])

# True: drop and rebuild the index on every run.
# False: reuse the index if it already exists (it is created only when absent).
RESET_INDEX = True

index_exists = INDEX_NAME in pinecone.list_indexes().names()
if RESET_INDEX and index_exists:
    pinecone.delete_index(INDEX_NAME)
    index_exists = False

if index_exists:
    # Creating an index whose name is already taken fails, so with
    # RESET_INDEX = False we only look up the existing index's description.
    print(f"Reusing existing Pinecone index: {INDEX_NAME}")
    index = pinecone.describe_index(INDEX_NAME)
else:
    # Create Pinecone index with an integrated embedding model: the text to
    # embed is read from each record's "chunk_text" field.
    print(f"Creating Pinecone index: {INDEX_NAME}")
    index = pinecone.create_index_for_model(
        name=INDEX_NAME,
        cloud="aws",
        region="us-east-1",
        embed={
            "model": "llama-text-embed-v2",
            "field_map": {"text": "chunk_text"},
        },
    )

In [None]:
# Turn every chunk into a record and upsert the records to Pinecone in batches

index = pinecone.Index(INDEX_NAME)

# One record per chunk; the record ID is the chunk's position in all_chunks
records = [
    {
        "id": str(position),
        "chunk_text": piece.page_content,
        "source": piece.metadata['source'],
    }
    for position, piece in enumerate(all_chunks)
]

# Send the records in slices of at most BATCH_SIZE
BATCH_SIZE = 96  # Pinecone limit is 96 records per batch
batch_starts = range(0, len(records), BATCH_SIZE)
for batch_number, start in enumerate(batch_starts, start=1):
    record_batch = records[start:start + BATCH_SIZE]
    index.upsert_records("test-namespace", record_batch)
    print(f"Upserted batch {batch_number} ({len(record_batch)} records)")
print(f"All {len(records)} chunks embedded and stored in Pinecone!")

index_stats = index.describe_index_stats()
print(f"Index stats: {index_stats}")

In [None]:
# Create the Gemini client used for answer generation, then send one short
# prompt as a smoke test so a bad key or model name surfaces here.
# NOTE(review): genai.Client() is called without arguments, so it presumably
# picks the API key up from the environment - confirm.

client = genai.Client()

smoke_test_prompt = "Explain how AI works in a few words"
response = client.models.generate_content(
    contents=smoke_test_prompt,
    model="gemini-2.0-flash-exp",
)
print(response.text)

print("Gemini LLM initialized")


In [None]:
# Build RAG query function
def ask_question(question, index, k=3, *, namespace="test-namespace",
                 model="gemini-2.0-flash-exp", llm_client=None):
    """
    Query Pinecone for relevant chunks and generate an answer with Gemini.

    Args:
        question: User's question; sent to Pinecone as the search text.
        index: Pinecone index handle exposing ``search``.
        k: Number of chunks to retrieve.
        namespace: Pinecone namespace to search (keyword-only).
        model: Gemini model name used for generation (keyword-only).
        llm_client: Client exposing ``models.generate_content``. When None,
            the notebook-level ``client`` created in the Gemini cell is used.
    Returns:
        (answer, sources): Generated answer text and the de-duplicated source
        URLs of the retrieved chunks, in retrieval order.
    """
    if llm_client is None:
        # Fall back to the notebook global so existing calls keep working.
        llm_client = client

    # Query Pinecone with the question
    results = index.search(
        namespace=namespace,
        query={
            "inputs": {"text": question},
            "top_k": k
        },
        fields=["chunk_text", "source"]
    )

    # Extract the relevant chunks and their sources; a response without the
    # expected result/hits structure is treated as "nothing retrieved".
    hits = []
    if 'result' in results and 'hits' in results['result']:
        hits = results['result']['hits']

    retrieved_chunks = []
    sources = []
    for hit in hits:
        fields = hit.get('fields', {})
        chunk_text = fields.get('chunk_text', '')
        if not chunk_text:
            # Hits without text add nothing to the context, so skip them.
            continue
        retrieved_chunks.append(chunk_text)
        source = fields.get('source', 'Unknown')
        if source not in sources:
            sources.append(source)

    # Prepare the context for the LLM: numbered chunks separated by blank lines
    context = "\n\n".join(
        f"Chunk {number}:\n{chunk}"
        for number, chunk in enumerate(retrieved_chunks, start=1)
    )

    # Generate an answer through Gemini
    prompt = f"""Answer the following question based on the provided context from the CS 61B course materials.
        Question: {question}
        Context from course materials: {context}
        Please provide a comprehensive answer based on the context above. If the context doesn't contain relevant information, say so."""
    response = llm_client.models.generate_content(
        model=model,
        contents=prompt
    )
    return response.text, sources
# Demo: run one question through the retrieval + generation pipeline
exam_question = "What is the exam policy for CS 61B?"
answer, sources = ask_question(exam_question, index, k=3)
print(f"Answer: {answer}\n\nSources: {sources}")
