# Lab 1 — Retrieval‑Augmented Generation (RAG)
**Goal:** Build and experiment with a RAG pipeline that covers all the concepts from the slides.


In [None]:
!pip -q install langchain transformers sentence-transformers datasets faiss-cpu pinecone-client rank_bm25 nltk
import nltk, os, warnings
nltk.download('punkt', quiet=True)
warnings.filterwarnings('ignore')

## Environment Setup
Before running this lab you’ll need a few **environment variables** for API access:

| Variable | Purpose |
|----------|---------|
| `OPENAI_API_KEY` | Access to OpenAI models (used by LangChain’s `OpenAI()` wrapper) |
| `PINECONE_API_KEY` | (Optional) Auth token for Pinecone vector DB |
| `PINECONE_ENV` | (Optional) Your Pinecone environment region, e.g. `us-east-1-gcp` |

On a local machine you can set them in a terminal **before** launching Jupyter:
```bash
export OPENAI_API_KEY="sk-..."
export PINECONE_API_KEY="your-pinecone-key"
export PINECONE_ENV="us-east-1-gcp"
jupyter lab
```
In Google Colab, use the Secrets tool or run:
```python
import os
os.environ['OPENAI_API_KEY'] = 'sk-...'
```

> ⚠️ **Never commit keys to version control or share them publicly.**


In [None]:
# Fail fast when the mandatory OpenAI key is missing, then report whether the
# optional Pinecone key is available.
import os, openai

if os.getenv('OPENAI_API_KEY'):
    openai.api_key = os.environ['OPENAI_API_KEY']
else:
    raise ValueError('OPENAI_API_KEY not set. Please set it before running the lab.')

print(
    'Pinecone key detected — cloud retrieval enabled.'
    if os.getenv('PINECONE_API_KEY')
    else 'Pinecone key not set — using local FAISS only.'
)

## 0. Parameters
Feel free to tweak these as you work through the exercises.

In [None]:
# Tunable knobs for the exercises.
# Upper bound on whitespace-separated words per chunk (Exercise 1).
CHUNK_SIZE = 512
# Number of documents each retriever returns.
TOP_K = 5
# Weight of the sparse score in hybrid search: 0 = dense only, 1 = sparse only.
HYBRID_ALPHA = 0.5
# Index name with a random six-character lowercase/digit suffix.
PINECONE_INDEX = 'rag-demo-' + ''.join(
    __import__('random').choices('abcdefghijklmnopqrstuvwxyz0123456789', k=6)
)

## 1. Load a Mini Corpus
We'll load the first 1% of English Wikipedia and keep only the first 500 articles, each truncated to 3,000 characters, for speed.

In [None]:
from datasets import load_dataset

# Load the first 1% of the train split of the '20220301.en' Wikipedia configuration.
raw = load_dataset('wikipedia', '20220301.en', split='train[:1%]')
# Keep at most 500 articles, each cut to its first 3000 characters.
docs = [article[:3000] for article in raw[:500]['text']]
print(f'Loaded {len(docs)} documents')

## 2. Chunk, Embed, and Build Vector Stores

In [None]:
# 2.1 Chunk
from nltk import sent_tokenize
def chunk_text(text, chunk_size=CHUNK_SIZE):
    """Split *text* into chunks of whole sentences with fewer than *chunk_size* words.

    Sentences come from ``sent_tokenize`` and are packed greedily: a sentence
    joins the open chunk while the combined whitespace-separated word count
    stays below ``chunk_size``; otherwise the open chunk is closed and the
    sentence starts a new one. A single sentence that is itself ``chunk_size``
    words or longer is kept whole as its own chunk.

    Args:
        text: Document text to split.
        chunk_size: Exclusive upper bound on the number of words per chunk.

    Returns:
        A list of non-empty chunk strings in document order; empty when
        *text* yields no sentences.
    """
    chunks = []
    pending = []        # sentences belonging to the chunk currently being built
    pending_words = 0   # running word total of `pending`, so nothing is re-split

    def close_pending():
        # Emit the open chunk, skipping it when there is no text to emit.
        chunk = ' '.join(pending).strip()
        if chunk:
            chunks.append(chunk)

    for sent in sent_tokenize(text):
        n_words = len(sent.split())
        # Only close a chunk that actually holds sentences: an over-long first
        # sentence must not produce an empty chunk ahead of it.
        if pending and pending_words + n_words >= chunk_size:
            close_pending()
            pending, pending_words = [], 0
        pending.append(sent)
        pending_words += n_words

    if pending:
        close_pending()
    return chunks

# Flatten the per-document chunk lists into one corpus-wide list.
chunks = [piece for doc_text in docs for piece in chunk_text(doc_text)]
print(f'Created {len(chunks)} chunks (avg {sum(len(c.split()) for c in chunks)//len(chunks)} tokens)')

In [None]:
# 2.2 Embeddings
# Wrap the `all-MiniLM-L6-v2` sentence-transformers model so it can be handed
# to the vector stores built in the following cells.
# NOTE(review): the optional Pinecone cell creates its index with
# dimension=384 — confirm that still matches the embedding size if you swap in
# a different model here.
from langchain_community.embeddings import HuggingFaceEmbeddings
embedding_model = HuggingFaceEmbeddings(model_name='sentence-transformers/all-MiniLM-L6-v2')

In [None]:
# 2.3 Local FAISS Vector DB
# Imported from `langchain_community`, the same package the embeddings and LLM
# wrappers in this notebook come from.
from langchain_community.vectorstores import FAISS
# Embed every chunk and index the resulting vectors in a local FAISS store.
faiss_store = FAISS.from_texts(chunks, embedding_model)
# Retriever view over the store, configured to return TOP_K chunks per query.
retriever_dense = faiss_store.as_retriever(search_kwargs={'k': TOP_K})

In [None]:
# 2.4 BM25 sparse index for hybrid search
from rank_bm25 import BM25Okapi
# BM25 is built from token lists; each chunk is tokenised by whitespace splitting.
tokenized = list(map(str.split, chunks))
bm25 = BM25Okapi(tokenized)

class BM25Retriever:
    """Return the highest-scoring chunks for a query from the module-level ``bm25`` index."""

    def __init__(self, k=TOP_K):
        # Number of hits to return per query.
        self.k = k

    def get_relevant_documents(self, query):
        """Score every chunk against *query* and return the best ``self.k``.

        The query is split on whitespace before scoring. Each hit is a dict
        with the chunk text under ``'page_content'`` and its BM25 score under
        ``'score'``, ordered from highest to lowest score.
        """
        scores = bm25.get_scores(query.split())
        ranked = sorted(enumerate(scores), key=lambda pair: pair[1], reverse=True)
        return [
            {'page_content': chunks[idx], 'score': score}
            for idx, score in ranked[:self.k]
        ]


retriever_sparse = BM25Retriever()

In [None]:
# 2.5 Hybrid Retriever (Dense + Sparse Blend)
def hybrid_search(query, alpha=HYBRID_ALPHA):
    """Blend dense and sparse retrieval results into one ranked list.

    Each retriever's scores are first scaled so its best hit scores 1, which
    keeps the unbounded BM25 scores from swamping the dense ones. Hits that
    carry no score are ranked by reciprocal position instead.

    Args:
        query: Search string passed to both retrievers.
        alpha: Weight of the sparse (BM25) score; the dense score is weighted
            by ``1 - alpha``. 0 means dense only, 1 means sparse only.

    Returns:
        Up to ``TOP_K`` dicts of the form
        ``{'page_content': str, 'score': float}``, highest blended score first.
    """
    def scored(hits):
        """Return ``(text, score)`` pairs with scores scaled so the best hit is 1."""
        pairs = []
        for rank, hit in enumerate(hits):
            if isinstance(hit, dict):
                text, score = hit['page_content'], hit.get('score')
            else:
                # LangChain retrievers return Document objects: the text is an
                # attribute and no similarity score is attached.
                text, score = hit.page_content, None
            if score is None:
                # Reciprocal rank keeps the retriever's own ordering.
                score = 1.0 / (rank + 1)
            pairs.append((text, float(score)))
        top = max((score for _, score in pairs), default=0.0)
        if top > 0:
            pairs = [(text, score / top) for text, score in pairs]
        return pairs

    # Dense hits are added first, so ties keep the dense ordering.
    weighted_sources = (
        (1 - alpha, scored(retriever_dense.get_relevant_documents(query))),
        (alpha, scored(retriever_sparse.get_relevant_documents(query))),
    )
    blended = {}
    for weight, pairs in weighted_sources:
        for text, score in pairs:
            blended[text] = blended.get(text, 0) + weight * score
    ranked = sorted(blended.items(), key=lambda item: item[1], reverse=True)[:TOP_K]
    return [{'page_content': text, 'score': score} for text, score in ranked]

class HybridRetriever:
    """Thin adapter exposing ``hybrid_search`` through a retriever-style method.

    NOTE(review): this is a plain Python class, not a subclass of a LangChain
    retriever — confirm the chains it is passed to accept it.
    """

    def get_relevant_documents(self, query):
        """Return the blended dense + sparse hits for *query* using the default alpha."""
        return hybrid_search(query)


retriever_hybrid = HybridRetriever()

### Optional: Pinecone Cloud Vector DB
Set `PINECONE_API_KEY` and `PINECONE_ENV`, then run the cell below.

In [None]:
import os, pinecone
from langchain_community.vectorstores import Pinecone

# NOTE(review): `pinecone.init` and the module-level index helpers used below
# look like the pinecone-client v2 interface — confirm the installed client
# version still provides them.
if os.getenv('PINECONE_API_KEY'):
    pinecone.init(api_key=os.getenv('PINECONE_API_KEY'), environment=os.getenv('PINECONE_ENV', 'us-east-1-gcp'))
    index_is_new = PINECONE_INDEX not in pinecone.list_indexes()
    if index_is_new:
        # The dimension must equal the embedding model's vector size.
        pinecone.create_index(PINECONE_INDEX, dimension=384, metric='cosine')
    index = pinecone.Index(PINECONE_INDEX)
    if index_is_new:
        # A freshly created index is empty: embed and upload the chunks so the
        # retriever has something to return.
        pc_store = Pinecone.from_texts(chunks, embedding_model, index_name=PINECONE_INDEX)
    else:
        # Presumably populated by an earlier run of this cell, so just connect
        # to it rather than uploading the chunks a second time.
        pc_store = Pinecone.from_existing_index(PINECONE_INDEX, embedding_model)
    retriever_pine = pc_store.as_retriever(search_kwargs={'k': TOP_K})
else:
    print('Pinecone environment variables not set; skipping cloud retriever.')

## 3. Instantiate an LLM

In [None]:
from langchain_community.llms import OpenAI
# `OpenAI` is the text-completion wrapper, so it needs a completion model:
# `gpt-3.5-turbo-instruct`. The chat-only `gpt-3.5-turbo` is served through the
# chat endpoint and would need a chat-model wrapper instead.
# temperature=0 keeps answers as repeatable as possible across runs.
llm = OpenAI(model='gpt-3.5-turbo-instruct', temperature=0)


## 4. Build RetrievalQA Chains

In [None]:
from langchain.chains import RetrievalQA
# One question-answering chain per retriever so their answers can be compared
# side by side in the demo cell.
qa_dense = RetrievalQA.from_llm(llm=llm, retriever=retriever_dense)
# NOTE(review): retriever_hybrid is a plain Python class whose hits are dicts,
# not a LangChain retriever — confirm RetrievalQA accepts it.
qa_hybrid = RetrievalQA.from_llm(llm=llm, retriever=retriever_hybrid)

In [None]:
# 4.1 Conversational RAG
from langchain.chains import ConversationalRetrievalChain
# Conversational chain over the dense FAISS retriever; Exercise 3 uses it to
# ask a follow-up question after the capital question.
conv_chain = ConversationalRetrievalChain.from_llm(llm, retriever_dense)

## 5. Quick Demo

In [None]:
# Ask both chains the same question so the dense and hybrid answers can be compared.
question = 'What is the capital of France and why is it significant?'
for heading, chain in (('Dense answer:', qa_dense), ('\nHybrid answer:', qa_hybrid)):
    print(heading)
    print(chain.run(question))

## ✏️ Exercises (Match Slides)
1. **Chunk‑Size Sensitivity**  – Change `CHUNK_SIZE` to 256 and 768, rebuild chunks & FAISS, and compare answer grounding quality.
2. **Hybrid Search** – Adjust `HYBRID_ALPHA` (0.0, 0.5, 1.0). Which blend retrieves the best context for numeric, keyword‑heavy queries?
3. **Conversational RAG** – Use `conv_chain` to ask a follow‑up: “How many people live there?” after the capital question.
4. **💠 Pinecone Cloud** – Enable Pinecone and measure its query latency against FAISS. Report the differences in recall and speed.
5. **Grounding Check (Bonus)** – Write a simple function that verifies each sentence in the answer exists in at least one retrieved chunk.