<a href="https://colab.research.google.com/github/dsany94/symptom-chatbot/blob/main/IDS575_Assignment2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
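# RAG pipeline: load text files from Google Drive, index them with FAISS, re-rank with BM25, and answer queries with GPT-4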

In [None]:
!pip install --upgrade openai langchain-openai faiss-cpu rank_bm25



In [None]:
import os
import faiss
import openai
import numpy as np
from rank_bm25 import BM25Okapi
from google.colab import auth
from google.colab import userdata
from googleapiclient.discovery import build
from langchain_openai import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [None]:
# Sign in to Google, then attach Drive to the Colab filesystem
from google.colab import auth, drive

auth.authenticate_user()
drive.mount('/content/drive')

# Drive folder that holds the text files to be indexed
TEXT_FOLDER = "/content/drive/My Drive/Sample_Data/"

# Report whether the data folder is reachable (this only prints; it never raises)
if os.path.exists(TEXT_FOLDER):
    print("✅ Google Drive mounted successfully.")
else:
    print("⚠️ Folder not found! Check your Google Drive path.")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
✅ Google Drive mounted successfully.


In [None]:
# ---- STEP 2: Set OpenAI API Key ----
# Read the key stored under the name 'OPENAI_KEY' via google.colab.userdata.
api_key = userdata.get('OPENAI_KEY')

if api_key:
    os.environ["OPENAI_API_KEY"] = api_key
    print("✅ OpenAI API key is set.")
else:
    print("⚠️ OpenAI API key not found! Please set it manually.")

# Accept a key that is already present in the environment, and make a missing
# key surface as a clear error instead of a bare KeyError.
resolved_api_key = os.environ.get("OPENAI_API_KEY")
if not resolved_api_key:
    raise RuntimeError(
        "OPENAI_API_KEY is not set. Add an 'OPENAI_KEY' Colab secret or set "
        "os.environ['OPENAI_API_KEY'] before running this cell."
    )

# Set up OpenAI Client
client = openai.Client(api_key=resolved_api_key)

✅ OpenAI API key is set.


In [None]:
# ---- STEP 3: Load and Process Text Files ----
def load_text_files(folder_path):
    """Read every ``.txt`` file located directly inside ``folder_path``.

    Files are visited in sorted filename order so the returned lists (and any
    chunk ids derived from them) are identical on every run; ``os.listdir``
    on its own returns entries in arbitrary order.

    Args:
        folder_path: Directory to scan. Sub-directories are not traversed.

    Returns:
        A ``(documents, filenames)`` tuple of parallel lists: the UTF-8
        decoded content of each file and the matching file name.

    Raises:
        OSError: If the folder or one of its ``.txt`` entries cannot be read.
    """
    documents = []
    filenames = []

    for filename in sorted(os.listdir(folder_path)):
        if filename.endswith(".txt"):
            file_path = os.path.join(folder_path, filename)
            with open(file_path, "r", encoding="utf-8") as f:
                documents.append(f.read())
            filenames.append(filename)

    return documents, filenames

# ---- STEP 4: Split text into smaller chunks ----
def split_text_into_chunks(texts, chunk_size=1200, overlap=50):
    """Split every text with ``RecursiveCharacterTextSplitter`` and flatten the result.

    Args:
        texts: Iterable of document strings.
        chunk_size: Forwarded to the splitter as ``chunk_size``.
        overlap: Forwarded to the splitter as ``chunk_overlap``.

    Returns:
        A single flat list with the chunks of all texts, in input order.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=overlap,
    )

    all_chunks = []
    for document in texts:
        all_chunks.extend(text_splitter.split_text(document))
    return all_chunks

# ---- STEP 5: Create and Store Embeddings in FAISS ----
def create_faiss_index(chunks):
    """Embed ``chunks`` and store the vectors in a FAISS HNSW index.

    All chunks are embedded through a single ``embed_documents`` call rather
    than one ``embed_query`` request per chunk.

    Args:
        chunks: Sequence of text chunks to embed.

    Returns:
        ``(index, chunks)`` where ``index`` is a ``faiss.IndexHNSWFlat`` whose
        row ``i`` is the embedding of ``chunks[i]``, and ``chunks`` is the
        argument handed back unchanged.

    Raises:
        ValueError: If ``chunks`` is empty (no embedding dimension can be derived).
    """
    texts = list(chunks)
    if not texts:
        raise ValueError("create_faiss_index() needs at least one chunk to embed.")

    embedding_model = OpenAIEmbeddings(model="text-embedding-ada-002")
    embeddings = np.array(embedding_model.embed_documents(texts), dtype=np.float32)

    d = embeddings.shape[1]  # Embedding dimension
    M = 32  # Number of connections per node in HNSW graph

    index = faiss.IndexHNSWFlat(d, M)  # Hierarchical Navigable Small World (HNSW)
    index.add(embeddings)

    return index, chunks

# ---- STEP 6: Create FAISS Index using Batch Processing ----
def create_faiss_index_batch_model(chunks, batch_size=10):
    """Embed ``chunks`` in batches and store the vectors in a flat L2 FAISS index.

    Args:
        chunks: Sequence of text chunks to embed (must support ``len`` and slicing).
        batch_size: Number of chunks sent per ``embed_documents`` call.

    Returns:
        ``(index, chunks)`` where ``index`` is a ``faiss.IndexFlatL2`` whose
        row ``i`` is the embedding of ``chunks[i]``, and ``chunks`` is the
        argument handed back unchanged.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1 or ``chunks`` is empty.
    """
    # Validate up front so bad input fails with a clear message before any
    # embedding request is made.
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer.")
    if len(chunks) == 0:
        raise ValueError("create_faiss_index_batch_model() needs at least one chunk to embed.")

    embedding_model = OpenAIEmbeddings(model="text-embedding-ada-002")
    embeddings = []

    # Batch processing for embeddings
    for start in range(0, len(chunks), batch_size):
        batch = chunks[start: start + batch_size]
        embeddings.extend(embedding_model.embed_documents(batch))  # Batch call

    embeddings = np.array(embeddings, dtype=np.float32)

    d = embeddings.shape[1]  # Embedding dimension
    index = faiss.IndexFlatL2(d)
    index.add(embeddings)

    return index, chunks

# ---- STEP 7: Setup BM25 Re-Ranking ----
def setup_bm25(chunks):
    """Build a ``BM25Okapi`` model over the whitespace-split tokens of each chunk.

    Args:
        chunks: Iterable of text chunks; each is tokenized with ``str.split()``.

    Returns:
        The ``BM25Okapi`` instance constructed from the tokenized chunks.
    """
    corpus_tokens = []
    for passage in chunks:
        corpus_tokens.append(passage.split())
    return BM25Okapi(corpus_tokens)

# ---- STEP 8: Retrieve & Re-Rank Chunks ----
def retrieve_and_rerank(query, index, chunks, bm25_model, top_k=10, rerank_top_n=5):
    """Retrieve candidates with FAISS, then order them by their BM25 score.

    ``bm25_model.get_scores`` returns one score per document of the corpus it
    was built from, so each candidate is scored by looking up its own corpus
    position rather than by pairing candidates with the first scores in the
    array.

    Args:
        query: Natural-language query string.
        index: FAISS index whose row ``i`` is the embedding of ``chunks[i]``.
        chunks: Chunk texts, in the same order used to build ``index``.
        bm25_model: BM25 model built from the same ``chunks`` in the same
            order (otherwise the score lookup is misaligned).
        top_k: Number of nearest neighbours requested from FAISS.
        rerank_top_n: Maximum number of chunks returned after re-ranking.

    Returns:
        Up to ``rerank_top_n`` chunk strings, highest BM25 score first; ties
        keep the FAISS nearest-first order.
    """
    embedding_model = OpenAIEmbeddings(model="text-embedding-ada-002")
    query_embedding = np.array([embedding_model.embed_query(query)], dtype=np.float32)

    # Retrieve top-K nearest chunks from FAISS
    _, indices = index.search(query_embedding, top_k)

    # FAISS pads the result with -1 when the index holds fewer than top_k
    # vectors; skip those so they are not read as chunks[-1].
    candidate_ids = [int(i) for i in indices[0] if i >= 0]

    # BM25 scores for the whole corpus, indexed by chunk position
    bm25_scores = bm25_model.get_scores(query.split())

    # sorted() is stable, so equal scores preserve the FAISS ordering
    ranked_ids = sorted(candidate_ids, key=lambda i: bm25_scores[i], reverse=True)

    return [chunks[i] for i in ranked_ids[:rerank_top_n]]

# ---- STEP 9: Generate Answer using OpenAI GPT-4 ----
def generate_response(query, retrieved_chunks, model="gpt-4"):
    """Ask the chat model to answer ``query`` using the retrieved chunks as context.

    Args:
        query: The user's question.
        retrieved_chunks: Iterable of chunk strings; joined with newlines to
            form the context section of the prompt.
        model: Chat-completions model name passed to the API. Defaults to
            ``"gpt-4"``.

    Returns:
        The ``content`` of the first choice's message in the API response.
    """
    context = "\n".join(retrieved_chunks)
    prompt = f"User Query: {query}\n\nRelevant Context:\n{context}\n\nAnswer:"

    # `client` is the module-level OpenAI client created in the API-key cell.
    response = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": "Answer the query based on the provided context."},
            {"role": "user", "content": prompt}
        ]
    )

    return response.choices[0].message.content

In [None]:
## ---- STEP 10: RUN THE FULL RAG PIPELINE ----
# End-to-end driver: load -> chunk -> embed/index -> BM25 setup -> retrieve -> generate.
# Every name assigned here is a notebook-level global. The steps must run in this
# order because each one consumes the previous step's result.

# Read the text files found in TEXT_FOLDER
print("\n🔄 Loading text files...")
documents, filenames = load_text_files(TEXT_FOLDER)
print("✅ Done loading!")

# Split with the function defaults (chunk_size=1200, overlap=50)
print("\n🔄 Splitting text into chunks...")
chunks = split_text_into_chunks(documents)
print(f"✅ Done chunking! Total chunks: {len(chunks)}")

# Embed in batches of 10 and build the flat L2 index; this step calls the embeddings API
print("\n🔄 Creating FAISS index...")
faiss_index, processed_chunks = create_faiss_index_batch_model(chunks, batch_size=10)
print("✅ Done embedding and indexing!")

# Build BM25 over the same chunk list that was indexed
print("\n🔄 Setting up BM25 re-ranking...")
bm25_model = setup_bm25(processed_chunks)
print("✅ Done with BM25 re-ranking!")

# Example Query
query = "What are the key challenges in AI adoption?"
print(f"\n🔍 Query: {query}")

# Uses the function defaults top_k=10 and rerank_top_n=5
print("\n🔄 Retrieving and re-ranking relevant chunks...")
retrieved_chunks = retrieve_and_rerank(query, faiss_index, processed_chunks, bm25_model)
print("✅ Done with retrieval!")

# Sends the query plus the retrieved chunks to the chat model
print("\n📝 Generating response using GPT-4...")
response = generate_response(query, retrieved_chunks)

# ---- STEP 11: DISPLAY RESULTS ----
# Print each retrieved chunk in full, followed by the model's answer
print("\n📌 Retrieved Context:")
for chunk in retrieved_chunks:
    print(f"- {chunk}\n")

print("\n🤖 AI-Generated Response:")
print(response)


🔄 Loading text files...
✅ Done loading!

🔄 Splitting text into chunks...
✅ Done chunking! Total chunks: 185

🔄 Creating FAISS index...
✅ Done embedding and indexing!

🔄 Setting up BM25 re-ranking...
✅ Done with BM25 re-ranking!

🔍 Query: What are the key challenges in AI adoption?

🔄 Retrieving and re-ranking relevant chunks...
✅ Done with retrieval!

📝 Generating response using GPT-4...

📌 Retrieved Context:
- Personalization At Scale
Companies are leveraging AI to deliver highly personalized experiences. Duolingo, for instance, uses generative AI to create dynamic language exercises tailored to individual learning patterns. This level of personalization extends across industries, from e-commerce product recommendations to financial service offerings.

The Challenge Landscape
The path to AI implementation is filled with intriguing paradoxes:

The Data Dilemma
One of the primary challenges enterprises face is maintaining fresh, accurate data in their AI systems. It's like trying to hi