In [4]:
!pip install transformers sentence-transformers faiss-cpu beautifulsoup4 requests
!pip install faiss-cpu
!pip install faiss-gpu

Collecting faiss-cpu
  Using cached faiss_cpu-1.9.0.post1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.4 kB)
Downloading faiss_cpu-1.9.0.post1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (27.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m27.5/27.5 MB[0m [31m40.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.9.0.post1
Collecting faiss-gpu
  Downloading faiss_gpu-1.7.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (1.4 kB)
Downloading faiss_gpu-1.7.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (85.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.5/85.5 MB[0m [31m8.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-gpu
Successfully installed faiss-gpu-1.7.2


In [10]:
# Importing necessary libraries
from transformers import AutoTokenizer, AutoModelForCausalLM
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
from bs4 import BeautifulSoup
import requests

# Initialize Models
# SentenceTransformer for generating embeddings from text
embedding_model = SentenceTransformer("sentence-transformers/multi-qa-MiniLM-L6-cos-v1")
# BLOOM tokenizer and model for generating answers to user queries
bloom_tokenizer = AutoTokenizer.from_pretrained("bigscience/bloom-560m")
bloom_model = AutoModelForCausalLM.from_pretrained("bigscience/bloom-560m")

# FAISS Vector Store setup
dimension = 384  # The embedding dimension size of the model (for FAISS index)
index = faiss.IndexFlatL2(dimension)  # FAISS index to store vector embeddings and perform efficient similarity search
documents = []  # List to store the raw document text along with metadata

# Function to crawl and scrape websites
def crawl_website(url):
    # Send HTTP request to fetch content of the webpage
    response = requests.get(url)
    # Parse the page content using BeautifulSoup to extract the textual data
    soup = BeautifulSoup(response.text, 'html.parser')
    # Extract text from all paragraphs and remove empty ones
    paragraphs = [p.text.strip() for p in soup.find_all('p') if p.text.strip()]
    return paragraphs

# Function to generate embeddings and add them to FAISS index
def add_to_faiss(texts, url):
    global documents
    # Convert each text into vector embeddings using the pre-trained embedding model
    embeddings = embedding_model.encode(texts, convert_to_numpy=True)
    # Add embeddings to FAISS index
    index.add(embeddings)
    # Store the text and URL metadata in the documents list for later retrieval
    documents.extend([{"text": text, "url": url} for text in texts])

# Function to retrieve similar documents based on the query
def retrieve_documents(query, top_k=5):
    # Convert the user's query into embeddings
    query_embedding = embedding_model.encode([query], convert_to_numpy=True)
    # Perform similarity search in FAISS index to retrieve top_k most similar text chunks
    distances, indices = index.search(query_embedding, top_k)
    # Collect the actual documents that correspond to the indices found by FAISS
    retrieved = [documents[idx] for idx in indices[0] if idx < len(documents)]
    return retrieved

# Function to generate a response using the BLOOM model
def generate_response(query, retrieved_docs):
    # Prepare context for the model by joining retrieved documents into a single string
    context = "\n".join([f"- {doc['text']}" for doc in retrieved_docs])
    # Create a prompt that includes the context and the query for the LLM to generate an answer
    prompt = f"Context:\n{context}\n\nQuestion: {query}\n\nAnswer:"
    # Tokenize the prompt to feed into the BLOOM model
    inputs = bloom_tokenizer(prompt, return_tensors="pt")
    # Generate a response from the model, limiting the response to 300 tokens
    outputs = bloom_model.generate(**inputs, max_length=300, num_return_sequences=1)
    # Decode the model's response into a human-readable format
    return bloom_tokenizer.decode(outputs[0], skip_special_tokens=True)

# Main function to implement the RAG pipeline
def rag_pipeline(query, urls):
    # Step 1: Crawl and process websites, scraping the textual content from each URL
    for url in urls:
        paragraphs = crawl_website(url)
        add_to_faiss(paragraphs, url)

    # Step 2: Retrieve the most relevant documents based on the user’s query
    retrieved_docs = retrieve_documents(query)

    # Step 3: Generate a response using the retrieved documents
    response = generate_response(query, retrieved_docs)
    return response

# Example usage of the RAG pipeline
if __name__ == "__main__":
    # Define a list of websites to scrape
    urls = [
        "https://www.uchicago.edu/",
        "https://www.stanford.edu/",
        "https://www.washington.edu/",
        "https://und.edu/"
    ]
    # Define a sample user query
    user_query = "How can students apply for financial aid at the University of North Dakota?"

    # Run the RAG pipeline to generate an answer based on the provided query and websites
    response = rag_pipeline(user_query, urls)
    # Print the generated response
    print("Response:\n", response)

Response:
 Context:
- © 2024 University of North Dakota - Grand Forks, ND - Member of ND University System
- More than two-thirds of undergrads receive some form of financial assistance. Generally, tuition is covered for families with incomes below $150,000.
- The University of North Dakota values, honors, and supports all members of our campus
                                    community.
- Online learning option is perfect for student living in western North Dakota.
- The University of North Dakota is the state's oldest and largest university. We offer
                           225+ highly accredited on-campus and online degrees.

Question: How can students apply for financial aid at the University of North Dakota?

Answer:
Students who are enrolled in a degree program at the University of North Dakota may apply for financial aid at the University of North Dakota. The University of North Dakota is the state's oldest and largest university. We offer 225+ highly accredited on-campus 