# Retrieval-Augmented Generation (RAG)

### 🎯 Objectives
- Build FAISS‑backed retrieval QA chains for each embedding type using LangChain.
- Run a sample query across all chains and save each response with its source documents.

### ⚙️ Setup
- Set `subreddit_name` to the name of your subreddit: this notebook and other scripts build their input and output file paths from it.

---

### Global Parameters

In [1]:
# Subreddit whose data is processed; the CSV, embedding, model, and response
# file paths used in this notebook are all built from this name.
subreddit_name = "AITAH"

### Load Data

In [2]:
import pandas as pd

# Input comments from a CSV file.
# Forward slashes are used (instead of Windows-only backslashes) so the path
# resolves on every platform and matches the other paths in this notebook.
input_file = f"../data/{subreddit_name}/csv/{subreddit_name}_comments.csv"

df = pd.read_csv(input_file)

# Drop every row that has a missing value in any column
df = df.dropna()

# Renumber rows 0..n-1 after the drop; the previous index is kept as an
# "index" column because reset_index() is called with its default drop=False
df = df.reset_index()

### Load embeddings

In [3]:
import numpy as np
import os

# Folder that holds the pre-computed embedding files for this subreddit
embeddings_dir = f"../data/{subreddit_name}/embeddings/"

# Embedding variants expected on disk, one "<type>_embeddings.npy" file each
embedding_types = ["sentence", "gnn", "node2vec", "simple_attention", "enhanced_attention"]

# Maps embedding type -> loaded array
embeddings = {}

for embed_type in embedding_types:
    file_path = os.path.join(embeddings_dir, f"{embed_type}_embeddings.npy")

    # A missing file is reported and skipped; the remaining types still load
    if not os.path.exists(file_path):
        print(f"Warning: Could not find {file_path}")
        continue

    loaded = np.load(file_path)
    embeddings[embed_type] = loaded
    print(f"Loaded {embed_type} embeddings with shape: {loaded.shape}")

Loaded sentence embeddings with shape: (100000, 384)
Loaded gnn embeddings with shape: (100000, 384)
Loaded node2vec embeddings with shape: (100000, 384)
Loaded simple_attention embeddings with shape: (100000, 384)
Loaded enhanced_attention embeddings with shape: (100000, 384)


### Set Up Retrievers

In [4]:
# Import required libraries
import faiss
import numpy as np
import torch
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.embeddings.base import Embeddings
from langchain.vectorstores import FAISS
from langchain.docstore.document import Document
from langchain.docstore.in_memory import InMemoryDocstore
from langchain.chat_models import ChatOpenAI
from langchain.chains import RetrievalQA

# Initialize standard embedding model.
# It is used directly for the "sentence" chain and is wrapped by
# ModelProcessedEmbeddings for every other embedding type.
base_embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

# Create a custom embeddings class for embedding processing
class ModelProcessedEmbeddings(Embeddings):
    """Embeddings wrapper that pairs a base embedder with a saved checkpoint.

    NOTE: the checkpoint is loaded but never applied. Because the model
    architecture is not available here, both ``embed_documents`` and
    ``embed_query`` return the base embedder's vectors unchanged (converted
    to NumPy arrays). Queries are therefore embedded in the base embedder's
    space, not in the space of the processed embeddings.

    Args:
        base_embedder: Object providing ``embed_documents(texts)`` and
            ``embed_query(text)``.
        model_path: Path to a file readable by ``torch.load`` (presumably a
            model state dict -- confirm against the training script).
    """

    def __init__(self, base_embedder, model_path):
        self.base_embedder = base_embedder
        self.model_path = model_path

        # Load the model state dict instead of expecting a full model.
        # map_location="cpu" lets a checkpoint saved on a GPU load on a
        # CPU-only machine; the tensors are never used for computation here.
        # SECURITY: torch.load may unpickle arbitrary objects -- only point
        # model_path at trusted files.
        self.state_dict = torch.load(model_path, map_location="cpu")
        print(f"Loaded state dictionary from {model_path}")

        # Here we would normally initialize the model and load state dict
        # But since we don't know the exact model architecture, we'll use a simpler approach

    def embed_documents(self, texts):
        """Embed ``texts`` with the base embedder and return them as an array.

        Args:
            texts: Texts to embed.

        Returns:
            ``np.ndarray`` built from the base embedder's document embeddings.
        """
        # Get base embeddings first
        base_embeddings = self.base_embedder.embed_documents(texts)

        # Since we don't have the actual model structure, the base embeddings
        # are returned as-is. In a real implementation, you would initialize
        # your model and use:
        # model.load_state_dict(self.state_dict)
        # processed_embeddings = model(torch.tensor(base_embeddings))

        print(f"Using model from {self.model_path} for document embeddings")
        return np.array(base_embeddings)  # Return base embeddings as fallback

    def embed_query(self, text):
        """Embed a single query with the base embedder and return it as an array.

        Args:
            text: Query text to embed.

        Returns:
            ``np.ndarray`` built from the base embedder's query embedding.
        """
        # Get base embedding first
        base_embedding = self.base_embedder.embed_query(text)

        # Same situation as above - we would apply the model here
        print(f"Using model from {self.model_path} for query embedding")
        return np.array(base_embedding)  # Return base embedding as fallback

# Extract the texts from the dataframe
document_texts = df['body'].tolist()

# Build a mapping of document IDs to Document objects.
# IDs are the string form of each text's position in document_texts.
document_mapping = {str(i): Document(page_content=text) for i, text in enumerate(document_texts)}

# Use InMemoryDocstore
document_store = InMemoryDocstore(document_mapping)

# Create index_to_docstore_id mapping: FAISS row i -> docstore id str(i).
# This assumes row i of every embedding matrix belongs to document_texts[i].
vector_index_to_doc_id_map = {i: str(i) for i in range(len(document_texts))}

# Initialize LLM model
llm_model = ChatOpenAI(model_name="gpt-4o")

# Create dictionary to store QA chains (embedding type -> RetrievalQA chain)
qa_chains = {}

# Define embedding model paths for each type
model_paths = {
    "gnn": f"../data/{subreddit_name}/models/gnn_model.pt",
    "node2vec": f"../data/{subreddit_name}/models/node2vec_model.pt",
    "simple_attention": f"../data/{subreddit_name}/models/simple_attention_model.pt",
    "enhanced_attention": f"../data/{subreddit_name}/models/enhanced_attention_model.pt"
}

# Process each embedding type with appropriate embedding model
for embed_type, embed_data in embeddings.items():
    # Skip 'concat' embeddings if they are present
    if embed_type == 'concat':
        continue

    print(f"Creating QA chain for {embed_type} embeddings...")

    # The index-to-document mapping is positional, so a row-count mismatch
    # (e.g. rows removed by dropna) means retrieved documents no longer
    # correspond to their vectors, or lookups fail at query time. Warn
    # instead of failing so existing runs keep working.
    if embed_data.shape[0] != len(document_texts):
        print(
            f"Warning: {embed_type} embeddings have {embed_data.shape[0]} rows but "
            f"{len(document_texts)} documents were loaded; retrieved documents may be misaligned"
        )

    # Choose appropriate embedding model based on type.
    # NOTE(review): for non-'sentence' types the query is still embedded by
    # the base sentence model (ModelProcessedEmbeddings is a passthrough),
    # while the index holds the pre-computed processed vectors -- confirm
    # that the two live in a comparable space.
    if embed_type == 'sentence':
        embedding_model = base_embedding_model
    else:
        embedding_model = ModelProcessedEmbeddings(
            base_embedder=base_embedding_model,
            model_path=model_paths[embed_type]
        )

    # Create FAISS index (inner-product similarity over the stored vectors).
    # ascontiguousarray guarantees the C-contiguous float32 layout that the
    # FAISS Python bindings expect, whatever layout the .npy file had.
    index = faiss.IndexFlatIP(embed_data.shape[1])
    index.add(np.ascontiguousarray(embed_data, dtype='float32'))

    # Create vectorstore with appropriate model
    vectorstore = FAISS(
        embedding_model,
        index,
        document_store,
        vector_index_to_doc_id_map
    )

    # Create QA chain that also returns the retrieved source documents
    qa_chains[embed_type] = RetrievalQA.from_chain_type(
        llm_model,
        retriever=vectorstore.as_retriever(),
        return_source_documents=True
    )

    print(f"Created QA chain for {embed_type} embeddings with dimension {embed_data.shape[1]}")

  base_embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
  llm_model = ChatOpenAI(model_name="gpt-4o")


Creating QA chain for sentence embeddings...
Created QA chain for sentence embeddings with dimension 384
Creating QA chain for gnn embeddings...
Loaded state dictionary from ../data/AITAH/models/gnn_model.pt
Created QA chain for gnn embeddings with dimension 384
Creating QA chain for node2vec embeddings...
Loaded state dictionary from ../data/AITAH/models/node2vec_model.pt
Created QA chain for node2vec embeddings with dimension 384
Creating QA chain for simple_attention embeddings...
Loaded state dictionary from ../data/AITAH/models/simple_attention_model.pt
Created QA chain for simple_attention embeddings with dimension 384
Creating QA chain for enhanced_attention embeddings...
Loaded state dictionary from ../data/AITAH/models/enhanced_attention_model.pt
Created QA chain for enhanced_attention embeddings with dimension 384


### Query

In [5]:
# Sample question sent to every QA chain
query = "AITAH for insisting my friend stop talking about their ex?"

# Maps embedding type -> response returned by that type's chain
responses = {}

print(f"\nRunning query: '{query}' against all embedding types...")
for embed_type in qa_chains:
    print(f"\nProcessing {embed_type} embeddings...")

    # Look up the chain for this embedding type, run it, and keep its answer
    chain = qa_chains[embed_type]
    answer = chain.invoke(query)
    responses[embed_type] = answer

    # Brief confirmation once the chain has returned
    print(f"Completed query with {embed_type} embeddings")


Running query: 'AITAH for insisting my friend stop talking about their ex?' against all embedding types...

Processing sentence embeddings...
Completed query with sentence embeddings

Processing gnn embeddings...
Using model from ../data/AITAH/models/gnn_model.pt for query embedding
Completed query with gnn embeddings

Processing node2vec embeddings...
Using model from ../data/AITAH/models/node2vec_model.pt for query embedding
Completed query with node2vec embeddings

Processing simple_attention embeddings...
Using model from ../data/AITAH/models/simple_attention_model.pt for query embedding
Completed query with simple_attention embeddings

Processing enhanced_attention embeddings...
Using model from ../data/AITAH/models/enhanced_attention_model.pt for query embedding
Completed query with enhanced_attention embeddings


### Print & Save Responses

In [6]:
import os

# Build the output locations once and make sure the directory exists
responses_dir = f"../data/{subreddit_name}/responses"
output_path = f"{responses_dir}/all_responses.txt"
os.makedirs(responses_dir, exist_ok=True)

# Separator lines reused in every section
rule = "=" * 80
divider = "-" * 80

with open(output_path, "w", encoding="utf-8") as file:

    def emit(text, trailer="\n"):
        """Print ``text`` and write it to the output file followed by ``trailer``."""
        print(text)
        file.write(text + trailer)

    for embed_type, response in responses.items():
        # Section header for this embedding type
        emit(f"\n\n{rule}\nRESPONSE USING {embed_type.upper()} EMBEDDINGS\n{rule}")

        # Generated answer, followed by a blank line in the file
        emit(f"Answer: {response['result']}", trailer="\n\n")

        # Source documents returned with the answer
        emit("\nSource Documents:")
        for i, doc in enumerate(response["source_documents"]):
            emit(f"Document {i}:\n{doc.page_content}\n{divider}")

        print(f"\nSaved {embed_type} response and documents")

print(f"\nAll responses saved to '{output_path}'")



RESPONSE USING SENTENCE EMBEDDINGS
Answer: Based on the context provided, it seems like there may be concerns about the impact of discussing an ex on current relationships or friendships. It's not necessarily wrong to express your feelings about a topic that makes you uncomfortable, but it's important to communicate your perspective respectfully and considerately. If you feel that the conversations about their ex are problematic or disrespectful, it might be worth having an open discussion with your friend about how it affects you or others involved. However, ultimately, it's up to your friend to decide whether they want to continue discussing their ex, and it's important to respect their autonomy. If the conversations are negatively impacting you, you might consider setting boundaries for yourself regarding these discussions.

Source Documents:
Document 0:
I totally agree, that’s really bad and so disrespectful to her, thats what I meant writing, maybe I didn’t express myself right,