In [6]:
import os
import pandas as pd
import hashlib
import json
from typing import List, Dict, Optional
from langchain_community.document_loaders import PyMuPDFLoader, WebBaseLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_huggingface import HuggingFaceEmbeddings, HuggingFacePipeline
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
from langchain_community.retrievers import TavilySearchAPIRetriever
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
from sentence_transformers import CrossEncoder
from rank_bm25 import BM25Okapi
import numpy as np
import warnings
warnings.filterwarnings('ignore')

class HybridResearchRAG:
    """
    Advanced RAG system with reranking, hybrid search, and caching.
    """
    
    def __init__(
        self,
        excel_path: Optional[str] = None,
        tavily_api_key: Optional[str] = None,
        vector_store_path: str = "research_vectorstore",
        model_name: str = "meta-llama/Meta-Llama-3-8B-Instruct",
        cache_path: str = "query_cache.json",
        use_reranker: bool = True,
        use_hybrid_search: bool = True
    ):
        """
        Build every component of the hybrid RAG pipeline.

        Construction is eager: the query cache is read from disk, the
        embedding, reranker and generation models are loaded, and the vector
        store is loaded or created before the constructor returns.

        Args:
            excel_path: Path to Excel file with paper metadata
            tavily_api_key: API key for Tavily web search; a falsy value
                leaves online search disabled
            vector_store_path: Directory to save/load vector store
            model_name: HuggingFace model for generation
            cache_path: Path to query cache file
            use_reranker: Whether to use cross-encoder reranking
            use_hybrid_search: Whether to use BM25 + vector hybrid search
        """
        # Plain configuration values
        self.excel_path = excel_path
        self.tavily_api_key = tavily_api_key
        self.vector_store_path = vector_store_path
        self.model_name = model_name
        self.cache_path = cache_path
        self.use_reranker = use_reranker
        self.use_hybrid_search = use_hybrid_search

        # Query cache plus the counters behind get_cache_stats()
        self.query_cache = self._load_cache()
        self.cache_hits = 0
        self.total_queries = 0

        # Embedding model (pinned to CPU)
        print("Loading embedding model...")
        embedding_options = {
            "model_name": "sentence-transformers/all-MiniLM-L6-v2",
            "model_kwargs": {'device': 'cpu'},
        }
        self.embeddings = HuggingFaceEmbeddings(**embedding_options)

        # Optional cross-encoder reranker
        if use_reranker:
            print("Loading reranker model...")
            self.reranker = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')
        else:
            self.reranker = None

        # Splitter used for both local papers and online search results
        splitter_options = dict(
            chunk_size=600,
            chunk_overlap=75,
            separators=["\n\n", "\n", ". ", "? ", "! ", " "],
            length_function=len,
        )
        self.text_splitter = RecursiveCharacterTextSplitter(**splitter_options)

        # Generation model
        print(f"Loading generation model: {model_name}...")
        self.llm = self._initialize_llm(model_name)

        # Retrieval state must exist before load_or_create_vectorstore() runs,
        # because that method fills these attributes in
        self.vectorstore = None
        self.bm25_index = None
        self.bm25_docs = []
        self.load_or_create_vectorstore()

        # Online retriever is only built when an API key was supplied
        self.online_retriever = (
            TavilySearchAPIRetriever(k=6, api_key=tavily_api_key)
            if tavily_api_key
            else None
        )
    
    def _load_cache(self) -> Dict:
        """Load query cache from disk."""
        if os.path.exists(self.cache_path):
            try:
                with open(self.cache_path, 'r') as f:
                    return json.load(f)
            except:
                return {}
        return {}
    
    def _save_cache(self):
        """Save query cache to disk."""
        try:
            with open(self.cache_path, 'w') as f:
                json.dump(self.query_cache, f, indent=2)
        except Exception as e:
            print(f"Warning: Could not save cache: {e}")
    
    def _get_cache_key(self, question: str, search_online: bool) -> str:
        """Generate cache key for a query."""
        key_str = f"{question.lower().strip()}_{search_online}"
        return hashlib.md5(key_str.encode()).hexdigest()
    
    def _initialize_llm(self, model_name: str):
        """
        Load the generation model and wrap it for LangChain.

        Args:
            model_name: HuggingFace model identifier to load

        Returns:
            A ``HuggingFacePipeline`` around a sampling text-generation
            pipeline that returns only newly generated text.
        """
        tok = AutoTokenizer.from_pretrained(model_name)

        # Reuse EOS as the padding token when the tokenizer defines none
        if tok.pad_token is None:
            tok.pad_token = tok.eos_token

        causal_lm = AutoModelForCausalLM.from_pretrained(
            model_name,
            device_map="auto",
            trust_remote_code=True
        )

        generation_settings = dict(
            max_new_tokens=200,
            temperature=0.6,
            top_p=0.85,
            repetition_penalty=1.2,
            do_sample=True,
            pad_token_id=tok.eos_token_id,
            return_full_text=False,
            eos_token_id=tok.eos_token_id,
        )
        text_generator = pipeline(
            "text-generation",
            model=causal_lm,
            tokenizer=tok,
            **generation_settings
        )

        return HuggingFacePipeline(pipeline=text_generator)
    
    def load_papers_from_excel(self) -> List[Dict]:
        """Load paper metadata from Excel file."""
        if not self.excel_path or not os.path.exists(self.excel_path):
            return []
        
        df = pd.read_excel(self.excel_path)
        return [
            {
                "title": row["Title"],
                "url": row["Link"],
                "s_no": row.get("S.No", idx)
            }
            for idx, row in df.iterrows()
        ]
    
    def load_documents_from_papers(self, papers: List[Dict]) -> List:
        """Load and process documents from paper URLs."""
        all_docs = []
        total = len(papers)
        
        print(f"\nLoading {total} research papers...")
        
        for idx, paper in enumerate(papers, 1):
            if idx % 50 == 0:
                print(f"Progress: {idx}/{total} papers loaded")
            
            try:
                if paper["url"].endswith('.pdf'):
                    loader = PyMuPDFLoader(paper["url"])
                else:
                    loader = WebBaseLoader(paper["url"])
                
                docs = loader.load()
                
                for doc in docs:
                    doc.metadata.update({
                        "paper_title": paper["title"],
                        "paper_url": paper["url"],
                        "s_no": paper["s_no"],
                        "source_type": "local_collection"
                    })
                
                all_docs.extend(docs)
                
            except Exception as e:
                with open("load_errors.log", "a") as f:
                    f.write(f"Failed: {paper['title']} - {str(e)}\n")
        
        print(f"Successfully loaded {len(all_docs)} documents")
        return all_docs
    
    def create_bm25_index(self, documents):
        """
        Build the BM25 keyword index over ``documents``.

        Stores the documents on ``self.bm25_docs`` (index positions in the
        BM25 scores refer to this list) and the index on ``self.bm25_index``.
        Tokenization is lower-casing plus whitespace splitting.
        """
        print("Creating BM25 index...")
        self.bm25_docs = documents

        corpus_tokens = []
        for document in documents:
            corpus_tokens.append(document.page_content.lower().split())

        self.bm25_index = BM25Okapi(corpus_tokens)
        print("BM25 index created")
    
    def load_or_create_vectorstore(self):
        """Load existing vector store or create new one."""
        index_path = f"{self.vector_store_path}/index.faiss"
        
        if os.path.exists(index_path):
            print("Loading existing vector store...")
            self.vectorstore = FAISS.load_local(
                self.vector_store_path,
                self.embeddings,
                allow_dangerous_deserialization=True
            )
            print("Vector store loaded successfully")
            
            # Load BM25 index if hybrid search enabled
            if self.use_hybrid_search:
                # Get all documents from vectorstore for BM25
                # This is a workaround - ideally save/load BM25 separately
                print("Note: BM25 index will be created on first hybrid search")
                
        elif self.excel_path:
            print("Creating new vector store from local papers...")
            papers = self.load_papers_from_excel()
            if papers:
                docs = self.load_documents_from_papers(papers)
                chunks = self.text_splitter.split_documents(docs)
                
                print(f"Creating embeddings for {len(chunks)} chunks...")
                self.vectorstore = FAISS.from_documents(chunks, self.embeddings)
                self.vectorstore.save_local(self.vector_store_path)
                
                # Create BM25 index if enabled
                if self.use_hybrid_search:
                    self.create_bm25_index(chunks)
                
                print("Vector store created and saved")
            else:
                print("No papers found to process")
        else:
            print("No vector store or Excel file provided - online search only mode")
    
    def rerank_documents(self, query: str, docs: List, top_k: int = 5):
        """Rerank retrieved documents using cross-encoder."""
        if not docs or not self.reranker:
            return docs
        
        pairs = [[query, doc.page_content] for doc in docs]
        scores = self.reranker.predict(pairs)
        
        scored_docs = list(zip(docs, scores))
        scored_docs.sort(key=lambda x: x[1], reverse=True)
        
        return [doc for doc, _ in scored_docs[:top_k]]
    
    def hybrid_retrieve(self, query: str, k: int = 10):
        """Combine BM25 and vector search for better retrieval."""
        if not self.use_hybrid_search or not self.vectorstore:
            # Fallback to vector search only
            return self.vectorstore.similarity_search(query, k=k)
        
        # Initialize BM25 if not done yet
        if self.bm25_index is None:
            # Get all docs from vectorstore (expensive, should be cached)
            all_docs = self.vectorstore.similarity_search("", k=10000)  # Hack to get all
            if all_docs:
                self.create_bm25_index(all_docs)
            else:
                return self.vectorstore.similarity_search(query, k=k)
        
        # BM25 search
        tokenized_query = query.lower().split()
        bm25_scores = self.bm25_index.get_scores(tokenized_query)
        bm25_top_indices = np.argsort(bm25_scores)[-k*2:][::-1]  # Get 2x more for fusion
        
        # Vector search
        vector_docs = self.vectorstore.similarity_search(query, k=k*2)
        
        # Combine results (simple approach: union and deduplicate)
        combined_docs = {}
        
        # Add BM25 results with scores
        for idx in bm25_top_indices:
            if idx < len(self.bm25_docs):
                doc = self.bm25_docs[idx]
                doc_key = doc.page_content[:100]
                combined_docs[doc_key] = doc
        
        # Add vector results
        for doc in vector_docs:
            doc_key = doc.page_content[:100]
            combined_docs[doc_key] = doc
        
        # Return top k unique documents
        return list(combined_docs.values())[:k]
    
    def search_online(self, query: str) -> List:
        """Search online sources using Tavily."""
        if not self.online_retriever:
            return []
        
        try:
            print("Searching online sources...")
            docs = self.online_retriever.invoke(query)
            
            for doc in docs:
                doc.metadata["source_type"] = "online_search"
                if "title" in doc.metadata:
                    doc.metadata["paper_title"] = doc.metadata["title"]
                elif "source" in doc.metadata:
                    from urllib.parse import urlparse
                    domain = urlparse(doc.metadata["source"]).netloc.replace("www.", "")
                    doc.metadata["paper_title"] = f"Web: {domain}"
                else:
                    doc.metadata["paper_title"] = "Online Source"
            
            chunks = self.text_splitter.split_documents(docs)
            return chunks
        except Exception as e:
            print(f"Online search error: {e}")
            return []
    
    def _get_prompt_template(self) -> PromptTemplate:
        """
        Build the answer-generation prompt.

        Returns:
            A template with ``{context}`` and ``{question}`` slots that asks
            for a short, direct answer citing paper titles in brackets.
        """
        prompt_lines = [
            "Use the research papers below to answer the question. Write 2-3 clear sentences that directly answer what is asked. Cite paper titles in [brackets].",
            "",
            "Research Papers:",
            "{context}",
            "",
            "Question: {question}",
            "",
            "Direct answer:",
        ]

        return PromptTemplate(
            template="\n".join(prompt_lines),
            input_variables=["context", "question"]
        )
    
    def create_qa_chain(self, retriever):
        """
        Assemble a retrieval QA chain around ``retriever``.

        The chain stuffs all retrieved documents into the prompt from
        ``_get_prompt_template`` and returns the source documents alongside
        the generated answer.
        """
        chain_options = {
            "llm": self.llm,
            "chain_type": "stuff",
            "retriever": retriever,
            "chain_type_kwargs": {"prompt": self._get_prompt_template()},
            "return_source_documents": True,
            "verbose": False,
        }
        return RetrievalQA.from_chain_type(**chain_options)
    
    def _clean_answer(self, raw_answer: str) -> str:
        """Clean up model output."""
        cleaned = raw_answer.strip()
        
        placeholders = [
            "[Insert your response here]",
            "[Your answer here]",
            "[Response]"
        ]
        for placeholder in placeholders:
            cleaned = cleaned.replace(placeholder, "").strip()
        
        prefixes_to_remove = [
            "Use the research papers below",
            "Research Papers:",
            "Question:",
            "Direct answer:",
            "Answer:",
        ]
        
        for prefix in prefixes_to_remove:
            if prefix in cleaned:
                parts = cleaned.split(prefix)
                cleaned = parts[-1].strip()
        
        stop_phrases = ["\nNote:", "\nReferences:", "\nBest regards", "\nPlease", "\n\n"]
        for phrase in stop_phrases:
            if phrase in cleaned:
                cleaned = cleaned.split(phrase)[0].strip()
        
        if len(cleaned) < 10:
            return "The provided sources don't contain sufficient information to answer this question."
        
        return cleaned
    
    def query(self, question: str, search_online: bool = False) -> Dict:
        """
        Query the RAG system with caching.
        """
        self.total_queries += 1
        
        # Check cache first
        cache_key = self._get_cache_key(question, search_online)
        if cache_key in self.query_cache:
            self.cache_hits += 1
            print("✓ Using cached result")
            return self.query_cache[cache_key]
        
        try:
            # Handle online search
            if search_online and self.online_retriever:
                online_docs = self.search_online(question)
                
                if online_docs:
                    # Apply reranking if enabled
                    if self.use_reranker:
                        print("Reranking online results...")
                        online_docs = self.rerank_documents(question, online_docs, top_k=5)
                    
                    temp_store = FAISS.from_documents(online_docs, self.embeddings)
                    retriever = temp_store.as_retriever(search_kwargs={"k": 5})
                    qa_chain = self.create_qa_chain(retriever)
                    response = qa_chain.invoke({"query": question})
                else:
                    result = {
                        "answer": "No relevant online sources found.",
                        "sources": []
                    }
                    self.query_cache[cache_key] = result
                    self._save_cache()
                    return result
            else:
                # Use local vectorstore
                if not self.vectorstore:
                    result = {
                        "answer": "No local papers available. Use --online to search the web.",
                        "sources": []
                    }
                    return result
                
                # Retrieve using hybrid search or regular search
                if self.use_hybrid_search:
                    print("Using hybrid search (BM25 + Vector)...")
                    docs = self.hybrid_retrieve(question, k=10)
                else:
                    docs = self.vectorstore.similarity_search(question, k=8)
                
                # Apply reranking if enabled
                if self.use_reranker and docs:
                    print("Reranking results...")
                    docs = self.rerank_documents(question, docs, top_k=5)
                
                # Create retriever from filtered docs
                temp_store = FAISS.from_documents(docs, self.embeddings)
                retriever = temp_store.as_retriever(search_kwargs={"k": 5})
                qa_chain = self.create_qa_chain(retriever)
                response = qa_chain.invoke({"query": question})
            
            # Extract and clean answer
            raw_answer = response.get("result", "No answer generated").strip()
            answer = self._clean_answer(raw_answer)
            
            # Process sources
            sources = []
            seen = set()
            
            for doc in response.get("source_documents", []):
                source_info = {
                    "title": doc.metadata.get("paper_title", "Unknown"),
                    "url": doc.metadata.get("paper_url", doc.metadata.get("source", "#")),
                    "type": doc.metadata.get("source_type", "unknown"),
                    "page": doc.metadata.get("page", "N/A")
                }
                
                identifier = f"{source_info['title']}_{source_info['url']}"
                if identifier not in seen:
                    sources.append(source_info)
                    seen.add(identifier)
            
            result = {
                "answer": answer,
                "sources": sources
            }
            
            # Cache the result
            self.query_cache[cache_key] = result
            self._save_cache()
            
            return result
            
        except Exception as e:
            import traceback
            return {
                "answer": f"Error processing query: {str(e)}\n{traceback.format_exc()}",
                "sources": []
            }
    
    def get_cache_stats(self):
        """Get cache performance statistics."""
        if self.total_queries > 0:
            hit_rate = (self.cache_hits / self.total_queries) * 100
            return f"Cache: {self.cache_hits}/{self.total_queries} hits ({hit_rate:.1f}%)"
        return "Cache: No queries yet"
    
    def clear_cache(self):
        """Clear the query cache."""
        self.query_cache = {}
        self._save_cache()
        self.cache_hits = 0
        self.total_queries = 0
        print("Cache cleared")
    
    def interactive_session(self):
        """Run interactive Q&A session."""
        print("\n" + "="*60)
        print("Research Assistant - Interactive Mode")
        print("="*60)
        print("Commands:")
        print("  - Type your question to search local papers")
        print("  - Add '--online' to search online sources")
        print("  - Type 'stats' to see cache statistics")
        print("  - Type 'clear-cache' to clear cache")
        print("  - Type 'exit' to quit")
        print("="*60 + "\n")
        
        while True:
            try:
                user_input = input("\nQuestion: ").strip()
                
                if user_input.lower() in ['exit', 'quit', 'q']:
                    print(f"\n{self.get_cache_stats()}")
                    print("Goodbye!")
                    break
                
                if user_input.lower() == 'stats':
                    print(f"\n{self.get_cache_stats()}")
                    continue
                
                if user_input.lower() == 'clear-cache':
                    self.clear_cache()
                    continue
                
                if not user_input:
                    continue
                
                # Check for online flag
                search_online = '--online' in user_input
                question = user_input.replace('--online', '').strip()
                
                print("\nProcessing...\n")
                result = self.query(question, search_online=search_online)
                
                # Display results
                print(f"Answer: {result['answer']}\n")
                
                if result['sources']:
                    print("Sources:")
                    for idx, source in enumerate(result['sources'], 1):
                        print(f"\n[{idx}] {source['title']}")
                        print(f"    Type: {source['type']}")
                        print(f"    URL: {source['url']}")
                        if source['page'] != 'N/A':
                            print(f"    Page: {source['page']}")
                
            except KeyboardInterrupt:
                print(f"\n\n{self.get_cache_stats()}")
                print("Exiting...")
                break
            except Exception as e:
                print(f"\nError: {e}")



In [7]:
import os

from huggingface_hub import login

# Read the access token from the HF_TOKEN environment variable so that no
# credential is stored in the notebook; when the variable is unset the call
# receives the same empty string as before.
login(token=os.environ.get("HF_TOKEN", ""))

In [None]:

# ========== Usage Example ==========
if __name__ == "__main__":
    import os

    # The Tavily key comes from the TAVILY_API_KEY environment variable so no
    # secret is pasted into the source. When it is unset the empty string is
    # passed, which leaves online search disabled (local papers only).
    rag = HybridResearchRAG(
        excel_path="LLM-Papers.xlsx",  # Your Excel file
        tavily_api_key=os.environ.get("TAVILY_API_KEY", ""),
        vector_store_path="research_vectorstore"
    )

    # Start interactive session
    rag.interactive_session()

    # Programmatic alternative to the interactive session:
    #   call rag.query("What is multi-head attention?") and read the
    #   "answer" and "sources" keys of the returned dict.

Loading embedding model...
Loading reranker model...
Loading generation model: meta-llama/Meta-Llama-3-8B-Instruct...


Loading checkpoint shards: 100%|██████████| 4/4 [00:02<00:00,  1.59it/s]
Device set to use cuda:0


Loading existing vector store...
Vector store loaded successfully
Note: BM25 index will be created on first hybrid search

Research Assistant - Interactive Mode
Commands:
  - Type your question to search local papers
  - Add '--online' to search online sources
  - Type 'stats' to see cache statistics
  - Type 'clear-cache' to clear cache
  - Type 'exit' to quit




Question:  what is multi-head attention and how does it improve transformer model's performance? --online



Processing...

Searching online sources...
Reranking online results...
Answer: Multi-head attention is a powerful mechanism that enables Transformer models to capture diverse features, distribute attention effectively, and improve generalization, thereby significantly improving their performance [1]. It works by splitting the self-attention process into several smaller parts, allowing the model to examine information from multiple perspectives simultaneously and process it in parallel [2]. This enhancement leads to improved generalization and adaptability to different tasks and datasets [1], making it a crucial innovation in Transformer models. [References: [1] Understanding the Power and Benefits of Multi-Head Attention in Transformer Models; [2] Multi-head attention significantly improves transformer model performance]

Sources:

[1] Exploring Multi-Head Attention: Why More Heads Are ...
    Type: online_search
    URL: https://medium.com/@hassaanidrees7/exploring-multi-head-attenti


Question:  hat is multi-head attention and how does it improve transformer model's performance?



Processing...

Using hybrid search (BM25 + Vector)...
Creating BM25 index...
BM25 index created
Reranking results...
Answer: Multi-head attention is an extension of standard self-attention mechanism where multiple parallel attention heads are used simultaneously to jointly attend different representation sub-spaces within the input sequence. By doing so, it enables the model to learn more complex relationships between inputs and outputs, leading to improved performance on various natural language processing tasks. [“The Infini-attention”]

Sources:

[1] Leave No Context Behind: Efficient Infinite Context Transformers with Infini-attention
    Type: local_collection
    URL: https://arxiv.org/abs/2404.07143

[2] DeepSeek-VL2: Mixture-of-Experts Vision-Language Models for Advanced Multimodal Understanding
    Type: local_collection
    URL: https://arxiv.org/abs/2412.10302

[3] DeepSeek-V2: A Strong, Economical, and Efficient Mixture-of-Experts Language Model
    Type: local_collection



Question:  What is the most trending prompting technique?



Processing...

Using hybrid search (BM25 + Vector)...
Reranking results...
Answer: According to the provided research papers, there isn't one specific "most trending" prompting technique mentioned explicitly. However, based on the surveys presented in [1] and [2], various studies are being conducted on multiple prompting methods across 29 different NLP tasks over the past few years. This suggests that researchers are actively exploring and experimenting with various approaches to find the most effective ones.

Sources:

[1] The Prompt Report: A Systematic Survey of Prompting Techniques
    Type: local_collection
    URL: https://arxiv.org/abs/2406.06608

[2] A Survey of Prompt Engineering Methods in Large Language Models for Different NLP Tasks
    Type: local_collection
    URL: https://arxiv.org/abs/2407.12994

[3] Multi-expert Prompting Improves Reliability, Safety, and Usefulness of Large Language Models
    Type: local_collection
    URL: https://arxiv.org/abs/2411.00492

[4] Enh


Question:  What is the most used prompting technique?



Processing...

Using hybrid search (BM25 + Vector)...
Reranking results...
Answer: The most widely used method to align large language models (LLMs) with human preferences is Reinforcement Learning from Human Feedback (RLHF). This is mentioned in the abstract of the second research paper, "Reinforcement Learning from Human Feedback" [2].

Sources:

[1] A Survey of Prompt Engineering Methods in Large Language Models for Different NLP Tasks
    Type: local_collection
    URL: https://arxiv.org/abs/2407.12994

[2] The Prompt Report: A Systematic Survey of Prompting Techniques
    Type: local_collection
    URL: https://arxiv.org/abs/2406.06608

[3] Mind Your Step (by Step: Chain-of-Thought Can Reduce Performance on Tasks Where Thinking Makes Humans Worse
    Type: local_collection
    URL: https://arxiv.org/abs/2410.21333

[4] Is DPO Superior to PPO for LLM Alignment? A Comprehensive Study
    Type: local_collection
    URL: https://arxiv.org/abs/2404.10719
