In [13]:
import os
import pandas as pd
from typing import List, Dict, Optional
from langchain_community.document_loaders import PyMuPDFLoader, WebBaseLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_huggingface import HuggingFaceEmbeddings, HuggingFacePipeline
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
from langchain_community.retrievers import TavilySearchAPIRetriever
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, pipeline
import warnings
warnings.filterwarnings('ignore')

class HybridResearchRAG:
    """
    Unified RAG system supporting both local paper collection and online search.
    Optimized for research question-answering with proper source attribution.
    """
    
    def __init__(
        self, 
        excel_path: Optional[str] = None,
        tavily_api_key: Optional[str] = None,
        vector_store_path: str = "research_vectorstore",
        model_name: str = "google/flan-t5-large"
    ):
        """
        Initialize the hybrid RAG system.
        
        Args:
            excel_path: Path to Excel file with paper metadata (Title, Link, S.No)
            tavily_api_key: API key for Tavily web search
            vector_store_path: Directory to save/load vector store
            model_name: HuggingFace model for generation
        """
        self.vector_store_path = vector_store_path
        self.excel_path = excel_path
        self.tavily_api_key = tavily_api_key
        
        # Initialize embedding model
        print("Loading embedding model...")
        self.embeddings = HuggingFaceEmbeddings(
            model_name="sentence-transformers/all-MiniLM-L6-v2",
            model_kwargs={'device': 'cpu'}
        )
        
        # Initialize text splitter
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=400, 
            chunk_overlap=50,
            separators=["\n\n", "\n", ". ", "? ", "! ", " "],
            length_function=len
        )
        
        # Initialize LLM
        print(f"Loading generation model: {model_name}...")
        self.llm = self._initialize_llm(model_name)
        
        # Initialize vector store
        self.vectorstore = None
        self.load_or_create_vectorstore()
        
        # Initialize online retriever if API key provided
        self.online_retriever = None
        if tavily_api_key:
            self.online_retriever = TavilySearchAPIRetriever(
                k=8,
                api_key=tavily_api_key
            )
    
    def _initialize_llm(self, model_name: str):
        """Initialize the language model with optimized settings."""
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
        
        pipe = pipeline(
            "text2text-generation",
            model=model,
            tokenizer=tokenizer,
            max_new_tokens=400, 
            temperature=0.5,
            repetition_penalty=1.5,
            do_sample=True,
            truncation=True
        )
        
        return HuggingFacePipeline(pipeline=pipe)
    
    def load_papers_from_excel(self) -> List[Dict]:
        """Load paper metadata from Excel file."""
        if not self.excel_path or not os.path.exists(self.excel_path):
            return []
        
        df = pd.read_excel(self.excel_path)
        return [
            {
                "title": row["Title"],
                "url": row["Link"],
                "s_no": row.get("S.No", idx)
            }
            for idx, row in df.iterrows()
        ]
    
    def load_documents_from_papers(self, papers: List[Dict]) -> List:
        """Load and process documents from paper URLs."""
        all_docs = []
        total = len(papers)
        
        print(f"\nLoading {total} research papers...")
        
        for idx, paper in enumerate(papers, 1):
            if idx % 50 == 0:
                print(f"Progress: {idx}/{total} papers loaded")
            
            try:
                # Choose appropriate loader
                if paper["url"].endswith('.pdf'):
                    loader = PyMuPDFLoader(paper["url"])
                else:
                    loader = WebBaseLoader(paper["url"])
                
                docs = loader.load()
                
                # Add metadata
                for doc in docs:
                    doc.metadata.update({
                        "paper_title": paper["title"],
                        "paper_url": paper["url"],
                        "s_no": paper["s_no"],
                        "source_type": "local_collection"
                    })
                
                all_docs.extend(docs)
                
            except Exception as e:
                # Log errors but continue
                with open("load_errors.log", "a") as f:
                    f.write(f"Failed: {paper['title']} - {str(e)}\n")
        
        print(f"Successfully loaded {len(all_docs)} documents")
        return all_docs
    
    def load_or_create_vectorstore(self):
        """Load existing vector store or create new one from local papers."""
        index_path = f"{self.vector_store_path}/index.faiss"
        
        if os.path.exists(index_path):
            print("Loading existing vector store...")
            self.vectorstore = FAISS.load_local(
                self.vector_store_path,
                self.embeddings,
                allow_dangerous_deserialization=True
            )
            print("Vector store loaded successfully")
        elif self.excel_path:
            print("Creating new vector store from local papers...")
            papers = self.load_papers_from_excel()
            if papers:
                docs = self.load_documents_from_papers(papers)
                chunks = self.text_splitter.split_documents(docs)
                
                print(f"Creating embeddings for {len(chunks)} chunks...")
                self.vectorstore = FAISS.from_documents(chunks, self.embeddings)
                self.vectorstore.save_local(self.vector_store_path)
                print("Vector store created and saved")
            else:
                print("No papers found to process")
        else:
            print("No vector store or Excel file provided - online search only mode")
    
    def search_online(self, query: str) -> List:
        """Search online sources using Tavily."""
        if not self.online_retriever:
            return []
        
        try:
            print("Searching online sources...")
            docs = self.online_retriever.invoke(query)
            
            # Add metadata and split
            for doc in docs:
                doc.metadata["source_type"] = "online_search"
            
            chunks = self.text_splitter.split_documents(docs)
            return chunks
        except Exception as e:
            print(f"Online search error: {e}")
            return []
    
    def create_qa_chain(self, use_online: bool = False):
        """Create the QA chain with appropriate retriever."""
        
        # Improved prompt template
        prompt_template = """You are a research assistant. Answer the question using the provided research papers.

Guidelines:
1. Provide a clear, concise answer (2-3 sentences maximum)
2. Cite specific papers using their titles when possible
3. If information is insufficient, state what's available and what's missing
4. Be precise and avoid speculation

Research Papers:
{context}

Question: {question}

Answer:"""
        
        prompt = PromptTemplate(
            template=prompt_template,
            input_variables=["context", "question"]
        )
        
        # Determine retriever
        if use_online and self.online_retriever:
            # For online searches, we'll handle retrieval manually
            return None  # Signal to use custom retrieval
        elif self.vectorstore:
            retriever = self.vectorstore.as_retriever(
                search_kwargs={"k": 6}  # Limited for token constraints
            )
        else:
            raise ValueError("No retriever available. Provide Excel file or Tavily API key.")
        
        return RetrievalQA.from_chain_type(
            llm=self.llm,
            chain_type="stuff",
            retriever=retriever,
            chain_type_kwargs={"prompt": prompt},
            return_source_documents=True
        )
    
    def query(self, question: str, search_online: bool = False) -> Dict:
        """
        Query the RAG system.
        
        Args:
            question: User question
            search_online: Whether to search online sources (requires API key)
        
        Returns:
            Dict with 'answer' and 'sources'
        """
        try:
            # Handle online search separately
            if search_online and self.online_retriever:
                online_docs = self.search_online(question)
                
                if online_docs:
                    # Create temporary vectorstore
                    temp_store = FAISS.from_documents(online_docs, self.embeddings)
                    retriever = temp_store.as_retriever(search_kwargs={"k": 4})
                    
                    qa_chain = RetrievalQA.from_chain_type(
                        llm=self.llm,
                        chain_type="stuff",
                        retriever=retriever,
                        return_source_documents=True
                    )
                    
                    response = qa_chain.invoke({"query": question})
                else:
                    return {
                        "answer": "No relevant online sources found.",
                        "sources": []
                    }
            else:
                # Use local vectorstore
                qa_chain = self.create_qa_chain(use_online=False)
                response = qa_chain.invoke({"query": question})
            
            # Extract answer
            answer = response.get("result", "No answer generated")
            
            # Process sources
            sources = []
            seen = set()
            
            for doc in response.get("source_documents", []):
                source_info = {
                    "title": doc.metadata.get("paper_title", "Unknown"),
                    "url": doc.metadata.get("paper_url", doc.metadata.get("source", "#")),
                    "type": doc.metadata.get("source_type", "unknown"),
                    "page": doc.metadata.get("page", "N/A")
                }
                
                # Deduplicate
                identifier = f"{source_info['title']}_{source_info['url']}"
                if identifier not in seen:
                    sources.append(source_info)
                    seen.add(identifier)
            
            return {
                "answer": answer,
                "sources": sources
            }
            
        except Exception as e:
            return {
                "answer": f"Error processing query: {str(e)}",
                "sources": []
            }
    
    def interactive_session(self):
        """Run interactive Q&A session."""
        print("\n" + "="*60)
        print("Research Assistant - Interactive Mode")
        print("="*60)
        print("Commands:")
        print("  - Type your question to search local papers")
        print("  - Add '--online' to search online sources")
        print("  - Type 'exit' to quit")
        print("="*60 + "\n")
        
        while True:
            try:
                user_input = input("\nQuestion: ").strip()
                
                if user_input.lower() in ['exit', 'quit', 'q']:
                    print("Goodbye!")
                    break
                
                if not user_input:
                    continue
                
                # Check for online flag
                search_online = '--online' in user_input
                question = user_input.replace('--online', '').strip()
                
                print("\nProcessing...\n")
                result = self.query(question, search_online=search_online)
                
                # Display results
                print(f"Answer: {result['answer']}\n")
                
                if result['sources']:
                    print("Sources:")
                    for idx, source in enumerate(result['sources'], 1):
                        print(f"\n[{idx}] {source['title']}")
                        print(f"    Type: {source['type']}")
                        print(f"    URL: {source['url']}")
                        if source['page'] != 'N/A':
                            print(f"    Page: {source['page']}")
                
            except KeyboardInterrupt:
                print("\n\nExiting...")
                break
            except Exception as e:
                print(f"\nError: {e}")



In [15]:

# ========== Usage Example ==========
if __name__ == "__main__":
    # Initialize system
    # Option 1: Local papers only
    rag = HybridResearchRAG(
        excel_path="LLM-Papers.xlsx",  # Your Excel file
        tavily_api_key= "",
        vector_store_path="research_vectorstore"
    )
    
    # Option 2: With online search capability
    # rag = HybridResearchRAG(
    #     excel_path="LLM-Papers.xlsx",
    #     tavily_api_key="tvly-YOUR-API-KEY",
    #     vector_store_path="research_vectorstore"
    # )
    
    # Start interactive session
    rag.interactive_session()
    
    # Or use programmatically
    # result = rag.query("What is multi-head attention?")
    # print(result['answer'])

Loading embedding model...
Loading generation model: google/flan-t5-large...
Loading existing vector store...
Vector store loaded successfully

Research Assistant - Interactive Mode
Commands:
  - Type your question to search local papers
  - Add '--online' to search online sources
  - Type 'exit' to quit




Question:  What is the most trending prompting technique?



Processing...

Answer: Multi-expert Prompting

Sources:

[1] The Prompt Report: A Systematic Survey of Prompting Techniques
    Type: local_collection
    URL: https://arxiv.org/abs/2406.06608

[2] A Survey of Prompt Engineering Methods in Large Language Models for Different NLP Tasks
    Type: local_collection
    URL: https://arxiv.org/abs/2407.12994

[3] Multi-expert Prompting Improves Reliability, Safety, and Usefulness of Large Language Models
    Type: local_collection
    URL: https://arxiv.org/abs/2411.00492



Question:  What is the most trending prompting technique? --online



Processing...

Searching online sources...
Answer: Meta-prompting

Sources:

[1] Unknown
    Type: online_search
    URL: https://www.reddit.com/r/PromptEngineering/comments/1ius9pt/my_favorite_prompting_technique_whats_yours/

[2] Unknown
    Type: online_search
    URL: https://www.promptingguide.ai/techniques

[3] Unknown
    Type: online_search
    URL: https://www.linkedin.com/pulse/mastering-advanced-prompting-techniques-large-language-watkins-lik9e

[4] Unknown
    Type: online_search
    URL: https://machinelearningmastery.com/7-next-generation-prompt-engineering-techniques/



Question:  What is multi-head attention?



Processing...

Answer: The multi-head mechanism enables the model to collectively attend to information from various representation spaces within different experts, while significantly enhances expert activation, thus deepens context understanding and alleviate overfitting

Sources:

[1] Attention Heads of Large Language Models: A Survey
    Type: local_collection
    URL: https://arxiv.org/abs/2409.03752

[2] Retrieval Head Mechanistically Explains Long-Context Factuality
    Type: local_collection
    URL: https://arxiv.org/abs/2404.15574

[3] Multi-Head Mixture-of-Experts
    Type: local_collection
    URL: https://arxiv.org/abs/2404.15045

[4] MoA: Mixture of Sparse Attention for Automatic Large Language Model Compression
    Type: local_collection
    URL: https://arxiv.org/abs/2406.14909

[5] A Phase Transition Between Positional and Semantic Learning in a Solvable Model of Dot-Product Attention
    Type: local_collection
    URL: https://arxiv.org/abs/2402.03902



Question:  quit


Goodbye!
