In [1]:
import json
import logging
from typing import List, Dict, Any, Optional, Tuple
from langchain_openai import ChatOpenAI
from langchain_core.output_parsers import StrOutputParser
from langchain.callbacks.manager import get_openai_callback
from langchain_core.prompts import ChatPromptTemplate

# Import your existing RAG fusion function
from rag_pipeline.rag_fusion_pipeline import *

# Send INFO-and-above records to the default handler and create the logger
# used by the classes defined in this cell.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

class SmartRAGTool:
    def __init__(self, 
                 local_index_path: str,
                 embedding_model,
                 llm_params: Optional[Dict] = None):
        """
        Initialize the Smart RAG Tool
        
        Args:
            local_index_path: Path to the FAISS vector store
            embedding_model: Embedding model for vector store
            llm_params: Parameters for the LLM
        """
        self.local_index_path = local_index_path
        self.embedding_model = embedding_model
        self.llm_params = llm_params or {"temperature": 0, "model": "gpt-4o"}
        self.llm = ChatOpenAI(**self.llm_params)
        
        # Function definition for OpenAI function calling
        self.function_definition = {
            "name": "search_knowledge_base",
            "description": "Search the knowledge base for specific information when the question requires domain-specific or detailed factual information that may not be in general knowledge",
            "parameters": {
                "type": "object",
                "properties": {
                    "query": {
                        "type": "string",
                        "description": "The search query to find relevant information"
                    },
                    "mode": {
                        "type": "string",
                        "enum": ["original", "generated"],
                        "description": "Search mode: 'original' uses only the user query, 'generated' creates multiple related queries for better coverage"
                    },
                    "num_queries": {
                        "type": "integer",
                        "minimum": 1,
                        "maximum": 10,
                        "description": "Number of queries to generate if using 'generated' mode (default: 3)"
                    }
                },
                "required": ["query"]
            }
        }
        
        # Prompt to determine if RAG is needed
        self.decision_prompt = ChatPromptTemplate.from_messages([
            ("system", """You are an AI assistant that decides whether a user question requires searching a knowledge base or can be answered with general knowledge.

Use the search_knowledge_base function ONLY when:
1. The questions is specific and realted to a policy, fact, legal specification.
2. The question requires current or specific information that might not be in general knowledge

DO NOT use the search function for:
1. General knowledge questions (e.g., "What is machine learning?", "How does photosynthesis work?")
2. Questions not related to the knowledge questions should be politely redirected to ask user for questions that are related to the knowledge base.

If you decide to search, choose the appropriate mode:
- Use "original" mode for simple, direct queries
- Use "generated" mode for complex questions that might benefit from multiple search perspectives

If you don't need to search, answer the question directly using your general knowledge."""),
            ("user", "{user_query}")
        ])

    def search_knowledge_base(self, query: str, mode: str = "generated", num_queries: int = 3) -> Dict[str, Any]:
        """
        Search the knowledge base using RAG fusion
        
        Args:
            query: Search query
            mode: Search mode ('original' or 'generated')
            num_queries: Number of queries to generate if using 'generated' mode
            
        Returns:
            Dictionary with answer and metadata
        """
        try:
            logger.info(f"Searching knowledge base with query: '{query}' in {mode} mode")
            
            answer, metadata = rag_fusion_answer(
                user_query=query,
                local_index_path=self.local_index_path,
                embedding_model=self.embedding_model,
                mode=mode,
                num_generated_queries=num_queries,
                top_k=5,  # Retrieve more documents for better context
                params=self.llm_params
            )
            
            return {
                "answer": answer,
                "metadata": metadata,
                "search_performed": True
            }
            
        except Exception as e:
            logger.error(f"Error in knowledge base search: {str(e)}")
            return {
                "answer": f"I encountered an error while searching the knowledge base: {str(e)}",
                "metadata": {},
                "search_performed": False,
                "error": str(e)
            }

    def process_user_query(self, user_query: str, chat_context: Optional[str] = None) -> Tuple[str, Dict[str, Any]]:
        """
        Process a user query and decide whether to use RAG or answer directly
        
        Args:
            user_query: The user's question
            chat_context: Optional conversation context
            
        Returns:
            Tuple of (answer, metadata)
        """
        try:
            # Create the chain with function calling
            chain = self.decision_prompt | self.llm.bind(
                functions=[self.function_definition],
                function_call="auto"
            )
            
            metadata = {
                "user_query": user_query,
                "decision_made": None,
                "search_performed": False,
                "total_cost": 0.0,
                "token_usage": {}
            }
            
            with get_openai_callback() as cb:
                response = chain.invoke({"user_query": user_query})
            
            # Track decision-making cost
            decision_cost = {
                "total_tokens": cb.total_tokens,
                "prompt_tokens": cb.prompt_tokens,
                "completion_tokens": cb.completion_tokens,
                "total_cost": cb.total_cost
            }
            metadata["decision_cost"] = decision_cost
            metadata["total_cost"] += cb.total_cost
            
            # Check if the model decided to use function calling
            if hasattr(response, 'additional_kwargs') and 'function_call' in response.additional_kwargs:
                function_call = response.additional_kwargs['function_call']
                function_name = function_call['name']
                function_args = json.loads(function_call['arguments'])
                
                logger.info(f"LLM decided to use function: {function_name} with args: {function_args}")
                metadata["decision_made"] = "search_needed"
                
                if function_name == "search_knowledge_base":
                    # Execute the RAG search
                    search_result = self.search_knowledge_base(
                        query=function_args.get('query', user_query),
                        mode=function_args.get('mode', 'generated'),
                        num_queries=function_args.get('num_queries', 3)
                    )
                    
                    if search_result.get("search_performed"):
                        metadata.update(search_result["metadata"])
                        metadata["search_performed"] = True
                        metadata["total_cost"] += search_result["metadata"].get("total_price", 0)
                        
                        return search_result["answer"], metadata
                    else:
                        # Fallback if search failed
                        return f"I tried to search for information but encountered an issue. Based on general knowledge: I'd be happy to help, but I may need more specific information to give you the most accurate answer.", metadata
                        
            else:
                # LLM decided not to search - use the direct response
                logger.info("LLM decided no search needed, providing direct answer")
                metadata["decision_made"] = "direct_answer"
                return response.content, metadata
                
        except Exception as e:
            logger.error(f"Error in process_user_query: {str(e)}")
            metadata["error"] = str(e)
            return f"I encountered an error while processing your question: {str(e)}", metadata

    def chat(self, user_query: str, chat_context: Optional[str] = None, verbose: bool = False) -> str:
        """
        Simple chat interface that handles the query and returns just the answer
        
        Args:
            user_query: The user's question
            chat_context: Optional conversation context
            verbose: Whether to print detailed metadata
            
        Returns:
            The answer string
        """
        answer, metadata = self.process_user_query(user_query, chat_context)
        
        if verbose:
            print(f"\n--- Smart RAG Tool Execution Report ---")
            print(f"User Query: {user_query}")
            print(f"Decision Made: {metadata.get('decision_made', 'unknown')}")
            print(f"Search Performed: {metadata.get('search_performed', False)}")
            print(f"Total Cost: ${metadata.get('total_cost', 0):.4f}")
            
            if metadata.get('search_performed'):
                token_usage = metadata.get('token_usage', {})
                print(f"Total Tokens Used: {token_usage.get('total_tokens', 0)}")
                print(f"Queries Used: {metadata.get('queries_used', [])}")
                print(f"Documents Retrieved: {metadata.get('num_documents_retrieved', 0)}")
            
            print(f"--- End Report ---\n")
        
        return answer


# Usage Example
def create_smart_rag_tool(local_index_path: str, embedding_model) -> SmartRAGTool:
    """Build a SmartRAGTool for the given index path and embedding model, using gpt-4o at temperature 0."""
    default_llm_params = {"temperature": 0, "model": "gpt-4o"}
    return SmartRAGTool(
        local_index_path,
        embedding_model,
        llm_params=default_llm_params,
    )


# Example usage:
    # Initialize the tool
    # smart_rag = create_smart_rag_tool("./faiss_index", your_embedding_model)
    
    # Example questions that would trigger different behaviors:
    
    # This would likely NOT trigger RAG (general knowledge)
    # answer = smart_rag.chat("What is machine learning?", verbose=True)
    # print(f"Answer: {answer}")
    
    # This would likely trigger RAG (specific/domain knowledge)
    # answer = smart_rag.chat("What are the specific implementation details of our authentication system?", verbose=True)
    # print(f"Answer: {answer}")
    
    # This would likely trigger RAG with generated mode (complex query)
    # answer = smart_rag.chat("How does our system handle user permissions and what are the security implications?", verbose=True)
    # print(f"Answer: {answer}")
    

In [2]:
# Build the tool against the index stored under ./data/faiss_index, embedding
# queries with OpenAI's text-embedding-3-large model.
from langchain_openai import OpenAIEmbeddings
smart_rag = create_smart_rag_tool("./data/faiss_index", OpenAIEmbeddings(model="text-embedding-3-large"))

In [None]:
# Domain-specific question; in the recorded run below the model chose the
# knowledge-base search in "original" mode.
response = smart_rag.chat("Какой пороговый уровень ОЗП", verbose=True)
print(response)

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:__main__:LLM decided to use function: search_knowledge_base with args: {'query': 'пороговый уровень ОЗП', 'mode': 'original'}
INFO:__main__:Searching knowledge base with query: 'пороговый уровень ОЗП' in original mode
INFO:faiss.loader:Loading faiss with AVX2 support.
INFO:faiss.loader:Successfully loaded faiss with AVX2 support.
INFO:faiss:Failed to load GPU Faiss: name 'GpuIndexIVFFlat' is not defined. Will not load constructor refs for GPU indexes. This is only an error if you're trying to use GPU Faiss.
INFO:rag_pipeline.rag_fusion_pipeline:Running in original query mode.
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"



--- Smart RAG Tool Execution Report ---
User Query: Какой пороговый уровень ОЗП
Decision Made: search_needed
Search Performed: True
Total Cost: $0.0065
Total Tokens Used: 1720
Queries Used: ['пороговый уровень ОЗП']
Documents Retrieved: 4
--- End Report ---



'Пороговый уровень для оценки знаний педагогов (ОЗП) зависит от квалификационной категории педагога. Согласно предоставленным документам, пороговые уровни следующие:\n\n- Для квалификационной категории «педагог-стажер/педагог» - 50%;\n- Для квалификационной категории «педагог-модератор» - 60%;\n- Для квалификационной категории «педагог-эксперт» - 70%;\n- Для квалификационной категории «педагог-исследователь» - 80%;\n- Для квалификационной категории «педагог-мастер» - 90%.\n\nДля первых руководителей, заместителей руководителя организаций образования и методических кабинетов (центров) пороговый уровень составляет 70%.'

In [30]:
# General-knowledge question; in the recorded run below the model answered
# directly without searching.
smart_rag.chat("Какая столица Франции?", verbose=True)

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:__main__:LLM decided no search needed, providing direct answer



--- Smart RAG Tool Execution Report ---
User Query: Какая столица Франции?
Decision Made: direct_answer
Search Performed: False
Total Cost: $0.0009
--- End Report ---



'Столица Франции — Париж.'

In [3]:
import json
import os
import logging
from typing import Dict, Any, Optional, Tuple, List
from pathlib import Path
import mimetypes
from difflib import SequenceMatcher
import re
from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate
from langchain.callbacks.manager import get_openai_callback

# Same logging setup as the first cell, repeated in this cell.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

class PDFRetrievalTool:
    def __init__(self, 
                 documents_json_path: str,
                 llm_params: Optional[Dict] = None):
        """
        Initialize the PDF Retrieval Tool
        
        Args:
            documents_json_path: Path to JSON file containing document mappings
            llm_params: Parameters for the LLM
        """
        self.documents_json_path = documents_json_path
        self.llm_params = llm_params or {"temperature": 0, "model": "gpt-4o"}
        self.llm = ChatOpenAI(**self.llm_params)
        
        # Load document mappings
        self.document_mappings = self._load_document_mappings()
        
        # Create enhanced search indices
        self.search_index = self._create_search_index()
        
        # Enhanced function definition for OpenAI function calling
        self.function_definition = {
            "name": "retrieve_document",
            "description": "Retrieve a specific document when user asks for any document, file, manual, guide, report, or wants to download/access/get any document",
            "parameters": {
                "type": "object",
                "properties": {
                    "document_name": {
                        "type": "string",
                        "description": "The name or identifier of the document to retrieve"
                    },
                    "search_keywords": {
                        "type": "array",
                        "items": {"type": "string"},
                        "description": "Keywords extracted from user query to help find the document"
                    },
                    "document_type": {
                        "type": "string",
                        "description": "Type of document (manual, guide, report, documentation, etc.)"
                    }
                },
                "required": ["document_name"]
            }
        }
        
        # More aggressive prompt to catch document requests
        self.decision_prompt = ChatPromptTemplate.from_messages([
            ("system", f"""You are a document retrieval assistant. Your job is to identify when users want to access, download, or retrieve ANY document.

AVAILABLE DOCUMENTS:
{self._format_available_documents_detailed()}

ALWAYS use the retrieve_document function when users:
- Ask for ANY document, file, manual, guide, report, or PDF
- Want to "download", "get", "retrieve", "access", "show", "open", or "see" a document
- Ask questions like "do you have...", "can I get...", "where is...", "find..." followed by document-related terms
- Mention specific document types (manual, guide, documentation, report, etc.)
- Use phrases like "I need the...", "show me the...", "give me the..."

EXAMPLES that should trigger retrieve_document:
✅ "Can I get the user manual?"
✅ "I need the API documentation"  
✅ "Do you have the installation guide?"
✅ "Show me the quarterly report"
✅ "Where is the policy document?"
✅ "Find the technical specifications"
✅ "I want to see the handbook"
✅ "Download the reference guide"
✅ "Give me the troubleshooting manual"
✅ "Can you provide the system documentation?"

When calling retrieve_document:
1. Extract the main document name/type from the user's request
2. Include relevant keywords from their query in search_keywords
3. Specify the document_type if identifiable

Be very liberal in detecting document requests - when in doubt, assume they want a document."""),
            ("user", "{user_query}")
        ])

    def _load_document_mappings(self) -> Dict[str, str]:
        """Load document name to path mappings from JSON file"""
        try:
            with open(self.documents_json_path, 'r', encoding='utf-8') as f:
                mappings = json.load(f)
            logger.info(f"Loaded {len(mappings)} document mappings")
            return mappings
        except FileNotFoundError:
            logger.error(f"Document mappings file not found: {self.documents_json_path}")
            return {}
        except json.JSONDecodeError as e:
            logger.error(f"Error parsing document mappings JSON: {e}")
            return {}

    def _create_search_index(self) -> Dict[str, List[str]]:
        """Create enhanced search index with keywords for each document"""
        search_index = {}
        
        for doc_name in self.document_mappings.keys():
            # Extract keywords from document name
            keywords = self._extract_keywords(doc_name)
            search_index[doc_name] = keywords
            
        return search_index

    def _extract_keywords(self, text: str) -> List[str]:
        """Extract searchable keywords from text"""
        # Convert to lowercase and split on common separators
        text = text.lower()
        # Split on various separators and remove empty strings
        words = re.split(r'[_\-\s\.\(\)\[\]]+', text)
        words = [w.strip() for w in words if w.strip()]
        
        # Add common synonyms and variations
        expanded_words = set(words)
        
        # Add synonyms for common terms
        synonyms = {
            'manual': ['guide', 'handbook', 'documentation', 'doc', 'instructions'],
            'guide': ['manual', 'handbook', 'documentation', 'tutorial', 'howto'],
            'api': ['interface', 'endpoint', 'service', 'programming'],
            'install': ['installation', 'setup', 'deployment', 'configure'],
            'user': ['users', 'customer', 'client'],
            'admin': ['administrator', 'administration', 'management'],
            'tech': ['technical', 'technology'],
            'spec': ['specification', 'specifications', 'specs'],
            'ref': ['reference', 'references'],
            'trouble': ['troubleshooting', 'troubleshoot', 'debug', 'problem'],
        }
        
        for word in words:
            if word in synonyms:
                expanded_words.update(synonyms[word])
        
        return list(expanded_words)

    def _format_available_documents_detailed(self) -> str:
        """Format available documents with more detail for better matching"""
        if not self.document_mappings:
            return "No documents currently available."
        
        doc_list = []
        for doc_name, doc_path in self.document_mappings.items():
            keywords = ', '.join(self.search_index.get(doc_name, [])[:5])  # Show first 5 keywords
            status = "✅" if os.path.exists(doc_path) else "❌"
            doc_list.append(f"{status} **{doc_name}**\n   Keywords: {keywords}")
        
        return "\n".join(doc_list)

    def _calculate_similarity(self, text1: str, text2: str) -> float:
        """Calculate similarity between two strings"""
        return SequenceMatcher(None, text1.lower(), text2.lower()).ratio()

    def _find_document_enhanced(self, document_name: str, search_keywords: Optional[List[str]] = None, document_type: Optional[str] = None) -> Optional[Tuple[str, str, float]]:
        """
        Enhanced document finding with multiple strategies
        
        Returns:
            Tuple of (matched_name, file_path, confidence_score) or None
        """
        candidates = []
        
        # Strategy 1: Exact name match
        for doc_name, doc_path in self.document_mappings.items():
            if doc_name.lower() == document_name.lower():
                if os.path.exists(doc_path):
                    return doc_name, doc_path, 1.0
        
        # Strategy 2: High similarity match
        for doc_name, doc_path in self.document_mappings.items():
            similarity = self._calculate_similarity(document_name, doc_name)
            if similarity > 0.8 and os.path.exists(doc_path):
                candidates.append((doc_name, doc_path, similarity))
        
        # Strategy 3: Partial name matching
        document_name_lower = document_name.lower()
        for doc_name, doc_path in self.document_mappings.items():
            doc_name_lower = doc_name.lower()
            if (document_name_lower in doc_name_lower or 
                doc_name_lower in document_name_lower or
                any(word in doc_name_lower for word in document_name_lower.split() if len(word) > 2)):
                if os.path.exists(doc_path):
                    similarity = self._calculate_similarity(document_name, doc_name)
                    candidates.append((doc_name, doc_path, similarity))
        
        # Strategy 4: Keyword matching
        if search_keywords:
            for doc_name, doc_path in self.document_mappings.items():
                doc_keywords = self.search_index.get(doc_name, [])
                keyword_matches = 0
                for keyword in search_keywords:
                    keyword_lower = keyword.lower()
                    for doc_keyword in doc_keywords:
                        if (keyword_lower == doc_keyword or 
                            keyword_lower in doc_keyword or 
                            doc_keyword in keyword_lower):
                            keyword_matches += 1
                            break
                
                if keyword_matches > 0 and os.path.exists(doc_path):
                    # Calculate confidence based on keyword matches
                    confidence = min(0.9, keyword_matches / len(search_keywords) * 0.8)
                    candidates.append((doc_name, doc_path, confidence))
        
        # Strategy 5: Document type matching
        if document_type:
            document_type_lower = document_type.lower()
            for doc_name, doc_path in self.document_mappings.items():
                if document_type_lower in doc_name.lower() and os.path.exists(doc_path):
                    similarity = self._calculate_similarity(document_type, doc_name)
                    candidates.append((doc_name, doc_path, similarity * 0.7))
        
        # Remove duplicates and sort by confidence
        unique_candidates = {}
        for name, path, confidence in candidates:
            if name not in unique_candidates or confidence > unique_candidates[name][2]:
                unique_candidates[name] = (name, path, confidence)
        
        if unique_candidates:
            # Return the best match
            best_match = max(unique_candidates.values(), key=lambda x: x[2])
            return best_match
        
        return None

    def retrieve_document(self, document_name: str, search_keywords: Optional[List[str]] = None, document_type: Optional[str] = None) -> Dict[str, Any]:
        """
        Enhanced document retrieval with better matching
        """
        try:
            result = self._find_document_enhanced(document_name, search_keywords, document_type)
            
            if result is None:
                # Provide better suggestions
                suggestions = self._get_suggestions(document_name, search_keywords, document_type)
                available_docs = list(self.document_mappings.keys())
                
                return {
                    "success": False,
                    "message": f"Document '{document_name}' not found.",
                    "available_documents": available_docs,
                    "suggestions": suggestions,
                    "suggestion_text": f"Did you mean: {', '.join(suggestions[:3])}?" if suggestions else "Please check available documents."
                }
            
            matched_name, file_path, confidence = result
            
            # Get file info
            file_stat = os.stat(file_path)
            file_size = file_stat.st_size
            file_size_mb = file_size / (1024 * 1024)
            
            # Get MIME type
            mime_type, _ = mimetypes.guess_type(file_path)
            
            logger.info(f"Successfully retrieved document: {matched_name} (confidence: {confidence:.2f}, size: {file_size_mb:.2f} MB)")
            
            return {
                "success": True,
                "document_name": matched_name,
                "file_path": file_path,
                "file_size": file_size,
                "file_size_mb": round(file_size_mb, 2),
                "mime_type": mime_type or "application/pdf",
                "confidence": round(confidence, 2),
                "message": f"Document '{matched_name}' is ready for download (confidence: {confidence:.1%}).",
                "download_info": {
                    "filename": os.path.basename(file_path),
                    "extension": Path(file_path).suffix
                }
            }
            
        except Exception as e:
            logger.error(f"Error retrieving document '{document_name}': {str(e)}")
            return {
                "success": False,
                "message": f"Error retrieving document: {str(e)}",
                "error": str(e)
            }

    def _get_suggestions(self, document_name: str, search_keywords: Optional[List[str]] = None, document_type: Optional[str] = None) -> List[str]:
        """Get document suggestions based on partial matches"""
        suggestions = []
        
        # Find partial matches
        for doc_name in self.document_mappings.keys():
            similarity = self._calculate_similarity(document_name, doc_name)
            if similarity > 0.3:  # Lower threshold for suggestions
                suggestions.append((doc_name, similarity))
        
        # Sort by similarity and return top suggestions
        suggestions.sort(key=lambda x: x[1], reverse=True)
        return [name for name, _ in suggestions[:5]]

    def process_user_query(self, user_query: str) -> Tuple[str, Dict[str, Any]]:
        """
        Enhanced query processing with better document detection
        """
        try:
            # Force function calling to be more aggressive
            chain = self.decision_prompt | self.llm.bind(
                functions=[self.function_definition],
                function_call="auto"
            )
            
            metadata = {
                "user_query": user_query,
                "document_requested": False,
                "document_retrieved": False,
                "total_cost": 0.0,
                "available_documents_count": len(self.document_mappings)
            }
            
            with get_openai_callback() as cb:
                response = chain.invoke({"user_query": user_query})
            
            # Track costs
            metadata["decision_cost"] = {
                "total_tokens": cb.total_tokens,
                "prompt_tokens": cb.prompt_tokens,
                "completion_tokens": cb.completion_tokens,
                "total_cost": cb.total_cost
            }
            metadata["total_cost"] += cb.total_cost
            
            # Check for function call in response
            function_call = None
            if hasattr(response, 'additional_kwargs') and 'function_call' in response.additional_kwargs:
                function_call = response.additional_kwargs['function_call']
            elif hasattr(response, 'tool_calls') and response.tool_calls:
                # Handle newer OpenAI API format
                function_call = response.tool_calls[0].function if response.tool_calls else None
            
            if function_call:
                try:
                    function_name = function_call.name if hasattr(function_call, 'name') else function_call['name']
                    function_args = json.loads(function_call.arguments if hasattr(function_call, 'arguments') else function_call['arguments'])
                    
                    logger.info(f"LLM decided to retrieve document: {function_args}")
                    metadata["document_requested"] = True
                    metadata["function_args"] = function_args
                    
                    if function_name == "retrieve_document":
                        # Execute enhanced document retrieval
                        retrieval_result = self.retrieve_document(
                            document_name=function_args.get('document_name', ''),
                            search_keywords=function_args.get('search_keywords', []),
                            document_type=function_args.get('document_type', '')
                        )
                        
                        metadata.update(retrieval_result)
                        metadata["document_retrieved"] = retrieval_result.get("success", False)
                        
                        if retrieval_result.get("success"):
                            doc_info = retrieval_result
                            response_message = (
                                f"✅ **{doc_info['document_name']}** found and ready for download!\n\n"
                                f"📄 **File:** {doc_info['download_info']['filename']}\n"
                                f"📊 **Size:** {doc_info['file_size_mb']} MB\n"
                                f"🎯 **Confidence:** {doc_info['confidence']:.1%}\n"
                                f"📁 **Location:** {doc_info['file_path']}\n\n"
                                f"The document is ready for download from the specified location."
                            )
                        else:
                            response_message = (
                                f"❌ {retrieval_result['message']}\n\n"
                                f"💡 **{retrieval_result.get('suggestion_text', '')}**\n\n"
                                f"📋 **Available documents:** {', '.join(list(self.document_mappings.keys())[:3])}..."
                            )
                        
                        return response_message, metadata
                except Exception as e:
                    logger.error(f"Error processing function call: {e}")
                    metadata["error"] = str(e)
            
            # If no function call, provide helpful response
            response_message = (
                f"I didn't detect a specific document request in your message. "
                f"I have access to {len(self.document_mappings)} documents. "
                f"Try asking something like:\n"
                f"• 'Can I get the [document name]?'\n"
                f"• 'I need the [type] documentation'\n"
                f"• 'Show me the [document] guide'\n\n"
                f"Available documents: {', '.join(list(self.document_mappings.keys())[:3])}..."
            )
            return response_message, metadata
                
        except Exception as e:
            logger.error(f"Error in process_user_query: {str(e)}")
            metadata["error"] = str(e)
            return f"I encountered an error while processing your request: {str(e)}", metadata

    def list_available_documents(self) -> str:
        """Return a formatted list of available documents with enhanced info"""
        if not self.document_mappings:
            return "No documents are currently available."
        
        doc_list = []
        for i, (doc_name, doc_path) in enumerate(self.document_mappings.items(), 1):
            file_exists = "✅" if os.path.exists(doc_path) else "❌"
            
            # Get file size if exists
            size_info = ""
            if os.path.exists(doc_path):
                try:
                    size_mb = os.path.getsize(doc_path) / (1024 * 1024)
                    size_info = f" ({size_mb:.1f} MB)"
                except:
                    size_info = ""
            
            # Get keywords
            keywords = ', '.join(self.search_index.get(doc_name, [])[:3])
            
            doc_list.append(
                f"{i}. {file_exists} **{doc_name}**{size_info}\n"
                f"   🏷️ Keywords: {keywords}"
            )
        
        return f"**Available Documents ({len(self.document_mappings)}):**\n\n" + "\n\n".join(doc_list)

In [4]:
def setup_pdf_tool():
    """Build and return a configured ``PDFRetrievalTool``.

    The catalog path is ``documents.json`` (relative to the current working
    directory) when that file exists; otherwise it is whatever
    ``create_documents_json()`` returns. The tool is created with a
    deterministic (temperature 0) ``gpt-4o`` configuration, and a confirmation
    line is printed once it has been constructed.

    Returns:
        The newly created ``PDFRetrievalTool`` instance.
    """
    default_catalog = "documents.json"
    catalog_path = (
        default_catalog
        if os.path.exists(default_catalog)
        else create_documents_json()
    )

    # No "api_key" entry is passed here; presumably the key is taken from the
    # OPENAI_API_KEY environment variable — confirm before running.
    # "gpt-3.5-turbo" is a cheaper alternative for the "model" value.
    tool = PDFRetrievalTool(
        documents_json_path=catalog_path,
        llm_params={"temperature": 0, "model": "gpt-4o"},
    )

    print("✅ PDF Retrieval Tool initialized")
    return tool

# Create the tool instance that the following demo cell uses via `pdf_tool`.
pdf_tool = setup_pdf_tool()

INFO:__main__:Loaded 16 document mappings


✅ PDF Retrieval Tool initialized


In [5]:
# Separator lines reused throughout the demo output.
banner_rule = "=" * 60
section_rule = "-" * 40

print("\n" + banner_rule)
print("PDF RETRIEVAL TOOL - USAGE EXAMPLES")
print(banner_rule)

# Example 1: render the catalog listing produced by the tool.
print("\n1️⃣ LISTING AVAILABLE DOCUMENTS:")
print(section_rule)
available_docs = pdf_tool.list_available_documents()
print(available_docs)

# Example 2: only the section heading is printed; no requests follow in this cell.
print("\n2️⃣ DOCUMENT REQUEST EXAMPLES:")
print(section_rule)


PDF RETRIEVAL TOOL - USAGE EXAMPLES

1️⃣ LISTING AVAILABLE DOCUMENTS:
----------------------------------------
**Available Documents (16):**

1. ❌ **Біліктілік санатын беруге (растауға) арналған Комиссия отырысының хаттамасы**
   🏷️ Keywords: комиссия, растауға, арналған

2. ❌ **комиссияның толық атауын көрсету аттестаттау комиссиясы отырысының хаттама**
   🏷️ Keywords: хаттама, толық, көрсету

3. ❌ **Педагогтің аттестаттау рәсіміне қатысуға өтініші**
   🏷️ Keywords: өтініші, қатысуға, педагогтің

4. ❌ **Аттестаттаудан өтуге өтінішті қабылдаудан бас тарту туралы хабарлама**
   🏷️ Keywords: өтінішті, тарту, хабарлама

5. ❌ **Біліктілік санатын беру (растау) туралы КУӘЛІК**
   🏷️ Keywords: санатын, біліктілік, растау

6. ❌ **Аттестаттаудан өтуге өтінішті қабылдау туралы хабарлама**
   🏷️ Keywords: өтінішті, хабарлама, туралы

7. ❌ **Педагогтердің білімін бағалаудан өткені туралы сертификат**
   🏷️ Keywords: бағалаудан, өткені, сертификат

8. ❌ **Педагогтердің білімін бағалауды өткізу ер

In [6]:
# Single sample document request written in Russian.
# NOTE(review): presumably input for manual tests of `pdf_tool` — confirm where it is consumed.
test_queries = ["Отправь мне акт о нарушений правил аттестации"]

In [23]:
# Minimal usage: build the tool straight from a catalog file and send it a
# single Russian-language document request.
catalog_path = "/workspaces/chatbot-rag-83/old_ones/documents.json"
user_request = "Отправь пожалуйста Акт нарушения правил и условий проведения оценки знаний педагога"

pdf_tool = PDFRetrievalTool(catalog_path)
response, metadata = pdf_tool.process_user_query(user_request)

INFO:__main__:Loaded 16 document mappings


INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:__main__:LLM decided to retrieve document: {'document_name': 'Акт нарушения правил и условий проведения оценки знаний педагога', 'search_keywords': ['нарушения', 'правил', 'условий', 'проведения', 'оценки', 'знаний', 'педагога'], 'document_type': 'акт'}


In [24]:
# Show the user-facing reply returned by process_user_query.
print(response)

❌ Document 'Акт нарушения правил и условий проведения оценки знаний педагога' not found.

💡 **Did you mean: Акт нарушения правил и условий проведения оценки знаний педагога, Сертификат о прохождении оценки знаний педагога, Заявление на прохождение аттестации педагога?**

📋 **Available documents:** Біліктілік санатын беруге (растауға) арналған Комиссия отырысының хаттамасы, комиссияның толық атауын көрсету аттестаттау комиссиясы отырысының хаттама, Педагогтің аттестаттау рәсіміне қатысуға өтініші...


In [25]:
# Dump the metadata returned alongside the reply by process_user_query.
print(metadata)

{'user_query': 'Отправь пожалуйста Акт нарушения правил и условий проведения оценки знаний педагога', 'document_requested': True, 'document_retrieved': False, 'total_cost': 0.0021000000000000003, 'available_documents_count': 16, 'decision_cost': {'total_tokens': 1148, 'prompt_tokens': 1080, 'completion_tokens': 68, 'total_cost': 0.0021000000000000003}, 'function_args': {'document_name': 'Акт нарушения правил и условий проведения оценки знаний педагога', 'search_keywords': ['нарушения', 'правил', 'условий', 'проведения', 'оценки', 'знаний', 'педагога'], 'document_type': 'акт'}, 'success': False, 'message': "Document 'Акт нарушения правил и условий проведения оценки знаний педагога' not found.", 'available_documents': ['Біліктілік санатын беруге (растауға) арналған Комиссия отырысының хаттамасы', 'комиссияның толық атауын көрсету аттестаттау комиссиясы отырысының хаттама', 'Педагогтің аттестаттау рәсіміне қатысуға өтініші', 'Аттестаттаудан өтуге өтінішті қабылдаудан бас тарту тур

In [1]:
# Sample questions about the attestation rules, in Russian.
example_queries = [
    "Какое решение принимает аттестационная комиссия, если отсутствует один из критериев на заявляемую категорию?",
    "Можно ли сохранить действующую квалификационную категорию руководителю и заместителю руководителя за 4 года до выхода на пенсию?",
    "Обязан ли педагог проходить ОЗП, если у него стаж более 30 лет?",
    "Если в дипломе педагога два предмета, по какому из них он должен проходить аттестацию?",
    "Сохраняется ли квалификационная категория педагога, если он переходит из дошкольной организации в организацию  среднего образования?",
    "Какой пороговый уровень ОЗП?",
    "Сколько баллов нужно педагогу-модератору для прохождения квалификационного теста?",
]

# The same seven questions in Kazakh, listed in the same order.
example_queries_kk = [
    "Аттестациялық комиссия талап етілетін санат бойынша бір критерий болмаған жағдайда қандай шешім қабылдайды?",
    "Зейнеткерлікке шығуына 4 жыл қалғанда басшы мен басшының орынбасарына қолданыстағы біліктілік санатын сақтауға бола ма?",
    "Егер педагогтың еңбек өтілі 30 жылдан асса, ол міндетті ПББ-ден өтуі тиіс пе?",
    "Егер педагогтың дипломында екі пән көрсетілсе, ол қай пән бойынша аттестациядан өтуі керек?",
    "Педагог мектепке дейінгі ұйымнан орта білім беру ұйымына ауысқан жағдайда оның біліктілік санаты сақтала ма?",
    "ПББ үшін өту шегі қандай?",
    "Біліктілік тестінен өту үшін педагог-модератор қанша балл жинауы керек?",
]


In [3]:
# Print the Kazakh sample questions as a dash-prefixed bullet list.
for query in example_queries_kk:
    print("-", query)

- Аттестациялық комиссия талап етілетін санат бойынша бір критерий болмаған жағдайда қандай шешім қабылдайды?
- Зейнеткерлікке шығуына 4 жыл қалғанда басшы мен басшының орынбасарына қолданыстағы біліктілік санатын сақтауға бола ма?
- Егер педагогтың еңбек өтілі 30 жылдан асса, ол міндетті ПББ-ден өтуі тиіс пе?
- Егер педагогтың дипломында екі пән көрсетілсе, ол қай пән бойынша аттестациядан өтуі керек?
- Педагог мектепке дейінгі ұйымнан орта білім беру ұйымына ауысқан жағдайда оның біліктілік санаты сақтала ма?
- ПББ үшін өту шегі қандай?
- Біліктілік тестінен өту үшін педагог-модератор қанша балл жинауы керек?


In [1]:
import json

In [4]:
# Load the document catalog into `data`.
# The encoding is pinned to UTF-8: without it open() falls back to the locale's
# preferred encoding, which can mis-decode non-ASCII (e.g. Cyrillic) document
# names on machines whose locale is not UTF-8.
with open('/workspaces/chatbot-rag-83/documents/documents.json', encoding='utf-8') as f:
    data = json.load(f)

In [8]:
# Empty dict that the re-keying loop fills with the renamed entries.
new_data = {}

In [9]:
# Text appended verbatim to every document name (starts with "№ 83" and ends
# with a trailing space).
ORDER_TITLE_SUFFIX = '№ 83 Об утверждении Правил и условий проведения аттестации педагогических работников и приравненных к ним лиц, занимающих должности в организациях образования, реализующих общеобразовательные учебные программы дошкольного воспитания и обучения, начального, основного среднего и общего среднего образования, образовательные программы технического и профессионального, послесреднего, дополнительного образования и специальные учебные программы, иных гражданских служащих в области образования и науки '

# Copy each entry of `data` into `new_data` under its suffixed name.
# NOTE(review): the suffix is joined with no separator — confirm the keys in
# `data` already end with one, otherwise the name runs straight into "№ 83".
for key, value in data.items():
    new_key = key + ORDER_TITLE_SUFFIX
    new_data[new_key] = value


In [10]:
# Write the re-keyed mapping back over the same file that was loaded into `data`.
# NOTE(review): input and output paths are identical, so re-running the
# load / re-key / save cells appends the suffix to the keys a second time —
# consider writing to a separate file.
with open('/workspaces/chatbot-rag-83/documents/documents.json', 'w') as f:
    json.dump(new_data, f)

'Приложение 14: Лист наблюдения урока (занятия, организованной деятельности, мероприятия) педагога организации среднего (специального), дополнительного, технического и профессионального, послесреднего образования, организаций образования для детей-сирот и детей, оставшихся без попечения родителей'