# In [None]:
import json
import math
import pickle
import os
from typing import List, Dict, Tuple, Optional
import numpy as np

# Optional third-party dependencies. All three imports share one ``try`` block,
# so a single missing package switches ADVANCED_MODE off for every feature.
try:
    import faiss  # vector index used for embedding similarity search
    import PyPDF2  # PDF text extraction
    from sentence_transformers import SentenceTransformer  # sentence embedding model
    ADVANCED_MODE = True
    print("Advanced RAG mode enabled with Faiss, PDF processing, and embeddings")
except ImportError:
    # Module-level flag read later in this file to decide whether the
    # embedding and PDF code paths may be used.
    ADVANCED_MODE = False
    print("Basic mode - install faiss-cpu, PyPDF2, sentence-transformers for advanced features")

class EnhancedRAGAssistant:
    def __init__(self, use_embeddings: bool = True):
        """Initialize assistant with enhanced RAG capabilities"""
        self.use_embeddings = use_embeddings and ADVANCED_MODE
        self.knowledge_base = []
        self.conversation_history = []
        
        # NEW: Embedding Creation & Domain-Specific Embeddings
        if self.use_embeddings:
            print("Loading domain-specific mental health embeddings model...")
            self.embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
            self.embedding_dim = 384
            
            # NEW: Faiss Vector Database
            self.vector_index = faiss.IndexFlatIP(self.embedding_dim)  # Inner product for similarity
            self.doc_embeddings = []
            
        self.cache_file = "mental_health_embeddings_cache.pkl"
        self.vector_cache_file = "mental_health_vector_cache.faiss"
        
        self.create_knowledge_base()
        
        self.load_cached_embeddings()
        
        print(f"Mental Health Assistant initialized with {len(self.knowledge_base)} documents")
        if self.use_embeddings:
            print(f"Vector database contains {self.vector_index.ntotal} embeddings")

    def create_knowledge_base(self) -> None:
        """Create enhanced knowledge base with mental health information"""
        # Original knowledge base + enhanced content
        knowledge_docs = [
            {
                "id": "anxiety_basics",
                "content": "anxiety worry panic breathing relaxation mindfulness exercise sleep routine calm techniques deep breath meditation progressive muscle",
                "full_text": "For anxiety: Practice deep breathing (4-7-8 technique), try progressive muscle relaxation, use mindfulness meditation, maintain regular exercise, keep consistent sleep schedule, limit caffeine, and stay connected with support system.",
                "category": "anxiety"
            },
            {
                "id": "anxiety_advanced",
                "content": "anxiety cognitive behavioral therapy CBT exposure therapy grounding techniques 5-4-3-2-1 method panic attacks agoraphobia social anxiety generalized anxiety disorder GAD",
                "full_text": "Advanced anxiety management: Cognitive Behavioral Therapy (CBT) helps identify and change negative thought patterns. Exposure therapy gradually reduces avoidance behaviors. Use grounding techniques like 5-4-3-2-1 method (5 things you see, 4 you touch, 3 you hear, 2 you smell, 1 you taste). Different types include panic disorder, social anxiety, and GAD.",
                "category": "anxiety"
            },
            {
                "id": "depression_basics",
                "content": "depression sad hopeless routine goals exercise sunlight sleep self-care activities support connection therapy medication",
                "full_text": "For depression: Maintain daily routine, set small achievable goals, stay physically active, get sunlight exposure, practice self-compassion, stay connected with others, engage in previously enjoyed activities. Professional therapy and medication are highly effective.",
                "category": "depression"
            },
            {
                "id": "depression_advanced",
                "content": "depression major depressive disorder MDD seasonal affective disorder SAD bipolar disorder therapy antidepressants SSRI cognitive therapy behavioral activation",
                "full_text": "Depression types and treatments: Major Depressive Disorder (MDD) affects mood, energy, and daily functioning. Seasonal Affective Disorder (SAD) occurs during darker months. Treatment includes therapy (cognitive, behavioral activation), medications (SSRIs, SNRIs), and lifestyle changes. Bipolar disorder requires specialized treatment.",
                "category": "depression"
            },
            {
                "id": "stress_management",
                "content": "stress overwhelmed pressure time management relaxation exercise healthy lifestyle social support hobbies professional help resilience",
                "full_text": "For stress: Use time management techniques, practice relaxation methods, engage in regular physical activity, maintain healthy lifestyle, seek social support, pursue enjoyable hobbies, consider professional counseling, build resilience through positive thinking.",
                "category": "stress"
            },
            {
                "id": "crisis_resources",
                "content": "crisis suicide self-harm emergency help 988 lifeline professional immediate support therapy treatment",
                "full_text": "Crisis resources: National Suicide Prevention Lifeline 988, Crisis Text Line (text HOME to 741741), Emergency Services 911. If having thoughts of self-harm, seek immediate professional help. Treatment is effective and help is available.",
                "category": "crisis"
            },
            {
                "id": "trauma_ptsd",
                "content": "trauma PTSD post-traumatic stress disorder flashbacks nightmares avoidance hypervigilance EMDR therapy exposure therapy",
                "full_text": "Trauma and PTSD: Post-Traumatic Stress Disorder can develop after experiencing or witnessing traumatic events. Symptoms include flashbacks, nightmares, avoidance, and hypervigilance. Effective treatments include EMDR (Eye Movement Desensitization and Reprocessing), trauma-focused CBT, and exposure therapy.",
                "category": "trauma"
            },
            {
                "id": "mindfulness_meditation",
                "content": "mindfulness meditation present moment awareness breathing exercises body scan loving-kindness meditation apps headspace calm",
                "full_text": "Mindfulness and meditation: Practice present-moment awareness through breathing exercises, body scans, and loving-kindness meditation. Regular practice reduces stress, anxiety, and depression. Use apps like Headspace, Calm, or Insight Timer for guided sessions. Start with 5-10 minutes daily.",
                "category": "mindfulness"
            }
        ]
        
        self.knowledge_base = knowledge_docs

    # NEW: Document Ingestion
    def ingest_document(self, text: str, doc_id: str, category: str = "general") -> None:
        """Ingest a new document into the knowledge base"""
        # NEW: Chunking Strategies
        chunks = self.chunk_document(text, chunk_size=200, overlap=50)
        
        for i, chunk in enumerate(chunks):
            doc = {
                "id": f"{doc_id}_chunk_{i}",
                "content": chunk.lower(),
                "full_text": chunk,
                "category": category,
                "source": doc_id
            }
            self.knowledge_base.append(doc)
            
            # NEW: Create embeddings for new document
            if self.use_embeddings:
                embedding = self.embedding_model.encode([chunk])[0]
                self.doc_embeddings.append(embedding)
                self.vector_index.add(np.array([embedding], dtype=np.float32))
        
        print(f"Ingested document '{doc_id}' as {len(chunks)} chunks")

    # NEW: Chunking Strategies
    def chunk_document(self, text: str, chunk_size: int = 200, overlap: int = 50) -> List[str]:
        """Divide document into overlapping chunks for better retrieval"""
        words = text.split()
        chunks = []
        
        for i in range(0, len(words), chunk_size - overlap):
            chunk_words = words[i:i + chunk_size]
            chunk = ' '.join(chunk_words)
            if len(chunk.strip()) > 0:
                chunks.append(chunk)
                
        return chunks

    # NEW: PDF Upload and Processing
    def upload_pdf(self, pdf_path: str, doc_id: Optional[str] = None) -> None:
        """Extract the text of a PDF and ingest it under category ``pdf_upload``.

        Failures are reported with ``print`` rather than raised, so one bad
        file does not abort an interactive session.

        Args:
            pdf_path: Path of the PDF file to read.
            doc_id: Identifier for the ingested document. Defaults to the file
                name with a trailing ``.pdf`` extension (any case) removed.
        """
        if not ADVANCED_MODE:
            print("PDF processing requires PyPDF2. Install with: pip install PyPDF2")
            return

        if doc_id is None:
            base_name = os.path.basename(pdf_path)
            stem, ext = os.path.splitext(base_name)
            # Strip only a real trailing extension; ``str.replace`` also
            # removed ".pdf" in the middle of a name and missed ".PDF".
            doc_id = stem if ext.lower() == '.pdf' else base_name

        try:
            with open(pdf_path, 'rb') as file:
                pdf_reader = PyPDF2.PdfReader(file)
                # Guard against pages with no extractable text, where
                # extract_text() may yield None instead of a string.
                page_texts = [page.extract_text() or "" for page in pdf_reader.pages]

            text = "".join(page_text + "\n" for page_text in page_texts)
            if not text.strip():
                print(f"No extractable text found in PDF: {pdf_path}")
                return

            self.ingest_document(text, doc_id, "pdf_upload")
            print(f"Successfully processed PDF: {pdf_path}")

        except Exception as e:
            # Deliberately broad: this is the user-facing boundary for file
            # and parser errors.
            print(f"Error processing PDF {pdf_path}: {str(e)}")

    # NEW: Embedding Creation
    def create_embeddings(self) -> None:
        """Create embeddings for all documents in knowledge base"""
        if not self.use_embeddings:
            return
            
        print("Creating embeddings for knowledge base...")
        self.doc_embeddings = []
        
        # Clear existing index
        self.vector_index = faiss.IndexFlatIP(self.embedding_dim)
        
        for doc in self.knowledge_base:
            embedding = self.embedding_model.encode([doc["full_text"]])[0]
            self.doc_embeddings.append(embedding)
            
        # Add all embeddings to Faiss index
        if self.doc_embeddings:
            embeddings_array = np.array(self.doc_embeddings, dtype=np.float32)
            # Normalize for cosine similarity
            faiss.normalize_L2(embeddings_array)
            self.vector_index.add(embeddings_array)
            
        print(f"Created {len(self.doc_embeddings)} embeddings")

    # NEW: Avoid Re-embedding / Cache Embeddings
    def save_cached_embeddings(self) -> None:
        """Save embeddings to cache for faster loading"""
        if not self.use_embeddings:
            return
            
        try:
            # Save document embeddings and metadata
            cache_data = {
                'doc_embeddings': self.doc_embeddings,
                'knowledge_base': self.knowledge_base
            }
            with open(self.cache_file, 'wb') as f:
                pickle.dump(cache_data, f)
                
            # Save Faiss index
            faiss.write_index(self.vector_index, self.vector_cache_file)
            print("Embeddings cached successfully")
            
        except Exception as e:
            print(f"Error saving cache: {str(e)}")

    # NEW: Load cached embeddings
    def load_cached_embeddings(self) -> None:
        """Load embeddings from cache to avoid re-computation"""
        if not self.use_embeddings:
            return
            
        try:
            if os.path.exists(self.cache_file) and os.path.exists(self.vector_cache_file):
                # Load cached data
                with open(self.cache_file, 'rb') as f:
                    cache_data = pickle.load(f)
                    
                # Only use cache if knowledge base matches
                if len(cache_data['knowledge_base']) == len(self.knowledge_base):
                    self.doc_embeddings = cache_data['doc_embeddings']
                    self.vector_index = faiss.read_index(self.vector_cache_file)
                    print("Loaded embeddings from cache")
                    return
                    
        except Exception as e:
            print(f"Cache loading failed: {str(e)}")
            
        # Create new embeddings if cache loading failed
        self.create_embeddings()
        self.save_cached_embeddings()

    # NEW: Enhanced similarity calculation with embeddings
    def calculate_similarity_embeddings(self, query: str, top_k: int = 3) -> List[Tuple[float, Dict]]:
        """Calculate semantic similarity using embeddings"""
        if not self.use_embeddings:
            return []
            
        # Create query embedding
        query_embedding = self.embedding_model.encode([query])
        query_embedding = np.array(query_embedding, dtype=np.float32)
        faiss.normalize_L2(query_embedding)
        
        # Search in vector database
        similarities, indices = self.vector_index.search(query_embedding, top_k)
        
        results = []
        for i, (similarity, idx) in enumerate(zip(similarities[0], indices[0])):
            if idx < len(self.knowledge_base):
                results.append((float(similarity), self.knowledge_base[idx]))
                
        return results

    # Original tokenization method (kept for fallback)
    def simple_tokenize(self, text: str) -> List[str]:
        """Basic tokenization - split text into words"""
        return text.lower().replace(',', '').replace('.', '').split()

    # Word-overlap (Jaccard) similarity used by the non-embedding fallback search
    def calculate_similarity(self, query_words: List[str], doc_words: List[str]) -> float:
        """Calculate simple word overlap similarity (fallback method)"""
        query_set = set(query_words)
        doc_set = set(doc_words)
        
        if not query_set or not doc_set:
            return 0.0
            
        intersection = len(query_set.intersection(doc_set))
        union = len(query_set.union(doc_set))
        
        return intersection / union if union > 0 else 0.0

    # NEW: Semantic Similarity (word variants)
    def search_knowledge_base(self, query: str, top_k: int = 3) -> List[Dict]:
        """Enhanced search with semantic similarity"""
        if self.use_embeddings:
            # Use embedding-based semantic search
            results = self.calculate_similarity_embeddings(query, top_k)
            return [doc for score, doc in results if score > 0.3]  # Threshold for relevance
        else:
            # Fallback to original word-based search
            query_words = self.simple_tokenize(query)
            scored_docs = []
            
            for doc in self.knowledge_base:
                doc_words = self.simple_tokenize(doc["content"])
                similarity = self.calculate_similarity(query_words, doc_words)
                scored_docs.append((similarity, doc))
                
            scored_docs.sort(key=lambda x: x[0], reverse=True)
            return [doc for score, doc in scored_docs[:top_k] if score > 0]

    def generate_response(self, user_input: str) -> str:
        """Generate response using enhanced RAG approach"""
        # Retrieve relevant documents using enhanced search
        relevant_docs = self.search_knowledge_base(user_input, top_k=3)
        
        # Create context from retrieved documents
        context = ""
        for doc in relevant_docs:
            context += doc["full_text"] + " "
            
        # Generate response based on input and context
        return self.create_response(user_input, context.strip(), relevant_docs)

    def create_response(self, user_input: str, context: str, relevant_docs: List[Dict]) -> str:
        """Create enhanced response with document sources"""
        user_lower = user_input.lower()
        
        # Enhanced response generation with category awareness
        response = ""
        categories_found = set()
        
        if relevant_docs:
            for doc in relevant_docs:
                categories_found.add(doc.get("category", "general"))
        
        # Detect primary concern and provide targeted response
        if any(word in user_lower for word in ['anxiety', 'anxious', 'worried', 'panic']):
            response = "I understand you're experiencing anxiety. "
            if 'anxiety' in categories_found:
                response += context
            response += "\n\nWhat specific situations trigger your anxiety? Talking through them can help."
            
        elif any(word in user_lower for word in ['depression', 'depressed', 'sad', 'hopeless']):
            response = "I hear that you're going through a difficult time. "
            if 'depression' in categories_found:
                response += context
            response += "\n\nHow long have you been feeling this way? Remember that seeking help shows strength."
            
        elif any(word in user_lower for word in ['stress', 'stressed', 'overwhelmed']):
            response = "Stress can feel overwhelming, but there are ways to manage it. "
            if 'stress' in categories_found:
                response += context
            response += "\n\nWhat's the main source of stress in your life right now?"
            
        elif any(word in user_lower for word in ['trauma', 'ptsd', 'flashback']):
            response = "Trauma can have lasting effects, but healing is possible. "
            if 'trauma' in categories_found:
                response += context
            response += "\n\nTrauma therapy with qualified professionals can be very effective."
            
        elif any(word in user_lower for word in ['crisis', 'suicide', 'self-harm']):
            response = "I'm concerned about you. Please reach out for immediate support: "
            if 'crisis' in categories_found:
                response += context
            response += "\n\nYour life has value. Please contact these resources right away."
            
        else:
            if context:
                response = "I'm here to help with mental health support. " + context
            else:
                response = "I'm here to support you with mental health concerns like anxiety, depression, stress management, and crisis resources."
            response += "\n\nWhat's on your mind today? How can I best support you?"
        
        # Add source information if using advanced mode
        if self.use_embeddings and relevant_docs:
            sources = set([doc.get('source', doc['id']) for doc in relevant_docs])
            if len(sources) > 1:
                response += f"\n\n(Information from: {', '.join(list(sources)[:3])})"
                
        return response

    def chat(self, user_input: str) -> str:
        """Main chat function with enhanced capabilities"""
        # Add to conversation history
        self.conversation_history.append({"role": "user", "content": user_input})
        
        # Generate response using enhanced RAG
        response = self.generate_response(user_input)
        
        # Add response to history
        self.conversation_history.append({"role": "assistant", "content": response})
        
        # Keep history manageable
        if len(self.conversation_history) > 10:
            self.conversation_history = self.conversation_history[-10:]
            
        return response

    def get_stats(self) -> Dict:
        """Get system statistics"""
        stats = {
            "Total Documents": len(self.knowledge_base),
            "Conversation Length": len(self.conversation_history),
            "Advanced Mode": self.use_embeddings,
        }
        
        if self.use_embeddings:
            stats["Vector Database Size"] = self.vector_index.ntotal
            stats["Embedding Dimension"] = self.embedding_dim
            
        return stats

    # NEW: Batch document processing
    def batch_ingest_documents(self, documents: List[Dict]) -> None:
        """Ingest multiple documents at once"""
        for doc_info in documents:
            self.ingest_document(
                doc_info['text'], 
                doc_info['id'], 
                doc_info.get('category', 'general')
            )
        
        # Update cache after batch processing
        if self.use_embeddings:
            self.save_cached_embeddings()

def demo_enhanced_system():
    """Run a scripted walkthrough of the assistant and print the results.

    Prints the document count per category, the categories retrieved for a
    few sample queries, the first 150 characters of one chat reply, and the
    system statistics.
    """
    print("ENHANCED RAG SYSTEM DEMONSTRATION")
    print("=" * 50)

    assistant = EnhancedRAGAssistant()

    # Part 1: documents per category, in first-seen order.
    print("Knowledge Base Topics:")
    topic_counts = {}
    for doc in assistant.knowledge_base:
        topic = doc.get('category', 'general')
        topic_counts[topic] = topic_counts.get(topic, 0) + 1

    for topic, count in topic_counts.items():
        print(f"- {topic}: {count} documents")

    # Part 2: retrieval on sample queries.
    print("\nEnhanced Search Demonstration:")
    sample_queries = (
        "I feel anxious about social situations",
        "I'm having trouble sleeping and feel sad",
        "work is overwhelming me",
        "I keep having flashbacks",
    )
    for query in sample_queries:
        hits = assistant.search_knowledge_base(query, top_k=2)
        found = [doc.get('category', 'unknown') for doc in hits]
        print(f"Query: '{query}' -> Found: {found}")

    # Part 3: full retrieve-and-respond pipeline.
    print("\nEnhanced RAG Pipeline Test:")
    test_input = "I've been having panic attacks and can't sleep"
    response = assistant.chat(test_input)
    print(f"Input: {test_input}")
    print(f"Response: {response[:150]}...")

    # Part 4: statistics.
    print("\nSystem Statistics:")
    for key, value in assistant.get_stats().items():
        print(f"{key}: {value}")

def interactive_enhanced_chat():
    """Run a console chat loop with the assistant.

    Commands:
        ``quit`` / ``exit``   leave the loop
        ``stats``             print system statistics
        ``upload <pdf_path>`` ingest a PDF file
        ``ingest <text>``     add the given text as a custom document

    Anything else that is not empty is sent to the assistant. The loop ends
    cleanly on Ctrl-C and on end of input (closed or piped stdin), which
    previously raised an unhandled ``EOFError``.
    """
    print("\nENHANCED MENTAL HEALTH ASSISTANT")
    print("=" * 40)
    print("Commands: 'quit' to exit, 'stats' for info, 'upload <pdf_path>' for PDF")
    print("'ingest <text>' to add custom content")

    assistant = EnhancedRAGAssistant()

    while True:
        try:
            user_input = input("\nYou: ").strip()
            command = user_input.lower()

            if command in ['quit', 'exit']:
                print("Take care of yourself!")
                break

            if command == 'stats':
                stats = assistant.get_stats()
                for key, value in stats.items():
                    print(f"{key}: {value}")
                continue

            # PDF upload: everything after the 7-character "upload " prefix
            # is the path.
            if command.startswith('upload '):
                pdf_path = user_input[7:].strip()
                assistant.upload_pdf(pdf_path)
                continue

            # Custom text: the id is derived from the current document count.
            if command.startswith('ingest '):
                text = user_input[7:].strip()
                doc_id = f"custom_{len(assistant.knowledge_base)}"
                assistant.ingest_document(text, doc_id, "custom")
                print(f"Added custom document: {doc_id}")
                continue

            if user_input:
                response = assistant.chat(user_input)
                print(f"\nAssistant: {response}")

        except (KeyboardInterrupt, EOFError):
            # EOFError: input() hit end of file, e.g. Ctrl-D or piped stdin.
            print("\nGoodbye!")
            break

# For Jupyter notebook
def start_enhanced_assistant():
    """Create an assistant and return helper functions bound to it.

    Returns:
        A dict with the callables ``chat``, ``upload_pdf``, ``add_document``
        and ``get_stats``, all operating on the same assistant instance.
    """
    assistant = EnhancedRAGAssistant()

    def get_stats():
        return assistant.get_stats()

    def add_document(text, doc_id, category="custom"):
        return assistant.ingest_document(text, doc_id, category)

    def upload_pdf(pdf_path):
        return assistant.upload_pdf(pdf_path)

    def chat(message):
        return assistant.chat(message)

    # Key order matches the function list printed below.
    helpers = dict(
        chat=chat,
        upload_pdf=upload_pdf,
        add_document=add_document,
        get_stats=get_stats,
    )

    print("Enhanced Assistant ready!")
    print("Functions: chat('message'), upload_pdf('path'), add_document('text', 'id'), get_stats()")

    return helpers

# Main execution
if __name__ == "__main__":
    # When run as a script: scripted demo first, then the interactive chat loop.
    demo_enhanced_system()
    interactive_enhanced_chat()
