# In [None]:
import pdfplumber
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
import faiss
from typing import List, Dict, Union
import re
from bs4 import BeautifulSoup
import tabula
import os
import nltk
from nltk.tokenize import sent_tokenize
nltk.download('punkt')

class AdvancedRAGSystem:
    """Retrieval system over the text and tables extracted from PDF files.

    Text is split into sentence-based chunks, tables are converted to
    DataFrames, and every chunk is embedded with a SentenceTransformer model
    and stored in a flat L2 faiss index.  ``chunks`` and ``metadata`` are kept
    in lockstep: ``metadata[i]`` describes ``chunks[i]``.
    """

    # Keywords that mark a query as a comparison when they appear anywhere
    # in the lower-cased query (so "compared" and "differences" also match).
    _COMPARISON_KEYWORDS = ('compare', 'comparison', 'difference', 'versus')
    # 'vs' is only accepted as a whole word; a plain substring test would
    # also fire on words such as "canvas" or "TVs".
    _VS_PATTERN = re.compile(r'\bvs\b')
    # Very common words that would otherwise match almost every table when
    # used as substring search terms.
    _TABLE_STOPWORDS = frozenset({
        'a', 'an', 'and', 'are', 'as', 'at', 'be', 'between', 'by', 'do',
        'does', 'for', 'from', 'how', 'in', 'is', 'it', 'of', 'on', 'or',
        'the', 'to', 'vs', 'was', 'what', 'which', 'with',
    })

    def __init__(self):
        self.embed_model = SentenceTransformer('paraphrase-MiniLM-L6-v2')
        self.chunks = []        # searchable text pieces (text chunks and stringified tables)
        self.embeddings = None  # embedding matrix produced by create_embeddings()
        self.index = None       # faiss index over self.embeddings, None until built
        self.metadata = []      # one {'source', 'page', 'type'} dict per chunk
        self.tables_data = []   # {'table', 'source', 'page'} dicts for every extracted table

    def _add_chunk(self, chunk_text: str, source: str, page, kind: str):
        """Append one chunk together with its metadata entry."""
        self.chunks.append(chunk_text)
        self.metadata.append({
            'source': source,
            'page': page,
            'type': kind
        })

    def process_pdfs(self, pdf_paths: List[str], chunk_size: int = 1000):
        """Extract every PDF in ``pdf_paths`` and rebuild the search index.

        Args:
            pdf_paths: Paths of the PDF files to ingest.
            chunk_size: Approximate number of characters per text chunk.
        """
        for pdf_path in pdf_paths:
            self.extract_and_process_pdf(pdf_path, chunk_size)
        self.create_embeddings()

    def extract_and_process_pdf(self, pdf_path: str, chunk_size: int):
        """Extract text and tables from a single PDF.

        pdfplumber supplies per-page text and tables.  tabula is then run over
        the whole document as a second table extractor; its tables are stored
        in ``tables_data`` only (page recorded as 'N/A') and are not added to
        the searchable chunks.  Failures are reported with ``print`` and do
        not propagate to the caller.
        """
        try:
            # Extract text using pdfplumber
            with pdfplumber.open(pdf_path) as pdf:
                for page_num, page in enumerate(pdf.pages, 1):
                    # Extract text
                    text = page.extract_text()
                    if text:
                        self.process_text(text, chunk_size, pdf_path, page_num)

                    # Extract tables
                    tables = page.extract_tables()
                    if tables:
                        self.process_tables(tables, pdf_path, page_num)

            # Extract tables using tabula; a failure here is only a warning
            # because the pdfplumber results above are already stored.
            try:
                tables = tabula.read_pdf(pdf_path, pages='all')
                for table in tables:
                    self.tables_data.append({
                        'table': table,
                        'source': pdf_path,
                        'page': 'N/A'
                    })
            except Exception as e:
                print(f"Warning: Table extraction with tabula failed: {str(e)}")

        except Exception as e:
            print(f"Error processing PDF {pdf_path}: {str(e)}")

    def process_text(self, text: str, chunk_size: int, source: str, page: int):
        """Split ``text`` into sentence-aligned chunks and store them.

        Sentences are accumulated until their combined length reaches
        ``chunk_size`` characters; a chunk therefore always ends on a sentence
        boundary and may exceed ``chunk_size``.

        Args:
            text: Raw page text.
            chunk_size: Character count at which the current chunk is closed.
            source: Path of the originating PDF.
            page: Page number recorded in the chunk metadata.
        """
        # Collapse all whitespace runs (including newlines) to single spaces
        text = re.sub(r'\s+', ' ', text).strip()

        current_chunk = []
        current_size = 0

        for sentence in sent_tokenize(text):
            sentence = sentence.strip()
            if not sentence:
                continue

            current_chunk.append(sentence)
            current_size += len(sentence)

            if current_size >= chunk_size:
                self._add_chunk(' '.join(current_chunk), source, page, 'text')
                current_chunk = []
                current_size = 0

        # Flush whatever is left below the size threshold
        if current_chunk:
            self._add_chunk(' '.join(current_chunk), source, page, 'text')

    def process_tables(self, tables: List, source: str, page: int):
        """Store extracted tables and make their content searchable.

        Each table is a list of rows whose first row is used as the header.
        Rows and columns that are entirely empty are dropped; tables that end
        up empty are skipped.  A table that cannot be converted is reported
        with a warning and does not stop processing of the remaining tables.
        """
        for table in tables:
            if not table:
                continue
            try:
                # Convert table to DataFrame, first row is the header
                df = pd.DataFrame(table[1:], columns=table[0])
                # Drop fully empty rows, then fully empty columns
                df = df.dropna(how='all').dropna(axis=1, how='all')

                if df.empty:
                    continue

                self.tables_data.append({
                    'table': df,
                    'source': source,
                    'page': page
                })

                # Add table content to chunks for searching
                self._add_chunk(df.to_string(), source, page, 'table')
            except Exception as e:
                print(f"Warning: Failed to process table: {str(e)}")

    def create_embeddings(self):
        """Embed all chunks and (re)build the faiss index from scratch."""
        if not self.chunks:
            print("No chunks to embed!")
            return

        try:
            self.embeddings = self.embed_model.encode(self.chunks)
            dimension = self.embeddings.shape[1]
            self.index = faiss.IndexFlatL2(dimension)
            self.index.add(self.embeddings.astype('float32'))
        except Exception as e:
            print(f"Error creating embeddings: {str(e)}")

    def search(self, query: str, k: int = 3) -> List[Dict]:
        """Return up to ``k`` chunks closest to ``query``.

        Args:
            query: Free-text query.
            k: Maximum number of results.

        Returns:
            A list of ``{'chunk', 'metadata', 'score'}`` dicts ordered as the
            index returned them, where ``score`` is the L2 distance.  The list
            is empty when no index exists, ``k`` is not positive, or the
            search fails.
        """
        if self.index is None or not self.chunks or k <= 0:
            return []

        try:
            # Never ask for more neighbours than there are chunks
            k = min(k, len(self.chunks))
            query_embedding = self.embed_model.encode([query])
            distances, indices = self.index.search(query_embedding.astype('float32'), k)

            results = []
            for distance, idx in zip(distances[0], indices[0]):
                idx = int(idx)
                # faiss pads missing neighbours with label -1; indexing a
                # Python list with -1 would silently return the last chunk.
                if not 0 <= idx < len(self.chunks):
                    continue
                results.append({
                    'chunk': self.chunks[idx],
                    'metadata': self.metadata[idx],
                    'score': float(distance)
                })
            return results
        except Exception as e:
            print(f"Error during search: {str(e)}")
            return []

    def handle_comparison_query(self, query: str) -> str:
        """Build a report with the top text chunks and matching tables for ``query``."""
        relevant_chunks = self.search(query, k=5)
        relevant_tables = self.find_relevant_tables(query)

        parts = ["Comparison Analysis:\n\n"]

        # Process text-based comparisons
        if relevant_chunks:
            parts.append("Text Analysis:\n")
            for chunk in relevant_chunks:
                parts.append(f"- From {chunk['metadata']['source']} (Page {chunk['metadata']['page']}):\n")
                parts.append(f"  {chunk['chunk']}\n\n")

        # Process table-based comparisons
        if relevant_tables:
            parts.append("\nTabular Analysis:\n")
            for table_info in relevant_tables:
                parts.append(f"\nTable from {table_info['source']} (Page {table_info['page']}):\n")
                parts.append(str(table_info['table']))
                parts.append("\n")

        return ''.join(parts)

    def find_relevant_tables(self, query: str) -> List[Dict]:
        """Return the stored tables whose text contains a meaningful query term.

        The query is lower-cased and split into word tokens (punctuation is
        discarded).  Single-character tokens and common stop words are ignored
        so that words like "a" or "the" do not match every table.  A table is
        relevant when any remaining term occurs as a substring of its
        lower-cased string form.
        """
        query_terms = {
            term for term in re.findall(r'\w+', query.lower())
            if len(term) > 1 and term not in self._TABLE_STOPWORDS
        }
        if not query_terms:
            return []

        relevant_tables = []
        for table_info in self.tables_data:
            table_str = str(table_info['table']).lower()
            if any(term in table_str for term in query_terms):
                relevant_tables.append(table_info)

        return relevant_tables

    def get_answer(self, query: str) -> str:
        """Answer ``query`` from the indexed documents.

        Comparison-style queries are routed to ``handle_comparison_query``;
        all other queries return the top retrieved chunks with their source
        and page.  Any exception is converted into an error message string.
        """
        try:
            lowered = query.lower()
            is_comparison = (
                any(word in lowered for word in self._COMPARISON_KEYWORDS)
                or self._VS_PATTERN.search(lowered) is not None
            )
            if is_comparison:
                return self.handle_comparison_query(query)

            # Regular query processing
            relevant_chunks = self.search(query)
            if not relevant_chunks:
                return "I couldn't find relevant information to answer your question."

            parts = ["Based on the retrieved information:\n\n"]
            for chunk in relevant_chunks:
                parts.append(f"From {chunk['metadata']['source']} (Page {chunk['metadata']['page']}):\n")
                parts.append(f"{chunk['chunk']}\n\n")

            return ''.join(parts)

        except Exception as e:
            return f"Error generating answer: {str(e)}"

def validate_pdf_path(path: str) -> bool:
    """Check that ``path`` exists on disk and carries a ``.pdf`` extension.

    The extension comparison is case-insensitive.  A message describing the
    first failed check is printed, and ``False`` is returned in that case.
    """
    exists = os.path.exists(path)
    if exists and path.lower().endswith('.pdf'):
        return True

    # Existence is reported first; the extension is only judged for real paths.
    if not exists:
        print(f"Error: File does not exist: {path}")
    else:
        print(f"Error: File is not a PDF: {path}")
    return False

def get_pdf_paths() -> List[str]:
    """Interactively collect PDF paths until the user types 'done'.

    Each entered path is checked with ``validate_pdf_path``; invalid paths are
    rejected with a message.  'done' is only accepted once at least one valid
    path has been collected.

    Returns:
        The accepted paths in the order they were entered.
    """
    collected = []
    finished = False

    while not finished:
        print("\nEnter PDF file path (or 'done' when finished):")
        entry = input().strip()

        if entry.lower() != 'done':
            if validate_pdf_path(entry):
                collected.append(entry)
                print(f"Added: {entry}")
                print("Enter another PDF path or type 'done' to proceed")
            else:
                print("Please enter a valid PDF file path")
        elif collected:
            finished = True
        else:
            # 'done' with nothing collected: keep prompting
            print("Please enter at least one PDF path.")

    return collected

def main():
    """Run the interactive console session.

    Collects PDF paths, ingests them into an ``AdvancedRAGSystem``, then loops
    reading input: 'quit' exits, 'list' shows the processed files, 'add'
    ingests further PDFs, and anything else is treated as a question.
    """
    for line in (
        "Welcome to the Advanced RAG System!",
        "This system will help you analyze and query multiple PDF documents.",
        # Get PDF paths from user
        "\nFirst, let's specify the PDF files you want to analyze.",
    ):
        print(line)
    pdf_paths = get_pdf_paths()

    # Initialize the system
    print("\nInitializing the RAG system...")
    rag_system = AdvancedRAGSystem()

    # Process PDFs
    print("\nProcessing PDF files...")
    rag_system.process_pdfs(pdf_paths)
    print("PDF processing completed!")

    # Interactive query loop
    for line in (
        "\nYou can now ask questions about your PDFs!",
        "Available commands:",
        "- Type your question to query the documents",
        "- Type 'list' to show processed PDF files",
        "- Type 'add' to add more PDF files",
        "- Type 'quit' to exit",
    ):
        print(line)

    while True:
        print("\nEnter your question or command: ")
        query = input().strip()
        command = query.lower()

        if command == 'quit':
            print("Thank you for using the RAG system. Goodbye!")
            break

        if command == 'list':
            print("\nProcessed PDF files:")
            for number, processed in enumerate(pdf_paths, 1):
                print(f"{number}. {processed}")
        elif command == 'add':
            extra_paths = get_pdf_paths()
            rag_system.process_pdfs(extra_paths)
            pdf_paths.extend(extra_paths)
            print("New PDFs added and processed!")
        else:
            # Anything that is not a command is a question for the system
            try:
                answer = rag_system.get_answer(query)
                print("\nAnswer:", answer)
            except Exception as exc:
                print(f"\nError processing query: {exc!s}")
                print("Please try a different question or check if the PDFs were processed correctly.")

# Script entry point: run the console session and turn Ctrl+C or any
# unexpected failure into a short message instead of a traceback.
if __name__ == "__main__":
    try:
        main()
    except KeyboardInterrupt:
        print("\nProgram terminated by user. Goodbye!")
    except Exception as exc:
        print(f"\nAn unexpected error occurred: {exc!s}")
