# AI-Powered Academic Knowledge Assistant
An AI-powered RAG (Retrieval-Augmented Generation) system that transforms document collections into queryable knowledge bases using OpenAI embeddings and vector search. It features configurable chunking, file size limits, and retrieval parameters, with a Gradio interface for processing PDFs and generating context-aware responses via LangChain and ChromaDB.

In [None]:
import os
import glob
from dotenv import load_dotenv
import gradio as gr
import shutil
import tiktoken
import time
import uuid
from typing import List, Tuple, Optional

# imports for langchain and Chroma
from langchain.document_loaders import DirectoryLoader, TextLoader, PyPDFLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.schema import Document
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_chroma import Chroma
from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain
from langchain.embeddings import HuggingFaceEmbeddings

from langchain_community.document_loaders import PyPDFLoader, TextLoader
from langchain.docstore.document import Document

# Load environment variables (override=True is passed so that values from the
# .env file take precedence over variables already present in the environment).
load_dotenv(override=True)
# Re-export the key into os.environ for the OpenAI-backed classes used below.
# NOTE(review): when OPENAI_API_KEY is not configured, the placeholder string
# is stored as the key; replace it or set the variable before making requests.
os.environ['OPENAI_API_KEY'] = os.getenv('OPENAI_API_KEY', 'your-key-if-not-using-env')

# Global variables to store the current setup
# Assigned by process_documents() after the vector store has been built.
current_vectorstore = None
# Assigned by process_documents(); read by chat_with_documents() and reset_conversation().
current_conversation_chain = None
# Not referenced by any function in the visible code.
processing_status = ""

def count_tokens(text: str, model: str = "gpt-4o-mini") -> int:
    """Count the tokens in ``text`` using the tiktoken encoding for ``model``.

    Args:
        text: The text to measure.
        model: Model name used to look up the tiktoken encoding.

    Returns:
        The number of tokens produced by the encoding. If the encoding cannot
        be obtained or applied, a rough estimate of one token per four
        characters is returned instead.
    """
    try:
        encoding = tiktoken.encoding_for_model(model)
        return len(encoding.encode(text))
    except Exception:
        # Best-effort fallback estimation: roughly 4 characters per token.
        # ``Exception`` (rather than a bare ``except``) lets KeyboardInterrupt
        # and SystemExit propagate instead of being silently swallowed.
        return len(text) // 4

def filter_chunks_by_tokens(
    chunks: List[Document],
    max_total_tokens: int = 250000,
    max_chunk_tokens: int = 8000,
) -> List[Document]:
    """Keep a leading run of chunks whose combined token count fits a budget.

    Chunks are visited in order. A chunk whose own token count exceeds
    ``max_chunk_tokens`` is skipped. Iteration stops at the first remaining
    chunk that would push the running total past ``max_total_tokens``, so
    every later chunk is dropped even if it would have fit.

    Args:
        chunks: Documents to filter; each one's ``page_content`` is measured
            with ``count_tokens``.
        max_total_tokens: Upper bound on the summed token count of the result.
        max_chunk_tokens: Upper bound on the token count of a single chunk.
            NOTE(review): the default of 8000 is presumably chosen to stay
            under the embedding model's per-input limit - confirm.

    Returns:
        A new list with the retained chunks, in their original order.
    """
    filtered_chunks = []
    total_tokens = 0

    for chunk in chunks:
        chunk_tokens = count_tokens(chunk.page_content)

        # Oversized individual chunks are skipped without consuming budget
        # (shouldn't happen with proper splitting).
        if chunk_tokens > max_chunk_tokens:
            continue

        # Budget exhausted: stop here rather than hunting for smaller chunks.
        if total_tokens + chunk_tokens > max_total_tokens:
            break

        filtered_chunks.append(chunk)
        total_tokens += chunk_tokens

    return filtered_chunks

def add_metadata(doc, doc_type, file_path):
    """Tag ``doc`` with its category and source-file details, then return it.

    The document's ``metadata`` mapping is updated in place with three keys:
    ``doc_type``, ``file_path`` and ``file_name`` (the base name of
    ``file_path``). The same ``doc`` object is returned.
    """
    meta = doc.metadata
    meta["doc_type"], meta["file_path"] = doc_type, file_path
    meta["file_name"] = os.path.basename(file_path)
    return doc

def check_file_size(file_path, max_size_bytes):
    """Report whether a file fits within ``max_size_bytes``.

    Returns a ``(fits, size_in_bytes)`` pair. When the size cannot be read
    (``OSError``), the pair is ``(False, 0)``.
    """
    try:
        size_in_bytes = os.path.getsize(file_path)
    except OSError:
        return False, 0
    return size_in_bytes <= max_size_bytes, size_in_bytes

def load_pdfs_with_size_limit(folder_path, doc_type, max_size_bytes):
    """Load every PDF under ``folder_path`` that passes the size check.

    The folder is searched recursively for ``*.pdf`` files. Each file within
    the size limit is loaded with ``PyPDFLoader`` and every resulting document
    is tagged via ``add_metadata``. Files that are too large or that fail to
    load are recorded instead of raising.

    Returns:
        A ``(loaded_docs, skipped_files)`` pair, where ``skipped_files`` is a
        list of ``(path, reason)`` tuples.
    """
    pattern = os.path.join(folder_path, "**/*.pdf")
    accepted = []
    rejected = []

    for path in glob.glob(pattern, recursive=True):
        within_limit, size_in_bytes = check_file_size(path, max_size_bytes)

        # Guard clause: record oversized (or unreadable) files and move on.
        if not within_limit:
            size_mb = size_in_bytes / 1024 / 1024
            rejected.append((path, f"File too large: {size_mb:.2f} MB"))
            continue

        try:
            pages = PyPDFLoader(path).load()
            # Build the full tagged list first so a failure adds nothing.
            tagged = [add_metadata(page, doc_type, path) for page in pages]
            accepted.extend(tagged)
        except Exception as e:
            rejected.append((path, f"Loading error: {str(e)}"))

    return accepted, rejected

def process_documents(knowledge_base_dir: str, max_file_size_mb: float, chunk_size: int, chunk_overlap: int) -> Tuple[str, str]:
    """Load PDFs from a folder tree, embed them, and set up the chat chain.

    Each sub-folder of ``knowledge_base_dir`` is treated as one document type
    (its base name becomes ``doc_type``). PDFs inside are loaded subject to the
    size limit, split into chunks, filtered by token budget, and embedded into
    a Chroma store persisted in ``temp_vector_db`` (any existing directory of
    that name is deleted first). On success the module-level
    ``current_vectorstore`` and ``current_conversation_chain`` are replaced.

    Args:
        knowledge_base_dir: Directory whose sub-folders hold the PDFs, or a
            glob pattern ending in ``*`` that selects the folders directly.
        max_file_size_mb: Per-file size limit in megabytes.
        chunk_size: Chunk size passed to ``CharacterTextSplitter``.
        chunk_overlap: Chunk overlap passed to ``CharacterTextSplitter``.

    Returns:
        A ``(status_message, detailed_log)`` pair. On any failure the status
        message describes the problem and the detailed log is ``""``; no
        exception escapes this function.
    """
    global current_vectorstore, current_conversation_chain

    try:
        # Validate directory
        if not knowledge_base_dir or not knowledge_base_dir.strip():
            return "❌ Error: Please enter a directory path!", ""

        directory_path = knowledge_base_dir.strip()
        is_pattern = directory_path.endswith('*')

        # A trailing-wildcard pattern is not itself an existing path, so the
        # existence check applies only to plain directory paths; a pattern is
        # validated below by whether glob matches anything.
        if not is_pattern and not os.path.exists(directory_path):
            return "❌ Error: Directory does not exist! Please check the path.", ""

        # Configuration
        MAX_FILE_SIZE_BYTES = int(max_file_size_mb * 1024 * 1024)

        # Find folders
        if is_pattern:
            folders = glob.glob(directory_path)
        else:
            folders = glob.glob(os.path.join(directory_path, "*"))

        if not folders:
            return "❌ Error: No folders found in the specified directory!", ""

        # Process documents
        documents = []
        all_skipped_files = []
        status_lines = []

        status_lines.append(f"🔍 Processing folders with {max_file_size_mb} MB file size limit...")
        status_lines.append("-" * 60)

        for folder in folders:
            # Plain files matched by the glob are ignored; only folders are scanned.
            if os.path.isdir(folder):
                doc_type = os.path.basename(folder)
                status_lines.append(f"📁 Processing folder: {doc_type}")

                folder_docs, skipped_files = load_pdfs_with_size_limit(folder, doc_type, MAX_FILE_SIZE_BYTES)
                documents.extend(folder_docs)
                all_skipped_files.extend(skipped_files)

                if folder_docs:
                    status_lines.append(f"  ✅ Loaded {len(folder_docs)} document pages")
                if skipped_files:
                    status_lines.append(f"  ⚠️ Skipped {len(skipped_files)} files")

        if not documents:
            error_msg = "❌ No PDF documents were loaded successfully."
            if all_skipped_files:
                error_msg += f"\n\nAll {len(all_skipped_files)} files were skipped:"
                for file_path, reason in all_skipped_files[:10]:  # Show first 10
                    error_msg += f"\n  • {os.path.basename(file_path)}: {reason}"
                if len(all_skipped_files) > 10:
                    error_msg += f"\n  ... and {len(all_skipped_files) - 10} more"
            return error_msg, ""

        # Text splitting
        status_lines.append("\n" + "="*40)
        status_lines.append("✂️ TEXT SPLITTING")
        status_lines.append("="*40)

        text_splitter = CharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
        chunks = text_splitter.split_documents(documents)

        # Filter chunks by token count to prevent API errors
        status_lines.append("🔢 Checking token limits...")
        original_chunk_count = len(chunks)
        chunks = filter_chunks_by_tokens(chunks, max_total_tokens=250000)

        if len(chunks) < original_chunk_count:
            status_lines.append(f"⚠️ Filtered from {original_chunk_count} to {len(chunks)} chunks to stay within token limits")

        # Nothing left to embed: report it here instead of handing an empty
        # list to the embedding / vector-store step.
        if not chunks:
            return "❌ Error: No text chunks available for embedding after splitting and token filtering.", ""

        # Create vectorstore
        status_lines.append("🧮 Creating vector embeddings...")
        embeddings = OpenAIEmbeddings()

        # Use a temporary database name
        db_name = "temp_vector_db"

        # Delete if already exists
        if os.path.exists(db_name):
            shutil.rmtree(db_name)

        # Create vectorstore
        vectorstore = Chroma.from_documents(
            documents=chunks,
            embedding=embeddings,
            persist_directory=db_name
        )

        # Update global variables
        current_vectorstore = vectorstore

        # Create conversation chain
        llm = ChatOpenAI(temperature=0.7, model_name="gpt-4o-mini")
        memory = ConversationBufferMemory(memory_key='chat_history', return_messages=True)
        # Initial k; chat_with_documents() replaces the retriever per request.
        retriever = vectorstore.as_retriever(search_kwargs={"k": 25})
        current_conversation_chain = ConversationalRetrievalChain.from_llm(
            llm=llm,
            retriever=retriever,
            memory=memory
        )

        # Summary statistics
        status_lines.append("\n" + "="*40)
        status_lines.append("📊 SUMMARY")
        status_lines.append("="*40)
        status_lines.append(f"✅ Total PDFs processed: {len(set(doc.metadata['file_path'] for doc in documents))}")
        status_lines.append(f"📄 Total document pages: {len(documents)}")
        status_lines.append(f"🧩 Total text chunks: {len(chunks)}")
        status_lines.append(f"📁 Document types: {', '.join(set(doc.metadata['doc_type'] for doc in documents))}")
        status_lines.append(f"🗃️ Vector store size: {vectorstore._collection.count()} embeddings")

        if all_skipped_files:
            status_lines.append(f"\n⚠️ Skipped files: {len(all_skipped_files)}")
            for file_path, reason in all_skipped_files[:5]:  # Show first 5
                status_lines.append(f"  • {os.path.basename(file_path)}: {reason}")
            if len(all_skipped_files) > 5:
                status_lines.append(f"  ... and {len(all_skipped_files) - 5} more")

        success_msg = "✅ Knowledge base successfully created and ready for questions!"
        detailed_status = "\n".join(status_lines)

        return success_msg, detailed_status

    except Exception as e:
        # UI boundary: surface the failure as a status message instead of raising.
        error_msg = f"❌ Error processing documents: {str(e)}"
        return error_msg, ""

def chat_with_documents(message, history, num_chunks):
    """Answer ``message`` using the conversation chain built from the documents.

    Reads the module-level ``current_conversation_chain`` and
    ``current_vectorstore``. When a vector store is available, the chain's
    retriever is replaced with one that fetches ``num_chunks`` results before
    the question is asked. ``history`` is accepted but not used here.

    Returns:
        The chain's answer text, or a message string when no chain exists yet
        or when generating the response raises.
    """
    chain = current_conversation_chain
    if chain is None:
        return "❌ Please process documents first before asking questions!"

    try:
        store = current_vectorstore
        if store is not None:
            # Rebuild the retriever so the requested chunk count takes effect.
            chain.retriever = store.as_retriever(search_kwargs={"k": num_chunks})

        response = chain.invoke({"question": message})
        return response["answer"]
    except Exception as e:
        return f"❌ Error generating response: {str(e)}"

def reset_conversation():
    """Clear the active chain's memory and return a status message.

    Reads the module-level ``current_conversation_chain``; when none has been
    created yet, nothing is cleared and a notice is returned instead.
    """
    chain = current_conversation_chain
    if chain is None:
        return "No active conversation to reset."

    chain.memory.clear()
    return "✅ Conversation history cleared!"

# Create Gradio Interface
# Two tabs: "Configuration" (choose a folder and processing parameters, then
# build the knowledge base) and "Chat" (ask questions against it).
with gr.Blocks(title="AI-Powered Academic Knowledge Assistant", theme=gr.themes.Soft()) as app:
    gr.Markdown("# 🎓 AI-Powered Academic Knowledge Assistant")
    gr.Markdown("Transform your entire document library into an intelligent, searchable AI tutor that answers questions instantly.")
    
    with gr.Tabs():
        # Configuration Tab
        with gr.Tab("⚙️ Configuration"):
            gr.Markdown("### 📁 Document Processing Settings")
            
            gr.Markdown("💡 **Tip:** Copy and paste your folder path here. On mobile, you can use file manager apps to copy folder paths.")
            
            with gr.Row():
                with gr.Column():
                    knowledge_dir = gr.Textbox(
                        label="Knowledge Base Directory",
                        value=r"C:\Users\Documents\Syllabi\Georgia Tech\Spring 22\Microwave Design",
                        placeholder="Enter or paste your document directory path",
                        lines=1
                    )
                    
                    max_file_size = gr.Slider(
                        label="Max File Size (MB)",
                        minimum=0.5,
                        maximum=50,
                        value=4,
                        step=0.5
                    )
                
                with gr.Column():
                    chunk_size = gr.Slider(
                        label="Chunk Size (characters)",
                        minimum=200,
                        maximum=1500,
                        value=800,
                        step=100,
                        info="Smaller chunks = better token management"
                    )
                    
                    chunk_overlap = gr.Slider(
                        label="Chunk Overlap (characters)",
                        minimum=0,
                        maximum=300,
                        value=150,
                        step=25,
                        info="Overlap preserves context between chunks"
                    )
            
            process_btn = gr.Button("🚀 Process Documents", variant="primary", size="lg")
            
            with gr.Row():
                status_output = gr.Textbox(
                    label="Status",
                    lines=2,
                    max_lines=2
                )
            
            detailed_output = gr.Textbox(
                label="Detailed Processing Log",
                lines=15,
                max_lines=20
            )
        
        # Chat Tab
        with gr.Tab("💬 Chat"):
            gr.Markdown("### 🤖 Ask Questions About Your Documents")
            
            with gr.Row():
                with gr.Column(scale=1):
                    num_chunks = gr.Slider(
                        label="Number of chunks to retrieve",
                        minimum=1,
                        maximum=50,
                        value=25,
                        step=1
                    )
                    
                    reset_btn = gr.Button("🗑️ Clear Chat History", variant="secondary")
                    reset_output = gr.Textbox(label="Reset Status", lines=1)
                
                with gr.Column(scale=3):
                    # The slider is passed as an additional input so that its
                    # current value reaches chat_with_documents on every
                    # message. Reading `num_chunks.value` inside a lambda only
                    # yields the value the component was constructed with.
                    chatbot = gr.ChatInterface(
                        fn=chat_with_documents,
                        additional_inputs=[num_chunks],
                        type="messages",
                        title="Academic Assistant Chat",
                        description="Ask questions about your processed documents"
                    )
    
    # Event handlers
    # Build the knowledge base from the configured folder and parameters.
    process_btn.click(
        fn=process_documents,
        inputs=[knowledge_dir, max_file_size, chunk_size, chunk_overlap],
        outputs=[status_output, detailed_output]
    )
    
    # Clear the conversation chain's memory and show the resulting status.
    reset_btn.click(
        fn=reset_conversation,
        outputs=reset_output
    )


In [None]:
# Start the Gradio app, requesting a shareable link (share=True) and opening
# it in a browser (inbrowser=True).
# NOTE(review): the app reads local directories by user-supplied path; confirm
# that exposing it through a share link is intended.
app.launch(share=True, inbrowser=True)