In [2]:
# ============================================
# COMPLETE STUDYMATE APPLICATION FOR COLAB
# ============================================

# Step 1: Install dependencies (run once)
!pip install -q transformers torch gradio PyPDF2 sentence-transformers faiss-cpu accelerate

# Step 2: Run the application
# The full StudyMate code is in the next cell; run it after the install above finishes.

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 kB[0m [31m7.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.6/23.6 MB[0m [31m106.0 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
"""
StudyMate: AI-Powered Academic Assistant
Complete implementation with all features in separate tabs
"""

import gradio as gr
import PyPDF2
import numpy as np
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
from sentence_transformers import SentenceTransformer
import faiss
import re
from typing import List, Tuple, Dict
import io

# ==================== MODEL INITIALIZATION ====================
# These statements run when the cell/module is executed, before any UI is
# built. They define the module-level names `tokenizer`, `model` and
# `embedding_model` that the query functions below rely on.
print("Loading AI models... This may take a few minutes.")

# Load IBM Granite model for Q&A
# The tokenizer and the causal LM are loaded from the same checkpoint id.
tokenizer = AutoTokenizer.from_pretrained("ibm-granite/granite-3.3-2b-instruct")
model = AutoModelForCausalLM.from_pretrained(
    "ibm-granite/granite-3.3-2b-instruct",
    # float16 when CUDA is available, float32 otherwise.
    # NOTE(review): the recorded run output prints
    # "`torch_dtype` is deprecated! Use `dtype` instead!" -- confirm the
    # installed transformers version before renaming this keyword.
    torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
    # "auto" device placement is requested only when CUDA is available;
    # otherwise no device map is passed.
    device_map="auto" if torch.cuda.is_available() else None
)

# Load embedding model for semantic search
# Used both to embed extracted pages and to embed search queries.
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')

print("Models loaded successfully!")

# ==================== GLOBAL STATE ====================
class DocumentStore:
    """In-memory holder for extracted pages, their embeddings and the index.

    All three attributes are public and are assigned directly by the
    processing functions in this file.
    """

    def __init__(self):
        # A new store starts in exactly the state that clear() produces.
        self.clear()

    def clear(self):
        """Reset the store by rebinding every attribute to its empty value."""
        self.documents = []  # list of (filename, text, page_num) tuples
        self.embeddings = None
        self.index = None


# Single shared store used by every tab of the application.
doc_store = DocumentStore()

# ==================== PDF PROCESSING FUNCTIONS ====================
def extract_text_from_pdf(pdf_file) -> List[Tuple[str, str, int]]:
    """Extract the text of a PDF, one entry per non-empty page.

    Args:
        pdf_file: Anything ``PyPDF2.PdfReader`` accepts. When the object has
            a non-empty ``name`` attribute, that value labels the pages;
            otherwise ``str(pdf_file)`` is used, so plain path strings work.

    Returns:
        A list of ``(source_name, page_text, page_number)`` tuples with
        1-based page numbers. Pages without extractable text are skipped.
        If the file cannot be read, the error is printed and an empty list
        is returned.
    """
    # Resolve the label before the try block so the error handler can never
    # fail on a missing ``.name`` attribute.
    source_name = getattr(pdf_file, "name", None) or str(pdf_file)

    try:
        pdf_reader = PyPDF2.PdfReader(pdf_file)
        documents = []

        for page_num, page in enumerate(pdf_reader.pages, 1):
            # Guard against a falsy/None extraction result so one empty page
            # does not abort the whole file.
            text = page.extract_text() or ""
            if text.strip():
                documents.append((source_name, text, page_num))

        return documents
    except Exception as e:
        # Best-effort boundary: an unreadable file is reported and skipped so
        # the remaining uploads can still be processed.
        print(f"Error processing {source_name}: {str(e)}")
        return []

def process_uploaded_pdfs(pdf_files):
    """Index the uploaded PDFs and return a status message for the UI.

    Any previously stored pages are discarded. Text is extracted from each
    file, every extracted page is embedded, and a flat L2 FAISS index is
    built over those embeddings in the shared document store.
    """
    if not pdf_files:
        return "⚠️ Please upload at least one PDF file."

    doc_store.clear()

    # Gather pages file by file, keeping upload order.
    for uploaded in pdf_files:
        doc_store.documents += extract_text_from_pdf(uploaded)

    if not doc_store.documents:
        return "❌ No text could be extracted from the uploaded PDFs."

    # One embedding per extracted page.
    page_texts = [page_text for _, page_text, _ in doc_store.documents]
    doc_store.embeddings = embedding_model.encode(page_texts, show_progress_bar=False)

    # The index width is taken from the embedding matrix itself.
    doc_store.index = faiss.IndexFlatL2(doc_store.embeddings.shape[1])
    doc_store.index.add(doc_store.embeddings.astype('float32'))

    summary = f"✅ Successfully processed {len(pdf_files)} PDF(s) with {len(doc_store.documents)} pages!\n\n📚 Files uploaded:\n"
    file_lines = "\n".join(f"- {pdf.name}" for pdf in pdf_files)
    return summary + file_lines

# ==================== AI QUERY FUNCTIONS ====================
def generate_answer(question: str, context: str) -> str:
    """Answer ``question`` from ``context`` using the loaded Granite model.

    The context and question are wrapped in a single user message, rendered
    through the tokenizer's chat template, and completed by sampling
    (temperature 0.7, top_p 0.9) for at most 512 new tokens.

    Returns:
        The decoded completion with special tokens removed and surrounding
        whitespace stripped.
    """
    prompt = f"""You are an academic assistant helping students understand their study materials.

Context from documents:
{context}

Student Question: {question}

Provide a clear, accurate, and helpful answer based on the context above. If the context doesn't contain enough information, say so."""

    chat = [{"role": "user", "content": prompt}]

    encoded = tokenizer.apply_chat_template(
        chat,
        add_generation_prompt=True,
        tokenize=True,
        return_dict=True,
        return_tensors="pt",
    )
    encoded = encoded.to(model.device)

    # Remember how many tokens the prompt occupies so they can be sliced off.
    prompt_length = encoded["input_ids"].shape[-1]

    generated = model.generate(
        **encoded,
        max_new_tokens=512,
        temperature=0.7,
        top_p=0.9,
        do_sample=True
    )

    # Decode only the tokens that follow the prompt.
    completion_tokens = generated[0][prompt_length:]
    return tokenizer.decode(completion_tokens, skip_special_tokens=True).strip()

def semantic_search(query: str, top_k: int = 3) -> List[Tuple[str, str, int, float]]:
    """Return the pages nearest to ``query`` in the shared FAISS index.

    Args:
        query: Free-text search string; it is embedded with the same model
            that embedded the pages.
        top_k: Maximum number of results. It is coerced to ``int`` and
            clamped to the number of indexed vectors.

    Returns:
        Up to ``top_k`` tuples ``(filename, text, page_num, distance)`` in
        the order FAISS returns them. The list is empty when nothing has
        been indexed or ``top_k`` is not positive.
    """
    index = doc_store.index
    if index is None:
        return []

    # Asking FAISS for more neighbours than it holds yields padding entries,
    # so never request more than the index contains.
    k = min(int(top_k), index.ntotal)
    if k <= 0:
        return []

    query_embedding = embedding_model.encode([query])
    distances, indices = index.search(query_embedding.astype('float32'), k)

    results = []
    for idx, distance in zip(indices[0], distances[0]):
        # FAISS marks "no neighbour" with label -1. A bare upper-bound check
        # would accept it and Python would silently return the last page.
        if 0 <= idx < len(doc_store.documents):
            filename, text, page_num = doc_store.documents[int(idx)]
            results.append((filename, text, page_num, float(distance)))

    return results

# ==================== FEATURE 1: MULTI-PDF UPLOAD ====================
def create_upload_tab():
    """Build the upload tab: a multi-file PDF picker, a button and a status box.

    Must be called inside an active Gradio layout context; the components
    are created in the order they should appear.
    """
    with gr.Column():
        gr.Markdown("# 📚 Multi-PDF Upload & Parsing")
        gr.Markdown("Upload your textbooks, lecture notes, and research papers here.")

        # Accepts several files at once, restricted to the .pdf extension.
        pdf_input = gr.File(
            label="Upload PDF Files",
            file_types=[".pdf"],
            file_count="multiple"
        )

        upload_btn = gr.Button("Process PDFs", variant="primary", size="lg")
        upload_output = gr.Textbox(label="Status", lines=6)

        # Clicking the button indexes the selected files and shows the
        # status string returned by process_uploaded_pdfs.
        upload_btn.click(
            fn=process_uploaded_pdfs,
            inputs=[pdf_input],
            outputs=[upload_output]
        )

        gr.Markdown("""
        ### 📋 Instructions:
        1. Click "Browse Files" to select one or more PDF files
        2. Click "Process PDFs" to upload and index your documents
        3. Wait for the confirmation message
        4. Navigate to other tabs to interact with your documents
        """)

# ==================== FEATURE 2: CONVERSATIONAL Q&A ====================
def answer_question(question: str, num_context: int) -> str:
    """Answer ``question`` from the indexed pages and list the pages used.

    Args:
        question: The student's question.
        num_context: How many passages to retrieve as context.

    Returns:
        Markdown containing the generated answer followed by a source list,
        or a warning/error message when there is nothing to search or ask.
    """
    if not doc_store.documents:
        return "⚠️ Please upload PDF documents first in the 'Upload PDFs' tab."
    if not question.strip():
        return "⚠️ Please enter a question."

    hits = semantic_search(question, top_k=num_context)
    if not hits:
        return "❌ No relevant content found in the documents."

    # Build the context excerpts and the matching citations in one pass.
    excerpts = []
    citations = []
    for filename, text, page_num, _ in hits:
        # Only the first 500 characters of each page go into the prompt.
        excerpts.append(f"[From {filename}, Page {page_num}]\n{text[:500]}...")
        citations.append(f"- {filename} (Page {page_num})")

    answer = generate_answer(question, "\n\n".join(excerpts))
    sources = "\n\n📖 **Sources:**\n" + "\n".join(citations)

    return f"**Answer:**\n{answer}\n{sources}"

def create_qa_tab():
    """Build the Q&A tab: question box, passage-count slider, button and answer area.

    Must be called inside an active Gradio layout context.
    """
    with gr.Column():
        gr.Markdown("# 💬 Conversational Q&A")
        gr.Markdown("Ask questions about your study materials in natural language.")

        question_input = gr.Textbox(
            label="Your Question",
            placeholder="e.g., What is photosynthesis?",
            lines=3
        )

        # Controls how many retrieved passages are passed to answer_question.
        num_context = gr.Slider(
            minimum=1,
            maximum=5,
            value=3,
            step=1,
            label="Number of source passages to use"
        )

        ask_btn = gr.Button("Get Answer", variant="primary", size="lg")
        answer_output = gr.Markdown(label="Answer")

        # The answer is returned as Markdown, hence the Markdown output component.
        ask_btn.click(
            fn=answer_question,
            inputs=[question_input, num_context],
            outputs=[answer_output]
        )

        gr.Markdown("""
        ### 💡 Example Questions:
        - "Explain the concept of X in simple terms"
        - "What are the key differences between A and B?"
        - "Summarize the main points about Y"
        - "What does the document say about Z?"
        """)

# ==================== FEATURE 3: SEMANTIC SEARCH ====================
def search_documents(query: str, num_results: int) -> str:
    """Run a semantic search and format the hits as a Markdown report.

    Args:
        query: Free-text search string.
        num_results: Maximum number of hits to show.

    Returns:
        Markdown with one section per hit (relevance, source, 400-character
        excerpt), or a warning/error message when there is nothing to show.
    """
    if not doc_store.documents:
        return "⚠️ Please upload PDF documents first in the 'Upload PDFs' tab."
    if not query.strip():
        return "⚠️ Please enter a search query."

    hits = semantic_search(query, top_k=num_results)
    if not hits:
        return "❌ No relevant content found."

    # Collect the report pieces and join once at the end.
    sections = ["## 🔍 Search Results\n\n"]
    for rank, (filename, text, page_num, distance) in enumerate(hits, 1):
        # Heuristic mapping from distance to a percentage, floored at zero.
        relevance = max(0, 100 - distance * 10)
        sections.append(
            f"### Result {rank} (Relevance: {relevance:.1f}%)\n"
            f"**Source:** {filename} - Page {page_num}\n\n"
            f"{text[:400]}...\n\n"
            "---\n\n"
        )

    return "".join(sections)

def create_search_tab():
    """Build the search tab: query box, result-count slider, button and results area.

    Must be called inside an active Gradio layout context.
    """
    with gr.Column():
        gr.Markdown("# 🔎 Semantic Search Engine")
        gr.Markdown("Search your documents using natural language - no exact keywords needed!")

        search_input = gr.Textbox(
            label="Search Query",
            placeholder="e.g., machine learning algorithms",
            lines=2
        )

        # Controls how many hits search_documents is asked to return.
        num_results = gr.Slider(
            minimum=1,
            maximum=10,
            value=5,
            step=1,
            label="Number of results"
        )

        search_btn = gr.Button("Search", variant="primary", size="lg")
        search_output = gr.Markdown(label="Results")

        # search_documents returns a Markdown report for the output component.
        search_btn.click(
            fn=search_documents,
            inputs=[search_input, num_results],
            outputs=[search_output]
        )

        gr.Markdown("""
        ### ✨ Features:
        - **Semantic Understanding**: Finds relevant content even without exact keyword matches
        - **FAISS-Powered**: Lightning-fast vector search
        - **Ranked Results**: Most relevant passages shown first
        """)

# ==================== FEATURE 4: ACADEMIC OPTIMIZATION ====================
def generate_summary(content_type: str, topic: str) -> str:
    """Generate study content about ``topic`` from the indexed pages.

    Args:
        content_type: One of "Definition", "Summary", "Explanation" or
            "Key Points"; it selects the instruction sent to the model.
        topic: The concept to look up and write about.

    Returns:
        Markdown with a heading and the model's response, or a
        warning/error message when there is nothing to work from.

    Raises:
        KeyError: If ``content_type`` is not one of the four known types.
    """
    if not doc_store.documents:
        return "⚠️ Please upload PDF documents first in the 'Upload PDFs' tab."
    if not topic.strip():
        return "⚠️ Please enter a topic."

    # The three closest pages are used, in full, as context.
    hits = semantic_search(topic, top_k=3)
    if not hits:
        return "❌ No relevant content found for this topic."

    context = "\n\n".join(page_text for _, page_text, _, _ in hits)

    prompt_by_type = {
        "Definition": f"Provide a clear, concise definition of '{topic}' based on the context below:\n\n{context}",
        "Summary": f"Summarize the key points about '{topic}' from the context below:\n\n{context}",
        "Explanation": f"Explain '{topic}' in detail for a student, based on the context below:\n\n{context}",
        "Key Points": f"List the main key points about '{topic}' from the context below:\n\n{context}"
    }
    chat = [{"role": "user", "content": prompt_by_type[content_type]}]

    encoded = tokenizer.apply_chat_template(
        chat,
        add_generation_prompt=True,
        tokenize=True,
        return_dict=True,
        return_tensors="pt",
    )
    encoded = encoded.to(model.device)

    # Remember how many tokens the prompt occupies so they can be sliced off.
    prompt_length = encoded["input_ids"].shape[-1]

    generated = model.generate(
        **encoded,
        max_new_tokens=400,
        temperature=0.7,
        top_p=0.9,
        do_sample=True
    )

    # Decode only the tokens that follow the prompt.
    response = tokenizer.decode(generated[0][prompt_length:], skip_special_tokens=True)

    return f"## {content_type}: {topic}\n\n{response.strip()}"

def create_academic_tab():
    """Build the academic-tools tab: content-type selector, topic box, button and output.

    Must be called inside an active Gradio layout context.
    """
    with gr.Column():
        gr.Markdown("# 🎓 Academic Optimization")
        gr.Markdown("Get definitions, summaries, and explanations tailored for learning.")

        # The choices must match the prompt keys used by generate_summary.
        content_type = gr.Radio(
            choices=["Definition", "Summary", "Explanation", "Key Points"],
            value="Definition",
            label="Content Type"
        )

        topic_input = gr.Textbox(
            label="Topic or Concept",
            placeholder="e.g., Neural Networks",
            lines=2
        )

        generate_btn = gr.Button("Generate", variant="primary", size="lg")
        academic_output = gr.Markdown(label="Output")

        # generate_summary receives (content_type, topic) in that order.
        generate_btn.click(
            fn=generate_summary,
            inputs=[content_type, topic_input],
            outputs=[academic_output]
        )

        gr.Markdown("""
        ### 📚 Content Types:
        - **Definition**: Get precise academic definitions
        - **Summary**: Brief overview of key concepts
        - **Explanation**: Detailed, student-friendly explanations
        - **Key Points**: Bullet-point style main ideas
        """)

# ==================== FEATURE 5: LOCAL & SECURE PROCESSING ====================
def create_security_tab():
    """Build the security tab: static privacy notes, a session-info panel and a clear button.

    Must be called inside an active Gradio layout context. The two nested
    callbacks read and reset the shared ``doc_store``.
    """
    with gr.Column():
        gr.Markdown("# 🔒 Local & Secure Processing")
        gr.Markdown("Your data privacy is our priority.")

        # NOTE(review): the text below states that nothing leaves the user's
        # machine, but the launch call at the bottom of this file passes
        # share=True -- confirm the wording matches the deployment.
        gr.Markdown("""
        ## Security Features

        ### ✅ What We Do:
        - **Local Processing**: All AI models run directly on your machine
        - **No Cloud Upload**: Your PDFs never leave your environment
        - **Memory-Only Storage**: Documents stored in RAM, not on disk
        - **Session-Based**: Data cleared when you close the application
        - **No Tracking**: No analytics or user behavior tracking

        ### 🔐 Technical Details:
        - **Model**: IBM Granite 3.3-2B (runs locally)
        - **Embedding**: SentenceTransformer (local)
        - **Vector Store**: FAISS (in-memory)
        - **Framework**: Gradio (open-source)

        ### 📊 Current Session Info:
        """)

        # Callback: summarise what is currently held in the shared store.
        def get_session_info():
            num_docs = len(doc_store.documents)
            # Distinct file names among the stored (filename, text, page_num) tuples.
            num_files = len(set([doc[0] for doc in doc_store.documents]))
            device = "GPU (CUDA)" if torch.cuda.is_available() else "CPU"

            # The index size is reported as 0 when no index has been built.
            return f"""
            - **Documents Loaded**: {num_docs} pages from {num_files} file(s)
            - **Processing Device**: {device}
            - **Vector Index Size**: {doc_store.index.ntotal if doc_store.index else 0} embeddings
            - **Model Status**: ✅ Loaded and Ready
            """

        info_btn = gr.Button("Refresh Session Info")
        info_output = gr.Markdown()

        # The callback takes no inputs, so only outputs are wired.
        info_btn.click(fn=get_session_info, outputs=[info_output])

        clear_btn = gr.Button("Clear All Data", variant="stop")
        clear_output = gr.Textbox(label="Status")

        # Callback: empty the shared store and confirm to the user.
        def clear_all_data():
            doc_store.clear()
            return "✅ All documents and embeddings cleared from memory."

        clear_btn.click(fn=clear_all_data, outputs=[clear_output])

# ==================== MAIN APPLICATION ====================
def create_app():
    """Assemble the full Gradio application and return it without launching.

    The layout is a header, a tab set (Home plus one tab per feature builder
    defined above) and a footer.

    Returns:
        The ``gr.Blocks`` instance, ready for ``launch()``.
    """
    with gr.Blocks(theme=gr.themes.Soft(), title="StudyMate - AI Academic Assistant") as app:
        gr.Markdown("""
        # 🎓 StudyMate - AI-Powered Academic Assistant
        ### Your intelligent companion for studying with PDFs
        """)

        with gr.Tabs():
            # Static landing page; the remaining tabs are built by helpers.
            with gr.Tab("🏠 Home"):
                gr.Markdown("""
                ## Welcome to StudyMate!

                StudyMate is an AI-powered academic assistant that helps you interact with your study materials
                through natural conversation. Upload your textbooks, lecture notes, and research papers, then ask
                questions in plain English!

                ### 🚀 Quick Start Guide:

                1. **📚 Upload PDFs**: Go to the "Upload PDFs" tab and upload your study materials
                2. **💬 Ask Questions**: Use the "Q&A" tab to ask questions about your documents
                3. **🔍 Search**: Find specific information using semantic search
                4. **🎓 Study Tools**: Get definitions, summaries, and explanations
                5. **🔒 Stay Secure**: All processing happens locally on your machine

                ### ✨ Key Features:

                - **Multi-PDF Support**: Upload and query multiple documents at once
                - **Smart Understanding**: AI understands context, not just keywords
                - **Fast Search**: FAISS-powered vector search for instant results
                - **Academic Focus**: Optimized for textbooks and research papers
                - **100% Private**: Your data never leaves your computer

                ### 🎯 Best For:
                - Students studying for exams
                - Researchers reviewing literature
                - Anyone who wants to understand complex documents faster

                ---

                **Ready to get started?** Upload your PDFs in the next tab! 👉
                """)

            # Each builder adds its components to the tab that is open
            # when it is called.
            with gr.Tab("📚 Upload PDFs"):
                create_upload_tab()

            with gr.Tab("💬 Q&A"):
                create_qa_tab()

            with gr.Tab("🔍 Search"):
                create_search_tab()

            with gr.Tab("🎓 Academic Tools"):
                create_academic_tab()

            with gr.Tab("🔒 Security"):
                create_security_tab()

        # Footer shown below the tab set.
        gr.Markdown("""
        ---
        <center>
        Built with ❤️ using IBM Granite AI | Powered by Gradio
        </center>
        """)

    return app

# ==================== LAUNCH APPLICATION ====================
# Build and start the UI only when this file/cell is run directly.
if __name__ == "__main__":
    app = create_app()
    # NOTE(review): share=True requests a public share link; the recorded run
    # output shows "Running on public URL: https://....gradio.live". That
    # conflicts with the in-app "never leaves your computer" wording --
    # confirm this is intended. debug=True keeps the cell running so that
    # errors and logs stay visible (per the recorded Colab output).
    app.launch(share=True, debug=True)

Loading AI models... This may take a few minutes.


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

added_tokens.json:   0%|          | 0.00/207 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/801 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/787 [00:00<?, ?B/s]

`torch_dtype` is deprecated! Use `dtype` instead!


model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/132 [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Models loaded successfully!
Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
* Running on public URL: https://c5b52f1e92b8723b30.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)
