In [1]:
# ================================================================
# 📘 STUDYMATE – FULL GRADIO APP (Google Colab One-Cell Version)
# ================================================================

!pip install gradio transformers sentence-transformers faiss-cpu pypdf -q

# ---------------------------
# 🔥 MODEL LOADING
# ---------------------------
from transformers import AutoTokenizer, AutoModelForCausalLM
from sentence_transformers import SentenceTransformer
import torch
import gradio as gr
import faiss
from pypdf import PdfReader

# Granite LLM: the tokenizer and the causal-LM weights come from one checkpoint
GRANITE_CHECKPOINT = "ibm-granite/granite-3.3-2b-instruct"
tokenizer = AutoTokenizer.from_pretrained(GRANITE_CHECKPOINT)
model = AutoModelForCausalLM.from_pretrained(GRANITE_CHECKPOINT)

# Sentence-embedding model used to vectorise PDF chunks and search queries
embedder = SentenceTransformer("all-MiniLM-L6-v2")

# Retrieval state: the FAISS index stays None until PDFs have been processed,
# and pdf_text_chunks holds the text chunks that the index rows refer to
faiss_index, pdf_text_chunks = None, []


# ---------------------------
# 📌 UTILS
# ---------------------------
def ask_model(prompt, max_new_tokens=250):
    """Generate a single-turn reply from the loaded chat model.

    Args:
        prompt: User message text; it is sent as the only turn of the chat.
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        The decoded text of the newly generated tokens only (the prompt
        tokens are sliced off), with special tokens removed.
    """
    messages = [{"role": "user", "content": prompt}]

    # return_dict=True is required here: the result is unpacked with
    # `**inputs` and indexed with ["input_ids"] below, both of which need a
    # mapping. Without the flag, transformers releases that default to
    # returning a bare tensor make those two lines raise.
    inputs = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        tokenize=True,
        return_dict=True,
        return_tensors="pt",
    ).to(model.device)

    outputs = model.generate(**inputs, max_new_tokens=max_new_tokens)

    # generate() returns prompt + completion; keep only the completion
    prompt_length = inputs["input_ids"].shape[-1]
    return tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)


def split_text(text, chunk_size=300):
    """Split *text* into chunks of at most *chunk_size* whitespace-separated words.

    Words are re-joined with single spaces, so original whitespace (including
    newlines) is not preserved. Returns a list of chunk strings; an input
    with no words yields an empty list.
    """
    tokens = text.split()
    chunks = []
    for start in range(0, len(tokens), chunk_size):
        window = tokens[start:start + chunk_size]
        chunks.append(" ".join(window))
    return chunks


# ---------------------------
# 📄 MULTI-PDF UPLOAD
# ---------------------------
def process_pdfs(files):
    """Extract text from uploaded PDFs, chunk it, and build the search index.

    Args:
        files: The uploaded files as delivered by the upload widget. Each
            item may be a path string or an object exposing the path as
            ``.name``. ``None`` or an empty list means nothing was uploaded.

    Returns:
        A status message for the UI: a prompt to upload when *files* is
        empty, a notice when no text could be extracted, or the number of
        chunks indexed on success.

    Side effects:
        On success, rebinds the module-level ``pdf_text_chunks`` and
        ``faiss_index``. On any early return the previous values are left
        untouched.
    """
    global faiss_index, pdf_text_chunks

    # Clicking "Process" without an upload passes None; bail out instead of
    # failing on iteration.
    if not files:
        return "Please upload at least one PDF."

    page_texts = []
    for file in files:
        # Accept both plain path strings and wrappers that carry the path
        # in a .name attribute.
        path = getattr(file, "name", file)
        reader = PdfReader(path)
        # extract_text() can yield nothing for image-only pages
        page_texts.extend(page.extract_text() or "" for page in reader.pages)

    # split_text() tokenises on whitespace, so a newline join is equivalent
    # to appending "\n" after every page.
    chunks = split_text("\n".join(page_texts))

    # Scanned/empty PDFs produce no chunks; encoding an empty list would give
    # an array without a second dimension and crash on shape[1].
    if not chunks:
        return "No extractable text found in the uploaded PDFs."

    # embed chunks
    embeddings = embedder.encode(chunks)

    # create FAISS index sized to the embedding dimension
    index = faiss.IndexFlatL2(embeddings.shape[1])
    index.add(embeddings)

    # Publish chunks and index together only after everything succeeded, so
    # the two globals never describe different documents.
    pdf_text_chunks = chunks
    faiss_index = index

    return f"PDFs processed. Total chunks: {len(pdf_text_chunks)}"


# ---------------------------
# 🔍 SEMANTIC SEARCH
# ---------------------------
def semantic_search(query, k=3):
    """Return the indexed text chunks closest to *query*.

    Args:
        query: Free-text search string.
        k: Maximum number of chunks to return (default 3).

    Returns:
        The matching chunks as one string, each prefixed with a bullet and
        separated by blank lines. Returns a prompt to upload PDFs when
        nothing has been indexed yet, and an empty string when no hit can be
        returned (for example ``k`` below 1).
    """
    if faiss_index is None or not pdf_text_chunks:
        return "Please upload PDFs first."

    # Never request more neighbours than exist: FAISS pads missing results
    # with label -1, and pdf_text_chunks[-1] would silently return the last
    # chunk as a bogus duplicate hit.
    top_k = min(k, faiss_index.ntotal, len(pdf_text_chunks))
    if top_k < 1:
        return ""

    query_embed = embedder.encode([query])
    _, neighbours = faiss_index.search(query_embed, k=top_k)

    hits = [
        f"• {pdf_text_chunks[idx]}"
        for idx in neighbours[0]
        # Defensive: skip padding labels and anything outside the chunk list
        if 0 <= idx < len(pdf_text_chunks)
    ]
    return "\n\n".join(hits).strip()


# ---------------------------
# 💬 CONVERSATIONAL Q&A
# ---------------------------
def conversational_qa(question):
    """Answer *question*, grounding it in the uploaded PDFs when available.

    With no index built, the question goes to the model unchanged. Otherwise
    the retrieved chunks are placed ahead of the question in the prompt.
    Returns the model's reply text.
    """
    prompt = question
    if faiss_index is not None:
        # retrieve context and wrap the question with it
        retrieved = semantic_search(question)
        prompt = f"Using this context:\n{retrieved}\n\nAnswer the question:\n{question}"
    return ask_model(prompt)


# ---------------------------
# 📚 ACADEMIC OPTIMIZATION
# ---------------------------
def academic_optimize(text):
    """Ask the model to restate *text* in clear, simple academic language.

    Returns the model's reply text.
    """
    instruction = "Explain clearly and in simple academic language:"
    return ask_model(f"{instruction}\n\n{text}")


# ---------------------------
# 🔒 LOCAL & SECURE PROCESSING
# ---------------------------
def secure_process(text):
    """Return *text* unchanged beneath a fixed "processed securely" banner.

    No transformation is applied to the input; the banner and a blank line
    are simply placed in front of it.
    """
    banner = "(Processed securely inside Colab)"
    return f"{banner}\n\n{text}"


# ---------------------------
# 🎨 GRADIO UI
# ---------------------------
# One tab per tool. Every tab follows the same pattern: input component,
# output component, and a button whose click handler maps input -> output.
with gr.Blocks(title="StudyMate – AI Academic Assistant") as app:

    gr.Markdown("# 📘 StudyMate – AI Academic Assistant")
    gr.Markdown("Upload PDFs and interact using multiple AI-powered tools.")

    with gr.Tab("📄 Multi-PDF Upload & Parsing"):
        # Restrict the picker to PDFs: process_pdfs() parses every upload
        # with PdfReader, so any other file type would only produce an error.
        file_upload = gr.File(
            file_count="multiple",
            file_types=[".pdf"],
            label="Upload PDFs",
        )
        pdf_status = gr.Textbox(label="Status")
        process_btn = gr.Button("Process PDFs")
        process_btn.click(process_pdfs, file_upload, pdf_status)

    with gr.Tab("💬 Conversational Q&A"):
        question = gr.Textbox(label="Ask a question")
        answer = gr.Textbox(label="Answer")
        ask_btn = gr.Button("Get Answer")
        ask_btn.click(conversational_qa, question, answer)

    with gr.Tab("🔍 Semantic Search"):
        search_query = gr.Textbox(label="Search query")
        search_output = gr.Textbox(label="Search results")
        search_btn = gr.Button("Search")
        search_btn.click(semantic_search, search_query, search_output)

    with gr.Tab("📚 Academic Optimization"):
        opt_input = gr.Textbox(label="Text to simplify")
        opt_out = gr.Textbox(label="Optimized output")
        opt_btn = gr.Button("Optimize")
        opt_btn.click(academic_optimize, opt_input, opt_out)

    with gr.Tab("🔒 Local & Secure Processing"):
        sec_in = gr.Textbox(label="Input")
        sec_out = gr.Textbox(label="Output")
        sec_btn = gr.Button("Process Securely")
        sec_btn.click(secure_process, sec_in, sec_out)

# Start the web server for the assembled interface
app.launch()


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.6/23.6 MB[0m [31m48.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m328.9/328.9 kB[0m [31m8.4 MB/s[0m eta [36m0:00:00[0m
[?25h

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

added_tokens.json:   0%|          | 0.00/207 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/801 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/787 [00:00<?, ?B/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/132 [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

It looks like you are running Gradio on a hosted Jupyter notebook, which requires `share=True`. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://ee3e6767cf3f55d9c6.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


